src/hg/makeDb/doc/droAna3.txt 1.3

1.3 2009/04/07 16:55:43 angie
Swapped chains/nets from dm3 to get liftOver chains to dm3 -- user request.
Index: src/hg/makeDb/doc/droAna3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/droAna3.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/makeDb/doc/droAna3.txt	23 Oct 2006 20:09:02 -0000	1.2
+++ src/hg/makeDb/doc/droAna3.txt	7 Apr 2009 16:55:43 -0000	1.3
@@ -1,223 +1,235 @@
 # for emacs: -*- mode: sh; -*-
 
 # Drosophila ananassae -- Agencourt "CAF1" via Eisen's 12-fly site
 
 # THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT
 
 
 #########################################################################
 # DOWNLOAD SEQUENCE (DONE 9/20/06 angie)
     ssh kkstore05
     mkdir /cluster/store12/droAna3
     ln -s /cluster/store12/droAna3 /cluster/data/droAna3
     mkdir /cluster/data/droAna3/downloads
     cd /cluster/data/droAna3/downloads
     wget http://rana.lbl.gov/drosophila/caf1/dana_caf1.tar.gz
     tar xvzf dana_caf1.tar.gz
     cd dana
     faSize scaffolds.bases
 #230993012 bases (17074195 N's 213918817 real 213918817 upper 0 lower) in 13749 sequences in 1 files
 #Total size: mean 16800.7 sd 387178.2 min 55 (scaffold_13714) max 23697760 (scaffold_13340) median 1517
 #N count: mean 1241.8 sd 9836.0
 #U count: mean 15558.9 sd 382128.9
 #L count: mean 0.0 sd 0.0
 
 
 #########################################################################
 # MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/20/06 angie)
     ssh kkstore05
     cd /cluster/data/droAna3
     cat > droAna3.config.ra <<EOF
 # Config parameters for makeGenomeDb.pl:
 db droAna3
 clade insect
 scientificName Drosophila ananassae
 assemblyDate Feb. 2006
 assemblyLabel Agencourt CAF1
 orderKey 54
 mitoAcc none
 fastaFiles /cluster/data/droAna3/downloads/dana/scaffolds.bases
 agpFiles /cluster/data/droAna3/downloads/dana/assembly.agp
 dbDbSpeciesDir drosophila
 EOF
 
     # Stop at db step so we can use featureBits, but don't do dbDb and trackDb
     # because we're not building an actual browser for now.
     makeGenomeDb.pl droAna3.config.ra -stop=db \
       >& makeGenomeDb.log & tail -f makeGenomeDb.log
 
 
 #########################################################################
 # REPEATMASKER (DONE 9/20/06 angie)
     ssh kkstore05
     # Run -debug to create the dir structure and preview the scripts:
     doRepeatMasker.pl droAna3 -verbose 3 -debug
     # Run it for real and tail the log:
     doRepeatMasker.pl droAna3 -species drosophila -verbose 3 \
       >& /cluster/data/droAna3/bed/RepeatMasker.2006-09-20/do.log &
     tail -f /cluster/data/droAna3/bed/RepeatMasker.2006-09-20/do.log
     # RepeatMasker and lib version from do.log:
 #    March 20 2006 (open-3-1-5) version of RepeatMasker
 #CC   RELEASE 20060315;                                            *
     # Compare coverage to previous assembly:
     featureBits -chrom=scaffold_13340 droAna3 rmsk
 #688661 bases of 23471240 (2.934%) in intersection
     featureBits -chrom=scaffold_13340 droAna2 rmsk
 #682697 bases of 23466339 (2.909%) in intersection
     nice featureBits droAna3 rmsk
 #35293237 bases of 213918817 (16.498%) in intersection
 
 
 #########################################################################
 # SIMPLE REPEATS (TRF) (DONE 9/20/06 angie)
     ssh kolossus
     nice tcsh
     mkdir /cluster/data/droAna3/bed/simpleRepeat
     cd /cluster/data/droAna3/bed/simpleRepeat
     twoBitToFa ../../droAna3.unmasked.2bit stdout \
     | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
       -bedAt=simpleRepeat.bed -tempDir=/tmp \
     >& trf.log & tail -f trf.log
     # ~110 minutes (longer than D. mel, must be because of the scaffolds)
 
     # Make a filtered version for sequence masking:
     awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
 
     # Load unfiltered repeats into the database:
     ssh hgwdev
     hgLoadBed droAna3 simpleRepeat \
       /cluster/data/droAna3/bed/simpleRepeat/simpleRepeat.bed \
       -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
     # Compare coverage to previous assembly:
     featureBits -chrom=scaffold_13340 droAna3 simpleRepeat
 #520511 bases of 23471240 (2.218%) in intersection
     featureBits -chrom=scaffold_13340 droAna2 simpleRepeat
 #515863 bases of 23466339 (2.198%) in intersection
 
 
 #########################################################################
 # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/20/06 angie)
     ssh kolossus
     cd /cluster/data/droAna3
     time twoBitMask droAna3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed droAna3.2bit
     # This warning is OK -- the extra fields are not block coordinates.
 #Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
 #0.266u 0.341s 0:02.01 29.8%     0+0k 0+0io 0pf+0w
 
     # Because this is a no-browser build (just masking for alignment)
     # I did not make the usual /gbdb/$db/$db.2bit link.
 
 
 ###########################################################################
 # WINDOWMASKER EXPERIMENT (DONE 10/17/06 angie)
     # The blastz run vs. droWil1 was just destroyed by mega-output,
     # even with -chainFilterMinScore=10000 and M=50 (trying M=20 but not
     # too hopeful)... so let's try a de-novo masker before alignment:
     ssh kolossus
     mkdir /cluster/data/droAna3/bed/windowmasker.2006-10-17
     cd /cluster/data/droAna3/bed/windowmasker.2006-10-17
     # Instructions for running windowmasker:
     wget ftp://ftp.ncbi.nlm.nih.gov/pub/agarwala/windowmasker/README.windowmasker
     twoBitToFa /cluster/data/droAna3/droAna3.2bit tmp.fa
     # First, collect counts:
     /cluster/bin/x86_64/windowmasker -mk_counts true -input tmp.fa \
       -output wm.counts
 
     # Then use those counts to mask sequence:
     # Note: this can't run on a small cluster host because it dies if it 
     # can't open a socket to www.ncbi.nlm.nih.gov:80 !
     /cluster/bin/x86_64/windowmasker -ustat wm.counts -input tmp.fa \
       -output wm.intervals 
     perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \
                if (/^(\d+) - (\d+)/) { \
                  $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \
                }' wm.intervals > windowmasker.bed
     # Quick coverage:
     awk '{print $3 - $2;}' windowmasker.bed | total
 #77984750
     awk '{print $2;}' ../../chrom.sizes  | total
 #230993012
     calc 77984750 / 230993012
 #77984750 / 230993012 = 0.337607
     # and that's including gaps -- Much better than the 16.498% coverage 
     # from RepeatMasker!
     # Make a masked .2bit:
     twoBitMask ../../droAna3.2bit windowmasker.bed ../../droAna3.WM.2bit
 
     # Now try with -sdust to additionally mask low-complexity sequence:
     /cluster/bin/x86_64/windowmasker -ustat wm.counts -sdust true \
        -input tmp.fa -output wm.sdust.intervals
     perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \
                if (/^(\d+) - (\d+)/) { \
                  $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \
                }' wm.sdust.intervals > windowmasker.sdust.bed
     awk '{print $3 - $2;}' windowmasker.sdust.bed | total
 #97909778
     calc 97909778 / 230993012
 #97909778 / 230993012 = 0.423865
     # Make a masked .2bit (even if we don't end up needing it):
     twoBitMask ../../droAna3.2bit windowmasker.sdust.bed \
       ../../droAna3.WMSDust.2bit
 
     ssh hgwdev
     cd /cluster/data/droAna3/bed/windowmasker.2006-10-17
     hgLoadBed droAna3 windowmasker windowmasker.bed
     hgLoadBed droAna3 windowmaskerSdust windowmasker.sdust.bed
     nice featureBits droAna3 rmsk windowmasker
 #20661102 bases of 213918817 (9.658%) in intersection
     nice featureBits droAna3 rmsk \!windowmasker
 #14632135 bases of 213918817 (6.840%) in intersection
     nice featureBits droAna3 \!rmsk windowmasker
 #57323648 bases of 213918817 (26.797%) in intersection
     # Sanity checks on windowmasker+Sdust being a superset of windowmasker:
     nice featureBits droAna3 windowmasker windowmaskerSdust
 #77984750 bases of 213918817 (36.455%) in intersection
     nice featureBits droAna3 windowmasker \!windowmaskerSdust
 #0 bases of 213918817 (0.000%) in intersection
     nice featureBits droAna3 \!windowmasker windowmaskerSdust
 #19925028 bases of 213918817 (9.314%) in intersection
     rm tmp.fa bed.tab
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROWIL1 (DONE 10/20/06 angie)
     # Using WindowMasker'd sequence since prior runs died due to massive 
     # output (netChainSubset out of memory), even with chainMinScore=10000
     # or chainMinScore=8000 + M=20.
     ssh kkstore05
     mkdir /cluster/data/droAna3/bed/blastz.droWil1.2006-10-19
     cd /cluster/data/droAna3/bed/blastz.droWil1.2006-10-19
     cat << '_EOF_' > DEF
 # D. ananassae vs. D. willistoni
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. ananassae
 SEQ1_DIR=/san/sanvol1/scratch/droAna3/droAna3.WM.2bit
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/san/sanvol1/scratch/droAna3/chrom.sizes
 SEQ1_LIMIT=50
 
 # QUERY - D. willistoni
 SEQ2_DIR=/san/sanvol1/scratch/droWil1/droWil1.WM.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/san/sanvol1/scratch/droWil1/chrom.sizes
 SEQ2_LIMIT=200
 
 BASE=/cluster/data/droAna3/bed/blastz.droWil1.2006-10-19
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -bigClusterHub=pk \
       -blastzOutRoot /san/sanvol1/scratch/droAna3droWil1 >& do.log &
     tail -f do.log
     ln -s blastz.droWil1.2006-10-19 /cluster/data/droAna3/bed/blastz.droWil1
 
 
+#########################################################################
+# SWAP DM3 CHAIN/NET (DONE 4/2/09 angie)
+    mkdir /hive/data/genomes/droAna3/bed/blastz.dm3.swap
+    cd /hive/data/genomes/droAna3/bed/blastz.dm3.swap
+    doBlastzChainNet.pl -swap -bigClusterHub swarm -smallClusterHub memk \
+      -workhorse kolossus \
+      /hive/data/genomes/dm3/bed/blastz.droAna3/DEF >& do.log &
+    tail -f do.log
+    ln -s blastz.dm3.swap /hive/data/genomes/droAna3/bed/blastz.dm3
+
+
+#########################################################################