src/hg/makeDb/doc/dm3.txt 1.24

1.24 2009/02/23 23:41:00 angie
Added bdtnpChipper (liftOver from dm2).
Index: src/hg/makeDb/doc/dm3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/dm3.txt,v
retrieving revision 1.23
retrieving revision 1.24
diff -b -B -U 1000000 -r1.23 -r1.24
--- src/hg/makeDb/doc/dm3.txt	27 Oct 2008 20:10:07 -0000	1.23
+++ src/hg/makeDb/doc/dm3.txt	23 Feb 2009 23:41:00 -0000	1.24
@@ -1,2744 +1,2860 @@
 # for emacs: -*- mode: sh; -*-
 
 # Drosophila melanogaster -- BDGP Release 5.0 (Apr. 2006)
 
 
 #########################################################################
 # DOWNLOAD SEQUENCE (DONE 7/7/06 angie)
     ssh kkstore05
     mkdir /cluster/store12/dm3
     ln -s /cluster/store12/dm3 /cluster/data/dm3
     mkdir /cluster/data/dm3/downloads
     cd /cluster/data/dm3/downloads
     wget ftp://ftp.fruitfly.org/pub/download/compressed/dmel_release5-2006-04-16.tgz
     tar xvzf dmel_release5-2006-04-16.tgz
     cd Dmel_Release5
     faSize na*
 #168717020 bases (6368725 N's 162348295 real 162348295 upper 0 lower) in 14 sequences in 14 files
 #Total size: mean 12051215.7 sd 11751902.6 min 204112 (XHet) max 29004656 (ArmUextra) median 10049037
     # Follow FlyBase's lead on the chromosome names, but still use our 
     # "chr" prefix:
     mkdir ../renamedChromFa
     foreach f (na_*.dmel.RELEASE5)
       set chr = `echo $f:r:r | sed -r -e 's/^na_(arm)?/chr/;'`
       echo $chr
       if (`grep '^>' $f | wc -l` != 1) then
         echo $f does not have exactly one fasta record\!
         break
       endif
       sed -e 's/^>.*/>'$chr'/' $f > ../renamedChromFa/$chr.fa
     end
     cd ../renamedChromFa/
     faSize *
 #168717020 bases (6368725 N's 162348295 real 162348295 upper 0 lower) in 14 sequences in 14 files
 #Total size: mean 12051215.7 sd 11751902.6 min 204112 (chrXHet) max 29004656 (chrUextra) median 10049037
 
 
 #########################################################################
 # MAKE GENOME DB (DONE 7/10/06 angie)
     ssh kkstore05
     cd /cluster/data/dm3
     cat > dm3.config.ra <<EOF
 # Config parameters for makeGenomeDb.pl:
 db dm3
 clade insect
 scientificName Drosophila melanogaster
 assemblyDate Apr. 2006
 assemblyLabel BDGP Release 5
 orderKey 50
 mitoAcc 5835233
 fastaFiles /cluster/data/dm3/downloads/renamedChromFa/*.fa
 dbDbSpeciesDir drosophila
 fakeAgpMinContigGap 20
 fakeAgpMinScaffoldGap 50000
 EOF
 
     ~/kent/src/utils/makeGenomeDb.pl dm3.config.ra \
       >& makeGenomeDb.log & tail -f makeGenomeDb.log
 
 
 #########################################################################
 # REPEATMASKER (DONE 7/31/06 angie - REDONE 6/20/07)
 # 6/19/07 -- Debugged the coverage drop w/Robert Hubley.  It was due to 
 # version lag between the NCBI taxonomy label for the Drosophila genus 
 # vs. the label used in the Library file.  I fixed the local lib file, 
 # and Robert said it will be fixed in the next released lib file.
     ssh kkstore05
     # Run -debug to create the dir structure and preview the scripts:
     ~/kent/src/hg/utils/automation/doRepeatMasker.pl dm3 -verbose 3 -debug
     # Run it for real and tail the log:
     ~/kent/src/hg/utils/automation/doRepeatMasker.pl dm3 -verbose 3 \
       >& /cluster/data/dm3/bed/RepeatMasker.2007-06-20/do.log &
     tail -f /cluster/data/dm3/bed/RepeatMasker.2007-06-20/do.log
     # RepeatMasker and lib version from do.log:
 #    May 17 2007 (open-3-1-8) version of RepeatMasker
 #grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
 #CC   RELEASE 20061006;                                            *
 
     # Compare coverage to previous assembly:
     featureBits dm3 rmsk
 #44719009 bases of 162367812 (27.542%) in intersection
     featureBits dm2 rmsk
 #16178239 bases of 131698467 (12.284%) in intersection
     # Wow!  What an increase... largely due to the new chrUextra:
     featureBits dm3 rmsk -chrom=chrUextra
 #20898714 bases of 25541756 (81.822%) in intersection
     featureBits dm3 rmsk -chrom=chr3R
 #1544878 bases of 27905053 (5.536%) in intersection
     featureBits dm2 rmsk -chrom=chr3R
 #1538239 bases of 27905053 (5.512%) in intersection
 
 
 #########################################################################
 # SIMPLE REPEATS (TRF) (DONE 8/1/06 angie)
     ssh kolossus
     nice tcsh
     mkdir /cluster/data/dm3/bed/simpleRepeat
     cd /cluster/data/dm3/bed/simpleRepeat
     twoBitToFa ../../dm3.unmasked.2bit stdout \
     | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
       -bedAt=simpleRepeat.bed -tempDir=/tmp \
     >& trf.log & tail -f trf.log
     # Only about 20 minutes, but dm3 is only 168Mbp...
     # Make a filtered version for sequence masking:
     awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
     splitFileByColumn trfMask.bed trfMaskChrom
 
     # Load unfiltered repeats into the database:
     ssh hgwdev
     hgLoadBed dm3 simpleRepeat \
       /cluster/data/dm3/bed/simpleRepeat/simpleRepeat.bed \
       -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
     # Compare coverage to previous assembly:
     featureBits dm3 simpleRepeat
 #6837416 bases of 162367812 (4.211%) in intersection
     featureBits dm2 simpleRepeat
 #1597029 bases of 131698467 (1.213%) in intersection
     # Looks like the increase is due to chrUextra:
     featureBits dm3 simpleRepeat -chrom=chrUextra
 #3871170 bases of 25541756 (15.156%) in intersection
     featureBits dm3 simpleRepeat -chrom=chr2L
 #225239 bases of 23011344 (0.979%) in intersection
 
 
 #########################################################################
 # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 8/1/06 angie - REDONE 6/21/07)
     ssh kolossus
     cd /cluster/data/dm3
     time twoBitMask dm3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dm3.2bit
     # This warning is OK -- the extra fields are not block coordinates.
 #Warning: BED file bed/simpleRepeat/trfMask.bed has 10 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
 #0.136u 0.272s 0:01.98 20.2%     0+0k 0+0io 3pf+0w
     # Link to it from /gbdb:
     ssh hgwdev
     ln -s /cluster/data/dm3/dm3.2bit /gbdb/dm3/dm3.2bit
 
 
 #########################################################################
 # MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 8/4/06 angie - REDONE 6/21/07)
     cd /cluster/data/dm3
     makeDownloads.pl dm3 -verbose=2 >& jkStuff/downloads.log &
     tail -f jkStuff/downloads.log
     # Edit README.txt files when done:
 # *** Edit each README.txt to resolve any notes marked with "***":
 #     /cluster/data/dm3/goldenPath/database/README.txt
 #     /cluster/data/dm3/goldenPath/bigZips/README.txt
 #     /cluster/data/dm3/goldenPath/chromosomes/README.txt
     # 6/21/07 -- since I'm regenerating these after the GenBank tables are 
     # in place... our bigZips/upstream* files are from FlyBase Genes -- 
     # remove the refseq versions generated by the script and restore 
     # the FlyBase upstream:
     ssh hgwdev
     rm /cluster/data/dm3/goldenPath/bigZips/upstream*
     rm /usr/local/apache/htdocs/goldenPath/dm3/bigZips/upstream*
     ln -s /cluster/data/dm3/bed/flybase5.1/bigZips/up* \
       /usr/local/apache/htdocs/goldenPath/dm3/bigZips/
     cat /cluster/data/dm3/bed/flybase5.1/bigZips/md5sum.txt \
       >> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt
     # Edited out the old up*.gz sums.
 
 
 #########################################################################
 # BLASTZ/CHAIN/NET DP3 (DONE 8/14/06 angie)
     mkdir /cluster/data/dm3/bed/blastz.dp3.2006-08-14
     cd /cluster/data/dm3/bed/blastz.dp3.2006-08-14
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. pseudoobscura
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm3/dm3.2bit
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. pseudoobscura
 SEQ2_DIR=/iscratch/i/dp3/nib
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/dp3/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.dp3.2006-08-14
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /cluster/bluearc/dm3dp3 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L chainDp3Link
 #17523073 bases of 23011344 (76.150%) in intersection
 #dm2: 74.901%
     ln -s blastz.dp3.2006-08-14 /cluster/data/dm3/bed/blastz.dp3
 
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE 8/18/06 angie - REDONE 6/19/07)
     # Use -repMatch=100 (based on size -- for human we use 1024, and 
     # fly size is ~4.4% of human judging by gapless dm1 genome size from 
     # featureBits -- we would use 45, but bump that up a bit to be more 
     # conservative).
     ssh kolossus
     blat /cluster/data/dm3/dm3.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=/cluster/bluearc/dm3/11.ooc -repMatch=100
 #Wrote 7657 overused 11-mers to /cluster/bluearc/dm3/11.ooc
     # More than twice as many as dm2... probably due to chrUextra.
 
     # 6/19/07 -- That seems to be adversely affecting genbank alignments,
     # so bump up repMatch to get a .ooc count similar to dm2's 3031.
     # repMatch  #overused
     # 100       7657
     # 150       3709
     # 165       3152 <-- close enough
     # 180       2688
     # 200       2242
     blat /cluster/data/dm3/dm3.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=/cluster/bluearc/dm3/11.scaled.ooc -repMatch=165
 #Wrote 3152 overused 11-mers to /cluster/bluearc/dm3/11.scaled.ooc
     # MarkD did a test run, results look good, so I'll replace the 
     # original .ooc.  Also, Mark requested moving the ooc to the same
     # dir as the genbank .2bit (iscratch).
     mv /cluster/bluearc/dm3/11.scaled.ooc /cluster/bluearc/dm3/11.ooc
     foreach i (1 2 3 4 6 7)
       rsync -av /cluster/bluearc/dm3/11.ooc kkr${i}u00:/iscratch/i/dm3/
     end
     # Edited makeDb/genbank/etc/genbank.conf to specify new ooc location.
 
 
 #########################################################################
 # GENBANK AUTO UPDATE (DONE 10/9/06 angie - REDONE 6/20/07 markd)
 # 6/20/07 - MarkD rebuilt with regenerated 11.ooc.
     # Make a liftAll.lft that specifies 5M chunks for genbank:
     ssh kkstore05
     cd /cluster/data/dm3
     ~/kent/src/utils/simplePartition.pl dm3.2bit 5000000 /tmp/dm3P
     cat /tmp/dm3P/*/*.lft > jkStuff/liftAll.lft
     rm -r /tmp/dm3P
 
     # align with latest genbank process.
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # edit etc/genbank.conf to add dm3 just after dm2...
 
 # dm3 (D. melanogaster)
 dm3.serverGenome = /cluster/data/dm3/dm3.2bit
 dm3.clusterGenome = /iscratch/i/dm3/dm3.2bit
 dm3.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
 dm3.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
 dm3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
 dm3.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
 dm3.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
 dm3.ooc = /iscratch/i/dm3/11.ooc
 dm3.lift = /cluster/data/dm3/jkStuff/liftAll.lft
 dm3.genbank.mrna.xeno.load = yes
 dm3.downloadDir = dm3
 
     cvs ci -m "Added dm3." etc/genbank.conf
     # update /cluster/data/genbank/:
     make etc-update
 
     ssh kkstore02
     cd /cluster/data/genbank
     nice bin/gbAlignStep -initial dm3 &
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     nice ./bin/gbDbLoadStep -drop -initialLoad dm3 &
 
     # enable daily alignment and update of hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add dm3 to:
         etc/align.dbs
         etc/hgwdev.dbs 
     cvs commit
     make etc-update
 
     # 6/20/07 rebuild by MarkD (realign and reload native cDNAs using new 
     # ooc file):
     ssh kkstore02
     cd /cluster/data/genbank
     rm -f data/aligned/genbank.159.0/dm3/*/*native*
     rm -f data/aligned/refseq.23/dm3/*/*native*
 
     ./bin/gbAlignStep -initial -orgCat=native dm3
     ./bin/gbDbLoadStep -drop -initialLoad dm3
 
 
 #########################################################################
 # BACEND PAIRS TRACK (DONE 8/18/06 angie)
     ssh kkstore05
     # Plenty of room at the moment:
     df -h /cluster/data/dm3
 #                      4.5T  2.1T  2.4T  47% /cluster/store12
     mkdir -p /cluster/data/dm3/bed/cloneend/ncbi
     cd /cluster/data/dm3/bed/cloneend/ncbi
     wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/drosophila_melanogaster/\*
     # Wow... the directory exists, but contains no files.
 
     # Use the libs that Roger Hoskins pointed us to for dm2.
     mkdir /cluster/data/dm3/bed/bacends
     cd /cluster/data/dm3/bed/bacends
     wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL
     wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM
     faSize *
 #40016446 bases (3778317 N's 36238129 real 33260718 upper 2977391 lower) in 41500 sequences in 2 files
 #Total size: mean 964.3 sd 202.7 min 1 (AL0AA030DD12A1) max 1373 (AM0AA015AE07BP1) median 1009
 #N count: mean 91.0 sd 80.8
 #U count: mean 801.5 sd 207.4
 #L count: mean 71.7 sd 45.5
     cat banque_Projet_A* > bacends.fa
     # Edit the <!DOCTYPE.../HTML> part out of bacends.fa.
 
     # In dm2, we parsed bacEndPairs.txt out of sequence headers.
     # The sequences have not changed since then, so just copy over 
     # bacEndPairs.txt.
     cp /cluster/data/dm2/bed/bacends/bacEndPairs.txt .
 
     # Fly is much smaller than human so we can get away with one job 
     # per chrom, no splitting...
     ssh kki
     mkdir /cluster/bluearc/dm3/bacends
     cd /cluster/bluearc/dm3/bacends
     cp /cluster/data/dm3/bed/bacends/bacends.fa .
     mkdir out
     echo bacends.fa > bacends.lst
     awk '{print "/iscratch/i/dm3/dm3.2bit:" $1}' /cluster/data/dm3/chrom.sizes \
       > dm3.lst
 cat > gsub << 'EOF'
 #LOOP
 /cluster/bin/x86_64/blat -noHead $(path1) $(path2) -ooc=/cluster/bluearc/dm3/11.ooc {check out exists out/$(file1).$(root2).psl}
 #ENDLOOP
 'EOF'
     # << for emacs
     gensub2 dm3.lst bacends.lst gsub jobList
     para make jobList
     para time
 #Completed: 15 of 15 jobs
 #CPU time in finished jobs:       1721s      28.68m     0.48h    0.02d  0.000 y
 #IO & Wait Time:                    23s       0.39m     0.01h    0.00d  0.000 y
 #Average job time:                 116s       1.94m     0.03h    0.00d
 #Longest finished job:             744s      12.40m     0.21h    0.01d
 #Submission to last job:           744s      12.40m     0.21h    0.01d
 
     # Back on kolossus:
     cd /cluster/bluearc/dm3/bacends
     cat out/*.psl \
     | pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
         stdin bacEnds.unpaired.psl /dev/null
     # Roger's suggested 50kb-250kb range lost some (especially on the short 
     # side) in dm2, so I ended up going with Terry's range for human, 35kb-350kb:
 # NOTE FOR NEXT TIME: Use Roger's range -- physical sizes don't exceed 250kb!
     pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
              -slopval=10000 -hardMax=500000 \
              -slop -short -long -orphan -mismatch -verbose \
       bacEnds.unpaired.psl /cluster/data/dm3/bed/bacends/bacEndPairs.txt \
       all_bacends bacEnds
     wc -l bacEnds.*
 #      17 bacEnds.long
 #      96 bacEnds.mismatch
 #    6935 bacEnds.orphan
 #   11957 bacEnds.pairs
 #      28 bacEnds.short
 #     120 bacEnds.slop
 #  186593 bacEnds.unpaired.psl
 
     # Filter by score and sort by {chrom,chromStart}:
     awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
     cat bacEnds.{slop,short,long,mismatch,orphan} \
     | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
     wc -l *.bed
 #  11938 bacEndPairs.bed
 #   4824 bacEndPairsBad.bed
 
     # Filter the alignments into those we'll need in the all_bacends table:
     extractPslLoad -noBin bacEnds.unpaired.psl \
       bacEndPairs.bed bacEndPairsBad.bed \
     | sorttbl tname tstart | headchg -del \
     > bacEnds.load.psl
     mv * /cluster/data/dm3/bed/bacends/
 
     # load into database
     ssh hgwdev
     cd /cluster/data/dm3/bed/bacends
     # load BAC end sequences
     mkdir -p /gbdb/dm3/bacends
     ln -s /cluster/data/dm3/bed/bacends/bacends.fa /gbdb/dm3/bacends/
     hgLoadSeq dm3 /gbdb/dm3/bacends/bacends.fa
     hgLoadBed dm3 bacEndPairs bacEndPairs.bed \
       -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql 
     # note - this track isn't pushed to RR, just used for assembly QA
     hgLoadBed dm3 bacEndPairsBad bacEndPairsBad.bed \
       -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
     hgLoadPsl dm3 -table=all_bacends bacEnds.load.psl
 
     # Compares favorably to previous release? (good cov up, bad cov down):
     # Well, this is way up:
     featureBits dm3 all_bacends
 #32765057 bases of 162367812 (20.180%) in intersection
     featureBits dm2 all_bacends
 #22221379 bases of 131698467 (16.873%) in intersection
     # The percentage dropped but I think that's because of the increase in 
     # assembly size (yep, once again blame it on chrUextra :).  The absolute
     # count is up which seems good.
     featureBits dm3 bacEndPairs
 #15484492 bases of 162367812 (9.537%) in intersection
     featureBits dm2 bacEndPairs
 #14771250 bases of 131698467 (11.216%) in intersection
 
     # Clean up.
     rm bed.tab
     rm -r out
     rm -r /cluster/bluearc/dm3/bacends/
 
 # end BACEND PAIRS TRACK
 
 
 #########################################################################
 # GC5BASE (DONE 8/18/06 angie)
 # In the future this will be included in the makeGenomeDb.pl run.
     # Make gc5Base wiggle files.
     ssh kolossus
     mkdir -p /cluster/data/dm3/bed/gc5Base
     cd /cluster/data/dm3/bed/gc5Base
     hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 dm3 \
       /cluster/data/dm3/dm3.unmasked.2bit \
     | wigEncode stdin gc5Base.{wig,wib}
 
     ssh hgwdev
     # Load gc5base
     mkdir -p /gbdb/dm3/wib
     rm -f /gbdb/dm3/wib/gc5Base.wib
     ln -s /cluster/data/dm3/bed/gc5Base.wib /gbdb/dm3/wib
     hgLoadWiggle -pathPrefix=/gbdb/dm3/wib \
       dm3 gc5Base /cluster/data/dm3/bed/gc5Base/gc5Base.wig
 
 
 #########################################################################
 # BLASTZ/CHAIN/NET DROSIM1 (DONE 8/18/06 angie)
     mkdir /cluster/data/dm3/bed/blastz.droSim1.2006-08-18
     cd /cluster/data/dm3/bed/blastz.droSim1.2006-08-18
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. simulans
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. simulans
 SEQ2_DIR=/san/sanvol1/scratch/droSim1/droSim1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droSim1/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droSim1.2006-08-18
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \
       -blastzOutRoot /san/sanvol1/scratch/dm3droSim1 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L chainDroSim1Link
 #21667700 bases of 23011344 (94.161%) in intersection
 # dm2: 91.756%
     rm -f /cluster/data/dm3/bed/blastz.droSim1
     ln -s blastz.droSim1.2006-08-18 /cluster/data/dm3/bed/blastz.droSim1
 
 
 #########################################################################
 # BLASTZ/CHAIN/NET DROYAK2 (DONE 8/18/06 angie)
     mkdir /cluster/data/dm3/bed/blastz.droYak2.2006-08-18
     cd /cluster/data/dm3/bed/blastz.droYak2.2006-08-18
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. yakuba
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. yakuba
 SEQ2_DIR=/san/sanvol1/scratch/droYak2/droYak2.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droYak2/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droYak2.2006-08-18
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \
       -blastzOutRoot /san/sanvol1/scratch/dm3droYak2 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L chainDroYak2Link
 #21965072 bases of 23011344 (95.453%) in intersection
 # dm2: 91.943%
     rm -f /cluster/data/dm3/bed/blastz.droYak2
     ln -s blastz.droYak2.2006-08-18 /cluster/data/dm3/bed/blastz.droYak2
 
 
 #########################################################################
 # BLASTZ/CHAIN/NET DROSEC1 (DONE 8/21/06 angie)
     mkdir /cluster/data/dm3/bed/blastz.droSec1.2006-08-18
     cd /cluster/data/dm3/bed/blastz.droSec1.2006-08-18
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. sechellia
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. sechellia
 SEQ2_DIR=/san/sanvol1/scratch/droSec1/droSec1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droSec1/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droSec1.2006-08-18
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \
       -blastzOutRoot /san/sanvol1/scratch/dm3droSec1 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L chainDroSec1Link
 #21913800 bases of 23011344 (95.230%) in intersection
 # dm2: 92.860%
     rm -f /cluster/data/dm3/bed/blastz.droSec1
     ln -s blastz.droSec1.2006-08-18 /cluster/data/dm3/bed/blastz.droSec1
 
 
 #########################################################################
 # BLASTZ/CHAIN/NET DROERE1 (DONE 8/22/06 angie)
     mkdir /cluster/data/dm3/bed/blastz.droEre1.2006-08-21
     cd /cluster/data/dm3/bed/blastz.droEre1.2006-08-21
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. sechellia
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. sechellia
 SEQ2_DIR=/san/sanvol1/scratch/droEre1/droEre1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droEre1/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droEre1.2006-08-21
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \
       -blastzOutRoot /san/sanvol1/scratch/dm3droEre1 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L chainDroEre1Link
 #21706597 bases of 23011344 (94.330%) in intersection
 # dm2: 91.103%
     rm -f /cluster/data/dm3/bed/blastz.droEre1
     ln -s blastz.droEre1.2006-08-21 /cluster/data/dm3/bed/blastz.droEre1
 
 
 # BLASTZ/CHAIN/NET DROANA2 (DONE 8/22/06 angie)
     mkdir /cluster/data/dm3/bed/blastz.droAna2.2006-08-22
     cd /cluster/data/dm3/bed/blastz.droAna2.2006-08-22
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. ananassae
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. ananassae
 SEQ2_DIR=/san/sanvol1/scratch/droAna2/droAna2.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droAna2/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droAna2.2006-08-22
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -bigClusterHub pk -smallClusterHub pk \
       -blastzOutRoot /san/sanvol1/scratch/dm3droAna2 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L chainDroAna2Link
 #19329381 bases of 23011344 (83.999%) in intersection
 #dm3: 81.385%
     rm -f /cluster/data/dm3/bed/blastz.droAna2
     ln -s blastz.droAna2.2006-08-22 /cluster/data/dm3/bed/blastz.droAna2
 
 
 #########################################################################
 # LIFTOVER FLYBASE 4.2 ANNOTATIONS FROM DM1 (TEMPORARY) (DONE 8/18/06 angie)
     # Until FlyBase releases Release 5 annotations, liftOver'd 4.2 genes 
     # will be better than nothing.
     ssh hgwdev
     mkdir /cluster/data/dm3/bed/flybase4.2.liftOver
     cd /cluster/data/dm3/bed/flybase4.2.liftOver
     hgsql dm2 -N -e 'select * from flyBaseGene' > dm2.flyBaseGene.tab
     hgsql dm2 -N -e 'select * from flyBaseNoncoding' > dm2.flyBaseNoncoding.tab
     liftOver -genePred dm2.flyBaseGene.tab \
       /cluster/data/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \
       flyBaseLiftGene.tab flyBaseLiftGene.unmapped.tab
     liftOver -bedPlus=12 -hasBin dm2.flyBaseNoncoding.tab \
       /cluster/data/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \
       flyBaseLiftNoncoding.tab flyBaseLiftNoncoding.unmapped.tab
     # Not quite perfect but not bad:
     wc -l *.tab
 #  19178 dm2.flyBaseGene.tab
 #    727 dm2.flyBaseNoncoding.tab
 #  19173 flyBaseLiftGene.tab
 #     10 flyBaseLiftGene.unmapped.tab
 #    727 flyBaseLiftNoncoding.tab
 #      0 flyBaseLiftNoncoding.unmapped.tab
     ldHgGene -predTab dm3 flyBaseLiftGene flyBaseLiftGene.tab
     hgLoadBed -hasBin dm3 flyBaseLiftNoncoding flyBaseLiftNoncoding.tab
     # Copy pep table from dm2:
     hgPepPred dm3 tab flyBaseLiftGenePep \
       /cluster/data/dm2/bed/flybase4.2/flyBasePep.tab
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROSIM1 (DONE 11/14/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.droSim1.2006-11-14
     cd /cluster/data/dm3/bed/blastz.droSim1.2006-11-14
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. simulans
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. simulans
 SEQ2_DIR=/san/sanvol1/scratch/droSim1/droSim1.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droSim1/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droSim1.2006-11-14
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droSim1 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroSim1Link
 #flyBaseLiftGene 22.721%, chainDroSim1Link 94.107%, both 22.094%, cover 97.24%, enrich 1.03x
     rm -f /cluster/data/dm3/bed/blastz.droSim1
     ln -s blastz.droSim1.2006-11-14 /cluster/data/dm3/bed/blastz.droSim1
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROSEC1 (DONE 11/29/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.droSec1.2006-11-28
     cd /cluster/data/dm3/bed/blastz.droSec1.2006-11-28
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. sechellia
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. sechellia
 SEQ2_DIR=/san/sanvol1/scratch/droSec1/droSec1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/san/sanvol1/scratch/droSec1/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droSec1.2006-11-28
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droSec1 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroSec1Link
 #flyBaseLiftGene 22.721%, chainDroSec1Link 95.197%, both 22.491%, cover 98.99%, enrich 1.04x
     rm -f /cluster/data/dm3/bed/blastz.droSec1
     ln -s blastz.droSec1.2006-11-28 /cluster/data/dm3/bed/blastz.droSec1
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROYAK2 (DONE 11/15/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.droYak2.2006-11-15
     cd /cluster/data/dm3/bed/blastz.droYak2.2006-11-15
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. yakuba
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. yakuba
 SEQ2_DIR=/san/sanvol1/scratch/droYak2/droYak2.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droYak2/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droYak2.2006-11-15
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droYak2 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroYak2Link
 #flyBaseLiftGene 22.721%, chainDroYak2Link 95.403%, both 22.489%, cover 98.98%, enrich 1.04x
     rm -f /cluster/data/dm3/bed/blastz.droYak2
     ln -s blastz.droYak2.2006-11-15 /cluster/data/dm3/bed/blastz.droYak2
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROPER1 (DONE 11/20/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.droPer1.2006-11-15
     cd /cluster/data/dm3/bed/blastz.droPer1.2006-11-15
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. persimilis
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. persimilis
 SEQ2_DIR=/san/sanvol1/scratch/droPer1/droPer1.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/san/sanvol1/scratch/droPer1/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droPer1.2006-11-15
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droPer1 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroPer1Link
 #flyBaseLiftGene 22.721%, chainDroPer1Link 76.570%, both 20.929%, cover 92.11%, enrich 1.20x
     rm -f /cluster/data/dm3/bed/blastz.droPer1
     ln -s blastz.droPer1.2006-11-15 /cluster/data/dm3/bed/blastz.droPer1
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROERE2 (DONE 11/30/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.droEre2.2006-11-29
     cd /cluster/data/dm3/bed/blastz.droEre2.2006-11-29
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. erecta
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. erecta
 SEQ2_DIR=/san/sanvol1/scratch/droEre2/droEre2.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droEre2/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droEre2.2006-11-29
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -bigClusterHub pk -smallClusterHub pk \
       -blastzOutRoot /cluster/bluearc/dm3droEre2 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroEre2Link
 #flyBaseLiftGene 22.721%, chainDroEre2Link 94.227%, both 22.472%, cover 98.90%, enrich 1.05x
     rm -f /cluster/data/dm3/bed/blastz.droEre2
     ln -s blastz.droEre2.2006-11-29 /cluster/data/dm3/bed/blastz.droEre2
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROGRI2 (DONE 12/1/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.droGri2.2006-11-30
     cd /cluster/data/dm3/bed/blastz.droGri2.2006-11-30
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. grimshawi
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. grimshawi
 SEQ2_DIR=/san/sanvol1/scratch/droGri2/droGri2.2bit
 SEQ2_CHUNK=50000000
 SEQ2_LAP=10000
 SEQ2_LEN=/san/sanvol1/scratch/droGri2/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droGri2.2006-11-30
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -bigClusterHub pk -smallClusterHub pk \
       -blastzOutRoot /cluster/bluearc/dm3droGri2 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroGri2Link
 #flyBaseLiftGene 22.721%, chainDroGri2Link 66.170%, both 20.274%, cover 89.23%, enrich 1.35x
     rm -f /cluster/data/dm3/bed/blastz.droGri2
     ln -s blastz.droGri2.2006-11-30 /cluster/data/dm3/bed/blastz.droGri2
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROWIL1 (DONE 12/4/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.droWil1.2006-12-04
     cd /cluster/data/dm3/bed/blastz.droWil1.2006-12-04
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. willistoni
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. willistoni
 SEQ2_DIR=/san/sanvol1/scratch/droWil1/droWil1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/san/sanvol1/scratch/droWil1/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droWil1.2006-12-04
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -bigClusterHub pk -smallClusterHub pk \
       -blastzOutRoot /cluster/bluearc/dm3droWil1 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroWil1Link
 #flyBaseLiftGene 22.721%, chainDroWil1Link 73.133%, both 20.583%, cover 90.59%, enrich 1.24x
     rm -f /cluster/data/dm3/bed/blastz.droWil1
     ln -s blastz.droWil1.2006-12-04 /cluster/data/dm3/bed/blastz.droWil1
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROANA3 (DONE 12/4/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.droAna3.2006-12-04
     cd /cluster/data/dm3/bed/blastz.droAna3.2006-12-04
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. ananassae
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. ananassae
 SEQ2_DIR=/san/sanvol1/scratch/droAna3/droAna3.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/san/sanvol1/scratch/droAna3/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droAna3.2006-12-04
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -bigClusterHub pk -smallClusterHub pk \
       -blastzOutRoot /cluster/bluearc/dm3droAna3 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroAna3Link
 #flyBaseLiftGene 22.721%, chainDroAna3Link 83.841%, both 21.651%, cover 95.29%, enrich 1.14x
     rm -f /cluster/data/dm3/bed/blastz.droAna3
     ln -s blastz.droAna3.2006-12-04 /cluster/data/dm3/bed/blastz.droAna3
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROMOJ3 (DONE 12/4/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.droMoj3.2006-12-04
     cd /cluster/data/dm3/bed/blastz.droMoj3.2006-12-04
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. mojavensis
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm3/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. mojavensis
 SEQ2_DIR=/iscratch/i/droMoj3/droMoj3.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droMoj3/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droMoj3.2006-12-04
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -blastzOutRoot /panasas/store/dm3droMoj3 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroMoj3Link
 #flyBaseLiftGene 22.721%, chainDroMoj3Link 65.253%, both 20.171%, cover 88.78%, enrich 1.36x
     rm -f /cluster/data/dm3/bed/blastz.droMoj3
     ln -s blastz.droMoj3.2006-12-04 /cluster/data/dm3/bed/blastz.droMoj3
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROVIR3 (DONE 12/4/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.droVir3.2006-12-04
     cd /cluster/data/dm3/bed/blastz.droVir3.2006-12-04
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. virilis
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm3/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. virilis
 SEQ2_DIR=/iscratch/i/droVir3/droVir3.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droVir3/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.droVir3.2006-12-04
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -blastzOutRoot /cluster/bluearc/dm3droVir3 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroVir3Link
 #flyBaseLiftGene 22.721%, chainDroVir3Link 67.299%, both 20.340%, cover 89.52%, enrich 1.33x
     rm -f /cluster/data/dm3/bed/blastz.droVir3
     ln -s blastz.droVir3.2006-12-04 /cluster/data/dm3/bed/blastz.droVir3
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DP4 (DONE 12/4/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.dp4.2006-12-04
     cd /cluster/data/dm3/bed/blastz.dp4.2006-12-04
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. pseudoobscura
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - D. pseudoobscura
 SEQ2_DIR=/san/sanvol1/scratch/dp4/dp4.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/dp4/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.dp4.2006-12-04
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -blastzOutRoot /san/sanvol1/scratch/dm3dp4 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDp4Link
 #flyBaseLiftGene 22.721%, chainDp4Link 76.299%, both 20.966%, cover 92.28%, enrich 1.21x
     rm -f /cluster/data/dm3/bed/blastz.dp4
     ln -s blastz.dp4.2006-12-04 /cluster/data/dm3/bed/blastz.dp4
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET ANOGAM1 (DONE 12/5/06)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.anoGam1.2006-12-04
     cd /cluster/data/dm3/bed/blastz.anoGam1.2006-12-04    
     cat <<_EOF_ > DEF
 # D. melanogaster vs. A. gambiae
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm3/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - A. gambiae
 SEQ2_DIR=/iscratch/i/anoGam1/nib
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/anoGam1/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.anoGam1.2006-12-04
 _EOF_
 
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -blastzOutRoot /cluster/bluearc/dm3anoGam1 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainAnoGam1Link
 #flyBaseLiftGene 22.721%, chainAnoGam1Link 19.536%, both 12.711%, cover 55.94%, enrich 2.86x
     rm -f /cluster/data/dm3/bed/blastz.anoGam1
     ln -s blastz.anoGam1.2006-12-04 /cluster/data/dm3/bed/blastz.anoGam1
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET APIMEL3 (DONE 7/5/07 except for loading of netApiMel3)
     ssh kkstore05
     mkdir /cluster/data/dm3/bed/blastz.apiMel3.2006-12-11
     cd /cluster/data/dm3/bed/blastz.apiMel3.2006-12-11    
     cat <<_EOF_ > DEF
 # D. melanogaster vs. A. mellifera
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - A. mellifera
 SEQ2_DIR=/san/sanvol1/scratch/apiMel3/apiMel3.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/apiMel3/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.apiMel3.2006-12-11
 _EOF_
 
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -bigClusterHub pk -smallClusterHub pk \
       -blastzOutRoot /cluster/bluearc/dm3apiMel3 >& do.log &
     tail -f do.log
 # Died in loading phase -- netClass failed because there is no apiMel3 db!
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainApiMel3Link
 #flyBaseLiftGene 22.721%, chainApiMel3Link 17.865%, both 8.790%, cover 38.68%, enrich 2.17x
     rm -f /cluster/data/dm3/bed/blastz.apiMel3
     ln -s blastz.apiMel3.2006-12-11 /cluster/data/dm3/bed/blastz.apiMel3
     # 7/5/07: continue to make download files.
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -bigClusterHub pk -smallClusterHub pk \
       -blastzOutRoot /cluster/bluearc/dm3apiMel3 \
       -continue download >>& do.log &
     tail -f do.log
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET TRICAS2 (DONE 12/11/06)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/blastz.triCas2.2006-12-11
     cd /cluster/data/dm3/bed/blastz.triCas2.2006-12-11    
     cat <<_EOF_ > DEF
 # D. melanogaster vs. T. castaneum
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm3/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm3/chrom.sizes
 
 # QUERY - T. castaneum
 SEQ2_DIR=/iscratch/i/triCas2/triCas2.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/iscratch/i/triCas2/chrom.sizes
 
 BASE=/cluster/data/dm3/bed/blastz.triCas2.2006-12-11
 _EOF_
 
     doBlastzChainNet.pl DEF -chainMinScore 3000 \
       -blastzOutRoot /cluster/bluearc/dm3triCas2 >& do.log &
     tail -f do.log
     featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainTriCas2Link
 #flyBaseLiftGene 22.721%, chainTriCas2Link 20.039%, both 10.463%, cover 46.05%, enrich 2.30x
     rm -f /cluster/data/dm3/bed/blastz.triCas2
     ln -s blastz.triCas2.2006-12-11 /cluster/data/dm3/bed/blastz.triCas2
 
 
 ###########################################################################
 # MULTIZ15WAY (DONE 12/11/06 angie)
 # (((((((((dm3,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel3),triCas2)
     ssh kkstore04
     mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11
     cd /cluster/data/dm3/bed/multiz15way.2006-12-11
 
     # copy MAF's to cluster-friendly server
     mkdir /san/sanvol1/scratch/dm3/mafNet
     foreach s (droSim1 droSec1 droYak2 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 anoGam1 apiMel3 triCas2)
       echo $s
       rsync -av /cluster/data/dm3/bed/blastz.$s/mafNet/* \
         /san/sanvol1/scratch/dm3/mafNet/$s/
     end
 
     echo '(((((((((dm3,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel3),triCas2)' \
       > tree-commas.nh
     sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
     sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
 
     ssh pk
     cd /cluster/data/dm3/bed/multiz15way.2006-12-11
     mkdir maf run
     cd run
 
     # stash binaries 
     mkdir penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
 
 cat > autoMultiz.csh << 'EOF'
 #!/bin/csh -ef
     set db = dm3
     set c = $1
     set maf = $2
     set run = `pwd`
     set tmp = /scratch/tmp/$db/multiz.$c
     set pairs = /san/sanvol1/scratch/$db/mafNet
     rm -fr $tmp
     mkdir -p $tmp
     cp ../{tree.nh,species.lst} $tmp
     pushd $tmp
     foreach s (`cat species.lst`)
         set in = $pairs/$s/$c.maf
         set out = $db.$s.sing.maf
         if ($s == $db) then
             continue
         endif
         if (-e $in.gz) then
             zcat $in.gz > $out
         else if (-e $in) then
             cp $in $out
         else
             echo "##maf version=1 scoring=autoMZ" > $out
         endif
     end
     set path = ($run/penn $path); rehash
     $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
     popd
     cp $tmp/$c.maf $maf
     rm -fr $tmp
 'EOF'
     # << emacs
     chmod +x autoMultiz.csh
 
 cat  << 'EOF' > spec
 #LOOP
 ./autoMultiz.csh $(root1) {check out line+ /cluster/data/dm3/bed/multiz15way.2006-12-11/maf/$(root1).maf}
 #ENDLOOP
 'EOF'
     # << emacs
 
     awk '{print $1}' /cluster/data/dm3/chrom.sizes > chrom.lst
     gensub2 chrom.lst single spec jobList
     para make jobList
     para time
 #Completed: 15 of 15 jobs
 #CPU time in finished jobs:      44158s     735.97m    12.27h    0.51d  0.001 y
 #IO & Wait Time:                   320s       5.33m     0.09h    0.00d  0.000 y
 #Average job time:                2965s      49.42m     0.82h    0.03d
 #Longest finished job:            9309s     155.15m     2.59h    0.11d
 #Submission to last job:          9309s     155.15m     2.59h    0.11d
 
 
 # ANNOTATE MULTIZ15WAY MAF AND LOAD TABLES (DONE 12/11/2006 angie)
     ssh kolossus
     mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11/anno
     cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno
     mkdir maf run
     cd run
     rm -f sizes nBeds
     foreach db (`cat /cluster/data/dm3/bed/multiz15way.2006-12-11/species.lst`)
       ln -s  /cluster/data/$db/chrom.sizes $db.len
       if (! -e /cluster/data/$db/$db.N.bed) then
         twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
       endif
       ln -s  /cluster/data/$db/$db.N.bed $db.bed
       echo $db.bed  >> nBeds
       echo $db.len  >> sizes
     end
     echo date > jobs.csh
     # do smaller jobs first:
     foreach f (`ls -1rS ../../maf/*.maf`)
       echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
         /cluster/data/dm3/dm3.2bit ../maf/`basename $f` >> jobs.csh
     end 
     echo date >> jobs.csh
     csh -efx jobs.csh >&! jobs.log & tail -f jobs.log
     # 5min.
 
     # Load anno/maf
     ssh hgwdev
     cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf
     mkdir -p /gbdb/dm3/multiz15way/anno/maf
     ln -s /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf/*.maf \
       /gbdb/dm3/multiz15way/anno/maf
     date
     nice hgLoadMaf -pathPrefix=/gbdb/dm3/multiz15way/anno/maf dm3 multiz15way
     date
     # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
     # machine and then load on hgwdev:
     ssh kolossus
     cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf
     cat *.maf \
     | nice hgLoadMafSummary dm3 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
              -test multiz15waySummary stdin
 #Created 171317 summary blocks from 13563512 components and 1633505 mafs from stdin
     ssh hgwdev
     cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf
     sed -e 's/mafSummary/multiz15waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
       > /tmp/multiz15waySummary.sql
     time nice hgLoadSqlTab dm3 multiz15waySummary /tmp/multiz15waySummary.sql \
       multiz15waySummary.tab
 #0.000u 0.002s 0:02.59 0.0%      0+0k 0+0io 2pf+0w
     rm *.tab
     ln -s multiz15way.2006-12-11 /cluster/data/dm3/bed/multiz15way
 
 
 # MULTIZ15WAY DOWNLOADABLES (ANNOTATED) (DONE 12/11/2006 angie -- UPSTREAM REDONE 5/18/08, 10/27/08)
 # 5/18/08: regenerating upstream* to fix bug (truncated refGene names to
 # NM_[0-9]{6} but there are some valid NM_[0-9]{9} IDs now).
 # 10/27/08: regenerating with FlyBase Genes and mafFrags -txStarts.
     # Annotated MAF is now documented, so use anno/maf for downloads.
     ssh hgwdev
     mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads
     cd /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads
     # upstream mafs 
 cat > mafFrags.csh << 'EOF'
     date
     foreach i (1000 2000 5000)
         echo "making upstream$i.maf"
         nice featureBits dm3 flyBaseGene:upstream:$i -fa=/dev/null -bed=stdout \
         | sed -re 's/ [^\t]*\t/\t/;' > up.bed
         nice mafFrags -txStarts dm3 multiz15way up.bed upstream$i.maf \
                 -orgs=../species.lst
         rm up.bed
     end
     date
 'EOF'
     # << emacs
     time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
 #171.365u 94.950s 21:16.74 20.8% 0+0k 0+0io 0pf+0w
 
     ssh kolossus
     cd /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads
 cat > downloads.csh << 'EOF'
     date
     foreach f (../anno/maf/chr*.maf)
         set c = $f:t:r
         echo $c
         nice gzip -c $f > $c.maf.gz
     end
     md5sum *.gz > md5sum.txt
     date
 'EOF'
     # << emacs
     time csh -efx downloads.csh >&! downloads.log
 #459.706u 5.793s 8:23.71 92.4%   0+0k 0+0io 0pf+0w
     nice gzip up*.maf
     nice md5sum up*.maf.gz >> md5sum.txt
     # Make a README.txt .
 
     ssh hgwdev
     set dir = /usr/local/apache/htdocs/goldenPath/dm3/multiz15way
     mkdir $dir
     ln -s /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads/*.{gz,txt} $dir
 
 
 # MULTIZ15WAY MAF FRAMES (DONE 12/11/2006 angie)
     ssh hgwdev
     mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11/frames
     cd /cluster/data/dm3/bed/multiz15way.2006-12-11/frames
     # The following is adapted from MarkD's Makefile used for mm7...
 
     #------------------------------------------------------------------------
     # get the genes for all genomes (when available)
     # currently we have nothing for:
     # droSec1 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 triCas2
 
     # mRNAs with CDS.  single select to get cds+psl, then split that up and
     # create genePred
     # using mrna table as genes: droSim1 droYak2 anoGam1
     mkdir genes
     foreach queryDb (droSim1 droYak2 anoGam1)
       set tmpExt = `mktemp temp.XXXXXX`
       set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
       set tmpMrna = ${queryDb}.mrna.${tmpExt}
       set tmpCds = ${queryDb}.cds.${tmpExt}
       echo $queryDb
       hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
                    from all_mrna,gbCdnaInfo,cds \
                    where (all_mrna.qName = gbCdnaInfo.acc) and \
                      (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
        ${queryDb} > ${tmpMrnaCds}
       cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
       cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
       mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
         stdout \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/$queryDb.tmp.gz
       rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
       mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
       rm -f $tmpExt
     end
     # using refGene for dm3
     # genePreds; (must keep only the first 10 columns for knownGene)
     set gpFields = 'name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds'
     foreach queryDb (dm3)
       if ($queryDb == "dm3") then
         set geneTbl = refGene
       endif
       hgsql -N -e "select $gpFields from $geneTbl" ${queryDb} \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/$queryDb.tmp.gz
       mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
       rm -f $tmpExt
     end
 
     # MarkD's 1-pass maf methodology
     ssh kolossus
     cd /cluster/data/dm3/bed/multiz15way/frames
     nice tcsh # easy way to get process niced
     (cat  ../anno/maf/*.maf \
      | time genePredToMafFrames dm3 stdin stdout \
          dm3 genes/dm3.gp.gz \
          droSim1 genes/droSim1.gp.gz droYak2 genes/droYak2.gp.gz \
          anoGam1 genes/anoGam1.gp.gz \
      | gzip -c > multiz15way.mafFrames.gz) >& frames.log
 
     ssh hgwdev
     cd /cluster/data/dm3/bed/multiz15way/frames
     hgLoadMafFrames dm3 multiz15wayFrames multiz15way.mafFrames.gz
 
 
 # PHASTCONS 15WAY (DONE 12/11/06 angie)
 # (((((((((dm3,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel3),triCas2)
     # Estimation for dm2 phastCons w/same tree took a long time, and didn't
     # lead to any real change in params (thank god no iteration req'd).
     # So skip estimation and use the tree from dm2.
     ssh kkstore04
     mkdir -p /san/sanvol1/scratch/dm3/chrom
     foreach chr (`awk '{print $1;}' /cluster/data/dm3/chrom.sizes`)
       twoBitToFa -seq=$chr /cluster/data/dm3/dm3.2bit \
         /san/sanvol1/scratch/dm3/chrom/$chr.fa
     end
     # Split chrom fa into smaller windows for phastCons:
     ssh pk
     mkdir /cluster/data/dm3/bed/multiz15way/phastCons
     mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.split
     cd /cluster/data/dm3/bed/multiz15way/phastCons/run.split
     set WINDOWS = /san/sanvol1/scratch/dm3/phastCons/WINDOWS
     rm -fr $WINDOWS
     mkdir -p $WINDOWS
     cat << 'EOF' > doSplit.sh
 #!/bin/csh -ef
 
 set PHAST=/cluster/bin/phast
 set FA_SRC=/san/sanvol1/scratch/dm3/chrom
 set WINDOWS=/san/sanvol1/scratch/dm3/phastCons/WINDOWS
 
 set maf=$1
 set c = $maf:t:r
 set tmpDir = /scratch/msa_split/$c
 rm -rf $tmpDir
 mkdir -p $tmpDir
 ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
    -O dm3,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel3,triCas2 \
    -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
 cd $tmpDir
 foreach file ($c.*.ss)
   gzip -c $file > ${WINDOWS}/$file.gz
 end
 rm -f $tmpDir/$c.*.ss
 rmdir $tmpDir
 'EOF'
 # << for emacs
     chmod a+x doSplit.sh
     rm -f jobList
     foreach file (/cluster/data/dm3/bed/multiz15way/maf/*.maf)
       if (-s $file) then
         echo "doSplit.sh {check in exists+ $file}" >> jobList
       endif
     end
     para make jobList
     para time
 #Completed: 15 of 15 jobs
 #CPU time in finished jobs:       1728s      28.80m     0.48h    0.02d  0.000 y
 #IO & Wait Time:                  1578s      26.30m     0.44h    0.02d  0.000 y
 #Average job time:                 220s       3.67m     0.06h    0.00d
 #Longest finished job:            1946s      32.43m     0.54h    0.02d
 #Submission to last job:          1962s      32.70m     0.55h    0.02d
 
     ############### SKIP PARAMETER ESTIMATION #############
     mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.estimate
     cd /cluster/data/dm3/bed/multiz15way/phastCons/run.estimate
     foreach mod (/cluster/data/dm2/bed/multiz15way/phastCons/run.estimate/ave*.mod)
       sed -e 's/dm2/dm3/g; s/apiMel2/apiMel3/g;' $mod > $mod:t
     end
 
     ############### COMPUTE SCORES AND ELEMENTS #############
     ssh pk
     mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.phast
     cd /cluster/data/dm3/bed/multiz15way/phastCons/run.phast
     cat << 'EOF' > doPhastCons.sh
 #!/bin/csh -ef
 set pref = $1:t:r:r
 set chr = `echo $pref | awk -F\. '{print $1}'`
 set tmpfile = /scratch/phastCons.$$
 zcat $1 \
 | /cluster/bin/phast/phastCons - \
     ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
     --expected-lengths 23.8 --target-coverage 0.393 \
     --quiet --seqname $chr --idpref $pref \
     --viterbi /san/sanvol1/scratch/dm3/phastCons/ELEMENTS/$pref.bed --score \
     --require-informative 0 \
   > $tmpfile
 gzip -c $tmpfile > /san/sanvol1/scratch/dm3/phastCons/POSTPROBS/$pref.pp.gz
 rm $tmpfile
 'EOF'
 # << for emacs
     chmod a+x doPhastCons.sh
     rm -fr /san/sanvol1/scratch/dm3/phastCons/{POSTPROBS,ELEMENTS}
     mkdir -p /san/sanvol1/scratch/dm3/phastCons/{POSTPROBS,ELEMENTS}
     rm -f jobList
     foreach f (/san/sanvol1/scratch/dm3/phastCons/WINDOWS/*.ss.gz)
       echo doPhastCons.sh $f >> jobList
     end
     para make jobList
     para time
 #Completed: 178 of 178 jobs
 #CPU time in finished jobs:       1836s      30.60m     0.51h    0.02d  0.000 y
 #IO & Wait Time:                 16855s     280.92m     4.68h    0.20d  0.001 y
 #Average job time:                 105s       1.75m     0.03h    0.00d
 #Longest finished job:             544s       9.07m     0.15h    0.01d
 #Submission to last job:           579s       9.65m     0.16h    0.01d
 
     # back on kolossus:
     # combine predictions and transform scores to be in 0-1000 interval
     cd /cluster/data/dm3/bed/multiz15way/phastCons
     awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
       /san/sanvol1/scratch/dm3/phastCons/ELEMENTS/*.bed \
     | /cluster/bin/scripts/lodToBedScore > all.bed
 
     ssh hgwdev
     # Now measure coverage of CDS by conserved elements. 
     # We want the "cover" figure to be close to 68.9%.
     cd /cluster/data/dm3/bed/multiz15way/phastCons
     featureBits -enrichment dm3 flyBaseLiftGene:cds all.bed
     # 69.26... not exactly our 68.9% target, but close enough.
 #flyBaseLiftGene:cds 13.517%, all.bed 32.328%, both 9.361%, cover 69.26%, enrich 2.14x
 
     # Having met the CDS coverage target, load up the results.
     hgLoadBed dm3 phastConsElements15way all.bed
 
     # Create wiggle
     ssh pk
     mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.wib
     cd /cluster/data/dm3/bed/multiz15way/phastCons/run.wib
     rm -rf /san/sanvol1/scratch/dm3/phastCons/wib
     mkdir -p /san/sanvol1/scratch/dm3/phastCons/wib
     cat << 'EOF' > doWigEncode
 #!/bin/csh -ef
 set chr = $1
 cd /san/sanvol1/scratch/dm3/phastCons/wib
 zcat `ls -1 /san/sanvol1/scratch/dm3/phastCons/POSTPROBS/$chr.*.pp.gz \
       | sort -t\. -k2,2n` \
 | wigEncode stdin ${chr}_phastCons.wi{g,b}
 'EOF'
 # << for emacs
     chmod a+x doWigEncode
     rm -f jobList
     foreach chr (`ls -1 /san/sanvol1/scratch/dm3/phastCons/POSTPROBS \
                   | awk -F\. '{print $1}' | sort -u`)
       echo doWigEncode $chr >> jobList
     end
     para make jobList
     para time
 #Completed: 15 of 15 jobs
 #CPU time in finished jobs:         92s       1.53m     0.03h    0.00d  0.000 y
 #IO & Wait Time:                    71s       1.19m     0.02h    0.00d  0.000 y
 #Average job time:                  11s       0.18m     0.00h    0.00d
 #Longest finished job:              24s       0.40m     0.01h    0.00d
 #Submission to last job:            24s       0.40m     0.01h    0.00d
 
     # back on kkstore04, copy wibs, wigs and POSTPROBS (people sometimes want 
     # the raw scores) from san/sanvol1
     cd /cluster/data/dm3/bed/multiz15way/phastCons
     rm -rf wib POSTPROBS
     rsync -av /san/sanvol1/scratch/dm3/phastCons/wib .
     rsync -av /san/sanvol1/scratch/dm3/phastCons/POSTPROBS .
 
     # load wiggle component of Conservation track
     ssh hgwdev
     mkdir /gbdb/dm3/multiz15way/wib
     cd /cluster/data/dm3/bed/multiz15way/phastCons
     chmod 775 . wib
     chmod 664 wib/*.wib
     ln -s `pwd`/wib/*.wib /gbdb/dm3/multiz15way/wib/
     hgLoadWiggle dm3 phastCons15way \
       -pathPrefix=/gbdb/dm3/multiz15way/wib wib/*.wig
     rm wiggle.tab
 
     # and clean up san/sanvol1.
     rm -r /san/sanvol1/scratch/dm3/phastCons/{ELEMENTS,POSTPROBS,wib}
     rm -r /san/sanvol1/scratch/dm3/chrom
     # Offer raw scores for download since fly folks are likely to be interested:
     ssh kkstore04
     cd /cluster/data/dm3/bed/multiz15way/phastCons/POSTPROBS
     mkdir ../postprobsDownload
     foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
       zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
         > ../postprobsDownload/$chr.pp.gz
     end
     cd ../postprobsDownload
     md5sum *.gz > md5sum.txt
     # Make a README.txt there too.
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/dm3/phastCons15way
     cd /usr/local/apache/htdocs/goldenPath/dm3/phastCons15way
     ln -s /cluster/data/dm3/bed/multiz15way/phastCons/postprobsDownload/* .
 
     # make a tree picture using phyloGif.
     cd /cluster/data/dm3/bed/multiz15way
     cp tree-commas.nh tree-commas-orgNames.nh
     # Edit tree-commas-orgNames.nh to use org names.
     /usr/local/apache/cgi-bin/phyloGif -phyloGif_tree=tree-commas-orgNames.nh \
       -phyloGif_width=260 -phyloGif_height=480 > dm3_15way.gif
     cp -p dm3_15way.gif ~/browser/images/phylo/dm3_15way.gif
     # Check in browser/images/phylo/dm3_15way.gif.  In a clean ~/browser:
     cd ~/browser
     make alpha
 
 
 #########################################################################
 # FLYBASE 5.1 ANNOTATIONS (DONE 4/6/07 angie)
 # OBSOLETED -- see FLYBASE 5.3 ANNOTATIONS BELOW
     ssh kkstore05
     mkdir /cluster/data/dm3/bed/flybase5.1
     cd /cluster/data/dm3/bed/flybase5.1
     wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.1/gff/dmel-all-r5.1.gff.gz
     gunzip dmel-all-r5.1.gff.gz
     # What data sources are represented in this file?
     grep -v '^#' dmel-all-r5.1.gff | awk '{print $2 "\t" $3;}' | sort | uniq -c
     # excerpt (many other sources, including blastx:... , sim4:... and 
     # tblastx:...; also various other types for source "FlyBase" and "."):
   67107 FlyBase CDS
   65481 FlyBase exon
   14601 FlyBase gene
   19781 FlyBase mRNA
      92 FlyBase miRNA
      95 FlyBase ncRNA
      51 FlyBase pseudogene
      98 FlyBase rRNA
      47 FlyBase snRNA
      65 FlyBase snoRNA
     314 FlyBase tRNA
    6003 FlyBase transposable_element
   37267 FlyBase transposable_element_insertion_site
     # Very similar to 4.3 except no more ". transcription_start_site".
 
     # What keywords are defined in the 9th field?  
     # Oh dear, 4.3 had HTML-escaped special characters but 5.1 has 
     # unescaped ; willy-nilly in there with the ; which separate var=val's.
 #    | perl -wpe 's/&\w\w\w\w?;//g; s/=[^;]+;/\n/g; s/=.*$//;' \
     grep -v '^#' dmel-all-r5.1.gff \
     | awk '{print $9;}' \
     | perl -wpe 'while (s/(\w+)=//) {print "$1\n";} s/^.*\n$//;' \
     | sort | uniq -c
   17604 Alias
  375522 Dbxref
 5523024 ID
 5521150 Name
 3415862 Parent
 3962606 Target
   18400 associated_with
   15770 cyto_range
   37956 derived_cyto_location
   15600 gbunit
   12101 programversion
   11795 putative_ortholog_of
   13174 to_species
     # Notable differences from 4.3: no more Synonym; a lot fewer Alias,
     # a *lot* more Dbxref; some keywords have disappeared but not ones 
     # we use.  
 
     # Copy over 4.3 script and modify to fit the latest Immortal Unchanging
     # Widely Adopted Standard GFF3.
     # Alias no longer holds the CG*/CR* ID's; now it's Dbxref, with a 
     # prefix that must be stripped.
     # Two pseudogenes do not have CR*; must get those from gene line.
     # 'dmel_mitochondrion_genome' --> 'chrM'
     cp /cluster/data/dm2/bed/flybase4.3/extractGenes.pl .
     extractGenes.pl dmel-all-r5.1.gff
     mv flyBaseGene.src=FlyBase.gtf flyBaseGene.gtf
 
     # Get predicted proteins (for main annotations only)
     wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.1/fasta/dmel-all-translation-r5.1.fasta.gz
     # Use the FBgn IDs in headers to retrieve CG-R's to match flyBaseGene:
     zcat dmel-all-translation-r5.1.fasta.gz \
     | perl -we 'open(X, "flyBase2004Xref.tab") || die; \
                 while (<X>) { \
                   @w = split("\t"); \
                   $fbtr2cgr{$w[3]} = $w[0]; \
                 } \
                 while (<>) { \
                   if (/^>.*parent=FBgn\d+,(FBtr\d+)/) { \
                     $cgr = $fbtr2cgr{$1}; \
                     if ($cgr) { \
                       s/^>.*/>$cgr/; \
                     } else { die; } \
                   } \
                   print; \
                 } \
                ' \
       > flyBasePep.fa
 
     ssh hgwdev
     cd /cluster/data/dm3/bed/flybase5.1
     # Protein-coding genes:
     ldHgGene -gtf dm3 flyBaseGene flyBase*.gtf
 #  19781 groups 7 seqs 1 sources 2 feature types
     hgPepPred dm3 generic flyBasePep flyBasePep.fa
     # Noncoding genes:
     hgLoadBed dm3 flyBaseNoncoding flyBaseNoncoding.bed
 #Loaded 762 elements of size 12
     # Cross-referencing info for both coding and noncoding:
     hgLoadSqlTab dm3 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
       flyBase2004Xref.tab
 
     # Make sure there are no joinerCheck errors at this point, because 
     # if left here they can spread via doHgNearBlastp:
     runJoiner.csh dm3 flyBasePep
 
     # See if there are still large regions where refGene has genes but
     # flybase doesn't:
     featureBits dm3 refGene \!flyBaseGene -minSize=1000 -bed=stdout
     # Not so bad now -- some cases where sequence might be duplicated and 
     # we make different calls about where the exons land, but no huge 
     # missing regions of regular chroms (just no annotations on chr*Het).
 
     # add upstream* downloadable files
     ssh hgwdev
     mkdir /cluster/data/dm3/bed/flybase5.1/bigZips
     cd /cluster/data/dm3/bed/flybase5.1/bigZips
     foreach size (1000 2000 5000)
       echo upstream$size
       nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \
       | nice gzip -c > upstream$size.fa.gz
     end
     nice md5sum *.gz > md5sum.txt
     ln -s /cluster/data/dm3/bed/flybase5.1/bigZips/*.gz \
       /usr/local/apache/htdocs/goldenPath/dm3/bigZips/
     cat md5sum.txt \
       >> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt
 
 
 #########################################################################
 # CHROMOSOME BANDS (DONE (5.1) 4/5/07 angie - REDONE (5.3) 9/18/07)
 # 10/21/08 (5.12): generated files, but no change since 5.3, so I did not
 # reload the tables.
     ssh hgwdev
     cd /cluster/data/dm3/bed/flybase5.3
     grep chromosome_band dmel-all-r5.3.gff \
     | perl -wpe 'chomp; @w = split; \
                  $w[3]--; $w[3] = 0 if ($w[3] < 0); \
                  $w[8] =~ s/^.*Name=band-(\w+)(;.*)?$/$1/; \
                  if ($w[8] !~ /^\d+[A-Z]\d+$/) { s/^.*$//; next; } \
                  if ($w[4] <= 0) { s/^.*$//; } else { \
                  s/^.*$/chr$w[0]\t$w[3]\t$w[4]\t$w[8]\tn\/a\n/; }' \
       > cytoBand.txt
     hgLoadSqlTab dm3 cytoBand ~/kent/src/hg/lib/cytoBand.sql cytoBand.txt
     # Remove the items that are off the end of their chroms:
     perl -wpe 's/^(\w+)\t(\d+)$/ \
       delete from cytoBand where chrom="$1" and chromStart >= $2; \
       update cytoBand set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \
       ../../chrom.sizes \
     | hgsql dm3
     checkTableCoords -verbose=2 dm3 cytoBand
 
     # As of 5.3, the GFF now includes what we need to make cytoBandIdeo
     # directly:
     grep chromosome_band dmel-all-r5.3.gff \
     | perl -wpe 'chomp; @w = split; \
                  $w[3]--; $w[3] = 0 if ($w[3] < 0); \
                  $w[8] =~ s/^.*Name=band-(\w+)(;.*)?$/$1/; \
                  if ($w[8] !~ /^\d+[A-Z]$/) { s/^.*$//; next; } \
                  if ($w[4] <= 0) { s/^.*$//; } else { \
                  s/^.*$/chr$w[0]\t$w[3]\t$w[4]\t$w[8]\tn\/a\n/; }' \
       > cytoBandIdeo.txt
     hgLoadSqlTab dm3 cytoBandIdeo ~/kent/src/hg/lib/cytoBandIdeo.sql \
       cytoBandIdeo.txt
     # Remove the items that are off the end of their chroms:
     perl -wpe 's/^(\w+)\t(\d+)$/ \
       delete from cytoBandIdeo where chrom="$1" and chromStart >= $2; \
       update cytoBandIdeo set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \
       ../../chrom.sizes \
     | hgsql dm3
     checkTableCoords -verbose=2 dm3 cytoBandIdeo
 
 
 #########################################################################
 # FLYBASE ACODE CROSS-REFERENCING DATA (N/A 4/5/07 angie)
     # Hmmm, it seems that flybase no longer offser acode.  No dice on
     # ftping from the old location
     # (ftp://flybase.bio.indiana.edu/flybase/genes/genes.txt -- not 
     # flybase.net either)
     # or googling site:flybase.net acode.
     # Looks like Chado XML is what is offered now.  And an interrupted 
     # fetch of it is hanging my FireFox.
     # When it recovers, I guess I could try downloading it properly 
     # and seeing if it has the same info that hgFlyBase used to parse 
     # out of acode.
 ftp://ftp.flybase.net/releases/FB2006_01/chado-xml/chado_genes.xml.gz
     # This might be smaller...
 ftp://ftp.flybase.net/releases/FB2006_01/dmel_r5.1/chado-xml/chado_dmel_genes.xml.gz
 
 
 #########################################################################
 # MAP UNIPROT IDS TO CG*-R* IDS
 # FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07)
 # Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations.
 # FlyBase 5.3 history: (DONE 9/20/07 angie)
 # FlyBase 5.12 history: (DONE 10/24/08 angie)
     # In dm2, I combined three sources: specific transcript mappings 
     # from uniProt.description, less specific gene mappings from 
     # uniProt.description, and fbUniProt mappings extracted from 
     # FlyBase acode.  We don't have flyBase acode anymore, so combine 
     # just the first two (uniProt) sources.
     ssh hgwdev
     mkdir /cluster/data/dm3/bed/uniProtToFlyBase
     cd /cluster/data/dm3/bed/uniProtToFlyBase
     set taxon = `hgsql -N uniProt -e 'select id from taxon where binomial = "Drosophila melanogaster";'`
     # Grab UniProt descriptions of all D. mel. proteins so we can parse out
     # CG*-R* transcript IDs (and CG* gene IDs when -R* not specified).
     hgsql -N uniProt -e \
      'select description.* from description,accToTaxon where \
         description.val not like "CG% (Fragment)." and \
         accToTaxon.taxon = '$taxon' and accToTaxon.acc = description.acc' \
       > uniProt.description.txt
     # Grab the complete list of CG*-R* transcript IDs:
     hgsql -N dm3 -e 'select name from flyBaseGene' | sort > transcriptIds.txt
     ssh kolossus
     cd /cluster/data/dm3/bed/uniProtToFlyBase
     # Wrote a perl script, gleanDescription.pl, to parse the various patterns
     # used to encode real mappings (not mappings to homologs or things that 
     # look like CG*-P? IDs but aren't really).  Update the dm2 copy to handle
     # the new CG\d+-R\w\w transcripts:
     cp /cluster/data/dm2/bed/uniProtToFlyBase/gleanDescription.pl .
     # Loosened a couple regexes in gleanDescription.pl for 5.12.
     ./gleanDescription.pl uniProt.description.txt | sort > uniProtMapping.txt
     # Look for duplicates (if we didn't trim the "(Fragment)." entries above,
     # there would be 82!  But now we have only one.
     foreach cg (`cut -f 1 uniProtMapping.txt | uniq -c \
                  | awk '$1 != 1 {print $2;}' | sed -e 's/-R/-P/'`)
       echo $cg
       grep $cg uniProt.description.txt
       echo ""
     end
 #CG9339-PH
 #Q6Q4F0  CG9339-PH.
 #Q9VIH7  CG9339-PA, isoform A (CG9339-PB, isoform B) (CG9339-PH, isoform H) (LD10117p).
     # I looked up both Q6Q4F0 and Q9VIH7 on http://www.pir.uniprot.org/;
     # Q9VIH7 is a bit longer, so I'll arbitrarily go with that one and
     # delete the Q6Q4F0 mapping from uniProtMapping.txt.
 
     # Wrote a perl script, vagueDescription.pl, to parse out CG* without 
     # isoform specs -- can use those as backups when we don't have a more 
     # specific mapping for a transcript.  (updated from dm2)
     cp /cluster/data/dm2/bed/uniProtToFlyBase/vagueDescription.pl .
     ./vagueDescription.pl uniProt.description.txt | sort \
       > uniProtMappingVague.txt
     # acode is no longer available, but UniProt mappings can be gleaned from
     # FlyBase GFF3.  
     perl -we 'while (<>) { chomp; @w = split; \
         next unless ($w[8] && $w[1] eq "FlyBase" && $w[2] eq "gene" && \
                      $w[8] =~ /FlyBase_Annotation_IDs:(CG\d+)/); \
         $cg = $1; \
         while ($w[8] =~ s/^.*?UniProt\/[-\w]+:(\w+)//) { \
           print "$cg\t$1\n"; \
         } \
       }' ../flybase5.12/dmel-all-r5.12.gff \
       > flyBaseCgToUniProt.txt
     # Using dm2 perl script, combineMappings.pl, to add any additional mappings
     # available from acode to the UniProt mappings.  Since the second arg
     # is vague-gene and third arg is vague-transcript, feed both vague-gene
     # files in as the second input:
     cat uniProtMappingVague.txt flyBaseCgToUniProt.txt \
     | /cluster/data/dm2/bed/uniProtToFlyBase/combineMappings.pl transcriptIds.txt \
       uniProtMapping.txt - /dev/null \
       > flyBaseToUniProt.txt
     # Make a 2-column version for loading -- and add back in the transcripts for
     # which we couldn't find a UniProt acc, because they have to have something in
     # flyBaseToUniProt in order for hgNear's oneGeneQuery to work!:
     join -t '	' -a 1 -e n/a -o 1.1,2.2 transcriptIds.txt flyBaseToUniProt.txt \
       > flyBaseToUniProtLoad.txt
     # Load into a genericAlias-type table:
     ssh hgwdev
     cd /cluster/data/dm3/bed/uniProtToFlyBase
     sed -e 's/genericAlias/flyBaseToUniProt/' \
       ~/kent/src/hg/lib/genericAlias.sql > flyBaseToUniProt.sql
     hgLoadSqlTab dm3 flyBaseToUniProt flyBaseToUniProt.sql \
       flyBaseToUniProtLoad.txt
 
 
 #########################################################################
 # HGNEAR TABLES
 
 # HGNEAR PROTEIN BLAST TABLES 
 # FlyBase 5.1 history: (dm3.* DONE 4/6/07 - dm3.* REDONE 6/18/07,
 # *.dmBlastTab DONE 7/18/07)
 # Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations.
 # FlyBase 5.3 history: (DONE 9/18/07)
 # FlyBase 5.12 history: (DONE 10/21/08)
     ssh hgwdev
     mkdir /cluster/data/dm3/bed/hgNearBlastp
     cd /cluster/data/dm3/bed/hgNearBlastp
     mkdir 081021
     cd 081021
     # Get the proteins used by the other hgNear organisms:
     pepPredToFa hg18 knownGenePep hg18.known.faa
     pepPredToFa mm9 knownGenePep mm9.known.faa
     pepPredToFa rn4 knownGenePep rn4.known.faa
     pepPredToFa danRer3 ensPep danRer3.ensPep.faa
     pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
     pepPredToFa ce6 sangerPep ce6.sangerPep.faa
     pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa
 
     # Galt's synBlastp.csh filtering, which uses liftOver chains,
     # would be the next step if dm3 were reasonably closely related
     # (like mammal-mammal) to another Gene Sorter organism.  But it is 
     # pretty distant from all the others, so specify recipBest for all
     # query databases.
     cat << _EOF_ > config.ra
 # Latest fly vs. other Gene Sorter orgs:
 # human, mouse, rat, zebrafish, worm, yeast
 
 targetGenesetPrefix flyBase
 targetDb dm3
 queryDbs  hg18 mm9 rn4 danRer3 ce6 sacCer1
 recipBest hg18 mm9 rn4 danRer3 ce6 sacCer1
 
 dm3Fa     /cluster/data/dm3/bed/hgNearBlastp/081021/dm3.flyBasePep.faa
 hg18Fa    /cluster/data/dm3/bed/hgNearBlastp/081021/hg18.known.faa
 mm9Fa     /cluster/data/dm3/bed/hgNearBlastp/081021/mm9.known.faa
 rn4Fa     /cluster/data/dm3/bed/hgNearBlastp/081021/rn4.known.faa
 danRer3Fa /cluster/data/dm3/bed/hgNearBlastp/081021/danRer3.ensPep.faa
 ce6Fa     /cluster/data/dm3/bed/hgNearBlastp/081021/ce6.sangerPep.faa
 sacCer1Fa /cluster/data/dm3/bed/hgNearBlastp/081021/sacCer1.sgdPep.faa
 
 buildDir /cluster/data/dm3/bed/hgNearBlastp/081021
 scratchDir /san/sanvol1/scratch/dm3HgNearBlastp
 _EOF_
     # Run with -noLoad so we can eyeball files, manually load dm3 tables now,
     # and later overload other databases' dmBlastTab tables.
     doHgNearBlastp.pl -noLoad config.ra >& do.log &
     tail -f do.log
     # Ran the load scripts for dm3 tables manually as suggested by the
     # output of doHgNearBlastp.pl:
 # *** -noLoad was specified -- you can run this script manually to load dm3 tables:
         run.dm3.dm3/loadPairwise.csh
 #Loading database with 904530 rows
 # *** -noLoad was specified -- you can run these scripts manually to load dm3 tables:
         run.dm3.hg18/loadPairwise.csh
 #Loading database with 5946 rows
         run.dm3.mm9/loadPairwise.csh
 #Loading database with 5906 rows
         run.dm3.rn4/loadPairwise.csh
 #Loading database with 3058 rows
         run.dm3.danRer3/loadPairwise.csh
 #Loading database with 5328 rows
         run.dm3.ce6/loadPairwise.csh
 #Loading database with 4665 rows
         run.dm3.sacCer1/loadPairwise.csh
 #Loading database with 2205 rows
 
      # 7/18/07,9/18/07,10/21/08: dm3 is on RR, so time to load*.dmBlastTab:
 # *** -noLoad was specified -- you can run these scripts manually to load dmBlastTab in query databases:
         run.hg18.dm3/loadPairwise.csh
 #Loading database with 5946 rows
         run.mm9.dm3/loadPairwise.csh
 #Loading database with 5906 rows
         run.rn4.dm3/loadPairwise.csh
 #Loading database with 3058 rows
         run.danRer3.dm3/loadPairwise.csh
 #Loading database with 5328 rows
         run.ce6.dm3/loadPairwise.csh
 #Loading database with 4665 rows
         run.sacCer1.dm3/loadPairwise.csh
 #Loading database with 2205 rows
 
 
 # CLUSTER GENES AND MAP TO OTHER DATA
 # FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07)
 # Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations.
 # FlyBase 5.3 history: (DONE 9/20/07 angie)
 # FlyBase 5.12 history: (DONE 10/24/08 angie)
     ssh hgwdev
     hgClusterGenes dm3 flyBaseGene flyBaseIsoforms flyBaseCanonical \
       -protAccQuery='select name,alias from flyBaseToUniProt'
 #Got 13784 clusters, from 21236 genes in 15 chromosomes
     # Create table that maps between flyBase genes and RefSeq
     hgMapToGene dm3 refGene flyBaseGene flyBaseToRefSeq
     # Create table that maps between known genes and Pfam domains
     hgMapViaSwissProt dm3 flyBaseToUniProt name alias Pfam flyBaseToPfam
 
     # Create a table that maps between flyBase genes and the 
     # Stanford Microarray Project expression data. (see makeHgFixed.doc)
     hgsql dm3 -e 'drop table if exists flyBaseToCG; create table flyBaseToCG \
       select name,SUBSTRING_INDEX(name,"-",1) as value from flyBaseGene'
     hgsql dm3 -e 'create index name on flyBaseToCG(name(12))'
     nice hgExpDistance dm3 hgFixed.arbFlyLifeMedianRatio dummyArg \
       arbExpDistance -lookup=flyBaseToCG
 
 
 # MAKE A DESCRIPTION TABLE FOR HGNEAR KNOWNDETAILS
 # FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07)
 # Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations.
 # FlyBase 5.3 history: (DONE 9/20/07 angie)
 # FlyBase 5.12 history: (DONE 10/24/08 angie)
     # hgNear's knownDetails column type expects a single table that links 
     # transcript ID with uniProt description, so whip one up here:
     # (used by hgGene too though hgGene could join on the fly)
     ssh hgwdev
     hgsql dm3 -e 'drop table if exists flyBaseToDescription; \
       create table flyBaseToDescription \
       select flyBaseToUniProt.name,uniProt.description.val \
       from flyBaseToUniProt,uniProt.description \
       where flyBaseToUniProt.alias = uniProt.description.acc'
     hgsql dm3 -e 'create index name on flyBaseToDescription(name(12))'
 
 
 # FLYP2P
 # FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07)
 # Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations.
 # FlyBase 5.3 history: (hgLoadNetDist REDONE 9/18/07 angie)
 # FlyBase 5.12 history: (hgLoadNetDist REDONE 10/21/08 angie)
     # See hg/near/makeNear.doc...
     mkdir /cluster/data/dm3/bed/p2p
     cd /cluster/data/dm3/bed/p2p
     # Watch out... hgLoadNetDist overwrites and removes flyP2P.tab!
     # so rename here:
     cp -p /cluster/data/dm1/p2p/flyP2P.tab flyP2P.raw.txt
     ssh -x kolossus hgNetDist /cluster/data/dm3/bed/p2p/{flyP2P.raw.txt,tmp.tab}
     hgLoadNetDist tmp.tab dm3 flyP2P \
       -sqlRemap='select fbgn,name from flyBase2004Xref'
 #hgLoadNetDist 512 id-remapping misses, see missing.tab
     # That's OK, some genes were in dm1 but are not in dm3 (flybase 3 vs. 5)
 
 
 # MAKE ORGANISM-SPECIFIC HGNEARDATA AND HGGENEDATA FILES (DONE 4/6/07 angie)
     cd ~/kent/src/hg/near/hgNear/hgNearData
     # Made dm3 subdir of D_melanogaster with usual .ra files.
     # Similarly for hgGene/hgGeneData.
     # Make alpha on genome-test for hgNear, hgGene and trackDb DBS=dm3 .
 
 
 # ENABLE HGNEAR FOR DM3 IN HGCENTRALTEST (DONE 4/6/07 angie)
     hgsql -h genome-testdb hgcentraltest \
       -e  "update dbDb set hgNearOk = 1 where name = 'dm3';"
 
 # END OF HGNEAR STUFF
 #########################################################################
 
 
 #########################################################################
 # BDGP IN SITU IMAGE LINKS (DONE 4/7/06)
 # grabbed data again 10/24/08, no change.
     # HGGENE
     ssh hgwdev
     cd /cluster/data/dm3/bed/bdgpExprLink
     wget -O summary.txt.081024 \
     'http://www.fruitfly.org/cgi-bin/ex/bquery.pl?qpage=entry&qtype=summarytext'
     # remove header line.
     hgLoadSqlTab dm3 bdgpExprLink ~/kent/src/hg/lib/bdgpExprLink.sql \
       summary.txt.070406
 
 
 #########################################################################
 # "FLYBASE 5.1" DHGP HETEROCHROMATIN ANNOTATIONS (DONE 6/18/07 angie)
 # OBSOLETED by load of FlyBase 5.3 genes 9/18/07.
 # These heterochromatin annotations have just been published in Science
 # http://www.sciencemag.org/cgi/content/full/316/5831/1586
 # -- future updates will be from FlyBase, but for now the GFF is in the
 # paper's Supplemental Data!
     ssh kkstore05
     mkdir /cluster/data/dm3/bed/flybase5.1/Het
     cd /cluster/data/dm3/bed/flybase5.1/Het
     wget ftp://ftp.dhgp.org/pub/DHGP/Science_2007_Supplemental_Data/Supplemental_File_B_GFF/R5_all_GFF_results.tar.gz
     foreach chr (2LHet 2RHet 3LHet 3RHet U Uextra XHet YHet)
       wget ftp://ftp.dhgp.org/pub/DHGP/Science_2007_Supplemental_Data/Supplemental_File_C_FASTA/${chr}_translation_dmel_RELEASE5-1.FASTA
     end
     tar xvzf R5_all_GFF_results.tar.gz
     # Of course they invented a completely new encoding of names and 
     # properties into GFF 9th col and FASTA headers.  So instead of 
     # attempting to reuse the flybase-parsing script, just pick out
     # the rows we need and parse each row type...  we will not get
     # enough info to complete flyBase2004Xref, but c'est la vie.
     # There are files for non-het chroms... I took a look at 2L_annotation
     # and its annotations seem to be relative to chr2L:22000975... adding
     # that offset, almost all of the ones I checked appear in our current
     # dm3 flyBaseGene.  So I'll just use the Het files and ignore the others.
     foreach type (3_UTR 5_UTR CDS CDS_exon annotation exon transcript)
       echo $type
       awk '{print $3;}' {*Het,U*}_${type}_dmel* | sort | uniq -c
     end
     # Looks like the file-types and $3-types we need are these:
     # CDS_exon   CDS_exon
     # annotation exon, gene, ncRNA, pseudogene, rRNA
     # I will exclude RR* (repeat region)
     # I'll build up a .gff type by type:
     cp /dev/null extract.gff
     awk '$9 !~ /^name=RR/ && $3 == "CDS_exon" {print;}' {*Het,U*}_CDS_exon*GFF \
     | perl -wpe 's/^/chr/; s/CDS_exon/CDS/; \
                  s/name=\w+:\d+_(C\w\d+-R[A-Z]+).*/$1/ || die "$_";' \
       >> extract.gff
     awk '$9 !~ /^genegrp=RR/ && $3 == "exon" {print;}' {*Het,U*}_annotation*GFF \
     | perl -wpe 's/^/chr/; \
                  s/genegrp=\w+; transgrp=(C\w\d+-R[A-Z]+).*/$1/ || die "$_";' \
       >> extract.gff
     # All of the rows break down into either CG* IDs (coding) or CR* (nonc):
     wc -l extract.gff 
 #3245 extract.gff
     awk '$9 ~ /^CR/ {print;}' extract.gff | wc -l
 #261
     awk '$9 ~ /^CG/ {print;}' extract.gff | wc -l
 #2984
     expr 261 + 2984
 #3245
     # Break out coding and noncoding, and sort for readability:
     awk '$9 ~ /^CR/ {print;}' extract.gff \
     | sort -k 9,9 -k 1,1 -k 4n,4n \
     > flyBaseNoncodingHet.gff
     awk '$9 ~ /^CG/ {print;}' extract.gff \
     | sort -k 9,9 -k 1,1 -k 4n,4n \
     > flyBaseGeneHet.gff
 
     # gff -> genePred -> bed for noncoding
     ldHgGene dummy dummy flyBaseNoncodingHet.gff -nobin -out=stdout \
     | genePredToBed \
     | awk 'BEGIN {OFS="\t";} $7 == 0 && $8 == 0 {$7 = $2; $8 = $2;} {print;}' \
       > flyBaseNoncodingHet.bed
 #Read 127 transcripts in 261 lines in 1 files
 
     # Now get what we can get for flyBase2004Xref:
     awk '$3 ~ /^(gene|pseudogene|.*RNA)$/ {print;}' {*Het,U*}_annotation*GFF \
     | perl -pe 's/^\w+\tgadfly\t(pseudogene|\w+RNA)?(gene)?.*name=(\w+); symbol=([^;]*); (dbxref=FlyBase:(FBgn\d+))?.*$/$3\t$4\t\t\t$6\t\t\t$1/ \
                 || die "$_";\
                 s/^(\w+)\t\t/$1\t$1\t/;' \
     | sort -k 1,1 \
       > flyBase2004XrefHet.tab
     # That almost works but it needs C*-R* IDs and has only C*'s.
     # Join with the C*-R* IDs:
     awk '{print $9;}' fly*gff \
     | sort -u \
     | perl -wpe 's/^(\w+)/$1\t$1/;' \
     > cgTocgr.txt
     join -j 1 -a 2 -t'	' -o '1.2 2.2 2.3 2.4 2.5 2.6 2.7 2.8' \
       cgTocgr.txt flyBase2004XrefHet.tab \
     > tmp.tab
     mv tmp.tab flyBase2004XrefHet.tab
     # Uh-oh, some of these are already in flyBase2004Xref, so we'll get 
     # mysql errors (unique index) from trying to overload... and with
     # more of the fields filled in, so omit them from the new load.
     awk '{print $1;}' ../flyBase2004Xref.tab | sort > tmp1
     awk '{print $1;}' flyBase2004XrefHet.tab | sort > tmp2
     comm -1 -2 tmp1 tmp2
 #CG41431-RA
 #CG41432-RA
 #CG41434-RA
 #CG41436-RA
     # Only 4 such -- manually removed them from flyBase2004XrefHet.tab.
     perl -wpe 's/^>(\w+)-P(\w+)/>$1-R$2/' {*Het,U*}_*.FASTA \
       > flyBasePepHet.fa
     # Manually removed those 4 from the fa too.
     # Also removed this lone RR element from the fa: RR40941-RA
 
     # Now *additively* load the new data on top of the old:
     ssh hgwdev
     cd /cluster/data/dm3/bed/flybase5.1/Het
     # Protein-coding genes:
     ldHgGene -oldTable -nobin dm3 flyBaseGene flyBaseGeneHet.gff
 #Read 503 transcripts in 2984 lines in 1 files
 #  503 groups 7 seqs 1 sources 2 feature types
     cat ../flyBasePep.fa flyBasePepHet.fa \
     | hgPepPred dm3 generic flyBasePep stdin
     # Noncoding genes:
     hgLoadBed -oldTable dm3 flyBaseNoncoding flyBaseNoncodingHet.bed
 #Loaded 127 elements of size 12
     # Cross-referencing info for both coding and noncoding:
     hgLoadSqlTab -oldTable dm3 flyBase2004Xref \
       ~/kent/src/hg/lib/flyBase2004Xref.sql flyBase2004XrefHet.tab
 
     # Make sure there are no joinerCheck errors at this point, because 
     # if left here they can spread via doHgNearBlastp:
     runJoiner.csh dm3 flyBasePep
 
     # See if there are still large regions where refGene has genes but
     # flybase doesn't:
     featureBits dm3 refGene \!flyBaseGene -minSize=1000 -bed=stdout
     # better now that the Het's (except for Uextra) are included.
 
     # add upstream* downloadable files
     ssh hgwdev
     mkdir /cluster/data/dm3/bed/flybase5.1/bigZips
     cd /cluster/data/dm3/bed/flybase5.1/bigZips
     foreach size (1000 2000 5000)
       echo upstream$size
       nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \
       | nice gzip -c > upstream$size.fa.gz
     end
     nice md5sum *.gz > md5sum.txt
     ln -s /cluster/data/dm3/bed/flybase5.1/bigZips/*.gz \
       /usr/local/apache/htdocs/goldenPath/dm3/bigZips/
     cat md5sum.txt \
       >> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt
     # Edited out the old upstream* md5sums.
 
 
 #########################################################################
 # TWEAK DEFAULT POSITION (DONE 6/18/07 angie)
     ssh hgwdev
     hgsql hgcentraltest -e 'update dbDb set defaultPos="chr2L:826001-851000" where name = "dm3";'
 
 #########################################################################
 # ORegAnno - Open Regulatory Annotations 
 # update Oct 26, 2007; July 7, 2008
 # loaded by Belinda Giardine, in same manner as hg18 ORegAnno track
 
 
 #########################################################################
 # FLYBASE 5.3 ANNOTATIONS (DONE 9/17/07 angie)
 # OBSOLETED by load of FlyBase 5.12 genes 10/21/08.
     ssh kkstore05
     mkdir /cluster/data/dm3/bed/flybase5.3
     cd /cluster/data/dm3/bed/flybase5.3
     wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.3_FB2007_02/gff/dmel-all-r5.3.gff.gz
     gunzip dmel-all-r5.3.gff.gz
     # What data sources are represented in this file?
     grep -v '^#' dmel-all-r5.3.gff | awk '{print $2 "\t" $3;}' | sort | uniq -c
     # excerpt (many other sources, including blastx:... , sim4:... and 
     # tblastx:...; also various other types for source "FlyBase" and "."):
   69531 FlyBase CDS
   68426 FlyBase exon
   15185 FlyBase gene
   20738 FlyBase mRNA
      90 FlyBase miRNA
     105 FlyBase ncRNA
      94 FlyBase pseudogene
     164 FlyBase rRNA
      47 FlyBase snRNA
     249 FlyBase snoRNA
     314 FlyBase tRNA
    5385 FlyBase transposable_element
   38724 FlyBase transposable_element_insertion_site
     # Same types as 5.1, good.
 
     # What keywords are defined in the 9th field?  
     grep -v '^#' dmel-all-r5.3.gff \
     | awk '{print $9;}' \
     | perl -wpe 'while (s/(\w+)=//) {print "$1\n";} s/^.*\n$//;' \
     | sort | uniq -c
   17505 Alias
  412018 Dbxref
 6571566 ID
 6530043 Name
 4008974 Parent
 3048194 Target
   16921 cyto_range
     # Comparable to 5.1, good.
 
     cp ../flybase5.1/extractGenes.pl .
     # Mods to 5.1 script:
     # - so many new keywords, just say which ones we're ignoring instead
     #   of die-ing if we don't recognize a new one.
     # - protein line: Parent --> Derives_from.
     # - some ncRNA's have CG-*, not CR-*, IDs:
     time ./extractGenes.pl dmel-all-r5.3.gff
 #Noncoding FBtr0113460 has CG ID (CG33294-RB)
 #Noncoding FBtr0113490 has CG ID (CG34647-RA)
 #Noncoding FBtr0113491 has CG ID (CG34648-RA)
 #Noncoding FBtr0113492 has CG ID (CG34649-RA)
 #Keywords that have been ignored in one or more line types:
 #CDS: ID
 #CDS: Name
 #exon: Dbxref
 #exon: ID
 #exon: Name
 #gene: Alias
 #gene: Ontology_term
 #gene: cyto_range
 #gene: gbunit
 #mRNA: Alias
 #mRNA: Name
 #mRNA: score
 #mRNA: score_text
 #noncoding: Alias
 #noncoding: score
 #noncoding: score_text
 #protein: Alias
 #protein: Dbxref
 #protein: Name
 #protein: derived_isoelectric_point
 #protein: derived_molecular_weight
 #71.391u 1.296s 1:12.75 99.9%    0+0k 0+0io 0pf+0w
     mv flyBaseGene.src=FlyBase.gtf flyBaseGene.gtf
 
     # Get predicted proteins (for main annotations only)
     wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.3_FB2007_02/fasta/dmel-all-translation-r5.3.fasta.gz
     # Use the FBgn IDs in headers to retrieve CG-R's to match flyBaseGene:
     zcat dmel-all-translation-r5.3.fasta.gz \
     | perl -we 'open(X, "flyBase2004Xref.tab") || die; \
                 while (<X>) { \
                   @w = split("\t"); \
                   $fbtr2cgr{$w[3]} = $w[0]; \
                 } \
                 while (<>) { \
                   if (/^>.*parent=FBgn\d+,(FBtr\d+)/) { \
                     $cgr = $fbtr2cgr{$1}; \
                     if ($cgr) { \
                       s/^>.*/>$cgr/; \
                     } else { die; } \
                   } \
                   print; \
                 } \
                ' \
       > flyBasePep.fa
 
     ssh hgwdev
     cd /cluster/data/dm3/bed/flybase5.3
     # Protein-coding genes:
     ldHgGene -gtf dm3 flyBaseGene flyBase*.gtf
 #  20738 groups 14 seqs 1 sources 2 feature types
     hgPepPred dm3 generic flyBasePep flyBasePep.fa
     # Noncoding genes:
     hgLoadBed dm3 flyBaseNoncoding flyBaseNoncoding.bed
 #Loaded 1063 elements of size 12
     # Cross-referencing info for both coding and noncoding:
     hgLoadSqlTab dm3 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
       flyBase2004Xref.tab
 
     # Make sure there are no joinerCheck errors amongst those 3 tables
     # at this point, because if left here they can spread via doHgNearBlastp.
     # Since this is an update of those tables to a new FlyBase version,
     # and other hgNear tables already exist with the old version's set of 
     # IDs, ignore errors for missing CG items in tables that will be 
     # rebuilt in the hgNear process.:
     runJoiner.csh dm3 flyBasePep
 
     # See if there are still large regions where refGene has genes but
     # flybase doesn't:
     featureBits dm3 refGene \!flyBaseGene -minSize=1000 -bed=stdout
 #335940 bases of 162367812 (0.207%) in intersection
     # Spot-checked... some transcripts removed here and there, looks OK.
 
     # add upstream* downloadable files
     ssh hgwdev
     mkdir /cluster/data/dm3/bed/flybase5.3/bigZips
     cd /cluster/data/dm3/bed/flybase5.3/bigZips
     foreach size (1000 2000 5000)
       echo upstream$size
       nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \
       | nice gzip -c > upstream$size.fa.gz
     end
     nice md5sum *.gz > md5sum.txt
     ln -s /cluster/data/dm3/bed/flybase5.3/bigZips/*.gz \
       /usr/local/apache/htdocs/goldenPath/dm3/bigZips/
     # When updating, edit out the old upstream* sums from md5sum.txt, then:
     cat md5sum.txt \
       >> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt
 
 #############################################################################
 # N-SCAN GENES track ( markd)
 # create a composite track with exists ab-inito and new PASA N-SCAN predictions
 
     # download predictions
     cd /cluster/data/dm3/bed/nscan
     wget -nv http://mblab.wustl.edu/predictions/Drosophila/dm3/dm3.gtf
     wget -nv http://mblab.wustl.edu/predictions/Drosophila/dm3/dm3.prot.fa
     wget -nv http://mblab.wustl.edu/predictions/Drosophila/dm3/readme.txt
 
     bzip2 dm3.*
     chmod a-w *
 
     ldHgGene -gtf -genePredExt dm3 nscanPasaGene dm3.gtf.bz2
     hgPepPred dm3 generic nscanPasaPep  dm3.prot.fa.bz2
     rm *.tab
 
     # update trackDb; need a dm3-specific page to describe informants and PASA
     # make sure termRegex matches naming
     drosophila/dm3/nscanPasaGene.html
     drosophila/dm3/dm3/trackDb.ra
 
 #########################################################################
 
 
 ##########################################################################
 # EVOFOLD (Done, 10.02.2007) Jakob Skou Pedersen
 # RNA secondary structure predictions lifted from dm2 and filtered
   ssh -C hgwdev
   mkdir -p /cluster/data/dm3/bed/evofold
   cd /cluster/data/dm3/bed/evofold
   echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql dm2 | sed -e 1d > foldsDm2.bed
   liftOver -minMatch=1.0 foldsDm2.bed /cluster/data/dm2/bed/liftOver/dm2ToDm3.over.chain.gz tmp.bed unmapped.bed
   # remove elements which are wrong size after lifting
   awk '$3-$2 == $7' tmp.bed | sort -k4,4 > rawFoldsDm3.bed
 
   # structure filters
   # first, remove pairs that can't form in human 
   cut -f 1-6 rawFoldsDm3.bed > tmp.bed
   # sequenceForBed can be found and compiled from here: $HOME/kent/src/hg/altSplice/altSplice/
   nice /cluster/home/sugnet/bin/i386/sequenceForBed -db=dm3 -bedIn=tmp.bed -fastaOut=tmp.fa 
   cat tmp.fa | sed -e 's/\.[+-]\.chr.*$//' \
              | sed -e '/^>/s/$/\t/' | tr -d '\n' | sed -e 's/>/\n/g' | sed -e '1d' -e '$s/$/\n/' | sort -k1,1 > foldsDm3Seq.tab
   join -1 4 -2 1 -o "1.4 1.8 2.2" rawFoldsDm3.bed foldsDm3Seq.tab | sed -e 's/  */\t/g' | sort -k1,1 \
 	     | /cluster/home/jsp/scripts/tabFoldFilter.py > cleanFolds.tab
   join -1 4 -2 1 -o "1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.2 1.9" rawFoldsDm3.bed cleanFolds.tab | sed -e 's/  */\t/g' > tmp1.bed
   # second, remove poor predictions 
   # scripts can be found in cvs tree at: cvsroot/jsp/scripts/. They use a few modules which can be found at: cvsroot/jsp/py_modules
   cat tmp1.bed | /cluster/home/jsp/scripts/bedRnassFilter.py --dangling --minAvrStemSize=3 | /cluster/home/jsp/scripts/bedRnassFilter.sh 1 3 \
 	       | /cluster/home/jsp/scripts/roundListFloats.py -c9 > foldsDm3.bed
   # clean up
   rm tmp.bed tmp1.bed foldsDm2.bed foldsDm3Seq.tab rawFoldsDm3.bed tmp.fa cleanFolds.tab
   # comment: all the above filtering did not remove any
   # predictions... apparently dm2 and dm3 are quite similar.
 
   # upload
   hgLoadBed -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/evofold.sql dm3 evofold foldsDm3.bed
 
 #############################################################################
 # CONTRAST GENES (2007-10-02 markd)
 # recieved predictions from Sam Gross <ssgross@stanford.edu>
 
     cd /cluster/data/dm3/bed/contrastGene/
     wget http://www.stanford.edu/~ssgross/contrast.dm3.bed
     # this is a custom track, not a pure BED
     tail +2 contrast.dm3.bed | hgLoadBed -tab dm3 contrastGene stdin
 
     # verify 
     # load track db (ra and contrastGene.html are global
     # request push of contrastGene
 
 
 ###########################################################################
 # GENE DISRUPTION PROJECT/PSCREEN (DONE 5/1/08 angie)
     mkdir /cluster/data/dm3/bed/pscreen
     cd /cluster/data/dm3/bed/pscreen
     # Saved Excel spreadsheet emailed by Karen Schulze kschulze at bcm dot
     # tmc dot edu as tab-separated text.  Translate to our table format;
     dos2unix pscreen.txt
     tail +2 pscreen.txt \
     | perl -we ' \
         while (<>) { \
           chomp; \
           ($stock, $strain, $arm, $end, $ori, undef, undef, \
            $gene1, undef, $gene2, undef, $gene3) = split("\t"); \
           next if (! $end); \
           $chr = "chr" . $arm; \
           $strand = ($ori eq "Minus") ? "-" : "+"; \
           $geneCount = 0;  $geneIds = ""; \
           foreach $gene ($gene1, $gene2, $gene3) { \
             if ($gene) { $geneCount++;  $geneIds .= "$gene,"; } \
           } \
           print join("\t", $chr, $end-1, $end, $strain, 0, $strand, \
                            $stock, $geneCount, $geneIds) . "\n"; \
         }' \
     | sort -k1,1 -k2n,2n \
       > pscreen.bed
     hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pscreen.sql -notItemRgb \
        -onServer -tab dm3 pscreen pscreen.bed
 #Loaded 7672 elements of size 9
     rm bed.tab
 
 #########################################################################
 # hgPal downloads (DONE braney 2008-09-30)
     ssh hgwdev
     screen
     bash
     rm -rf /cluster/data/dm3/bed/multiz15way/pal
     mkdir /cluster/data/dm3/bed/multiz15way/pal
     cd /cluster/data/dm3/bed/multiz15way/pal
     echo $db > order.lst
     echo "select settings from trackDb where tableName='$mz'" | \
 	hgsql $db | sed 's/\\n/\n/g'| grep speciesOrder | tr ' ' '\n' | \
 	tail -n +2 >> order.lst
 
     mz=multiz15way
     gp=refGene
     db=dm3
 
     mkdir exonAA exonNuc ppredAA ppredNuc
     for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
     do
 	echo "date"
 	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
 	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
 	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
 	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
 	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
 	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
 	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
 	    gzip -c > exonAA/$j.exonAA.fa.gz"
     done > refGene.jobs
 
     time sh -x refGene.jobs > refGene.job.log 2>&1 & 
     sleep 1
     tail -f refGene.job.log
 
 # real    35m20.802s
 # user    5m26.007s
 # sys     1m25.462s
 
     zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
     zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
     zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
     zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
 
     rm -rf exonAA exonNuc ppredAA ppredNuc
 
     # we're only distributing exons at the moment
     pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
 
     mz=multiz15way
     gp=flyBaseGene
     db=dm3
     mkdir exonAA exonNuc ppredAA ppredNuc
     for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
     do
 	echo "date"
 	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
 	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
 	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
 	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
 	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
 	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
 	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
 	    gzip -c > exonAA/$j.exonAA.fa.gz"
     done > $gp.$mz.jobs
 
     time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & 
     sleep 1
     tail -f $gp.$mz.job.log
 
 # real    35m13.992s
 # user    5m18.930s
 # sys     1m27.205s
 
     zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
     zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
     zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
     zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
 
     rm -rf exonAA exonNuc ppredAA ppredNuc
 
     # we're only distributing exons at the moment
     pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
 
 #########################################################################
 # LIFTOVER TO DM2 (DONE 6/11/08 angie)
     doSameSpeciesLiftOver.pl dm3 dm2 > & dm3ToDm2.log & tail -f dm3ToDm2.log
     mv dm3ToDm2.log /cluster/data/dm3/bed/blat.dm2.2008-06-11/
 
 
 #########################################################################
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd -- UNDONE 2008-10-27 markd)
 update genbank.conf:
 dm3.upstreamGeneTbl = refGene
 dm3.upstreamMaf = multiz15way /hive/data/genomes/dm3/bed/multiz15way/species.lst
 # Those lines have been removed because Ann, Angie and Mark decided to
 # use FlyBase Genes as the reference set instead of RefSeq.
 
 
 #########################################################################
 # FLYBASE 5.12 ANNOTATIONS (DONE 10/21/08 angie)
     mkdir /hive/data/genomes/dm3/bed/flybase5.11
     cd /hive/data/genomes/dm3/bed/flybase5.11
     wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.12_FB2008_09/gff/dmel-all-r5.12.gff.gz
     gunzip dmel-all-r5.12.gff.gz
     # What data sources are represented in this file?
     grep -v '^#' dmel-all-r5.12.gff | awk '{print $2 "\t" $3;}' | sort | uniq -c
     # excerpt (many other sources, including blastx:... , sim4:... and 
     # tblastx:...; also various other types for source "FlyBase" and "."):
   72096 FlyBase CDS
   69371 FlyBase exon
   21140 FlyBase five_prime_UTR
   15140 FlyBase gene
   21243 FlyBase mRNA
      90 FlyBase miRNA
     154 FlyBase ncRNA
      95 FlyBase pseudogene
     161 FlyBase rRNA
      47 FlyBase snRNA
     249 FlyBase snoRNA
     314 FlyBase tRNA
   15546 FlyBase three_prime_UTR
    5413 FlyBase transposable_element
   43178 FlyBase transposable_element_insertion_site
     # Same types as 5.1, good.
 
     # What keywords are defined in the 9th field?  (again, ignoring a bunch)
     grep -v '^#' dmel-all-r5.12.gff \
     | awk '{print $9;}' \
     | perl -wpe 'while (s/(\w+)=//) {print "$1\n";} s/^.*\n$//;' \
     | sort | uniq -c
   79889 Alias
  408477 Dbxref
   21243 Derives_from
 3079555 ID
 7236214 Name
 4460030 Parent
 5376450 Target
    1384 cyto_range
     # Comparable to 5.1, good.
 
     cp ../flybase5.3/extractGenes.pl .
     time ./extractGenes.pl dmel-all-r5.12.gff
 #Noncoding FBtr0070097 has CG ID (CG18275-RA)
 #Keywords that have been ignored in one or more line types:
 #CDS: ID
 #CDS: Name
 #exon: Dbxref
 #exon: ID
 #exon: Name
 #gene: Alias
 #gene: Ontology_term
 #gene: derived_computed_cyto
 #gene: fullname
 #gene: gbunit
 #mRNA: Alias
 #mRNA: Name
 #mRNA: score
 #mRNA: score_text
 #noncoding: Alias
 #noncoding: score
 #noncoding: score_text
 #protein: Alias
 #protein: Dbxref
 #protein: Name
 #protein: derived_isoelectric_point
 #protein: derived_molecular_weight
 #81.970u 2.490s 1:25.12 99.2%    0+0k 0+0io 0pf+0w
     mv flyBaseGene.src=FlyBase.gtf flyBaseGene.gtf
 
     # Get predicted proteins (for main annotations only)
     wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.12_FB2008_09/fasta/dmel-all-translation-r5.12.fasta.gz
     # Use the FBgn IDs in headers to retrieve CG-R's to match flyBaseGene:
     zcat dmel-all-translation-r5.12.fasta.gz \
     | perl -we 'open(X, "flyBase2004Xref.tab") || die; \
                 while (<X>) { \
                   @w = split("\t"); \
                   $fbtr2cgr{$w[3]} = $w[0]; \
                 } \
                 while (<>) { \
                   if (/^>.*parent=FBgn\d+,(FBtr\d+)/) { \
                     $cgr = $fbtr2cgr{$1}; \
                     if ($cgr) { \
                       s/^>.*/>$cgr/; \
                     } else { die; } \
                   } \
                   print; \
                 } \
                ' \
       > flyBasePep.fa
 
     # Load Protein-coding genes:
     ldHgGene -gtf dm3 flyBaseGene flyBase*.gtf
 #  21243 groups 14 seqs 1 sources 2 feature types
     hgPepPred dm3 generic flyBasePep flyBasePep.fa
     # Noncoding genes:
     hgLoadBed dm3 flyBaseNoncoding flyBaseNoncoding.bed
 #Loaded 1063 elements of size 12
     # Cross-referencing info for both coding and noncoding:
     hgLoadSqlTab dm3 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
       flyBase2004Xref.tab
 
     # Make sure there are no joinerCheck errors amongst those 3 tables
     # at this point, because if left here they can spread via doHgNearBlastp.
     # Since this is an update of those tables to a new FlyBase version,
     # and other hgNear tables already exist with the old version's set of 
     # IDs, ignore errors for missing CG items in tables that will be 
     # rebuilt in the hgNear process.:
     runJoiner.csh dm3 flyBasePep
 
     # See if there are still large regions where refGene has genes but
     # flybase doesn't:
     featureBits dm3 flyBaseGene -or flyBaseNoncoding -bed=tmp.bed
     featureBits dm3 refGene \!tmp.bed -minSize=1000 -bed=stdout
 #279626 bases of 162367812 (0.172%) in intersection
     # Spot-checked... some transcripts removed here and there, looks OK.
 
     # add upstream* downloadable files
     mkdir /hive/data/genomes/dm3/bed/flybase5.12/bigZips
     cd /hive/data/genomes/dm3/bed/flybase5.12/bigZips
     foreach size (1000 2000 5000)
       echo upstream$size
       nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \
       | nice gzip -c > upstream$size.fa.gz
     end
     nice md5sum *.gz > md5sum.txt
     rm /usr/local/apache/htdocs/goldenPath/dm3/bigZips/upstream?000.fa.gz
     ln -s /hive/data/genomes/dm3/bed/flybase5.12/bigZips/*.gz \
       /usr/local/apache/htdocs/goldenPath/dm3/bigZips/
     # When updating, edit out the old upstream* sums from md5sum.txt, then:
     cat md5sum.txt \
       >> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt
     # Now update a bunch of tables:
     # arbExpDistance, *BlastTab, flyBase{Canonical,Isoforms}, flyBaseTo*,
     # and any other tables used in conjunction with flyBaseGene in hgGene
     # or hgNear.
 
 
 #########################################################################
+# LIFTOVER BDTNP DATA FROM DM2 (DONE 1/29/09 angie - REDONE with more data 2/2/09)
+    ssh hgwdev
+    mkdir /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver
+    cd /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver
+    # Original files are varStep with unspecified step=1 -- translate
+    # to bed3, liftOver, and translate back to varStep:
+    foreach f (/hive/data/genomes/dm2/bed/bdtnpChipper/*.wig)
+      echo $f:t:r
+      grep -v ^track $f \
+      | perl -wpe 'chomp; @w = split(/[ \t=]/); \
+          if ($w[0] eq "variableStep") { $chr = $w[2];  $_ = ""; } \
+          elsif (defined $w[1]) { $_ = "$chr\t" . ($w[0]-1) . "\t$w[0]\t$w[1]\n"; }' \
+      | liftOver -bedPlus=4 stdin /hive/data/genomes/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \
+          stdout $f:t:r.unmapped \
+      | sort -k1,1 -k2n,2n \
+      | perl -wpe 'chomp; @w=split("\t"); next unless $w[3]; \
+          print "variableStep chrom=$w[0]\n" if (\!defined $prevChr || $prevChr ne $w[0]); \
+          $_ = "$w[2]\t$w[3]\n";  $prevChr = $w[0];' \
+        > $f:t:r.lo.wig
+    end
+    wc -l *.unmapped | grep -v '^ 0 '
+# 2 bdtnpDl3Fdr25.unmapped
+# 2 total
+    # This is the one unmapped item:
+    cat bdtnpDl3Fdr25.unmapped
+##Deleted in new
+#chrX    3521682 3521683 0.983636307483747
+
+    mkdir wigEncoded
+    cd wigEncoded
+    foreach f (../*Fdr*.lo.wig)
+      wigEncode $f $f:t:r:r.{wig,wib}
+    end
+    # Same output as for dm2, good:
+#Converted ../bdtnpBcd1Fdr1.wig, upper limit 8.14, lower limit 0.80
+#Converted ../bdtnpBcd1Fdr25.wig, upper limit 8.14, lower limit 0.80
+#Converted ../bdtnpBcd2Fdr1.wig, upper limit 10.74, lower limit 0.68
+#Converted ../bdtnpBcd2Fdr25.wig, upper limit 10.74, lower limit 0.68
+#Converted ../bdtnpCad1Fdr1.wig, upper limit 16.20, lower limit 1.06
+#Converted ../bdtnpCad1Fdr25.wig, upper limit 16.20, lower limit 0.48
+#Converted ../bdtnpGt2Fdr1.wig, upper limit 22.22, lower limit 1.21
+#Converted ../bdtnpGt2Fdr25.wig, upper limit 22.22, lower limit 0.49
+#Converted ../bdtnpHb1Fdr1.wig, upper limit 14.38, lower limit 0.92
+#Converted ../bdtnpHb1Fdr25.wig, upper limit 14.38, lower limit 0.86
+#Converted ../bdtnpHb2Fdr1.wig, upper limit 24.25, lower limit 0.88
+#Converted ../bdtnpHb2Fdr25.wig, upper limit 24.25, lower limit 0.78
+#Converted ../bdtnpKni1Fdr1.wig, upper limit 8.08, lower limit 1.78
+#Converted ../bdtnpKni1Fdr25.wig, upper limit 8.08, lower limit 1.01
+#Converted ../bdtnpKni2Fdr1.wig, upper limit 8.44, lower limit 1.07
+#Converted ../bdtnpKni2Fdr25.wig, upper limit 8.44, lower limit 0.76
+#Converted ../bdtnpKr1Fdr1.wig, upper limit 14.47, lower limit 0.68
+#Converted ../bdtnpKr1Fdr25.wig, upper limit 14.47, lower limit 0.68
+#Converted ../bdtnpKr2Fdr1.wig, upper limit 12.92, lower limit 0.66
+#Converted ../bdtnpKr2Fdr25.wig, upper limit 12.92, lower limit 0.66
+#Converted ../bdtnpPolIIFdr1.wig, upper limit 30.86, lower limit 0.93
+#Converted ../bdtnpPolIIFdr25.wig, upper limit 30.86, lower limit 0.70
+#Converted ../bdtnpZ2Fdr1.wig, upper limit 11.49, lower limit 0.95
+#Converted ../bdtnpZ2Fdr25.wig, upper limit 11.49, lower limit 0.79
+#Converted ../bdtnpD1Fdr1.wig, upper limit 20.33, lower limit 0.51
+#Converted ../bdtnpD1Fdr25.wig, upper limit 20.33, lower limit 0.51
+#Converted ../bdtnpDa2Fdr1.wig, upper limit 26.09, lower limit 0.79
+#Converted ../bdtnpDa2Fdr25.wig, upper limit 26.09, lower limit 0.49
+#Converted ../bdtnpDl3Fdr1.wig, upper limit 16.97, lower limit 0.65
+#Converted ../bdtnpDl3Fdr25.wig, upper limit 16.97, lower limit 0.62
+#Converted ../bdtnpFtz3Fdr1.wig, upper limit 16.54, lower limit 1.07
+#Converted ../bdtnpFtz3Fdr25.wig, upper limit 16.54, lower limit 0.84
+#Converted ../bdtnpH1Fdr1.wig, upper limit 18.52, lower limit 0.68
+#Converted ../bdtnpH1Fdr25.wig, upper limit 18.52, lower limit 0.68
+#Converted ../bdtnpH2Fdr1.wig, upper limit 13.23, lower limit 0.68
+#Converted ../bdtnpH2Fdr25.wig, upper limit 13.23, lower limit 0.65
+#Converted ../bdtnpHkb1Fdr1.wig, upper limit 11.84, lower limit 1.03
+#Converted ../bdtnpHkb1Fdr25.wig, upper limit 11.84, lower limit 0.35
+#Converted ../bdtnpHkb2Fdr1.wig, upper limit 10.73, lower limit 0.87
+#Converted ../bdtnpHkb2Fdr25.wig, upper limit 10.73, lower limit 0.33
+#Converted ../bdtnpHkb3Fdr1.wig, upper limit 10.50, lower limit 0.90
+#Converted ../bdtnpHkb3Fdr25.wig, upper limit 10.50, lower limit 0.37
+#Converted ../bdtnpMad2Fdr1.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpMad2Fdr25.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpMed2Fdr1.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpMed2Fdr25.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpPrd1Fdr1.wig, upper limit 13.97, lower limit 0.78
+#Converted ../bdtnpPrd1Fdr25.wig, upper limit 13.97, lower limit 0.78
+#Converted ../bdtnpPrd2Fdr1.wig, upper limit 12.00, lower limit 0.84
+#Converted ../bdtnpPrd2Fdr25.wig, upper limit 12.00, lower limit 0.84
+#Converted ../bdtnpRun1Fdr1.wig, upper limit 10.02, lower limit 0.60
+#Converted ../bdtnpRun1Fdr25.wig, upper limit 10.02, lower limit 0.60
+#Converted ../bdtnpRun2Fdr1.wig, upper limit 7.58, lower limit 1.16
+#Converted ../bdtnpRun2Fdr25.wig, upper limit 7.58, lower limit 0.58
+#Converted ../bdtnpShn2Fdr1.wig, upper limit 12.62, lower limit 1.19
+#Converted ../bdtnpShn2Fdr25.wig, upper limit 12.62, lower limit 0.81
+#Converted ../bdtnpShn3Fdr1.wig, upper limit 10.28, lower limit 1.09
+#Converted ../bdtnpShn3Fdr25.wig, upper limit 10.28, lower limit 0.95
+#Converted ../bdtnpSlp11Fdr1.wig, upper limit 40.39, lower limit 1.06
+#Converted ../bdtnpSlp11Fdr25.wig, upper limit 40.39, lower limit 0.62
+#Converted ../bdtnpSna1Fdr1.wig, upper limit 8.41, lower limit 0.97
+#Converted ../bdtnpSna1Fdr25.wig, upper limit 8.41, lower limit 0.77
+#Converted ../bdtnpSna2Fdr1.wig, upper limit 14.30, lower limit 0.57
+#Converted ../bdtnpSna2Fdr25.wig, upper limit 14.30, lower limit 0.57
+#Converted ../bdtnpTFIIB1Fdr1.wig, upper limit 21.81, lower limit 0.59
+#Converted ../bdtnpTFIIB1Fdr25.wig, upper limit 21.81, lower limit 0.59
+#Converted ../bdtnpTll1Fdr1.wig, upper limit 16.66, lower limit 0.98
+#Converted ../bdtnpTll1Fdr25.wig, upper limit 16.66, lower limit 0.57
+#Converted ../bdtnpTwi1Fdr1.wig, upper limit 26.11, lower limit 0.77
+#Converted ../bdtnpTwi1Fdr25.wig, upper limit 26.11, lower limit 0.68
+#Converted ../bdtnpTwi2Fdr1.wig, upper limit 26.66, lower limit 0.79
+#Converted ../bdtnpTwi2Fdr25.wig, upper limit 26.66, lower limit 0.55
+
+    mkdir /gbdb/dm3/bdtnp
+    ln -s /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver/wigEncoded/*.wib /gbdb/dm3/bdtnp/
+    cd /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver/wigEncoded
+    foreach f (*.wig)
+      hgLoadWiggle -pathPrefix=/gbdb/dm3/bdtnp dm3 $f:r $f
+    end
+    rm wiggle.tab
+
+