src/hg/makeDb/doc/dm2.txt 1.23

1.23 2009/02/23 23:40:23 angie
bdtnpChipper (old change): corrected regex, found another subtrack.
Index: src/hg/makeDb/doc/dm2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/dm2.txt,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/makeDb/doc/dm2.txt	3 Feb 2009 00:06:40 -0000	1.22
+++ src/hg/makeDb/doc/dm2.txt	23 Feb 2009 23:40:23 -0000	1.23
@@ -1,5943 +1,5947 @@
 # for emacs: -*- mode: sh; -*-
 
 
 # Drosophila Melanogaster -- 
 # 
 # euchromatin (2L, 2R, 3L, 3R, 4, X):
 # Berkeley Drosophila Genome Project (fruitfly.org) release 4 (Apr. 2004)
 # http://www.fruitfly.org/annot/release4.html
 #
 # heterochromatin (2h, 3h, 4h, Xh, Yh, U):
 # Drosophila Heterochromatin Genome Project (dhgp.org) release 3.2 
 # (submitted to GenBank June 2004) 
 # http://www.dhgp.org/index_release_notes.html
 #
 # Gene annotations:
 # FlyBase (http://flybase.bio.indiana.edu/) last updated ???
 #
 
 #  NOTE:  this doc may have genePred loads that fail to include
 #  the bin column.  Please correct that for the next build by adding
 #  a bin column when you make any of these tables:
 #
 #  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
 #  +-------------+-------------------------+
 #  | tableName   | type                    |
 #  +-------------+-------------------------+
 #  | flyBaseGene | genePred flyBasePep     |
 #  | refGene     | genePred refPep refMrna |
 #  | nscanGene   | genePred                |
 #  | geneid      | genePred geneidPep      |
 #  | genscan     | genePred genscanPep     |
 #  | augustus    | genePred                |
 #  +-------------+-------------------------+
 
 
 # DOWNLOAD SEQUENCE (DONE 9/8/04 angie)
     ssh kksilo
     mkdir /cluster/store8/dm2
     ln -s /cluster/store8/dm2 /cluster/data/dm2
     cd /cluster/data/dm2
     wget ftp://ftp.fruitfly.org/pub/download/compressed/na_euchromatin_genomic_dmel_RELEASE4.FASTA.gz
     zcat na_euchromatin_genomic_dmel_RELEASE4.FASTA.gz \
     | faSplit byname stdin dummyArg
     # Follow FlyBase's lead on the chromosome names, but still use our 
     # "chr" prefix:
     foreach c (2L 2R 3L 3R 4 X)
       mkdir $c
       sed -e 's/^>arm_/>chr/' arm_$c.fa > $c/chr$c.fa
       echo arm_$c.fa size:
       faSize arm_$c.fa
       echo $c/chr$c.fa size:
       faSize $c/chr$c.fa
       echo comparison:
       faCmp arm_$c.fa $c/chr$c.fa
       echo ""
     end
     # heterochromatin sold separately... ftp://ftp.dhgp.org/pub/DHGP still 
     # has release 3.2 as its latest.  But that's June 2004... wtf, grab it:
     wget ftp://ftp.dhgp.org/pub/DHGP/Release3.2/FASTA/super-scaffolds/heterochromatin_super-scaffolds-genomic_dmel_RELEASE3.2.FASTA.tar.gz
     tar xvzf heterochromatin_super-scaffolds-genomic_dmel_RELEASE3.2.FASTA.tar.gz
     foreach c (2h 3h 4h Xh Yh U)
       mkdir $c
       perl -wpe 's/^>(\w+).*/>chr$1/' $c.FASTA > $c/chr$c.fa
       echo $c.FASTA size:
       faSize $c.FASTA
       echo $c/chr$c.fa size:
       faSize $c/chr$c.fa
       echo comparison:
       faCmp $c.FASTA $c/chr$c.fa
       echo ""
     end
     # Carefully review output of those commands, then:
     rm *.fa *.FASTA
 
     # put away the big download files
     mkdir downloads
     mv *.gz downloads/
 
 
 # DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 9/9/04 angie)
     mkdir /cluster/data/dm2/M
     cd /cluster/data/dm2/M
     # go to http://www.ncbi.nih.gov/ and search Nucleotide for 
     # "drosophila melanogaster mitochondrion genome".  That shows the gi number:
     # 5835233
     # Use that number in the entrez linking interface to get fasta:
     wget -O chrM.fa \
       'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5835233&dopt=FASTA'
     # Edit chrM.fa: make sure the long fancy header line says it's the 
     # Drosophila melanogaster mitochondrion complete genome, and then replace the 
     # header line with just ">chrM".
 
 
 # SPLIT CHROM FA INTO SMALLER CHUNKS BY GAPS (DONE 9/9/04 angie)
     ssh kksilo
     cd /cluster/data/dm2
     foreach c (?{,?})
       faSplit -minGapSize=100 -lift=$c/chr$c.lft \
         gap $c/chr$c.fa 2000000 $c/chr${c}_
     end
     foreach ctgFa (?{,?}/chr*_*.fa)
       set ctg = $ctgFa:r
       mkdir $ctg
       mv $ctgFa $ctg
     end
 
 
 # CREATING DATABASE (DONE 9/9/04 angie)
     ssh hgwdev
     # Make sure there is at least 5 gig free for the database
     df -h /var/lib/mysql
 #/dev/sdc1             1.8T  535G  1.1T  33% /var/lib/mysql
     # Create the database.
     hgsql '' -e 'create database dm2'
 
 
 # EXTRACT GAP INFORMATION FROM FASTA, LOAD GAP TRACK (DONE 9/9/04 angie)
     ssh kksilo
     cd /cluster/data/dm2
     # Do as Jim suggested back when we got dm1 sequence:
     # I think that we can probably just show all gaps as bridged
     # in the non-h chromosomes, and as unbridged in the h chromosomes
     # and leave it at that.
 
     # Extract gaps using scaffoldFaToAgp.  It's really meant for a different 
     # purpose, so clean up its output: remove the .lft and .agp, and remove 
     # the last line of .gap (extra gap added at end).  Also substitute in 
     # the correct chrom name in .gap.  
     foreach c (?{,?})
       set chr = chr$c
       pushd $c
       mv $chr.lft $chr.lft.bak
       scaffoldFaToAgp -minGapSize=100 $chr.fa
       rm $chr.{lft,agp}
       mv $chr.lft.bak $chr.lft
       set chrSize = `faSize $chr.fa | awk '{print $1;}'`
       set origLines = `cat $chr.gap | wc -l`
       awk '($2 != '$chrSize'+1) {print;}' $chr.gap \
       | sed -e "s/chrUn/$chr/" > $chr.gap2
       set newLines = `cat $chr.gap2 | wc -l`
       if ($newLines == ($origLines - 1)) then
         mv $chr.gap2 $chr.gap
       else
         echo "Error: $chr/$chr.gap2 has wrong number of lines."
       endif
       popd
     end
     # Call the gaps unbridged in chrU and chr*h:
     foreach c (U ?h)
       set chr = chr$c
       sed -e 's/yes/no/' $c/$chr.gap > $c/$chr.gap2
       mv $c/$chr.gap2 $c/$chr.gap
     end
     ssh hgwdev
     hgLoadGap dm2 /cluster/data/dm2
 
 
 # MAKE JKSTUFF AND BED DIRECTORIES (DONE 9/9/04 angie)
     # This used to hold scripts -- better to keep them inline in the .doc 
     # so they're in CVS.  Now it should just hold lift file(s) and 
     # temporary scripts made by copy-paste from this file.  
     mkdir /cluster/data/dm2/jkStuff
     # This is where most tracks will be built:
     mkdir /cluster/data/dm2/bed
 
 
 # MAKE LIFTALL.LFT (DONE 9/9/04 angie)
     ssh kksilo
     cd /cluster/data/dm2
     cat ?{,?}/chr*.lft > jkStuff/liftAll.lft
 
 
 # RUN REPEAT MASKER (DONE 9/9/04 angie)
     # Note: drosophila library ("drosophila.lib") is dated May 27 '03.
     # Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make 
     # RepeatMasker runs manageable on the cluster ==> results need lifting.
 
     # Split contigs into 500kb chunks:
     ssh kksilo
     cd /cluster/data/dm2
     foreach d ( */chr*_?{,?} )
       cd $d
       set contig = $d:t
       faSplit -minGapSize=100 -lift=$contig.lft -maxN=500000 \
         gap $contig.fa 500000 ${contig}_
       cd ../..
     end
 
     #- Make the run directory and job list:
     cd /cluster/data/dm2
     cat << '_EOF_' > jkStuff/RMDrosophila
 #!/bin/csh -fe
 
 cd $1
 pushd .
 /bin/mkdir -p /tmp/dm2/$2
 /bin/cp $2 /tmp/dm2/$2/
 cd /tmp/dm2/$2
 /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2
 popd
 /bin/cp /tmp/dm2/$2/$2.out ./
 if (-e /tmp/dm2/$2/$2.tbl) /bin/cp /tmp/dm2/$2/$2.tbl ./
 if (-e /tmp/dm2/$2/$2.cat) /bin/cp /tmp/dm2/$2/$2.cat ./
 /bin/rm -fr /tmp/dm2/$2/*
 /bin/rmdir --ignore-fail-on-non-empty /tmp/dm2/$2
 /bin/rmdir --ignore-fail-on-non-empty /tmp/dm2
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod +x jkStuff/RMDrosophila
     mkdir RMRun
     cp /dev/null RMRun/RMJobs
     foreach d ( ?{,?}/chr*_?{,?} )
       set ctg = $d:t
       foreach f ( $d/${ctg}_?{,?}.fa )
         set f = $f:t
         echo /cluster/data/dm2/jkStuff/RMDrosophila \
              /cluster/data/dm2/$d $f /cluster/data/dm2/$d \
            '{'check out line+ /cluster/data/dm2/$d/$f.out'}' \
         >> RMRun/RMJobs
       end
     end
 
     # do the run
     ssh kk9
     cd /cluster/data/dm2/RMRun
     para create RMJobs
     para try, check, push, check,...
 #Completed: 288 of 288 jobs
 #Average job time:                3169s      52.82m     0.88h    0.04d
 #Longest job:                     4752s      79.20m     1.32h    0.06d
 #Submission to last job:         13769s     229.48m     3.82h    0.16d
 
     # Lift up the split-contig .out's to contig-level .out's
     ssh kksilo
     cd /cluster/data/dm2
     foreach d ( ?{,?}/chr*_?{,?} )
       cd $d
       set contig = $d:t
       liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
       cd ../..
     end
 
     # Lift up the contig-level .out's to chr-level
     foreach c (?{,?})
       cd $c
       if (-e chr$c.lft && ! -z chr$c.lft) then
         echo lifting $c
         /cluster/bin/i386/liftUp chr$c.fa.out chr$c.lft warn \
           `awk '{print $2"/"$2".fa.out";}' chr$c.lft` > /dev/null
       else
         echo Can\'t find $c/chr$c.lft \!
       endif
       cd ..
     end
 
     # Load the .out files into the database with:
     ssh hgwdev
     hgLoadOut dm2 /cluster/data/dm2/?{,?}/*.fa.out
 
 
 # SIMPLE REPEATS (TRF) (DONE 9/9/04 angie)
     ssh kksilo
     mkdir /cluster/data/dm2/bed/simpleRepeat
     cd /cluster/data/dm2/bed/simpleRepeat
     mkdir trf
     cp /dev/null jobs.csh
     foreach f (/cluster/data/dm2/?{,?}/chr*_*/chr?{,?}_?{,?}.fa)
         set fout = $f:t:r.bed
         echo $fout
         echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
         >> jobs.csh
     end
     tcsh jobs.csh >&! jobs.log &
     # check on this with
     tail -f jobs.log
     wc -l jobs.csh
     ls -1 trf | wc -l
     # When job is done do:
     liftUp simpleRepeat.bed /cluster/data/dm2/jkStuff/liftAll.lft warn \
       trf/*.bed
 
     # Load this into the database as so
     ssh hgwdev
     hgLoadBed dm2 simpleRepeat \
       /cluster/data/dm2/bed/simpleRepeat/simpleRepeat.bed \
       -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
 
 
 # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 9/9/04 angie)
     # make a filtered version # of the trf output: 
     # keep trf's with period <= 12:
     ssh kksilo
     cd /cluster/data/dm2/bed/simpleRepeat
     mkdir -p trfMask
     foreach f (trf/*.bed)
         echo "filtering $f"
         awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
     end
     # Lift up filtered trf output to chrom coords as well:
     cd /cluster/data/dm2
     mkdir bed/simpleRepeat/trfMaskChrom
     foreach c (?{,?})
       liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.lft warn \
         `awk '{print "bed/simpleRepeat/trfMask/"$2".bed";}' $c/chr$c.lft`
     end
 
 
 # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 9/9/04 angie)
     ssh kksilo
     cd /cluster/data/dm2
     foreach c (?{,?})
       echo repeat- and trf-masking chr$c.fa
       /cluster/home/kent/bin/i386/maskOutFa -soft \
         $c/chr$c.fa $c/chr$c.fa.out $c/chr$c.fa
       /cluster/home/kent/bin/i386/maskOutFa -softAdd \
         $c/chr$c.fa bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.fa
     end
     foreach c (?{,?})
       echo repeat- and trf-masking contigs of chr$c
       foreach ctgFa ($c/chr*/chr${c}_?{,?}.fa)
         set trfMask=bed/simpleRepeat/trfMask/$ctgFa:t:r.bed
         /cluster/home/kent/bin/i386/maskOutFa -soft $ctgFa $ctgFa.out $ctgFa
         /cluster/home/kent/bin/i386/maskOutFa -softAdd $ctgFa $trfMask $ctgFa
       end
     end
 
 
 # STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 9/9/04 angie)
     # Translate to nib
     ssh kksilo
     cd /cluster/data/dm2
     mkdir nib
     foreach c (?{,?})
       faToNib -softMask $c/chr$c.fa nib/chr$c.nib
     end
 
     # Make symbolic links from /gbdb/dm2/nib to the real nibs.
     ssh hgwdev
     mkdir -p /gbdb/dm2/nib
     foreach f (/cluster/data/dm2/nib/chr*.nib)
       ln -s $f /gbdb/dm2/nib
     end
 
     # Load /gbdb/dm2/nib paths into database and save size info.
     hgsql dm2  < ~/src/hg/lib/chromInfo.sql
     hgNibSeq -preMadeNib dm2 /gbdb/dm2/nib /cluster/data/dm2/?{,?}/chr?{,?}.fa
     echo "select chrom,size from chromInfo" | hgsql -N dm2 \
       > /cluster/data/dm2/chrom.sizes
 
 
 # CREATING GRP TABLE FOR TRACK GROUPING (DONE 9/9/04 angie)
     # Copy all the data from the table "grp" 
     # in the existing database dm1 to the new database
     ssh hgwdev
     hgsql dm2 -e "create table grp (PRIMARY KEY(NAME)) select * from dm1.grp"
 
 
 # MAKE GCPERCENT (DONE 2/2/04 angie)
      ssh hgwdev
      mkdir /cluster/data/dm2/bed/gcPercent
      cd /cluster/data/dm2/bed/gcPercent
      # create and load gcPercent table
      hgsql dm2  < ~/src/hg/lib/gcPercent.sql
      hgGcPercent dm2 ../../nib
 
 
 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR DROSOPHILA (DONE 9/9/04 angie)
     # Warning: must genome and organism fields must correspond
     # with defaultDb values
     echo 'INSERT INTO dbDb \
         (name, description, nibPath, organism, \
                 defaultPos, active, orderKey, genome, scientificName, \
                 htmlPath, hgNearOk, hgPbOk, sourceName) values \
         ("dm2", "Apr. 2004", "/gbdb/dm2/nib", "D. melanogaster", \
                "chr2L:825964-851061", 1, 55, "D. melanogaster", \
                 "Drosophila melanogaster", "/gbdb/dm2/html/description.html", \
                 0, 0, "BDGP v. 4 / DHGP v. 3.2");' \
       | hgsql -h genome-testdb hgcentraltest
     echo 'update defaultDb set name = "dm2" where genome = "D. melanogaster"' \
       | hgsql -h genome-testdb hgcentraltest
 
     # Make trackDb table so browser knows what tracks to expect:
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/trackDb
     cvs up -d -P
 
     # Edit that makefile to add dm2 to DBS and do
     make update
 
     # go public on genome-test
     cvs commit makefile
     make alpha
 
     # Add trackDb directories and description.html
     mkdir drosophila/dm2
     cvs add drosophila/dm2
     # Write ~/kent/src/hg/makeDb/trackDb/drosophila/dm2/description.html 
     # with a description of the assembly and some sample position queries.  
     chmod a+r drosophila/dm2/description.html
     # Check it in and copy (via "make alpha" in trackDb/) to 
     # /cluster/data/dm2/html/.  
     cvs add drosophila/dm2/description.html
     cvs commit drosophila/dm2
     mkdir -p /gbdb/dm2/html
     make alpha
 
 
 # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DROSOPHILA (DONE 2/?/05 galt/angie)
     ssh hgwdev
     # Get appropriate hostname and port numbers from cluster admins:
     echo 'insert into blatServers values("dm2", "blat14", 17794, 1, 0); \
           insert into blatServers values("dm2", "blat14", 17795, 0, 1);' \
       | hgsql -h genome-testdb hgcentraltest
 
 
 # PUT NIBS ON ISCRATCH (DONE 9/9/04 angie)
     ssh kkr1u00
     mkdir /iscratch/i/dm2
     cd /iscratch/i/dm2
     cp -pR /cluster/data/dm2/nib .
     iSync
     # Added "contigs" (chunks) 9/16/04
     mkdir maskedContigs
     cp -p /cluster/data/dm2/*/chr*_*/chr?{,?}_?{,?}.fa maskedContigs
     iSync
 
 
 # AUTO UPDATE GENBANK MRNA RUN  (DONE 2/1/05 angie)
     # Update genbank config and source in CVS:
     cd ~/kent/src/hg/makeDb/genbank
     cvsup .
     # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
     # edits, check them in if necessary:
     diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
 
     # Edit etc/genbank.conf and add these lines:
 # dm2 (D. melanogaster)
 dm2.genome = /iscratch/i/dm2/nib/chr*.nib
 dm2.lift = /cluster/data/dm2/jkStuff/liftAll.lft
 dm2.genbank.mrna.xeno.load = yes
 dm2.genbank.est.xeno.load = no
 dm2.downloadDir = dm2
 
     cvs ci etc/genbank.conf
 
     # Install to /cluster/data/genbank:
     make install-server
 
     ssh eieio
     cd /cluster/data/genbank
     # This is an -initial run, refseq only:
     nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial dm2 &
     # Load results:
     ssh hgwdev
     cd /cluster/data/genbank
     nice bin/gbDbLoadStep -drop -initialLoad dm2
     featureBits dm2 refGene
 #28230571 bases of 131698467 (21.436%) in intersection
     # Clean up:
     rm -rf work/initial.dm2
 
     ssh eieio
     cd /cluster/data/genbank
     # This is an -initial run, mRNA only:
     nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial dm2 &
     # Load results:
     ssh hgwdev
     cd /cluster/data/genbank
     nice bin/gbDbLoadStep -drop -initialLoad dm2
     featureBits dm2 mrna
 #23752517 bases of 131698467 (18.036%) in intersection
     featureBits dm2 xenoMrna
 #5943805 bases of 131698467 (4.513%) in intersection
     # Clean up:
     rm -rf work/initial.dm2
 
     ssh eieio
     # -initial for ESTs:
     nice bin/gbAlignStep -srcDb=genbank -type=est -initial dm2 &
     # Load results:
     ssh hgwdev
     cd /cluster/data/genbank
     nice bin/gbDbLoadStep dm2
     featureBits dm2 intronEst
 #12644283 bases of 131698467 (9.601%) in intersection
     featureBits dm2 est
 #30993609 bases of 131698467 (23.534%) in intersection
     # Clean up:
     rm -rf work/initial.dm2
 
 
 # PRODUCING GENSCAN PREDICTIONS (DONE 9/10/04 angie)
     # Check out hg3rdParty/genscanlinux to get latest genscan:
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/genscan
     cd /cluster/data/dm2/bed/genscan
     cvs co hg3rdParty/genscanlinux
     ssh kksilo
     cd /cluster/data/dm2/bed/genscan
     # Make 3 subdirectories for genscan to put their output files in
     mkdir gtf pep subopt
     # Make hard-masked contigs
     foreach f (/cluster/data/dm2/?{,?}/chr*/chr?{,?}_?{,?}.fa)
       maskOutFa $f hard $f.masked
     end
     # Generate a list file, contigs.list, of all the hard-masked contigs that 
     # *do not* consist of all-N's (which would cause genscan to blow up)
     rm -f contigs.list
     touch contigs.list
     foreach f ( `ls -1S /cluster/data/dm2/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` )
       egrep '[ACGT]' $f > /dev/null
       if ($status == 0) echo $f >> contigs.list
     end
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
 #ENDLOOP
 '_EOF_'
     # << this line keeps emacs coloring happy
     gensub2 contigs.list single gsub jobList
     # Run on small cluster -- genscan needs big mem.
     ssh kki
     cd /cluster/data/dm2/bed/genscan
     para create jobList
     para try, check, push, check, ...
 #Completed: 81 of 81 jobs
 #Average job time:                  48s       0.80m     0.01h    0.00d
 #Longest job:                      212s       3.53m     0.06h    0.00d
 #Submission to last job:           589s       9.82m     0.16h    0.01d
 
     # If there are crashes, diagnose with "para problems".  
     # If a job crashes due to genscan running out of memory, re-run it 
     # manually with "-window=1200000" instead of "-window=2400000".
     
     # Convert these to chromosome level files as so:
     ssh kksilo
     cd /cluster/data/dm2/bed/genscan
     liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
     liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
     cat pep/*.pep > genscan.pep
 
     # Load into the database as so:
     ssh hgwdev
     cd /cluster/data/dm2/bed/genscan
     ldHgGene -gtf dm2 genscan genscan.gtf
     hgPepPred dm2 generic genscanPep genscan.pep
     hgLoadBed dm2 genscanSubopt genscanSubopt.bed
     featureBits dm2 genscan
 #24711981 bases of 131698467 (18.764%) in intersection
 
 # MYTOUCH FIX - Jen - 2006-01-24
   sudo mytouch dm2 genscanPep 0412141800.00
 
 # MAKE DOWNLOADABLE FILES (DONE 1/26/05 angie)
     ssh kksilo
     cd /cluster/data/dm2
     mkdir zips
     zip -j zips/chromOut.zip ?{,?}/chr?{,?}.fa.out
     zip -j zips/chromFa.zip ?{,?}/chr?{,?}.fa
     foreach f (?{,?}/chr?{,?}.fa)
       maskOutFa $f hard $f.masked
     end
     zip -j zips/chromFaMasked.zip ?{,?}/chr?{,?}.fa.masked
     cd bed/simpleRepeat
     zip ../../zips/chromTrf.zip trfMaskChrom/chr*.bed
     cd ../..
     # Make a starter mrna.zip -- it will get updated regularly on the RR. 
     /cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank \
       -db=dm2 -native genbank mrna zips/mrna.fa
     gzip zips/mrna.fa
     foreach f (zips/*.zip)
       echo $f
       unzip -t $f | tail -1
     end
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/dm2
     cd /usr/local/apache/htdocs/goldenPath/dm2
     mkdir bigZips database
     # Create README.txt files in bigZips/ and database/ to explain the files.
     cp -p /cluster/data/dm2/zips/*.{zip,gz} bigZips
     cd bigZips
     md5sum *.{zip,gz} > md5sum.txt
 
 
 # BLASTZ D.PSEUDOOBSCURA (DONE 9/10/04 angie)
 # CHAIN PSEUDOOBSCURA BLASTZ (DONE 9/10/04 angie)
 # NET PSEUDOOBSCURA BLASTZ (DONE 9/11/04 angie)
 # GENERATE DP2 MAF FOR MULTIZ FROM NET (DONE 9/11/04 angie)
 # MAKE VSDP2 DOWNLOADABLES (DONE 2/15/05 angie)
 # -- dp2; Originally run with default blastz params; obsoleted by dp3 which 
 # was later run with better params, below.
 
 # BLASTZ ANOPHELES (DONE 9/13/04 angie)
 # CHAIN ANOPHELES BLASTZ (DONE 9/13/04 angie)
 # NET ANOPHELES BLASTZ (DONE 9/13/04 angie)
 # MAKE VSANOGAM1 DOWNLOADABLES (DONE 1/28/05 angie)
 # GENERATE ANOGAM1 MAF FOR MULTIZ FROM NET (DONE 9/13/04 angie)
 # -- Originally run with "human-fugu" blastz params; replaced by a run with 
 # better params, below.
 
 
 # MULTIZ MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 9/22/04 angie)
     # put the MAFs on bluearc
     ssh kksilo
     mkdir -p /cluster/bluearc/multiz.flymo/my
     cp /cluster/data/dm2/bed/blastz.droYak1.2004-09-10/mafNet/*.maf \
       /cluster/bluearc/multiz.flymo/my
     mkdir -p /cluster/bluearc/multiz.flymo/mp
     cp /cluster/data/dm2/bed/blastz.dp2.2004-09-10/mafNet/*.maf \
       /cluster/bluearc/multiz.flymo/mp
     mkdir -p /cluster/bluearc/multiz.flymo/ma
     cp /cluster/data/dm2/bed/blastz.anoGam1.2004-09-13/mafNet/*.maf \
       /cluster/bluearc/multiz.flymo/ma
 
     ssh kki
     mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1
     cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1
     mkdir mypa
     # Use PSU's new var_multiz:
     cat << '_EOF_' > doMultiz
 #!/bin/csh -ef
 set path = (/cluster/bin/penn/var_multiz.2004.08.12 $path)
 set chr = $1:t:r
 if (-s $1 && -s $2) then
   set tmp = /scratch/$chr.tmp.maf
   var_multiz $1 $2 0 0 > $tmp
   maf_project $tmp dm2.$chr > /scratch/$chr.myp.maf
   rm $tmp
 else if (-s $1) then
   cp $1 /scratch/$chr.myp.maf
 else if (-s $2) then
   cp $2 /scratch/$chr.myp.maf
 endif
 if (-s /scratch/$chr.myp.maf && -s $3) then
   set tmp = /scratch/$chr.tmp.maf
   var_multiz /scratch/$chr.myp.maf $3 1 0 > $tmp
   maf_project $tmp dm2.$chr > $4
   rm $tmp
 else if (-s /scratch/$chr.myp.maf) then
   cp /scratch/$chr.myp.maf $4
 else if (-s $3) then
   cp $3 $4
 endif
 rm /scratch/$chr.myp.maf
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doMultiz
     cp /dev/null jobList
     foreach chr (`awk '{print $1;}' /cluster/data/dm2/chrom.sizes`)
       set f1 = /cluster/bluearc/multiz.flymo/my/$chr.maf
       set f2 = /cluster/bluearc/multiz.flymo/mp/$chr.maf
       set f3 = /cluster/bluearc/multiz.flymo/ma/$chr.maf
       echo "doMultiz $f1 $f2 $f3 mypa/$chr.maf" >> jobList
     end
     para create jobList
     para try, check, push, check
 #Completed: 13 of 13 jobs
 #Average job time:                 246s       4.09m     0.07h    0.00d
 #Longest job:                      811s      13.52m     0.23h    0.01d
 #Submission to last job:           811s      13.52m     0.23h    0.01d
     du -sh mypa
 #417M    mypa
 
     # clean up bluearc
     rm -r /cluster/bluearc/multiz.flymo
 
     # setup external files for database reference
     ssh hgwdev
     mkdir /gbdb/dm2/mzDy1Dp2Ag1_phast
     ln -s /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/mypa/chr*.maf \
       /gbdb/dm2/mzDy1Dp2Ag1_phast/
     # load into database
     cd /tmp
     hgLoadMaf -warn dm2 mzDy1Dp2Ag1_phast
     cd /gbdb/dm2
     mkdir d_pseudoobscura_mypa d_yakuba_mypa a_gambiae_mypa
     cd /tmp
     ln -s /cluster/data/dm2/bed/blastz.dp2.2004-09-10/mafNet/*.maf \
       /gbdb/dm2/d_pseudoobscura_mypa
     hgLoadMaf -WARN dm2 d_pseudoobscura_mypa
     ln -s /cluster/data/dm2/bed/blastz.droYak1.2004-09-10/mafNet/*.maf \
       /gbdb/dm2/d_yakuba_mypa
     hgLoadMaf -WARN dm2 d_yakuba_mypa
     ln -s /cluster/data/dm2/bed/blastz.anoGam1.2004-09-13/mafNet/*.maf \
       /gbdb/dm2/a_gambiae_mypa
     hgLoadMaf -WARN dm2 a_gambiae_mypa
 
 
 # PHASTCONS MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 9/22/04 angie)
     ssh kksilo
     # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces
     mkdir -p /cluster/bluearc/dm2/chrom
     cp -p /cluster/data/dm2/?{,?}/chr*.fa /cluster/bluearc/dm2/chrom/
     ssh kki
     mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
     mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.split
     cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.split
     set WINDOWS = /cluster/bluearc/dm2/phastCons/WINDOWS
     rm -fr $WINDOWS
     mkdir -p $WINDOWS
     cat << 'EOF' > doSplit.sh
 #!/bin/csh -ef
 
 set PHAST=/cluster/bin/phast
 set FA_SRC=/cluster/bluearc/dm2/chrom
 set WINDOWS=/cluster/bluearc/dm2/phastCons/WINDOWS
 
 set maf=$1
 set c = $maf:t:r
 set tmpDir = /scratch/msa_split/$c
 rm -rf $tmpDir
 mkdir -p $tmpDir
 ${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O dm2,droYak1,dp2,anoGam1 \
    -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
 cd $tmpDir
 foreach file ($c.*.ss)
   gzip -c $file > ${WINDOWS}/$file.gz
 end
 rm -f $tmpDir/$c.*.ss
 rmdir $tmpDir
 'EOF'
 # << for emacs
     chmod a+x doSplit.sh
     rm -f jobList
     foreach file (/cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/mypa/*.maf) 
       if (-s $file) then
         echo "doSplit.sh {check in line+ $file}" >> jobList
       endif
     end
     para create jobList
     para try,  check, push, check...
 #Completed: 13 of 13 jobs
 #Average job time:                  23s       0.38m     0.01h    0.00d
 #Longest job:                       53s       0.88m     0.01h    0.00d
 #Submission to last job:            53s       0.88m     0.01h    0.00d
     cd ..
 
     # use the model previously estimated (see makeDm1.doc) as a starting model
     sed -e 's/dm1/dm2/g' /cluster/data/dm1/bed/phastCons4way/rev-dg.mod \
       > starting-tree.mod
     # -- Because of the very long branch length to anoGam1 being pretty 
     # much impossible to estimate from alignment data, edit that file to 
     # reduce the anoGam1 branch length from 2.66 to 0.5.  Otherwise 
     # estimation process blows up.  So our starting tree becomes
 #TREE: (((dm2:0.058,droYak1:0.074):0.133,dp2:0.200):0,anoGam1:0.5);
 
     # Get genome-wide average GC content (for all species together,
     # not just the reference genome).  If you have a globally
     # estimated tree model, as above, you can get this from the
     # BACKGROUND line in the .mod file.  E.g.,
 # ALPHABET: A C G T
 # ...
 # BACKGROUND: 0.276938 0.223190 0.223142 0.276730
     # add up the C and G:
     awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod
 #0.446
     # Great, use 0.446 as the --gc parameter in phastCons below:.
 
     # Now set up cluster job to estimate model parameters.  
     # Parameters will be estimated separately for each alignment fragment 
     # then will be combined across fragments.
     # Use --gc from above, and --target-coverage computed as follows:
     # 1. Jim would like phastConsElements coverage to be 25% for fly.
     # 2. 83% of dm2 is covered by chainDroYak1Link, so use .25 / .83 = .30 
     #    as an initial --target-coverage.
     # 3. If actual coverage is different from our target, come back to this 
     #    step, adjust --target-coverage and rerun up through phastConsElements.
     # -- Actually, Adam suggests starting with what proved to work for worm:
     #    --target-coverage 0.4 and --expected-lengths 25 (not 12 as for mammal)
     # -- OK, that led to too-high coverage:
     # 51511266 bases of 131698467 (39.113%) in intersection
     #    so next try 0.3, 30:
     # 49886398 bases of 131698467 (37.879%) in intersection
     #   how about 0.3, 20?
     # 42067218 bases of 131698467 (31.942%) in intersection
     #   0.25, 20?
     # 38397215 bases of 131698467 (29.155%) in intersection
     #   0.25, 15?
     # 35039841 bases of 131698467 (26.606%) in intersection
     # -- that's close enough to the target.  If the "bumpiness" of the 
     #    wiggle looks aesthetically pleasing then we're done.
     # -- OK, Adam would like to see more smoothing so that coding exons 
     #    stand out better, so try 0.20, 25:
     # 37574714 bases of 131698467 (28.531%) in intersection
     # More smooting would be nice for coding exons, so tried 0.20, 50, 
     # but Adam didn't like that as much overall so stick with 0.20, 25.  
 
     mkdir run.estimate
     cd run.estimate
     cat << '_EOF_' > doEstimate.sh
 #!/bin/csh -ef
 zcat $1 \
 | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.435 --nrates 1,1 \
     --no-post-probs --ignore-missing --expected-lengths 25 \
     --target-coverage 0.20 --quiet --log $2 --estimate-trees $3
 '_EOF_'
 # << for emacs
     chmod a+x doEstimate.sh
     rm -fr LOG TREES
     mkdir -p LOG TREES
     rm -f jobList
     foreach f (/cluster/bluearc/dm2/phastCons/WINDOWS/*.ss.gz)
       set root = $f:t:r:r
       echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
     end
     # run cluster job
     ssh kk9
     cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.estimate
     para create jobList
     para try, check, push, check, ...
 #Completed: 139 of 139 jobs
 #Average job time:                 133s       2.22m     0.04h    0.00d
 #Longest job:                      221s       3.68m     0.06h    0.00d
 #Submission to last job:           328s       5.47m     0.09h    0.00d
 
     # Now combine parameter estimates.  We can average the .mod files
     # using phyloBoot.  This must be done separately for the conserved
     # and nonconserved models
     ssh kksilo
     cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.estimate
     ls -1 TREES/*.cons.mod > cons.txt
     /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
       --output-average ave.cons.mod > cons_summary.txt
     ls -1 TREES/*.noncons.mod > noncons.txt
     /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
       --output-average ave.noncons.mod > noncons_summary.txt
     grep TREE ave*.mod
 #ave.cons.mod:TREE: (((dm2:0.028707,droYak1:0.019740):0.039091,dp2:0.065825):0.086142,anoGam1:0.086142);
 #ave.noncons.mod:TREE: (((dm2:0.118202,droYak1:0.079147):0.162923,dp2:0.275544):0.360361,anoGam1:0.360361);
     # look over the files cons_summary.txt and noncons_summary.txt.
     # The means and medians should be roughly equal and the stdevs
     # should be reasonably small compared to the means, particularly
     # for rate matrix parameters (at bottom) and for branches to the
     # leaves of the tree.  The stdevs may be fairly high for branches
     # near the root of the tree; that's okay.  Some min values may be
     # 0 for some parameters.  That's okay, but watch out for very large
     # values in the max column, which might skew the mean.  If you see
     # any signs of bad outliers, you may have to track down the
     # responsible .mod files and throw them out.  I've never had to do
     # this; the estimates generally seem pretty well behaved.
 
     # NOTE: Actually, a random sample of several hundred to a thousand
     # alignment fragments (say, a number equal to the number of
     # available cluster nodes) should be more than adequate for
     # parameter estimation.  If pressed for time, use this strategy.
 
     # Now we are ready to set up the cluster job for computing the
     # conservation scores and predicted elements.  It's all downhill
     # from here.
     ssh kk9
     mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.phast
     cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.phast
     cat << 'EOF' > doPhastCons.sh
 #!/bin/csh -ef
 set pref = $1:t:r:r
 set chr = `echo $pref | awk -F\. '{print $1}'`
 set tmpfile = /scratch/phastCons.$$
 zcat $1 \
 | /cluster/bin/phast/phastCons - \
     ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
     --expected-lengths 25 --target-coverage 0.20 --quiet --seqname $chr \
     --idpref $pref \
     --viterbi /cluster/bluearc/dm2/phastCons/ELEMENTS/$pref.bed --score \
     --require-informative 0 \
   > $tmpfile
 gzip -c $tmpfile > /cluster/bluearc/dm2/phastCons/POSTPROBS/$pref.pp.gz
 rm $tmpfile
 'EOF'
 # << for emacs
     chmod a+x doPhastCons.sh
     rm -fr /cluster/bluearc/dm2/phastCons/{POSTPROBS,ELEMENTS}
     mkdir -p /cluster/bluearc/dm2/phastCons/{POSTPROBS,ELEMENTS}
     rm -f jobList
     foreach f (/cluster/bluearc/dm2/phastCons/WINDOWS/*.ss.gz)
       echo doPhastCons.sh $f >> jobList
     end
     para create jobList
     para try, check, push, check, ...
 #Completed: 139 of 139 jobs
 #Average job time:                  23s       0.38m     0.01h    0.00d
 #Longest job:                       49s       0.82m     0.01h    0.00d
 #Submission to last job:            52s       0.87m     0.01h    0.00d
 
     # back on kksilo:
     # combine predictions and transform scores to be in 0-1000 interval
     # do in a way that avoids limits on numbers of args
     cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
     awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
       /cluster/bluearc/dm2/phastCons/ELEMENTS/*.bed \
     | /cluster/bin/scripts/lodToBedScore > all.bed
 
     ssh hgwdev
     cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
     featureBits dm2 all.bed
 #37574714 bases of 131698467 (28.531%) in intersection
     # OK, close enough.
     hgLoadBed dm2 phastConsElements all.bed
 
     # Create wiggle on the small cluster
     ssh kki
     mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.wib
     cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.wib
     rm -rf /cluster/bluearc/dm2/phastCons/wib
     mkdir -p /cluster/bluearc/dm2/phastCons/wib
     cat << 'EOF' > doWigAsciiToBinary
 #!/bin/csh -ef
 set chr = $1
 zcat `ls -1 /cluster/bluearc/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
       | sort -t\. -k2,2n` \
 | wigAsciiToBinary -chrom=$chr \
     -wibFile=/cluster/bluearc/dm2/phastCons/wib/${chr}_phastCons stdin 
 'EOF'
 # << for emacs
     chmod a+x doWigAsciiToBinary
     rm -f jobList
     foreach chr (`ls -1 /cluster/bluearc/dm2/phastCons/POSTPROBS \
                   | awk -F\. '{print $1}' | sort -u`)
       echo doWigAsciiToBinary $chr >> jobList
     end
     para create jobList
     para try, check, push, check, ...
 #Completed: 13 of 13 jobs
 #Average job time:                  14s       0.24m     0.00h    0.00d
 #Longest job:                       36s       0.60m     0.01h    0.00d
 #Submission to last job:            36s       0.60m     0.01h    0.00d
 
     # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want 
     # the raw scores) from bluearc
     cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
     rm -rf wib POSTPROBS
     rsync -av /cluster/bluearc/dm2/phastCons/wib .
     rsync -av /cluster/bluearc/dm2/phastCons/POSTPROBS .
 
     # load wiggle component of Conservation track
     ssh hgwdev
     mkdir -p /gbdb/dm2/wib/mzDy1Dp2Ag1_phast
     cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
     chmod 775 . wib
     chmod 664 wib/*.wib
     ln -s `pwd`/wib/*.wib /gbdb/dm2/wib/mzDy1Dp2Ag1_phast/
     hgLoadWiggle dm2 mzDy1Dp2Ag1_phast_wig \
       -pathPrefix=/gbdb/dm2/wib/mzDy1Dp2Ag1_phast wib/*.wig
 
 #NOT DONE -- still using dm1 for this:
     # make top-5000 list and launcher on Adam's home page:
     sort -k5,5nr raw.bed | head -5000 > top5000.bed
     /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
       /cse/grads/acs/public_html/dm-top5000-4way \
       "top 5000 conserved elements (4way)" dm2
 
     # and clean up bluearc.
     rm -r /cluster/bluearc/dm2/phastCons
     rm -r /cluster/bluearc/dm2/chrom
 
 
 # LIFTOVER BDGP 3.2 ANNOTATIONS FROM DM1 (TEMPORARY) (DONE 9/15/04 angie)
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/bdgp3.2.liftOver
     cd /cluster/data/dm2/bed/bdgp3.2.liftOver
     hgsql dm1 -N -e 'select * from bdgpGene' > dm1.bdgpGene.tab
     hgsql dm1 -N -e 'select * from bdgpNonCoding' > dm1.bdgpNonCoding.tab
     ssh kksilo
     cd /cluster/data/dm2/bed/bdgp3.2.liftOver
 # lift files to try out:
 # blastz:
 #  /cluster/data/dm1/bed/blastz.dm2.2004-09-15/axtChain/dm1ToDm2.over.chain \
 # blat on dm1 chroms vs. dm2 2Mb chunks:
 #  /cluster/data/dm1/bed/blat.dm2.2004-09-16/dm1ToDm2.over.chain \
 # blat on dm1 chroms vs. dm2 3kb chunks:
 #  /cluster/data/dm1/bed/blat.dm2.2004-09-17/dm1ToDm2.over.chain \
     liftOver -genePred dm1.bdgpGene.tab \
       /cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \
       bdgpLiftGene.tab bdgpLiftGene.unmapped.tab
     liftOver -genePred dm1.bdgpNonCoding.tab \
       /cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \
       bdgpLiftNonCoding.tab bdgpLiftNonCoding.unmapped.tab
     # Not perfect but not bad:
     wc -l bdgp*.tab
 #  18707 bdgpLiftGene.tab
 #     78 bdgpLiftGene.unmapped.tab
 #   2134 bdgpLiftNonCoding.tab
 #     62 bdgpLiftNonCoding.unmapped.tab
 
     ssh hgwdev
     cd /cluster/data/dm2/bed/bdgp3.2.liftOver
     ldHgGene -predTab dm2 bdgpLiftGene bdgpLiftGene.tab
     ldHgGene -predTab dm2 bdgpLiftNonCoding bdgpLiftNonCoding.tab
     # Copy over the gene info tables
     hgsql dm2 -e 'create table bdgpLiftGeneInfo (PRIMARY KEY(bdgpName(7)), INDEX(flyBaseId(11))) select * from dm1.bdgpGeneInfo'
     hgsql dm2 -e 'create table bdgpLiftNonCodingInfo (INDEX(bdgpName(16)), INDEX(flyBaseId(11))) select * from dm1.bdgpNonCodingInfo'
 
 
 # FLYBASE ANNOTATIONS (DONE 1/12/05 angie)
 # REPLACED 2/23/05 -- SEE "FLYBASE 4.1" BELOW
     ssh kksilo
     mkdir /cluster/data/dm2/bed/flybase
     cd /cluster/data/dm2/bed/flybase
     foreach c (2L 2R 3L 3R 4 X)
       wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.0_20041119/gff/dmel-$c-r4.0.gff.gz
     end
     zcat *.gff.gz > flybase.gff3
     # What data sources are represented in this file?
     grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
     # excerpt (many other sources, including blastx:... , sim4:... and 
     # tblastx:...; also various other types for source "."):
   18747 .       CDS
   62629 .       exon
   13472 .       gene
   19301 .       mRNA
      70 .       ncRNA
      40 .       pseudogene
      96 .       rRNA
      28 .       snRNA
      28 .       snoRNA
     288 .       tRNA
   36921 .       transcription_start_site
    1571 .       transposable_element
    4680 .       transposable_element_insertion_site
 
     # What keywords are defined in the 9th field?
     grep -v '^#' flybase.gff3 \
     | awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \
     | sort | uniq -c
     # This incarnation of gff3 looks similar to the one encountered in 
     # dp3, but still uses slightly different keywords and there are different 
     # data sources of interest.  So one-shot perl-script again:
     extractGenes.pl flybase.gff3
 
     # Get predicted proteins (for main annotations only)
     wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.0_20041119/fasta/dmel-all-translation-r4.0.fasta.gz
     zcat dmel-all-translation-r4.0.fasta.gz \
     | perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' > flybasePep.fa
 
     ssh hgwdev
     cd /cluster/data/dm2/bed/flybase
     # Protein-coding genes:
     ldHgGene -gtf dm2 flyBaseGene flybase.gtf
     hgPepPred dm2 generic flyBasePep flybasePep.fa
     # Noncoding genes:
     hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
     # Cross-referencing info for both coding and noncoding:
     hgsql dm2 < $HOME/kent/src/hg/lib/flyBase2004Xref.sql
     hgsql dm2 -e 'load data local infile "flyBase2004Xref.tab" \
       into table flyBase2004Xref'
 
     # add upstream* downloadable files (added 2/10/05)
     cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
     foreach size (1000 2000 5000)
       echo upstream$size
       featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
       | gzip -c > upstream$size.fa.gz
     end
     md5sum upstream*.fa.gz >> md5sum.txt
 
 
 # BLASTZ D. PSEUDOOBSCURA (DP3) (DONE 1/18/05 angie)
 # CHAIN PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie)
 # NET PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie)
 # GENERATE DP3 AXTNET AND MAF FOR MULTIZ (DONE 1/18/05 angie)
 # MAKE VSDP3 DOWNLOADABLES (DONE 1/18/05 angie)
 # -- Originally run with default blastz params; replaced by a run with 
 # better params, below.
 
 
 # BLASTZ.V7 EVAL: BLASTZ D. PSEUDOOBSCURA (DP3) (DONE 1/18/05 angie)
     ssh kk9
     mkdir /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
     cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
     ln -s blastzv7.dp3.2005-01-18 /cluster/data/dm2/bed/blastzv7.dp3
     # Note the full path for BLASTZ so we get blastz.v7 (which is not in 
     # PSU's CVS, argh!):
     cat << '_EOF_' > DEF
 # D.melanogaster vs. D.pseudoobscura
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 ALIGN=blastz-run
 BLASTZ=/cluster/bin/penn/blastz.2004-12-27/blastz-source/blastz
 BLASTZ_H=2000
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 # unused: SEQ1_RMSK=
 SEQ1_SMSK=
 SEQ1_FLAG=-drosophila
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY - D. pseudoobscura
 SEQ2_DIR=/iscratch/i/dp3/nib
 # unused: SEQ2_RMSK=
 SEQ2_SMSK=
 SEQ2_FLAG=-drosophila
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     # Create run dir, job list, and raw/ dir structure:
     mkdir run
     ~/kent/src/utils/blastz-make-joblist DEF \
       /cluster/data/dm2/chrom.sizes /cluster/data/dp3/chrom.sizes \
     > run/j
     csh -ef ./xdir.sh
     cd run
     # cluster run
     para create j
     para try, check, push, check, ....
 #Completed: 552 of 552 jobs
 #Average job time:                 100s       1.66m     0.03h    0.00d
 #Longest job:                      480s       8.00m     0.13h    0.01d
 #Submission to last job:           733s      12.22m     0.20h    0.01d
 
     # back on kksilo...
     mkdir /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1
     cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1
     /cluster/bin/scripts/blastz-make-out2lav ../DEF .. > j
     # small cluster run
     ssh kki
     cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1
     para create j
     para try, check, push, check, ....
 #Completed: 23 of 23 jobs
 #Average job time:                   8s       0.14m     0.00h    0.00d
 #Longest job:                       29s       0.48m     0.01h    0.00d
 #Submission to last job:            60s       1.00m     0.02h    0.00d
 
     # Translate .lav to axt:
     ssh kksilo
     cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
     rm -r raw
     mkdir axtChrom
     foreach c (lav/*)
       pushd $c
       set chr=$c:t
       set out=axtChrom/$chr.axt
       echo "Translating $chr lav to $out"
       cat `ls -1 *.lav | sort -g` \
         | lavToAxt stdin /cluster/data/dm2/nib /cluster/data/dp3/nib stdout \
         | axtSort stdin ../../$out
       popd
     end
 
 
 # BLASTZ.V7 EVAL: CHAIN PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie)
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir chain
     ls -1S /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChrom/*.axt \
       > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     cat << '_EOF_' > doChain
 #!/bin/csh -ef
 axtChain -verbose=0 $1 \
   /iscratch/i/dm2/nib \
   /iscratch/i/dp3/nib stdout \
 | chainAntiRepeat /iscratch/i/dm2/nib /iscratch/i/dp3/nib \
     stdin $2
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 #Completed: 13 of 13 jobs
 #Average job time:                  10s       0.17m     0.00h    0.00d
 #Longest job:                       22s       0.37m     0.01h    0.00d
 #Submission to last job:            22s       0.37m     0.01h    0.00d
 
     # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
     rm run1/chain/*.chain
 
     # take a look at score distr's
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r
       textHistogram -binSize=10000 /tmp/score.$f:t:r
       echo ""
     end
 
     # Load chains into database
     ssh hgwdev
     cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChain/chain
     foreach i (*.chain)
         set c = $i:r
         echo loading $c
         hgLoadChain dm2 ${c}_chainBz7Dp3 $i
     end
     # Compare to regular (blastz.v6) chains:
     featureBits dm2 chainDp3Link
 #76557614 bases of 131698467 (58.131%) in intersection
     featureBits dm2 chainBz7Dp3Link
 #76340831 bases of 131698467 (57.966%) in intersection
     # Looks like v6 found more than v7 in general, but not too different.
     # Look at some cases where blastz.v7 found something but v6 didn't:
     featureBits dm2 chainBz7Dp3Link \!chainDp3Link -minSize=20 -bed=stdout
 #chrYh   212776  212811  chrYh.1
 ...
 #chr3R   18027130        18027160        chr3R.59
 ...
 #chr2h   300968  301032  chr2h.1
 ...
 #34843 bases of 131698467 (0.026%) in intersection
 
 
     # BLATZ EVAL: D. PSEUDOOBSCURA (DP3) (IN PROGRESS 1/24/05 angie)
     ssh kk9
     mkdir /cluster/data/dm2/bed/blatz.dp3.2004-01-19
     cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19
     mkdir chainRaw
     partitionSequence.pl 10000000 10000 /iscratch/i/dm2/nib \
       /cluster/data/dm2/chrom.sizes > dm2.lst
     partitionSequence.pl 10000000 10000 /iscratch/i/dp3/nib \
       /cluster/data/dp3/chrom.sizes > dp3.lst
     cat << '_EOF_' > gsub
 #LOOP
 blatz $(path1) $(path2) {check out line chainRaw/$(file1)_$(file2).chain}
 #ENDLOOP
 '_EOF_'
     # << this line keeps emacs coloring happy
     gensub2 dm2.lst dp3.lst gsub spec
     para create spec
     para try, check, push, check, ...
 #Completed: 549 of 552 jobs
 #Crashed: 3 jobs
 #Average job time:                 289s       4.82m     0.08h    0.00d
 #Longest job:                     1664s      27.73m     0.46h    0.02d
 #Submission to last job:          6206s     103.43m     1.72h    0.07d
     # With default blatz params, 3 jobs crashed due to out-of-mem, 
     # all others succeeded.  Try again with thresholds more like 
     # blastz human-mouse, i.e. -minGapless=3000?  No, Jim says forget 
     # the crashers for now (narrow down later) and just forge ahead 
     # with the comparison to blastz (v6).
     ssh kksilo
     cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19
     sed -e 's@/iscratch/i/d../nib/@@g; s/.nib//g;' chainRaw/* \
     | chainMergeSort stdin > all.chain
     chainSplit chain all.chain
 
     # take a look at score distr's -- LOTS of low-scoring chains
     foreach f (chain/*.chain)
       grep ^chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r
       textHistogram -binSize=10000 /tmp/score.$f:t:r
       echo ""
     end
 
     # Jim recommends -minScore=5000 for cross-species.  Filter for now:
     mv all.chain all.chain.unfiltered
     chainFilter -minScore=5000 all.chain.unfiltered > all.chain
     chainSplit chain all.chain
     gzip all.chain.unfiltered
 
     # Load chains into database
     ssh hgwdev
     cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19/chain
     foreach i (*.chain)
         set c = $i:r
         echo loading $c
         hgLoadChain dm2 ${c}_chainBz1Dp3 $i
     end
     # Compare to regular (blastz.v6) chains:
     featureBits dm2 -enrichment flyBaseGene:CDS chainDp3Link
 #flyBaseGene:CDS 16.551%, chainDp3Link 58.131%, both 15.303%, cover 92.46%, enrich 1.59x
     featureBits dm2 -enrichment flyBaseGene:CDS chainBz1Dp3Link
 #flyBaseGene:CDS 16.551%, chainBz1Dp3Link 79.858%, both 16.179%, cover 97.76%, enrich 1.22x
     # Look at some examples where blatz covers some CDS that blastz doesn't:
     featureBits -chrom=chr3R dm2 flyBaseGene:CDS chainBz1Dp3Link \
       \!chainDp3Link -minSize=30 -bed=/tmp/bz1Cds.bed
 
 
 # BLASTZ D.YAKUBA (DONE 1/26/05 angie)
 # CHAIN YAKUBA BLASTZ (DONE 1/26/05 angie)
 # NET YAKUBA BLASTZ (DONE 1/26/05 angie)
 # GENERATE DROYAK1 MAF FOR MULTIZ FROM NET (DONE 1/26/05 angie)
 # MAKE VSDROYAK1 DOWNLOADABLES (DONE 1/26/05 angie)
 # -- Originally run with default blastz params; replaced by a run with 
 # better params, below.
 
 
 # BLASTZ D.ANANASSAE (DONE 1/28/05 angie)
 # CHAIN ANANASSAE BLASTZ (DONE 1/28/05 angie)
 # NET ANANASSAE BLASTZ (DONE 1/28/05 angie)
 # GENERATE DROANA1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie)
 # MAKE VSDROANA1 DOWNLOADABLES (DONE 1/28/05 angie)
 # -- Originally run with default blastz params; replaced by a run with 
 # better params, below.
 
 
 # BLASTZ D.VIRILIS (DONE 1/28/05 angie)
 # CHAIN VIRILIS BLASTZ (DONE 1/28/05 angie)
 # NET VIRILIS BLASTZ (DONE 1/28/05 angie)
 # GENERATE DROVIR1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie)
 # MAKE VSDROVIR1 DOWNLOADABLES (DONE 1/28/05 angie)
 # -- Originally run with default blastz params; replaced by a run with 
 # better params, below.
 
 
 # BLASTZ D.MOJAVENSIS (DONE 1/28/05 angie)
 # CHAIN MOJAVENSIS BLASTZ (DONE 1/28/05 angie)
 # NET MOJAVENSIS BLASTZ (DONE 1/28/05 angie)
 # GENERATE DROMOJ1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie)
 # MAKE VSDROMOJ1 DOWNLOADABLES (DONE 1/28/05 angie)
 # -- Originally run with default blastz params; replaced by a run with 
 # better params, below.
 
 
 # BLASTZ A.MELLIFERA (DONE 1/28/05 angie)
 # CHAIN MELLIFERA BLASTZ (DONE 1/31/05 angie)
 # NET MELLIFERA BLASTZ (DONE 1/31/05 angie)
 # GENERATE APIMEL1 AXTNET AND MAF FOR MULTIZ (DONE 1/31/05 angie)
 # MAKE VSAPIMEL1 DOWNLOADABLES (DONE 1/31/05 angie)
 # -- Originally run on apiMel2 with "human-fugu" blastz params; outdated by 
 # apiMel2 which was later run with better params, below.
 
 
 # BLASTZ A.MELLIFERA 2.0 (DONE 2/8/05 Andy)
 # CHAIN MELLIFERA 2.0 BLASTZ (DONE 2/9/05 Andy)
 # NET MELLIFERA 2.0 BLASTZ (DONE 2/9/05 Andy)
 # GENERATE APIMEL2 AXTNET AND MAF FOR MULTIZ (DONE 2/9/05 Andy)
 # MAKE VSAPIMEL2 DOWNLOADABLES (DONE 2/9/2005 Andy)
 # -- Originally run with "human-fugu" blastz params; replaced by a run with 
 # better params, below.
 
 
 # MULTIZ.V10 8WAY (6 FLIES, MOSQUITO, HONEYBEE) (REDONE 3/7/05 angie
 # originally done 2/1/05
 # REDONE 3/7/05 angie -- User found that yakuba was missing (due to 
 # pairwise maf files having different .suffixes, and the script tolerating 
 # missing files... yikes!!!).  :(
 # REPLACED with 9WAY 5/23/05
 
 # MULTIZ.V10 9WAY (7 FLIES, MOSQUITO, HONEYBEE) (DONE 5/24/05 angie)
 # Better, looser blastz params were used to redo alignments of all
 # other insects to dm2.  Much better coverage on the pairwise inputs
 # now, so re-run multiz and phastCons to get a better Conservation
 # track... also using dp3 and apiMel2 instead of dp2 and apiMel1, 
 # and adding droSim1.
     # Tree (9-way):
     # ((((((dm2 droYak1) droAna1) dp3) (droVir1 droMoj1)) anoGam1) apiMel2)
     ssh kkstore01
     mkdir /cluster/data/dm2/bed/multiz9way.2005-05-23
     ln -s /cluster/data/dm2/bed/multiz9way.2005-05-23 \
       /cluster/data/dm2/bed/multiz9way
     cd /cluster/data/dm2/bed/multiz9way
     # Setup: Copy pairwise MAF to /santest/scratch:
     mkdir /santest/scratch/flyMultiz9way
     foreach db (droSim1 droYak1 droAna1 dp3 droVir1 droMoj1 anoGam1 apiMel2)
       cp -pR /cluster/data/dm2/bed/blastz.$db/mafNet \
         /santest/scratch/flyMultiz9way/$db
     end
     ls -lLR /santest/scratch/flyMultiz9way
     # Make output dir:
     mkdir maf
     # Create script to run multiz.v10 in the right order:
     cat << '_EOF_' > doMultiz.csh
 #!/bin/csh -fe
 set chr = $1
 set tmp = /scratch/flyMultiz9way.$chr
 mkdir $tmp
 
 set REF = dm2.$chr
 set SIM = /santest/scratch/flyMultiz9way/droSim1/$chr.maf.gz
 set YAK = /santest/scratch/flyMultiz9way/droYak1/$chr.maf.gz
 set ANA = /santest/scratch/flyMultiz9way/droAna1/$chr.maf.gz
 set PSE = /santest/scratch/flyMultiz9way/dp3/$chr.maf.gz
 set VIR = /santest/scratch/flyMultiz9way/droVir1/$chr.maf.gz
 set MOJ = /santest/scratch/flyMultiz9way/droMoj1/$chr.maf.gz
 set ANO = /santest/scratch/flyMultiz9way/anoGam1/$chr.maf.gz
 set API = /santest/scratch/flyMultiz9way/apiMel2/$chr.maf.gz
 set DEST = /cluster/data/dm2/bed/multiz9way/maf/$chr.maf
 
 set MZ10 = /cluster/bin/penn/multiz.v10
 set PROJECT = /cluster/bin/penn/maf_project
 
 if ( -s $SIM && -s $YAK ) then 
     echo "Aligning $SIM $YAK..."
     zcat $SIM > $tmp/$chr.Sim.maf
     zcat $YAK > $tmp/$chr.Yak.maf
     $MZ10 $tmp/$chr.Sim.maf $tmp/$chr.Yak.maf 1 > $tmp/$chr.tmp.maf
     echo "Projecting on $REF..."
     $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYak.maf
 else if ( -s $SIM ) then
     touch missing.$chr.Sim
     zcat $SIM > $tmp/$chr.SimYak.maf
 else if ( -s $YAK ) then
     touch missing.$chr.Yak
     zcat $YAK > $tmp/$chr.SimYak.maf
 endif
 
 if ( -s $tmp/$chr.SimYak.maf && -s $ANA ) then 
     echo "Aligning $tmp/$chr.SimYak.maf $ANA..."
     zcat $ANA > $tmp/$chr.Ana.maf
     $MZ10 $tmp/$chr.SimYak.maf $tmp/$chr.Ana.maf 1 > $tmp/$chr.tmp.maf
     echo "Projecting on $REF..."
     $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAna.maf
 else if ( -s $tmp/$chr.SimYak.maf ) then
     touch missing.$chr.Ana
     cp $tmp/$chr.SimYak.maf $tmp/$chr.SimYakAna.maf
 else if ( -s $ANA ) then
     touch missing.$chr.SimYak
     zcat $ANA > $tmp/$chr.SimYakAna.maf
 endif
 
 if ( -s $PSE && -s $tmp/$chr.SimYakAna.maf ) then
     echo "Adding $PSE..."
     zcat $PSE > $tmp/$chr.Pse.maf
     $MZ10 $tmp/$chr.SimYakAna.maf $tmp/$chr.Pse.maf 1 > $tmp/$chr.tmp.maf
     echo "Projecting on $REF..."
     $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPse.maf
 else if ( -s $PSE ) then
     touch missing.$chr.SimYakAna
     zcat $PSE > $tmp/$chr.SimYakAnaPse.maf
 else if ( -s $tmp/$chr.SimYakAna.maf ) then
     touch missing.$chr.Pse
     cp $tmp/$chr.SimYakAna.maf $tmp/$chr.SimYakAnaPse.maf
 endif 
 
 # droVir1 and droMoj1 are a subtree -- run on just them, then fold in:
 if ( -s $VIR && -s $MOJ ) then 
     echo "Aligning $VIR $MOJ..."
     zcat $VIR > $tmp/$chr.Vir.maf
     zcat $MOJ > $tmp/$chr.Moj.maf
     $MZ10 $tmp/$chr.Vir.maf $tmp/$chr.Moj.maf 0 > $tmp/$chr.tmp.maf
     echo "Projecting on $REF..."
     $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.VirMoj.maf
 else if ( -s $VIR ) then
     touch missing.$chr.Moj
     zcat $VIR > $tmp/$chr.VirMoj.maf
 else if ( -s $MOJ ) then
     touch missing.$chr.Vir
     zcat $MOJ > $tmp/$chr.VirMoj.maf
 endif
 
 if ( -s $tmp/$chr.VirMoj.maf && -s $tmp/$chr.SimYakAnaPse.maf ) then
     echo "Adding $tmp/$chr.VirMoj.maf..."
     $MZ10 $tmp/$chr.SimYakAnaPse.maf $tmp/$chr.VirMoj.maf 1 > $tmp/$chr.tmp.maf
     echo "Projecting on $REF..."
     $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPseVirMoj.maf
 else if ( -s $tmp/$chr.VirMoj.maf ) then
     touch missing.$chr.SimYakAnaPse
     cp $tmp/$chr.VirMoj.maf $tmp/$chr.SimYakAnaPseVirMoj.maf
 else if ( -s $tmp/$chr.SimYakAnaPse.maf ) then
     touch missing.$chr.VirMoj
     cp $tmp/$chr.SimYakAnaPse.maf $tmp/$chr.SimYakAnaPseVirMoj.maf
 endif 
 
 if ( -s $ANO && -s $tmp/$chr.SimYakAnaPseVirMoj.maf ) then
     echo "Adding $ANO..."
     zcat $ANO > $tmp/$chr.Ano.maf
     $MZ10 $tmp/$chr.SimYakAnaPseVirMoj.maf $tmp/$chr.Ano.maf 1 > $tmp/$chr.tmp.maf
     echo "Projecting on $REF..."
     $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPseVirMojAno.maf
 else if ( -s $ANO ) then
     touch missing.$chr.SimYakAnaPseVirMoj
     zcat $ANO > $tmp/$chr.SimYakAnaPseVirMojAno.maf
 else if ( -s $tmp/$chr.SimYakAnaPseVirMoj.maf ) then
     touch missing.$chr.Ano
     cp $tmp/$chr.SimYakAnaPseVirMoj.maf $tmp/$chr.SimYakAnaPseVirMojAno.maf
 endif 
 
 if ( -s $API && -s $tmp/$chr.SimYakAnaPseVirMojAno.maf ) then
     echo "Adding $API..."
     zcat $API > $tmp/$chr.Api.maf
     $MZ10 $tmp/$chr.SimYakAnaPseVirMojAno.maf $tmp/$chr.Api.maf 1 > $tmp/$chr.tmp.maf
     echo "Projecting on $REF..."
     $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.final.maf
     cp $tmp/$chr.final.maf $DEST
 else if ( -s $API ) then
     touch missing.$chr.SimYakAnaPseVirMojAno
     zcat $API > $DEST
 else if ( -s $tmp/$chr.SimYakAnaPseVirMojAno.maf ) then
     touch missing.$chr.Api
     cp $tmp/$chr.SimYakAnaPseVirMojAno.maf $DEST
 endif 
 
 rm $tmp/$chr.*.maf
 rmdir $tmp
 '_EOF_'
     # << keep emacs coloring happy    
     chmod 755 doMultiz.csh
     awk '{print "./doMultiz.csh " $1;}' /cluster/data/dm2/chrom.sizes \
       > jobs.lst
     # Run on small cluster
     ssh kki
     cd /cluster/data/dm2/bed/multiz9way
     para create jobs.lst
     para try, check, push, check, ...
 #Completed: 13 of 13 jobs
 #Average job time:                2000s      33.34m     0.56h    0.02d
 #Longest finished job:            5969s      99.48m     1.66h    0.07d
 #Submission to last job:         15605s     260.08m     4.33h    0.18d
     ls -1 missing*
 #ls: No match.
 
     # make /gbdb/ links to 9way maf files:
     ssh hgwdev
     mkdir -p /gbdb/dm2/multiz9way/maf/multiz9way
     ln -s /cluster/data/dm2/bed/multiz9way/maf/chr*.maf \
       /gbdb/dm2/multiz9way/maf/multiz9way/
     # load into database
     cd /tmp
     hgLoadMaf -warn dm2 multiz9way \
       -pathPrefix=/gbdb/dm2/multiz9way/maf/multiz9way
     # load summary table to replace pairwise
     time cat /cluster/data/dm2/bed/multiz9way/maf/chr*.maf \
     | nice hgLoadMafSummary dm2 multiz9waySummary stdin
 
     # put 9way MAF out for download
     ssh kkstore01
     cd /cluster/data/dm2/bed/multiz9way
     mkdir mafDownload
     foreach f (maf/*.maf)
       nice gzip -c $f > mafDownload/$f:t.gz
     end
     cd mafDownload
     md5sum *.maf.gz > md5sum.txt
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/dm2/multiz9way
     ln -s /cluster/data/dm2/bed/multiz9way/mafDownload/{*.maf.gz,md5sum.txt} \
       /usr/local/apache/htdocs/goldenPath/dm2/multiz9way
     # make a README.txt
 
     # Cleanup
     rm -rf /santest/scratch/flyMultiz9way/
 
 
 # PHASTCONS 8WAY WITH METHODS FROM PAPER (DONE 3/8/05 angie)
 # originally done 2/2/05
 # REDONE 3/8/05 angie -- after regenerating multiz alignments above.
     # Same procedure as for latest dm1 8way -- using the param estimation 
     # methods described in Adam's Genome Research paper.
 # REPLACED with 9way 5/23/05
 
 
 # PHASTCONS 9WAY WITH METHODS FROM PAPER (DONE 5/27/05 angie)
     ssh kkstore01
     mkdir -p /santest/scratch/dm2/chrom
     cp -p /cluster/data/dm2/?{,?}/chr*.fa /santest/scratch/dm2/chrom/
     # Split chrom fa into smaller windows for phastCons:
     ssh kki
     mkdir /cluster/data/dm2/bed/multiz9way/phastCons
     mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.split
     cd /cluster/data/dm2/bed/multiz9way/phastCons/run.split
     set WINDOWS = /santest/scratch/dm2/phastCons/WINDOWS
     rm -fr $WINDOWS
     mkdir -p $WINDOWS
     cat << 'EOF' > doSplit.sh
 #!/bin/csh -ef
 
 set PHAST=/cluster/bin/phast
 set FA_SRC=/santest/scratch/dm2/chrom
 set WINDOWS=/santest/scratch/dm2/phastCons/WINDOWS
 
 set maf=$1
 set c = $maf:t:r
 set tmpDir = /scratch/msa_split/$c
 rm -rf $tmpDir
 mkdir -p $tmpDir
 ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
    -O dm2,droSim1,droYak1,droAna1,dp3,droVir1,droMoj1,anoGam1,apiMel2 \
    -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
 cd $tmpDir
 foreach file ($c.*.ss)
   gzip -c $file > ${WINDOWS}/$file.gz
 end
 rm -f $tmpDir/$c.*.ss
 rmdir $tmpDir
 'EOF'
 # << for emacs
     chmod a+x doSplit.sh
     rm -f jobList
     foreach file (/cluster/data/dm2/bed/multiz9way/maf/*.maf)
       if (-s $file) then
         echo "doSplit.sh {check in exists+ $file}" >> jobList
       endif
     end
     para create jobList
     para try,  check, push, check...
 #Completed: 13 of 13 jobs
 #Average job time:                  44s       0.73m     0.01h    0.00d
 #Longest finished job:             117s       1.95m     0.03h    0.00d
 #Submission to last job:           117s       1.95m     0.03h    0.00d
 
     ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
     # Use consEntropy --NH to make it suggest a --expected-lengths param 
     # that we should try next.  Adam ran this on hg17 to find out the 
     # total entropy of the hg17 model:
     # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
 #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
 #Relative entropy: H=0.608216 bits/site
 #Required length: N=16.085437 sites
 #Total entropy: NH=9.783421 bits
     # Our target is that NH result: 9.7834 bits.  
     # Use phyloFit to make an initial model:
     ssh kolossus
     cd /cluster/data/dm2/bed/multiz9way/phastCons
     /cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \
         --aggregate dm2,droSim1,droYak1,droAna1,dp3,droVir1,droMoj1,anoGam1,apiMel2 \
         -i MAF -o SS > all.ss
     /cluster/bin/phast/phyloFit all.ss \
       --tree "(((((((dm2,droSim1),droYak1),droAna1),dp3),(droVir1,droMoj1)),anoGam1),apiMel2)" \
       -i SS --out-root starting-tree
     cat starting-tree.mod 
 #ALPHABET: A C G T 
 #ORDER: 0
 #SUBST_MOD: REV
 #TRAINING_LNL: -393252389.008460
 #BACKGROUND: 0.286960 0.213375 0.213302 0.286363 
 #RATE_MAT:
 #  -0.918686    0.202545    0.451023    0.265118 
 #   0.272394   -1.108053    0.230681    0.604978 
 #   0.606771    0.230760   -1.109751    0.272220 
 #   0.265671    0.450782    0.202768   -0.919221 
 #TREE: (((((((dm2:0.031118,droSim1:0.035465):0.029649,droYak1:0.076216):0.089605,droAna1:0.210217):0.044178,dp3:0.215369):0.046188,(droVir1:0.128011,droMoj1:0.139931):0.195030):0.109752,anoGam1:0.342365):0.228550,apiMel2:0.228550);
 # also get GC content from model -- if similar enough, no need to extract it 
 # separately above.
 awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod 
 #0.426677
     # OK, use .427 for --gc below.
 
     # Use the values of --target-coverage and --expected-lengths from the 
     # last iteration of the 8way run, 0.573 and 36.8.
     # Multiply each subst rate on the TREE line by 3.5 which is roughly the 
     # ratio of noncons to cons in 
     # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod
     /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \
       > starting-tree.noncons.mod
     /cluster/bin/phast/consEntropy --NH 9.7834 0.573 36.8 \
       starting-tree{,.noncons}.mod
 #( Solving for new omega: 36.800000 36.416935 36.415050 )
 #Transition parameters: gamma=0.573000, omega=36.800000, mu=0.027174, nu=0.036465
 #Relative entropy: H=1.931731 bits/site
 #Expected min. length: L_min=5.081081 sites
 #Expected max. length: L_max=5.398876 sites
 #Total entropy: L_min*H=9.815283 bits
 #Recommended expected length: omega=36.415050 sites (for L_min*H=9.783400)
     # OK, use --expected-lengths 36.4.
 
     ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
     # We're here because the actual target coverage was not satisfactory,
     # so we're changing the --target-coverage param.  Given that we're 
     # changing that, take a guess at how we should change --expected-lengths
     # in order to also hit the total entropy target.
     cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
     # SECOND ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.52 36.8 ave.{cons,noncons}.mod
 #Recommended expected length: omega=32.240755 sites (for L_min*H=9.783400)
     # OK, --expected-lengths 32.2
     # THIRD ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.45 32.2 ave.{cons,noncons}.mod
 #Recommended expected length: omega=27.095993 sites (for L_min*H=9.783400)
     # ==> 27.1
     # FOURTH ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.425 27.1 ave.{cons,noncons}.mod
 #Recommended expected length: omega=25.440532 sites (for L_min*H=9.783400)
     # ==> 25.4
     # FIFTH ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.393 25.4 ave.{cons,noncons}.mod
 #Recommended expected length: omega=23.420199 sites (for L_min*H=9.783400)
     # ==> 23.4
     # SIXTH ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.4 ave.{cons,noncons}.mod
 #Recommended expected length: omega=23.736472 sites (for L_min*H=9.783400)
     # ==> 23.7
 
     # Now set up cluster job to estimate model parameters given free params 
     # --target-coverage and --expected-lengths and the data.  
     ssh kk9
     mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
     cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
     # FIRST ITERATION: Use ../starting-tree.mod:
     cat << '_EOF_' > doEstimate.sh
 #!/bin/csh -ef
 zcat $1 \
 | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.427 --nrates 1,1 \
     --no-post-probs --ignore-missing \
     --expected-lengths 36.4 --target-coverage 0.573 \
     --quiet --log $2 --estimate-trees $3
 '_EOF_'
 # << for emacs
     # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
     cat << '_EOF_' > doEstimate.sh
 #!/bin/csh -ef
 zcat $1 \
 | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.427 --nrates 1,1 \
     --no-post-probs --ignore-missing \
     --expected-lengths 23.7 --target-coverage 0.398 \
     --quiet --log $2 --estimate-trees $3
 '_EOF_'
 # << for emacs
     chmod a+x doEstimate.sh
     rm -fr LOG TREES
     mkdir -p LOG TREES
     rm -f jobList
     foreach f (/santest/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
       set root = $f:t:r:r
       echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
     end
     # run cluster job
     para create jobList
     para try, check, push, check, ...#
 #Completed: 139 of 139 jobs
 #Average job time:                4131s      68.85m     1.15h    0.05d
 #Longest finished job:            8787s     146.45m     2.44h    0.10d
 #Submission to last job:          8886s     148.10m     2.47h    0.10d
 
     # Now combine parameter estimates.  We can average the .mod files
     # using phyloBoot.  This must be done separately for the conserved
     # and nonconserved models
     ssh kolossus
     cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
     ls -1 TREES/*.cons.mod > cons.txt
     /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
       --output-average ave.cons.mod > cons_summary.txt
     ls -1 TREES/*.noncons.mod > noncons.txt
     /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
       --output-average ave.noncons.mod > noncons_summary.txt
     grep TREE ave*.mod
     # FIRST ITERATION:
 #ave.cons.mod:TREE: (((((((dm2:0.019592,droSim1:0.027067):0.014875,droYak1:0.047389):0.041218,droAna1:0.116484):0.020166,dp3:0.125948):0.018479,(droVir1:0.073409,droMoj1:0.080389):0.106180):0.056105,anoGam1:0.235476):0.127958,apiMel2:0.127958);
 #ave.noncons.mod:TREE: (((((((dm2:0.059164,droSim1:0.077944):0.047044,droYak1:0.144159):0.135352,droAna1:0.373788):0.067147,dp3:0.405695):0.062350,(droVir1:0.234677,droMoj1:0.258111):0.355922):0.191534,anoGam1:0.773044):0.423894,apiMel2:0.423894);
     # SECOND ITERATION:
 #ave.cons.mod:TREE: (((((((dm2:0.019133,droSim1:0.026408):0.014525,droYak1:0.046282):0.040238,droAna1:0.113841):0.019632,dp3:0.123153):0.017914,(droVir1:0.071760,droMoj1:0.078554):0.104006):0.054630,anoGam1:0.230601):0.124952,apiMel2:0.124952);
 #ave.noncons.mod:TREE: (((((((dm2:0.058357,droSim1:0.076814):0.046405,droYak1:0.142227):0.133457,droAna1:0.368999):0.065975,dp3:0.400678):0.061001,(droVir1:0.231694,droMoj1:0.254750):0.352032):0.188199,anoGam1:0.764503):0.418063,apiMel2:0.418063);
     # THIRD ITERATION:
 #ave.cons.mod:TREE: (((((((dm2:0.018454,droSim1:0.025454):0.014002,droYak1:0.044650):0.038757,droAna1:0.109828):0.018811,dp3:0.118845):0.017034,(droVir1:0.069220,droMoj1:0.075740):0.100602):0.052271,anoGam1:0.222690):0.120267,apiMel2:0.120267);
 #ave.noncons.mod:TREE: (((((((dm2:0.057221,droSim1:0.075276):0.045493,droYak1:0.139568):0.130778,droAna1:0.362074):0.064250,dp3:0.393146):0.058938,(droVir1:0.227290,droMoj1:0.249820):0.346379):0.183000,anoGam1:0.750810):0.409058,apiMel2:0.409058);
     # FOURTH ITERATION:
 #ave.cons.mod:TREE: (((((((dm2:0.018200,droSim1:0.025097):0.013808,droYak1:0.044040):0.038208,droAna1:0.108335):0.018509,dp3:0.117244):0.016712,(droVir1:0.068273,droMoj1:0.074693):0.099332):0.051396,anoGam1:0.219717):0.118553,apiMel2:0.118553);
 #ave.noncons.mod:TREE: (((((((dm2:0.056840,droSim1:0.074761):0.045186,droYak1:0.138661):0.129841,droAna1:0.359716):0.063642,dp3:0.390643):0.058209,(droVir1:0.225781,droMoj1:0.248129):0.344408):0.181133,anoGam1:0.745987):0.406113,apiMel2:0.406113);
     # FIFTH ITERATION:
 #ave.cons.mod:TREE: (((((((dm2:0.017875,droSim1:0.024638):0.013564,droYak1:0.043255):0.037506,droAna1:0.106409):0.018128,dp3:0.115161):0.016304,(droVir1:0.067046,droMoj1:0.073340):0.097682):0.050261,anoGam1:0.215779):0.116296,apiMel2:0.116296);
 #ave.noncons.mod:TREE: (((((((dm2:0.056321,droSim1:0.074061):0.044780,droYak1:0.137445):0.128624,droAna1:0.356472):0.062873,dp3:0.387024):0.057266,(droVir1:0.223703,droMoj1:0.245817):0.341759):0.178650,anoGam1:0.739131):0.401733,apiMel2:0.401733);
     # SIXTH ITERATION:
 #ave.cons.mod:TREE: (((((((dm2:0.017924,droSim1:0.024709):0.013600,droYak1:0.043377):0.037613,droAna1:0.106707):0.018185,dp3:0.115488):0.016365,(droVir1:0.067237,droMoj1:0.073549):0.097937):0.050437,anoGam1:0.216405):0.116660,apiMel2:0.116660);
 #ave.noncons.mod:TREE: (((((((dm2:0.056410,droSim1:0.074182):0.044846,droYak1:0.137649):0.128814,droAna1:0.357039):0.062987,dp3:0.387708):0.057413,(droVir1:0.224066,droMoj1:0.246214):0.342186):0.179046,anoGam1:0.740329):0.402594,apiMel2:0.402594);
 
     cat cons_summary.txt 
     # look over the files cons_summary.txt and noncons_summary.txt.
     # The means and medians should be roughly equal and the stdevs
     # should be reasonably small compared to the means, particularly
     # for rate matrix parameters (at bottom) and for branches to the
     # leaves of the tree.  The stdevs may be fairly high for branches
     # near the root of the tree; that's okay.  Some min values may be
     # 0 for some parameters.  That's okay, but watch out for very large
     # values in the max column, which might skew the mean.  If you see
     # any signs of bad outliers, you may have to track down the
     # responsible .mod files and throw them out.  I've never had to do
     # this; the estimates generally seem pretty well behaved.
 
     # NOTE: Actually, a random sample of several hundred to a thousand
     # alignment fragments (say, a number equal to the number of
     # available cluster nodes) should be more than adequate for
     # parameter estimation.  If pressed for time, use this strategy.
 
     # Check the total entropy figure to see if we're way off.
     # We probably don't need to do this, since it has always been very close 
     # if not the same as what we used above, but it only takes a second.
     # FIRST ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.573 36.4 ave.{cons,noncons}.mod
 #Recommended expected length: omega=36.779882 sites (for L_min*H=9.783400)
     # OK, tweak --expected-lengths to 36.8 below (and for next iteration).
     # SECOND ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.52 32.2 ave.{cons,noncons}.mod
 #Recommended expected length: omega=32.239216 sites (for L_min*H=9.783400)
     # ==> keep at 32.2.
     # THIRD ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.45 27.1 ave.{cons,noncons}.mod
 #Recommended expected length: omega=27.102528 sites (for L_min*H=9.783400)
     # ==> keep at 27.1.
     # FOURTH ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.425 25.4 ave.{cons,noncons}.mod
 #Recommended expected length: omega=25.445150 sites (for L_min*H=9.783400)
     # ==> keep at 25.4
     # FIFTH ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.393 23.4 ave.{cons,noncons}.mod
 #Recommended expected length: omega=23.427025 sites (for L_min*H=9.783400)
     # ==> keep at 23.4
     # SIXTH ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.7 ave.{cons,noncons}.mod
 #Recommended expected length: omega=23.735901 sites (for L_min*H=9.783400)
     # ==> keep at 23.7
 
     # Now we are ready to set up the cluster job for computing the
     # conservation scores and predicted elements.  The we measure the 
     # conserved elements coverage, and if that's not satisfactory then we 
     # adjust parameters and repeat.  
     ssh kk9
     mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.phast
     cd /cluster/data/dm2/bed/multiz9way/phastCons/run.phast
     cat << 'EOF' > doPhastCons.sh
 #!/bin/csh -ef
 set pref = $1:t:r:r
 set chr = `echo $pref | awk -F\. '{print $1}'`
 set tmpfile = /scratch/phastCons.$$
 zcat $1 \
 | /cluster/bin/phast/phastCons - \
     ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
     --expected-lengths 23.7 --target-coverage 0.398 \
     --quiet --seqname $chr --idpref $pref \
     --viterbi /santest/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \
     --require-informative 0 \
   > $tmpfile
 gzip -c $tmpfile > /santest/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz
 rm $tmpfile
 'EOF'
 # << for emacs
     chmod a+x doPhastCons.sh
     rm -fr /santest/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
     mkdir -p /santest/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
     rm -f jobList
     foreach f (/santest/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
       echo doPhastCons.sh $f >> jobList
     end
     para create jobList
     para try, check, push, check, ...
 #Completed: 139 of 139 jobs
 #Average job time:                  18s       0.31m     0.01h    0.00d
 #Longest finished job:              29s       0.48m     0.01h    0.00d
 #Submission to last job:            46s       0.77m     0.01h    0.00d
 
     # back on kolossus:
     # combine predictions and transform scores to be in 0-1000 interval
     cd /cluster/data/dm2/bed/multiz9way/phastCons
     awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
       /santest/scratch/dm2/phastCons/ELEMENTS/*.bed \
     | /cluster/bin/scripts/lodToBedScore > all.bed
 
     ssh hgwdev
     # Now measure coverage of CDS by conserved elements. 
     # We want the "cover" figure to be close to 68.9%.
     cd /cluster/data/dm2/bed/multiz9way/phastCons
     featureBits -enrichment dm2 flyBaseGene:cds all.bed
     # FIRST ITERATION: too high; decrease --target-coverage, re-estimate.
 #flyBaseGene:cds 16.567%, all.bed 42.995%, both 12.251%, cover 73.95%, enrich 1.72x
     # SECOND ITERATION: still too high.
 #flyBaseGene:cds 16.567%, all.bed 41.408%, both 12.030%, cover 72.62%, enrich 1.75x
     # THIRD ITERATION: still...
 #flyBaseGene:cds 16.567%, all.bed 39.143%, both 11.693%, cover 70.58%, enrich 1.80x
     # FOURTH ITERATION: dang, should have used a calculator.
 #flyBaseGene:cds 16.567%, all.bed 38.329%, both 11.563%, cover 69.80%, enrich 1.82x
     # FIFTH ITERATION: close... but overshot.  Back off just a bit.
 #flyBaseGene:cds 16.567%, all.bed 37.277%, both 11.388%, cover 68.74%, enrich 1.84x
     # SIXTH ITERATION: done.
 #flyBaseGene:cds 16.567%, all.bed 37.445%, both 11.417%, cover 68.92%, enrich 1.84x
 
     # Having met the CDS coverage target, load up the results.
     hgLoadBed dm2 phastConsElements9way all.bed
 
     # Create wiggle on the small cluster
     ssh kki
     mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.wib
     cd /cluster/data/dm2/bed/multiz9way/phastCons/run.wib
     rm -rf /santest/scratch/dm2/phastCons/wib
     mkdir -p /santest/scratch/dm2/phastCons/wib
     cat << 'EOF' > doWigEncode
 #!/bin/csh -ef
 set chr = $1
 cd /santest/scratch/dm2/phastCons/wib
 zcat `ls -1 /santest/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
       | sort -t\. -k2,2n` \
 | wigEncode stdin ${chr}_phastCons.wi{g,b}
 'EOF'
 # << for emacs
     chmod a+x doWigEncode
     rm -f jobList
     foreach chr (`ls -1 /santest/scratch/dm2/phastCons/POSTPROBS \
                   | awk -F\. '{print $1}' | sort -u`)
       echo doWigEncode $chr >> jobList
     end
     para create jobList
     para try, check, push, check, ...
 #Completed: 13 of 13 jobs
 #Average job time:                  10s       0.16m     0.00h    0.00d
 #Longest finished job:              23s       0.38m     0.01h    0.00d
 #Submission to last job:            23s       0.38m     0.01h    0.00d
 
     # back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want 
     # the raw scores) from santest
     cd /cluster/data/dm2/bed/multiz9way/phastCons
     rm -rf wib POSTPROBS
     rsync -av /santest/scratch/dm2/phastCons/wib .
     rsync -av /santest/scratch/dm2/phastCons/POSTPROBS .
 
     # load wiggle component of Conservation track
     ssh hgwdev
     mkdir /gbdb/dm2/multiz9way/wib
     cd /cluster/data/dm2/bed/multiz9way/phastCons
     chmod 775 . wib
     chmod 664 wib/*.wib
     ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz9way/wib/
     hgLoadWiggle dm2 phastCons9way \
       -pathPrefix=/gbdb/dm2/multiz9way/wib wib/*.wig
     rm wiggle.tab
 
     # make top-5000 list and launcher on Adam's home page:
     sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
     | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
     > top5000.bed
     /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
       /cse/grads/acs/public_html/dm-top5000-9way \
       "top 5000 conserved elements (9way)" dm2
 
     # and clean up santest.
     rm -r /santest/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib}
     rm -r /santest/scratch/dm2/chrom
     # Offer raw scores for download since fly folks are likely to be interested:
     ssh kkstore01
     cd /cluster/data/dm2/bed/multiz9way/phastCons/POSTPROBS
     mkdir ../postprobsDownload
     foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
       zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
         > ../postprobsDownload/$chr.pp.gz
     end
     cd ../postprobsDownload
     md5sum *.gz > md5sum.txt
     # Make a README.txt there too.
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons9way
     cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons9way
     ln -s /cluster/data/dm2/bed/multiz9way/phastCons/postprobsDownload/* .
 
 
 # MAP FEEP PROBES (DONE 2/3/05 angie)
     ssh kolossus
     mkdir /cluster/data/dm2/bed/flyFeep
     cd /cluster/data/dm2/bed/flyFeep
     set feepDir = /projects/compbio/data/microarray/flyFEEP
     # Make per-chrom probe FASTA files:
     foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh)
       echo $c
       awk '$3 == "'$c'" {printf ">%s\n%s\n", $1, $7;}' \
         $feepDir/probes3.1-Exons.1_apr11_exon \
         $feepDir/probes3.1-Noncoding.1_apr11_tiling \
         > chr${c}_exonNonExonProbes.fa
       awk '$1 != "ID" && $5 == "'$c'+" {printf ">%s\n%s\n", $1, $8;}' \
         $feepDir/probes3.1-junctions.txt \
         > chr${c}_junctionProbes.fa
     end
     # For exon/non-exon probes (36 consecutive bases), require 36 matching 
     # bases (for some reason -minScore=36 doesn't filter anything out!).
     foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh)
       blat -noHead \
         /cluster/data/dm2/nib/chr$c.nib chr${c}_exonNonExonProbes.fa stdout \
       | grep ^36 | uniq \
       > chr${c}_exonNonExonProbes.psl
     end
     # For splice junction probes (2 blocks of 18 consecutive bases), 
     # use a small tileSize and -fine, and still require 36 matching bases:
     # This was slow, do as small cluster job next time, will still take hours.
     foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh)
       blat -noHead -tileSize=6 -fine \
         /cluster/data/dm2/nib/chr$c.nib chr${c}_junctionProbes.fa stdout \
       | grep ^36 | uniq \
       > chr${c}_junctionProbes.psl
     end
     # Check on how many probes were aligned:
     foreach f (chr*exon*.fa chr*junc*.fa)
       set probes = `cat $f | wc -l`
       set probes = `expr $probes / 2`
       set aligned = `cat $f:r.psl | wc -l`
       set reallyAligned = `awk '{print $10;}' $f:r.psl | sort -u | wc -l `
       set not = `expr $probes - $reallyAligned`
       echo $f:r"\t"$probes"\t"$aligned"\t"$reallyAligned"\t"\($not\)
     end
     # set			count	aligned	reallyA	not
 #chr2h_exonNonExonProbes	644	611	611	(33)
 #chr2L_exonNonExonProbes	26631	26749	26627	(4)
 #chr2R_exonNonExonProbes	26100	26111	26098	(2)
 #chr3h_exonNonExonProbes	833	833	833	(0)
 #chr3L_exonNonExonProbes	28756	28773	28749	(7)
 #chr3R_exonNonExonProbes	35060	35064	35055	(5)
 #chr4_exonNonExonProbes	    	1798	1797	1797	(1)
 #chrX_exonNonExonProbes	    	28912	28917	28912	(0)
 #chrXh_exonNonExonProbes	354	354	354	(0)
 #chrYh_exonNonExonProbes	97	97	97	(0)
 #chr2h_junctionProbes    	3       3       3       (0)
 #chr2L_junctionProbes    	5048    5015    5012    (36)
 #chr2R_junctionProbes        	5697    5709    5652    (45)
 #chr3h_junctionProbes        	1       1       1       (0)
 #chr3L_junctionProbes        	6498    6437    6429    (69)
 #chr3R_junctionProbes        	8747    8695    8684    (63)
 #chr4_junctionProbes         	1056    1048    1048    (8)
 #chrXh_junctionProbes        	50      50      50      (0)
 #chrX_junctionProbes         	3688    3684    3680    (8)
 #chrYh_junctionProbes        	0       0       0       (0)
     # Most probes successfully mapped, but a fair number of duplicates.  
     # Dug into one of those (159937) and found that its probe sequence 
     # halves were from a tandem repeat!  (And didn't map to where they 
     # were supposed to in dm1 either.)  I suspect the probe sequences 
     # might not be correct in the files I downloaded.  Sent Chris Mason 
     # an email about that.  
     # If the probe dm1 locations are correct (I sure hope so), just not 
     # some given probe sequences, I could extract dm1 sequence and map 
     # that to dm2 instead of mapping the given probes as above.  
 
     # Translate PSL to bed12 (this is too simple for PSL in general, but 
     # since we require the entire query to map perfectly, it works here):
     awk '{printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t%d\t%s\t%s\n", \
                  $14, $16, $17, $10, $9, $16, $17, $18, $19, $20;}' \
       chr*.psl \
       > flyFeepProbesBed12.bed
 
     # Load probes
     ssh hgwdev
     cd /cluster/data/dm2/bed/flyFeep
     # Add scores to bed:
     hgMapMicroarray -bedIn flyFeep.bed hgFixed.flyFeepMedianRatio \
       flyFeepProbesBed12.bed
     hgLoadBed dm2 flyFeep flyFeep.bed
     # (back on fileserver) Winnow probes by Bussemaker lab's lists:
     set dm1FeepDir = /cluster/data/dm1/bed/flyFeep
     $dm1FeepDir/winnowBed.pl $dm1FeepDir/probesExpressedAboveBackground.txt \
       flyFeep.bed \
       > flyFeepPEAB.bed
     $dm1FeepDir/winnowBed.pl $dm1FeepDir/probesAnovaDiffExpressed.txt \
       flyFeep.bed \
       > flyFeepAnova.bed
     # (back on hgwdev) Load winnowed sets:
     hgLoadBed dm2 flyFeepPEAB flyFeepPEAB.bed
     hgLoadBed dm2 flyFeepAnova flyFeepAnova.bed
 
 
 # FLYBASE 4.1 ANNOTATIONS (DONE 2/28/05 angie)
     ssh kksilo
     mkdir /cluster/data/dm2/bed/flybase4.1
     cd /cluster/data/dm2/bed/flybase4.1
     foreach c (2L 2R 3L 3R 4 X)
       wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.1_20050207/gff/dmel-$c-r4.1.gff.gz
     end
     zcat *.gff.gz > flybase.gff3
     # What data sources are represented in this file?
     grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
     # excerpt (many other sources, including blastx:... , sim4:... and 
     # tblastx:...; also various other types for source "."):
   18941 .       CDS
   63033 .       exon
   14066 .       gene
   18941 .       mRNA
     144 .       ncRNA
      39 .       pseudogene
      96 .       rRNA
      29 .       snRNA
      28 .       snoRNA
     295 .       tRNA
   36921 .       transcription_start_site
    1571 .       transposable_element
   16404 .       transposable_element_insertion_site
     # What keywords are defined in the 9th field?
     grep -v '^#' flybase.gff3 \
     | awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \
     | sort | uniq -c
     # Once again, the previous round's parsing needed some updates to 
     # handle the new data.
     extractGenes.pl flybase.gff3
 #Oddball parentless ID=Sps2-exon-10336472..10336531 for exon
 
     # Get predicted proteins (for main annotations only)
     wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.1_20050207/fasta/dmel-all-translation-r4.1.fasta.gz
     zcat dmel-all-translation-r4.1.fasta.gz \
     | perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' > flybasePep.fa
 
     ssh hgwdev
     cd /cluster/data/dm2/bed/flybase4.1
     # Protein-coding genes:
     ldHgGene -gtf dm2 flyBaseGene flybase.gtf
     hgPepPred dm2 generic flyBasePep flybasePep.fa
     # Fix typo caught by Galt & all.joiner:
     hgsql dm2 -e 'update flyBasePep set name = "CG6207-RC" where name = "GLCAT-P-PC"'
     # Noncoding genes:
     hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
     # Cross-referencing info for both coding and noncoding:
     hgsql dm2 < $HOME/kent/src/hg/lib/flyBase2004Xref.sql
     hgsql dm2 -e 'load data local infile "flyBase2004Xref.tab" \
       into table flyBase2004Xref'
     # Some featureBits comparisons with refGene which is pretty much like 
     # version 4.0 but apparently includes some noncoding genes too:
     featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout
 #chr3R   8221731 8222987 chr3R.1
 #...
 #87577 bases of 131698467 (0.066%) in intersection
     # This shows only the dropped isoforms/duplicates (ignores noncoding):
     featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \
        -bed=stdout
 #chr3R   15620310        15621493        chr3R.1
 #chr2L   4698094 4700940 chr2L.1
 #chrX    3638700 3639895 chrX.1
 #49911 bases of 131698467 (0.038%) in intersection
 
     # add upstream* downloadable files
     cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
     foreach size (1000 2000 5000)
       echo upstream$size
       nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
       | nice gzip -c > upstream$size.fa.gz
     end
     md5sum *.zip *.gz > md5sum.txt
 
 
 # FLYREG (DONE 2/23/05 angie)
     ssh kksilo
     mkdir /cluster/data/dm2/bed/flyreg
     cd /cluster/data/dm2/bed/flyreg
     wget http://www.gen.cam.ac.uk/casey/data/Bergman2004/v2.0/Footprint.GFF
     # This is not GTF; it should really be bed +.  The contributor, 
     # Casey Bergman, says that coords are 0-based half-open, so 
     # translation will be even easier.
     grep -v '^#' Footprint.GFF \
     | perl -wpe 'if (! s/^(\S+)\tBergman_data\tbinding_site\t(\d+)\t(\d+)\t.\t.\t.\tFactor \"([^\"]+)\"; Target \"([^\"]+)\"; PMID \"(\d+)\"; FPID \"(\d+)\"$/chr$1\t$2\t$3\t$4\t$5\t$6\t$7/) { die "Cant parse line $.:\n$_\n"; }' \
       > flyreg2.bed
     ssh hgwdev
     hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/flyreg2.sql \
       dm2 flyreg2 /cluster/data/dm2/bed/flyreg/flyreg2.bed
     # Add Dan Pollard's MEME motif data for the footprints:
     ssh kksilo
     cd /cluster/data/dm2/bed/flyreg
     wget http://rana.lbl.gov/~dan/matrices/bergman2004/footprint_matrices.txt
     /cluster/data/dm1/bed/flyreg/extractMatrices.pl \
       footprint_matrices.txt > flyregMotif.tab
     ssh hgwdev
     cd /cluster/data/dm2/bed/flyreg
     sed -e 's/dnaMotif/flyregMotif/' $HOME/kent/src/hg/lib/dnaMotif.sql \
     > flyregMotif.sql
     hgsql dm2 < flyregMotif.sql
     hgsql dm2 -e 'load data local infile "flyregMotif.tab" into table flyregMotif'
 
 
 # LIFTOVER DM1 FLYREG, COMPARE TO DM2 FLYREG (DONE 2/25/05 angie)
     # Casey Bergman is very eager to see alternate confirmation of his 
     # dm1->dm2 mapping of flyreg, so run liftOver on it:
     ssh kksilo
     cd /cluster/data/dm2/bed/flyreg
     liftOver /cluster/data/dm1/bed/flyreg/flyreg.bed \
       /cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \
       flyregLift.bed flyregLift.unmapped
     wc -l flyreg2.bed flyregLift.bed flyregLift.unmapped 
 #  1362 flyreg2.bed
 #  1363 flyregLift.bed
 #     0 flyregLift.unmapped
     awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5;}' flyreg2.bed | sort \
       > /tmp/1
     awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5;}' flyregLift.bed | sort \
       > /tmp/2
     diff -c /tmp/[12] > newVsLift
     # Analyzed the differences: 2 duplicates in original (uniquified in v2),
     # 1 new item in v2, and 4 items incorrectly mapped by liftOver (fooled 
     # by what looks like a slightly diverged local duplication) but probably
     # correctly mapped by Casey.
 
 
 # SELF ALIGNMENTS (DONE 2/24/05 angie)
     # Doing this largely as a test of doBlastzChainNet.pl...
     ssh kksilo
     mkdir /cluster/data/dm2/bed/blastz.dm2.2005-02-23
     cd /cluster/data/dm2/bed/blastz.dm2.2005-02-23
     cat << '_EOF_' > DEF
 # D. melanogaster vs. self
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. melanogaster
 SEQ2_DIR=/iscratch/i/dm2/nib
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/dm2/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.dm2.2005-02-23
 
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2SelfOut >& do.log &
     tail -f do.log
     # The script died at the start of the cat step when kki couldn't see 
     # /panasas (doh!).  Asked cluster-admin to fix that.  
     # When fixed, restarted with -continue, appending to do.log:
     rm -r run.cat
     doBlastzChainNet.pl -continue cat DEF \
       -blastzOutRoot /panasas/store/dm2SelfOut >>& do.log &
     tail -f do.log
     rmdir /panasas/store/dm2SelfOut
     ln -s blastz.dm2.2005-02-23 /cluster/data/dm2/bed/blastz.dm2
     # chainSelf is already in top-level trackDb.ra, so no need to add.
     # Add /usr/local/apache/htdocs/goldenPath/dm2/vsSelf/README.txt
 
 
 # BLASTZ/CHAIN/NET DROSIM1 (DONE 4/13/05 angie)
 # -- Originally run with default blastz params; replaced by a run with 
 # better params, below.
 
 
 # BLASTZ/CHAIN/NET ANOGAM1 -- LOOSER PARAMS (DONE 5/19/05 angie)
     # Lower the L threshold from 6000 to 4000 (5-18) and then 3000 (5-19), 
     # see if that increases coverage from this measurement from the previous 
     # "human-fugu" params:
     # featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainAnoGam1Link
 #flyBaseGene 22.967%, chainAnoGam1Link 15.999%, both 12.567%, cover 54.72%, enrich 3.42x
 # 5/19, L=3000:  too much...  low-complexity-anchored alignments cluttering.
 # but interesting to know we can get this kind of coverage:
 #flyBaseGene 22.967%, chainAnoGam1Link 60.227%, both 18.542%, cover 80.73%, enrich 1.34x
     # So use results of this L=4000 run:
     ssh kkstore01
     mkdir /cluster/data/dm2/bed/blastz.anoGam1.2005-05-18
     cd /cluster/data/dm2/bed/blastz.anoGam1.2005-05-18
     cat << '_EOF_' > DEF
 # D.melanogaster vs. A. gambiae
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - A. gambiae
 SEQ2_DIR=/iscratch/i/anoGam1/nib
 SEQ2_CHUNK=5000000
 SEQ2_LAP=0
 SEQ2_LEN=/cluster/data/anoGam1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.anoGam1.2005-05-18
 
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /santest/scratch/dm2anoGam1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainAnoGam1Link
 # 5/18, L=4000:
 #flyBaseGene 22.967%, chainAnoGam1Link 22.072%, both 14.077%, cover 61.29%, enrich 2.78x
     # Looks good in the browser -- top-level net chains extend farther 
     # without too much extra crap like we saw with L=3000.
     rm /cluster/data/dm2/bed/blastz.anoGam1
     ln -s blastz.anoGam1.2005-05-18 /cluster/data/dm2/bed/blastz.anoGam1
 
 
 # BLASTZ/CHAIN/NET APIMEL2 -- LOOSER PARAMS (DONE 5/19/05 angie)
     # Since L=4000 noticeably helped mosquito, try it on the honeybee,
     # which had this coverage with L=6000:
 #flyBaseGene 22.967%, chainApiMel2Link 13.767%, both 7.802%, cover 33.97%, enrich 2.47x
     mkdir /cluster/data/dm2/bed/blastz.apiMel2.2005-05-19
     cd /cluster/data/dm2/bed/blastz.apiMel2.2005-05-19
     cat << '_EOF_' > DEF
 # D.melanogaster vs. A. mellifera
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - A. mellifer
 SEQ2_DIR=/iscratch/i/apiMel2/nib
 SEQ2_CHUNK=5000000
 SEQ2_LAP=0
 SEQ2_LEN=/cluster/data/apiMel2/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.apiMel2.2005-05-19
 
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /santest/scratch/dm2apiMel2 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainApiMel2Link
 #flyBaseGene 22.967%, chainApiMel2Link 36.701%, both 11.095%, cover 48.31%, enrich 1.32x
     # Wow, quite an increase in coverage!  And drop in enrichment... some 
     # of the new alignments are on the low-complexity side but not nearly 
     # as bad as mosquito at L=3000.  Never know what to expect from these 
     # param experiments!  :|  Well, using L=4000 seems like a net gain so 
     # go with it...
     rm /cluster/data/dm2/bed/blastz.apiMel2
     ln -s blastz.apiMel2.2005-05-19 /cluster/data/dm2/bed/blastz.apiMel2
 
 
 # BLASTZ/CHAIN/NET DROMOJ1 -- LOOSER PARAMS (DONE 5/19/05 angie)
     # Here's what L=6000 got us:
 #flyBaseGene 22.967%, chainDroMoj1Link 59.129%, both 20.159%, cover 87.77%, enrich 1.48x
     # OK, let's see what droMoj1 looks like at L=4000...
     mkdir /cluster/data/dm2/bed/blastz.droMoj1.2005-05-19
     cd /cluster/data/dm2/bed/blastz.droMoj1.2005-05-19
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. mojavensis
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. mojavensis
 SEQ2_DIR=/iscratch/i/droMoj1/droMoj1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droMoj1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droMoj1.2005-05-19
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /santest/scratch/dm2droMoj1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj1Link
 #flyBaseGene 22.967%, chainDroMoj1Link 65.901%, both 20.698%, cover 90.12%, enrich 1.37x
     # Wow, actually looks pretty good in the browser.  Finds more non-top-
     # level alignments than L=6000, but not a huge number, and most look 
     # reasonable in the detailed view.
     rm -f /cluster/data/dm2/bed/blastz.droMoj1
     ln -s blastz.droMoj1.2005-05-19 /cluster/data/dm2/bed/blastz.droMoj1
 
 
 # BLASTZ/CHAIN/NET DROVIR1 -- LOOSER PARAMS (DONE 5/20/05 angie)
     # coverage with default blastz params:
 #flyBaseGene 22.967%, chainDroVir1Link 47.957%, both 18.585%, cover 80.92%, enrich 1.69x
     # OK, let's see what droVir1 looks like at L=4000...
     mkdir /cluster/data/dm2/bed/blastz.droVir1.2005-05-19
     cd /cluster/data/dm2/bed/blastz.droVir1.2005-05-19
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. virilis
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. virilis
 SEQ2_DIR=/santest/scratch/droVir1/droVir1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/santest/scratch/droVir1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droVir1.2005-05-19
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /santest/scratch/dm2droVir1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir1Link
 #flyBaseGene 22.967%, chainDroVir1Link 66.657%, both 20.876%, cover 90.90%, enrich 1.36x
     rm -f /cluster/data/dm2/bed/blastz.droVir1
     ln -s blastz.droVir1.2005-05-19 /cluster/data/dm2/bed/blastz.droVir1
 
 
 # BLASTZ/CHAIN/NET DP3 -- LOOSER PARAMS (DONE 5/20/05 angie)
     # coverage with default blastz params was not too bad but let's see if 
     # we can do better:
 #flyBaseGene 22.967%, chainDp3Link 67.232%, both 20.112%, cover 87.57%, enrich 1.30x
     mkdir /cluster/data/dm2/bed/blastz.dp3.2005-05-20
     cd /cluster/data/dm2/bed/blastz.dp3.2005-05-20
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. pseudoobscura
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. pseudoobscura
 SEQ2_DIR=/iscratch/i/dp3/nib
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/dp3/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.dp3.2005-05-20
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /santest/scratch/dm2dp3 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDp3Link
 #flyBaseGene 22.967%, chainDp3Link 74.901%, both 21.335%, cover 92.89%, enrich 1.24x
     # Again, reasonable extension of coverage without too much crap.
     rm -f /cluster/data/dm2/bed/blastz.dp3
     ln -s blastz.dp3.2005-05-20 /cluster/data/dm2/bed/blastz.dp3
 
 
 # BLASTZ/CHAIN/NET DROANA1 -- LOOSER PARAMS (DONE 5/21/05 angie)
     # coverage with default blastz params:
 #flyBaseGene 22.967%, chainDroAna1Link 73.508%, both 20.981%, cover 91.35%, enrich 1.24x
     # OK, let's see what droAna1 looks like at L=4000...
     mkdir /cluster/data/dm2/bed/blastz.droAna1.2005-05-20
     cd /cluster/data/dm2/bed/blastz.droAna1.2005-05-20
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. ananassae
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. ananassae
 SEQ2_DIR=/santest/scratch/droAna1/droAna1.2bit
 SEQ2_CHUNK=1000000
 SEQ2_LAP=10000
 SEQ2_LEN=/santest/scratch/droAna1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droAna1.2005-05-20
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /santest/scratch/dm2droAna1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna1Link
 #flyBaseGene 22.967%, chainDroAna1Link 80.519%, both 21.949%, cover 95.57%, enrich 1.19x
     rm -f /cluster/data/dm2/bed/blastz.droAna1
     ln -s blastz.droAna1.2005-05-20 /cluster/data/dm2/bed/blastz.droAna1
 
 
 # BLASTZ/CHAIN/NET DROYAK1 -- LOOSER PARAMS (DONE 5/21/05 angie)
     # coverage with default blastz params:
 #flyBaseGene 22.967%, chainDroYak1Link 90.253%, both 22.586%, cover 98.34%, enrich 1.09x
     # OK, let's see what droYak1 looks like at L=4000...
     mkdir /cluster/data/dm2/bed/blastz.droYak1.2005-05-20
     cd /cluster/data/dm2/bed/blastz.droYak1.2005-05-20
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. yakuba
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. yakuba
 SEQ2_DIR=/iscratch/i/droYak1/nib
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droYak1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droYak1.2005-05-20
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /santest/scratch/dm2droYak1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroYak1Link
 #flyBaseGene 22.967%, chainDroYak1Link 91.896%, both 22.736%, cover 98.99%, enrich 1.08x
     rm -f /cluster/data/dm2/bed/blastz.droYak1
     ln -s blastz.droYak1.2005-05-20 /cluster/data/dm2/bed/blastz.droYak1
 
 
 # BLASTZ/CHAIN/NET DROSIM1 -- LOOSER PARAMS (DONE 5/21/05 angie)
     # coverage with default blastz params:
 #flyBaseGene 22.967%, chainDroSim1Link 90.656%, both 22.139%, cover 96.40%, enrich 1.06x
     # OK, let's see what droSim1 looks like at L=4000...
     mkdir /cluster/data/dm2/bed/blastz.droSim1.2005-05-21
     cd /cluster/data/dm2/bed/blastz.droSim1.2005-05-21
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. simulans
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=5000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. simulans
 SEQ2_DIR=/iscratch/i/droSim1/droSim1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droSim1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droSim1.2005-05-21
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /santest/scratch/dm2droSim1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroSim1Link
 #flyBaseGene 22.967%, chainDroSim1Link 91.756%, both 22.420%, cover 97.62%, enrich 1.06x
     rm -f /cluster/data/dm2/bed/blastz.droSim1
     ln -s blastz.droSim1.2005-05-21 /cluster/data/dm2/bed/blastz.droSim1
 
 # AUGUSTUS GENE PREDICTIONS (Done 6/1/2005 Andy)
     ssh hgwdev
     cd /cluster/data/dm2/bed
     mkdir augustus
     cd augustus/
     cat > cleanAugustus.awk << _EOF_
 # Add the chrom name to the gene and transcript IDs.
 {
    ch = \$1
    while (match(\$0, /"g[0-9\.]+"/))
        {
        before = substr(\$0, 1, RSTART)
        name = substr(\$0, RSTART+1, RLENGTH)
        after = substr(\$0, RSTART + RLENGTH + 1)
        \$0 = before "" ch "." name "" after
        }
    print
 }
 _EOF_
     wget http://augustus.gobics.de/predictions/dm2/dm2.allchr.augustus.gtf.gz
     zcat dm2.allchr.augustus.gtf.gz | awk -f cleanAugustus.awk | gzip > dm2.allchr.augustus.clean.gtf.gz
     ldHgGene -gtf dm2 augustus dm2.allchr.augustus.clean.gtf.gz
     rm dm2.allchr.augustus.gtf.gz
 
 # BLAT FlyBase predicted DM proteins against DM (DONE 2005-06-27 braney)
     ssh hgwdev
     cd /cluster/data/dm2/bed
     mkdir blat.dm2FB
     cd blat.dm2FB
     pepPredToFa dm2 flyBasePep dm2FB.fa
     hgPepPred dm2 generic blastFBPep01 dm2FB.fa
     ssh kk
     cd /cluster/data/dm2/bed/blat.dm2FB
     cat << '_EOF_' > blatSome
 #!/bin/csh -fe
 /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
 '_EOF_'
     # << keep emacs happy
     chmod +x blatSome
     ls -1S /iscratch/i/dm2/nib/*.nib > bug.lst
     mkdir fbfas
     cd fbfas
     faSplit sequence ../dm2FB.fa 5010 fb
     cd ..
     ls -1S fbfas/*.fa > fb.lst
     cat << '_EOF_' > blatGsub
 #LOOP
 blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << keep emacs happy
     gensub2 bug.lst fb.lst blatGsub blatSpec
     mkdir psl
     cd psl
     foreach i (`cat ../bug.lst`)
 	mkdir `basename $i .nib`
     end
     cd ..
     para create blatSpec
     para push
 
 # Completed: 63193 of 63193 jobs
 # CPU time in finished jobs:     712543s   11875.72m   197.93h    8.25d  0.023 y
 # IO & Wait Time:                172878s    2881.30m    48.02h    2.00d  0.005 y
 # Average job time:                  14s       0.23m     0.00h    0.00d
 # Longest finished job:            2172s      36.20m     0.60h    0.03d
 # Submission to last job:          3192s      53.20m     0.89h    0.04d
 
     ssh eieio
     cd /cluster/data/dm2/bed/blat.dm2FB
     pslSort dirs raw.psl /tmp psl/*
     pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cov90.psl /dev/null
     sort -rn cov90.psl | pslUniq stdin dm2FB.psl
     pslxToFa dm2FB.psl dm2FB_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
     fbName dm2 dm2FB.psl blastFBRef01
     ssh hgwdev
     cd /cluster/data/dm2/bed/blat.dm2FB
     hgsql dm2 < ~/kent/src/hg/lib/blastRef.sql
     echo "rename table blastRef to blastFBRef01" | hgsql dm2
     echo "load data local infile 'blastFBRef01' into table blastFBRef01" | hgsql dm2
 
 # MAKE Human Proteins track (DONE 06-30-05 braney)
     ssh kk
     mkdir -p /cluster/data/dm2/blastDb
     cd /cluster/data/dm2/blastDb
     for i in ../*/*/*_[0-9]*_[0-9]*.fa; do ln -s $i .; done
     for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
     rm *.fa *.log
 
     ssh kkr1u00
     mkdir -p /iscratch/i/dm2/blastDb
     cp /cluster/data/dm2/blastDb/* /iscratch/i/dm2/blastDb
     iSync 2>&1 > sync.out
 
     mkdir -p /cluster/data/dm2/bed/tblastn.hg17KG
     cd /cluster/data/dm2/bed/tblastn.hg17KG
     ls -1S /iscratch/i/dm2/blastDb/*.nsq | sed "s/\.nsq//" > target.lst
 
     mkdir kgfa
     # calculate a reasonable number of jobs
     calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(150000/`wc target.lst | awk "{print \\\$1}"`\)
 # 37365/(150000/288) = 71.740800
 
     split -l 72 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
     cd kgfa
     for i in *; do pslxToFa $i $i.fa; rm $i; done
     cd ..
     ls -1S kgfa/*.fa > kg.lst
     rm -rf  /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut
     mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut
     ln -s  /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut
     for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
     tcsh
     cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
 #ENDLOOP
 '_EOF_'
     cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/iscratch/i/blast/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
 if /cluster/bin/i386/blastToPsl $f.1 $f.2
 then
 	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/dm2/jkStuff/subChr.lft warn $f.2  
 	liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/dm2/jkStuff/liftAll.lft warn $f.3  
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4
         mv $3.tmp $3
         rm -f $f.1 $f.2 $f.3 $f.4
         exit 0
     fi
 fi
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
 
     chmod +x blastSome
     gensub2 target.lst kg.lst blastGsub blastSpec
 
     ssh kk
     cd /cluster/data/dm2/bed/tblastn.hg17KG
     para create blastSpec
     para push
 # Completed: 149472 of 149472 jobs
 # CPU time in finished jobs:    5837692s   97294.87m  1621.58h   67.57d  0.185 y
 # IO & Wait Time:               1404308s   23405.13m   390.09h   16.25d  0.045 y
 # Average job time:                  48s       0.81m     0.01h    0.00d
 # Longest finished job:            1095s      18.25m     0.30h    0.01d
 # Submission to last job:         50465s     841.08m    14.02h    0.58d
 
     cat << '_EOF_' > chainGsub
 #LOOP
 chainSome $(path1)
 #ENDLOOP
 '_EOF_'
 
     ssh kki
     cd /cluster/data/dm2/bed/tblastn.hg17KG
     tcsh
     cat << '_EOF_' > chainSome
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainSome
 
     ls -1dS `pwd`/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
 
     para create chainSpec
     para push
 
 # Completed: 519 of 519 jobs
 # CPU time in finished jobs:       3835s      63.91m     1.07h    0.04d  0.000 y
 # IO & Wait Time:                  5794s      96.57m     1.61h    0.07d  0.000 y
 # Average job time:                  19s       0.31m     0.01h    0.00d
 # Longest finished job:             114s       1.90m     0.03h    0.00d
 # Submission to last job:           824s      13.73m     0.23h    0.01d
 
     ssh kkstore01
     cd /cluster/data/dm2/bed/tblastn.hg17KG/blastOut
     for i in kg??
     do 
 	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
 	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
 	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
 	echo $i
     done
 
     sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/dm2/bed/tblastn.hg17KG/blastHg17KG.psl
     cd ..
 
     ssh hgwdev
     cd /cluster/data/dm2/bed/tblastn.hg17KG
     hgLoadPsl dm2 blastHg17KG.psl
     exit
 
     # back to kksilo
     rm -rf blastOut
 
 # End tblastn
 
 
 # BLASTZ/CHAIN/NET TRICAS1 (DONE 7/11/05 Andy)
     ssh kkstore01
     mkdir /cluster/data/dm2/bed/blastz.triCas1.2005-07-10
     cd /cluster/data/dm2/bed/blastz.triCas1.2005-07-10    
     cat << "_EOF_" > DEF
 # D. melanogaster vs. T. castaneum
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. virilis
 SEQ2_DIR=/panasas/store/triCas1/triCas1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/panasas/store/triCas1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.triCas1.2005-07-10
 _EOF_
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2triCas1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir1Link
 #flyBaseGene 22.967%, chainDroVir1Link 66.657%, both 20.876%, cover 90.90%, enrich 1.36x
     rm -f /cluster/data/dm2/bed/blastz.triCas1
     ln -s blastz.triCas1.2005-05-19 /cluster/data/dm2/bed/blastz.triCas1
 
 
 # GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie)
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/geneid
     cd /cluster/data/dm2/bed/geneid
     foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
       wget http://genome.imim.es/genepredictions/D.melanogaster/golden_path_200404/geneidv1.2/$chr.gtf
     end
     ldHgGene -gtf -genePredExt dm2 geneid *.gtf
 
 
 # BLASTZ/CHAIN/NET DROERE1 (DONE 8/8/05 angie)
     mkdir /cluster/data/dm2/bed/blastz.droEre1.2005-08-08
     cd /cluster/data/dm2/bed/blastz.droEre1.2005-08-08
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. erecta
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. erecta
 SEQ2_DIR=/iscratch/i/droEre1/droEre1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droEre1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droEre1.2005-08-08
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droEre1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroEre1Link
 #flyBaseGene 22.967%, chainDroEre1Link 91.103%, both 22.725%, cover 98.95%, enrich 1.09x
     rm -f /cluster/data/dm2/bed/blastz.droEre1
     ln -s blastz.droEre1.2005-08-08 /cluster/data/dm2/bed/blastz.droEre1
 
 
 # BLASTZ/CHAIN/NET DROANA2 (DONE 8/8/05 angie)
     mkdir /cluster/data/dm2/bed/blastz.droAna2.2005-08-08
     cd /cluster/data/dm2/bed/blastz.droAna2.2005-08-08
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. ananassae
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. ananassae
 SEQ2_DIR=/iscratch/i/droAna2/droAna2.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droAna2/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droAna2.2005-08-08
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droAna2 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna2Link
 #flyBaseGene 22.967%, chainDroAna2Link 81.385%, both 22.050%, cover 96.01%, enrich 1.18x
     rm -f /cluster/data/dm2/bed/blastz.droAna2
     ln -s blastz.droAna2.2005-08-08 /cluster/data/dm2/bed/blastz.droAna2
 
 
 # BLASTZ/CHAIN/NET DROMOJ2 (DONE 8/9/05 angie)
     mkdir /cluster/data/dm2/bed/blastz.droMoj2.2005-08-08
     cd /cluster/data/dm2/bed/blastz.droMoj2.2005-08-08
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. mojavensis
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. mojavensis
 SEQ2_DIR=/iscratch/i/droMoj2/droMoj2.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droMoj2/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droMoj2.2005-08-08
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droMoj2 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj2Link
 #flyBaseGene 22.967%, chainDroMoj2Link 66.524%, both 20.799%, cover 90.56%, enrich 1.36x
     rm -f /cluster/data/dm2/bed/blastz.droMoj2
     ln -s blastz.droMoj2.2005-08-08 /cluster/data/dm2/bed/blastz.droMoj2
 
 
 # BLASTZ/CHAIN/NET DROGRI1 (DONE 8/9/05 angie)
     mkdir /cluster/data/dm2/bed/blastz.droGri1.2005-08-08
     cd /cluster/data/dm2/bed/blastz.droGri1.2005-08-08
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. grimshawi
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. grimshawi
 SEQ2_DIR=/iscratch/i/droGri1/droGri1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droGri1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droGri1.2005-08-08
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droGri1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroGri1Link
 #flyBaseGene 22.967%, chainDroGri1Link 68.520%, both 20.822%, cover 90.66%, enrich 1.32x
     rm -f /cluster/data/dm2/bed/blastz.droGri1
     ln -s blastz.droGri1.2005-08-08 /cluster/data/dm2/bed/blastz.droGri1
 
 
 # BLASTZ/CHAIN/NET DROVIR2 (DONE 8/12/05 angie)
     mkdir /cluster/data/dm2/bed/blastz.droVir2.2005-08-11
     cd /cluster/data/dm2/bed/blastz.droVir2.2005-08-11
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. virilis
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. virilis
 SEQ2_DIR=/iscratch/i/droVir2/droVir2.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droVir2/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droVir2.2005-08-11
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droVir2 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir2Link
 #flyBaseGene 22.967%, chainDroVir2Link 67.025%, both 20.901%, cover 91.01%, enrich 1.36x
     rm -f /cluster/data/dm2/bed/blastz.droVir2
     ln -s blastz.droVir2.2005-08-11 /cluster/data/dm2/bed/blastz.droVir2
 
 
 # RE-RUN NETTOAXT, AXTTOMAF IN PREP FOR MULTIZ (DONE 8/19/05 angie)
     # Kate checked in a fix to netToAxt, to prevent overlapping blocks 
     # which were causing problems for (display of) multiple alignments.  
     # Before running multiz, regenerate the axtNet/ and mafNet/ pairwise 
     # inputs (axtNet downloads should probably be repushed after this).  
     # The /cluster/data/dm2/bed/blastz.*/axtChain/netChains.csh files 
     # contain the original axtNet and mafNet commands, adapted here:
     ssh kolossus
     cd /tmp
     cat << '_EOF_' > /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh
 #!/bin/csh -efx
 foreach oDb (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \
              anoGam1 apiMel2 triCas1)
   set oSeq = /iscratch/i/$oDb/$oDb.2bit
   if (! -e $oSeq) then
     set oSeq = /iscratch/i/$oDb/nib
   endif
   cd /cluster/data/dm2/bed/blastz.$oDb/axtChain
   netSplit dm2.$oDb.net.gz net
   chainSplit chain dm2.$oDb.all.chain.gz
   cd ..
   mv axtNet axtNet.bak
   mkdir axtNet
   foreach f (axtChain/net/*.net)
     netToAxt $f axtChain/chain/$f:t:r.chain \
       /iscratch/i/dm2/nib $oSeq stdout \
     | axtSort stdin stdout \
     | gzip -c > axtNet/$f:t:r.dm2.$oDb.net.axt.gz
   end
   mv mafNet mafNet.bak
   mkdir mafNet
   foreach f (axtNet/*.dm2.$oDb.net.axt.gz)
     axtToMaf -tPrefix=dm2. -qPrefix=$oDb. $f \
          /cluster/data/dm2/chrom.sizes /cluster/data/$oDb/chrom.sizes \
          stdout \
     | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
   end
 end
 '_EOF_'
     # << for emacs
     chmod 775 /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh
     /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh >& re.log
     echo $status
 #0
     # Made sure everything ran smoothly.  Spot-checked some .bak's vs. 
     # new to make sure they're of comparable size -- newer can be slightly 
     # smaller due to lack of dup's, or slightly larger due to more headers.  
     # Clean up:
     foreach oDb (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \
                  anoGam1 apiMel2 triCas1)
       rm -r /cluster/data/dm2/bed/blastz.$oDb/axtChain/{chain,net}
       rm -r /cluster/data/dm2/bed/blastz.$oDb/{axt,maf}Net.bak
     end
 
 
 # MULTIZ.V10 12WAY (7 FLIES, MOSQUITO, HONEYBEE) (DONE 8/20/05 angie)
     # Since 9way, adding D. erecta, D. grimshawi, T. castaneum;
     # dro{Ana,Moj,Vir}1 --> dro{Ana,Moj,Vir}2.
     # Tree (12-way):
     # ((((((((dm2 droSim1) (droYak1 droEre1)) droAna2) dp3) ((droVir2 droMoj2) droGri1)) anoGam1) apiMel2) triCas1)
     ssh kkstore01
     mkdir /cluster/data/dm2/bed/multiz12way.2005-08-20
     ln -s /cluster/data/dm2/bed/multiz12way.2005-08-20 \
       /cluster/data/dm2/bed/multiz12way
     cd /cluster/data/dm2/bed/multiz12way
     # Setup: Copy pairwise MAF to /san/sanvol1/scratch:
     mkdir /san/sanvol1/scratch/flyMultiz12way
     foreach db (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \
                 anoGam1 apiMel2 triCas1)
       echo $db
       cp -pR /cluster/data/dm2/bed/blastz.$db/mafNet \
         /san/sanvol1/scratch/flyMultiz12way/$db
     end
     ls -lLR /san/sanvol1/scratch/flyMultiz12way
     # Make output dir:
     mkdir maf
     # Create scripts to run multiz.v10 in the right order:
     # (Thanks to Andy for the idea to use a subroutine for individual 
     # multiz invocations)
     cat << '_EOF_' > /cluster/bin/scripts/runMultizV10.csh
 #!/bin/csh -fex
 # Run multiz on two inputs.
 set closerMaf = $1
 set fartherMaf = $2
 set v = $3
 set ref = $4
 set tmpDir = $5
 set outMaf = $6
 
 if ($outMaf == "") then
   echo "usage: $0 closerMaf fartherMaf v ref tmpDir outMaf"
   exit 1
 endif
 
 set MZ10 = /cluster/bin/penn/multiz.v10
 set PROJECT = /cluster/bin/penn/maf_project
 
 if (-e $closerMaf && $closerMaf:e == "gz") then
   gunzip -c $closerMaf > $tmpDir/closer.maf
   set closerMaf = $tmpDir/closer.maf
 endif
 if (-e $fartherMaf && $fartherMaf:e == "gz") then
   gunzip -c $fartherMaf > $tmpDir/farther.maf
   set fartherMaf = $tmpDir/farther.maf
 endif
 
 set closerSubbed = `echo $closerMaf | sed -e 's@/@_@g;'`
 set fartherSubbed = `echo $fartherMaf | sed -e 's@/@_@g;'`
 
 if ( -s $closerMaf && -s $fartherMaf ) then
     $MZ10 $fartherMaf $closerMaf $v > $tmpDir/tmp.maf
     $PROJECT $tmpDir/tmp.maf $ref > $outMaf
 else if ( -s $closerMaf ) then
     touch missing.$fartherSubbed
     cp $closerMaf $outMaf
 else if ( -s $fartherMaf ) then
     touch missing.$closerSubbed
     cp $fartherMaf $outMaf
 else
     touch missing.$closerSubbed
     touch missing.$fartherSubbed
 endif 
 '_EOF_'
     # << for emacs
     cat << '_EOF_' > doMultizAll.csh
 #!/bin/csh -fex
 set chr = $1
 set tmpDir = /scratch/flyMultiz12way.$chr
 mkdir $tmpDir
 
 set mafScratch = /san/sanvol1/scratch/flyMultiz12way
 
 # Really should write a perl script to take a tree like this and generate 
 # commands like the ones below:
 # ((((((((dm2 droSim1) (droYak1 droEre1)) droAna2) dp3) ((droVir2 droMoj2) droGri1)) anoGam1) apiMel2) triCas1)
 
 /cluster/bin/scripts/runMultizV10.csh \
   $mafScratch/droYak1/$chr.maf.gz \
   $mafScratch/droEre1/$chr.maf.gz \
   0 dm2.$chr $tmpDir $tmpDir/$chr.YakEre.maf
 /cluster/bin/scripts/runMultizV10.csh \
   $mafScratch/droSim1/$chr.maf.gz \
   $tmpDir/$chr.YakEre.maf \
   1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEre.maf
 /cluster/bin/scripts/runMultizV10.csh \
   $tmpDir/$chr.SimYakEre.maf \
   $mafScratch/droAna2/$chr.maf.gz \
   1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAna.maf
 /cluster/bin/scripts/runMultizV10.csh \
   $tmpDir/$chr.SimYakEreAna.maf \
   $mafScratch/dp3/$chr.maf.gz \
   1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPse.maf
 /cluster/bin/scripts/runMultizV10.csh \
   $mafScratch/droVir2/$chr.maf.gz \
   $mafScratch/droMoj2/$chr.maf.gz \
   0 dm2.$chr $tmpDir $tmpDir/$chr.VirMoj.maf
 /cluster/bin/scripts/runMultizV10.csh \
   $tmpDir/$chr.SimYakEreAnaPse.maf \
   $tmpDir/$chr.VirMoj.maf \
   1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMoj.maf
 /cluster/bin/scripts/runMultizV10.csh \
   $tmpDir/$chr.SimYakEreAnaPseVirMoj.maf \
   $mafScratch/droGri1/$chr.maf.gz \
   1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGri.maf
 /cluster/bin/scripts/runMultizV10.csh \
   $tmpDir/$chr.SimYakEreAnaPseVirMojGri.maf \
   $mafScratch/anoGam1/$chr.maf.gz \
   1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGriAno.maf
 /cluster/bin/scripts/runMultizV10.csh \
   $tmpDir/$chr.SimYakEreAnaPseVirMojGriAno.maf \
   $mafScratch/apiMel2/$chr.maf.gz \
   1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGriAnoApi.maf
 /cluster/bin/scripts/runMultizV10.csh \
   $tmpDir/$chr.SimYakEreAnaPseVirMojGriAnoApi.maf \
   $mafScratch/triCas1/$chr.maf.gz \
   1 dm2.$chr $tmpDir maf/$chr.maf
 
 rm -f $tmpDir/*.maf
 rmdir $tmpDir
 '_EOF_'
     # << for emacs
     chmod 775 /cluster/bin/scripts/runMultizV10.csh
     chmod 775 doMultizAll.csh
     awk '{print "./doMultizAll.csh " $1;}' /cluster/data/dm2/chrom.sizes \
       > jobs.lst
     # Run on small cluster
     ssh kki
     cd /cluster/data/dm2/bed/multiz12way
     para make jobs.lst
     para time
 #Completed: 13 of 13 jobs
 #Average job time:                 124s       2.07m     0.03h    0.00d
 #Longest finished job:             538s       8.97m     0.15h    0.01d
 #Submission to last job:         15659s     260.98m     4.35h    0.18d
     ls -1 missing*
 #ls: No match.
 
     # make /gbdb/ links to 12way maf files:
     ssh hgwdev
     mkdir -p /gbdb/dm2/multiz12way/maf/multiz12way
     ln -s /cluster/data/dm2/bed/multiz12way/maf/chr*.maf \
       /gbdb/dm2/multiz12way/maf/multiz12way/
     # load into database
     cd /tmp
     hgLoadMaf -warn dm2 multiz12way \
       -pathPrefix=/gbdb/dm2/multiz12way/maf/multiz12way
     # load summary table to replace pairwise
     cat /cluster/data/dm2/bed/multiz12way/maf/chr*.maf \
     | nice hgLoadMafSummary dm2 multiz12waySummary stdin
 
     # Dropped unused indexes (2006-05-09 kate)
     # NOTE: this is not required in the future, as the loader
     # has been fixed to not generate these indexes
     hgsql dm2 -e "alter table multiz9waySummary drop index chrom_2"
     hgsql dm2 -e "alter table multiz9waySummary drop index chrom_3"
 
     # put 12way MAF out for download
     ssh kolossus
     cd /cluster/data/dm2/bed/multiz12way
     mkdir mafDownload
     foreach f (maf/*.maf)
       nice gzip -c $f > mafDownload/$f:t.gz
     end
     cd mafDownload
     md5sum *.maf.gz > md5sum.txt
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/dm2/multiz12way
     ln -s /cluster/data/dm2/bed/multiz12way/mafDownload/{*.maf.gz,md5sum.txt} \
       /usr/local/apache/htdocs/goldenPath/dm2/multiz12waya
     # make a README.txt
 
     # Cleanup
     rm -rf /san/sanvol1/scratch/flyMultiz12way/
 
 
 # PHASTCONS 12WAY WITH METHODS FROM PAPER (DONE 9/20/05 angie)
 # ((((((((dm2,droSim1),(droYak1,droEre1)),droAna2),dp3),((droVir2,droMoj2),droGri1)),anoGam1),apiMel2),triCas1)
     ssh kkstore01
     mkdir -p /san/sanvol1/scratch/dm2/chrom
     cp -p /cluster/data/dm2/?{,?}/chr*.fa /san/sanvol1/scratch/dm2/chrom/
     # Split chrom fa into smaller windows for phastCons:
     ssh pk
     mkdir /cluster/data/dm2/bed/multiz12way/phastCons
     mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.split
     cd /cluster/data/dm2/bed/multiz12way/phastCons/run.split
     set WINDOWS = /san/sanvol1/scratch/dm2/phastCons/WINDOWS
     rm -fr $WINDOWS
     mkdir -p $WINDOWS
     cat << 'EOF' > doSplit.sh
 #!/bin/csh -ef
 
 set PHAST=/cluster/bin/phast
 set FA_SRC=/san/sanvol1/scratch/dm2/chrom
 set WINDOWS=/san/sanvol1/scratch/dm2/phastCons/WINDOWS
 
 set maf=$1
 set c = $maf:t:r
 set tmpDir = /scratch/msa_split/$c
 rm -rf $tmpDir
 mkdir -p $tmpDir
 ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
    -O dm2,droSim1,droYak1,droEre1,droAna2,dp3,droVir2,droMoj2,droGri1,anoGam1,apiMel2,triCas1 \
    -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
 cd $tmpDir
 foreach file ($c.*.ss)
   gzip -c $file > ${WINDOWS}/$file.gz
 end
 rm -f $tmpDir/$c.*.ss
 rmdir $tmpDir
 'EOF'
 # << for emacs
     chmod a+x doSplit.sh
     rm -f jobList
     foreach file (/cluster/data/dm2/bed/multiz12way/maf/*.maf)
       if (-s $file) then
         echo "doSplit.sh {check in exists+ $file}" >> jobList
       endif
     end
     para make jobList
     para time
 #Completed: 13 of 13 jobs
 #Average job time:                  81s       1.35m     0.02h    0.00d
 #Longest finished job:             219s       3.65m     0.06h    0.00d
 #Submission to last job:           219s       3.65m     0.06h    0.00d
 
     ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
     # Use consEntropy --NH to make it suggest a --expected-lengths param 
     # that we should try next.  Adam ran this on hg17 to find out the 
     # total entropy of the hg17 model:
     # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
 #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
 #Relative entropy: H=0.608216 bits/site
 #Required length: N=16.085437 sites
 #Total entropy: NH=9.783421 bits
     # Our target is that NH result: 9.7834 bits.  
     # Use phyloFit to make an initial model:
     ssh kolossus
     cd /cluster/data/dm2/bed/multiz12way/phastCons
     /cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \
         --aggregate dm2,droSim1,droYak1,droEre1,droAna2,dp3,droVir2,droMoj2,droGri1,anoGam1,apiMel2,triCas1 \
         -i MAF -o SS > all.ss
     /cluster/bin/phast/phyloFit all.ss \
       --tree "((((((((dm2,droSim1),(droYak1,droEre1)),droAna2),dp3),((droVir2,droMoj2),droGri1)),anoGam1),apiMel2),triCas1)" \
       -i SS --out-root starting-tree
     cat starting-tree.mod 
 #ALPHABET: A C G T 
 #ORDER: 0
 #SUBST_MOD: REV
 #TRAINING_LNL: -496988087.624743
 #BACKGROUND: 0.288383 0.211930 0.211868 0.287819 
 #RATE_MAT:
 #  -0.922499    0.207559    0.437636    0.277304 
 #   0.282436   -1.104124    0.227654    0.594033 
 #   0.595687    0.227721   -1.105784    0.282377 
 #   0.277848    0.437405    0.207862   -0.923114 
 #TREE: ((((((((dm2:0.028572,droSim1:0.042955):0.010136,(droYak1:0.058491,droEre1:0.056179):0.048245):0.060386,droAna2:0.259454):0.024698,dp3:0.263678):0.012004,((droVir2:0.126666,droMoj2:0.147640):0.138076,droGri1:0.248877):0.126753):0.094211,anoGam1:0.382011):0.052522,apiMel2:0.456711):0.211776,triCas1:0.211776);
 # also get GC content from model -- if similar enough, no need to extract it 
 # separately above.
 awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod 
 #0.423798
     # OK, use .424 for --gc below.
 
     # Use the values of --target-coverage and --expected-lengths from the 
     # last iteration of the 9way run, 0.398 and 23.7.
     # Multiply each subst rate on the TREE line by 3.5 which is roughly the 
     # ratio of noncons to cons in 
     # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod
     /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \
       > starting-tree.noncons.mod
     /cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.7 \
       starting-tree{,.noncons}.mod
 #( Solving for new omega: 23.700000 24.343724 24.335125 )
 #Transition parameters: gamma=0.398000, omega=23.700000, mu=0.042194, nu=0.027896
 #Relative entropy: H=2.651656 bits/site
 #Expected min. length: L_min=3.660302 sites
 #Expected max. length: L_max=3.808284 sites
 #Phylogenetic information threshold: PIT=L_min*H=9.705862 bits
 #Recommended expected length: omega=24.335125 sites (for L_min*H=9.783400)
     # OK, use --expected-lengths 24.3.
 
     ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
     # We're here because the actual target coverage was not satisfactory,
     # so we're changing the --target-coverage param.  Given that we're 
     # changing that, take a guess at how we should change --expected-lengths
     # in order to also hit the total entropy target.
     cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
     # SECOND ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.393 24.1 ave.{cons,noncons}.mod
 #Recommended expected length: omega=23.801397 sites (for L_min*H=9.783400)
     # OK, --expected-lengths 23.8
     # THIRD ITERATION:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.45 32.2 ave.{cons,noncons}.mod
 #Recommended expected length: omega=27.095993 sites (for L_min*H=9.783400)
     # ==> 27.1
 
     # Now set up cluster job to estimate model parameters given free params 
     # --target-coverage and --expected-lengths and the data.  
     ssh pk
     mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
     cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
     # FIRST ITERATION: Use ../starting-tree.mod:
     cat << '_EOF_' > doEstimate.sh
 #!/bin/csh -ef
 zcat $1 \
 | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.424 --nrates 1,1 \
     --no-post-probs --ignore-missing \
     --expected-lengths 24.3 --target-coverage 0.398 \
     --quiet --log $2 --estimate-trees $3
 '_EOF_'
 # << for emacs
     # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
     cat << '_EOF_' > doEstimate.sh
 #!/bin/csh -ef
 zcat $1 \
 | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.424 --nrates 1,1 \
     --no-post-probs --ignore-missing \
     --expected-lengths 23.8 --target-coverage 0.393 \
     --quiet --log $2 --estimate-trees $3
 '_EOF_'
 # << for emacs
     chmod a+x doEstimate.sh
     rm -fr LOG TREES
     mkdir -p LOG TREES
     rm -f jobList
     foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
       set root = $f:t:r:r
       echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
     end
     para make jobList
     para time
 #Completed: 139 of 139 jobs
 #Average job time:                6022s     100.37m     1.67h    0.07d
 #Longest finished job:           14397s     239.95m     4.00h    0.17d
 #Submission to last job:         14419s     240.32m     4.01h    0.17d
 
     # Now combine parameter estimates.  We can average the .mod files
     # using phyloBoot.  This must be done separately for the conserved
     # and nonconserved models
     ssh kolossus
     cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
     ls -1 TREES/*.cons.mod > cons.txt
     /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
       --output-average ave.cons.mod > cons_summary.txt
     ls -1 TREES/*.noncons.mod > noncons.txt
     /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
       --output-average ave.noncons.mod > noncons_summary.txt
     grep TREE ave*.mod
     # FIRST ITERATION:
 #ave.cons.mod:TREE: ((((((((dm2:0.015283,droSim1:0.028475):0.004064,(droYak1:0.031904,droEre1:0.033703):0.024654):0.021822,droAna2:0.128599):0.008507,dp3:0.135654):0.003923,((droVir2:0.064832,droMoj2:0.074814):0.064856,droGri1:0.130122):0.056057):0.035045,anoGam1:0.225562):0.032118,apiMel2:0.207067):0.100280,triCas1:0.100280);
 #ave.noncons.mod:TREE: ((((((((dm2:0.049188,droSim1:0.086476):0.013855,(droYak1:0.101928,droEre1:0.105461):0.080386):0.076467,droAna2:0.435801):0.030536,dp3:0.463395):0.014225,((droVir2:0.218767,droMoj2:0.254568):0.225537,droGri1:0.443544):0.200405):0.126301,anoGam1:0.786660):0.117326,apiMel2:0.720501):0.348402,triCas1:0.348402);
     # SECOND ITERATION:
 #ave.cons.mod:TREE: ((((((((dm2:0.015261,droSim1:0.028419):0.004049,(droYak1:0.031866,droEre1:0.033655):0.024630):0.021816,droAna2:0.128628):0.008503,dp3:0.135728):0.003923,((droVir2:0.064840,droMoj2:0.074832):0.064900,droGri1:0.130188):0.056115):0.035117,anoGam1:0.225895):0.032170,apiMel2:0.207375):0.100397,triCas1:0.100397);
 #ave.noncons.mod:TREE: ((((((((dm2:0.049200,droSim1:0.086486):0.013823,(droYak1:0.101982,droEre1:0.105514):0.080420):0.076507,droAna2:0.436376):0.030537,dp3:0.464200):0.014233,((droVir2:0.219076,droMoj2:0.254936):0.225862,droGri1:0.444226):0.200721):0.126658,anoGam1:0.788468):0.117561,apiMel2:0.722456):0.349173,triCas1:0.349173);
 # THIRD ITERATION:
 
     cat cons_summary.txt 
     # look over the files cons_summary.txt and noncons_summary.txt.
     # The means and medians should be roughly equal and the stdevs
     # should be reasonably small compared to the means, particularly
     # for rate matrix parameters (at bottom) and for branches to the
     # leaves of the tree.  The stdevs may be fairly high for branches
     # near the root of the tree; that's okay.  Some min values may be
     # 0 for some parameters.  That's okay, but watch out for very large
     # values in the max column, which might skew the mean.  If you see
     # any signs of bad outliers, you may have to track down the
     # responsible .mod files and throw them out.  I've never had to do
     # this; the estimates generally seem pretty well behaved.
 
     # NOTE: Actually, a random sample of several hundred to a thousand
     # alignment fragments (say, a number equal to the number of
     # available cluster nodes) should be more than adequate for
     # parameter estimation.  If pressed for time, use this strategy.
 
     # FIRST ITERATION ONLY:
     # Check the total entropy figure to see if we're way off.
     # This takes an hour for 12way (exponential in #species) and has never 
     # produced a different answer from the input after the first iteration,
     # so do this for the first iteration only:
     /cluster/bin/phast/consEntropy --NH 9.7834 0.398 24.1 ave.{cons,noncons}.mod
 #Recommended expected length: omega=24.095425 sites (for L_min*H=9.783400)
     # ==> use 24.1 instead of 24.3 below.
 
     # Now we are ready to set up the cluster job for computing the
     # conservation scores and predicted elements.  The we measure the 
     # conserved elements coverage, and if that's not satisfactory then we 
     # adjust parameters and repeat.  
     ssh pk
     mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.phast
     cd /cluster/data/dm2/bed/multiz12way/phastCons/run.phast
     cat << 'EOF' > doPhastCons.sh
 #!/bin/csh -ef
 set pref = $1:t:r:r
 set chr = `echo $pref | awk -F\. '{print $1}'`
 set tmpfile = /scratch/phastCons.$$
 zcat $1 \
 | /cluster/bin/phast/phastCons - \
     ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
     --expected-lengths 23.8 --target-coverage 0.393 \
     --quiet --seqname $chr --idpref $pref \
     --viterbi /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \
     --require-informative 0 \
   > $tmpfile
 gzip -c $tmpfile > /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz
 rm $tmpfile
 'EOF'
 # << for emacs
     chmod a+x doPhastCons.sh
     rm -fr /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
     mkdir -p /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
     rm -f jobList
     foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
       echo doPhastCons.sh $f >> jobList
     end
     para make jobList
     para time
 #Completed: 139 of 139 jobs
 #Average job time:                  15s       0.25m     0.00h    0.00d
 #Longest finished job:              19s       0.32m     0.01h    0.00d
 #Submission to last job:            45s       0.75m     0.01h    0.00d
 
     # back on kolossus:
     # combine predictions and transform scores to be in 0-1000 interval
     cd /cluster/data/dm2/bed/multiz12way/phastCons
     awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
       /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/*.bed \
     | /cluster/bin/scripts/lodToBedScore > all.bed
 
     ssh hgwdev
     # Now measure coverage of CDS by conserved elements. 
     # We want the "cover" figure to be close to 68.9%.
     cd /cluster/data/dm2/bed/multiz12way/phastCons
     featureBits -enrichment dm2 flyBaseGene:cds all.bed
     # FIRST ITERATION: just a bit too high, reduce target-coverage a bit:
 #flyBaseGene:cds 16.567%, all.bed 35.522%, both 11.442%, cover 69.07%, enrich 1.94x
     # SECOND ITERATION: OK, this is the first time I've seen a reduction in 
     # target-coverage result in an increase in CDS coverage.  Dang.  Well, 
     # I'll take it as a sign that we're close enough (esp. since each iteration 
     # take a half-day now).
 #flyBaseGene:cds 16.567%, all.bed 35.514%, both 11.446%, cover 69.09%, enrich 1.95x
 
     # Having met the CDS coverage target, load up the results.
     hgLoadBed dm2 phastConsElements12way all.bed
 
     # Create wiggle
     ssh pk
     mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.wib
     cd /cluster/data/dm2/bed/multiz12way/phastCons/run.wib
     rm -rf /san/sanvol1/scratch/dm2/phastCons/wib
     mkdir -p /san/sanvol1/scratch/dm2/phastCons/wib
     cat << 'EOF' > doWigEncode
 #!/bin/csh -ef
 set chr = $1
 cd /san/sanvol1/scratch/dm2/phastCons/wib
 zcat `ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
       | sort -t\. -k2,2n` \
 | wigEncode stdin ${chr}_phastCons.wi{g,b}
 'EOF'
 # << for emacs
     chmod a+x doWigEncode
     rm -f jobList
     foreach chr (`ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS \
                   | awk -F\. '{print $1}' | sort -u`)
       echo doWigEncode $chr >> jobList
     end
     para make jobList
     para time
 #Completed: 13 of 13 jobs
 #Average job time:                  10s       0.16m     0.00h    0.00d
 #Longest finished job:              24s       0.40m     0.01h    0.00d
 #Submission to last job:            99s       1.65m     0.03h    0.00d
 
     # back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want 
     # the raw scores) from san/sanvol1
     cd /cluster/data/dm2/bed/multiz12way/phastCons
     rm -rf wib POSTPROBS
     rsync -av /san/sanvol1/scratch/dm2/phastCons/wib .
     rsync -av /san/sanvol1/scratch/dm2/phastCons/POSTPROBS .
 
     # load wiggle component of Conservation track
     ssh hgwdev
     mkdir /gbdb/dm2/multiz12way/wib
     cd /cluster/data/dm2/bed/multiz12way/phastCons
     chmod 775 . wib
     chmod 664 wib/*.wib
     ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz12way/wib/
     hgLoadWiggle dm2 phastCons12way \
       -pathPrefix=/gbdb/dm2/multiz12way/wib wib/*.wig
     rm wiggle.tab
 
     # and clean up san/sanvol1.
     rm -r /san/sanvol1/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib}
     rm -r /san/sanvol1/scratch/dm2/chrom
     # Offer raw scores for download since fly folks are likely to be interested:
     ssh kkstore01
     cd /cluster/data/dm2/bed/multiz12way/phastCons/POSTPROBS
     mkdir ../postprobsDownload
     foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
       zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
         > ../postprobsDownload/$chr.pp.gz
     end
     cd ../postprobsDownload
     md5sum *.gz > md5sum.txt
     # Make a README.txt there too.
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons12way
     cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons12way
     ln -s /cluster/data/dm2/bed/multiz12way/phastCons/postprobsDownload/* .
 
 
 # MAKE 11.OOC FILE FOR BLAT (DONE 9/19/05 angie)
     # Use -repMatch=100 (based on size -- for human we use 1024, and 
     # fly size is ~4.4% of human judging by gapless dm1 genome size from 
     # featureBits -- we would use 45, but bump that up a bit to be more 
     # conservative).
     ssh kolossus
     blat /cluster/data/dm2/dm2.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=/cluster/bluearc/dm2/11.ooc -repMatch=100
 #Wrote 3031 overused 11-mers to /cluster/bluearc/dm2/11.ooc
 
 
 # BAC END PAIRS (DONE 9/20/05 angie)
     # Excerpt from an email from Roger Hoskins, 9/17/05:
     # > There are two end-sequenced BAC libraries. The RPCI-98 BAC library, 
     # > produced by Pieter and Kazu, is called "BDGP" on the Genoscope 
     # > website; individual clones are named BACR01A01 - BACR48H12 (96-well 
     # > format; R stands for EcoRI). The DrosBAC library, produced in France 
     # > for the European Drosophila Genome Project, is called "EDGP" on the 
     # > Genoscope website; individual clones are named BACN01A01 - BACN47H12 
     # > and BACH48A01-BACH61H12 (N stands for NdeII; H stands for HinDIII).
     # >
     # > It's not necessary to query GenBank for these data. The BAC end 
     # > sequences are available from the Genoscope web site, Resources page:
     # > http://www.genoscope.cns.fr/externe/English/Outils/
     # > For RPCI-98:
     # > http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL
     # > For DrosBAC (EDGP):
     # > http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM
     # >
     # > Note that if you decide to do this, it would be important to have a 
     # > fairly high stringency imposed on the sequence alignments for mapping. 
     # > BACs only go to one location in the genome; paired ends must point 
     # > toward each other and be separated by an interval of say 50kb to 250 
     ssh kolossus
     # fileserver is unhappy... so work on bluearc for now, copy over later:
     mkdir -p /cluster/bluearc/dm2/bed/bacends
     cd /cluster/bluearc/dm2/bed/bacends
     wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL
     wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM
     faSize *
 #40016446 bases (3778317 N's 36238129 real 33260718 upper 2977391 lower) in 41500 sequences in 2 files
 #Total size: mean 964.3 sd 202.7 min 1 (AL0AA030DD12A1) max 1373 (AM0AA015AE07BP1) median 1009
 #N count: mean 91.0 sd 80.8
 #U count: mean 801.5 sd 207.4
 #L count: mean 71.7 sd 45.5
     cat banque_Projet_A* > bacends.fa
     # Edit the <!DOCTYPE.../HTML> part out of bacends.fa.
     # Fly is much smaller than human so we can get away with one job 
     # per chrom, no splitting...
     ssh kki
     cd /cluster/bluearc/dm2/bed/bacends
     mkdir out
     echo bacends.fa > bacends.lst
     ls -1S /iscratch/i/dm2/nib/* > dm2.lst
 cat > gsub << 'EOF'
 #LOOP
 /cluster/bin/x86_64/blat -noHead $(path1) $(path2) -ooc=/cluster/bluearc/dm2/11.ooc {check out exists out/$(root1).$(root2).psl}
 #ENDLOOP
 'EOF'
     # << for emacs
     gensub2 dm2.lst bacends.lst gsub jobList
     para make jobList
     para time
 #Completed: 13 of 13 jobs
 #Average job time:                  79s       1.32m     0.02h    0.00d
 #Longest finished job:             360s       6.00m     0.10h    0.00d
 #Submission to last job:           360s       6.00m     0.10h    0.00d
 
     # Back on kolossus:
     cat out/*.psl \
     | pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
         stdin bacEnds.unpaired.psl /dev/null
 #..Processed 318892 alignments
 
     # Translate the particular style of FASTA headers into bacEndPairs.txt:
     perl -we 'while (<>) { \
                 if (/^>(\w+)\s+BAC=(\w+)\s+END=(\w+)/) { \
                   if (! defined $bacs{$2}) { \
                     $bacs{$2}->[0] = "";  $bacs{$2}->[1] = ""; \
                   } \
                   if ($3 eq "T7") { \
                     $bacs{$2}->[0] .= "$1,"; \
                   } else { \
                     $bacs{$2}->[1] .= "$1,"; \
                   } \
                 } elsif (/^>/) { \
                   die "parse:\n$_\n"; \
                 } \
               } \
               foreach $bac (keys %bacs) { \
                 $t7 = $bacs{$bac}->[0];  $t7 = "," if ($t7 eq ""); \
                 $other = $bacs{$bac}->[1];   $other = "," if ($other eq ""); \
                 print "$t7\t$other\t$bac\n"; \
               }' bacends.fa \
     > bacEndPairs.txt
 
     # Roger's suggested 50kb-250kb range lost some (especially on the short 
     # side), so I ended up going with Terry's range for human, 35kb-350kb:
     pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
              -slopval=10000 -hardMax=500000 \
              -slop -short -long -orphan -mismatch -verbose \
       bacEnds.unpaired.psl bacEndPairs.txt all_bacends bacEnds
     wc -l bacEnds.*
 #      21 bacEnds.long
 #     180 bacEnds.mismatch
 #    7037 bacEnds.orphan
 #   11417 bacEnds.pairs
 #      37 bacEnds.short
 #     126 bacEnds.slop
 #  118041 bacEnds.unpaired.psl
       # (With Roger's sugg. 50kb-250kb, there were 575 short, 53 long)
 
     # Filter by score and sort by {chrom,chromStart}:
     awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
     cat bacEnds.{slop,short,long,mismatch,orphan} \
     | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
     wc -l *.bed
 #  11410 bacEndPairs.bed
 #   4842 bacEndPairsBad.bed
 
     # Filter the alignments into those we'll need in the all_bacends table:
     extractPslLoad -noBin bacEnds.unpaired.psl \
       bacEndPairs.bed bacEndPairsBad.bed \
     | sorttbl tname tstart | headchg -del \
     > bacEnds.load.psl
 
     cd ..
     mv /cluster/bluearc/dm2/bed/bacends /cluster/data/dm2/bed/bacends
 
     # load into database
     ssh hgwdev
     cd /cluster/data/dm2/bed/bacends
     hgLoadBed dm2 bacEndPairs bacEndPairs.bed \
       -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql 
     # note - this track isn't pushed to RR, just used for assembly QA
     hgLoadBed dm2 bacEndPairsBad bacEndPairsBad.bed \
       -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
     hgLoadPsl dm2 -table=all_bacends bacEnds.load.psl
     # load BAC end sequences
     mkdir -p /gbdb/dm2/bacends
     ln -s /cluster/data/dm2/bed/bacends/bacends.fa /gbdb/dm2/bacends/
     hgLoadSeq dm2 /gbdb/dm2/bacends/bacends.fa
 
 
 ##########################################################################
 # NSCAN track - (2005-09-29 markd)  loaded proteins 2005-10-13
     cd /cluster/data/dm2/bed/nscan/
     # obtained NSCAN-EST predictions from michael brent's group at WUSTL
     wget http://genome.cse.wustl.edu/predictions/Drosophila/Dm2_09_19_05/Dm2_09_19_05.tar.gz
     tar -zxf Dm2_09_19_05.tar.gz
 
     # change protein fasta file to have transcript id in header
     foreach f (chr_ptx/*.ptx)
         awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
     end
 
     ldHgGene -gtf -genePredExt dm2 nscanGene  chr_gtf/chr*.gtf
     hgPepPred mm6 generic nscanPep chr_ptx/chr*.fix
     rm -rf chr_* *.tab
 
     # update trackDb; need a dm2-specific page to describe informants
     mouse/dm2/nscanGene.html    
     mouse/dm2/trackDb.ra
 
 
 # BLASTZ/CHAIN/NET DROYAK2 (DONE 11/23/05 angie)
     mkdir /cluster/data/dm2/bed/blastz.droYak2.2005-11-23
     cd /cluster/data/dm2/bed/blastz.droYak2.2005-11-23
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. yakuba
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. yakuba
 SEQ2_DIR=/iscratch/i/droYak2/droYak2.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droYak2/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droYak2.2005-11-23
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /cluster/bluearc/dm2droYak2 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroYak2Link
 #flyBaseGene 22.967%, chainDroYak2Link 91.943%, both 22.758%, cover 99.09%, enrich 1.08x
     rm -f /cluster/data/dm2/bed/blastz.droYak2
     ln -s blastz.droYak2.2005-11-23 /cluster/data/dm2/bed/blastz.droYak2
 
 
 # BLASTZ/CHAIN/NET DROPER1 (DONE 11/29/05 angie)
     mkdir /cluster/data/dm2/bed/blastz.droPer1.2005-11-28
     cd /cluster/data/dm2/bed/blastz.droPer1.2005-11-28
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. persimilis
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. persimilis
 SEQ2_DIR=/iscratch/i/droPer1/droPer1.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droPer1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droPer1.2005-11-28
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droPer1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroPer1Link
 #flyBaseGene 22.967%, chainDroPer1Link 74.744%, both 21.320%, cover 92.83%, enrich 1.24x
     rm -f /cluster/data/dm2/bed/blastz.droPer1
     ln -s blastz.droPer1.2005-11-28 /cluster/data/dm2/bed/blastz.droPer1
 
 
 # BLASTZ/CHAIN/NET DROSEC1 (DONE 12/5/05 angie)
     mkdir /cluster/data/dm2/bed/blastz.droSec1.2005-11-30
     cd /cluster/data/dm2/bed/blastz.droSec1.2005-11-30
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. sechellia
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. sechellia
 SEQ2_DIR=/iscratch/i/droSec1/droSec1.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droSec1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droSec1.2005-11-30
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droSec1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroSec1Link
 #flyBaseGene 22.967%, chainDroSec1Link 92.860%, both 22.759%, cover 99.09%, enrich 1.07x
     rm -f /cluster/data/dm2/bed/blastz.droSec1
     ln -s blastz.droSec1.2005-11-30 /cluster/data/dm2/bed/blastz.droSec1
 
 
 # FLYBASE 4.2 ANNOTATIONS (DONE (except for contacting flybase-help) 1/13/06 angie)
 # RELOADED flyBaseNoncoding and flyBase2004Xref 5/30/06 after changing 
 # extractGenes to catch some new RNA keywords after user reported loss of 
 # many mir's.  (They were marked "ncRNA" in 4.1 but "miRNA" in 4.2.)  
 # Now extractGenes catches [a-z]+RNA.
 # RELOADED flyBaseGene 7/12/06 after user reported a problem with split
 # start codons -- cause by error in extractGenes.pl that caused us not to
 # notice a single-base overlap between CDS range and exon range.
     ssh kkstore01
     mkdir /cluster/data/dm2/bed/flybase4.2
     cd /cluster/data/dm2/bed/flybase4.2
     foreach c (2L 2R 3L 3R 4 X)
       wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.2_20050909/gff/dmel-$c-r4.2.1.gff.gz
     end
     zcat *.gff.gz > flybase.gff3
     # What data sources are represented in this file?
     grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
     # excerpt (many other sources, including blastx:... , sim4:... and 
     # tblastx:...; also various other types for source "."):
   19178 .       CDS
   63583 .       exon
   14380 .       gene
   19178 .       mRNA
      66 .       miRNA
     113 .       ncRNA
      49 .       pseudogene
      96 .       rRNA
      46 .       snRNA
      63 .       snoRNA
     294 .       tRNA
   36921 .       transcription_start_site
    6006 .       transposable_element
   33268 .       transposable_element_insertion_site
     # What keywords are defined in the 9th field?
     grep -v '^#' flybase.gff3 \
     | awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \
     | sort | uniq -c
  117483 Dbxref
 3111982 ID
 1428395 Name
  847565 Parent
   20326 cyto_range
   46928 dbxref_2nd
   15705 gbunit
       6 species
   92077 synonym
   10032 synonym_2nd
     # Wow... for the first time, it looks like the same flavor of GFF3
     # has been used as last time!
     cp ../flybase4.1/extractGenes.pl .
     extractGenes.pl flybase.gff3
 
     # Get predicted proteins (for main annotations only)
     wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.2_20050909/fasta/dmel-all-translation-r4.2.1.fasta
     perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' dmel-all-translation-r4.2.1.fasta \
       > flybasePep.fa
 
     ssh hgwdev
     cd /cluster/data/dm2/bed/flybase4.2
     # Protein-coding genes:
     ldHgGene -gtf dm2 flyBaseGene flybase.gtf
     hgPepPred dm2 generic flyBasePep flybasePep.fa
     # Noncoding genes:
     hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
     # Cross-referencing info for both coding and noncoding:
     hgLoadSqlTab dm2 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
       flyBase2004Xref.tab
     # Some featureBits comparisons with refGene which shoudl be pretty much 
     # like version 4.0 with a few noncoding genes tossed in... but this is 
     # a bigger difference than we expect:
     featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout
 #516631 bases of 131698467 (0.392%) in intersection
     # Hmmm, flyBaseGene has nothing on the last 175,000bp of chrX!:
     # chrX:22,039,591-22,224,390
     # nor the first 40,000bp of chrX:
     # chrX:1-102,664
     # nor the last 270,000bp of chr3L:
     # chr3L:23,273,167-23,771,897
     # nor the first 500,000bp of chr2R:
     # chr2R:1-600,000
     # Also quite a few on the chr*h but I won't look at all those.
 #TODO: contact flybase-help about the lack of annotations in those ranges...
 
     # This shows only the dropped isoforms/duplicates (ignores noncoding):
     featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \
        -bed=stdout
 #chr3R   15620310        15621493        chr3R.1
 #chr2L   4698094 4700940 chr2L.1
 #chrX    3638700 3639895 chrX.1
 #474282 bases of 131698467 (0.360%) in intersection
 
     # add upstream* downloadable files
     cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
     foreach size (1000 2000 5000)
       echo upstream$size
       nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
       | nice gzip -c > upstream$size.fa.gz
     end
     nice md5sum *.zip *.gz > md5sum.txt
 
     # 2/6/06: fill in missing symbol values with accession so hgNear
     # has something to draw a link on...
     hgsql dm2 -e 'update flyBase2004Xref set symbol = name where symbol = ""'
 
     # 7/12/06: reload flyBaseGene after correcting extractGenes.pl.
     ldHgGene -gtf dm2 flyBaseGene flybase.gtf
 
 
 # FLYBASE ACODE CROSS-REFERENCING DATA (DONE 1/27/06 angie)
     # 10/11/06 -- reloaded after updating flyBaseGene, but no change.
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/flyBase
     cd /cluster/data/dm2/bed/flyBase
     wget -O genes.txt.061011 \
       ftp://flybase.bio.indiana.edu/flybase/genes/genes.txt
     # Had to edit genes.txt.060126 to get around these two lines that didn't 
     # start with acode symbols:
     #line 1609033
 #Spiracle sheaths dark.
     #line 3664838 (3664839 of original)
 #Ommatidia are no
     hgFlyBase dm2 genes.txt.061011 -geneTable=flyBaseGene 
 #Processed 57068 records in 4067314 lines
 
 
 # MAP UNIPROT IDS TO CG*-R* IDS (DONE 1/27/06 angie - REDONE 10/11/06)
     # The UniProt.description table actually contains the most specific 
     # mapping of UniProt IDs to CG*-R* transcript IDs.  Use that when 
     # available, and otherwise use the more generic mapping to gene IDs 
     # that we extracted from the acode into fbUniProt.
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/uniProtToFlyBase
     cd /cluster/data/dm2/bed/uniProtToFlyBase
     set taxon = `hgsql -N uniProt -e 'select id from taxon where binomial = "Drosophila melanogaster";'`
     # Grab UniProt descriptions of all D. mel. proteins so we can parse out
     # CG*-R* transcript IDs (and CG* gene IDs when -R* not specified).
     hgsql -N uniProt -e \
      'select description.* from description,accToTaxon where accToTaxon.taxon = '$taxon' and accToTaxon.acc = description.acc' \
       > uniProt.description.txt
     # fbUniProt maps FBgn* IDs to multiple UniProt IDs; use flyBase2004Xref
     # to map FBgn* to CG* (actually CG*-R* which implies more precision than
     # there really is).
     hgsql -N dm2 -e \
      'select name,uniProtId from flyBase2004Xref,fbUniProt where fbgn=geneId' \
       | sort > flyBaseMapping.txt
     # Grab the complete list of CG*-R* transcript IDs:
     hgsql -N dm2 -e 'select name from flyBaseGene' | sort > transcriptIds.txt
     ssh kolossus
     cd /cluster/data/dm2/bed/uniProtToFlyBase
     # Wrote a perl script, gleanDescription.pl, to parse the various patterns
     # used to encode real mappings (not mappings to homologs or things that 
     # look like CG*-P? IDs but aren't really).
     chmod a+x gleanDescription.pl
     ./gleanDescription.pl uniProt.description.txt | sort > uniProtMapping.txt
     # Look for duplicates:
     awk '{print $1;}' uniProtMapping.txt | uniq -c | awk '$1 != 1 {print;}'
 #       2 CG17131-RA
 #       5 CG32637-RA
     grep CG17131-PA uniProt.description.txt 
 #Q8MS37  RE15579p (CG17131-PA).
 #Q7PLU3  CG17131-PA.3.
     # I looked up both Q8MS37 and Q7PLU3 on http://www.pir.uniprot.org/
     # and both do mention CG17131-PA... Q8MS37 has been updated more recently
     # so I'll just delete the Q7PLU3 mapping.
     grep CG32637-PA uniProt.description.txt
 #Q5K354  CG32637-PA protein (Fragment).
 #Q5K356  CG32637-PA protein (Fragment).
 #Q5K357  CG32637-PA protein (Fragment).
 #Q5K361  CG32637-PA protein (Fragment).
 #Q8IR71  CG32637-PA.
     # Here it's more clear-cut -- delete the fragments.
     egrep -v '(Q7PLU3|Q5K354|Q5K356|Q5K357|Q5K361)' uniProtMapping.txt \
       > tmp; mv tmp uniProtMapping.txt
     # Wrote a perl script, vagueDescription.pl, to parse out CG* without 
     # isoform specs -- can use those as backups when we don't have a more 
     # specific mapping for a transcript.
     chmod a+x vagueDescription.pl
     ./vagueDescription.pl uniProt.description.txt | sort \
       > uniProtMappingVague.txt
     # Write a perl script, combineMappings.pl, to add any additional mappings
     # available from acode to the UniProt mappings.
     chmod a+x combineMappings.pl
     ./combineMappings.pl transcriptIds.txt \
       uniProtMapping.txt uniProtMappingVague.txt flyBaseMapping.txt \
       > flyBaseToUniProt.txt
     # Make a 2-column version for loading:
     awk '{print $1 "\t" $2;}' flyBaseToUniProt.txt > flyBaseToUniProtLoad.txt
     # Load into a genericAlias-type table (this ignores the third column).
     ssh hgwdev
     cd /cluster/data/dm2/bed/uniProtToFlyBase
     sed -e 's/genericAlias/flyBaseToUniProt/' \
       ~/kent/src/hg/lib/genericAlias.sql > flyBaseToUniProt.sql
     hgLoadSqlTab dm2 flyBaseToUniProt flyBaseToUniProt.sql \
       flyBaseToUniProtLoad.txt
 
 
 ##########################################################################
 # HGNEAR TABLES
 
 # HGNEAR PROTEIN BLAST TABLES (DONE 1/31/06 angie - REDONE 4/6/07)
 # mmBlastTab redone 5/1/06 -- makeMm8.doc run used outdated proteins
 # hgBlastTab redone 5/9/06 -- makeHg18.doc run used outdated proteins
 # Redone 4/3/07 with flybase4.3; mitochondrial proteins yanked 4/6/07 
 # because they aren't in flyBase2004Xref and were causing joinerCheck errors.
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/hgNearBlastp
     cd /cluster/data/dm2/bed/hgNearBlastp
     mkdir 070403
     cd 070403
     # Get the proteins used by the other hgNear organisms:
     pepPredToFa hg18 knownGenePep hg18.known.faa
     pepPredToFa mm8 knownGenePep mm8.known.faa
     pepPredToFa rn4 knownGenePep rn4.known.faa
     pepPredToFa danRer3 ensPep danRer3.ensPep.faa
     pepPredToFa dm2 flyBasePep dm2.flyBasePep.faa
     pepPredToFa ce2 sangerPep ce2.sangerPep.faa
     pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa
 
     # Galt's synBlastp.csh filtering, which uses liftOver chains,
     # would be the next step if dm2 were reasonably closely related
     # (like mammal-mammal) to another Gene Sorter organism.  But it is 
     # pretty distant from all the others, so specify recipBest for all
     # query databases.
     cat << _EOF_ > config.ra
 # Latest fly vs. other Gene Sorter orgs:
 # human, mouse, rat, zebrafish, worm, yeast
 
 targetGenesetPrefix flyBase
 targetDb dm2
 queryDbs hg18 mm8 rn4 danRer3 ce2 sacCer1
 recipBest hg18 mm8 rn4 danRer3 ce2 sacCer1
 
 dm2Fa     /cluster/data/dm2/bed/hgNearBlastp/070403/dm2.flyBasePep.faa
 hg18Fa    /cluster/data/dm2/bed/hgNearBlastp/070403/hg18.known.faa
 mm8Fa     /cluster/data/dm2/bed/hgNearBlastp/070403/mm8.known.faa
 rn4Fa     /cluster/data/dm2/bed/hgNearBlastp/070403/rn4.known.faa
 danRer3Fa /cluster/data/dm2/bed/hgNearBlastp/070403/danRer3.ensPep.faa
 ce2Fa     /cluster/data/dm2/bed/hgNearBlastp/070403/ce2.sangerPep.faa
 sacCer1Fa /cluster/data/dm2/bed/hgNearBlastp/070403/sacCer1.sgdPep.faa
 
 buildDir /cluster/data/dm2/bed/hgNearBlastp/070403
 scratchDir /san/sanvol1/scratch/dm2HgNearBlastp
 _EOF_
     # Run with -noLoad so we can eyeball files, manually load dm2 tables now,
     # and later overload other databases' dmBlastTab tables.
     ~/kent/src/hg/utils/automation/doHgNearBlastp.pl -noLoad -workhorse kkr4u00 config.ra >& do.log &
     tail -f do.log
     # Ran these manually:
 # *** -noLoad was specified -- you can run this script manually to load dm2 tables:
         run.dm2.dm2/loadPairwise.csh
 #Loading database with 1171077 rows
 
 # *** -noLoad was specified -- you can run these scripts manually to load dm2 tables:
         run.dm2.hg18/loadPairwise.csh
 #Loading database with 5833 rows
         run.dm2.mm8/loadPairwise.csh
 #Loading database with 5557 rows
         run.dm2.rn4/loadPairwise.csh
 #Loading database with 3015 rows
         run.dm2.danRer3/loadPairwise.csh
 #Loading database with 5253 rows
         run.dm2.ce2/loadPairwise.csh
 #Loading database with 4537 rows
         run.dm2.sacCer1/loadPairwise.csh
 #Loading database with 2172 rows
 
     # 4/6/07: remove mitochondrial genes from *BlastTab because they 
     # are missing from flyBase2004Xref and caused joinerCheck errors.
     # Would have been better to have joinerCheck'd flyBasePep before 
     # running doHgNearBlastp.
     hgsql dm2 -e 'delete from flyBaseBlastTab where target like "mt:%"'
     foreach table (hgBlastTab mmBlastTab rnBlastTab \
                    drBlastTab ceBlastTab scBlastTab)
       echo $table
       hgsql dm2 -e 'delete from '$table' where query like "mt:%"'
     end
 
     # 4/6/07: dm2 FlyBase/hgNear/hgGene update is in the pushQ, so I think 
     # it's a fine time to update dmBlastTab in other databases with the 
     # latest, so hgwdev is all consistent.  When that push is done, will 
     # need another push request for *.dmBlastTab loaded here.
 # *** -noLoad was specified -- you can run these scripts manually to load dmBlastTab in query databases:
         run.hg18.dm2/loadPairwise.csh
         run.mm8.dm2/loadPairwise.csh
         run.rn4.dm2/loadPairwise.csh
         run.danRer3.dm2/loadPairwise.csh
         run.ce2.dm2/loadPairwise.csh
         run.sacCer1.dm2/loadPairwise.csh
     # Yank the "mt:%" targets from those to avoid joinerCheck errors.
     foreach db (hg18 mm8 rn4 danRer3 ce2 sacCer1)
       hgsql $db -e 'delete from dmBlastTab where target like "mt:%"'
     end
 
 
 # CLUSTER GENES AND MAP TO OTHER DATA (DONE 2/6/06 angie - REDONE 4/3/07)
 # flyBaseToRefSeq regenerated 5/9/06
     ssh hgwdev
     # First we have to add a proteinId column to flyBaseGene.
     hgClusterGenes dm2 flyBaseGene flyBaseIsoforms flyBaseCanonical \
       -protAccQuery='select name,alias from flyBaseToUniProt'
 #Got 13566 clusters, from 19452 genes in 13 chromosomes
     # Create table that maps between flyBase genes and RefSeq
     # Re-ran 5/9/06 to resolve some joinerCheck errors about retired refSeqs:
     hgMapToGene dm2 refGene flyBaseGene flyBaseToRefSeq
     # Create table that maps between known genes and Pfam domains
     hgMapViaSwissProt dm2 flyBaseToUniProt name alias Pfam flyBaseToPfam
 
     # Create a table that maps between flyBase genes and the 
     # Stanford Microarray Project expression data. (see makeHgFixed.doc)
     hgsql dm2 -e 'drop table if exists flyBaseToCG; create table flyBaseToCG \
       select name,SUBSTRING_INDEX(name,"-",1) as value from flyBaseGene'
     hgsql dm2 -e 'create index name on flyBaseToCG(name(12))'
     hgExpDistance dm2 hgFixed.arbFlyLifeMedianRatio dummyArg \
       arbExpDistance -lookup=flyBaseToCG
 #TODO: add the yale expr data too (feep)
 
     # Make sure that GO database is up to date.
     See README in /cluster/store1/geneOntology.
 
 
 # MAKE A DESCRIPTION TABLE FOR HGNEAR KNOWNDETAILS (DONE 2/6/06 - REDONE 4/3/07)
     # hgNear's knownDetails column type expects a single table that links 
     # transcript ID with uniProt description, so whip one up here:
     # (used by hgGene too though hgGene could join on the fly)
     ssh hgwdev
     hgsql dm2 -e 'drop table if exists flyBaseToDescription; \
       create table flyBaseToDescription \
       select flyBaseToUniProt.name,uniProt.description.val \
       from flyBaseToUniProt,uniProt.description \
       where flyBaseToUniProt.alias = uniProt.description.acc'
     hgsql dm2 -e 'create index name on flyBaseToDescription(name(12))'
 
 
 # FLYP2P (DONE 2/6/06 angie)
     # See hg/near/makeNear.doc...
 
 
 # MAKE ORGANISM-SPECIFIC HGNEARDATA AND HGGENEDATA FILES (DONE 2/7/06 angie)
     cd ~/kent/src/hg/near/hgNear/hgNearData
     # Made dm1 and dm2 subdirs of D_melanogaster with separate sets of 
     # .ra files... see cvs revision history.
     # Similarly for hgGene/hgGeneData.
 
 
 # ENABLE HGNEAR FOR DM2 IN HGCENTRALTEST (DONE 2/7/06 angie)
     hgsql -h genome-testdb hgcentraltest \
       -e  "update dbDb set hgNearOk = 1 where name = 'dm2';"
 
 # REDO DM2-HG18 PAIRWISE BLAST (DONE 2/24/06 angie)
     # Oops, hgBlastTab got overwritten with dm2 4.1 genes by the 
     # hg18 build process.  Tweak back to 4.2:
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/hgNearBlastp
     cd /cluster/data/dm2/bed/hgNearBlastp
     cat << _EOF_ > config.ra.hg18redo
 # Redo vs. hg18 with 4.2 genes:
 
 targetGenesetPrefix flyBase
 targetDb dm2
 queryDbs hg18
 
 dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
 hg18Fa /cluster/data/hg18/bed/blastp/known.faa
 
 buildDir /cluster/data/dm2/bed/hgNearBlastp
 scratchDir /san/sanvol1/scratch/dm2HgNearBlastp
 _EOF_
     doHgNearBlastp.pl -noSelf config.ra.hg18redo >& do.log & tail -f do.log
 
 # END OF HGNEAR STUFF
 #############################################################################
 
 
 # RECIPROCAL BEST DM2-DROSIM1 (DONE 2/28/06 angie - REDONE 6/19/06)
     # Special request for user (Toshio Yoshimatsu and JJ Emerson, UChicago)
     ssh kolossus
     cd /cluster/data/dm2/bed/blastz.droSim1/axtChain
     # Swap dm2-best chains to be droSim1-referenced:
     chainStitchId dm2.droSim1.over.chain.gz stdout \
     | chainSwap stdin stdout \
     | chainSort stdin droSim1.dm2.tBest.chain
     # Net those on droSim1 to get droSim1-ref'd reciprocal best net:
     chainPreNet droSim1.dm2.tBest.chain \
       /cluster/data/{droSim1,dm2}/chrom.sizes stdout \
     | chainNet -minSpace=1 stdin /cluster/data/{droSim1,dm2}/chrom.sizes \
         stdout /dev/null \
     | netSyntenic stdin droSim1.dm2.rbest.net
     # Extract droSim1-ref'd reciprocal best chain:
     netChainSubset droSim1.dm2.rbest.net droSim1.dm2.tBest.chain stdout \
     | chainStitchId stdin stdout \
     | chainSort stdin droSim1.dm2.rbest.chain
     # Swap to get dm2-ref'd reciprocal best chain:
     chainSwap droSim1.dm2.rbest.chain stdout \
     | chainSort stdin dm2.droSim1.rbest.chain
     # Net those on dm2 to get dm2-ref'd reciprocal best net:
     chainPreNet dm2.droSim1.rbest.chain \
       /cluster/data/{dm2,droSim1}/chrom.sizes stdout \
     | chainNet -minSpace=1 -minScore=0 \
       stdin /cluster/data/{dm2,droSim1}/chrom.sizes stdout /dev/null \
     | netSyntenic stdin dm2.droSim1.rbest.net
     # Compress and clean up.
     rm droSim1.dm2.tBest.chain
     gzip *.rbest.*
     md5sum *.rbest.*.gz > md5sum.txt
 
     # Test coverage of *.rbest.* -- should be equal.
     netToBed -maxGap=1 droSim1.dm2.rbest.net.gz droSim1.dm2.rbest.net.bed
     netToBed -maxGap=1 dm2.droSim1.rbest.net.gz dm2.droSim1.rbest.net.bed
     chainToPsl droSim1.dm2.rbest.chain.gz \
       /cluster/data/{droSim1,dm2}/chrom.sizes \
       /cluster/data/droSim1/droSim1.2bit /cluster/data/dm2/dm2.2bit \
       droSim1.dm2.rbest.chain.psl
     chainToPsl dm2.droSim1.rbest.chain.gz \
       /cluster/data/{dm2,droSim1}/chrom.sizes \
       /cluster/data/dm2/dm2.2bit /cluster/data/droSim1/droSim1.2bit \
       dm2.droSim1.rbest.chain.psl
     ssh hgwdev
     cd /cluster/data/dm2/bed/blastz.droSim1/axtChain
     featureBits droSim1 droSim1.dm2.rbest.net.bed
 #104047585 bases of 127256433 (81.762%) in intersection
     featureBits droSim1 droSim1.dm2.rbest.chain.psl
 #104047585 bases of 127256433 (81.762%) in intersection
     featureBits dm2     dm2.droSim1.rbest.chain.psl
 #104047585 bases of 131698467 (79.004%) in intersection
     featureBits dm2     dm2.droSim1.rbest.net.bed
 #104047585 bases of 131698467 (79.004%) in intersection
     mkdir experiments
     mv *.bed *.psl experiments
 
     # Put out for download (hgwdev-only)
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/dm2/vsDroSim1/reciprocalBest
     cd /usr/local/apache/htdocs/goldenPath/dm2/vsDroSim1/reciprocalBest
     rm -f *.gz md5sum.txt
     ln -s /cluster/data/dm2/bed/blastz.droSim1/axtChain/*.rbest.*.gz .
     ln -s /cluster/data/dm2/bed/blastz.droSim1/axtChain/md5sum.txt .
     # Make a README.txt.
 
 
 # RECIPROCAL BEST DM2-DROYAK2 (DONE 6/20/06 angie)
     # Special request for user (Toshio Yoshimatsu and JJ Emerson, UChicago)
     ssh kolossus
     cd /cluster/data/dm2/bed/blastz.droYak2/axtChain
     # Swap dm2-best chains to be droYak2-referenced:
     chainStitchId dm2.droYak2.over.chain.gz stdout \
     | chainSwap stdin stdout \
     | chainSort stdin droYak2.dm2.tBest.chain
     # Net those on droYak2 to get droYak2-ref'd reciprocal best net:
     chainPreNet droYak2.dm2.tBest.chain \
       /cluster/data/{droYak2,dm2}/chrom.sizes stdout \
     | chainNet -minSpace=1 stdin /cluster/data/{droYak2,dm2}/chrom.sizes \
         stdout /dev/null \
     | netSyntenic stdin droYak2.dm2.rbest.net
     # Extract droYak2-ref'd reciprocal best chain:
     netChainSubset droYak2.dm2.rbest.net droYak2.dm2.tBest.chain stdout \
     | chainStitchId stdin droYak2.dm2.rbest.chain
     # Swap to get dm2-ref'd reciprocal best chain:
     chainSwap droYak2.dm2.rbest.chain stdout \
     | chainSort stdin dm2.droYak2.rbest.chain
     # Net those on dm2 to get dm2-ref'd reciprocal best net:
     chainPreNet dm2.droYak2.rbest.chain \
       /cluster/data/{dm2,droYak2}/chrom.sizes stdout \
     | chainNet -minSpace=1 -minScore=0 \
       stdin /cluster/data/{dm2,droYak2}/chrom.sizes stdout /dev/null \
     | netSyntenic stdin dm2.droYak2.rbest.net
     # Compress and clean up.
     rm droYak2.dm2.tBest.chain
     gzip *.rbest.*
     md5sum *.rbest.*.gz > md5sum.rbest.txt
 
     # Test coverage of *.rbest.* -- should be equal.
     netToBed -maxGap=1 droYak2.dm2.rbest.net.gz droYak2.dm2.rbest.net.bed
     netToBed -maxGap=1 dm2.droYak2.rbest.net.gz dm2.droYak2.rbest.net.bed
     chainToPsl droYak2.dm2.rbest.chain.gz \
       /cluster/data/{droYak2,dm2}/chrom.sizes \
       /cluster/data/droYak2/droYak2.2bit /cluster/data/dm2/dm2.2bit \
       droYak2.dm2.rbest.chain.psl
     chainToPsl dm2.droYak2.rbest.chain.gz \
       /cluster/data/{dm2,droYak2}/chrom.sizes \
       /cluster/data/dm2/dm2.2bit /cluster/data/droYak2/droYak2.2bit \
       dm2.droYak2.rbest.chain.psl
     ssh hgwdev
     cd /cluster/data/dm2/bed/blastz.droYak2/axtChain
     featureBits droYak2 droYak2.dm2.rbest.net.bed
 #105927235 bases of 162681153 (65.113%) in intersection
     featureBits droYak2 droYak2.dm2.rbest.chain.psl
 #105927235 bases of 162681153 (65.113%) in intersection
     featureBits dm2     dm2.droYak2.rbest.chain.psl
 #105927235 bases of 131698467 (80.432%) in intersection
     featureBits dm2     dm2.droYak2.rbest.net.bed
 #105927235 bases of 131698467 (80.432%) in intersection
     mkdir experiments
     mv *.bed *.psl experiments
 
     # Put out for download (hgwdev-only)
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/dm2/vsDroYak2/reciprocalBest
     cd /usr/local/apache/htdocs/goldenPath/dm2/vsDroYak2/reciprocalBest
     rm -f *.gz md5sum.txt
     ln -s /cluster/data/dm2/bed/blastz.droYak2/axtChain/*.rbest.*.gz .
     ln -s /cluster/data/dm2/bed/blastz.droYak2/axtChain/md5sum.rbest.txt md5sum.txt
     # Make a README.txt.
 
 
 ########################################################################
 ### microRNA targets tracks  (DONE - 2006-03-29 - 2006-04-26 - Hiram)
 ### from: http://pictar.bio.nyu.edu/ Rajewsky Lab
 ### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
 ### Yi-Lu Wang ylw205@nyu.edu
 ### dg@thp.Uni-Koeln.DE
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/picTar
     cd /cluster/data/dm2/bed/picTar
     wget --timestamping \
 	'http://pictar.bio.nyu.edu/ucsc/new_melanogaster_S1_bed' \
 	    -O newMelanogasterS1.bed
 
     wget --timestamping \
 	'http://pictar.bio.nyu.edu/ucsc/new_melanogaster_S3_bed' \
 	-O newMelanogasterS3.bed
 
     grep -v "^track" newMelanogasterS1.bed \
 	| hgLoadBed -strict dm2 picTarMiRNAS1 stdin
     #	Loaded 24103 elements of size 9
     grep -v "^track" newMelanogasterS3.bed \
 	| hgLoadBed -strict dm2 picTarMiRNAS3 stdin
     #	Loaded 11480 elements of size 9
 
     nice -n +19 featureBits dm2 picTarMiRNAS1
     #	59881 bases of 131698467 (0.045%) in intersection
     nice -n +19 featureBits dm2 picTarMiRNAS3
     #	26231 bases of 131698467 (0.020%) in intersection
 
 
 # FLYBASE IN SITU IMAGES / EXPRESSION (DONE 5/3/06 angie - REDONE 4/7/07)
     # FlyBase has downloadable in situ images for BACs:
     # ftp://flybase.net/flybase/images/bac_insitu_pic/*
     # and FBti's:
     # ftp://flybase.net/flybase/images/in-situ-images/insitus.zip
     # but what Jim is interested in is the insitus for expression data.
     # Don't see that in the ftp listing, but they do make it easy to link in.
     ssh hgwdev
     cd /cluster/data/dm2/bed/flyBase
     wget -O summary.txt.070406 \
     'http://www.fruitfly.org/cgi-bin/ex/bquery.pl?qpage=entry&qtype=summarytext'
     # remove header line.
     hgLoadSqlTab dm2 bdgpExprLink ~/kent/src/hg/lib/bdgpExprLink.sql \
       summary.txt.070406
 
 
 ###########################################################################
 # LIFTOVER CHAINS DM2 -> DM3 (DONE 8/1/06 angie)
     # Started 8/1 -- This creates the directory and scripts:
     ~/kent/src/utils/doSameSpeciesLiftOver.pl dm2 dm3 -debug
     # Run it for real and log it:
     cd /cluster/data/dm2/bed/blat.dm3.2006-08-01
     ~/kent/src/utils/doSameSpeciesLiftOver.pl dm2 dm3 \
        >& do.log & tail -f do.log
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROERE2 (DONE 9/27/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/blastz.droEre2.2006-09-27
     cd /cluster/data/dm2/bed/blastz.droEre2.2006-09-27
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. erecta
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. erecta
 SEQ2_DIR=/iscratch/i/droEre2/droEre2.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droEre2/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droEre2.2006-09-27
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droEre2 >& do.log &
     tail -f do.log
 
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroEre2Link
 #flyBaseGene 23.333%, chainDroEre2Link 90.665%, both 23.063%, cover 98.84%, enrich 1.09x
     rm -f /cluster/data/dm2/bed/blastz.droEre2
     ln -s blastz.droEre2.2006-09-27 /cluster/data/dm2/bed/blastz.droEre2
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROGRI2 (DONE 9/27/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/blastz.droGri2.2006-09-27
     cd /cluster/data/dm2/bed/blastz.droGri2.2006-09-27
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. grimshawi
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. grimshawi
 SEQ2_DIR=/iscratch/i/droGri2/droGri2.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droGri2/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droGri2.2006-09-27
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droGri2 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroGri2Link
 #flyBaseGene 23.333%, chainDroGri2Link 63.808%, both 20.793%, cover 89.12%, enrich 1.40x
     rm -f /cluster/data/dm2/bed/blastz.droGri2
     ln -s blastz.droGri2.2006-09-27 /cluster/data/dm2/bed/blastz.droGri2
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROWIL1 (DONE 9/27/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/blastz.droWil1.2006-09-27
     cd /cluster/data/dm2/bed/blastz.droWil1.2006-09-27
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. willistoni
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. willistoni
 SEQ2_DIR=/iscratch/i/droWil1/droWil1.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droWil1/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droWil1.2006-09-27
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /cluster/bluearc/dm2droWil1 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroWil1Link
 #flyBaseGene 23.333%, chainDroWil1Link 67.722%, both 21.106%, cover 90.45%, enrich 1.34x
     rm -f /cluster/data/dm2/bed/blastz.droWil1
     ln -s blastz.droWil1.2006-09-27 /cluster/data/dm2/bed/blastz.droWil1
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROANA3 (DONE 9/27/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/blastz.droAna3.2006-09-27
     cd /cluster/data/dm2/bed/blastz.droAna3.2006-09-27
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. ananassae
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. ananassae
 SEQ2_DIR=/iscratch/i/droAna3/droAna3.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droAna3/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droAna3.2006-09-27
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droAna3 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna3Link
 #flyBaseGene 23.333%, chainDroAna3Link 79.307%, both 22.211%, cover 95.19%, enrich 1.20x
     rm -f /cluster/data/dm2/bed/blastz.droAna3
     ln -s blastz.droAna3.2006-09-27 /cluster/data/dm2/bed/blastz.droAna3
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROMOJ3 (DONE 9/27/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/blastz.droMoj3.2006-09-27
     cd /cluster/data/dm2/bed/blastz.droMoj3.2006-09-27
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. mojavensis
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. mojavensis
 SEQ2_DIR=/iscratch/i/droMoj3/droMoj3.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droMoj3/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droMoj3.2006-09-27
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droMoj3 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj3Link
 #flyBaseGene 23.333%, chainDroMoj3Link 60.956%, both 20.676%, cover 88.61%, enrich 1.45x
     rm -f /cluster/data/dm2/bed/blastz.droMoj3
     ln -s blastz.droMoj3.2006-09-27 /cluster/data/dm2/bed/blastz.droMoj3
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROVIR3 (DONE 9/27/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/blastz.droVir3.2006-09-27
     cd /cluster/data/dm2/bed/blastz.droVir3.2006-09-27
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. virilis
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. virilis
 SEQ2_DIR=/iscratch/i/droVir3/droVir3.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droVir3/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.droVir3.2006-09-27
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dm2droVir3 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir3Link
 #flyBaseGene 23.333%, chainDroVir3Link 62.998%, both 20.854%, cover 89.38%, enrich 1.42x
     rm -f /cluster/data/dm2/bed/blastz.droVir3
     ln -s blastz.droVir3.2006-09-27 /cluster/data/dm2/bed/blastz.droVir3
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DP4 (DONE 9/27/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/blastz.dp4.2006-09-27
     cd /cluster/data/dm2/bed/blastz.dp4.2006-09-27
     cat << '_EOF_' > DEF
 # D. melanogaster vs. D. pseudoobscura
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - D. pseudoobscura
 SEQ2_DIR=/iscratch/i/dp4/dp4.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/dp4/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.dp4.2006-09-27
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /cluster/bluearc/dm2dp4 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDp4Link
 #flyBaseGene 23.333%, chainDp4Link 73.891%, both 21.518%, cover 92.22%, enrich 1.25x
     rm -f /cluster/data/dm2/bed/blastz.dp4
     ln -s blastz.dp4.2006-09-27 /cluster/data/dm2/bed/blastz.dp4
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET TRICAS2 (DONE 9/28/06)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/blastz.triCas2.2006-09-28
     cd /cluster/data/dm2/bed/blastz.triCas2.2006-09-28    
     cat <<_EOF_ > DEF
 # D. melanogaster vs. T. castaneum
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - D. melanogaster
 SEQ1_DIR=/iscratch/i/dm2/nib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dm2/chrom.sizes
 
 # QUERY - T. castaneum
 SEQ2_DIR=/iscratch/i/triCas2/triCas2.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=10000
 SEQ2_LEN=/iscratch/i/triCas2/chrom.sizes
 
 BASE=/cluster/data/dm2/bed/blastz.triCas2.2006-09-28
 _EOF_
 
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /cluster/bluearc/dm2triCas2 >& do.log &
     tail -f do.log
     featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainTriCas2Link
 #flyBaseGene 23.333%, chainTriCas2Link 16.438%, both 10.740%, cover 46.03%, enrich 2.80x
     rm -f /cluster/data/dm2/bed/blastz.triCas2
     ln -s blastz.triCas2.2006-09-28 /cluster/data/dm2/bed/blastz.triCas2
 
 
 ###########################################################################
 # MULTIZ15WAY (DONE 9/28/06 angie)
 # (((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28
     cd /cluster/data/dm2/bed/multiz15way.2006-09-28
 
     # copy MAF's to cluster-friendly server
     mkdir /san/sanvol1/scratch/dm2/mafNet
     foreach s (droSim1 droSec1 droYak2 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 anoGam1 apiMel2 triCas2)
       echo $s
       rsync -av /cluster/data/dm2/bed/blastz.$s/mafNet/* \
         /san/sanvol1/scratch/dm2/mafNet/$s/
     end
 
     echo '(((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)' \
       > tree-commas.nh
     sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
     sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
 
     ssh pk
     cd /cluster/data/dm2/bed/multiz15way.2006-09-28
     mkdir maf run
     cd run
 
     # stash binaries 
     mkdir penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
 
 cat > autoMultiz.csh << 'EOF'
 #!/bin/csh -ef
     set db = dm2
     set c = $1
     set maf = $2
     set run = `pwd`
     set tmp = /scratch/tmp/$db/multiz.$c
     set pairs = /san/sanvol1/scratch/$db/mafNet
     rm -fr $tmp
     mkdir -p $tmp
     cp ../{tree.nh,species.lst} $tmp
     pushd $tmp
     foreach s (`cat species.lst`)
         set in = $pairs/$s/$c.maf
         set out = $db.$s.sing.maf
         if ($s == $db) then
             continue
         endif
         if (-e $in.gz) then
             zcat $in.gz > $out
         else if (-e $in) then
             cp $in $out
         else
             echo "##maf version=1 scoring=autoMZ" > $out
         endif
     end
     set path = ($run/penn $path); rehash
     $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
     popd
     cp $tmp/$c.maf $maf
     rm -fr $tmp
 'EOF'
     # << emacs
     chmod +x autoMultiz.csh
 
 cat  << 'EOF' > spec
 #LOOP
 ./autoMultiz.csh $(root1) {check out line+ /cluster/data/dm2/bed/multiz15way.2006-09-28/maf/$(root1).maf}
 #ENDLOOP
 'EOF'
     # << emacs
 
     awk '{print $1}' /cluster/data/dm2/chrom.sizes > chrom.lst
     gensub2 chrom.lst single spec jobList
     para make jobList
     para time
 #Completed: 13 of 13 jobs
 #CPU time in finished jobs:      37955s     632.58m    10.54h    0.44d  0.001 y
 #IO & Wait Time:                   315s       5.25m     0.09h    0.00d  0.000 y
 #Average job time:                2944s      49.06m     0.82h    0.03d
 #Longest finished job:            9401s     156.68m     2.61h    0.11d
 #Submission to last job:          9429s     157.15m     2.62h    0.11d
 
 
 # ANNOTATE MULTIZ15WAY MAF AND LOAD TABLES (DONE 9/28/2006 angie)
     ssh kolossus
     mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/anno
     cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno
     mkdir maf run
     cd run
     rm -f sizes nBeds
     foreach db (`cat /cluster/data/dm2/bed/multiz15way.2006-09-28/species.lst`)
       ln -s  /cluster/data/$db/chrom.sizes $db.len
       if (! -e /cluster/data/$db/$db.N.bed) then
         twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
       endif
       ln -s  /cluster/data/$db/$db.N.bed $db.bed
       echo $db.bed  >> nBeds
       echo $db.len  >> sizes
     end
     echo date > jobs.csh
     # do smaller jobs first:
     foreach f (`ls -1rS ../../maf/*.maf`)
       echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
         /cluster/data/dm2/dm2.2bit ../maf/`basename $f` >> jobs.csh
     end 
     echo date >> jobs.csh
     csh -efx jobs.csh >&! jobs.log & tail -f jobs.log
     # 4min.
 
     # Load anno/maf
     ssh hgwdev
     cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf
     mkdir -p /gbdb/dm2/multiz15way/anno/maf
     ln -s /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf/*.maf \
       /gbdb/dm2/multiz15way/anno/maf
     date
     nice hgLoadMaf -pathPrefix=/gbdb/dm2/multiz15way/anno/maf dm2 multiz15way
     date
     # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
     # machine and then load on hgwdev:
     ssh kolossus
     cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf
     cat *.maf \
     | nice hgLoadMafSummary dm2 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
              -test multiz15waySummary stdin
 #Created 112880 summary blocks from 10610488 components and 1151094 mafs from stdin
     ssh hgwdev
     cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf
     sed -e 's/mafSummary/multiz15waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
       > /tmp/multiz15waySummary.sql
     time nice hgLoadSqlTab dm2 multiz15waySummary /tmp/multiz15waySummary.sql \
       multiz15waySummary.tab
 #0.000u 0.001s 0:01.59 0.0%      0+0k 0+0io 0pf+0w
     rm *.tab
     ln -s multiz15way.2006-09-28 /cluster/data/dm2/bed/multiz15way
 
 
 # MULTIZ15WAY DOWNLOADABLES (UNANNOTATED) (DONE 10/3/2006 angie)
     # Annotated MAF is now documented, so use anno/maf for downloads.
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads
     cd /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads
     # upstream mafs 
 cat > mafFrags.csh << 'EOF'
     date
     foreach i (1000 2000 5000)
         echo "making upstream$i.maf"
         nice featureBits dm2 refGene:upstream:$i -fa=/dev/null -bed=up.bad
         awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
         rm up.bad
         nice mafFrags dm2 multiz15way up.bed upstream$i.maf \
                 -orgs=../species.lst
         rm up.bed
     end
     date
 'EOF'
     # << emacs
     time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
 #86.335u 42.090s 6:47.44 31.5%   0+0k 0+0io 0pf+0w
 
     ssh kolossus
     cd /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads
 cat > downloads.csh << 'EOF'
     date
     foreach f (../anno/maf/chr*.maf)
         set c = $f:t:r
         echo $c
         nice gzip -c $f > $c.maf.gz
     end
     md5sum *.gz > md5sum.txt
     date
 'EOF'
     # << emacs
     time csh -efx downloads.csh >&! downloads.log
 #325.699u 4.199s 5:34.17 98.7%   0+0k 0+0io 1pf+0w
     nice gzip up*.maf
     nice md5sum up*.maf.gz >> md5sum.txt
     # Make a README.txt .
 
     ssh hgwdev
     set dir = /usr/local/apache/htdocs/goldenPath/dm2/multiz15way
     mkdir $dir
     ln -s /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads/*.{gz,txt} $dir
 
 
 # MULTIZ15WAY MAF FRAMES (DONE 9/29/2006 angie)
 # multiz15wayFrames regenerated 11/28/06.
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/frames
     cd /cluster/data/dm2/bed/multiz15way.2006-09-28/frames
     # The following is adapted from MarkD's Makefile used for mm7...
 
     #------------------------------------------------------------------------
     # get the genes for all genomes (when available)
     # currently we have nothing for:
     # droSec1 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 triCas2
 
     # mRNAs with CDS.  single select to get cds+psl, then split that up and
     # create genePred
     # using mrna table as genes: droSim1 droYak2 anoGam1 apiMel2
     mkdir genes
     foreach queryDb (droSim1 droYak2 anoGam1 apiMel2)
       set tmpExt = `mktemp temp.XXXXXX`
       set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
       set tmpMrna = ${queryDb}.mrna.${tmpExt}
       set tmpCds = ${queryDb}.cds.${tmpExt}
       echo $queryDb
       hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
                    from all_mrna,gbCdnaInfo,cds \
                    where (all_mrna.qName = gbCdnaInfo.acc) and \
                      (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
        ${queryDb} > ${tmpMrnaCds}
       cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
       cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
       mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
         stdout \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/$queryDb.tmp.gz
       rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
       mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
       rm -f $tmpExt
     end
     # using refGene for dm2
     # genePreds; (must keep only the first 10 columns for knownGene)
     foreach queryDb (dm2)
       if ($queryDb == "dm2") then
         set geneTbl = refGene
       endif
       hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/$queryDb.tmp.gz
       mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
       rm -f $tmpExt
     end
 
     # MarkD's 1-pass maf methodology
     # This section was re-run *with* dm2 genes (oops, thanks Brooke) 11/28/06.
     ssh kolossus
     cd /cluster/data/dm2/bed/multiz15way/frames
     nice tcsh # easy way to get process niced
     (cat  ../anno/maf/*.maf \
      | time genePredToMafFrames dm2 stdin stdout \
          dm2 genes/dm2.gp.gz \
          droSim1 genes/droSim1.gp.gz droYak2 genes/droYak2.gp.gz \
          anoGam1 genes/anoGam1.gp.gz apiMel2 genes/apiMel2.gp.gz \
      | gzip -c > multiz15way.mafFrames.gz) >& frames.log &
 
     ssh hgwdev
     cd /cluster/data/dm2/bed/multiz15way/frames
     hgLoadMafFrames dm2 multiz15wayFrames multiz15way.mafFrames.gz
 
 
 # PHASTCONS 15WAY WITH METHODS FROM PAPER (DONE 10/3/06 angie)
 # (((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)
 # NOTE FROM QA (5/7/08):  Overlapping data points were found in dm3 phastCons
 # data by user Michael Hiller:
 # http://www.soe.ucsc.edu/pipermail/genome/2008-April/016241.html
 # The overlapping data points were caused by a bug in phastCons that Adam has
 # since fixed.  The bug is present in dm2 phastCons data, but as it is not the
 # most recent assembly for this organism, we are going to leave it as-is.  An
 # example region containing the overlap is: chr3R:6,999,990-7,000,010 .
 
     ssh kkstore04
     mkdir -p /san/sanvol1/scratch/dm2/chrom
     cp -p /cluster/data/dm2/?{,?}/chr*.fa /san/sanvol1/scratch/dm2/chrom/
     # Split chrom fa into smaller windows for phastCons:
     ssh pk
     mkdir /cluster/data/dm2/bed/multiz15way/phastCons
     mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.split
     cd /cluster/data/dm2/bed/multiz15way/phastCons/run.split
     set WINDOWS = /san/sanvol1/scratch/dm2/phastCons/WINDOWS
     rm -fr $WINDOWS
     mkdir -p $WINDOWS
     cat << 'EOF' > doSplit.sh
 #!/bin/csh -ef
 
 set PHAST=/cluster/bin/phast
 set FA_SRC=/san/sanvol1/scratch/dm2/chrom
 set WINDOWS=/san/sanvol1/scratch/dm2/phastCons/WINDOWS
 
 set maf=$1
 set c = $maf:t:r
 set tmpDir = /scratch/msa_split/$c
 rm -rf $tmpDir
 mkdir -p $tmpDir
 ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
    -O dm2,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel2,triCas2 \
    -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
 cd $tmpDir
 foreach file ($c.*.ss)
   gzip -c $file > ${WINDOWS}/$file.gz
 end
 rm -f $tmpDir/$c.*.ss
 rmdir $tmpDir
 'EOF'
 # << for emacs
     chmod a+x doSplit.sh
     rm -f jobList
     foreach file (/cluster/data/dm2/bed/multiz15way/maf/*.maf)
       if (-s $file) then
         echo "doSplit.sh {check in exists+ $file}" >> jobList
       endif
     end
     para make jobList
     para time
 #Completed: 13 of 13 jobs
 #CPU time in finished jobs:       1075s      17.92m     0.30h    0.01d  0.000 y
 #IO & Wait Time:                   101s       1.68m     0.03h    0.00d  0.000 y
 #Average job time:                  90s       1.51m     0.03h    0.00d
 #Longest finished job:             259s       4.32m     0.07h    0.00d
 #Submission to last job:           270s       4.50m     0.07h    0.00d
 
     ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
     # Use consEntropy --NH to make it suggest a --expected-lengths param 
     # that we should try next.  Adam ran this on hg17 to find out the 
     # total entropy of the hg17 model:
     # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
 #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
 #Relative entropy: H=0.608216 bits/site
 #Required length: N=16.085437 sites
 #Total entropy: NH=9.783421 bits
     # Our target is that NH result: 9.7834 bits.  
     # Use phyloFit to make an initial model:
     ssh kolossus
     cd /cluster/data/dm2/bed/multiz15way/phastCons
     /cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \
         --aggregate dm2,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel2,triCas2 \
         -i MAF -o SS > all.ss
     /cluster/bin/phast/phyloFit all.ss \
       --tree "(((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)" \
       -i SS --out-root starting-tree
     cat starting-tree.mod 
 #ALPHABET: A C G T 
 #ORDER: 0
 #SUBST_MOD: REV
 #TRAINING_LNL: -533323268.267133
 #BACKGROUND: 0.285852 0.214455 0.214368 0.285325 
 #RATE_MAT:
 #  -0.924927    0.204278    0.451988    0.268661 
 #   0.272287   -1.098668    0.225404    0.600978 
 #   0.602709    0.225494   -1.100568    0.272364 
 #   0.269158    0.451705    0.204631   -0.925493 
 #TREE: (((((((((dm2:0.032478,(droSim1:0.017650,droSec1:0.015740):0.017736):0.026088,(droYak2:0.058116,droEre2:0.055952):0.031922):0.084953,droAna3:0.218988):0.051563,(dp4:0.013624,droPer1:0.015374):0.210705):0.046101,droWil1:0.292357):0.019351,((droVir3:0.109131,droMoj3:0.142623):0.047595,droGri2:0.154583):0.189999):0.113599,anoGam1:0.357817):0.092848,apiMel2:0.386309):0.169225,triCas2:0.169225);
 # also get GC content from model -- if similar enough, no need to extract it 
 # separately above.
 awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod 
 #0.428823
     # OK, use .429 for --gc below.
 
     # Use the values of --target-coverage and --expected-lengths from the 
     # last iteration of the 9way run, 0.398 and 23.7.
     # Multiply each subst rate on the TREE line by 3.5 which is roughly the 
     # ratio of noncons to cons in 
     # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod
     /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \
       > starting-tree.noncons.mod
     /cluster/bin/phast/consEntropy --NH 9.7834 0.393 23.8 \
       starting-tree{,.noncons}.mod
     # Uh-oh.... out of memory.  OK!  Well, proceed with params from before.
 
     # Now set up cluster job to estimate model parameters given free params 
     # --target-coverage and --expected-lengths and the data.  
     ssh pk
     mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate
     cd /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate
     # FIRST ITERATION: Use ../starting-tree.mod:
     cat << '_EOF_' > doEstimate.sh
 #!/bin/csh -ef
 zcat $1 \
 | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.429 --nrates 1,1 \
     --no-post-probs --ignore-missing \
     --expected-lengths 23.8 --target-coverage 0.393 \
     --quiet --log $2 --estimate-trees $3
 '_EOF_'
 # << for emacs
     # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
     cat << '_EOF_' > doEstimate.sh
 #!/bin/csh -ef
 zcat $1 \
 | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.429 --nrates 1,1 \
     --no-post-probs --ignore-missing \
     --expected-lengths 23.8 --target-coverage 0.393 \
     --quiet --log $2 --estimate-trees $3
 '_EOF_'
 # << for emacs
     chmod a+x doEstimate.sh
     rm -fr LOG TREES
     mkdir -p LOG TREES
     rm -f jobList
     foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
       set root = $f:t:r:r
       echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
     end
     para make jobList
     para time
 #Completed: 139 of 139 jobs
 #CPU time in finished jobs:    1771638s   29527.30m   492.12h   20.51d  0.056 y
 #IO & Wait Time:                  1154s      19.23m     0.32h    0.01d  0.000 y
 #Average job time:               12754s     212.56m     3.54h    0.15d
 #Longest finished job:           18435s     307.25m     5.12h    0.21d
 #Submission to last job:         26561s     442.68m     7.38h    0.31d
 
     # Now combine parameter estimates.  We can average the .mod files
     # using phyloBoot.  This must be done separately for the conserved
     # and nonconserved models
     ssh kolossus
     cd /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate
     ls -1 TREES/*.cons.mod > cons.txt
     /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
       --output-average ave.cons.mod > cons_summary.txt
     ls -1 TREES/*.noncons.mod > noncons.txt
     /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
       --output-average ave.noncons.mod > noncons_summary.txt
     grep TREE ave*.mod
     # FIRST ITERATION:
 #ave.cons.mod:TREE: (((((((((dm2:0.017975,(droSim1:0.012862,droSec1:0.012586):0.011123):0.012087,(droYak2:0.031478,droEre2:0.032714):0.015973):0.033568,droAna3:0.107755):0.019847,(dp4:0.008814,droPer1:0.009578):0.104435):0.016369,droWil1:0.150471):0.005457,((droVir3:0.056362,droMoj3:0.072079):0.020331,droGri2:0.083731):0.092042):0.049021,anoGam1:0.218175):0.046658,apiMel2:0.187335):0.106621,triCas2:0.106621);
 #ave.noncons.mod:TREE: (((((((((dm2:0.059235,(droSim1:0.039799,droSec1:0.039025):0.035598):0.041349,(droYak2:0.104304,droEre2:0.106715):0.054017):0.119983,droAna3:0.377089):0.071962,(dp4:0.028079,droPer1:0.031175):0.369228):0.060569,droWil1:0.536488):0.019784,((droVir3:0.195542,droMoj3:0.254453):0.073077,droGri2:0.290773):0.336972):0.180643,anoGam1:0.783559):0.174661,apiMel2:0.670401):0.382291,triCas2:0.382291);
     # SECOND ITERATION:
 
     cat cons_summary.txt 
     # look over the files cons_summary.txt and noncons_summary.txt.
     # The means and medians should be roughly equal and the stdevs
     # should be reasonably small compared to the means, particularly
     # for rate matrix parameters (at bottom) and for branches to the
     # leaves of the tree.  The stdevs may be fairly high for branches
     # near the root of the tree; that's okay.  Some min values may be
     # 0 for some parameters.  That's okay, but watch out for very large
     # values in the max column, which might skew the mean.  If you see
     # any signs of bad outliers, you may have to track down the
     # responsible .mod files and throw them out.  I've never had to do
     # this; the estimates generally seem pretty well behaved.
 
     # NOTE: Actually, a random sample of several hundred to a thousand
     # alignment fragments (say, a number equal to the number of
     # available cluster nodes) should be more than adequate for
     # parameter estimation.  If pressed for time, use this strategy.
 
     # No consEntropy check because it runs out of mem on an 8G-mem small 
     # cluster node and has not completed in a day on 16G-mem kolossus.
 
     # Now we are ready to set up the cluster job for computing the
     # conservation scores and predicted elements.  The we measure the 
     # conserved elements coverage, and if that's not satisfactory then we 
     # adjust parameters and repeat.  
     ssh pk
     mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.phast
     cd /cluster/data/dm2/bed/multiz15way/phastCons/run.phast
     cat << 'EOF' > doPhastCons.sh
 #!/bin/csh -ef
 set pref = $1:t:r:r
 set chr = `echo $pref | awk -F\. '{print $1}'`
 set tmpfile = /scratch/phastCons.$$
 zcat $1 \
 | /cluster/bin/phast/phastCons - \
     ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
     --expected-lengths 23.8 --target-coverage 0.393 \
     --quiet --seqname $chr --idpref $pref \
     --viterbi /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \
     --require-informative 0 \
   > $tmpfile
 gzip -c $tmpfile > /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz
 rm $tmpfile
 'EOF'
 # << for emacs
     chmod a+x doPhastCons.sh
     rm -fr /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
     mkdir -p /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
     rm -f jobList
     foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
       echo doPhastCons.sh $f >> jobList
     end
     para make jobList
     para time
 #Completed: 139 of 139 jobs
 #CPU time in finished jobs:       1388s      23.14m     0.39h    0.02d  0.000 y
 #IO & Wait Time:                   402s       6.70m     0.11h    0.00d  0.000 y
 #Average job time:                  13s       0.21m     0.00h    0.00d
 #Longest finished job:              17s       0.28m     0.00h    0.00d
 #Submission to last job:           170s       2.83m     0.05h    0.00d
 
     # back on kolossus:
     # combine predictions and transform scores to be in 0-1000 interval
     cd /cluster/data/dm2/bed/multiz15way/phastCons
     awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
       /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/*.bed \
     | /cluster/bin/scripts/lodToBedScore > all.bed
 
     ssh hgwdev
     # Now measure coverage of CDS by conserved elements. 
     # We want the "cover" figure to be close to 68.9%.
     cd /cluster/data/dm2/bed/multiz15way/phastCons
     featureBits -enrichment dm2 flyBaseGene:cds all.bed
     # FIRST ITERATION: 68.9'll do!
 #flyBaseGene:cds 16.649%, all.bed 37.421%, both 11.471%, cover 68.90%, enrich 1.84x
 
     # Having met the CDS coverage target, load up the results.
     hgLoadBed dm2 phastConsElements15way all.bed
 
     # Create wiggle
     ssh pk
     mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.wib
     cd /cluster/data/dm2/bed/multiz15way/phastCons/run.wib
     rm -rf /san/sanvol1/scratch/dm2/phastCons/wib
     mkdir -p /san/sanvol1/scratch/dm2/phastCons/wib
     cat << 'EOF' > doWigEncode
 #!/bin/csh -ef
 set chr = $1
 cd /san/sanvol1/scratch/dm2/phastCons/wib
 zcat `ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
       | sort -t\. -k2,2n` \
 | wigEncode stdin ${chr}_phastCons.wi{g,b}
 'EOF'
 # << for emacs
     chmod a+x doWigEncode
     rm -f jobList
     foreach chr (`ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS \
                   | awk -F\. '{print $1}' | sort -u`)
       echo doWigEncode $chr >> jobList
     end
     para make jobList
     para time
 #Completed: 13 of 13 jobs
 #CPU time in finished jobs:         71s       1.18m     0.02h    0.00d  0.000 y
 #IO & Wait Time:                    28s       0.47m     0.01h    0.00d  0.000 y
 #Average job time:                   8s       0.13m     0.00h    0.00d
 #Longest finished job:              17s       0.28m     0.00h    0.00d
 #Submission to last job:           101s       1.68m     0.03h    0.00d
 
     # back on kkstore04, copy wibs, wigs and POSTPROBS (people sometimes want 
     # the raw scores) from san/sanvol1
     cd /cluster/data/dm2/bed/multiz15way/phastCons
     rm -rf wib POSTPROBS
     rsync -av /san/sanvol1/scratch/dm2/phastCons/wib .
     rsync -av /san/sanvol1/scratch/dm2/phastCons/POSTPROBS .
 
     # load wiggle component of Conservation track
     ssh hgwdev
     mkdir /gbdb/dm2/multiz15way/wib
     cd /cluster/data/dm2/bed/multiz15way/phastCons
     chmod 775 . wib
     chmod 664 wib/*.wib
     ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz15way/wib/
     hgLoadWiggle dm2 phastCons15way \
       -pathPrefix=/gbdb/dm2/multiz15way/wib wib/*.wig
     rm wiggle.tab
 
     # and clean up san/sanvol1.
     rm -r /san/sanvol1/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib}
     rm -r /san/sanvol1/scratch/dm2/chrom
     # Offer raw scores for download since fly folks are likely to be interested:
     ssh kkstore04
     cd /cluster/data/dm2/bed/multiz15way/phastCons/POSTPROBS
     mkdir ../postprobsDownload
     foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
       zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
         > ../postprobsDownload/$chr.pp.gz
     end
     cd ../postprobsDownload
     md5sum *.gz > md5sum.txt
     # Make a README.txt there too.
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons15way
     cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons15way
     ln -s /cluster/data/dm2/bed/multiz15way/phastCons/postprobsDownload/* .
 
     # make a tree picture using phyloGif.
     cd /cluster/data/dm2/bed/multiz15way
     /usr/local/apache/cgi-bin/phyloGif -phyloGif_tree=tree-commas-orgNames.nh \
       -phyloGif_width=260 -phyloGif_height=480 > dm2_15way.gif
     cp -p dm2_15way.gif ~/browser/images/phylo/dm2_15way.gif
     # Check in browser/images/phylo/dm2_15way.gif.  In a clean ~/browser:
     cd ~/browser
     make alpha
 
 
 #########################################################################
 # ORegAnno Open Regulatory Annotation 		Belinda Giardine April 2007
 # 
 # Update data via a reload
 # trackDb is at top level
 # parser for dump from Obi Griffith is at PSU.
 # cp data to UCSC and load
 hgsql dm2
 	truncate table oreganno;
 	truncate table oregannoAttr;
 	truncate table oregannoLink;
 
 #load the tables
 hgsql hgFixed
     load data local infile "oregannoAttr.dm2.txt" into table oregannoAttr;
     load data local infile "oregannoLink.dm2.txt" into table oregannoLink;
 grep "^chr" oreganno.dm2.txt | sort -k1,1 -k2,2n > oreganno.bed
 hgLoadBed dm2 oreganno oreganno.bed -noSort -oldTable -tab
 rm oreganno.bed
 
 #########################################################################
 # FLYBASE 4.3 ANNOTATIONS (DONE (except for contacting flybase-help) 10/11/06 angie - UPDATED 4/6/07)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/flybase4.3
     cd /cluster/data/dm2/bed/flybase4.3
     foreach c (2L 2R 3L 3R 4 X)
       wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.3_20060303/gff/$c.gff
     end
     cat *.gff > flybase.gff3
     # What data sources are represented in this file?
     grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
     # excerpt (many other sources, including blastx:... , sim4:... and 
     # tblastx:...; also various other types for source "FlyBase" and "."):
   85778 FlyBase CDS
   64452 FlyBase exon
   14449 FlyBase gene
   19376 FlyBase mRNA
      66 FlyBase miRNA
     116 FlyBase ncRNA
      51 FlyBase pseudogene
      96 FlyBase rRNA
      46 FlyBase snRNA
      63 FlyBase snoRNA
     292 FlyBase tRNA
   36921 .     transcription_start_site
    6003 FlyBase transposable_element
   35734 FlyBase transposable_element_insertion_site
     # Notable differences since 4.2: "FlyBase" instead of "." for most of 
     # those, and now there seens to be more than one CDS per exon while it 
     # use to be ~1 CDS per gene.  Hmmmm....
 
     # What keywords are defined in the 9th field?  Watch out for stray ;'s
     # from html-escaped special characters...
     grep -v '^#' flybase.gff3 \
     | awk '{print $9;}' \
     | perl -wpe 's/&\w\w\w\w?;//g; s/=[^;]+;/\n/g; s/=.*$//;' \
     | sort | uniq -c
 4250095 
   74483 Alias
   31137 Dbxref
 2795760 ID
 1432800 Name
     765 Ontology_term
  515484 Parent
 1046375 Synonym
   17901 associated_with
   15773 cyto_range
   36666 derived_cyto_location
   96162 evidence
   15657 gbunit
   12101 programversion
   11837 putative_ortholog_of
   11790 to_name
  121126 to_species
     # Notable differences: synonym -> Synonym;  no more synonym_2nd or 
     # dbxref_2nd (or species); some new keywords but I think we can ignore.
 
     # Copy over 4.2 script and modify to fit the latest Immortal Unchanging
     # Widely Adopted Standard GFF3.
     cp ../flybase4.2/extractGenes.pl .
     extractGenes.pl flybase.gff3
 
     # Get predicted proteins (for main annotations only)
     wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.3_20060303/fasta/dmel-all-translation-r4.3.fasta
     # save some space.
     nice gzip ?{,?}.gff dmel-all-transl*
     # Translate the fancy headers (some of which already start with CG-P's 
     # that match the FBtr's -- but not all!) into CG-R's to match flyBaseGene:
     zcat dmel-all-translation-r4.3.fasta.gz \
     | perl -we 'open(X, "flyBase2004Xref.tab") || die; \
                 while (<X>) { \
                   @w = split("\t"); \
                   $fbtr2cgr{$w[3]} = $w[0]; \
                 } \
                 while (<>) { \
                   if (/^>.*parent=(FBtr\d+)/) { \
                     $cgr = $fbtr2cgr{$1}; \
                     s/^>.*/>$cgr/ if ($cgr); \
                   } \
                   print; \
                 } \
                ' \
       > flybasePep.fa
 
     ssh hgwdev
     cd /cluster/data/dm2/bed/flybase4.3
     # Protein-coding genes:
     ldHgGene -gtf dm2 flyBaseGene flyBase*.gtf
     hgPepPred dm2 generic flyBasePep flybasePep.fa
     # Noncoding genes:
     hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
     # Cross-referencing info for both coding and noncoding:
     hgLoadSqlTab dm2 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
       flyBase2004Xref.tab
     # -- had to remove a couple duplicate lines in flyBase2004Xref.tab.  
     # CG4110-RA and CG13904-RA had multiple mappings, both with consecutive 
     # FBtr's so I tossed out the earlier FBtr for both.  
 
     # Do this next time around, to catch joinerCheck errors before 
     # running doHgNearBlastp with these proteins:
     runJoiner.csh dm2 flyBasePep
 
     # Some featureBits comparisons with refGene which shoudl be pretty much 
     # like version 4.0 with a few noncoding genes tossed in... but this is 
     # a bigger difference than we expect:
     featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout
 #604659 bases of 131698467 (0.459%) in intersection
     # Hmmm, flyBaseGene has nothing on the last 165,000bp of chrX!:
     # chrX:22,058,584-22,224,390
     # Here's anothre 176,000 gap in chrX:
     # chrX:21,836,109-22,012,685
     # nor the last 496,000bp of chr3L:
     # chr3L:23,275,448-23,771,897
     # nor the first 527,000bp of chr2R:
     # chr2R:1-527,716
     # No files for chr2h, chr3h, chr4h, chrXh, chrYh, chrU, so those are 
     # entirely devoid of FlyBase Genes... yikes.
 #TODO: contact flybase-help about the lack of annotations in those ranges...
 
     # This shows only the dropped isoforms/duplicates (ignores noncoding):
     featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \
        -bed=stdout
 #chr3R   15620310        15621493        chr3R.1
 #chr2L   4698094 4700940 chr2L.1
 #chrX    3638700 3639895 chrX.1
 #474282 bases of 131698467 (0.360%) in intersection
 
     # add upstream* downloadable files
     cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
     foreach size (1000 2000 5000)
       echo upstream$size
       nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
       | nice gzip -c > upstream$size.fa.gz
     end
     nice md5sum *.zip *.gz > md5sum.txt
     # Now go redo all the sections above for tables related to flyBase*.
 
     # 4/6/07: remove mitochondrial genes from flyBasePep because they 
     # are missing from flyBase2004Xref and caused joinerCheck errors.
     hgsql dm2 -e 'delete from flyBasePep where name like "mt:%"'
 
 
 #########################################################################
 # AFFYMETRIX TIMECOURSE (DONE 12/21/06 angie)
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/affyDrosDev
     cd /cluster/data/dm2/bed/affyDrosDev
     # Download files from Affy:
     wget -r --level=1 --directory-prefix=transfrags --no-directories \
       http://transcriptome.affymetrix.com/download/publication/dros_dev/transfrags
     foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
       mkdir $chr
       pushd $chr
       foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
         wget --timestamping http://transcriptome.affymetrix.com/download/publication/dros_dev/graphs/$chr/Dro_Total_AS_${t}_C01.sig.gr
       end
       popd
     end
 
     # Transfrags -- Add chromosome to make bed3 (start is 0-based per Sujit):
     foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
       cp /dev/null affyDrosDevTransfrags$t.bed
       foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
         if ($chr == "chrM") continue
         awk '{print "'$chr'\t" $1 "\t" $2;}' \
           transfrags/$chr.Dro_Total_AS_${t}_C01 >> affyDrosDevTransfrags$t.bed
       end
     end
     # Transcription level -- Add wig format variableStep lines and 
     # agglomerate into per-timepoint .wig files:
     foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
       cp /dev/null affyDrosDevSignal$t.varStep.wig
       foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
         if ($chr == "chrM") continue
         echo 'variableStep chrom='$chr >> affyDrosDevSignal$t.varStep.wig
         cat $chr/Dro_Total_AS_${t}_C01.sig.gr >> affyDrosDevSignal$t.varStep.wig
       end
       wigEncode affyDrosDevSignal$t.varStep.wig affyDrosDevSignal$t.wi{g,b}
     end
 #Converted affyDrosDevSignal1.varStep.wig, upper limit 1156.00, lower limit -860.00
 #Converted affyDrosDevSignal2.varStep.wig, upper limit 1499.75, lower limit -580.50
 #Converted affyDrosDevSignal3.varStep.wig, upper limit 1627.50, lower limit -586.00
 #Converted affyDrosDevSignal4.varStep.wig, upper limit 1794.50, lower limit -718.50
 #Converted affyDrosDevSignal5.varStep.wig, upper limit 2277.25, lower limit -913.00
 #Converted affyDrosDevSignal6.varStep.wig, upper limit 2095.25, lower limit -864.00
 #Converted affyDrosDevSignal7.varStep.wig, upper limit 1071.75, lower limit -560.50
 #Converted affyDrosDevSignal8.varStep.wig, upper limit 1182.50, lower limit -1019.00
 #Converted affyDrosDevSignal9.varStep.wig, upper limit 2028.75, lower limit -942.50
 #Converted affyDrosDevSignal10.varStep.wig, upper limit 1212.00, lower limit -779.00
 #Converted affyDrosDevSignal11.varStep.wig, upper limit 1456.75, lower limit -454.50
 #Converted affyDrosDevSignal12.varStep.wig, upper limit 1203.50, lower limit -459.50
     # --> trackDb setting: type wig -1019.0 2277.25
 
     # Load tables (--> composite tracks)
     ssh hgwdev
     cd /cluster/data/dm2/bed/affyDrosDev
     mkdir /gbdb/dm2/affyDrosDev
     ln -s /cluster/data/dm2/bed/affyDrosDev/affyDrosDevSignal*.wib \
       /gbdb/dm2/affyDrosDev
     foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
       hgLoadBed dm2 affyDrosDevTransfrags$t affyDrosDevTransfrags$t.bed
       hgLoadWiggle dm2 affyDrosDevSignal$t \
         -pathPrefix=/gbdb/dm2/affyDrosDev affyDrosDevSignal$t.wig
     end
     rm bed.tab wiggle.tab
 
 ###########################################################################
 # HUMAN (hg18) PROTEINS TRACK (DONE,  2007-01-26, braney)
     ssh kkstore04
     bash # if not using bash shell already
     # use blastDb database made for hg17 protein track
 
     mkdir -p  /san/sanvol1/scratch/dm2/blastDb; 
     cd /cluster/data/dm2/blastDb
     for i in nhr nin nsq; 
     do 
 	cp *.$i /san/sanvol1/scratch/dm2/blastDb; 
     done
 
    mkdir /cluster/data/dm2/bed/tblastn.hg18KG
    cd  /cluster/data/dm2/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/dm2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
 # 288 query.lst
 
    # we want around 250000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
 # 36727/(250000/288) = 42.309504
 
    mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa
    split -l 60 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa/kg
    ln -s /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa kgfa
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
      done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut
    ln -s /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/dm2/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
 	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/dm2/jkStuff/subChr.lft warn $f.2  
 	liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/dm2/jkStuff/liftAll.lft warn $f.3  
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4
 
         if pslCheck -prot $3.tmp                                                  
         then                                                                      
             mv $3.tmp $3                                                          
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0                                                                    
     fi                                                                            
 fi                                                                                
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     
     # then run the Blast cluster jobs
     ssh pk
     cd /cluster/data/dm2/bed/tblastn.hg18KG
     para create blastSpec
     para try, check, push, check etc.
     para time
 
 # Completed: 176544 of 176544 jobs
 # CPU time in finished jobs:    1614468s   26907.81m   448.46h   18.69d  0.051 y
 # IO & Wait Time:                589357s    9822.61m   163.71h    6.82d  0.019 y
 # Average job time:                  12s       0.21m     0.00h    0.00d
 # Longest finished job:              73s       1.22m     0.02h    0.00d
 # Submission to last job:         58680s     978.00m    16.30h    0.68d
 
     ssh kkstore03
     cd /cluster/data/dm2/bed/tblastn.hg18KG
     tcsh
     mkdir chainRun
     cd chainRun
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainOne
     ls -1dS \
       /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     ssh kk
     cd /cluster/data/dm2/bed/tblastn.hg18KG/chainRun
     para create chainSpec
     para try, check, push, check etc.
 
 # Completed: 613 of 613 jobs
 # CPU time in finished jobs:       6560s     109.33m     1.82h    0.08d  0.000 y
 # IO & Wait Time:                 31904s     531.74m     8.86h    0.37d  0.001 y
 # Average job time:                  63s       1.05m     0.02h    0.00d
 # Longest finished job:             176s       2.93m     0.05h    0.00d
 # Submission to last job:          1338s      22.30m     0.37h    0.02d
 
     ssh kkstore03
     cd /cluster/data/dm2/bed/tblastn.hg18KG/blastOut
     bash # if using another shell
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/dm2/bed/tblastn.hg18KG/blastHg18KG.psl
     pslCheck /cluster/data/dm2/bed/tblastn.hg18KG/blastHg18KG.psl
     # this is ok.
     # load table 
     ssh hgwdev
     cd /cluster/data/dm2/bed/tblastn.hg18KG
     hgLoadPsl dm2 blastHg18KG.psl
     # check coverage
     featureBits dm2 blastHg18KG
 # 5946352 bases of 131698467 (4.515%) in intersection
 
     featureBits dm2 refGene:cds blastHg18KG -enrichment
 # refGene:cds 17.097%, blastHg17KG 4.427%, both 4.246%, cover 24.83%, enrich 5.61x
 
     ssh kkstore04
     rm -rf /cluster/data/dm2/bed/tblastn.hg18KG/blastOut
     rm -rf /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut
 
 #end human proteins
 #################################################################
 
 
 #########################################################################
 # CHROMOSOME BANDS (DONE 6/19/07 angie)
     ssh hgwdev
     cd /cluster/data/dm2/bed/flybase4.3
     grep chromosome_band flybase.gff3 \
     | perl -wpe 'chomp; @w = split; \
                  $w[3]--; $w[3] = 0 if ($w[3] < 0); \
                  $w[8] =~ s/^.*ID=band-(\w+);$/$1/; \
                  if ($w[4] <= 0) { s/^.*$//; } else { \
                  s/^.*$/chr$w[0]\t$w[3]\t$w[4]\t$w[8]\t\n/; }' \
       > cytoBand.txt
     # Did a bit of manual cleanup... there are some rows that have negative
     # starts that should clearly have negative ends, but for some reason 
     # their end coords have been substituted with 10, 20, 30, 40 --
     # I just deleted those 12 rows.
     hgLoadSqlTab dm2 cytoBand ~/kent/src/hg/lib/cytoBand.sql cytoBand.txt
     # Drop or modify the items that are off the end of their chroms:
     awk '{print "delete from dm2.cytoBand where chrom = \""$1"\" and chromStart >= "$2"; update dm2.cytoBand set chromEnd = "$2" where chrom = \""$1"\" and chromEnd > "$2";";}' \
       /cluster/data/dm2/chrom.sizes \
     | hgsql dm2
     checkTableCoords -verbose=2 dm2 cytoBand
 
     # Normally we would just create cytoBandIdeo by select * from cytoBand --
     # but that gives too high of a resolution for any bands to be visible!  
     # So make a boiled-down cytoBandIdeo that just has {number, letter} 
     # instead of {number, letter, number}.
     perl -we 'while (<>) { \
                 chomp; @w=split(" "); \
                 if ($w[3] =~ /(\d+[A-Z]+)\d+/) { \
                   $b = $1; \
                 } else { die "doh!" } \
                 if (! defined $lastB) { \
                   $start = $w[1]; \
                 } elsif ($lastB ne $b) { \
                   print "$chrom\t$start\t$end\t$lastB\t\n"; \
                   $start = $w[1]; \
                 } \
                 ($chrom, $end, $lastB) = ($w[0], $w[2], $b); \
               } \
               print "$chrom\t$start\t$end\t$lastB\t\n";' cytoBand.txt \
       > cytoBandIdeo.txt
     hgLoadSqlTab dm2 cytoBandIdeo ~/kent/src/hg/lib/cytoBandIdeo.sql \
       cytoBandIdeo.txt
     # Drop or modify the items that are off the end of their chroms:
     awk '{print "delete from dm2.cytoBandIdeo where chrom = \""$1"\" and chromStart >= "$2"; update dm2.cytoBandIdeo set chromEnd = "$2" where chrom = \""$1"\" and chromEnd > "$2";";}' \
       /cluster/data/dm2/chrom.sizes \
     | hgsql dm2
     checkTableCoords -verbose=2 dm2 cytoBandIdeo
 
 
 
 #########################################################################
 # EvoFold track (done, 10.02.2007, Jakob Skou Pedersen)
 #
 # EvoFold predictions were made based on the drosophila subset (12
 # species) of the 15-way multiz insect alignment. The data generation
 # process will change in the future and is therefore not described in
 # detail here. Instead we start with the final set of predictions
 # supplied by me.
 
   ssh hgwdev
   mkdir -p /cluster/data/dm2/bed/evofold
   cd /cluster/data/dm2/bed/evofold
   cp /cluster/home/jsp/projects/flies/finalDataSets/finalPredAll.bed foldsDm2.bed
   # The folds_hg17.bed is a 9-column bed file: column 1-6 provide
   # standard bed information, column 7 is element length, column 8 is
   # the RNA secondary structure in parentheses format, and column nine
   # is a commaseparated list of position specific confidence scores
   # (floats).
   hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql dm2 evofold foldsDm2.bed
 
 
 #########################################################################
 # BDTNP CHIP-CHIP (DONE 4/9/08 angie)
 # New data added 2/2/09, see next section.
     # Berkeley Drosophila Transcription Network Project's ChIP-Chip data,
     # from Mark Biggin (MDBiggin at lbl dot gov).
     ssh kkstore04
     mkdir /cluster/data/dm2/bed/bdtnpChipper
     cd /cluster/data/dm2/bed/bdtnpChipper
     wget http://rana.lbl.gov/~smacarth/ChIPChip/Public_Release/wiggle/all_public_080409_1FDR.wig.bz2
     wget http://rana.lbl.gov/~smacarth/ChIPChip/Public_Release/wiggle/all_public_080409_25FDR.wig.bz2
     # The files contain concatenated custom tracks; split those out into
     # one .wig file per track, with UCSC table names:
     bzcat all_public* | perl -we \
       'my $fh; \
        while (<>) { \
          if (/^track .* name="([\w-]+)"/) { \
            close($fh) if ($fh); \
            my $name = $1; \
            $name =~ s/([a-zI]+)_(H14)?(\d+)?_\d+_675bp_smoothed_scores-sym-(\d+)/bdtnp\u$1$3Fdr$4/ \
              || die "parse:\n$name\n\t"; \
            open($fh, ">$name.wig") || die "Cant open >$name.wig: $!\n"; \
          } else { \
            die "data before track line?  at input line $.:\n$_" if (! $fh); \
            print $fh $_; \
          } \
        }'
     # These two warnings are for the polII filenames, no problem:
 #Use of uninitialized value in concatenation (.) or string at -e line 6, <> line 714856.
 #Use of uninitialized value in concatenation (.) or string at -e line 6, <> line 3415135.
 
     mkdir wigEncoded
     cd wigEncoded
     foreach f (../*Fdr*.wig)
       wigEncode $f $f:t:r.{wig,wib}
     end
 #Converted ../bdtnpBcd1Fdr1.wig, upper limit 8.14, lower limit 0.80
 #Converted ../bdtnpBcd1Fdr25.wig, upper limit 8.14, lower limit 0.80
 #Converted ../bdtnpBcd2Fdr1.wig, upper limit 10.74, lower limit 0.68
 #Converted ../bdtnpBcd2Fdr25.wig, upper limit 10.74, lower limit 0.68
 #Converted ../bdtnpCad1Fdr1.wig, upper limit 16.20, lower limit 1.06
 #Converted ../bdtnpCad1Fdr25.wig, upper limit 16.20, lower limit 0.48
 #Converted ../bdtnpGt2Fdr1.wig, upper limit 22.22, lower limit 1.21
 #Converted ../bdtnpGt2Fdr25.wig, upper limit 22.22, lower limit 0.49
 #Converted ../bdtnpHb1Fdr1.wig, upper limit 14.38, lower limit 0.92
 #Converted ../bdtnpHb1Fdr25.wig, upper limit 14.38, lower limit 0.86
 #Converted ../bdtnpHb2Fdr1.wig, upper limit 24.25, lower limit 0.88
 #Converted ../bdtnpHb2Fdr25.wig, upper limit 24.25, lower limit 0.78
 #Converted ../bdtnpKni1Fdr1.wig, upper limit 8.08, lower limit 1.78
 #Converted ../bdtnpKni1Fdr25.wig, upper limit 8.08, lower limit 1.01
 #Converted ../bdtnpKni2Fdr1.wig, upper limit 8.44, lower limit 1.07
 #Converted ../bdtnpKni2Fdr25.wig, upper limit 8.44, lower limit 0.76
 #Converted ../bdtnpKr1Fdr1.wig, upper limit 14.47, lower limit 0.68
 #Converted ../bdtnpKr1Fdr25.wig, upper limit 14.47, lower limit 0.68
 #Converted ../bdtnpKr2Fdr1.wig, upper limit 12.92, lower limit 0.66
 #Converted ../bdtnpKr2Fdr25.wig, upper limit 12.92, lower limit 0.66
 #Converted ../bdtnpPolIIFdr1.wig, upper limit 30.86, lower limit 0.93
 #Converted ../bdtnpPolIIFdr25.wig, upper limit 30.86, lower limit 0.70
 #Converted ../bdtnpZ2Fdr1.wig, upper limit 11.49, lower limit 0.95
 #Converted ../bdtnpZ2Fdr25.wig, upper limit 11.49, lower limit 0.79
 
     ssh hgwdev
     mkdir /gbdb/dm2/bdtnp
     ln -s /cluster/data/dm2/bed/bdtnpChipper/wigEncoded/*.wib /gbdb/dm2/bdtnp/
     cd /cluster/data/dm2/bed/bdtnpChipper/wigEncoded
     foreach f (*.wig)
       hgLoadWiggle -pathPrefix=/gbdb/dm2/bdtnp dm2 $f:r $f
     end
     rm wiggle.tab
 
     # Print out template for 24 subtrack defs (longLabel will need editing):
     set pri = 1
     foreach f (*.wig)
       echo "    track $f:r"
       echo "    subTrack bdtnpChipper"
       set fac = `echo $f:r | perl -wpe 's/^bdtnp([A-Za-z]+).*/\l$1/ || die;'`
       set FAC = `echo $fac | perl -wpe 's/^(\w+)$/\U$1/;'`
       set ab  = `echo $f:r | perl -wpe 's/^bdtnp[A-Za-z]+(\d)?F.*/$1/ || die;'`
       set fdr = `echo $f:r | perl -wpe 's/.*Fdr(\d+)$/$1/ || die;'`
       echo "    subGroups fdr=$fdr factor=$FAC"
       echo "    shortLabel $fac AB $ab FDR $fdr%"
       echo "    longLabel ($fac) antibody $ab, stage 4-5 embryos, False Discovery Rate (FDR) $fdr%"
       if ($fdr == 1) then
         echo "    color 50,50,200"
       else
         echo "    color 25,150,25"
       endif
       echo "    priority $pri"
       echo ""
       set pri = `expr $pri + 1`
     end
 
 
 #########################################################################
 # BDTNP CHIP-CHIP (DONE 2/2/09 angie)
     ssh hgwdev
     cd /hive/data/genomes/dm2/bed/bdtnpChipper
     wget http://rana.lbl.gov/~simi/wiggle/BDTNP_all_new_020209_1FDR.wig.bz2
     wget http://rana.lbl.gov/~simi/wiggle/BDTNP_all_new_020209_25FDR.wig.bz2
     # The files contain concatenated custom tracks; split those out into
     # one .wig file per track, with UCSC table names:
     bzcat BDTNP_all_new* | perl -we \
       'my $fh; \
        while (<>) { \
-         if (/^track .* name="([\w-]+)"/) { \
+         if (/^track .* name="([\w-\.]+)"/) { \
            close($fh) if ($fh); \
            my $name = $1; \
            $name =~ s/_FQ_/_1_/;  $name =~ s/_BQ_/_2_/; \
            $name =~ s/([a-zA-Z0-9]+)_(H14)?(\d+)?_\d+_(FactorResults\.)?675bp_smoothed_scores-sym-(\d+)/bdtnp\u$1$3Fdr$5/ \
              || die "parse:\n$name\n\t"; \
            print "$name\n"; \
            open($fh, ">$name.wig") || die "Cant open >$name.wig: $!\n"; \
+         } elsif (/^track /) { \
+           die "missed a track line:\n$_\n\t"; \
          } else { \
            die "data before track line?  at input line $.:\n$_" if (! $fh); \
            print $fh $_; \
          } \
        }'
     cd wigEncoded
     foreach f (../*Fdr*.wig)
       if (! -e $f:t:r.wig) then
         wigEncode $f $f:t:r.{wig,wib}
       endif
     end
 #Converted ../bdtnpD1Fdr1.wig, upper limit 20.33, lower limit 0.51
 #Converted ../bdtnpD1Fdr25.wig, upper limit 20.33, lower limit 0.51
 #Converted ../bdtnpDa2Fdr1.wig, upper limit 26.09, lower limit 0.79
 #Converted ../bdtnpDa2Fdr25.wig, upper limit 26.09, lower limit 0.49
 #Converted ../bdtnpDl3Fdr1.wig, upper limit 16.97, lower limit 0.65
 #Converted ../bdtnpDl3Fdr25.wig, upper limit 16.97, lower limit 0.62
 #Converted ../bdtnpFtz3Fdr1.wig, upper limit 16.54, lower limit 1.07
 #Converted ../bdtnpFtz3Fdr25.wig, upper limit 16.54, lower limit 0.84
 #Converted ../bdtnpH1Fdr1.wig, upper limit 18.52, lower limit 0.68
 #Converted ../bdtnpH1Fdr25.wig, upper limit 18.52, lower limit 0.68
 #Converted ../bdtnpH2Fdr1.wig, upper limit 13.23, lower limit 0.68
 #Converted ../bdtnpH2Fdr25.wig, upper limit 13.23, lower limit 0.65
 #Converted ../bdtnpHkb1Fdr1.wig, upper limit 11.84, lower limit 1.03
 #Converted ../bdtnpHkb1Fdr25.wig, upper limit 11.84, lower limit 0.35
 #Converted ../bdtnpHkb2Fdr1.wig, upper limit 10.73, lower limit 0.87
 #Converted ../bdtnpHkb2Fdr25.wig, upper limit 10.73, lower limit 0.33
 #Converted ../bdtnpHkb3Fdr1.wig, upper limit 10.50, lower limit 0.90
 #Converted ../bdtnpHkb3Fdr25.wig, upper limit 10.50, lower limit 0.37
 #Converted ../bdtnpMad2Fdr1.wig, upper limit 12.58, lower limit 0.67
 #Converted ../bdtnpMad2Fdr25.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpMed2Fdr1.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpMed2Fdr25.wig, upper limit 12.58, lower limit 0.67
 #Converted ../bdtnpPrd1Fdr1.wig, upper limit 13.97, lower limit 0.78
 #Converted ../bdtnpPrd1Fdr25.wig, upper limit 13.97, lower limit 0.78
 #Converted ../bdtnpPrd2Fdr1.wig, upper limit 12.00, lower limit 0.84
 #Converted ../bdtnpPrd2Fdr25.wig, upper limit 12.00, lower limit 0.84
 #Converted ../bdtnpRun1Fdr1.wig, upper limit 10.02, lower limit 0.60
 #Converted ../bdtnpRun1Fdr25.wig, upper limit 10.02, lower limit 0.60
 #Converted ../bdtnpRun2Fdr1.wig, upper limit 7.58, lower limit 1.16
 #Converted ../bdtnpRun2Fdr25.wig, upper limit 7.58, lower limit 0.58
 #Converted ../bdtnpShn2Fdr1.wig, upper limit 12.62, lower limit 1.19
 #Converted ../bdtnpShn2Fdr25.wig, upper limit 12.62, lower limit 0.81
 #Converted ../bdtnpShn3Fdr1.wig, upper limit 10.28, lower limit 1.09
 #Converted ../bdtnpShn3Fdr25.wig, upper limit 10.28, lower limit 0.95
 #Converted ../bdtnpSlp11Fdr1.wig, upper limit 40.39, lower limit 1.06
 #Converted ../bdtnpSlp11Fdr25.wig, upper limit 40.39, lower limit 0.62
 #Converted ../bdtnpSna1Fdr1.wig, upper limit 8.41, lower limit 0.97
 #Converted ../bdtnpSna1Fdr25.wig, upper limit 8.41, lower limit 0.77
 #Converted ../bdtnpSna2Fdr1.wig, upper limit 14.30, lower limit 0.57
 #Converted ../bdtnpSna2Fdr25.wig, upper limit 14.30, lower limit 0.57
 #Converted ../bdtnpTFIIB1Fdr1.wig, upper limit 21.81, lower limit 0.59
 #Converted ../bdtnpTFIIB1Fdr25.wig, upper limit 21.81, lower limit 0.59
 #Converted ../bdtnpTll1Fdr1.wig, upper limit 16.66, lower limit 0.98
 #Converted ../bdtnpTll1Fdr25.wig, upper limit 16.66, lower limit 0.57
 #Converted ../bdtnpTwi1Fdr1.wig, upper limit 26.11, lower limit 0.77
 #Converted ../bdtnpTwi1Fdr25.wig, upper limit 26.11, lower limit 0.68
 #Converted ../bdtnpTwi2Fdr1.wig, upper limit 26.66, lower limit 0.79
 #Converted ../bdtnpTwi2Fdr25.wig, upper limit 26.66, lower limit 0.55
 
     foreach f (/hive/data/genomes/dm2/bed/bdtnpChipper/wigEncoded/*.wib)
       if (! -l /gbdb/dm2/bdtnp/$f:t) then
         ln -s $f /gbdb/dm2/bdtnp/
         hgLoadWiggle -pathPrefix=/gbdb/dm2/bdtnp dm2 $f:t:r $f:t:r.wig
       endif
     end
     rm wiggle.tab
 
     # Print out template for new subtrack defs (longLabel will need editing):
     set pri = 25
     foreach f (`ls -l *.wig | grep Feb | awk '{print $9;}'`)
       echo "    track $f:r"
       echo "    subTrack bdtnpChipper off"
       set fac = `echo $f:r | perl -wpe 's/^bdtnp([A-Za-z]+).*/\l$1/ || die;'`
       set FAC = `echo $fac | perl -wpe 's/^(\w+)$/\U$1/;'`
       set ab  = `echo $f:r | perl -wpe 's/^bdtnp[A-Za-z]+(\d+)?Fdr.*/$1/ || die;'`
       set fdr = `echo $f:r | perl -wpe 's/.*Fdr(\d+)$/$1/ || die;'`
       echo "    subGroups fdr=$fdr factor=$FAC"
       echo "    shortLabel $fac AB $ab FDR $fdr%"
       echo "    longLabel ($fac) antibody $ab, stage 4-5 embryos, False Discovery Rate (FDR) $fdr%"
       if ($fdr == 1) then
         echo "    color 50,50,200"
       else
         echo "    color 25,150,25"
       endif
       echo "    priority $pri"
       echo ""
       set pri = `expr $pri + 1`
     end
 
 
 #########################################################################
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
 update genbank.conf:
 dm2.upstreamGeneTbl = refGene
 dm2.upstreamMaf = multiz15way /hive/data/genomes/dm2/bed/multiz15way/species.lst