src/hg/makeDb/doc/dasNov1.txt 1.7
1.7 2009/07/21 21:01:41 markd
added transmap update blurb
Index: src/hg/makeDb/doc/dasNov1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/dasNov1.txt,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 1000000 -r1.6 -r1.7
--- src/hg/makeDb/doc/dasNov1.txt 4 Dec 2008 00:59:07 -0000 1.6
+++ src/hg/makeDb/doc/dasNov1.txt 21 Jul 2009 21:01:41 -0000 1.7
@@ -1,846 +1,855 @@
# for emacs: -*- mode: sh; -*-
# Armadillo May 2005 Broad Assembly
#
# DOWNLOAD SEQUENCE
ssh kkstore01
mkdir /cluster/store9/dasNov1
cd /cluster/data
ln -s /cluster/store9/dasNov1 dasNov1
cd /cluster/data/dasNov1
mkdir jkStuff bed
mkdir broad
cd broad
# ftp'ed with password from Broad
# -rw-rw-r-- 1 braney protein 69920541 May 20 07:52 assembly.agp
# -rw-rw-r-- 1 braney protein 745077193 May 20 08:15 assembly.bases.gz
# -rw-rw-r-- 1 braney protein 1222132418 May 20 09:08 assembly.quals.gz
# -rw-rw-r-- 1 braney protein 423586027 May 20 09:52 unplaced.fasta.gz
# -rw-rw-r-- 1 braney protein 1294098192 May 20 10:38 unplaced.qual.gz
gunzip assembly.bases.gz
faSize assembly.bases
# 2146362222 bases (0 N's 2146362222 real 2146362222 upper 0 lower) in 848432 sequences in 1 files
# Total size: mean 2529.8 sd 1972.8 min 61 (contig_80213) max 194887 (contig_17) median 2053
# N count: mean 0.0 sd 0.0
# U count: mean 2529.8 sd 1972.8
# L count: mean 0.0 sd 0.0
ssh kolossus
cd /cluster/data/dasNov1/broad
/cluster/bin/x86_64/agpAllToFaFile assembly.agp assembly.bases ../dasNov1.fa
cd ..
faSize dasNov1.fa
# 3856777364 bases (1710415142 N's 2146362222 real 2146362222 upper 0 lower) in 304391 sequences in 1 files
# Total size: mean 12670.5 sd 24184.9 min 1000 (scaffold_304389) max 974481 (scaffold_0) median 4960
# N count: mean 5619.1 sd 15102.0
# U count: mean 7051.3 sd 10915.0
# L count: mean 0.0 sd 0.0
/cluster/bin/scripts/agpToLift < assembly.agp > ../jkStuff/assembly.lft
# PARTITION SCAFFOLDS FOR REPEATMASKER RUN
# glom the tiny scaffolds up into ~50k collections
ssh kkstore01
cd /cluster/data/dasNov1
mkdir chunks50k
zcat broad/assembly.bases.gz | faSplit about stdin 50000 chunks50k/chunk_
cd chunks50k
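# bucket the chunk files into ten subdirectories by the last digit of
# the chunk name, so no single directory holds all ~41,000 files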
for i in 0 1 2 3 4 5 6 7 8 9; do mkdir $i; mv *$i.fa $i; done
# RUN REPEAT MASKER
# make the run directory, output directory, and job list
ssh kkstore01
cd /cluster/data/dasNov1
cat << '_EOF_' > jkStuff/RMArmadillo
#!/bin/csh -fe
cd $1
/bin/mkdir -p /tmp/dasNov1/$2
/bin/cp $3 /tmp/dasNov1/$2/
pushd /tmp/dasNov1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -species eutheria $2
popd
/bin/cp /tmp/dasNov1/$2/$2.out ./
/bin/rm -fr /tmp/dasNov1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/dasNov1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/dasNov1
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMArmadillo
# mkdir RMRun RMOut
# for i in chunks800k/*.fa
# do
# d="/cluster/data/dasNov1"
# c=`basename $i`
# echo "../jkStuff/RMArmadillo $d/RMOut $c { check in line+ $d/$i} {check out line+ $d/RMOut/$c.out}"
# done > RMRun/RMJobs
mkdir RMRun1 RMOut1
cd RMOut1
mkdir 0 1 2 3 4 5 6 7 8 9
cd ../chunks50k
for i in */*.fa
do
e=`dirname $i`
d="/cluster/data/dasNov1"
c=`basename $i`
echo "../jkStuff/RMArmadillo $d/RMOut1/$e $c {check in line+ $d/chunks50k/$i} {check out line+ $d/RMOut1/$e/$c.out}"
done > ../RMRun1/RMJobs
# do the run
ssh kk9
cd /cluster/data/dasNov1/RMRun1
para create RMJobs
para try, check, push, check,...
# Completed: 41361 of 41361 jobs
# CPU time in finished jobs: 52130147s 868835.79m 14480.60h 603.36d 1.653 y
# IO & Wait Time: 224553s 3742.55m 62.38h 2.60d 0.007 y
# Average job time: 1266s 21.10m 0.35h 0.01d
# Longest finished job: 6884s 114.73m 1.91h 0.08d
# Submission to last job: 150113s 2501.88m 41.70h 1.74d
# Lift up the split-scaffold .out's to scaffold .out's
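# (jkStuff/assembly.lft, built above from assembly.agp with agpToLift,
# maps contig coordinates back to their parent scaffolds.)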
ssh kkstore01
cd /cluster/data/dasNov1/RMOut1
for i in 0 1 2 3 4 5 6 7 8 9; do echo $i; liftUp -nohead $i.out ../jkStuff/assembly.lft warn $i/*.fa.out > /dev/null; done
head -3 0.out > dasNov1.out
for i in 0 1 2 3 4 5 6 7 8 9; do tail +4 $i.out; done >> dasNov1.out
# Load the .out files into the database with:
ssh hgwdev
hgLoadOut dasNov1 /cluster/data/dasNov1/RMOut1/dasNov1.out
# Loading up table dasNov1_rmsk
# note: 24 records dropped due to repStart > repEnd
hgsql dasNov1 -e 'rename table dasNov1_rmsk to rmsk'
# Fix up the indices too:
hgsql dasNov1 -e 'drop index bin on rmsk; \
drop index genoStart on rmsk; \
drop index genoEnd on rmsk; \
create index bin on rmsk (genoName(16), bin); \
create index genoStart on rmsk (genoName(16), genoStart); \
create index genoEnd on rmsk (genoName(16), genoEnd);'
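# (The rebuilt indices lead with genoName because rmsk is loaded here
# as a single unsplit table covering 300k+ scaffolds, so every lookup
# is qualified by scaffold name.)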
# EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 11/5/04 angie)
ssh kkstore01
mkdir /cluster/data/dasNov1/bed/fakeAgp
cd /cluster/data/dasNov1/bed/fakeAgp
    faGapSizes ../../dasNov1.fa \
-niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000
# Wow, all blocks of N's seem to be exactly 100bp long.
# hgFakeAgp's default -minContigGap of 25 will be fine.
hgFakeAgp ../../dasNov1.fa fake.agp
ssh hgwdev
hgLoadGap -unsplit dasNov1 /cluster/data/dasNov1/bed/fakeAgp/fake.agp
# SIMPLE REPEATS (TRF)
ssh kkstore01
mkdir /cluster/data/dasNov1/bed/simpleRepeat
cd /cluster/data/dasNov1/bed/simpleRepeat
nice trfBig -trf=/cluster/bin/i386/trf ../../dasNov1.fa /dev/null -bedAt=simpleRepeat.bed -tempDir=/tmp \
| egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' > trf.log &
# check on this with
tail -f trf.log
# Load this into the database as so
ssh hgwdev
hgLoadBed dasNov1 simpleRepeat /cluster/data/dasNov1/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# FILTER SIMPLE REPEATS (TRF) INTO MASK
# make a filtered version of the trf output:
# keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/dasNov1/bed/simpleRepeat
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
# MASK FA USING REPEATMASKER AND FILTERED TRF FILES
ssh kolossus
cd /cluster/data/dasNov1
nice /cluster/bin/x86_64/maskOutFa -soft dasNov1.fa bed/simpleRepeat/trfMask.bed dasNov1.simple.fa
nice /cluster/bin/x86_64/maskOutFa -softAdd dasNov1.simple.fa RMOut1/dasNov1.out dasNov1.masked.fa
# WARNING: negative rEnd: -104 scaffold_8757:558-754 L1M4
# WARNING: negative rEnd: -511 scaffold_34788:4357-4433 L1MCc
# WARNING: negative rEnd: -154 scaffold_34788:5061-5302 L1MCc
# WARNING: negative rEnd: -383 scaffold_119083:539-652 L1M4b
# WARNING: negative rEnd: -99 scaffold_181954:1627-2036 L1MCc
# WARNING: negative rEnd: -1033 scaffold_269:42483-42807 L1MEc
# WARNING: negative rEnd: -15 scaffold_18137:4036-4256 L1M4
# WARNING: negative rEnd: -301 scaffold_79493:4217-4508 L1MCc
# WARNING: negative rEnd: -542 scaffold_1800:103399-103624 L1M4b
# WARNING: negative rEnd: -594 scaffold_10038:2784-3058 L1MCc
# WARNING: negative rEnd: -916 scaffold_58908:40949-41369 L1MEc
# WARNING: negative rEnd: -247 scaffold_58908:41562-41685 L1MEc
# WARNING: negative rEnd: -99 scaffold_287090:3-415 L1MCc
# WARNING: negative rEnd: -162 scaffold_25:281259-281527 L1MCc
# WARNING: negative rEnd: -99 scaffold_45825:10398-10807 L1MCc
# WARNING: negative rEnd: -34 scaffold_47500:8156-8385 L1M4b
# WARNING: negative rEnd: -99 scaffold_67:113583-113881 L1MCc
mv dasNov1.fa dasNov1.unmasked.fa
# Now clean up the unmasked split scaffolds to avoid confusion later.
rm -r chunks50k
# CREATING DATABASE
# Create the database.
ssh hgwdev
# Make sure there is at least 5 gig free for the database
df -h /var/lib/mysql
# Filesystem            Size  Used Avail Use% Mounted on
# /dev/sdc1 1.8T 915G 746G 56% /var/lib/mysql
hgsql '' -e 'create database dasNov1'
# STORE SEQUENCE AND ASSEMBLY INFORMATION
# Translate to 2bit
ssh kolossus
cd /cluster/data/dasNov1
/cluster/bin/x86_64/faToTwoBit dasNov1.masked.fa dasNov1.2bit
# Make chromInfo.tab.
mkdir bed/chromInfo
twoBitInfo dasNov1.2bit stdout | awk '{printf "%s\t%s\t/gbdb/dasNov1/dasNov1.2bit\n",$1,$2;}' \
| sort > bed/chromInfo/chromInfo.tab
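# (the third chromInfo column points the browser at the 2bit file
# under /gbdb, symlinked below)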
# Make a symbolic link from /gbdb/dasNov1 to the 2bit.
ssh hgwdev
mkdir -p /gbdb/dasNov1
ln -s /cluster/data/dasNov1/dasNov1.2bit /gbdb/dasNov1/
# Load chromInfo table.
hgsql dasNov1 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql dasNov1 -e 'load data local infile "/cluster/data/dasNov1/bed/chromInfo/chromInfo.tab" into table chromInfo'
# Make chrom.sizes from chromInfo contents and check scaffold count.
hgsql dasNov1 -N -e 'select chrom,size from chromInfo' > /cluster/data/dasNov1/chrom.sizes
wc -l /cluster/data/dasNov1/chrom.sizes
# 304391 /cluster/data/dasNov1/chrom.sizes
# CREATING GRP TABLE FOR TRACK GROUPING
# Copy all the data from the table "grp"
# in an existing database to the new database
ssh hgwdev
hgsql dasNov1 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp'
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE
# Warning: genome and organism fields must correspond
# with defaultDb values
echo 'INSERT INTO genomeClade (genome,clade,priority) values ("Armadillo", "vertebrate",56);' \
| hgsql -h genome-testdb hgcentraltest
echo 'INSERT INTO dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) values ("dasNov1", "May. 2005", "/gbdb/dasNov1", "Armadillo", \
"", 1, 57, "Armadillo", "Dasypus novemcinctus", "/gbdb/dasNov1/html/description.html", 0, 0, "Broad May 2005");'\
| hgsql -h genome-testdb hgcentraltest
echo 'INSERT INTO defaultDb (genome, name) values ("Armadillo", "dasNov1");' \
| hgsql -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit trackDb/makefile to add dasNov1 to the DBS variable.
mkdir armadillo/dasNov1
# Create a simple armadillo/dasNov1/description.html file.
cvs add armadillo/dasNov1
cvs add armadillo/dasNov1/description.html
make update DBS=dasNov1 ZOO_DBS=
# go public on genome-test
cvs ci makefile
cvs ci armadillo/dasNov1
mkdir /gbdb/dasNov1/html
# in a clean, updated tree's kent/src/hg/makeDb/trackDb:
make alpha
# PUT SEQUENCE ON /ISCRATCH FOR BLASTZ (DONE 11/3/04 angie)
# First, agglomerate small scaffolds into chunks of ~100k median
# (many scaffolds are larger than that) so we don't have too many
# files for one dir, but keep a reasonably low job run time:
ssh kkstore01
cd /cluster/data/dasNov1
mkdir chunksUnsplit
faSplit about dasNov1.masked.fa 100000 chunksUnsplit/chunk_
ssh kkr1u00
mkdir /iscratch/i/dasNov1
cp -pR /cluster/data/dasNov1/chunksUnsplit /iscratch/i/dasNov1/
cp -p /cluster/data/dasNov1/dasNov1.2bit /iscratch/i/dasNov1/
iSync
# PRODUCING GENSCAN PREDICTIONS (DONE 11/4/04 angie)
ssh kkstore01
# Make hard-masked scaffolds and split up for processing:
cd /cluster/data/dasNov1
maskOutFa dasNov1.masked.fa hard dasNov1.hardmasked.fa
mkdir chunksUnsplitMasked
faSplit about dasNov1.hardmasked.fa 100000 chunksUnsplitMasked/chunk_
mkdir /cluster/data/dasNov1/bed/genscan
cd /cluster/data/dasNov1/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
ls -1S ../../chunksUnsplitMasked/chunk*.fa > chunks.list
cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 chunks.list single gsub jobList
ssh kki
cd /cluster/data/dasNov1/bed/genscan
para create jobList
para try, check, push, check, ...
#Completed: 463 of 463 jobs
#Average job time: 12s 0.21m 0.00h 0.00d
#Longest job: 317s 5.28m 0.09h 0.00d
#Submission to last job: 445s 7.42m 0.12h 0.01d
# If there are crashes, diagnose with "para problems".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
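# For example, a sketch of such a manual re-run (chunk_042 is a
# hypothetical name -- substitute the chunk that "para problems"
# reports; this was not part of the original run):
# gsBig ../../chunksUnsplitMasked/chunk_042.fa gtf/chunk_042.gtf \
#   -trans=pep/chunk_042.pep -subopt=subopt/chunk_042.bed \
#   -exe=hg3rdParty/genscanlinux/genscan \
#   -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000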
# Concatenate scaffold-level results:
ssh kkstore01
cd /cluster/data/dasNov1/bed/genscan
cat gtf/*.gtf > genscan.gtf
cat subopt/*.bed > genscanSubopt.bed
cat pep/*.pep > genscan.pep
# Clean up
rm -r /cluster/data/dasNov1/chunksUnsplitMasked
# Load into the database as so:
ssh hgwdev
cd /cluster/data/dasNov1/bed/genscan
# Reloaded without -genePredExt 1/6/05:
ldHgGene -gtf dasNov1 genscan genscan.gtf
hgPepPred dasNov1 generic genscanPep genscan.pep
hgLoadBed dasNov1 genscanSubopt genscanSubopt.bed
# MAKE DOWNLOADABLE FILES (DONE 11/4/04 angie)
# RECREATE BIGZIPS DOWNLOADS AND README FILE (DONE, 2006-05-05, hartera)
# ADDED md5sum.txt FILE FOR LIFTOVER DOWNLOADS AND CREATED CORRECT md5sum.txt
# FOR vsMm7 AND vsHg18 DOWNLOADS (DONE, 2006-05-23, hartera)
ssh kkstore01
mkdir /cluster/data/dasNov1/zips
cd /cluster/data/dasNov1
zip -j zips/scaffoldOut.zip RMOut/scaffolds.fa.out
zip -j zips/scaffoldFa.zip scaffolds.fa
zip -j zips/scaffoldFaMasked.zip scaffolds.fa.masked
zip -j zips/scaffoldTrf.zip bed/simpleRepeat/trfMask.bed
foreach f (zips/*.zip)
echo $f
unzip -t $f | tail -1
end
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dasNov1
cd /usr/local/apache/htdocs/goldenPath/dasNov1
mkdir bigZips database
# Create README.txt files in bigZips/ and database/ to explain the files.
cd bigZips
cp -p /cluster/data/dasNov1/zips/*.zip .
md5sum *.zip > md5sum.txt
# Add more bigZips downloads. Some of the above downloads don't exist
# anymore in bigZips in /usr/local/apache/htdocs/goldenPath/... on hgwdev
# (2006-05-05, hartera)
ssh kkstore04
mkdir /cluster/data/dasNov1/bigZips
cd /cluster/data/dasNov1
# soft-masked scaffolds sequences
cp -p dasNov1.masked.fa.gz ./bigZips/scaffoldSoftMask.fa.gz
# assembly agp
cp -p ./broad/assembly.agp.gz ./bigZips/
# Simple Repeats
cp -p ./bed/simpleRepeat/trfMask.bed ./bigZips/trf.bed
# RepeatMasker output
cp -p ./RMOut1/dasNov1.out ./bigZips/rmsk.out
# unmasked scaffolds sequences
cp -p dasNov1.unmasked.fa.gz ./bigZips/scaffold.fa.gz
cd bigZips
gzip trf.bed
gzip rmsk.out
# check integrity of files
foreach f (*.gz)
echo $f
gunzip -t $f && echo OK
end
md5sum *.gz > md5sum.txt
# link the *.gz and *.txt files to hgwdev:/usr/local/apache/....
ssh hgwdev
set gp=/usr/local/apache/htdocs/goldenPath/dasNov1
rm -r $gp/bigZips
mkdir -p $gp/bigZips
ln -s /cluster/data/dasNov1/bigZips/{*.gz,*.txt} $gp/bigZips
# copy over README.txt and edit for bigZips
# dasNov1ToMm7.over.chain.gz is in the md5sum.txt in the
# vsMm7 directory. It should be in the liftOver directory so move it there
# and re-make md5sum.txt for vsMm7 dir.
# Add liftOver md5sum.txt (hartera, 2006-05-23)
ssh hgwdev
set gp=/usr/local/apache/htdocs/goldenPath/dasNov1
# copy over from hgwdevold
cd /cluster/data/dasNov1/bed/liftOver
scp hgwdevold:$gp/liftOver/dasNov1ToMm7.over.chain.gz .
ln -s /cluster/data/dasNov1/bed/liftOver/*.gz $gp/liftOver
cd $gp/liftOver
# create md5sum.txt
md5sum *.gz > md5sum.txt
# recreate md5sum.txt for vsMm7 files
cd $gp/vsMm7
rm md5sum.txt
md5sum *.gz > md5sum.txt
# recreate md5sum.txt for vsHg18 files as this includes files in an
# axtNet directory that is not in this directory.
cd $gp/vsHg18
rm md5sum.txt
md5sum *.gz > md5sum.txt
# Check that README.txt is correct for vsMm7 and vsHg18.
# SWAP DM1-DROANA1 BLASTZ (DONE 11/4/04 angie)
ssh kkstore01
mkdir /cluster/data/dasNov1/bed/blastz.dm1.swap.2004-11-03
ln -s blastz.dm1.swap.2004-11-03 /cluster/data/dasNov1/bed/blastz.dm1
cd /cluster/data/dasNov1/bed/blastz.dm1
set aliDir = /cluster/data/dm1/bed/blastz.dasNov1
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
# With 11k scaffolds, we don't want a directory with one file per
# scaffold. So just make one .axt with everything -- not too huge
# anyway, given these little insect genomes.
cat $aliDir/axtChrom/chr*.axt \
| axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
| axtSort stdin dm1.axt
du -sh $aliDir/axtChrom dm1.axt
#389M /cluster/data/dm1/bed/blastz.dasNov1/axtChrom
#389M dm1.axt
# CHAIN MELANOGASTER BLASTZ (DONE 11/4/04 angie)
# Run axtChain on kolossus (one big dm1.axt input)
ssh kolossus
mkdir /cluster/data/dasNov1/bed/blastz.dm1/axtChain
cd /cluster/data/dasNov1/bed/blastz.dm1/axtChain
axtChain -verbose=0 ../dm1.axt /cluster/data/dasNov1/dasNov1.2bit \
/cluster/data/dm1/nib stdout \
| chainAntiRepeat /cluster/data/dasNov1/dasNov1.2bit \
/cluster/data/dm1/nib stdin stdout \
| chainMergeSort stdin > all.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/dasNov1/bed/blastz.dm1/axtChain
hgLoadChain -tIndex dasNov1 chainDm1 all.chain
# NET MELANOGASTER BLASTZ (DONE 11/4/04 angie)
ssh kkstore01
cd /cluster/data/dasNov1/bed/blastz.dm1/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/dasNov1/bed/blastz.dm1/axtChain
netClass -noAr noClass.net dasNov1 dm1 melanogaster.net \
|& g -v "table gap doesn't exist"
# Make a 'syntenic' subset:
ssh kkstore01
cd /cluster/data/dasNov1/bed/blastz.dm1/axtChain
rm noClass.net
netFilter -syn melanogaster.net > melanogasterSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/dasNov1/bed/blastz.dm1/axtChain
netFilter -minGap=10 melanogaster.net | hgLoadNet dasNov1 netDm1 stdin
netFilter -minGap=10 melanogasterSyn.net \
| hgLoadNet dasNov1 netSyntenyDm1 stdin
# MAKE AXTNET (DONE 11/4/04 angie)
ssh kkstore01
cd /cluster/data/dasNov1/bed/blastz.dm1/axtChain
netToAxt melanogaster.net all.chain /cluster/data/dasNov1/dasNov1.2bit \
/cluster/data/dm1/nib stdout \
| axtSort stdin melanogasterNet.axt
# MAKE VSDM1 DOWNLOADABLES (DONE 11/4/04 angie)
ssh kkstore01
cd /cluster/data/dasNov1/bed/blastz.dm1/axtChain
nice gzip *.{chain,net,axt}
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dasNov1/vsDm1
cd /usr/local/apache/htdocs/goldenPath/dasNov1/vsDm1
cp -p /cluster/data/dasNov1/bed/blastz.dm1/axtChain/all.chain.gz \
melanogaster.chain.gz
cp -p /cluster/data/dasNov1/bed/blastz.dm1/axtChain/melanogaster.net.gz .
cp -p /cluster/data/dasNov1/bed/blastz.dm1/axtChain/melanogasterNet.axt.gz .
# Make a README.txt which explains the files & formats.
md5sum *.gz */*.gz > md5sum.txt
# MAKE 11.OOC FILE FOR BLAT (DONE 11/4/04 angie)
# Use -repMatch=100 (based on size -- for human we use 1024, and
# fly size is ~4.4% of human judging by gapless dm1 genome size from
# featureBits -- we would use 45, but bump that up a bit to be more
# conservative).
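# a quick check of the scaling arithmetic above (a sketch, not part of
# the original run):
awk 'BEGIN{printf "%.0f\n", 1024 * 0.044}'
# 45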
ssh kkr1u00
mkdir /cluster/bluearc/dasNov1
blat /cluster/data/dasNov1/dasNov1.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/dasNov1/11.ooc -repMatch=100
#Wrote 9721 overused 11-mers to /cluster/bluearc/dasNov1/11.ooc
cp -p /cluster/bluearc/dasNov1/*.ooc /iscratch/i/dasNov1/
iSync
# GET GENBANK mRNA AND EST COUNTS (DONE 11/8/04 angie)
# Go to the latest GenBank full release dir and get an idea of how
# many mRNAs and ESTs there are to align.
ssh eieio
cd /cluster/data/genbank/data/processed/genbank.144.0/full
awk '$4 == "Drosophila" {print $4 " " $5;}' mrna.gbidx | sort | uniq -c
# 9 Drosophila ananassae
# 1 Drosophila mojavensis
# 33 Drosophila virilis
# Wow, questionable whether we should have a native mRNA track here.
awk '$4 == "Drosophila" {print $4 " " $5;}' est*.gbidx | sort | uniq -c
# 382439 Drosophila melanogaster
# 4105 Drosophila simulans
# 779 Drosophila yakuba
# And a native EST track isn't even a possibility for the new flies
# at this point!
# AUTO UPDATE GENBANK MRNA RUN (DONE 11/16/04 angie)
ssh hgwdev
# Update genbank config and source in CVS:
cd ~/kent/src/hg/makeDb/genbank
cvsup .
# Edit etc/genbank.conf and add these lines (note scaffold-browser settings):
# dasNov1 (Dasypus novemcinctus, nine-banded armadillo)
dasNov1.genome = /iscratch/i/dasNov1/dasNov1.2bit
dasNov1.mondoTwoBitParts = 1000
dasNov1.lift = no
dasNov1.refseq.mrna.native.load = no
dasNov1.refseq.mrna.xeno.load = yes
dasNov1.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
dasNov1.genbank.mrna.xeno.load = yes
# GenBank has essentially no armadillo ESTs at this point... that may change.
dasNov1.genbank.est.native.load = no
dasNov1.genbank.est.xeno.load = no
dasNov1.downloadDir = dasNov1
dasNov1.perChromTables = no
cvs ci etc/genbank.conf
# Since Dasypus novemcinctus is a new species for us, edit src/lib/gbGenome.c.
# Pick some other browser species, & monkey-see monkey-do.
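# (Concretely: gbGenome.c keeps a scientific-name array per organism
# and a dbToSpecies table keyed by database prefix, so this amounts to
# adding a "dasNov" entry -- copy the format from a neighboring
# species.)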
cvs diff src/lib/gbGenome.c
make
cvs ci src/lib/gbGenome.c
# Edit src/align/gbBlat to add /iscratch/i/dasNov1/11.ooc
cvs diff src/align/gbBlat
make
cvs ci src/align/gbBlat
# Install to /cluster/data/genbank:
make install-server
ssh `fileServer /cluster/data/genbank/`
cd /cluster/data/genbank
# This is an -initial run, (xeno) RefSeq only:
nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial dasNov1 &
tail -f [its logfile]
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dasNov1
featureBits dasNov1 xenoRefGene
#16520385 bases of 165766797 (9.966%) in intersection
# Clean up:
rm -rf work/initial.dasNov1
# This is an -initial run, mRNA only:
nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial dasNov1 &
tail -f [its logfile]
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dasNov1
featureBits dasNov1 all_mrna
#19602 bases of 165766797 (0.012%) in intersection
featureBits dasNov1 xenoMrna
#17295487 bases of 165766797 (10.434%) in intersection
# Clean up:
rm -rf work/initial.dasNov1
# MAKE GCPERCENT (DONE 11/4/04 angie)
ssh hgwdev
mkdir /cluster/data/dasNov1/bed/gcPercent
cd /cluster/data/dasNov1/bed/gcPercent
# create and load gcPercent table
hgGcPercent dasNov1 /cluster/data/dasNov1
# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 7-20-05 braney)
ssh hgwdev
echo 'insert into blatServers values("dasNov1", "blat2", "17780", 1, 0); \
insert into blatServers values("dasNov1", "blat2", "17781", 0, 1);' \
| hgsql -h genome-testdb hgcentraltest
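# (blatServers columns are db, host, port, isTrans, canPcr: the 17780
# entry is the translated (protein) server, 17781 the untranslated
# server used for DNA searches and PCR.)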
# MAKE Drosophila Proteins track (DONE braney 11/17/04)
ssh kkstore01
mkdir -p /cluster/data/dasNov1/blastDb
cd /cluster/data/dasNov1/blastDb
faSplit sequence ../dasNov1.masked.fa 400 x
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
rm *.fa *.log
ssh kkr1u00
mkdir -p /iscratch/i/dasNov1/blastDb
cp /cluster/data/dasNov1/blastDb/* /iscratch/i/dasNov1/blastDb
(iSync) 2>&1 > sync.out
mkdir -p /cluster/data/dasNov1/bed/tblastn.dm1FB
cd /cluster/data/dasNov1/bed/tblastn.dm1FB
ls -1S /iscratch/i/dasNov1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst
exit
# back to kkstore01
cd /cluster/data/dasNov1/bed/tblastn.dm1FB
mkdir fbfa
# calculate a reasonable number of query lines per job: with 396
# database pieces, 49-line query files give 396 * ceil(18735/49) =
# 151668 cluster jobs, close to the ~150000-job target
calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\)
# 18735/(150000/396) = 49.460400
split -l 49 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
cd fbfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S fbfa/*.fa > fb.lst
mkdir blastOut
for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
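# try successively stricter e-values: if blastall fails at one
# threshold, retry at the next one down (a smaller e-value yields
# fewer hits and a smaller output/memory footprint)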
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.2
mv $3.tmp $3
rm -f $f.1 $f.2
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp
exit 1
'_EOF_'
chmod +x blastSome
gensub2 bug.lst fb.lst blastGsub blastSpec
ssh kk
cd /cluster/data/dasNov1/bed/tblastn.dm1FB
para create blastSpec
para try, push
# Completed: 151668 of 151668 jobs
# CPU time in finished jobs: 2932565s 48876.08m 814.60h 33.94d 0.093 y
# IO & Wait Time: 694006s 11566.77m 192.78h 8.03d 0.022 y
# Average job time: 24s 0.40m 0.01h 0.00d
# Longest job: 2721s 45.35m 0.76h 0.03d
# Submission to last job: 73860s 1231.00m 20.52h 0.85d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/fb?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
para create chainSpec
# best run on the mini-cluster, or with the shove script, so that only
# 3 or 4 jobs start at once (see the throttling sketch after the
# timing summary below)
para try, push...
# Completed: 383 of 383 jobs
# CPU time in finished jobs: 327s 5.44m 0.09h 0.00d 0.000 y
# IO & Wait Time: 8218s 136.97m 2.28h 0.10d 0.000 y
# Average job time: 22s 0.37m 0.01h 0.00d
# Longest job: 54s 0.90m 0.01h 0.00d
# Submission to last job: 674s 11.23m 0.19h 0.01d
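# A sketch of one way to throttle to ~4 concurrent jobs without
# parasol (assumes GNU xargs; not run here):
# ls -1d `pwd`/blastOut/fb?? | xargs -n 1 -P 4 ./chainSome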
exit
# back to kkstore01
cd /cluster/data/dasNov1/bed/tblastn.dm1FB/blastOut
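# psl fields used below: $1=matches, $11=qSize, $12=qStart, $13=qEnd.
# c60 keeps alignments spanning >60% of the query protein; m60 further
# requires >60% of the query bases to match; after the descending
# numeric sort, pslUniq keeps the first (highest match count) line
# per query.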
for i in fb??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/dasNov1/bed/tblastn.dm1FB/blastDm1FB.psl
ssh hgwdev
cd /cluster/data/dasNov1/bed/tblastn.dm1FB
hgLoadPsl dasNov1 blastDm1FB.psl
# End tblastn
# SWAP CHAINS FROM DM2, BUILD NETS ETC. (DONE 3/2/05 angie)
ssh kkstore01
mkdir /cluster/data/dasNov1/bed/blastz.dm2.swap
cd /cluster/data/dasNov1/bed/blastz.dm2.swap
doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.dasNov1/DEF \
>& do.log &
tail -f do.log
# Add {chain,net}Dm2 to trackDb.ra if necessary.
# Add /usr/local/apache/htdocs/goldenPath/dasNov1/vsDm2/README.txt
# CHAIN AND NET SWAPPED HUMAN BLASTZ (WORKING 12/22/05 kate)
# Working in Brian's blastz dir (not otherwise documented).
# This procedure follows the conventions of doBlastzChainNet.pl,
# so that script can be used to complete the processing
ssh kki
cd /cluster/data/dasNov1/bed/zb.hg17
ln -s `pwd` /cluster/data/hg17/bed/blastz.dasNov1
mkdir -p axtChain/run/chain
ls -1S /cluster/data/dasNov1/bed/zb.hg17/axtChrom/*.axt.gz | \
sed 's/.gz//' > axtChain/run/input.lst
cd axtChain/run
cat > chain.csh << 'EOF'
#!/bin/csh -ef
zcat $1 | \
axtChain -verbose=0 -minScore=3000 -linearGap=medium stdin \
/cluster/data/hg17/nib /cluster/data/dasNov1/dasNov1.2bit stdout | \
chainAntiRepeat /cluster/data/hg17/nib /cluster/data/dasNov1/dasNov1.2bit \
stdin $2
'EOF'
# << happy emacs
cat > gsub << 'EOF'
#LOOP
chain.csh {check in exists $(path1).gz} {check out line+ chain/$(root1).chain}
#ENDLOOP
'EOF'
# << happy emacs
chmod a+x chain.csh
gensub2 input.lst single gsub jobList
para create jobList; para try
para check
para push
# wait for completion
para time > run.time
# finish pipeline
cd /cluster/data/dasNov1/bed/zb.hg17
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-continue=chainMerge \
-bigClusterHub=pk \
`pwd`/DEF >& blastz.out &
############################################################################
## BLASTZ swap from mm9 alignments (2007-11-11 - markd)
ssh hgwdev
mkdir /cluster/data/dasNov1/bed/blastz.mm9.swap
cd /cluster/data/dasNov1/bed/blastz.mm9.swap
ln -s blastz.mm9.swap ../blastz.mm9
/cluster/bin/scripts/doBlastzChainNet.pl \
-swap /cluster/data/mm9/bed/blastz.dasNov1/DEF >& swap.out&
nice +19 featureBits dasNov1 chainMm9Link
# 438318232 bases of 2146362222 (20.421%) in intersection
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built. Tracks are created and
loaded by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built. Tracks are created and
loaded by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# dasNov1 - Armadillo - Ensembl Genes version 51 (DONE - 2008-12-03 - hiram)
ssh kkr14u07
cd /hive/data/genomes/dasNov1
cat << '_EOF_' > dasNov1.ensGene.ra
# required db variable
db dasNov1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
# names that are not in the UCSC assembly
skipInvalid yes
# ignore the three genes that have invalid structures from Ensembl:
# 18192: ENSDNOT00000004471 no exonFrame on CDS exon 7
# 35247: ENSDNOT00000007696 no exonFrame on CDS exon 8
# 38952: ENSDNOT00000019234 no exonFrame on CDS exon 0
'_EOF_'
# << happy emacs
doEnsGeneUpdate.pl -ensVersion=51 dasNov1.ensGene.ra
ssh hgwdev
cd /hive/data/genomes/dasNov1/bed/ensGene.51
featureBits dasNov1 ensGene
# 22658142 bases of 2146362222 (1.056%) in intersection
# *** All done! (through the 'makeDoc' step)
# *** Steps were performed in /hive/data/genomes/dasNov1/bed/ensGene.51
############################################################################
+############################################################################
+# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
+
+vertebrate-wide transMap alignments were built. Tracks are created and
+loaded by a single Makefile, which is available from:
+ svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
+
+see doc/builds.txt for specific details.
+############################################################################