src/hg/makeDb/doc/taeGut1.txt 1.17

1.17 2009/06/02 23:50:26 markd
rebuild incorrect data that was submitted
Index: src/hg/makeDb/doc/taeGut1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/taeGut1.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 1000000 -r1.16 -r1.17
--- src/hg/makeDb/doc/taeGut1.txt	18 Apr 2009 05:13:37 -0000	1.16
+++ src/hg/makeDb/doc/taeGut1.txt	2 Jun 2009 23:50:26 -0000	1.17
@@ -1,710 +1,715 @@
 # for emacs: -*- mode: sh; -*-
 
 # zebra finch ( Taeniopygia guttata )
 
 #########################################################################
 # DOWNLOAD SEQUENCE (DONE braney 2008-08-05)
     ssh kkstore03
     mkdir /cluster/store7/taeGut1
     ln -s /cluster/store7/taeGut1 /cluster/data
     mkdir /cluster/data/taeGut1/fromWustl
     cd /cluster/data/taeGut1/fromWustl
 
     wget "http://genome.wustl.edu/pub/user/lhillier/private/T_guttata/Taeniopygia_guttata-3.2.4.tar.gz"
 
     gunzip *
     mkdir tmp
     cd tmp
     tar xvf ../*.tar
     cat *.fa | sed 's/Chr/chr/' | sed 's/LG/chrLG/' |  gzip -c > ../assembly.bases.gz
     cat *.agp | sed 's/Chr/chr/' | sed 's/LG/chrLG/' > ../assembly.agp
     # some scaffolds missing quality values
 XXX - this is broken.  These *qual files are already in chrom coordinates,
 XXX - they do not need a lift.  The lift had the effect of applying -mScore=98
 XXX - to all values.  --Hiram 2008-12-04
     cat *qual | sed 's/Chr/chr/' | sed 's/LG/chrLG/'  | qaToQac stdin  stdout | qacAgpLift -mScore=98 ../assembly.agp stdin ../taeGut1.qual.qac
     cd ..
 
    cut -f 1 assembly.agp | uniq -c | wc -l 
    # Number of scaffolds: 69
 
 #########################################################################
 # Create .ra file and run makeGenomeDb.pl (DONE braney 2008-08-05)
     ssh kkstore03
     cd /cluster/data/taeGut1
 cat << _EOF_ >taeGut1.config.ra
 # Config parameters for makeGenomeDb.pl:
 db taeGut1
 clade vertebrate
 genomeCladePriority 68
 scientificName Taeniopygia guttata
 commonName Zebra finch
 assemblyDate Jul. 2008
 assemblyLabel Washington University taeGut1 
 orderKey 437
 mitoAcc NC_007897
 fastaFiles /cluster/data/taeGut1/fromWustl/assembly.bases.gz
 agpFiles /cluster/data/taeGut1/fromWustl/assembly.agp
 qualFiles /cluster/data/taeGut1/fromWustl/taeGut1.qual.qac
 dbDbSpeciesDir zebraFinch
 _EOF_
 
 # use 'screen' make sure on kkstore03
     makeGenomeDb.pl -verbose=2 taeGut1.config.ra > makeGenomeDb.out 2>&1 &
 
 # 'ctl-a ctl -d' returns to previous shell
 cut -f 2 chrom.sizes | ave stdin
 # Q1 432944.500000
 # median 2517995.000000
 # Q3 17897912.000000
 # average 17616947.728571
 # min 9909.000000
 # max 175225315.000000
 # count 70
 # total 1233186341.000000
 # standard deviation 35378770.640295
 
 
 #########################################################################
 # SIMPLE REPEATS TRF (DONE braney 2008-08-05)
     ssh kkstore03
     screen # use a screen to manage this job
     mkdir /cluster/data/taeGut1/bed/simpleRepeat
     cd /cluster/data/taeGut1/bed/simpleRepeat
     # 
     doSimpleRepeat.pl -buildDir=/cluster/data/taeGut1/bed/simpleRepeat \
 	taeGut1 > do.log 2>&1 &
 
     #### When done
     ssh pk
     para time
 
     featureBits taeGut1 simpleRepeat
     # 25363702 bases of 1222864691 (2.074%) in intersection
 
     # Link to it from /gbdb: 
     ln -s /cluster/data/taeGut1/taeGut1.2bit /gbdb/taeGut1/taeGut1.2bit
 
 #########################################################################
 # Run WindowMasker (DONE braney  2008-08-05)
     ssh kkstore03
     screen
     mkdir /cluster/data/taeGut1/bed/windowMasker
     cd /cluster/data/taeGut1/bed/windowMasker
     nice doWindowMasker.pl -workhorse=kolossus \
         -buildDir=/cluster/data/taeGut1/bed/windowMasker taeGut1 > wmRun.log 2>&1 &
 
     #	load this initial data to get ready to clean it
     ssh hgwdev
     cd /cluster/data/taeGut1/bed/windowMasker
     hgLoadBed taeGut1 windowmaskerSdust windowmasker.sdust.bed.gz
     # Loaded 7487462 elements of size 3
 
     #	eliminate the gaps from the masking
     featureBits taeGut1 -not gap -bed=notGap.bed
     featureBits taeGut1 windowmaskerSdust
 # 260083327 bases of 1222864691 (21.268%) in intersection
 
     time nice -n +19 featureBits taeGut1 windowmaskerSdust notGap.bed \
         -bed=stdout | gzip -c > cleanWMask.bed.gz
     # 249761677 bases of 1222864691 (20.424%) in intersection
 
     #	reload track to get it clean
     hgLoadBed taeGut1 windowmaskerSdust cleanWMask.bed.gz
     featureBits taeGut1 windowmaskerSdust
     # 249761677 bases of 1222864691 (20.424%) in intersection
 
     #	mask the sequence with this clean mask
     zcat cleanWMask.bed.gz \
 	| twoBitMask ../../taeGut1.unmasked.2bit stdin \
 	    -type=.bed taeGut1.cleanWMSdust.2bit
     twoBitToFa taeGut1.cleanWMSdust.2bit stdout | faSize stdin \
         > taeGut1.cleanWMSdust.faSize.txt
     cat taeGut1.cleanWMSdust.faSize.txt
 # 1233186341 bases (10321650 N's 1222864691 real 973103014 upper 249761677
 # lower) in 70 sequences in 1 files
 # Total size: mean 17616947.7 sd 35634216.3 min 9909 (chr16) max 175225315
 # (chrUn) median 2517995
 # N count: mean 147452.1 sd 400992.9
 # U count: mean 13901471.6 sd 27798925.9
 # L count: mean 3568024.0 sd 7705836.1
 # %20.25 masked total, %20.42 masked real
 
     cp taeGut1.cleanWMSdust.2bit /cluster/data/taeGut1/taeGut1.2bit
     ln -s /cluster/data/taeGut1/taeGut1.2bit /gbdb/taeGut1
 
 #########################################################################
 # Create OOC file for genbank runs (DONE braney 2008-08-05)
 # use same repMatch value as bosTau2
     ssh kkstore03
     cd /cluster/data/taeGut1
     blat taeGut1.2bit /dev/null /dev/null -tileSize=11 \
     	-makeOoc=taeGut1.11.1005.ooc -repMatch=1005
 # -rw-rw-r--  1 braney protein 6368 Aug  5 14:31 taeGut1.11.1005.ooc
 
     blat taeGut1.2bit /dev/null /dev/null -tileSize=11 \
 	-makeOoc=taeGut1.11.ooc -repMatch=1024
 # -rw-rw-r--  1 braney protein 6044 Jul 30 16:03 taeGut1.11.ooc
 
     mkdir -p /cluster/bluearc/scratch/data/taeGut1
     cp taeGut1.2bit taeGut1.11.ooc chrom.sizes /cluster/bluearc/scratch/data/taeGut1
     # request admins to sync these to pk
 
 #########################################################################
 ## Create a lift file for genbank work(DONE braney 2008-08-05)
     ssh kkstore03
     cd /cluster/data/taeGut1
     cat  fromWustl/assembly.agp |  /cluster/bin/scripts/agpToLift -revStrand > jkStuff/liftAll.lft
 
 #########################################################################
 # GENBANK AUTO UPDATE (DONE braney 2008-08-06) 
     # align with latest genbank process.
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # edit etc/genbank.conf to add taeGut1 
 
 # taeGut1
 taeGut1.serverGenome = /cluster/data/taeGut1/taeGut1.2bit
 taeGut1.clusterGenome = /scratch/data/taeGut1/taeGut1.2bit
 taeGut1.ooc = /scratch/data/taeGut1/11.ooc
 taeGut1.align.unplacedChroms = chrUn,chr*_random
 taeGut1.lift = /cluster/data/taeGut1/jkStuff/liftAll.lft
 taeGut1.refseq.mrna.native.pslCDnaFilter  =
 ${ordered.refseq.mrna.native.pslCDnaFilter}
 taeGut1.refseq.mrna.xeno.pslCDnaFilter    =
 ${ordered.refseq.mrna.xeno.pslCDnaFilter}
 taeGut1.genbank.mrna.native.pslCDnaFilter =
 ${ordered.genbank.mrna.native.pslCDnaFilter}
 taeGut1.genbank.mrna.xeno.pslCDnaFilter   =
 ${ordered.genbank.mrna.xeno.pslCDnaFilter}
 taeGut1.genbank.est.native.pslCDnaFilter  =
 ${ordered.genbank.est.native.pslCDnaFilter}
 taeGut1.refseq.mrna.native.load = yes
 taeGut1.refseq.mrna.xeno.load = yes
 taeGut1.genbank.mrna.xeno.load = yes
 taeGut1.downloadDir = taeGut1
 
     cvs ci -m "Added taeGut1." etc/genbank.conf
 
     # add taeGut to src/lib/gbGenome.c
     cvs ci -m "Added taeGut" src/lib/gbGenome.c
 
     # update /cluster/data/genbank/:
     make etc-update 
 
     ssh genbank
     cd /cluster/data/genbank
     time nice -n +19 bin/gbAlignStep -initial taeGut1 &
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad taeGut1
 
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add taeGut1 to:
         etc/align.dbs
         etc/hgwdev.dbs
     cvs ci -m "Added taeGut1 - zebra finch" etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
 ############################################################################
 ## Default position  (DONE braney 2008-08-06)
     ssh hgwdev
     hgsql -e 'update dbDb set defaultPos="chrZ:56,293,325-56,333,199"
 	where name="taeGut1";' hgcentraltest
 
 ############################################################################
 # MAKE DOWNLOADABLE FILES (restarted braney 2008-08-02)
     ssh kkstore03
     screen
     cd /cluster/data/taeGut1
     # edited makeDownloads.pl to take out repeat masker files
     # makeDownloads.pl taeGut1 > downloads.log 2>&1 &
     /cluster/bin/scripts/fripple taeGut1 > downloads.log 2>&1 &
 
 ############################################################################
 ## Adding a Photograph, from Art Arnold in email (restarted braney 2008-08-03)
     mkdir /cluster/data/taeGut1/photograph
     cd /cluster/data/taeGut1/photograph
     ls -l Taeniopygia_guttata.jpg
 # -rw-rw-r--  1 braney protein 16160 Aug  4 09:43 Taeniopygia_guttata.jpg
 
     #	check this .jpg file into the browser doc source tree
     cvs add -kb Taeniopygia_guttata.jpg
     cvs commit Taeniopygia_guttata.jpg
     cp -p Taeniopygia_guttata.jpg /usr/local/apache/htdocs/images
 
 ############################################################################
 ##########################################################################
 ## Creating pushQ (DONE - braney - 2008-08-06 )
     ssh hgwdev
     mkdir /cluster/data/taeGut1/pushQ
     cd /cluster/data/taeGut1/pushQ
     /cluster/bin/scripts/makePushQSql.pl taeGut1 > taeGut1.sql 2> stderr.out
     ## check the stderr.out for anything that needs to be fixed
     scp taeGut1.sql hgwbeta.cse.ucsc.edu:/tmp
     ssh hgwbeta
     cd /tmp
     hgsql qapushq < taeGut1.sql
 
 ############################################################################
 #  BLATSERVERS ENTRY (DONE - 2008-08-09 - braney)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("taeGut1", "blat2", "17782", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("taeGut1", "blat2", "17783", "0", "1");' \
 	    hgcentraltest
 
 
 ############################################################################
 # BAC ends    (working braney...)
 
     ssh kkstore03
     mkdir /cluster/data/taeGut1/bed/bacEnds
     cd /cluster/data/taeGut1/bed/bacEnds
     mkdir fromWustl
     cd fromWustl
     wget "http://genome.wustl.edu/pub/organism/Other_Vertebrates/Taeniopygia_guttata/end_sequences/bac_ends.fa.gz"
     gunzip *
 
     ssh hgwdev
     mkdir -p /gbdb/taeGut1/bacEnds
     ln -s `pwd`/bac_ends.fa /gbdb/taeGut1/bacEnds
     cd /tmp
     hgLoadSeq taeGut1 /gbdb/taeGut1/bacEnds/bac_ends.fa
 
     ssh kkstore03
     cd /cluster/data/taeGut1/bed/bacEnds
     rm -rf /san/sanvol1/scratch/taeGut1.splitEnds
     mkdir -p /san/sanvol1/scratch/taeGut1.splitEnds
     faSplit sequence fromWustl/bac_ends.fa 400 /san/sanvol1/scratch/taeGut1.splitEnds/x
 
     mkdir run.blat
     cd run.blat
     ls /san/sanvol1/scratch/taeGut1.splitEnds/*.fa > inSeq.lst
     rm -rf /cluster/bluearc/taeGut1.blatEnds
     mkdir -p /cluster/bluearc/taeGut1.blatEnds
 
     for i in `cat inSeq.lst`
     do
 	f=`basename $i .fa`.psl
 	echo /cluster/bin/x86_64/blat /scratch/data/taeGut1/taeGut1.2bit  \
 	  -ooc=/scratch/data/taeGut1/taeGut1.11.ooc -tileSize=11 \
 	    $i {check out line+ /cluster/bluearc/taeGut1.blatEnds/$f}
     done > jobList
 
     ssh memk
     cd /cluster/data/taeGut1/bed/bacEnds/run.blat
     para create jobList
     para time
 
 # CPU time in finished jobs:     124366s    2072.77m    34.55h    1.44d  0.004 y
 # IO & Wait Time:                   865s      14.41m     0.24h    0.01d  0.000 y
 # Average job time:                 383s       6.38m     0.11h    0.00d
 # Longest finished job:             557s       9.28m     0.15h    0.01d
 # Submission to last job:          4817s      80.28m     1.34h    0.06d
 
     ssh kkstore03
     cd /cluster/data/taeGut1/bed/bacEnds
     for i in /cluster/bluearc/taeGut1.blatEnds/*.psl; do tail +6 $i; done | pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons stdin bacEnds.psl /dev/null
 
     cat fromWustl/bac_ends.fa | awk '/>/ {print $1}'  | \
 	tr -d '>'| sed 's/\./ /' | \
     awk '{clones[$1] = $2 " " clones[$1] } \
 	END {for (i in clones) print i, clones[i]}' | \
     awk 'BEGIN {OFS="\t"} \
 	{first="";second=""; \ 
 	 for(ii=2; ii <= NF ; ii++) \
 	    {if (($ii == "b1") || ($ii == "b2")) \
 		first=$1 "." $ii "," first; \
 	     if (($ii == "g1") || ($ii == "g2")) \
 		second=$1 "." $ii "," second;} 
 	 print first,second,$1}' | \
     awk '{if (NF == 3) print}' > allPairs.txt
 
     time /cluster/bin/x86_64/pslPairs -tInsert=10000 \
       -minId=0.91 -noBin -min=25000 \
       -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
       -mismatch -verbose bacEnds.psl allPairs.txt \
         all_bacends bacEnds
 
     # Filter by score and sort by {chrom,chromStart}:
     awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
     cat bacEnds.{slop,short,long,mismatch,orphan} \
     | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
 
     #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
     awk '{print $5}' bacEndPairs.bed | sort -u
 
     /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
       bacEndPairsBad.bed \
     | sorttbl tname tstart | headchg -del \
     > bacEnds.load.psl
     wc -l bacEnds.*
 #   526720 bacEnds.load.psl
 #       65 bacEnds.long
 #     1412 bacEnds.mismatch
 #    33163 bacEnds.orphan
 #   111379 bacEnds.pairs
 #   625401 bacEnds.psl
 #      530 bacEnds.short
 #      312 bacEnds.slop
 #  1298982 total
 
     # load into database
     ssh hgwdev
     cd /cluster/data/taeGut1/bed/bacEnds
     hgLoadBed -notItemRgb taeGut1 bacEndPairs bacEndPairs.bed \
         -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
     # note - the "Bad" track isn't pushed to RR, just used for assembly QA.
     hgLoadBed -notItemRgb taeGut1 bacEndPairsBad bacEndPairsBad.bed \
       -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
     hgLoadPsl taeGut1 -table=all_bacends bacEnds.load.psl
 
     featureBits taeGut1 all_bacends
 # 154544748 bases of 1222864691 (12.638%) in intersection
     featureBits taeGut1 bacEndPairs
 # 118749166 bases of 1222864691 (9.711%) in intersection
     featureBits taeGut1 bacEndPairsBad
 # 21778830 bases of 1222864691 (1.781%) in intersection
 
 #########################################################################
 # PRODUCING GENSCAN PREDICTIONS (DONE 2008-08-29 braney)
     ssh hgwdev
     mkdir /cluster/data/taeGut1/bed/genscan
     cd /cluster/data/taeGut1/bed/genscan
     # Check out hg3rdParty/genscanlinux to get latest genscan:
     cvs co hg3rdParty/genscanlinux
     ssh swarm
     cd /cluster/data/taeGut1/bed/genscan
     # Make 3 subdirectories for genscan to put their output files in
     mkdir gtf pep subopt fasta
 
     for i in ../../goldenPath/chromosomes/*.gz
     do
 	echo $i
 	maskOutFa $i hard fasta/`basename $i .gz`
     done
 
     ls fasta/*.fa > genome.lst
     wc -l genome.lst
 # 70 genome.lst
 
     # Create template file, gsub, for gensub2.  For example (3-line file):
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     gensub2 genome.lst single gsub jobList
     para make jobList
     para time
 #Completed: 255 of 259 jobs
 #Crashed: 4 jobs
 #CPU time in finished jobs:      82258s    1370.97m    22.85h    0.95d  0.003 y
 #IO & Wait Time:                  1053s      17.55m     0.29h    0.01d  0.000 y
 #Average job time:                 327s       5.45m     0.09h    0.00d
 #Longest finished job:           20131s     335.52m     5.59h    0.23d
 #Submission to last job:         23171s     386.18m     6.44h    0.27d
 
     # two jobs crashed due to genscan running out of memory, re-run it 
     # with "-window=1200000" instead of "-window=2400000".
     para crashed | sed 's/2400000/1200000/' > crash.jobs
     para create -batch=crashBatch crash.jobs
     para push -batch=crashBatch
 
     # Convert these to chromosome level files as so:
     cat gtf/*.gtf > genscan.gtf
     cat subopt/*.bed > genscanSubopt.bed 
     cat pep/*.pep > genscan.pep
 
     # Load into the database as so:
     ssh hgwdev
     cd /cluster/data/taeGut1/bed/genscan
     ldHgGene taeGut1 genscan genscan.gtf
     hgPepPred taeGut1 generic genscanPep genscan.pep
     hgLoadBed taeGut1 genscanSubopt genscanSubopt.bed
 
     # cleanup
     rm -rf fasta
 
 #####################################################
 # build 2bit with contigs from random chroms and chrUn (reDONE braney 2008-09-06)
     cd /cluster/data/taeGut1
     fgrep 'random 
 Un' taeGut1.agp | awk '{if ($5 == "W") print $1, $2-1, $3, $6, $9}' \
     | while read chr start stop contig strand; 
     do 
 	echo ">$contig"; 
 	if test $strand == '+'
 	then
 	    twoBitToFa taeGut1.2bit:$chr:$start-$stop stdout \
 		| tail -n +2; 
 	else
 	    twoBitToFa taeGut1.2bit:$chr:$start-$stop stdout | \
 		faRc -keepCase stdin stdout | tail -n +2; 
 	fi
     done > unRandom.fa
 
     fgrep -v 'random 
 Un' chrom.sizes | awk '{print $1}' | \
     twoBitToFa taeGut1.2bit -seqList=stdin noUn.fa
     cat noUn.fa unRandom.fa | faToTwoBit stdin taeGut1.blastz.2bit
 
     twoBitInfo taeGut1.blastz.2bit taeGut1.blastz.sizes
 
 #########################################################################
 ## BLASTZ galGal3 swap (reDONE braney 2008-09-06)
     ##	the original blastz to galGal3 measured
     ssh swarm
     mkdir /cluster/data/taeGut1/bed/blastz.galGal3.swap
     cd /cluster/data/taeGut1/bed/blastz.galGal3.swap
     screen
     doBlastzChainNet.pl \
 	/cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-verbose=2 -bigClusterHub=swarm -smallCluster=swarm \
 	-workhorse=kolossus -swap > swap.log 2>&1 &
 
     ssh hgwdev
     cd /cluster/data/taeGut1/bed/blastz.galGal3.swap
     nice -n +19 featureBits taeGut1 chainGalGal3Link \
 	> fb.taeGut1l.chainGalGal3Link.txt 2>&1  
 # 802731012 bases of 1222864691 (65.643%) in intersection
 
 
 #########################################################################
 ## BLASTZ hg18 swap (reDONE braney 2008-09-10)
     ssh swarm
     mkdir /cluster/data/taeGut1/bed/blastz.hg18.swap
     cd /cluster/data/taeGut1/bed/blastz.hg18.swap
     screen
     doBlastzChainNet.pl \
 	/cluster/data/hg18/bed/blastz.taeGut1.2008-09-09/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-verbose=2 -bigClusterHub=swarm -smallCluster=swarm \
 	-workhorse=kolossus -swap > swap.log 2>&1 &
 
 
 ###########################################################################
 # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-09-07)
     # bash  if not using bash shell already
     ssh kolossus
     mkdir /cluster/data/taeGut1/blastDb
     cd /cluster/data/taeGut1
     awk '{if ($2 > 1000000) print $1}' taeGut1.blastz.sizes > 1meg.lst
     twoBitToFa -seqList=1meg.lst  taeGut1.blastz.2bit temp.fa
     faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
     rm temp.fa 1meg.lst
 
     awk '{if ($2 <= 1000000) print $1}' taeGut1.blastz.sizes > less1meg.lst
     twoBitToFa -seqList=less1meg.lst  taeGut1.blastz.2bit temp.fa
     faSplit about temp.fa 1000000 blastDb/y 
 
     cd blastDb
     for i in *.fa
     do
 	/hive/data/outside/blast229/formatdb -i $i -p F
     done
     rm *.fa
     ls *.nsq | wc -l
 # 1243
 
     mkdir -p /cluster/data/taeGut1/bed/tblastn.hg18KG
     cd /cluster/data/taeGut1/bed/tblastn.hg18KG
     echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
     wc -l query.lst
 
 # 1243 query.lst
 
    # we want around 150000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(150000/`wc query.lst | awk '{print $1}'`\)
 
 # 36727/(150000/1243) = 304.344407
 
    mkdir -p kgfa
    split -l 304 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
      done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/taeGut1/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/hive/data/outside/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
 	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/taeGut1/blastDb.lft carry $f.2
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
         if pslCheck -prot $3.tmp
         then                  
             mv $3.tmp $3     
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0               
     fi                      
 fi                         
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     exit 
     
     ssh swarm
     cd /cluster/data/taeGut1/bed/tblastn.hg18KG
     para create blastSpec
 #    para try, check, push, check etc.
 
     para time
 
 
 # Completed: 150403 of 150403 jobs
 # CPU time in finished jobs:    7194206s  119903.43m  1998.39h   83.27d  0.228 y
 # IO & Wait Time:                576761s    9612.69m   160.21h    6.68d  0.018 y
 # Average job time:                  52s       0.86m     0.01h    0.00d
 # Longest finished job:             120s       2.00m     0.03h    0.00d
 # Submission to last job:         12759s     212.65m     3.54h    0.15d
 
     ssh swarm
     cd /cluster/data/taeGut1/bed/tblastn.hg18KG
     mkdir chainRun
     cd chainRun
     tcsh
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainOne
     ls -1dS ../blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     para create chainSpec
     para try, check, push, check etc.
 
 # Completed: 121 of 121 jobs
 # CPU time in finished jobs:      14143s     235.72m     3.93h    0.16d  0.000 y
 # IO & Wait Time:                 54085s     901.42m    15.02h    0.63d  0.002 y
 # Average job time:                 564s       9.40m     0.16h    0.01d
 # Longest finished job:             995s      16.58m     0.28h    0.01d
 # Submission to last job:          1047s      17.45m     0.29h    0.01d
 
     cd /cluster/data/taeGut1/bed/tblastn.hg18KG/blastOut
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort u.*.psl m60* | uniq > ../unliftBlastHg18KG.psl
     cd ..
     pslCheck unliftBlastHg18KG.psl
     liftUp -nohead temp.psl ../../jkStuff/liftAll.lft carry unliftBlastHg18KG.psl 
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n temp.psl  > blastHg18KG.psl
     rm temp.psl
     pslCheck blastHg18KG.psl
 
     # load table 
     ssh hgwdev
     cd /cluster/data/taeGut1/bed/tblastn.hg18KG
     hgLoadPsl taeGut1 blastHg18KG.psl
 
     # check coverage
     featureBits taeGut1 blastHg18KG 
 # 19587281 bases of 1222864691 (1.602%) in intersection
 
     featureBits taeGut1 all_mrna blastHg18KG  -enrichment
 # all_mrna 0.117%, blastHg18KG 1.602%, both 0.049%, cover 41.81%, enrich 26.10x
 
     rm -rf blastOut
 #end tblastn
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
 update genbank.conf:
 taeGut1.upstreamGeneTbl = refGene
 
 #########################################
 # RepeatMasker ( 2009-02-04  RMH and braney)
 
 mkdir /hive/data/genomes/taeGut1/bed/RMRunRMH
 cd /hive/data/genomes/taeGut1/bed/RMRunRMH
 doRepeatMasker.pl -buildDir `pwd` taeGut1
 
 featureBits taeGut1 rmsk
 # 99288448 bases of 1222864691 (8.119%) in intersection
 
 # was earlier...
 # 97507442 bases of 1222864691 (7.974%) in intersection
 
 #############################################################################
 # N-SCAN gene predictions (nscanGene) - (2009-04-17 markd)
 
 /mblab.wustl.edu/predictions/taeGut1
 
     # obtained NSCAN predictions from michael brent's group
     # at WUSTL
     cd /cluster/data/taeGut1/bed/nscan/
 
     wget -nv http://mblab.wustl.edu/predictions/taeGut1/taeGut1.gtf
     wget -nv http://mblab.wustl.edu/predictions/taeGut1/taeGut1.prot.fa
     bzip2 taeGut1.*
     chmod a-w *
 
     # load track
     ldHgGene -bin -gtf -genePredExt taeGut1 nscanGene taeGut1.gtf.bz2
     hgPepPred taeGut1 generic nscanPep  taeGut1.prot.fa.bz2
     rm *.tab
 
     # create zebraFinch/nscanGene.html
 
     # zebraFinch/taeGut1/trackDb.ra, add:
     track nscanGene override
     informant Zerba Finch N-SCAN uses chicken (galGal3) as the informant
 
     searchTable nscanGene
     searchType genePred
     termRegex chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?
     searchPriority 50
 
+    # a mismatch was found in nscanPep; WUStL recreate
+    wget -nv http://mblab.wustl.edu/predictions/taeGut1/taeGut1.prot.fa
+    bzip2 taeGut1.prot.fa
+    chmod a-w *
+    hgPepPred taeGut1 generic nscanPep  taeGut1.prot.fa.bz2