src/hg/makeDb/doc/felCat3.txt 1.56

1.56 2010/04/01 16:37:14 chinhli
Remove grep 829.2 command
Index: src/hg/makeDb/doc/felCat3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/felCat3.txt,v
retrieving revision 1.55
retrieving revision 1.56
diff -b -B -U 1000000 -r1.55 -r1.56
--- src/hg/makeDb/doc/felCat3.txt	20 Sep 2009 17:16:43 -0000	1.55
+++ src/hg/makeDb/doc/felCat3.txt	1 Apr 2010 16:37:14 -0000	1.56
@@ -1,1390 +1,1404 @@
 # for emacs: -*- mode: sh; -*-
 
 # Felis Catus (domestic cat) -- Broad/Agencourt release 3.0 (March 2006)
 
 # file template copied from dm3.txt
 
 #########################################################################
 # DOWNLOAD SEQUENCE (DONE Sep. 14, 2006 Heather)
     ssh kkstore05
     mkdir /cluster/store12/felCat3
     ln -s /cluster/store12/felCat3 /cluster/data/felCat3
     mkdir /cluster/data/felCat3/downloads
     cd /cluster/data/felCat3/downloads
 
     wget "ftp://ftp.broad.mit.edu/pub/assemblies/mammals/cat/felCat3*"
 
 # trimHeader in fasta and qual files
 
     mkdir /cluster/data/felCat3/fixup
     zcat ../downloads/Draft_v3.agp.chromosome.fasta.gz |
          /cluster/home/heather/kent/src/hg/snp/snpLoad/trimHeader stdin 
     mv trimHeader.out fasta.fix
     gzip fasta.fix
     zcat ../downloads/Draft_v3.agp.chromosome.qual.gz |
          /cluster/home/heather/kent/src/hg/snp/snpLoad/trimHeader stdin
     mv trimHeader.out qual.fix
     gzip qual.fix
 
 
 #########################################################################
 # MAKE GENOME DB (DONE Oct. 5, 2006 heather)
     ssh kkstore05
     cd /cluster/data/dm3
     cat > felCat3.config.ra <<EOF
 # Config parameters for makeGenomeDb.pl:
 db felCat3
 clade vertebrate
 scientificName Felis Catus
 assemblyDate March 2006
 assemblyLabel Broad Release 3
 orderKey 17
 dbDbSpeciesDir cat
 mitoAcc none
 agpFiles /cluster/data/felCat3/downloads/assembly.agp
 fastaFiles /cluster/data/felCat3/fixup/fasta.fix.gz
 qualFiles /cluster/data/felCat3/fixup/qual.fix.gz
 EOF
 
     ~/kent/src/utils/makeGenomeDb.pl felCat3.config.ra \
       >& makeGenomeDb.log & tail -f makeGenomeDb.log
 
 
 #########################################################################
 # REPEATMASKER (DONE Oct 2006 Heather)
     ssh kkstore05
     # Run -debug to create the dir structure and preview the scripts:
     ~/kent/src/utils/doRepeatMasker.pl felCat3 -verbose 3 -debug
     # Run it for real and tail the log:
     ~/kent/src/utils/doRepeatMasker.pl felCat3 -verbose 3 \
       >& /cluster/data/felCat3/bed/RepeatMasker.2006-07-31/do.log &
     tail -f /cluster/data/felCat3/bed/RepeatMasker.2006-07-31/do.log
     # RepeatMasker and lib version from do.log:
 #    March 20 2006 (open-3-1-5) version of RepeatMasker
 #CC   RELEASE 20060315;                                            *
 
     # coverage 
     featureBits felCat3 rmsk
     618949830 bases of 1642698377 (37.679%) in intersection
 
 
 #########################################################################
 # SIMPLE REPEATS (TRF) (DONE Oct 2006 Heather)
     ssh kkr1u00
     mkdir /cluster/data/felCat3/bed/simpleRepeat
     cd /cluster/data/felCat3/bed/simpleRepeat
     twoBitToFa ../../felCat3.unmasked.2bit stdout \
     | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
       -bedAt=simpleRepeat.bed -tempDir=/tmp \
     >& trf.log & tail -f trf.log
     # 8 hours to run
     # Make a filtered version for sequence masking:
     awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
     splitFileByColumn trfMask.bed trfMaskChrom
 
     # Load unfiltered repeats into the database:
     ssh hgwdev
     hgLoadBed felCat3 simpleRepeat \
       /cluster/data/felCat3/bed/simpleRepeat/simpleRepeat.bed \
       -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
     # Coverage 
     featureBits felCat3 simpleRepeat
     47819770 bases of 1642698377 (2.911%) in intersection
 
 
 #########################################################################
 # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE Oct 17, 2006 heather)
     ssh kolossus
     cd /cluster/data/felCat3
     time twoBitMask felCat3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed felCat3.2bit
     # This warning is OK -- the extra fields are not block coordinates.
 #Warning: BED file bed/simpleRepeat/trfMask.bed has 10 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
 #4.857u 5.050s 0:32.63 30.3%     0+0k 0+0io 5pf+0w
     # Link to it from /gbdb:
     ssh hgwdev
     ln -s /cluster/data/felCat3/felCat3.2bit /gbdb/felCat3/felCat3.2bit
 
 
 #########################################################################
 # MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE Heather Oct. 20, 2006)
     cd /cluster/data/felCat3
     ln -s /cluster/data/felCat3/bed/RepeatMasker.2006-10-16/felCat3.fa.out /cluster/data/felCat3
     ~/kent/src/utils/automation/makeDownloads.pl felCat3 -verbose=2 >& jkStuff/downloads.log &
     tail -f jkStuff/downloads.log
     # Edit README.txt files when done:
 # *** Edit each README.txt to resolve any notes marked with "***":
 #     /cluster/data/felCat3/goldenPath/database/README.txt
 #     /cluster/data/felCat3/goldenPath/bigZips/README.txt
 
 #########################################################################
 # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE Oct 20, 2006 heather)
     # pitakluster:
     ssh pk
     mkdir /san/sanvol1/scratch/felCat3
     cp /cluster/data/felCat3/felCat3.2bit /san/sanvol1/scratch/felCat3/
     cp /cluster/data/felCat3/chrom.sizes /san/sanvol1/scratch/felCat3/
     mkdir /san/sanvol1/scratch/felCat3/rmsk
     cp -p /cluster/data/felCat3/felCat3.fa.out /san/sanvol1/scratch/felCat3/rmsk
 
     # small cluster:
     ssh kkr1u00
     mkdir -p /iscratch/i/felCat3
     cp -p /cluster/data/felCat3/felCat3.2bit .
     cp -p /cluster/data/felCat3/chrom.sizes .
     iSync 
 
 #########################################################################
 # BLASTZ/CHAIN/NET HG18 (Done Nov 09 2006 heather)
 # Do target = hg18 first because felCat3 is 200K scaffolds (documented in hg18.txt)
 # Then do swap
 
     mkdir /cluster/data/felCat3/bed/blastz.hg18.2006-11-10.swap
     cd /cluster/data/felCat3/bed/blastz.hg18.2006-11-10.swap
     doBlastzChainNet.pl /cluster/data/felCat3/bed/blastz.hg18.2006-11-09/DEF -swap >& do.log &
     # netToAxt took over 4 hours
     nice featureBits felCat3 chainHg18Link
     # 1014983843 bases of 1642698377 (61.788%) in intersection
 
 #########################################################################
 # BLASTZ/CHAIN/NET MM8 (Done Nov 09 2006 heather)
 # Do target = mm8 first (documented in mm8.txt)
 
     mkdir /cluster/data/felCat3/bed/blastz.mm8.2006-11-15.swap
     cd /cluster/data/felCat3/bed/blastz.mm8.2006-11-15.swap
     doBlastzChainNet.pl /cluster/data/felCat3/bed/blastz.mm8.2006-11-14/DEF -swap >& do.log &
     nice featureBits felCat3 chainMm8Link
     # 486204459 bases of 1642698377 (29.598%) in intersection
 
 #########################################################################
 # BLASTZ/CHAIN/NET CANFAM2 (Done Nov 18 2006 heather)
 # Do target = canFam2 first (documented in canFam2.txt)
 
     mkdir /cluster/data/felCat3/bed/blastz.canFam2.2006-11-18.swap
     cd /cluster/data/felCat3/bed/blastz.canFam2.2006-11-18.swap
     doBlastzChainNet.pl /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16/DEF -swap >& do.log &
     nice featureBits felCat3 chainCanFam2Link
     # 1209933048 bases of 1642698377 (73.655%) in intersection
 
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE Oct 24 2006 heather)
     # Use -repMatch=600 (based on size -- for human we use 1024, and 
     # cat size is 57% of human judging by twoBitInfo -noNs)
     ssh kolossus
     blat /cluster/data/felCat3/felCat3.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=/cluster/bluearc/felCat3/11.ooc -repMatch=600
 # Wrote 18561 overused 11-mers to /cluster/bluearc/felCat3/11.ooc
     ssh kkr1u00
     cd /iscratch/i/felCat3
     cp -p /cluster/bluearc/felCat3/11.ooc .
     iSync 
 
 
 #########################################################################
 # GENBANK AUTO UPDATE (DONE Nov 2006 heather)
     # Don't need a liftAll.lft file
     # Guidance from Mark:
     # Genbank does its own partitioning.  The only thing the lift file is used
     # for is locating gaps.  It optimizes by splitting on largish gaps.      
     # The current threshold is 3000000.  If you don't have any gaps over   
     # this threshold, you don't need a lift file.  With 200K scaffolds,
     # you can probably just skip this. 
 
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # edit etc/genbank.conf to add felCat3
     # check data/organism.lst for counts of native mrna, est, refseq
     # organism      mrnaCnt estCnt  refSeqCnt
     # Felis catus   593     921     219
     # since small yield, include xeno mrna 
     # always do xeno refseq
     # never do xeno est
 
 # felCat3 (cat) 217790 scaffolds
 felCat3.serverGenome = /cluster/data/felCat3/felCat3.2bit
 felCat3.clusterGenome = /iscratch/i/felCat3/felCat3.2bit
 felCat3.ooc = /iscratch/i/felCat3/11.ooc
 felCat3.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
 felCat3.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
 felCat3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
 felCat3.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
 felCat3.genbank.est.native.pslCDnaFilter   = ${lowCover.genbank.est.native.pslCDnaFilter}
 felCat3.lift = no
 felCat3.refseq.mrna.xeno.load = yes
 felCat3.downloadDir = felCat3
 felCat3.perChromTables = no
 
     # edit src/lib/gbGenome.c
     # static char *felCatNames[] = {"Felis catus", NULL};
     # static struct dbToSpecies dbToSpeciesMap[] = ... {"felCat", felCatNames, NULL}, ...
 
     cvs commit -m "Added cat" etc/genbank.conf src/lib/gbGenome.c
     make install-server
 
     ssh kkstore02
     cd /cluster/data/genbank
     nice bin/gbAlignStep -initial felCat3 &
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     nice ./bin/gbDbLoadStep -drop -initialLoad felCat3 &
 
     # enable daily alignment and update of hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add felCat3 to:
         etc/align.dbs
         etc/hgwdev.dbs 
     cvs commit
     make etc-update
 
 #########################################################################
 
 # CREATE MICROSAT TRACK (DONE Nov. 2006 Heather)
      ssh hgwdev
      cd /cluster/data/felCat3/bed
      mkdir microsat
      cd microsat
      awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed 
      /cluster/bin/i386/hgLoadBed felCat3 microsat microsat.bed
 
 #########################################################################
 
 # GENSCAN (DONE Nov. 2006 Heather)
 # Assume none of the 200K scaffolds are all Ns
 # (These cause Genscan to run forever)
     ssh hgwdev
     mkdir /cluster/data/felCat3/bed/genscan
     cd /cluster/data/felCat3/bed/genscan
     # Make 3 subdirectories for genscan to put their output files in
     mkdir gtf pep subopt
     cvs co hg3rdParty/genscanlinux
 
     # generate hard-masked sequence
     ssh kkstore05
     cd /cluster/data/felCat3/bed/genscan
     zcat /cluster/data/felCat3/goldenPath/bigZips/felCat3.fa.gz | maskOutFa stdin hard felCat3.hardmask.fa
     mkdir split
     cd split
     faSplit about ../felCat3.hardmask.fa 2000000 split
     # This yields 1939 files
     # Generate file list and check that no files are completed masked
     cd ..
     foreach f ( /cluster/data/felCat3/bed/genscan/split/*.fa )
         egrep '[ACGT]' $f > /dev/null
         if ($status == 0) echo $f >> genome.list
     end
     wc -l genome.list
     # 1939 genome.list
 
     # Log into kki (not kk!).  kki is the driver node for the small
     # cluster (kkr1u00-kkr8u00). Genscan has a problem running on the
     # big cluster, due to limited memory and swap space on each
     # processing node.
     ssh kki
     cd /cluster/data/felCat3/bed/genscan
     # Create template file, gsub, for gensub2.  For example (3-line file):
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     /parasol/bin/gensub2 genome.list single gsub jobList
 
     para create jobList
     para try
     para check
     para push 
 
     # Completed: 1939 of 1939 jobs
     # CPU time in finished jobs:     119320s    1988.66m    33.14h    1.38d  0.004 y
     # IO & Wait Time:                  7219s     120.32m     2.01h    0.08d  0.000 y
     # Average job time:                  65s       1.09m     0.02h    0.00d
     # Longest running job:                0s       0.00m     0.00h    0.00d
     # Longest finished job:             242s       4.03m     0.07h    0.00d
     # Submission to last job:         10683s     178.05m     2.97h    0.12d
 
     # Concatenate
     ssh kkstore05
     cd /cluster/data/felCat3/bed/genscan
     cat gtf/*.gtf > genscan.gtf
     cat pep/*.pep > genscan.pep
     cat subopt/*.bed > genscanSubopt.bed
 
     # Load into the database 
     ssh hgwdev
     cd /cluster/data/felCat3/bed/genscan
     ldHgGene -gtf felCat3 genscan genscan.gtf
     # Reading genscan.gtf
     # Read 75440 transcripts in 283819 lines in 1 files
       # 75440 groups 54047 seqs 1 sources 1 feature types
     # 75440 gene predictions
 
     hgPepPred felCat3 generic genscanPep genscan.pep
     hgLoadBed felCat3 genscanSubopt genscanSubopt.bed
 
     featureBits felCat3 genscan
     # 46616293 bases of 1642698377 (2.838%) in intersection
     featureBits felCat3 genscanSubopt
     # 53124220 bases of 1642698377 (3.234%) in intersection
     # Should be zero intersection with rmsk
     featureBits felCat3 genscan rmsk
     # 3334 bases of 1642698377 (0.000%) in intersection
 
 
 #########################################################################
 # CpG Islands (DONE Nov 22, 2006 Heather)
     ssh kkstore05
     cd /cluster/data/felCat3/bed
     mkdir cgpIsland
     cd cgpIsland
     zcat ../../goldenPath/bigZips/felCat3.fa.masked.gz | faSplit about stdin 2000000 split
 
     ssh hgwdev
     cd /cluster/data/felCat3/bed/cgpIsland
     # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
     cvs co hg3rdParty/cpgIslands
     cd hg3rdParty/cpgIslands
     make
     mv cpglh.exe /cluster/data/felCat3/bed/cpgIsland/
     
     ssh kkstore05
     # could also have used kolossus
     cd /cluster/data/felCat3/bed/cpgIsland
     cat << '_EOF_' > run.csh
 #!/bin/csh -ef
 foreach f (split/*)
     set fout=$f:t:r:r.cgp
     echo $fout
     ./cpglh.exe $f > $fout
 end
 '_EOF_'
     # << this line makes emacs coloring happy
     run.csh
 
     # Transform cpglh output to bed +
     cat << '_EOF_' > filter.awk
 {
 $2 = $2 - 1;
 width = $3 - $2;
 printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
        $1, $2, $3, $5,$6, width,
        $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
 }
 '_EOF_'
     # << this line makes emacs coloring happy
     awk -f filter.awk x*.cpg > cpgIsland.bed
 
     # load into database
     ssh hgwdev
     cd /cluster/data/felCat3/bed/cpgIsland
     hgLoadBed felCat3 cpgIslandExt -tab \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
     # Loaded 45262 elements of size 10
     wc -l cpgIsland.bed 
     # 45262 cpgIsland.bed
     featureBits felCat3 cpgIslandExt
     # 30893611 bases of 1642698377 (1.881%) in intersection
 
 #########################################################################
 # Build contigAcc table (DONE, Heather, Jan 2007)
 # I found the contig accessions at
 # http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=Nucleotide&cmd=Search&term=AANG01000001:AANG01817956[PACC]
 # They have a regular pattern:
 #   contig_0 --> AANG01000001
 #   contig_1 --> AANG01000002
 #   contig_817955 --> AANG01817956
 
   ssh kkstore05
   cd /cluster/data/felCat3
   cat << '_EOF_' > contigAcc.csh
 
 #!/bin/tcsh
 set contig = 0
 set acc = 1000001
 while ($contig < 817956)
     echo "contig_$contig\tAANG0$acc"
     set contig=`expr $contig + 1`
     set acc=`expr $acc + 1`
 end
 '_EOF_'
     # << this line makes emacs coloring happy
 
   ./contigAcc.csh > contigAcc.out
 
   ssh hgwdev
   cd /cluster/data/felCat3
   hgsql felCat3 < contigAcc.sql
   hgsql -e 'load data local infile "contigAcc.out" into table contigAcc'
 
 ###########################################################################
 # HUMAN (hg18) PROTEINS TRACK (in progress braney 2007-01-29)
     ssh kkstore05
     bash # if not using bash shell already
 
     mkdir /cluster/data/felCat3/blastDb
     cd /cluster/data/felCat3
     zcat fixup/fasta.fix.gz > temp.fa
     faSplit sequence temp.fa 500 blastDb/
     rm temp.fa
     cd blastDb
     for i in *.fa
     do
 	/cluster/bluearc/blast229/formatdb -i $i -p F
     done
     rm *.fa
 
     mkdir -p /san/sanvol1/scratch/felCat3/blastDb
     cd /cluster/data/felCat3/blastDb
     for i in nhr nin nsq; 
     do 
 	echo $i
 	cp *.$i /san/sanvol1/scratch/felCat3/blastDb
     done
 
     mkdir -p /cluster/data/felCat3/bed/tblastn.hg18KG
     cd /cluster/data/felCat3/bed/tblastn.hg18KG
     echo  /san/sanvol1/scratch/felCat3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
     wc -l query.lst
 # 499 query.lst
 
    # we want around 200000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
 # 36727/(200000/499) = 91.633865
 
    mkdir -p /cluster/bluearc/felCat3/bed/tblastn.hg18KG/kgfa
    split -l 90 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/felCat3/bed/tblastn.hg18KG/kgfa/kg
    ln -s /cluster/bluearc/felCat3/bed/tblastn.hg18KG/kgfa kgfa
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
      done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/felCat3/bed/tblastn.hg18KG/blastOut
    ln -s /cluster/bluearc/felCat3/bed/tblastn.hg18KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/felCat3/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
 
         if pslCheck -prot $3.tmp                                                  
         then                                                                      
             mv $3.tmp $3                                                          
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0                                                                    
     fi                                                                            
 fi                                                                                
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     # back to bash
     exit 
     
     ssh pk
     cd /cluster/data/felCat3/bed/tblastn.hg18KG
     para create blastSpec
 #    para try, check, push, check etc.
 
     para time
 
 # Completed: 134439 of 204091 jobs
 # Crashed: 69652 jobs
 # CPU time in finished jobs:   22099258s  368320.97m  6138.68h  255.78d  0.701 y
 # IO & Wait Time:               2034194s   33903.23m   565.05h   23.54d  0.065 y
 # Average job time:                 180s       2.99m     0.05h    0.00d
 # Longest finished job:            1226s      20.43m     0.34h    0.01d
 # Submission to last job:        288017s    4800.28m    80.00h    3.33d
 
 # Completed: 69652 of 69652 jobs
 # CPU time in finished jobs:    9130572s  152176.19m  2536.27h  105.68d  0.290 y
 # IO & Wait Time:                396459s    6607.66m   110.13h    4.59d  0.013 y
 # Average job time:                 137s       2.28m     0.04h    0.00d
 # Longest finished job:             420s       7.00m     0.12h    0.00d
 # Submission to last job:         52132s     868.87m    14.48h    0.60d
 
 
     ssh kkstore05
     cd /cluster/data/felCat3/bed/tblastn.hg18KG
     mkdir chainRun
     cd chainRun
     tcsh
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/felCat3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
 '_EOF_'
     exit
     chmod +x chainOne
     ls -1dS /cluster/bluearc/felCat3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     ssh kk
     cd /cluster/data/felCat3/bed/tblastn.hg18KG/chainRun
     para create chainSpec
     para maxNode 30
     para try, check, push, check etc.
 # Completed: 409 of 409 jobs
 # CPU time in finished jobs:       3862s      64.37m     1.07h    0.04d  0.000 y
 # IO & Wait Time:                164467s    2741.12m    45.69h    1.90d  0.005 y
 # Average job time:                 412s       6.86m     0.11h    0.00d
 # Longest finished job:             877s      14.62m     0.24h    0.01d
 # Submission to last job:          5693s      94.88m     1.58h    0.07d
 
 
     ssh kkstore05
     cd /cluster/data/felCat3/bed/tblastn.hg18KG/blastOut
     bash # if using another shell
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/felCat3/bed/tblastn.hg18KG/blastHg18KG.psl
     pslCheck blastHg18KG.psl
     # this is ok.
     # load table 
     ssh hgwdev
     cd /cluster/data/felCat3/bed/tblastn.hg18KG
     hgLoadPsl felCat3 blastHg18KG.psl
     # check coverage
     featureBits felCat3 refGene:cds  blastHg18KG -enrichment
 # refGene:cds 0.013%, blastHg18KG 0.926%, both 0.008%, cover 60.78%, enrich  65.60x
     
     ssh kkstore05
     rm -rf /cluster/data/felCat3/bed/tblastn.hg18KG/blastOut
     rm -rf /cluster/bluearc/felCat3/bed/tblastn.hg18KG/blastOut
 #end tblastn
 ##########################################################################
 # AUGUSTUS ab initio track (DONE, mario 2007-01-30)
     ssh hgwdev
     mkdir /cluster/data/felCat3/bed/augustus
     cd /cluster/data/felCat3/bed/augustus
 
     # get the program AUGUSTUS from the augustus web server
     wget http://augustus.gobics.de/binaries/augustus.2.0.1.src.tar.gz
     tar xzf augustus.2.0.1.src.tar.gz
 
     # compile the binary if necessary
     cd augustus/src
     make augustus
 
     # create output and error directory
     cd /cluster/data/felCat3/bed/augustus
     mkdir out err
 
     ssh kkstore
     cd /cluster/data/felCat3/bed/augustus
 
     # use the split fasta files that were already generated for genscan, see above
     # (completely masked files are no problem)
 
     ls ../genscan/split/*.fa >> genome.lst
 
     # Create template file, gsub, for gensub2.  For example (3-line file):
     cat << '_EOF_' > gsub
 #LOOP
 augustus/src/augustus --AUGUSTUS_CONFIG_PATH=/cluster/data/felCat3/bed/augustus/augustus/config --species=human --sample=100 --/augustus/verbosity=0 {check in line+ $(path1)} --outfile={check out line out/$(root1).gtf} --errfile=err/$(root1).err
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     # create the job list
     ssh pk
     cd /cluster/data/felCat3/bed/augustus
     /parasol/bin/gensub2 genome.list single gsub jobList
 
     wc -l jobList
     # 1939
 
     para try
     para check
     para push
 
 # CPU time in finished jobs:    4923552s   82059.20m  1367.65h   56.99d  0.156 y
 # IO & Wait Time:                  9930s     165.50m     2.76h    0.11d  0.000 y
 # Average job time:                2544s      42.41m     0.71h    0.03d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            4050s      67.50m     1.12h    0.05d
 # Submission to last job:         45408s     756.80m    12.61h    0.53d
 
    # check the error files, should be no errors
    cat err/*.err
    # no errors, delete the empty files
    rm err/*.err
 
    cat out/*.gtf | augustus/scripts/join_aug_pred.pl > augustus.pep.gff
    augustus/scripts/getAnnoFasta.pl augustus.pep.gff
    cat augustus.pep.gff | egrep "CDS|codon"> augustus.gff
 
    # load into database
    ssh hgwdev
    cd /cluster/data/felCat3/bed/augustus/
 
    ldHgGene -bin felCat3 augustus augustus.gff
    # 28510 gene predictions
 
    hgPepPred felCat3 generic augustusPep augustus.pep.aa
 
    featureBits felCat3 augustus
    # 21748832 bases of 1642698377 (1.324%) in intersection
 
 ############################################################################
 # GENEID GENES (DONE - 2007-02-01 - Hiram)
 ## re-done with new data 2007-02-05 - Hiram
     ssh kkstore05
     mkdir /cluster/data/felCat3/bed/geneid
     cd /cluster/data/felCat3/bed/geneid
     wget --timestamping \
 "http://genome.imim.es/genepredictions/F.catus/golden_path_200603/geneidv1.2/felCat3.gtf" \
         -O "felCat3.gtf"
     wget --timestamping \
 "http://genome.imim.es/genepredictions/F.catus/golden_path_200603/geneidv1.2/felCat3.prot" \
         -O "felCat3.prot"
 
     # Add missing .1 to protein id's
     perl -wpe 's/^(>scaffold\w+)$/$1.1/' felCat3.prot > felCat3-fixed.prot
     ssh hgwdev
     cd /cluster/data/felCat3/bed/geneid
     #	load predictions
     nice -n +19 ldHgGene -genePredExt -gtf felCat3 geneid felCat3.gtf
     #	Read 204448 transcripts in 509279 lines in 1 files
     #	204448 groups 188121 seqs 1 sources 3 feature types
 
     #	load protein sequences
     nice -n +19 hgPepPred felCat3 generic geneidPep felCat3-fixed.prot
 
     #	after loading with new data set:
     nice -n +19 featureBits felCat3 -enrichment refGene:CDS geneid
     #	refGene:CDS 0.013%, geneid 2.321%, both 0.009%, cover 71.49%,
     #	enrich 30.80x
 
     nice -n +19 featureBits felCat3 -enrichment genscan:CDS geneid
     #	genscan:CDS 2.838%, geneid 2.321%, both 1.395%, cover 49.15%,
     #	enrich 21.17x
 
     ##	With the previous data set
     nice -n +19 featureBits felCat3 -enrichment refGene:CDS geneid
     #	refGene:CDS 0.013%, geneid 1.440%, both 0.004%, cover 31.81%,
     #	enrich 22.09x
     nice -n +19 featureBits felCat3 -enrichment genscan:CDS geneid
     #	genscan:CDS 2.838%, geneid 1.440%, both 0.816%, cover 28.76%,
     #	enrich 19.97x
 
 
 ##########################################################################
 # NSCAN track - (2007-03-08 markd)
 # uses hg18 as informat, with pseudogene masking 
 #
     cd /cluster/data/felCat3/bed/nscan/
 
     # obtainedf NSCAN predictions from michael brent's group
     # at WUSTL
     wget -nv http://mblab.wustl.edu/predictions/cat/felCat3.gtf
     wget -nv http://mblab.wustl.edu/predictions/cat/felCat3/felCat3.ptx.fa
 
     gzip -9 felCat3.*
     chmod a-w *.gz
 
     # load tracks.  Note that these have *utr features, rather than
     # exon features.  currently ldHgGene creates separate genePred exons
     # for these.
     ldHgGene -bin -gtf -genePredExt felCat3 nscanGene felCat3.gtf.gz
     hgPepPred felCat3 generic nscanPep felCat3.ptx.fa.gz
     rm -f *.tab
 
     # update trackDb; need a felCat3-specific page to describe informants
     cat/felCat3/nscanGene.html    
     cat/felCat3/trackDb.ra
 
 
 ##########################################################################
 # SYNTENIC NETS (Heather, Mar 2007)
     ssh hgwdev
     cd /cluster/data/felCat3/bed
 
     cd blastz.hg18.2006-11-10.swap/axtChain
     netFilter -syn felCat3.hg18.net.gz | hgLoadNet felCat3 netSyntenyHg18 stdin
 
     featureBits -countGaps felCat3 refGene:cds netHg18 -enrichment
     # refGene:cds 0.005%, netHg18 66.952%, both 0.005%, cover 99.78%, enrich 1.49x
 
     featureBits -countGaps felCat3 refGene:cds netSyntenyHg18 -enrichment
     # refGene:cds 0.005%, netSyntenyHg18 53.767%, both 0.004%, cover 79.25%, enrich 1.47x
 
     cd blastz.mm8.2006-11-15.swap/axtChain
     netFilter -syn felCat3.mm8.net.gz | hgLoadNet felCat3 netSyntenyMm8 stdin
 
     featureBits -countGaps felCat3 refGene:cds netMm8 -enrichment
     # refGene:cds 0.005%, netMm8 36.836%, both 0.005%, cover 99.33%, enrich 2.70x
 
     featureBits -countGaps felCat3 refGene:cds netSyntenyMm8 -enrichment
     # refGene:cds 0.005%, netSyntenyMm8 23.421%, both 0.003%, cover 56.22%, enrich 2.40x
 
     cd blastz.canFam2.2006-11-18.swap
     netFilter -syn felCat3.canFam2.net.gz | hgLoadNet felCat3 netSyntenyCanFam2 stdin
 
     featureBits -countGaps felCat3 refGene:cds netCanFam2 -enrichment
     # refGene:cds 0.005%, netCanFam2 78.072%, both 0.005%, cover 99.82%, enrich 1.28x
     
     featureBits -countGaps felCat3 refGene:cds netSyntenyCanFam2 -enrichment
     # refGene:cds 0.005%, netSyntenyCanFam2 67.261%, both 0.005%, cover 88.77%, enrich 1.32x
 
 ##########################################################################
 # RECIPROCAL BEST NETS (2007-03-19 kate)
 
     ssh kkstore05
     cd /cluster/data/felCat3
     cd bed/blastz.hg18
     ~/kent/src/hg/utils/automation/doRecipBest.pl felCat3 hg18 >&! rbest.log  &
 
     ssh kkstore05
     cd /cluster/data/felCat3/bed
     cd blastz.mm8.2006-11-15.swap
     ~/kent/src/hg/utils/automation/doRecipBest.pl felCat3 mm8 \
         -buildDir=`pwd` >&! rbest.log  &
 
     ssh kkstore05
     cd /cluster/data/felCat3/bed
     cd blastz.canFam2.2006-11-18.swap
     ~/kent/src/hg/utils/automation/doRecipBest.pl felCat3 canFam2 \
         -buildDir=`pwd` >&! rbest.log  &
 
     # downloadables
     ssh hgwdev
     ln -s /cluster/data/felCat3/bed/blastz.canFam2.2006-11-18.swap/axtRBestNet/felCat3.canFam2.rbest.axt.gz \
         /usr/local/apache/htdocs/goldenPath/felCat3/vsCanFam2/felCat3.canFam2.rbest.axt.gz
     ln -s /cluster/data/felCat3/bed/blastz.hg18/axtRBestNet/felCat3.hg18.rbest.axt.gz \
         /usr/local/apache/htdocs/goldenPath/felCat3/vsHg18/felCat3.hg18.rbest.axt.gz
     ln -s /cluster/data/felCat3/bed/blastz.mm8.2006-11-15.swap/axtRBestNet/felCat3.mm8.rbest.axt.gz \
         /usr/local/apache/htdocs/goldenPath/felCat3/vsMm8/felCat3.mm8.rbest.axt.gz
 
 
 ##########################################################################
 # MULTIZ4WAY (Heather, DONE, May 2007)
 
   # split; write to /san (will be using pk)
   # create 1024 files 000.maf through 1023.maf
   # Files vary from almost 2000 - over 13000 lines each
   # A little over 6,000,000 lines total
   ssh kkstore05
   cd /cluster/data/felCat3/bed/blastz.canFam2.2006-11-18.swap/mafRBestNet
   zcat *gz | mafSplit dummyArg /san/sanvol1/scratch/felCat3/mafNet/canFam2/split
            stdin -byTarget -useHashedName=10
   cd /cluster/data/felCat3/bed/blastz.hg18.2006-11-10.swap/mafRBestNet
   zcat *gz | mafSplit dummyArg /san/sanvol1/scratch/felCat3/mafNet/hg18/split
              stdin -byTarget -useHashedName=10
   cd /cluster/data/felCat3/bed/blastz.mm8.2006-11-15.swap/mafRBestNet
   zcat *gz | mafSplit dummyArg /san/sanvol1/scratch/felCat3/mafNet/mm8/split
              stdin -byTarget -useHashedName=10 
 
   # setup multiz4way 
   # output will be to /cluster/data/felCat3/bed/multiz4way/maf
   cd /cluster/data/felCat3/bed/multiz4way
   mkdir maf
   mkdir run
 
   # tree_doctor
   /cluster/bin/phast.new/tree_doctor  --prune-all-but=felCat3,canFam2,mm8,hg18 \
      /cluster/data/hg18/bed/multiz28way/cons/28way.mod  | awk '$1 == "TREE:" {print $2;}'  > 4way.nh
 
   # use the following for species.lst:
   # felCat3 canFam2 mm8 hg18
   # use the following for tree.nh:
   # ((felCat3 canFam2) (mm8 hg18))
   # use the following for tree-species.nh:
   # ((cat, dog), (mouse, human))
   # create tree image for details page using phyloGif
   # The treeImage setting in trackDb.ra is
   # phylo/felCat3_4way.gif (relative to htdocs/images).
   /data/apache/cgi-bin/phyloGif -phyloGif_tree=tree-species.nh -phyloGif_width=260 \
       -phyloGif_height=260 > felCat3_4way.gif
   # If you haven't already, check out the browser CVS tree in your ~/:
   # (cd; cvs co -d hgwdev:/projects/hg/cvsroot browser)
   cp felCat3_4way.gif ~browser/images/phylo/
   cd ~/browser/images/phylo
   cvs add felCat3_4way.gif
   cvs commit -m "new image" felCat3_4way.gif
   cd ../..
   cvs update -d
   make alpha
 
   cd run
   # create split.lst
   perl -we 'for ($i=0; $i<1024; $i++) { printf "split%03d\n", $i; }' > split.lst
 
   # spec file
 cat  << 'EOF' > spec
 #LOOP
 ./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/felCat3/bed/multiz4way/maf/$(dir1)/$(root1).maf}
 #ENDLOOP
 'EOF'
     # << emacs
 
   # autoMultiz.csh 
 cat  << 'EOF' > autoMultiz.csh
 #!/bin/csh -ef
     set db = felCat3
     set c = $1
     set mafOut = $2
     set run = `pwd`
     set pairs = /san/sanvol1/scratch/$db/mafNet
 
 # set up temp dir
     set tmp = /scratch/tmp/$db/multiz.$c
     rm -fr $tmp
     mkdir -p $tmp
     cp ../{tree.nh,species.lst} $tmp
     pushd $tmp
 
     foreach s (`cat species.lst`)
         if ($s == $db) then
             continue
         endif
         set clusterMaf = $pairs/$s/$c.maf
         set localMaf = $db.$s.sing.maf
         if (-e $clusterMaf) then
             cp $clusterMaf $localMaf
         else
             echo "##maf version=1 scoring=autoMZ" > $localMaf
         endif
     end
     set path = ($run/penn $path); rehash
     $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
     popd
     cp $tmp/$c.maf $mafOut
     rm -fr $tmp
 'EOF'
     # << emacs
 
   # stash binaries
   mkdir penn
   cd penn
   cp /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz .
   cp /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project .
   cp /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ .
 
   # cluster run
   ssh pk
   cd /cluster/data/felCat3/bed/multiz4way/run
   gensub2 split.lst single spec jobList
   para create jobList
   para try
   para check
   para push
   para time
   Completed: 1024 of 1024 jobs
   # CPU time in finished jobs:        740s      12.33m     0.21h    0.01d  0.000 y
   # IO & Wait Time:                  8688s     144.80m     2.41h    0.10d  0.000 y
   # Average job time:                   9s       0.15m     0.00h    0.00d
   # Longest running job:                0s       0.00m     0.00h    0.00d
   # Longest finished job:              51s       0.85m     0.01h    0.00d
   # Submission to last job:          1616s      26.93m     0.45h    0.02d
 
 ##########################################################################
 # ANNOTATE MULTIZ4WAY MAF AND LOAD TABLES (Done, May 2007, Heather)
 
   # create felCat3.Nbed (already have it for hg18, mm8, canFam2)
   ssh kolossus
   cd /cluster/data/felCat3
   twoBitInfo -nBed felCat3.2bit felCat3.N.bed
 
   # directory structure and nBeds file (input to mafAddIRows)
   ssh kkstore05
   mkdir /cluster/data/felCat3/bed/multiz4way/anno
   cd /cluster/data/felCat3/bed/multiz4way/anno
   mkdir maf run
   cd run
   # setup.sh 
   cat  << 'EOF' > setup.sh
 #!/bin/sh
 for DB in canFam2 hg18 mm8
 do
   ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
   echo ${DB}.bed >> nBeds
 done
 'EOF'
   # << emacs
 
   # setup for mafAddIRows
   # makeIrows.sh
   cat << 'EOF' > makeIrows.sh
 #!/bin/sh
 rm -f getIrows.sh
 twobit=/cluster/data/felCat3/felCat3.2bit
 outdir=../maf
 # do smaller jobs first
 for file in `ls -1rS ../../maf/*.maf`
 do
   echo "echo $file" >> getIrows.sh
   base=`basename $file`
   echo nice mafAddIRows -nBeds=nBeds $file $twobit $outdir/$base >>
   getIrows.sh
 done
 chmod +x getIrows.sh
 'EOF'
   # << emacs
 
   # run mafAddIRows
   # takes 30 minutes or so depending on load on kolossus
   # could have used kki (see ornAna1.txt)
   ssh kolossus
   cd /cluster/data/felCat3/bed/multiz4way/anno/run
   ./getIrows.sh
 
   # concat and filter
   ssh kkstore05
   cd /cluster/data/felCat3/bed/multiz4way/anno
   cat maf/*.maf >> felCat3.maf
   # rollup file is 29,932,626 lines
   mafFilter -overlap -reject=rf felCat3.maf > felCat3.mf.maf
 # rejected minRow: 2279
 # total rejected: 2279 blocks
   # mafFilter rejected 2279 single-row (felCat3 only) blocks due to 
   # its default minRow of 2.  That's reasonable, so replace the original 
   # with the filtered version.
   mv felCat3.maf felCat3.preFilter.maf
   mv felCat3.mf.maf felCat3.maf
   # TO DO when kkstore05 isn't so busy:
   # gzip -c felCat3.preFilter.maf > felCat3.preFilter.maf.gz
 
   # load
   ssh hgwdev
   mkdir -p /gbdb/felCat3/multiz4way/anno
   ln -s /cluster/data/felCat3/bed/multiz4way/anno/felCat3.maf /gbdb/felCat3/multiz4way/anno/
   time nice hgLoadMaf -pathPrefix=/gbdb/felCat3/multiz4way/anno felCat3 multiz4way
   # Loaded 2344109 mafs in 1 files from /gbdb/felCat3/multiz4way/anno
   # 60.419u 16.819s 1:48.05 71.4%   0+0k 0+0io 2pf+0w
 
   # summary
   ssh kolossus
   cd /cluster/data/felCat3/bed/multiz4way/anno 
   time nice hgLoadMafSummary felCat3 -minSize=30000 -mergeGap=1500 -maxSize=200000 -test \
       multiz4waySummary felCat3.maf
   # Created 323552 summary blocks from 1510110 components and 2344109 mafs from felCat3.maf
   # 107.717u 21.889s 3:08.00 68.9%  0+0k 0+0io 3pf+0w
   # Creates multiz4waySummary.tab
 
   # load
   ssh hgwdev
   sed -e 's/mafSummary/multiz4waySummary/' ~/kent/src/hg/lib/mafSummary.sql > /tmp/multiz4waySummary.sql
   cd /cluster/data/felCat3/bed/multiz4way/anno 
   time nice hgLoadSqlTab felCat3 multiz4waySummary /tmp/multiz4waySummary.sql multiz4waySummary.tab
   # 0.000u 0.003s 0:04.18 0.0%      0+0k 0+0io 3pf+0w
 
   # summary is only useful for scaffolds larger than 1 million, which is just 7 in felCat3
   # Angie subsequently changed hgLoadMafSummary so by default it will skip small sequences
   mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_217569";
   mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_213970";
   mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_138821";
   mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_149667";
   mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_217324";
   mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_132063";
   mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_194753";
   mysql> rename table multiz4waySummary to multiz4waySummaryAll;
   mysql> rename table multiz4waySummaryNew to multiz4waySummary;
 
 
 #########################################################################
 # MULTIZ4WAY DOWNLOADABLES (Done May 2007 Heather)
     ssh kolossus
     mkdir /cluster/data/felCat3/bed/multiz4way/mafDownloads
 
 # Tried for upstream data with nscanGene, didn't get anything
 # ssh hgwdev
 # cd /cluster/data/felCat3/bed/multiz4way/mafDownloads
 # nice featureBits felCat3 nscanGene:upstream:1000 -fa=/dev/null -bed=up.bad
 # nice featureBits felCat3 nscanGene:upstream:2000 -fa=/dev/null -bed=up.bad
 # nice featureBits felCat3 nscanGene:upstream:5000 -fa=/dev/null -bed=up.bad
 
     # Make a gzipped version of the monolithic annotated maf file:
     cd /cluster/data/felCat3/bed/multiz4way
     gzip -c anno/felCat3.maf > mafDownloads/felCat3.maf.gz
     cd mafDownloads
     md5sum felCat3.maf.gz > md5sum.txt
 
     ssh hgwdev
     set dir = /usr/local/apache/htdocs/goldenPath/felCat3/multiz4way
     mkdir $dir
     ln -s /cluster/data/felCat3/bed/multiz4way/mafDownloads/{*.gz,md5sum.txt} $dir
     cp /usr/local/apache/htdocs/goldenPath/ornAna1/multiz6way/README.txt $dir
     # edit README.txt
     # create /usr/local/apache/htdocs/goldenPath/hg18/multiz4way/4way.nh
 
 
 #########################################################################
 # MULTIZ4WAY MAF FRAMES (Done May 2007 Heather)
     # using nscanGene for felCat3
     # using knownGene for mm8 hg18
     # using ensGene for canFam2 
     # genePreds (name specific fields to get around bin, knownGene extras)
     ssh hgwdev
     mkdir /cluster/data/felCat3/bed/multiz4way/frames
     cd /cluster/data/felCat3/bed/multiz4way/frames
     mkdir genes
     cd genes
 
     cat << '_EOF_' > getGenes.csh
 #!/bin/csh -ef
     foreach queryDb (felCat3 mm8 hg18 canFam2)
       if ($queryDb == "felCat3") then
         set geneTbl = nscanGene
       else if ($queryDb == "canFam2") then
         set geneTbl = ensGene
       else
         set geneTbl = knownGene
       endif
       hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from $geneTbl" ${queryDb} \
       | genePredSingleCover stdin stdout | gzip -2c  > /scratch/tmp/$queryDb.tmp.gz
       mv /scratch/tmp/$queryDb.tmp.gz $queryDb.gp.gz
     end
 '_EOF_'
     # << this line makes emacs coloring happy
     ./getGenes.csh
 
     # create frames
     ssh kolossus
     cd /cluster/data/felCat3/bed/multiz4way/frames
     cat << '_EOF_' > getFrames.csh
 #!/bin/csh -ef
     set multizDir = /cluster/data/felCat3/bed/multiz4way
     set mafDir = $multizDir/mafDownloads
     set geneDir = $multizDir/frames/genes
     cd $multizDir/frames
     genePredToMafFrames felCat3 $mafDir/felCat3.maf.gz felCat3.mafFrames \
       felCat3 genes/felCat3.gp.gz \
       hg18 genes/hg18.gp.gz \
       mm8 genes/mm8.gp.gz \
       canFam2 genes/canFam2.gp.gz
     gzip felCat3.mafFrames
 '_EOF_'
     # << this line makes emacs coloring happy
     ./getFrames.csh
 
     # load the database
     ssh hgwdev
     cd /cluster/data/felCat3/bed/multiz4way/frames
     hgLoadMafFrames felCat3 multiz4wayFrames felCat3.mafFrames.gz
 
 
 ##########################################################################
 # PHASTCONS (DONE, June 2007, Heather)
 # Using Angie's process from ornAna1.txt
 
     ssh hgwdev
     cd /cluster/data/felCat3/bed/multiz4way
     mkdir phastCons
     cd phastCons
 
     # tree_doctor
     # Does the order matter in the prune-all-but argument?
     /cluster/bin/phast.new/tree_doctor  --prune-all-but=felCat3,canFam2,mm8,hg18 \
         /cluster/data/hg18/bed/multiz28way/cons/28way.mod > 4way.mod
 
     # msa_split
     # msa_split requires its MAF and FA inputs to contain one sequence each
     # input is from /cluster/data/felCat3/bed/multiz4way/maf/split*.maf
     # output is to /san/sanvol1/scratch/felCat3/multiz4way/phastCons/ss
     ssh kkstore05
     cd /cluster/data/felCat3/bed/multiz4way/phastCons
     nice ./doSplitOnFileserver.csh > & split.log &
     # this took 5.5 hours to create directories split000 - split999
     # not bothering to restructure as a cluster run (Angie suggested this in ornAna1.txt)
     # 50k WARNINGs in split.log
     # make a list of all output (sort each split dir by size)
     ./makeInput.sh
 
     # check a single chunk
     ssh kolossus
     cd /cluster/data/felCat3/bed/multiz4way/phastCons
     /cluster/bin/phast.new/phyloFit -i SS -E -p MED -s HKY85 \
         --tree "`cat ../tree-commas.nh`" \
         /san/sanvol1/scratch/felCat3/multiz4way/phastCons/ss/split495/scaffold_216010.1-806250.ss \
 	-o phyloFit.tree
 
     # get value of rho
     ssh kolossus
     mkdir run.cons
     cd run.cons
     /cluster/bin/phast.new/phastCons --estimate-rho /tmp/estimatedRho.mod \
         --target-coverage 0.005 --expected-length 12 --no-post-probs \
         /san/sanvol1/scratch/felCat3/multiz4way/phastCons/ss/split495/scaffold_216010.1-806250.ss ../4way.mod
     # rho = 0.226082
 
     # template goes here (uses rho)
 
     # run all
     ssh pk
     cd /cluster/data/felCat3/bed/multiz4way/phastCons/run.cons
     mv ../in.list .
     gensub2 in.list single template jobList
 
     para create jobList
     para try
     para check
     para push 
 
     para time
     # Completed: 103985 of 103985 jobs
     # CPU time in finished jobs:      11039s     183.98m     3.07h    0.13d 0.000 y
     # IO & Wait Time:               1170173s   19502.89m   325.05h   13.54d 0.037 y
     # Average job time:                  11s       0.19m     0.00h    0.00d
     # Longest running job:                0s       0.00m     0.00h    0.00d
     # Longest finished job:             177s       2.95m     0.05h    0.00d
     # Submission to last job:          7596s     126.60m     2.11h    0.09d
 
     # create Most Conserved track
     # Takes about 10 minutes
     ssh kolossus
     cd /san/sanvol1/scratch/felCat3/multiz4way/phastCons
     find ./bed -name "*.bed" \
     | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \
     | sort -k7,7 -k9,9n \
     | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \
     | xargs cat \
     | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
     | /cluster/bin/scripts/lodToBedScore /dev/stdin >
     phastConsElements4way.bed
     cp phastConsElements4way.bed /cluster/data/felCat3/bed/multiz4way/phastCons
 
     # estimate coverage
     # ornAna1 targets: 5% overall coverage, 70% CDS coverage
     # xenTro1 target: 4% overall coverage
     ssh hgwdev
     cd /cluster/data/felCat3/bed/multiz4way/phastCons
     # attempted genome-wide, ran over an hour without completing
 
     # pick scaffolds with NSCAN annotations:
     featureBits -chrom=scaffold_0 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 10.166%, phastConsElements4way.bed 0.358%, both 0.358%, cover 3.53%, enrich 9.84x
     featureBits -chrom=scaffold_1000 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 5.882%, phastConsElements4way.bed 2.677%, both 2.286%, cover 38.87%, enrich 14.52x
     featureBits -chrom=scaffold_100011 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 3.830%, phastConsElements4way.bed 2.286%, both 1.993%, cover 52.04%, enrich 22.76x
     featureBits -chrom=scaffold_100063 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 2.668%, phastConsElements4way.bed 1.487%, both 0.701%, cover 26.29%, enrich 17.69x
     featureBits -chrom=scaffold_100077 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 2.671%, phastConsElements4way.bed 1.784%, both 0.968%, cover 36.25%, enrich 20.32x
     featureBits -chrom=scaffold_100083 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 3.633%, phastConsElements4way.bed 6.358%, both 3.633%, cover 100.00%, enrich 15.73x
     featureBits -chrom=scaffold_100088 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 1.384%, phastConsElements4way.bed 2.254%, both 0.625%, cover 45.15%, enrich 20.03x
 
     # the largest scaffolds:
     featureBits -chrom=scaffold_217569 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 0.877%, phastConsElements4way.bed 3.784%, both 0.607%, cover 69.21%, enrich 18.29x
     featureBits -chrom=scaffold_213970 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 0.000%, phastConsElements4way.bed 1.531%, both 0.000%, cover nan%, enrich  nanx
     featureBits -chrom=scaffold_138821 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 6.129%, phastConsElements4way.bed 5.286%, both 3.572%, cover 58.28%, enrich 11.02x
     featureBits -chrom=scaffold_149667 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 0.658%, phastConsElements4way.bed 0.900%, both 0.344%, cover 52.30%, enrich 58.14x
     featureBits -chrom=scaffold_217324 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 0.746%, phastConsElements4way.bed 3.041%, both 0.531%, cover 71.21%, enrich 23.42x
     featureBits -chrom=scaffold_132063 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 1.903%, phastConsElements4way.bed 2.062%, both 1.142%, cover 60.03%, enrich 29.11x
     featureBits -chrom=scaffold_194753 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed
     # nscanGene:cds 0.843%, phastConsElements4way.bed 2.334%, both 0.230%, cover 27.28%, enrich 11.69x
 
     # not as good as ornAna1, but hopefully good enough
 
     # load
     hgLoadBed felCat3 phastConsElements4way phastConsElements4way.bed
 
     # create merged posterior probability file and wiggle track data files
     ssh kolossus
     cd /san/sanvol1/scratch/felCat3/multiz4way/phastCons/
     # sort by chromName, chromStart so that items are in numerical order 
     #  for wigEncode
     time find ./pp -name "*.pp" | \
         sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
         sort -k7,7 -k9,9n | \
         sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
         xargs cat | \
         nice wigEncode -noOverlap stdin phastCons4way.wig phastCons4way.wib
         cp -p phastCons4way.wi?  /cluster/data/felCat3/bed/multiz4way/phastCons
     # 0.166u 1.856s 2:10.13 1.5%      0+0k 0+0io 0pf+0w
     cp -p phastCons4way.wi? /cluster/data/felCat3/bed/multiz4way/phastCons
 
     # load gbdb and database with wiggle
     ssh hgwdev
     cd /cluster/data/felCat3/bed/multiz4way/phastCons
     ln -s /cluster/data/felCat3/bed/multiz4way/phastCons/phastCons4way.wib /gbdb/felCat3/multiz4way
     hgLoadWiggle -pathPrefix=/gbdb/felCat3/multiz4way felCat3 phastCons4way phastCons4way.wig
 
     # tally up tree distances
     ssh hgwdev
     cd /cluster/data/felCat3/bed/multiz4way
     tail -1 phastCons/phyloFit.tree.mod | sed -e 's/^TREE: //' > 4way.phyloFit.nh
     /cluster/bin/phast.new/all_dists 4way.nh > 4way.distances.txt
     grep felCat3 4way.distances.txt | sort -k3,3n |  awk '{printf ("%.4f\t%s\n", $3, $1)}' > distances.txt
 
     cat distances.txt
     # 0.1993  canFam2
     # 0.3320  hg18
     # 0.5306  mm8
 
     # downloadables
     ssh kkstore05
     cd /cluster/data/felCat3/bed/multiz4way
     mkdir phastConsDownloads
     cd phastConsDownloads
     ./make.csh
     nice gzip felCat3.pp
     md5sum felCat3.pp.gz > md5sum.txt
 
     ssh hgwdev
     cd /cluster/data/felCat3/bed/multiz4way/phastConsDownloads
     set dir = /usr/local/apache/htdocs/goldenPath/felCat3/phastCons4way
     mkdir $dir
     ln -s /cluster/data/felCat3/bed/multiz4way/phastConsDownloads/{*.gz,md5sum.txt} $dir
     cp /usr/local/apache/htdocs/goldenPath/ornAna1/phastCons6way/README.txt $dir
     # edit README.txt
     cp /cluster/data/felCat3/bed/multiz4way/phastCons/4way.mod /usr/local/apache/htdocs/goldenPath/felCat3/phastCons4way
 
 
     # clean up
     ssh kkstore05
     rm /cluster/data/felCat3/bed/multiz4way/phastCons/*.tab
     rm -r /san/sanvol1/scratch/felCat3/multiz4way/phastCons
 
 
 ############################################################################
 ##  BLASTZ swap from mm9 alignments (2007-11-10  - markd)
     ssh hgwdev
     mkdir /cluster/data/felCat3/bed/blastz.mm9.swap
     cd /cluster/data/felCat3/bed/blastz.mm9.swap
     ln -s blastz.mm9.swap ../blastz.mm9
     /cluster/bin/scripts/doBlastzChainNet.pl \
         -swap /cluster/data/mm9/bed/blastz.felCat3/DEF >& swap.out&
 
     nice +19 featureBits felCat3 chainMm9Link
     # 486766552 bases of 1642698377 (29.632%) in intersection
 
 ############################################################################
 # SGP GENES (DONE - 2007-12-20 - Hiram)
     ssh kkstore05
     mkdir  /cluster/data/felCat3/bed/sgp
     cd  /cluster/data/felCat3/bed/sgp
 
     wget --timestamping \
 "http://genome.imim.es/genepredictions/F.catus/golden_path_200603/SGP/200603mm8/scaffolds.gtf" \
 	-O felCat3.gtf
 
     ssh hgwdev
     cd /cluster/data/felCat3/bed/sgp
     ldHgGene -gtf -genePredExt felCat3 sgpGene felCat3.gtf
     #	Read 51619 transcripts in 153058 lines in 1 files
     #	51619 groups 39049 seqs 1 sources 3 feature types
     #	51619 gene predictions
 
     featureBits felCat3 sgpGene
     #	18130668 bases of 1642698377 (1.104%) in intersection
     featureBits felCat3 -enrichment refGene:CDS sgpGene
     # refGene:CDS 0.014%, sgpGene 1.104%, both 0.006%, cover 41.92%,
     #	enrich 37.98x
 
 #####################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
 
 see doc/builds.txt for specific details.
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
 
 see doc/builds.txt for specific details.
 
 ############################################################################
 #  felCat3 - Cat - Ensembl Genes version 51  (DONE - 2008-12-03 - hiram)
     ssh kolossus
     cd /hive/data/genomes/felCat3
     cat << '_EOF_' > felCat3.ensGene.ra
 # required db variable
 db felCat3
 # do we need to translate geneScaffold coordinates
 geneScaffolds yes
 # ignore genes that do not properly convert to a gene pred, and contig
 #	names that are not in the UCSC assembly
 skipInvalid yes
 # ignore the three genes that have invalid structures from Ensembl:
 # 2100: ENSFCAT00000006929 no exonFrame on CDS exon 16
 # 14578: ENSFCAT00000010965 no exonFrame on CDS exon 1
 # 26634: ENSFCAT00000009384 no exonFrame on CDS exon 0
 '_EOF_'
 #  << happy emacs
 
     doEnsGeneUpdate.pl -ensVersion=51 felCat3.ensGene.ra
     ssh hgwdev
     cd /hive/data/genomes/felCat3/bed/ensGene.51
     featureBits felCat3 ensGene
     # 22158651 bases of 1642698377 (1.349%) in intersection
 
  *** All done!  (through the 'makeDoc' step)
  *** Steps were performed in /hive/data/genomes/felCat3/bed/ensGene.51
 
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
 
 see doc/builds.txt for specific details.
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
 
 see doc/builds.txt for specific details.
 ############################################################################
+# LIFTOVER TO FelCatV17e (working - 2010-03-29 - Chin )
+    mkdir /hive/data/genomes/felCat3/bed/blat.felCatV17e.2010-03-29
+    cd /hive/data/genomes/felCat3/bed/blat.felCatV17e.2010-03-29
+    # -debug run to create run dir, preview scripts...
+    doSameSpeciesLiftOver.pl -debug -ooc /scratch/data/felCat3/11.ooc \
+       felCat3 felCatV17e
+    # Real run:
+    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
+         -ooc /scratch/data/felCat3/11.ooc \
+        -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
+         felCat3 felCatV17e > do.log 2>&1 &
+    #   real    36m16.693s
+
+############################################################################