src/hg/makeDb/doc/sacCer2.txt 1.3

1.3 2009/02/10 23:56:21 hiram
Human proteins done by Brian, and multiple alignment in progress, genbank done
Index: src/hg/makeDb/doc/sacCer2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/sacCer2.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 4 -r1.2 -r1.3
--- src/hg/makeDb/doc/sacCer2.txt	10 Feb 2009 22:19:18 -0000	1.2
+++ src/hg/makeDb/doc/sacCer2.txt	10 Feb 2009 23:56:21 -0000	1.3
@@ -232,8 +232,9 @@
     hgsql sacCer2 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql
     hgsql sacCer2 -e 'load data local infile "notes.txt" \
           into table sgdOtherDescription;'
 
+############################################################################
 # ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE - 2009-02-10 - Hiram)
     cd /hive/data/sacCer2/bed/sgdAnnotation
     grep "Swiss-Prot" ../../download/chromosomal_feature/dbxref.tab \
 	| awk '{printf("%s\t%s\n", $5, $1);}' > sgdToSwissProt.txt
@@ -245,8 +246,9 @@
     hgsql sacCer2 -e 'load data local infile "sgdToSwissProt.txt" \
           into table sgdToSwissProt;'
     hgProtIdToGenePred sacCer2 sgdGene sgdToSwissProt name value
 
+############################################################################
 # CREATE SGD-BASED CLONE TRACK (DONE - 2009-02-10 - Hiram)
     mkdir /hive/data/genomes/sacCer2/bed/sgdClone
     cd /hive/data/genomes/sacCer2/bed/sgdClone
     # since sacCer1, these coordinates have become 0-relative ?
@@ -259,8 +261,9 @@
 s/^4/chrIV/; s/^2/chrII/; s/^9/chrIX/; s/^1/chrI/;" > sgdClone.bed
     hgLoadBed sacCer2 sgdClone  sgdClone.bed -tab \
         -sqlTable=$HOME/kent/src/hg/lib/sgdClone.sql
 
+############################################################################
 # AUTO UPDATE GENBANK RUN  (Done - 2009-02-10 - Hiram)
     # align with latest genbank process.
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
@@ -290,17 +293,18 @@
     screen		#	use a screen to manage this job
     cd /cluster/data/genbank
     time nice -n +19 bin/gbAlignStep -initial sacCer2 &
     #	logFile: var/build/logs/2009.02.10-11:27:42.sacCer2.initalign.log
-XXX - running Tue Feb 10 11:27:55 PST 2009
+    #	real    173m1.570s
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
-    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ce6
-    #	logFile: var/dbload/hgwdev/logs/2008.05.30-20:29:01.dbload.log
-    #	real    23m8.651s
+    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sacCer2
+    #	logFile: var/dbload/hgwdev/logs/2009.02.10-14:25:48.dbload.log
+    #	real    18m21.436s
 
+XXXX TBD
     # enable daily alignment and update of hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add ce6 to:
@@ -309,8 +313,164 @@
     cvs ci -m "Added ce6 - C. elegans WS190" \
 	etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
+############################################################################
+# DOING MULTIPLE ALIGNMENT WITH OTHER YEAST  (DONE - 2009-02-10 - Hiram)
+#	the sequences are the same set from 2003 that were used in sacCer1
+#	so, just link to them and rerun the alignment
+# in the directory /cluster/data/sacCer1/bed/otherYeast/align
+    mkdir /hive/data/genomes/sacCer2/bed/otherYeast
+    cd /hive/data/genomes/sacCer2/bed/otherYeast
+    for D in sacBay sacCas sacKlu sacKud sacMik sacPar
+do
+    ln -s ../../../sacCer1/bed/otherYeast/${D} .
+done
+    mkdir align
+    cd align
+    ln -s ../../../../sacCer1/bed/otherYeast/align/sac??? .
+    #	helpful to have 2bit files for chainAntiRepeat
+    for G in sac???
+do
+    faToTwoBit ${G} ${G}.2bit
+done
+
+
+    # Create directory full of size info
+    mkdir sizes
+    for F in sac???
+    do
+	faSize $F -detailed > sizes/$F
+    done
+
+    # Create some working directories
+    mkdir lav axtAll axtBest mafIn mafRawOut mafOut
+
+    # Create little shell script to do blastz alignments
+    cat << '_EOF_' > lastz.csh
+#!/bin/csh -ef
+/cluster/bin/penn/$MACHTYPE/lastz $1 $2 > $3
+'_EOF_'
+    # << happy emacs
+    chmod +x lastz.csh
+
+    # create sacCer2 individual chr.fa files:
+    mkdir sacCer2
+    twoBitInfo ../../../sacCer2.2bit stdout | cut -f1 | while read C
+do
+    twoBitToFa ../../../sacCer2.2bit:${C} sacCer2/${C}.fa
+    echo "${C} done"
+done
+-
+    # Create job spec to do all blastz alignments
+    ls -1 sacCer2/*.fa > sacCer.list
+    ls -1 sac??? > other.list
+    cat << '_EOF_' > template
+#LOOP
+lastz.csh $(path1) $(path2) {check out line+ lav/$(root1).$(root2)}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+    ssh swarm
+    cd /hive/data/genomes/sacCer2/bed/otherYeast/align
+    gensub2 sacCer.list other.list template jobList
+
+    # Convert from lav to psl
+    for L in lav/*
+do
+    B=`basename $L`
+    echo $B
+    lavToPsl $L stdout | gzip > psl/$B.psl.gz
+done
+
+    #	chaining
+    mkdir -p axtChain/run
+    cd axtChain/run
+    ls -d ../../sac??? | xargs -L 1 basename > species.list
+    cut -f1 ../../../../../chrom.sizes > chrom.list
+
+    cat << '_EOF_' > template
+#LOOP
+chain.csh $(path1) $(path2) {check out line+ $(root1)/$(root2).chain}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    gensub2 species.list chrom.list template jobList
+    para create jobList
+    para push
+    para time
+# Completed: 108 of 108 jobs
+# CPU time in finished jobs:         73s       1.21m     0.02h    0.00d  0.000 y
+# IO & Wait Time:                  1100s      18.34m     0.31h    0.01d  0.000 y
+# Average job time:                  11s       0.18m     0.00h    0.00d
+# Longest finished job:              12s       0.20m     0.00h    0.00d
+# Submission to last job:            38s       0.63m     0.01h    0.00d
+
+    cd ..
+    #	merge the individual results
+    foreach S (`cat run/species.list`)
+echo $S
+find ./run/${S} -name "*.chain" \
+    | chainMergeSort -inputList=stdin | gzip -c > sacCer2.${S}.all.chain.gz
+end
+    #	split them again to get consistent numbering
+    foreach S (`cat run/species.list`)
+echo $S
+chainSplit chain.${S} sacCer2.${S}.all.chain.gz
+end
+
+    #	netting the chains
+    cat << '_EOF_' > netChains.csh
+#!/bin/csh -ef
+
+set S0 = "../../../../chrom.sizes"
+
+foreach S (`cat run/species.list`)
+    echo $S
+    chainPreNet sacCer2.${S}.all.chain.gz ${S0} ../sizes/${S} stdout \
+    | chainNet stdin -minSpace=1 ${S0} ../sizes/${S} stdout /dev/null \
+        | netSyntenic stdin noClass.${S}.net
+end
+'_EOF_'
+    # << happy emacs
+    chmod +x netChains.csh
+    ./netChains.csh
+
+    cat << '_EOF_' > mkMafs.csh
+#!/bin/csh -ef
+
+set TOP = `pwd`
+set S0 = "/hive/data/genomes/sacCer2/chrom.sizes"
+
+foreach S (`cat run/species.list`)
+    # Make axtNet for download: one .axt per sacCer2 seq.
+    netSplit noClass.${S}.net net.${S}
+    cd ..
+    rm -fr axtNet.${S}
+    mkdir axtNet.${S}
+    foreach f (axtChain/net.${S}/*.net)
+    netToAxt $f axtChain/chain.${S}/$f:t:r.chain \
+      /hive/data/genomes/sacCer2/sacCer2.2bit ${S}.2bit stdout \
+      | axtSort stdin stdout \
+      | gzip -c > axtNet.${S}/$f:t:r.sacCer2.${S}.net.axt.gz
+    end
+
+    # Make mafNet for multiz: one .maf per sacCer2 seq.
+    rm -fr mkdir mafNet.${S}
+    mkdir mafNet.${S}
+    foreach f (axtNet.${S}/*.sacCer2.${S}.net.axt.gz)
+      axtToMaf -tPrefix=sacCer2. -qPrefix=${S}. $f \
+            ${S0} sizes/${S} stdout \
+      | gzip -c > mafNet.${S}/$f:t:r:r:r:r:r.maf.gz
+    end
+    cd ${TOP}
+end
+'_EOF_'
+    # << happy emacs
+    chmod +x mkMafs.csh
+    ./mkMafs.csh
+
 ###########################################################################
 # HUMAN (hg18) PROTEINS TRACK (DONE 2009-02-10 braney )
     # bash  if not using bash shell already
 
@@ -473,4 +633,5 @@
 # blastHg18KG 10.373%, sgdGene 72.883%, both 10.320%, cover 99.49%, enrich 1.37x
 
     rm -rf blastOut
 #end tblastn
+###########################################################################