src/hg/makeDb/doc/sacCer2.txt 1.5

1.5 2009/02/27 19:24:53 hiram
Finished with sacCer2
Index: src/hg/makeDb/doc/sacCer2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/sacCer2.txt,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 4 -r1.4 -r1.5
--- src/hg/makeDb/doc/sacCer2.txt	12 Feb 2009 19:50:19 -0000	1.4
+++ src/hg/makeDb/doc/sacCer2.txt	27 Feb 2009 19:24:53 -0000	1.5
@@ -8,8 +8,28 @@
 #######################################################################
 # Download data  (DONE - 2009-01-30 - Hiram)
     mkdir -p /hive/data/genomes/sacCer2/download
     cd /hive/data/genomes/sacCer2/download
+    TOP=/hive/data/genomes/sacCer2/download
+    for D in gene_registry literature_curation oracle_schema protein_info \
+	protein_info/hypothetical_peptides
+do
+    mkdir -p ${D}
+    cd ${D}
+    wget -l 1 --timestamping -np -nd --cut-dirs=1 -r -X "archive" \
+	"http://downloads.yeastgenome.org/${D}/"
+    rm -f index.* robots.txt
+    cd ${TOP}
+done
+
+    mkdir sgd.chromosomes
+    for C in 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt
+do
+    wget --timestamping \
+"http://downloads.yeastgenome.org/sequence/genomic_sequence/chromosomes/fasta/chr${C}.fsa" -O sgd.chromosomes/chr${C}.fsa
+done
+
     #	convert chrom names to something reasonable, they are now roman nums
     mkdir -p chromosomes
     # bash scripting follows:
 runOne() {
@@ -143,10 +163,10 @@
         | sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff
 
 #########################################################################
 # CREATING SGD-BASED KNOWN GENES AND OTHER FEATURES (DONE - 2009-02-10 - Hiram)
-    mkdir /hive/data/sacCer2/bed/sgdAnnotations
-    cd /hive/data/sacCer2/bed/sgdAnnotations
+    mkdir /hive/data/genomes/sacCer2/bed/sgdAnnotations
+    cd /hive/data/genomes/sacCer2/bed/sgdAnnotations
     #	trim the delivered S.cerevisiae.gff file to get rid of the FASTA section
     #	and fixup the chrM and 2-micron chrom names:
     awk '
 BEGIN { keepGoing = 1 }
@@ -232,8 +252,44 @@
     hgsql sacCer2 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql
     hgsql sacCer2 -e 'load data local infile "notes.txt" \
           into table sgdOtherDescription;'
 
+    ## Clean up some stray names:
+    cd /hive/data/genomes/sacCer2/bed/sgdAnnotations
+    hgsql -N -e "select name from sgdGene;" sacCer2 \
+	| sort -u > sacCer2.sgdGene.name.txt
+    hgsql -N -e "select name from sgdPep;" sacCer2 \
+	| sort -u > sacCer2.sgdPep.name.txt
+    comm -23 sacCer2.sgdPep.name.txt sacCer2.sgdGene.name.txt | while read N
+do
+    hgsql -e "delete from sgdPep where name=\"$N\";" sacCer2
+done
+
+    hgsql -N -e "select name from sgdDescription;" sacCer2 \
+        | sort -u > sacCer2.sgdDescription.name.txt
+    comm -23 sacCer2.sgdDescription.name.txt sacCer2.sgdGene.name.txt \
+	| while read N
+do
+    hgsql -e "delete from sgdDescription where name=\"${N}\";" sacCer2
+done
+
+############################################################################
+# catch up to other tables in sacCer1 - (DONE - 2009-02-24 - Hiram)
+    #	can simply transfer these tables across from sacCer1:
+    hgsqldump --all -c --tab=. sacCer1 sgdAbundance sgdLocalization sgdToPfam
+
+    #	hgLoadSqlTab doesn't like the comment characters:
+    grep -v "^--" sgdToPfam.sql \
+	| hgLoadSqlTab sacCer2 sgdToPfam stdin sgdToPfam.txt
+    grep -v "^--" sgdAbundance.sql \
+	| hgLoadSqlTab sacCer2 sgdAbundance stdin sgdAbundance.txt
+    grep -v "^--" sgdLocalization.sql \
+	| hgLoadSqlTab sacCer2 sgdLocalization stdin sgdLocalization.txt
+
+    hgsqldump --all -c --tab=. sacCer1 yeastP2P
+    grep -v "^--" yeastP2P.sql \
+	| hgLoadSqlTab sacCer2 yeastP2P stdin yeastP2P.txt
+
 ############################################################################
 # ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE - 2009-02-10 - Hiram)
     cd /hive/data/sacCer2/bed/sgdAnnotation
     grep "Swiss-Prot" ../../download/chromosomal_feature/dbxref.tab \
@@ -302,16 +358,15 @@
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sacCer2
     #	logFile: var/dbload/hgwdev/logs/2009.02.10-14:25:48.dbload.log
     #	real    18m21.436s
 
-XXXX TBD
-    # enable daily alignment and update of hgwdev
+    # enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
-    # add ce6 to:
+    # add sacCer2 to:
         etc/align.dbs
         etc/hgwdev.dbs
-    cvs ci -m "Added ce6 - C. elegans WS190" \
+    cvs ci -m "Added sacCer2 - S. cerevisiae" \
 	etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
 ############################################################################
@@ -838,9 +893,9 @@
 zcat SS/$chr.ss.gz \
     | /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons - \
 	ave.cons.mod,ave.noncons.mod --expected-lengths 75 \
 	--target-coverage 0.5 --quiet --seqname $chr --idpref $chr \
-	--viterbi ELEMENTS/$pref.bed --score --require-informative 0 > $tmpFile
+	--viterbi ELEMENTS/$chr.bed --score --require-informative 0 > $tmpFile
 gzip -c $tmpFile > POSTPROBS/$chr.pp.gz
 rm $tmpFile
 '_EOF_'
     # << happy emacs
@@ -923,4 +978,267 @@
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
+
+    #	To create the tree diagram for the details page, use this tree
+    #	definition in http://genome.ucsc.edu/cgi-bin/phyloGif
+
+((((((S._cerevisiae,S._paradoxus),S._mikatae),S._kudriavzevii),S._bayanus),S._castelli),S._kluyveri)
+
+#########################################################################
+## Annotate the sacCer2 7-way sequence with genes
+    mkdir /hive/data/genomes/sacCer2/bed/multiz7way/anno
+    cd /hive/data/genomes/sacCer2/bed/multiz7way/anno
+    mkdir genes
+    # using sgdGene
+    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from sgdGene" sacCer2 \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > genes/sacCer2.sgdGene.gz
+
+    (cat ../maf/*.maf | genePredToMafFrames sacCer2 stdin stdout \
+sacCer2 genes/sacCer2.sgdGene.gz \
+    | gzip > multiz7way.mafFrames.gz) > frames.log 2>&1
+
+    zcat multiz7way.mafFrames.gz \
+	| sort -k1,1 -k2,2n | hgLoadMafFrames sacCer2 multiz7wayFrames stdin
+
+#########################################################################
+## simpleRepeats (DONE - 2009-02-12 - Hiram)
+    mkdir /hive/data/genomes/sacCer2/bed/simpleRepeat
+    cd /hive/data/genomes/sacCer2/bed/simpleRepeat
+    doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \
+	-workhorse=hgwdev sacCer2 > do.log 2>&1
+
+#########################################################################
+## Regulatory Code (DONE - 2009-02-17 - Hiram)
+    #	liftOver from sacCer1
+    mkdir /hive/data/genomes/sacCer2/bed/transRegCode
+    cd /hive/data/genomes/sacCer2/bed/transRegCode
+    #	lifting sacCer1 data to this assembly
+    hgsql -N -e "select chrom,chromStart,chromEnd,name,score,chipEvidence,consSpecies from transRegCode" sacCer1 \
+    | liftOver -bedPlus=5 -tab stdin \
+/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
+	transRegCode.lifted.bed transRegCode.unMapped.bed
+
+    hgsql -N -e "select chrom,chromStart,chromEnd,name,tfCount,tfList,bindVals from transRegCodeProbe;" sacCer1 \
+    | liftOver -bedPlus=4 -tab stdin \
+/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
+        transRegCodeProbe.lifted.bed transRegCodeProbe.unMapped.bed
+
+
+    hgLoadBed sacCer2 transRegCode transRegCode.lifted.bed \
+        -sqlTable=$HOME/kent/src/hg/lib/transRegCode.sql
+    #	Loaded 206672 elements of size 7
+    hgLoadBed sacCer2 transRegCodeProbe transRegCodeProbe.lifted.bed \
+        -sqlTable=$HOME/kent/src/hg/lib/transRegCodeProbe.sql -tab
+    #	Loaded 6178 elements of size 7
+
+    hgsql sacCer2 < $HOME/kent/src/hg/lib/transRegCodeCondition.sql
+    hgsql sacCer2 < $HOME/kent/src/hg/lib/transRegCodeMotif.sql
+    hgsql sacCer2 < $HOME/kent/src/hg/lib/growthCondition.sql
+
+    hgsql -N -e "select * from transRegCodeCondition;" sacCer1 \
+    | hgsql sacCer2 -e \
+'load data local infile "/dev/stdin" into table transRegCodeCondition'
+
+    hgsql -N -e "select * from transRegCodeMotif;" sacCer1 \
+    | hgsql sacCer2 -e \
+'load data local infile "/dev/stdin" into table transRegCodeMotif'
+
+    hgsql -N -e "select * from growthCondition;" sacCer1 \
+    | hgsql sacCer2 -e \
+'load data local infile "/dev/stdin" into table growthCondition'
+
+#########################################################################
+# Oreganno track - (DONE - 2009-02-17 - Hiram)
+    #	liftOver from sacCer1 database
+    hgsql -N -e \
+"select chrom,chromStart,chromEnd,id,strand,name from oreganno;" sacCer1 \
+    | liftOver -bedPlus=4 -tab stdin \
+/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
+        oreganno.lifted.bed oreganno.unMapped.bed
+
+    hgsql sacCer2 < $HOME/kent/src/hg/lib/oreganno.sql
+
+    hgLoadBed -oldTable sacCer2 oreganno oreganno.lifted.bed -tab
+    #	Loaded 7302 elements of size 6
+
+    #	and load non-positional tracks from sacCer1:
+    hgsql -N -e "select * from oregannoAttr;" sacCer1 \
+	| hgLoadSqlTab -oldTable sacCer2 oregannoAttr \
+	    ~/humPhen/kent/src/hg/lib/oreganno.sql stdin
+    hgsql -N -e "select * from oregannoLink;" sacCer1 \
+	| hgLoadSqlTab -oldTable sacCer2 oregannoLink \
+	    ~/humPhen/kent/src/hg/lib/oreganno.sql stdin
+
+#########################################################################
+# Regulatory Module - (DONE - 2009-02-17 - Hiram)
+    mkdir /hive/data/genomes/sacCer2/bed/regModule
+    cd /hive/data/genomes/sacCer2/bed/regModule
+    #	liftOver data from sacCer1
+    hgsql -N -e \
+"select chrom,chromStart,chromEnd,name,score,strand from esRegUpstreamRegion;" \
+	sacCer1 | liftOver -bedPlus=6 -tab stdin \
+/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
+        esRegUpstreamRegion.lifted.bed esRegUpstreamRegion.unMapped.bed
+
+    hgsql -N -e \
+"select chrom,chromStart,chromEnd,name,score,strand,gene from esRegGeneToMotif;" \
+	sacCer1 | liftOver -bedPlus=6 -tab stdin \
+/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
+        esRegGeneToMotif.lifted.bed esRegGeneToMotif.unMapped.bed
+
+    #	I do not see instructions in sacCer1 to create these tables,
+    #	so, dump their schemas:
+    hgsqldump --all -c -d --tab=. sacCer1 esRegUpstreamRegion esRegGeneToMotif
+    #	and data for these other two:
+    hgsqldump --all -c --tab=. sacCer1 esRegGeneToModule esRegMotif
+
+    hgLoadBed sacCer2 esRegGeneToMotif -sqlTable=esRegGeneToMotif.sql -tab \
+	esRegGeneToMotif.lifted.bed
+    #	Loaded 4002 elements of size 7
+    hgLoadBed sacCer2 esRegUpstreamRegion -sqlTable=esRegUpstreamRegion.sql \
+	-tab esRegUpstreamRegion.lifted.bed
+    #	Loaded 1670 elements of size 6
+    hgsql sacCer2 < esRegMotif.sql
+    hgsql sacCer2 -e \
+'load data local infile "esRegMotif.txt" into table esRegMotif;'
+    hgsql sacCer2 < esRegGeneToModule.sql
+    hgsql sacCer2 -e \
+'load data local infile "esRegGeneToModule.txt" into table esRegGeneToModule;'
+
+#########################################################################
+# creating tables for Gene Sorter (DONE - 2009-02-17 - Hiram)
+    mkdir /hive/data/genomes/sacCer2/bed/hgNear
+    cd /hive/data/genomes/sacCer2/bed/hgNear
+    hgClusterGenes sacCer2 sgdGene sgdIsoforms sgdCanonical
+    #	Got 6550 clusters, from 6717 genes in 18 chromosomes
+
+    # Make self mapping table for expression. 
+    hgsql -N -e 'select name from sgdGene;' sacCer2 \
+         | awk '{printf("%s\t%s\n", $1, $1);}' > sgdToSgd.tab
+    hgsql sacCer2 -e 'create table sgdToSgd ( \
+          name varchar(10) not null, \
+	  value varchar(10) not null, \
+	  PRIMARY KEY(name), \
+	  UNIQUE (value));'
+    hgsql sacCer2 \
+	-e 'load data local infile "sgdToSgd.tab" into table sgdToSgd'
+
+    # Make expression similarity table. 
+    hgExpDistance sacCer2 hgFixed.yeastChoCellCycle \
+	hgFixed.yeastChoCellCycleExps choExpDistance 
+    #	Have 6259 elements in hgFixed.yeastChoCellCycle
+    #	Got 6259 unique elements in hgFixed.yeastChoCellCycle
+    #	Made choExpDistance.tab
+
+#########################################################################
+# running the blastP operation to the other genomes for the gene sorter
+#	(DONE - 2009-02-18 - Hiram)
+    mkdir /hive/data/genomes/sacCer2/bed/hgNearBlastp
+    cd /hive/data/genomes/sacCer2/bed/hgNearBlastp
+    mkdir tmp 090218
+    pepPredToFa sacCer2 sgdPep 090218/sgdPep.faa
+
+    pepPredToFa hg18 knownGenePep 090218/hg18.known.faa
+    pepPredToFa mm9 knownGenePep 090218/mm9.known.faa
+    pepPredToFa rn4 knownGenePep 090218/rn4.known.faa
+    pepPredToFa danRer5 ensPep 090218/danRer5.ensPep.faa
+    pepPredToFa dm3 flyBasePep 090218/dm3.flyBasePep.faa
+    pepPredToFa ce6 sangerPep 090218/ce6.sangerPep.faa
+
+    # sanity check, number of lines in each faa file
+
+    cd 090218
+    cat << '_EOF_' > config.ra
+# Latest Yeast vs. other Gene Sorter orgs:
+# human, mouse, rat, zebrafish, fly, worm
+
+targetGenesetPrefix known
+targetDb sacCer2
+queryDbs hg18 mm9 rn4 danRer5 dm3 ce6
+
+sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/sgdPep.faa
+hg18Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/hg18.known.faa
+mm9Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/mm9.known.faa
+rn4Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/rn4.known.faa
+danRer5Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/danRer5.ensPep.faa
+dm3Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/dm3.flyBasePep.faa
+ce6Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/ce6.sangerPep.faa
+
+buildDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218
+scratchDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/tmp
+'_EOF_'
+    # << happy emacs
+    #	takes about an hour
+    time nice -n +19 $HOME/kent/src/hg/utils/automation/doHgNearBlastp.pl \
+	config.ra > do.log 2>&1 &
+    #	real    21m32.343s
+
+    #	one name seems to have snuck in here:
+    cd /hive/data/genomes/sacCer2/bed/hgNearBlastp
+    hgsql -N -e "select query from mmBlastTab;" sacCer2 \
+	| sort -u > sacCer2.mmBlastTab.query.txt
+    hgsql -N -e "select name from sgdGene;" sacCer2 \
+	| sort -u > sacCer2.sgdGene.name.txt
+    #	the single one is:
+    comm -23 sacCer2.mmBlastTab.query.txt sacCer2.sgdGene.name.txt
+    #	YDL038C
+    #	it was the same in all of them:
+    hgsql -e "delete from mmBlastTab where query=\"YDL038C\";" sacCer2
+    hgsql -e "delete from drBlastTab where query=\"YDL038C\";" sacCer2
+    hgsql -e "delete from dmBlastTab where query=\"YDL038C\";" sacCer2
+    hgsql -e "delete from ceBlastTab where query=\"YDL038C\";" sacCer2
+
+    hgsql -N -e "select query from knownBlastTab;" sacCer2 \
+	| sort -u > sacCer2.knownBlastTab.query.txt
+    comm -23 sacCer2.knownBlastTab.query.txt sacCer2.sgdGene.name.txt \
+	| while read N
+do
+    hgsql -e "delete from knownBlastTab where query=\"${N}\";" sacCer2
+done
+
+#########################################################################
+#  creating download files and pushQ (DONE - 2009-02-24 - Hiram)
+    cd /hive/data/genomes/sacCer2
+    #	there aren't any repeats on 2micron
+    touch bed/simpleRepeat/trfMaskChrom/2micron.bed
+    #	and, there are no RM files:
+    makeDownloads.pl -ignoreRepeatMasker sacCer2
+    #	edit the README files in:
+    #	./goldenPath/bigZips/README.txt
+    #	./goldenPath/database/README.txt
+    #	./goldenPath/liftOver/README.txt
+    #	./goldenPath/chromosomes/README.txt
+
+    mkdir pushQ
+    makePushQSql.pl sacCer2 > sacCer2.pushQ.sql
+    #	one warning:
+    #	sacCer2 does not have seq
+    #	it could not identify the following tables:
+    #	2micron_est
+    #	2micron_gap
+    #	2micron_gold
+    #	2micron_intronEst
+    #	2micron_mrna
+    #	growthCondition
+    #	sgdToPfam
+    #	yeastP2P
+    scp -p sacCer2.pushQ.sql hiram@hgwbeta:/tmp
+
+    ssh hgwbeta
+    hgsql qapushq < /tmp/sacCer2.pushQ.sql
+
+#########################################################################
+#  BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
+#	After getting a blat server assigned by the Blat Server Gods,
+    ssh hgwdev
+
+    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
+	VALUES ("sacCer2", "blat10", "17792", "1", "0"); \
+	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
+	VALUES ("sacCer2", "blat10", "17793", "0", "1");' \
+	    hgcentraltest
+    #	test it with some sequence
+
+############################################################################