src/hg/makeDb/doc/hg19.txt 1.3

1.3 2009/03/13 19:14:27 hiram
cpgIslands done, genbank run done, blatServers assigned
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/makeDb/doc/hg19.txt	6 Mar 2009 23:27:05 -0000	1.2
+++ src/hg/makeDb/doc/hg19.txt	13 Mar 2009 19:14:27 -0000	1.3
@@ -1,287 +1,436 @@
 # for emacs: -*- mode: sh; -*-
 
 # This file describes how we made the browser database on
 # NCBI build 37 (February 2009 freeze) aka:
 #	GRCh37 - Genome Reference Consortium Human Reference 37
 #	Assembly Accession: GCA_000001405.1
 
 #	"$Id$";
 
 #############################################################################
 # Download sequence (DONE - 2009-02-04 - Hiram)
     mkdir -p /hive/data/genomes/hg19/download
     cd /hive/data/genomes/hg19/download
     mkdir -p assembled_chromosomes
     wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
         --directory-prefix=assembled_chromosomes \
         -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
 ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/assembled_chromosomes
 
     mkdir -p alternate_loci
 for N in 1 2 3 4 5 6 7 8 9
 do
 wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
     --directory-prefix=alternate_loci \
         -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
 ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/ALT_REF_LOCI_${N}
 done
 
     mkdir -p unlocalized_scaffolds
     wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
         --directory-prefix=unlocalized_scaffolds \
 	    -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
 ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unlocalized_scaffolds
 
     mkdir -p unplaced_scaffolds
     wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
         --directory-prefix=unplaced_scaffolds \
 	    -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
 ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unplaced_scaffolds
 
     mkdir -p placed_scaffolds
     wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
         --directory-prefix=placed_scaffolds \
 	    -nH --ftp-user=anonymous --ftp-password=hiram@soe.ucsc.edu \
 ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/placed_scaffolds
 
     mkdir ucscChr
     cd ucscChr
     for F in ../assembled_chromosomes/FASTA/chr*.fa
 do
     C=`basename $F`
     C=${C/.fa}
     echo -n "${C} "
     H=`head -1 "${F}"`
     chrN=`echo $H | sed -e "s/.*Homo sapiens chromosome /chr/; s/, .*//"`
     A=`echo $H | sed -e "s/. Homo.*//; s/.*gb.//"`
     echo $chrN $A
     grep -v "^#" ../assembled_chromosomes/AGP/${chrN}.comp.agp \
         | sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
     echo ">${chrN}" > ${chrN}.fa
     grep -v "^>" ../assembled_chromosomes/FASTA/${chrN}.fa >> ${chrN}.fa
 done
 
     rm -f scaffolds.agp
     find ../alternate_loci -type f | grep ".agp$" | while read F
 do
     grep "^GL" $F | sed -e \
 "s/^GL000250.1/chr6_apd_hap1/" -e \
 "s/^GL000251.1/chr6_cox_hap2/" -e \
 "s/^GL000252.1/chr6_dbb_hap3/" -e \
 "s/^GL000253.1/chr6_mann_hap4/" -e \ 
 "s/^GL000254.1/chr6_mcf_hap5/" -e \
 "s/^GL000255.1/chr6_qbl_hap6/" -e \
-"s/^GL000256.1/chr6_ssto_hap6/" -e \
+"s/^GL000256.1/chr6_ssto_hap7/" -e \
 "s/^GL000257.1/chr4_ctg9_hap1/" -e \
 "s/^GL000258.1/chr17_ctg5_hap1/"
 done > scaffolds.agp
 
     find ../unlocalized_scaffolds -type f | grep ".agp$" \
 | while read F
 do
     C=`basename ${F}`   
     C=${C/.unlocalized.scaf.agp}
     grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/${C}_gl\1_random/"
 done >> scaffolds.agp
 
     find ../unplaced_scaffolds -type f | grep ".agp$" \
 | while read F
 do
     grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/chrUn_gl\1/"
 done >> scaffolds.agp
 
     rm -f scaffolds.fa
     find ../alternate_loci -type f | grep ".fa$" | while read F
 do  
     sed -e \
 "s/>.*GL000250.*/>chr6_apd_hap1/" -e \
 "s/>.*GL000251.*/>chr6_cox_hap2/" -e \
 "s/>.*GL000252.*/>chr6_dbb_hap3/" -e \
 "s/>.*GL000253.*/>chr6_mann_hap4/" -e \
 "s/>.*GL000254.*/>chr6_mcf_hap5/" -e \
 "s/>.*GL000255.*/>chr6_qbl_hap6/" -e \
 "s/>.*GL000256.*/>chr6_ssto_hap6/" -e \
 "s/>.*GL000257.*/>chr4_ctg9_hap1/" -e \
 "s/>.*GL000258.*/>chr17_ctg5_hap1/" ${F}
 done > scaffolds.fa
 
     find ../unlocalized_scaffolds -type f | grep ".fa$" | while read F
 do
     sed -e \
 "s/^>.*GL\([0-9]*\).* chromosome \([0-9]*\).*/>chr\2_gl\1_random/" ${F}
 done >> scaffolds.fa
 
     find ../unplaced_scaffolds -type f | grep ".fa$" | while read F
 do
     sed -e "s/.*\(GL[0-9]*\).*/\1/; s/GL/>chrUn_gl/" $F
 done >> scaffolds.fa
 
 ############################################################################
 ## Create database (DONE - 2009-03-04 - Hiram)
     cd /hive/data/genomes/hg19
     cat << '_EOF_' > hg19.config.ra
 # Config parameters for makeGenomeDb.pl:
 db hg19
 scientificName Homo sapiens
 commonName Human
 assemblyDate Feb. 2009
 assemblyLabel GRCh37 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
 orderKey 14
 mitoAcc NC_001807
 fastaFiles /hive/data/genomes/hg19/download/ucscChr/*.fa
 agpFiles /hive/data/genomes/hg19/download/ucscChr/*.agp
 # qualFiles /dev/null
 dbDbSpeciesDir human
 taxId	9606
 '_EOF_'
     # << happy emacs
 
     time makeGenomeDb.pl hg19.config.ra > makeGenomeDb.log 2>&1
     #	real    14m8.958s
      featureBits -countGaps hg19 gap
     #	239845127 bases of 3137161264 (7.645%) in intersection
     featureBits -noRandom -noHap -countGaps hg19 gap
     #	234344806 bases of 3095693983 (7.570%) in intersection
     #	verify featureBits is properly ignorning haps and randoms:
     egrep -v "_" chrom.sizes | awk '{sum+=$2;print sum,$0}'
     #	3095693983 chrM 16571
     #	same total as in featureBits
 
 
 ############################################################################
 # running repeat masker (DONE - 2009-03-05 - Hiram)
     screen # use screen to manage this day-long job
     mkdir /hive/data/genomes/hg19/bed/repeatMasker
     cd /hive/data/genomes/hg19/bed/repeatMasker
     time doRepeatMasker.pl -bigClusterHub=swarm -buildDir=`pwd` hg19 \
 	> do.log 2>&1
     #	real    525m23.521s
     cat faSize.rmsk.txt
     #	3137161264 bases (239850802 N's 2897310462 real 1431585691
     #	upper 1465724771 lower) in 93 sequences in 1 files
     #	%46.72 masked total, %50.59 masked real
     featureBits -countGaps hg19 rmsk
     #	1465724774 bases of 3137161264 (46.721%) in intersection
     #	this is odd, 3 bases more in featureBits than were masked ?
     #	check it out, make a bed file from the featureBits:
     featureBits -countGaps -bed=rmsk.bed hg19 rmsk
     #	went down a sequence of intersections with this idea, but could
     #	not get it resolved.  It appears there are 75 bases in the rmsk
     #	table that were not masked in the 2bit file ?
     #	Later on, realized that featureBits does not count lower case N's
     #	in the "lower" category, but only in the N's category.
 
     #	trying a non-split table:
     hgsql -e "show tables;" hg19 | grep _rmsk | while read T
 do
     hgsql -e "drop table ${T};" hg19
 done
     hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.fa.out
 bad rep range [4385, 4384] line 1348605 of hg19.fa.out 
 bad rep range [5563, 5562] line 1563988 of hg19.fa.out 
 bad rep range [4539, 4538] line 3111186 of hg19.fa.out 
     #	featureBits still reports 1465724774 bases in rmsk table
     #	cleaning the hg19.fa.out file:
     cp hg19.fa.out hg19.clean.out
     # edit hg19.clean.out and remove the three lines:
 # 1467  20.7  1.2 17.6  chr14     35056767 35056794 (72292746) +  L1ME1          LINE/L1               4385 4384 (1761) 1120962
 # 1943  23.8  5.0 12.6  chr15     65775909 65775924 (36755468) +  L1MC4          LINE/L1               5563 5562 (2480) 1299299
 # 2463  25.1  5.0 11.6  chr3      121291056 121291083 (76731347) +  L1M3           LINE/L1               4539 4538 (1608) 2589267
 
     #	reload the table
     hgsql -e "drop table rmsk;" hg19
     hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.clean.out
 
     #	try masking with this clean file:
     twoBitMask /hive/data/genomes/hg19/hg19.unmasked.2bit hg19.clean.out \
 	hg19.clean.2bit
     twoBitToFa hg19.clean.2bit stdout | faSize stdin > faSize.clean.txt
     cat faSize.clean.txt
     #	this gives the lower by 75 bases result:
     #	3137161264 bases (239850802 N's 2897310462 real 1431585763 upper
     #	1465724699 lower) in 93 sequences in 1 files
     #	%46.72 masked total, %50.59 masked real
     featureBits -countGaps hg19 rmsk
     #	1465724774 bases of 3137161264 (46.721%) in intersection
     #	is the countGaps interferring ?
     featureBits hg19 rmsk
     #	1465724774 bases of 2897316137 (50.589%) in intersection
     #	nope, lets' see what the .out file has:
     grep chr hg19.clean.out | sed -e "s/^  *//" | awk '{print $5,$6-1,$7}' \
 	| sort -k1,1 -k2,2n > hg19.clean.out.bed
     featureBits -countGaps hg19 hg19.clean.out.bed
     #	1465724774 bases of 3137161264 (46.721%) in intersection
     #	is it perhaps not masking N's ?
     twoBitToFa hg19.clean.2bit stdout | grep n | less
     #	that does find some lower case n's, find all N's:
     findMotif -strand=+ -motif=gattaca -verbose=4 hg19.clean.2bit \
 	2> findMotif.out
     grep "^#GAP" findMotif.out | sed -e "s/#GAP //" > nLocations.bed
     #	which cover:
     featureBits -countGaps hg19 nLocations.bed
     #	251299071 bases of 3137161264 (8.010%) in intersection
     #	overlapping rmsk business with these N locations:
     featureBits -countGaps hg19 hg19.clean.out.bed nLocations.bed
     #	6494740 bases of 3137161264 (0.207%) in intersection
     #	and overlapping with gap:
     featureBits -countGaps hg19 gap nLocations.bed
     #	239845127 bases of 3137161264 (7.645%) in intersection
 
 ############################################################################
 # running TRF simple repeats (DONE - 2009-03-05 - Hiram)
     screen # use screen to manage this day-long job
     mkdir /hive/data/genomes/hg19/bed/simpleRepeat
     cd /hive/data/genomes/hg19/bed/simpleRepeat
     time doSimpleRepeat.pl -bigClusterHub=pk -workhorse=hgwdev \
 	-smallClusterHub=pk -buildDir=`pwd` hg19 > do.log 2>&1
     #	real    33m25.815s
 
     twoBitMask bed/repeatMasker/hg19.clean.2bit \
 	-add bed/simpleRepeat/trfMask.bed hg19.2bit
     twoBitToFa hg19.2bit stdout | faSize stdin > faSize.hg19.2bit.txt
 # 3137161264 bases (239850802 N's 2897310462 real 1430387259 upper
 # 1466923203 lower) in 93 sequences in 1 files
 # %46.76 masked total, %50.63 masked real
 
 ############################################################################
 #	prepare cluster data (DONE - 2009-03-06 - Hiram)
     cd /hive/data/genomes/hg19
     rm /gbdb/hg19/hg19.2bit
     ln -s `pwd`/hg19.2bit /gbdb/hg19/hg19.2bit
 
     time blat hg19.2bit \
 	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
     #	Wrote 30675 overused 11-mers to 11.ooc
     #	real    3m11.302s
 
     mkdir /hive/data/staging/data/hg19
     cp -p hg19.2bit /hive/data/staging/data/hg19
     cp -p 11.ooc /hive/data/staging/data/hg19
     cp -p chrom.sizes /hive/data/staging/data/hg19
 
     mkdir separateChrs
     cd separateChrs
     grep -v "_" ../chrom.sizes | awk '{print $1}' | while read C
 do
     twoBitToFa -seq="${C}" ../hg19.2bit stdout
 done | faToTwoBit stdin hg19.chrOnly.2bit
     twoBitInfo hg19.chrOnly.2bit stdout | sort -k2,2nr > chrOnly.chrom.sizes
 
     grep "_hap" ../chrom.sizes | awk '{print $1}' | while read C
 do
     twoBitToFa -seq="${C}" ../hg19.2bit stdout
 done | faToTwoBit stdin hg19.hapOnly.2bit
     twoBitInfo hg19.hapOnly.2bit stdout | sort -k2,2nr > hapOnly.chrom.sizes
 
     grep "_" ../chrom.sizes | grep -v "_hap" | awk '{print $1}' | while read C
 do
     twoBitToFa -seq="${C}" ../hg19.2bit stdout
 done | faToTwoBit stdin hg19.scaffolds.2bit
     twoBitInfo hg19.scaffolds.2bit stdout | sort -k2,2nr > scaffolds.chrom.sizes
 
     cp -p *.2bit *.sizes /hive/data/staging/data/hg19
 
     # ask admin to sync this directory: /hive/data/staging/data/hg19/
     #	to the kluster nodes /scratch/data/hg19/
 
 ############################################################################
+# running cpgIsland business (DONE - 2009-03-06 - Hiram)
+    mkdir /hive/data/genomes/hg19/bed/cpgIsland
+    cd /hive/data/genomes/hg19/bed/cpgIsland
+    cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
+    cd hg3rdParty/cpgIslands
+    # comment out the following two lines if it compiles cleanly
+    # some day  (there were some other fixups too, adding include lines)
+    sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
+    mv tmp.c cpg_lh.c
+    make
+    cd ../../ 
+    ln -s hg3rdParty/cpgIslands/cpglh.exe
+    mkdir -p hardMaskedFa
+    cut -f1 ../../chrom.sizes | while read C
+do
+    echo ${C}
+    twoBitToFa ../../hg19.2bit:$C stdout \
+	| maskOutFa stdin hard hardMaskedFa/${C}.fa
+done
+
+    cut -f1 ../../chrom.sizes > chr.list
+    cat << '_EOF_' > template
+#LOOP
+./runOne $(root1) {check out line results/$(root1).cpg}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    cat << '_EOF_' > runOne
+#!/bin/csh -fe
+./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
+mv /scratch/tmp/$1.$$ $2
+'_EOF_'
+    # << happy emacs
+
+    gensub2 chr.list single template jobList
+    para create jobList
+    para try
+    para check ... etc
+    para time
+# Completed: 93 of 93 jobs
+# CPU time in finished jobs:        172s       2.86m     0.05h    0.00d  0.000 y
+# IO & Wait Time:                  1748s      29.14m     0.49h    0.02d  0.000 y
+# Average job time:                  21s       0.34m     0.01h    0.00d
+# Longest finished job:              34s       0.57m     0.01h    0.00d
+# Submission to last job:            83s       1.38m     0.02h    0.00d
+
+    # Transform cpglh output to bed +
+    catDir results | awk '{
+$2 = $2 - 1;
+width = $3 - $2;
+printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
+       $1, $2, $3, $5,$6, width,
+       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
+}' > cpgIsland.bed
+
+    cd /hive/data/genomes/hg19/bed/cpgIsland
+    hgLoadBed hg19 cpgIslandExt -tab \
+      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
+
+# Reading cpgIsland.bed
+# Loaded 28226 elements of size 10
+# Sorted
+# Saving bed.tab
+# Loading hg18
+
+############################################################################
+# create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram)
+    mkdir /hive/data/genomes/hg19/bed/gap
+    cd /hive/data/genomes/hg19/bed/gap
+    gapToLift hg19 hg19.unBridged.lift -bedFile=unBridged.lift.bed
+    cp -p hg19.unBridged.lift ../../jkStuff
+    cp -p hg19.unBridged.lift /hive/data/staging/data/hg19
+
+############################################################################
+# AUTO UPDATE GENBANK RUN  (DONE - 2009-03-07,13 - Hiram)
+    # align with latest genbank process.
+    cd ~/kent/src/hg/makeDb/genbank
+    cvsup
+    # edit etc/genbank.conf to add hg19 just after hg18
+
+# hg19 - GRCh37 - Genome Reference Consortium Human Reference 37
+#       Assembly Accession: GCA_000001405.1
+hg19.serverGenome = /hive/data/genomes/hg19/hg19.2bit
+hg19.clusterGenome = /scratch/data/hg19/hg19.2bit
+hg19.ooc = /scratch/data/hg19/11.ooc
+hg19.lift = /scratch/data/hg19/hg19.unBridged.lift
+# hg19.hapRegions = /hive/data/genomes/hg19/bed/haplotypePos/haplotypePos.psl
+hg19.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
+hg19.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
+hg19.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
+hg19.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
+hg19.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
+hg19.genbank.est.xeno.pslCDnaFilter   = ${finished.genbank.est.xeno.pslCDnaFilter}
+hg19.genbank.est.xeno.load = yes
+hg19.refseq.mrna.xeno.load  = yes
+hg19.refseq.mrna.xeno.loadDesc = yes
+hg19.mgc = yes
+hg19.orfeome = yes
+hg19.downloadDir = hg19
+# hg19.ccds.ncbiBuild = 36.3
+# hg19.upstreamGeneTbl = refGene
+# hg19.upstreamMaf = multiz28way
+# /hive/data/genomes/hg19/bed/multiz28way/species.lst multiz44way
+# /hive/data/genomes/hg19/bed/multiz44way/species.list
+hg19.genbank.mrna.blatTargetDb = yes
+
+    cvs ci -m "Added hg19." etc/genbank.conf
+    # update /cluster/data/genbank/:
+    make etc-update
+
+    ssh genbank
+    screen		#	use a screen to manage this job
+    cd /cluster/data/genbank
+    time nice -n +19 bin/gbAlignStep -initial hg19 &
+    #	logFile: var/build/logs/2009.03.10-20:28:44.hg19.initalign.log
+    #	real    2761m13.680s
+    #	that ran on the swarm with little interference and no problems
+
+    # load database when finished
+    ssh hgwdev
+    screen	# use screen to manage this long running command
+    cd /cluster/data/genbank
+    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19 &
+    # logFile: var/dbload/hgwdev/logs/2009.03.12-21:10:02.dbload.log
+    #	real    369m11.941s
+
+    # enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
+    cd ~/kent/src/hg/makeDb/genbank
+    cvsup
+    # add hg19 to:
+        etc/align.dbs
+        etc/hgwdev.dbs
+    cvs ci -m "Added hg19 - Human - GRCh37" etc/align.dbs etc/hgwdev.dbs
+    make etc-update
+
+#########################################################################
+#  BLATSERVERS ENTRY (DONE - 2009-03-09 - Hiram)
+#	After getting a blat server assigned by the Blat Server Gods,
+    ssh hgwdev
+
+    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
+	VALUES ("hg19", "blat13", "17778", "1", "0"); \
+	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
+	VALUES ("hg19", "blat13", "17779", "0", "1");' \
+	    hgcentraltest
+    #	test it with some sequence
+
+############################################################################