src/hg/makeDb/doc/tetNig2.txt 1.2

1.2 2009/08/06 20:57:39 hiram
Through masking and Ensembl v55 genes loaded
Index: src/hg/makeDb/doc/tetNig2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/tetNig2.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 4 -r1.1 -r1.2
--- src/hg/makeDb/doc/tetNig2.txt	6 Aug 2009 16:44:35 -0000	1.1
+++ src/hg/makeDb/doc/tetNig2.txt	6 Aug 2009 20:57:39 -0000	1.2
@@ -37,6 +37,156 @@
 ##########################################################################
 # Initial browser (DONE - 2009-08-06 - Hiram)
     cd /hive/data/genomes/tetNig2
     cat << '_EOF_' > tetNig2.config.ra
+# Config parameters for makeGenomeDb.pl:
+db tetNig2
+clade vertebrate
+scientificName Tetraodon nigroviridis
+commonName Tetraodon
+assemblyDate Mar. 2007
+assemblyLabel Genoscope Tetraodon v8.0 (NCBI project 12350, CAAE01000000)
+orderKey 459
+mitoAcc NC_007176
+fastaFiles /hive/data/genomes/tetNig2/genoscope/chr*.fa.gz
+agpFiles /hive/data/genomes/tetNig2/genoscope/chr.agp
+# qualFiles none
+dbDbSpeciesDir tetraodon
+taxId 99883
 '_EOF_'
     # << happy emacs
+
+    time nice -n +19 makeGenomeDb.pl -verbose=2 -workhorse=hgwdev \
+	-noGoldGapSplit -stop=agp tetNig2.config.ra > agp.log 2>&1
+
+    time nice -n +19 makeGenomeDb.pl -verbose=2 -workhorse=hgwdev \
+	-noGoldGapSplit -continue=db -stop=db tetNig2.config.ra > db.log 2>&1
+
+    time nice -n +19 makeGenomeDb.pl -verbose=2 -workhorse=hgwdev \
+	-noGoldGapSplit -continue=dbDb tetNig2.config.ra > dbDb.log 2>&1
+
+    #	add the trackDb files to the source tree and entry to trackDb/makefile
+
+##########################################################################
+# Repeat Masker (DONE - 2009-08-06 - Hiram)
+    mkdir /hive/data/genomes/tetNig2/bed/repeatMasker
+    cd /hive/data/genomes/tetNig2/bed/repeatMasker
+    doRepeatMasker.pl -verbose=2 -workhorse=hgwdev \
+	-noSplit -buildDir=`pwd` tetNig2 > do.log 2>&1
+    cat faSize.rmsk.txt
+# 358618246 bases (56303458 N's 302314788 real 292078336 upper 10236452 lower)
+# in 27 sequences in 1 files
+# %2.85 masked total, %3.39 masked real
+
+    # since this doesn't mask very much, use windowmasker instead
+    hgsql -e "drop table rmsk;" tetNig2
+    #	this leaves the interrupted repeats track showing on genome-test
+
+########################################################################
+# Simple Repeats (DONE - 2009-08-06 - Hiram)
+    mkdir /hive/data/genomes/tetNig2/bed/simpleRepeat
+    cd /hive/data/genomes/tetNig2/bed/simpleRepeat
+    doSimpleRepeat.pl -workhorse=hgwdev \
+	-buildDir=`pwd` tetNig2 > do.log 2>&1 &
+    #	fails on the job for chrM, make an empty result:
+    touch /hive/data/genomes/tetNig2/TrfPart/009/009.lst.bed
+    doSimpleRepeat.pl -workhorse=hgwdev -continue=filter \
+	-buildDir=`pwd` tetNig2 > filter.log 2>&1 &
+    cat fb.simpleRepeat 
+    #	11549259 bases of 332311746 (3.475%) in intersection
+
+########################################################################
+# WindowMasker (DONE - 2009-08-06 - Hiram)
+    mkdir /hive/data/genomes/tetNig2/bed/windowMasker
+    cd /hive/data/genomes/tetNig2/bed/windowMasker
+    doWindowMasker.pl -workhorse=hgwdev -buildDir=`pwd` tetNig2 > do.log 2>&1
+    twoBitToFa tetNig2.wmsk.sdust.2bit stdout | faSize stdin
+    #	358618246 bases (56303458 N's 302314788 real 241249522 upper
+    #	61065266 lower) in 27 sequences in 1 files
+    #	%17.03 masked total, %20.20 masked real
+
+    #	load this initial data to get ready to clean it
+    ssh hgwdev
+    cd /hive/data/genomes/tetNig2/bed/windowMasker
+    hgLoadBed tetNig2 windowmaskerSdust windowmasker.sdust.bed.gz
+    #	Loaded 1649525 elements of size 3
+    featureBits -countGaps tetNig2 windowmaskerSdust
+    #	117367586 bases of 358618246 (32.728%) in intersection
+
+    #	eliminate the gaps from the masking
+    featureBits tetNig2 -not gap -bed=notGap.bed
+    time nice -n +19 featureBits tetNig2 windowmaskerSdust notGap.bed \
+        -bed=stdout | gzip -c > cleanWMask.bed.gz
+    #	91061086 bases of 332311746 (27.402%) in intersection
+    #	reload track to get it clean
+    hgLoadBed tetNig2 windowmaskerSdust cleanWMask.bed.gz
+    #	Loaded 1647549 elements of size 4
+    featureBits -countGaps tetNig2 windowmaskerSdust
+    #	91061086 bases of 358618246 (25.392%) in intersection
+
+    #	mask the sequence with this clean mask
+    zcat cleanWMask.bed.gz \
+	| twoBitMask ../../tetNig2.unmasked.2bit stdin \
+	    -type=.bed tetNig2.cleanWMSdust.2bit
+    twoBitToFa tetNig2.cleanWMSdust.2bit stdout | faSize stdin \
+        > tetNig2.cleanWMSdust.faSize.txt
+    cat tetNig2.cleanWMSdust.faSize.txt
+    #	358618246 bases (56303458 N's 302314788 real 241249522 upper 61065266
+    #	lower) in 27 sequences in 1 files
+    #	%17.03 masked total, %20.20 masked real
+
+########################################################################
+# MASK SEQUENCE WITH WM+TRF (DONE - 2009-08-06 - Hiram)
+    cd /hive/data/genomes/tetNig2
+    twoBitMask -add bed/windowMasker/tetNig2.cleanWMSdust.2bit \
+	bed/simpleRepeat/trfMask.bed tetNig2.2bit
+    #	safe to ignore the warnings about BED file with >=13 fields
+    twoBitToFa tetNig2.2bit stdout | faSize stdin > faSize.tetNig2.txt
+    cat faSize.tetNig2.txt
+    #	358618246 bases (56303458 N's 302314788 real 241039472 upper 61275316
+    #	lower) in 27 sequences in 1 files
+    #	%17.09 masked total, %20.27 masked real
+
+    #	create symlink to gbdb
+    ssh hgwdev
+    rm /gbdb/tetNig2/tetNig2.2bit
+    ln -s `pwd`/tetNig2.2bit /gbdb/tetNig2/tetNig2.2bit
+
+#########################################################################
+# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2009-08-06 - Hiram)
+    # Use -repMatch=130 (based on size -- for human we use 1024, and
+    # Tetraodon size is ~10.4% of human judging by gapless tetNig2 vs.
+    # hg18 genome size from featureBits):
+    #	100*302314788/2897310462 = 10.434324
+    # which scales to 1024 * 0.104 = ~107.  Bump that up a bit, to 130,
+    # to be more conservative.
+
+    cd /hive/data/genomes/tetNig2
+    blat tetNig2.2bit /dev/null /dev/null -tileSize=11 \
+      -makeOoc=jkStuff/tetNig2.11.ooc -repMatch=130
+    #	Wrote 8132 overused 11-mers to jkStuff/tetNig2.11.ooc
+
+    #	copy all of this stuff to the klusters:
+    cd /hive/data/genomes/tetNig2
+    mkdir /hive/data/staging/data/tetNig2
+    cp -p jkStuff/tetNig2.11.ooc chrom.sizes tetNig2.2bit \
+	/hive/data/staging/data/tetNig2
+
+#########################################################################
+# Ensembl genes v55 (DONE - 2009-08-06 - Hiram)
+    cd /hive/data/genomes/tetNig2
+    cat << '_EOF_' > tetNig2.ensGene.ra
+# required db variable
+db tetNig2
+# optional nameTranslation, the sed command that will transform
+#       Ensembl names to UCSC names.  With quotes just to make sure.
+nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/"
+'_EOF_'
+#  << happy emacs
+
+    doEnsGeneUpdate.pl -verbose=2 -workhorse=hgwdev \
+	-ensVersion=55 -stop=process tetNig2.ensGene.ra > tetNig2.55.log 2>&1
+    doEnsGeneUpdate.pl -verbose=2 -workhorse=hgwdev \
+	-ensVersion=55 -continue=load tetNig2.ensGene.ra >> tetNig2.55.log 2>&1
+
+    featureBits tetNig2 ensGene
+    # 31637658 bases of 332311746 (9.520%) in intersection
+#########################################################################