src/hg/makeDb/doc/susScr1.txt 1.1
1.1 2009/11/16 19:40:42 hiram
Working on the genbank run
Index: src/hg/makeDb/doc/susScr1.txt
===================================================================
RCS file: src/hg/makeDb/doc/susScr1.txt
diff -N src/hg/makeDb/doc/susScr1.txt
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/susScr1.txt 16 Nov 2009 19:40:42 -0000 1.1
@@ -0,0 +1,197 @@
+# for emacs: -*- mode: sh; -*-
+
+# Sus scrofa - SGSC Sscrofa8 NCBI project 10718, CM000812
+
+#########################################################################
+# DOWNLOAD SEQUENCE (DONE - 2009-10-14 - Hiram)
+ mkdir /hive/data/genomes/susScr1
+ cd /hive/data/genomes/susScr1
+ mkdir sanger
+ cd sanger
+for F in README Sus_scrofa.Sscrofa9.53.dna.chromosome.fa.bz2 \
+ Sus_scrofa.Sscrofa9.53_repeat_coords.txt.bz2 \
+ Sus_scrofa9.agp Sus_scrofa9.pgp
+do
+ wget --timestamping \
+"ftp://ftp.sanger.ac.uk/pub/S_scrofa/assemblies/Ensembl_Sscrofa9/${F}"
+done
+
+ bunzip2 *.bz2
+ gzip Sus_scrofa.Sscrofa9.53.dna.chromosome.fa \
+ Sus_scrofa.Sscrofa9.53_repeat_coords.txt
+ grep -v "^#" Sus_scrofa9.agp > susScr1.agp
+ zcat Sus_scrofa.Sscrofa9.53.dna.chromosome.fa.gz \
+ | sed -e "s/^>/>chr/" | gzip > susScr1.fa.gz
+
+#########################################################################
+# Initial makeGenomeDb.pl (DONE - 2009-11-06 - Hiram)
+ cd /hive/data/genomes/susScr1
+ cat << '_EOF_' > susScr1.config.ra
+# Config parameters for makeGenomeDb.pl:
+db susScr1
+clade mammal
+genomeCladePriority 35
+scientificName Sus scrofa
+commonName Pig
+assemblyDate Apr. 2009
+assemblyLabel SGSC Sscrofa8 (NCBI project 10718, CM000812)
+orderKey 235
+mitoAcc NC_012095
+fastaFiles /hive/data/genomes/susScr1/sanger/susScr1.fa.gz
+agpFiles /hive/data/genomes/susScr1/sanger/susScr1.agp
+# qualFiles none
+dbDbSpeciesDir pig
+taxId 9823
+'_EOF_'
+
+ makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev susScr1.config.ra \
+ > makeGenomeDb.log 2>&1
+
+ # add the trackDb entries to the source tree
+ ln -s `pwd`/susScr1.unmasked.2bit /gbdb/susScr1/susScr1.2bit
+ # browser should function now
+
+#########################################################################
+# RepeatMasker (DONE - 2009-11-06 - Hiram)
+ mkdir /hive/data/genomes/susScr1/bed/repeatMasker
+ cd /hive/data/genomes/susScr1/bed/repeatMasker
+ doRepeatMasker.pl -buildDir=`pwd` -workhorse=hgwdev -bigClusterHub=pk \
+ -noSplit susScr1 > do.log 2>&1
+ # about 7.5 hours
+ cat faSize.rmsk.txt
+ # 2262596571 bases (31264552 N's 2231332019 real 1286238193 upper
+ # 945093826 lower) in 20 sequences in 1 files
+ # %41.77 masked total, %42.36 masked real
+
+#########################################################################
+# simpleRepeats (DONE - 2009-11-06 - Hiram)
+ mkdir /hive/data/genomes/susScr1/bed/simpleRepeat
+ cd /hive/data/genomes/susScr1/bed/simpleRepeat
+ doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev -bigClusterHub=pk \
+ -smallClusterHub=pk susScr1 > do.log 2>&1
+ cat fb.simpleRepeat
+ # 26577444 bases of 2231496571 (1.191%) in intersection
+
+ # add to the repeatMasker
+ cd /hive/data/genomes/susScr1
+ twoBitMask susScr1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed susScr1.2bit
+ twoBitToFa susScr1.2bit stdout | faSize stdin > susScr1.2bit.faSize.txt
+ cat susScr1.2bit.faSize.txt
+ # 2262596571 bases (31264552 N's 2231332019 real 1285077160 upper
+ # 946254859 lower) in 20 sequences in 1 files
+ # %41.82 masked total, %42.41 masked real
+
+########################################################################
+# Marking *all* gaps - they are not all in the AGP file
+# (DONE - 2009-11-09 - Hiram)
+ mkdir /hive/data/genomes/susScr1/bed/allGaps
+ cd /hive/data/genomes/susScr1/bed/allGaps
+ time nice -n +19 findMotif -motif=gattaca -verbose=4 \
+ -strand=+ ../../susScr1.unmasked.2bit > findMotif.txt 2>&1
+ # real 1m12.153s
+ grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
+ featureBits susScr1 -not gap -bed=notGap.bed
+ featureBits susScr1 allGaps.bed notGap.bed -bed=new.gaps.bed
+ # what is the last index in the existing gap table:
+ hgsql -N -e "select ix from gap;" susScr1 | sort -n | tail -1
+ # 27297
+ cat << '_EOF_' > mkGap.pl
+#!/usr/bin/env perl
+# Print one gap table row per interval in new.gaps.bed, continuing the ix
+# numbering from the largest ix already in the susScr1 gap table.
+use strict;
+use warnings;
+
+my $ix = `hgsql -N -e "select ix from gap;" susScr1 | sort -n | tail -1`;
+chomp $ix;
+open (my $fh, '<', "new.gaps.bed") or die "can not read new.gaps.bed: $!";
+while (my $line = <$fh>) {
+ my ($chrom, $chromStart, $chromEnd) = split('\s+', $line);
+ ++$ix;
+ printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
+ $chromEnd, $ix, $chromEnd-$chromStart;
+}
+close ($fh);
+'_EOF_'
+ # << happy emacs
+ chmod +x ./mkGap.pl
+ ./mkGap.pl > other.gap
+ hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
+ -noLoad susScr1 otherGap other.gap
+ # Loaded 96565
+ # adding this many:
+ wc -l bed.tab
+ # 96565
+ # starting with this many
+ hgsql -e "select count(*) from gap;" susScr1
+ # 100202
+ hgsql susScr1 -e 'load data local infile "bed.tab" into table gap;'
+ # result count:
+ hgsql -e "select count(*) from gap;" susScr1
+ # 196767
+ # == 100202 + 96565
+
+########################################################################
+# Create kluster run files (DONE - 2009-11-09 - Hiram)
+ cd /hive/data/genomes/susScr1
+ blat susScr1.2bit \
+ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/susScr1.11.ooc \
+ -repMatch=800
+ # Wrote 28011 overused 11-mers to jkStuff/susScr1.11.ooc
+ mkdir /hive/data/staging/data/susScr1
+ cp -p susScr1.2bit jkStuff/susScr1.11.ooc /hive/data/staging/data/susScr1
+ cp -p chrom.sizes /hive/data/staging/data/susScr1
+ gapToLift susScr1 jkStuff/nonBridged.lft -bedFile=jkStuff/nonBridged.bed
+ cp -p jkStuff/nonBridged.lft \
+ /hive/data/staging/data/susScr1/susScr1.nonBridged.lft
+
+#########################################################################
+# GENBANK AUTO UPDATE (DONE - 2009-11-09 - Hiram)
+ ssh hgwdev
+ cd ~/kent/src/hg/makeDb/genbank
+ cvsup
+ # edit etc/genbank.conf to add susScr1 just before tetNig1
+
+# susScr1 (Pig)
+susScr1.serverGenome = /hive/data/genomes/susScr1/susScr1.2bit
+susScr1.clusterGenome = /scratch/data/susScr1/susScr1.2bit
+susScr1.ooc = /scratch/data/susScr1/susScr1.11.ooc
+susScr1.lift = /scratch/data/susScr1/susScr1.contigs.lift
+susScr1.align.unplacedChroms = chr*_random
+susScr1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
+susScr1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
+susScr1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
+susScr1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
+susScr1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
+susScr1.downloadDir = susScr1
+susScr1.genbank.mrna.xeno.loadDesc = yes
+susScr1.refseq.mrna.native.load = no
+
+ cvs ci -m "Added susScr1" etc/genbank.conf
+ # update /cluster/data/genbank/:
+ make etc-update
+
+ ssh genbank
+ screen # use a screen to manage this job
+ cd /cluster/data/genbank
+ time nice -n +19 bin/gbAlignStep -initial susScr1 &
+ # logFile: var/build/logs/2009.08.10-16:42:06.susScr1.initalign.log
+ # real 578m42.777s
+
+ # load database when finished
+ ssh hgwdev
+ cd /cluster/data/genbank
+ time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad susScr1
+ # logFile: var/dbload/hgwdev/logs/2009.08.11-09:22:29.dbload.log
+ # real 31m29.282s
+
+ # enable daily alignment and update of hgwdev
+ cd ~/kent/src/hg/makeDb/genbank
+ cvsup
+ # add susScr1 to:
+ etc/align.dbs
+ etc/hgwdev.dbs
+ cvs ci -m "Added susScr1 - Sus scrofa" \
+ etc/align.dbs etc/hgwdev.dbs
+ make etc-update
+