src/hg/makeDb/doc/hg19.txt 1.4
1.4 2009/04/02 20:58:37 hiram
venter1 chain and net, self lastz chain, chimp lastz chain and net, pushQ entry, PAR region, Gorilla lastz chain and net, lineage specific repeats
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 4 -r1.3 -r1.4
--- src/hg/makeDb/doc/hg19.txt 13 Mar 2009 19:14:27 -0000 1.3
+++ src/hg/makeDb/doc/hg19.txt 2 Apr 2009 20:58:37 -0000 1.4
@@ -433,4 +433,360 @@
hgcentraltest
# test it with some sequence
############################################################################
+# Making download files (DONE - 2009-03-13 - Hiram)
+ cd /hive/data/genomes/hg19
+ makeDownloads.pl -allowMissedTrfs -noChromRoot hg19 \
+ > downloads.log 2>&1
+############################################################################
+# Venter1 chain, net experiment (DONE - Hiram - 2009-03-15)
+doBlastzChainNet.pl `pwd`/DEF \
+ -stop=partition -bigClusterHub=swarm \
+ -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
+ -workhorse=hgwdev -fileServer=hgwdev > partition.log 2>&1
+
+doBlastzChainNet.pl `pwd`/DEF \
+ -continue=blastz -stop=blastz -bigClusterHub=swarm \
+ -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
+ -workhorse=hgwdev -fileServer=hgwdev > blastz.log 2>&1
+
+doBlastzChainNet.pl `pwd`/DEF \
+ -continue=cat -stop=net -bigClusterHub=swarm \
+ -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
+ -workhorse=hgwdev -fileServer=hgwdev > net.log 2>&1
+ # real 163m28.438s
+
+ # to load, run it in debug, then check the load script
+doBlastzChainNet.pl `pwd`/DEF \
+ -noLoadChainSplit -continue=load -stop=load -bigClusterHub=swarm \
+ -debug -smallClusterHub=swarm -chainMinScore=1000 \
+ -chainLinearGap=medium \
+ -workhorse=hgwdev -fileServer=hgwdev > load.log 2>&1
+
+ # and create a synNet for multiz, run in debug, and examine script
+ # to make sure it works correctly
+doBlastzChainNet.pl `pwd`/DEF \
+ -syntenicNet -continue=syntenicNet -stop=syntenicNet \
+ -debug -bigClusterHub=swarm \
+ -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
+ -workhorse=hgwdev -fileServer=hgwdev > synNet.log 2>&1
+ # real 31m11.216s
+
+############################################################################
+# reset position to chr6 haplotype situation
+ hgsql -e \
+'update dbDb set defaultPos="chr6:28343766-33555363" where name="hg19";' \
+ hgcentraltest
+
+############################################################################
+# Self Lastz run (WORKING - 2009-03-19 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
+ cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
+ cat << '_EOF_' > DEF
+# human vs human
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+# lastz does not like the O= and E= lines in the matrix file
+# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
+BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+# other parameters from hg18 vs venter1 lastz on advice from Webb
+BLASTZ_K=10000
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/hg19.2bit
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+SEQ1_IN_CONTIGS=0
+
+# QUERY: Human Hg19
+SEQ2_DIR=/scratch/data/hg19/hg19.2bit
+SEQ2_LEN=/scratch/data/hg19/chrom.sizes
+SEQ2_CHUNK=10000000
+SEQ2_LAP=0
+SEQ2_IN_CONTIGS=0
+
+BASE=/hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ screen # use screen to manage this long-running job
+ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
+ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
+ -workhorse=hgwdev \
+ -stop=net -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
+ # cluster difficulties, finished manually, then:
+ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
+ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
+ -continue=cat -workhorse=hgwdev \
+ -stop=net -smallClusterHub=pk -bigClusterHub=swarm > cat.log 2>&1 &
+
+ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
+ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
+ -continue=load -debug -workhorse=hgwdev \
+ -stop=load -smallClusterHub=pk -bigClusterHub=swarm > load.debug.log 2>&1 &
+ # that indicates it would do:
+ hgLoadChain -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
+ # adding -normScore
+ hgLoadChain -normScore -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
+
+############################################################################
+# Chimp Lastz run (WORKING - 2009-03-19 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
+ cd /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
+ cat << '_EOF_' > DEF
+# human vs chimp
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+# lastz does not like the O= and E= lines in the matrix file
+# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
+BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+# other parameters from panTro2 vs hg18 lastz on advice from Webb
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/hg19.2bit
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+SEQ1_IN_CONTIGS=0
+
+# QUERY: Chimp PanTro2
+SEQ2_DIR=/scratch/data/panTro2/panTro2.2bit
+SEQ2_LEN=/scratch/data/panTro2/chrom.sizes
+SEQ2_CHUNK=10000000
+SEQ2_LAP=0
+SEQ2_IN_CONTIGS=0
+
+BASE=/hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ screen # use screen to manage this long-running job
+ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
+ # real 173m22.880s
+ # cluster problems, continuing after lastz done:
+ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=cat \
+ -stop=net -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
+ > net.log 2>&1 &
+ # real 81m20.209s
+ # continuing with the load and adding syntenicNet
+ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=load \
+ -syntenicNet -noLoadChainSplit -chainMinScore=5000 \
+ -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
+ > load.log 2>&1 &
+ # real 47m17.871s
+
+############################################################################
+# Creating the pushQ entry (DONE - 2009-03-20 - Hiram)
+ mkdir /hive/data/genomes/hg19/pushQ
+ cd /hive/data/genomes/hg19/pushQ
+ makePushQSql.pl hg19 > hg19.pushQ.sql 2> make.err
+ # many complaints about the chain and net tables from the haplotype
+ # experiments, and this table:
+ # orfeomeGenes
+ # which is probably in genbank, and these usual ones:
+ # hg19 does not have seq
+ # hg19 does not have extFile
+
+############################################################################
+# Determine PAR region of X and Y (DONE - 2009-03-20 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/parRegion
+ cd /hive/data/genomes/hg19/bed/parRegion
+ awk '$5 != "N"' ../../X/chrX.agp | awk '{print $6}' | sort > chrX.cloneList
+ awk '$5 != "N"' ../../Y/chrY.agp | awk '{print $6}' | sort > chrY.cloneList
+ comm -12 chrX.cloneList chrY.cloneList > chrXY.par.clone.list
+ cat chrXY.par.clone.list \
+ | while read C; do grep "${C}" ../../X/chrX.agp; done \
+ | sort -k1,1 -k2,2n > chrX.par.region.agp
+ cat chrXY.par.clone.list \
+ | while read C; do grep "${C}" ../../Y/chrY.agp; done \
+ | sort -k1,1 -k2,2n > chrY.par.region.agp
+ awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrY.par.region.agp \
+ > chrY.par.region.bed
+ awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrX.par.region.agp \
+ > chrX.par.region.bed
+ # use those bed files in custom tracks on hg19 to verify that they
+ # are two continuous regions with only gaps between these items
+ # these location extents are: (zero relative)
+ # chrX 60000 2722842
+ # chrX 154906585 155260560
+ # chrY 10000 2649520
+ # chrY 59034049 59363566
+
+############################################################################
+# Gorilla Lastz run (WORKING - 2009-03-21 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
+ cd /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
+ cat << '_EOF_' > DEF
+# human vs gorilla
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+# lastz does not like the O= and E= lines in the matrix file
+# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
+BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+# other parameters from panTro2 vs hg18 lastz on advice from Webb
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/hg19.2bit
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=100000000
+SEQ1_LAP=10000
+SEQ1_IN_CONTIGS=0
+
+# QUERY: Gorilla gorGor1
+SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
+SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
+SEQ2_CHUNK=20000000
+SEQ2_LIMIT=300
+SEQ2_LAP=0
+SEQ2_IN_CONTIGS=0
+
+BASE=/hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ screen # use screen to manage this long-running job
+ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
+ > do.log 2>&1 &
+# XXX running
+# Sat Mar 21 22:22:18 PDT 2009
+
+############################################################################
+# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2009-04-02 - Hiram)
+ ssh pk
+ mkdir /hive/data/genomes/hg19/bed/linSpecRep
+ cd /hive/data/genomes/hg19/bed/linSpecRep
+ # create individual .out files from the master record in ../repeatMasker
+ mkdir splitOut
+ cat << '_EOF_' > split.csh
+#!/bin/csh -fe
+set C = $1
+head -3 ../repeatMasker/hg19.clean.out > splitOut/${C}.out
+grep "${C} " ../repeatMasker/hg19.clean.out >> splitOut/${C}.out
+'_EOF_'
+ # << happy emacs
+
+ cat << '_EOF_' > template
+#LOOP
+split.csh $(root1) {check out line+ splitOut/$(root1).out}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ cut -f1 ../../chrom.sizes > chrom.list
+ gensub2 chrom.list single template jobList
+ para create jobList
+ para try ... check ... push ... etc...
+# Completed: 93 of 93 jobs
+# CPU time in finished jobs: 127s 2.12m 0.04h 0.00d 0.000 y
+# IO & Wait Time: 17154s 285.90m 4.76h 0.20d 0.001 y
+# Average job time: 186s 3.10m 0.05h 0.00d
+# Longest finished job: 224s 3.73m 0.06h 0.00d
+# Submission to last job: 280s 4.67m 0.08h 0.00d
+
+ # now, we can date and process each of those .out files
+ # this really should be a single creation of notInOthers
+ # These four different ones all end up to be the same anyhow
+ # the notInMouse becomes notInOthers below and the others are removed.
+ mkdir dateRepeats
+ cd dateRepeats
+ cat << '_EOF_' > mkLSR
+#!/bin/csh -fe
+rm -f $1.out_mus-musculus_rattus_canis-familiaris_bos-taurus
+ln -s ../splitOut/$1.out .
+/scratch/data/RepeatMasker/DateRepeats \
+ $1.out -query human -comp mouse -comp rat -comp dog -comp cow
+rm $1.out
+mkdir -p ../notInMouse ../notInRat ../notInDog ../notInCow
+/cluster/bin/scripts/extractRepeats 1 $1.out_mus*-taurus \
+ > ../notInMouse/$1.out.spec
+/cluster/bin/scripts/extractRepeats 2 $1.out_mus*-taurus \
+ > ../notInRat/$1.out.spec
+/cluster/bin/scripts/extractRepeats 3 $1.out_mus*-taurus \
+ > ../notInDog/$1.out.spec
+/cluster/bin/scripts/extractRepeats 4 $1.out_mus*-taurus \
+ > ../notInCow/$1.out.spec
+'_EOF_'
+ # << happy emacs
+ chmod +x mkLSR
+
+ cat << '_EOF_' > template
+#LOOP
+./mkLSR $(path1) {check out line+ $(path1).out_mus-musculus_rattus_canis-familiaris_bos-taurus}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ gensub2 ../chrom.list single template jobList
+ para try ... check ... push ... etc...
+ para time
+# Completed: 93 of 93 jobs
+# CPU time in finished jobs: 2441s 40.69m 0.68h 0.03d 0.000 y
+# IO & Wait Time: 332s 5.53m 0.09h 0.00d 0.000 y
+# Average job time: 30s 0.50m 0.01h 0.00d
+# Longest finished job: 125s 2.08m 0.03h 0.00d
+# Submission to last job: 454s 7.57m 0.13h 0.01d
+
+ # done
+
+ # these four types of out.spec results all turn out to be identical
+ # To check identical
+ cd /hive/data/genomes/hg19/bed/linSpecRep
+ find . -name "*.out.spec" | \
+ while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
+ | sort -k1,1n | sort -t"/" -k3,3 | sed -e "s#./notIn.*/##" \
+ | sort | uniq -c | less
+ # You will see they are all a count of 4
+ # Set them up on scratch data and get to all the kluster nodes:
+ mkdir /hive/data/staging/data/hg19/lineageSpecificRepeats
+ cd notInMouse
+ rsync -a --progress ./ /hive/data/staging/data/hg19/lineageSpecificRepeats
+ cd ..
+ mv notInMouse notInOthers
+ # do not need to keep all of these
+ rm -fr notInRat notInDog notInCow
+
+ # We also need the nibs for blastz runs with lineage specific repeats
+ mkdir /hive/data/genomes/hg19/bed/nibs
+ cd /hive/data/genomes/hg19/bed/nibs
+ cut -f1 ../../chrom.sizes | while read C
+do
+ twoBitToFa -seq=${C} ../../hg19.2bit stdout \
+ | faToNib -softMask stdin ${C}.nib
+ echo "${C} done"
+done
+ mkdir /hive/data/staging/data/hg19/nib
+ rsync -a --progress ./ /hive/data/staging/data/hg19/nib
+
+ # Ask cluster-admin to sync /scratch/ filesystem to kluster nodes
+
+#############################################################################