src/hg/makeDb/doc/hg19.txt 1.12
1.12 2009/05/13 20:51:33 hiram
Running lastz alignments
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.11
retrieving revision 1.12
diff -b -B -U 1000000 -r1.11 -r1.12
--- src/hg/makeDb/doc/hg19.txt 11 May 2009 20:50:57 -0000 1.11
+++ src/hg/makeDb/doc/hg19.txt 13 May 2009 20:51:33 -0000 1.12
@@ -1,2175 +1,2389 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 37 (February 2009 freeze) aka:
# GRCh37 - Genome Reference Consortium Human Reference 37
# Assembly Accession: GCA_000001405.1
# "$Id$";
#############################################################################
# Download sequence (DONE - 2009-02-04 - Hiram)
mkdir -p /hive/data/genomes/hg19/download
cd /hive/data/genomes/hg19/download
mkdir -p assembled_chromosomes
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=assembled_chromosomes \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/assembled_chromosomes
mkdir -p alternate_loci
for N in 1 2 3 4 5 6 7 8 9
do
wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=alternate_loci \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/ALT_REF_LOCI_${N}
done
mkdir -p unlocalized_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=unlocalized_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unlocalized_scaffolds
mkdir -p unplaced_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=unplaced_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unplaced_scaffolds
mkdir -p placed_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=placed_scaffolds \
-nH --ftp-user=anonymous --ftp-password=hiram@soe.ucsc.edu \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/placed_scaffolds
mkdir ucscChr
cd ucscChr
for F in ../assembled_chromosomes/FASTA/chr*.fa
do
C=`basename $F`
C=${C/.fa}
echo -n "${C} "
H=`head -1 "${F}"`
chrN=`echo $H | sed -e "s/.*Homo sapiens chromosome /chr/; s/, .*//"`
A=`echo $H | sed -e "s/. Homo.*//; s/.*gb.//"`
echo $chrN $A
grep -v "^#" ../assembled_chromosomes/AGP/${chrN}.comp.agp \
| sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
echo ">${chrN}" > ${chrN}.fa
grep -v "^>" ../assembled_chromosomes/FASTA/${chrN}.fa >> ${chrN}.fa
done
rm -f scaffolds.agp
find ../alternate_loci -type f | grep ".agp$" | while read F
do
grep "^GL" $F | sed -e \
"s/^GL000250.1/chr6_apd_hap1/" -e \
"s/^GL000251.1/chr6_cox_hap2/" -e \
"s/^GL000252.1/chr6_dbb_hap3/" -e \
"s/^GL000253.1/chr6_mann_hap4/" -e \
"s/^GL000254.1/chr6_mcf_hap5/" -e \
"s/^GL000255.1/chr6_qbl_hap6/" -e \
"s/^GL000256.1/chr6_ssto_hap7/" -e \
"s/^GL000257.1/chr4_ctg9_hap1/" -e \
"s/^GL000258.1/chr17_ctg5_hap1/"
done > scaffolds.agp
find ../unlocalized_scaffolds -type f | grep ".agp$" \
| while read F
do
C=`basename ${F}`
C=${C/.unlocalized.scaf.agp}
grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/${C}_gl\1_random/"
done >> scaffolds.agp
find ../unplaced_scaffolds -type f | grep ".agp$" \
| while read F
do
grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/chrUn_gl\1/"
done >> scaffolds.agp
rm -f scaffolds.fa
find ../alternate_loci -type f | grep ".fa$" | while read F
do
sed -e \
"s/>.*GL000250.*/>chr6_apd_hap1/" -e \
"s/>.*GL000251.*/>chr6_cox_hap2/" -e \
"s/>.*GL000252.*/>chr6_dbb_hap3/" -e \
"s/>.*GL000253.*/>chr6_mann_hap4/" -e \
"s/>.*GL000254.*/>chr6_mcf_hap5/" -e \
"s/>.*GL000255.*/>chr6_qbl_hap6/" -e \
"s/>.*GL000256.*/>chr6_ssto_hap6/" -e \
"s/>.*GL000257.*/>chr4_ctg9_hap1/" -e \
"s/>.*GL000258.*/>chr17_ctg5_hap1/" ${F}
done > scaffolds.fa
find ../unlocalized_scaffolds -type f | grep ".fa$" | while read F
do
sed -e \
"s/^>.*GL\([0-9]*\).* chromosome \([0-9]*\).*/>chr\2_gl\1_random/" ${F}
done >> scaffolds.fa
find ../unplaced_scaffolds -type f | grep ".fa$" | while read F
do
sed -e "s/.*\(GL[0-9]*\).*/\1/; s/GL/>chrUn_gl/" $F
done >> scaffolds.fa
############################################################################
## Create database (DONE - 2009-03-04 - Hiram)
cd /hive/data/genomes/hg19
cat << '_EOF_' > hg19.config.ra
# Config parameters for makeGenomeDb.pl:
db hg19
scientificName Homo sapiens
commonName Human
assemblyDate Feb. 2009
assemblyLabel GRCh37 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
orderKey 14
mitoAcc NC_001807
fastaFiles /hive/data/genomes/hg19/download/ucscChr/*.fa
agpFiles /hive/data/genomes/hg19/download/ucscChr/*.agp
# qualFiles /dev/null
dbDbSpeciesDir human
taxId 9606
'_EOF_'
# << happy emacs
time makeGenomeDb.pl hg19.config.ra > makeGenomeDb.log 2>&1
# real 14m8.958s
featureBits -countGaps hg19 gap
# 239845127 bases of 3137161264 (7.645%) in intersection
featureBits -noRandom -noHap -countGaps hg19 gap
# 234344806 bases of 3095693983 (7.570%) in intersection
# verify featureBits is properly ignorning haps and randoms:
egrep -v "_" chrom.sizes | awk '{sum+=$2;print sum,$0}'
# 3095693983 chrM 16571
# same total as in featureBits
# much later on, discovered that we needed a chrM definition in the
# agp files, added by hand to hg19/M/chrM.agp and hg19/hg19.agp the line:
# chrM 1 16571 1 F NC001807 1 16571 +
# the spaces there are tabs
############################################################################
# running repeat masker (DONE - 2009-03-05 - Hiram)
screen # use screen to manage this day-long job
mkdir /hive/data/genomes/hg19/bed/repeatMasker
cd /hive/data/genomes/hg19/bed/repeatMasker
time doRepeatMasker.pl -bigClusterHub=swarm -buildDir=`pwd` hg19 \
> do.log 2>&1
# real 525m23.521s
cat faSize.rmsk.txt
# 3137161264 bases (239850802 N's 2897310462 real 1431585691
# upper 1465724771 lower) in 93 sequences in 1 files
# %46.72 masked total, %50.59 masked real
featureBits -countGaps hg19 rmsk
# 1465724774 bases of 3137161264 (46.721%) in intersection
# this is odd, 3 bases more in featureBits than were masked ?
# check it out, make a bed file from the featureBits:
featureBits -countGaps -bed=rmsk.bed hg19 rmsk
# went down a sequence of intersections with this idea, but could
# not get it resolved. It appears there are 75 bases in the rmsk
# table that were not masked in the 2bit file ?
# Later on, realized that featureBits does not count lower case N's
# in the "lower" category, but only in the N's category.
# trying a non-split table:
hgsql -e "show tables;" hg19 | grep _rmsk | while read T
do
hgsql -e "drop table ${T};" hg19
done
hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.fa.out
bad rep range [4385, 4384] line 1348605 of hg19.fa.out
bad rep range [5563, 5562] line 1563988 of hg19.fa.out
bad rep range [4539, 4538] line 3111186 of hg19.fa.out
# featureBits still reports 1465724774 bases in rmsk table
# cleaning the hg19.fa.out file:
cp hg19.fa.out hg19.clean.out
# edit hg19.clean.out and remove the three lines:
# 1467 20.7 1.2 17.6 chr14 35056767 35056794 (72292746) + L1ME1 LINE/L1 4385 4384 (1761) 1120962
# 1943 23.8 5.0 12.6 chr15 65775909 65775924 (36755468) + L1MC4 LINE/L1 5563 5562 (2480) 1299299
# 2463 25.1 5.0 11.6 chr3 121291056 121291083 (76731347) + L1M3 LINE/L1 4539 4538 (1608) 2589267
# reload the table
hgsql -e "drop table rmsk;" hg19
hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.clean.out
# try masking with this clean file:
twoBitMask /hive/data/genomes/hg19/hg19.unmasked.2bit hg19.clean.out \
hg19.clean.2bit
twoBitToFa hg19.clean.2bit stdout | faSize stdin > faSize.clean.txt
cat faSize.clean.txt
# this gives the lower by 75 bases result:
# 3137161264 bases (239850802 N's 2897310462 real 1431585763 upper
# 1465724699 lower) in 93 sequences in 1 files
# %46.72 masked total, %50.59 masked real
featureBits -countGaps hg19 rmsk
# 1465724774 bases of 3137161264 (46.721%) in intersection
# is the countGaps interferring ?
featureBits hg19 rmsk
# 1465724774 bases of 2897316137 (50.589%) in intersection
# nope, lets' see what the .out file has:
grep chr hg19.clean.out | sed -e "s/^ *//" | awk '{print $5,$6-1,$7}' \
| sort -k1,1 -k2,2n > hg19.clean.out.bed
featureBits -countGaps hg19 hg19.clean.out.bed
# 1465724774 bases of 3137161264 (46.721%) in intersection
# is it perhaps not masking N's ?
twoBitToFa hg19.clean.2bit stdout | grep n | less
# that does find some lower case n's, find all N's:
findMotif -strand=+ -motif=gattaca -verbose=4 hg19.clean.2bit \
2> findMotif.out
grep "^#GAP" findMotif.out | sed -e "s/#GAP //" > nLocations.bed
# which cover:
featureBits -countGaps hg19 nLocations.bed
# 251299071 bases of 3137161264 (8.010%) in intersection
# overlapping rmsk business with these N locations:
featureBits -countGaps hg19 hg19.clean.out.bed nLocations.bed
# 6494740 bases of 3137161264 (0.207%) in intersection
# and overlapping with gap:
featureBits -countGaps hg19 gap nLocations.bed
# 239845127 bases of 3137161264 (7.645%) in intersection
############################################################################
# running TRF simple repeats (DONE - 2009-03-05 - Hiram)
screen # use screen to manage this day-long job
mkdir /hive/data/genomes/hg19/bed/simpleRepeat
cd /hive/data/genomes/hg19/bed/simpleRepeat
time doSimpleRepeat.pl -bigClusterHub=pk -workhorse=hgwdev \
-smallClusterHub=pk -buildDir=`pwd` hg19 > do.log 2>&1
# real 33m25.815s
twoBitMask bed/repeatMasker/hg19.clean.2bit \
-add bed/simpleRepeat/trfMask.bed hg19.2bit
twoBitToFa hg19.2bit stdout | faSize stdin > faSize.hg19.2bit.txt
# 3137161264 bases (239850802 N's 2897310462 real 1430387259 upper
# 1466923203 lower) in 93 sequences in 1 files
# %46.76 masked total, %50.63 masked real
############################################################################
# prepare cluster data (DONE - 2009-03-06 - Hiram)
cd /hive/data/genomes/hg19
rm /gbdb/hg19/hg19.2bit
ln -s `pwd`/hg19.2bit /gbdb/hg19/hg19.2bit
time blat hg19.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 30675 overused 11-mers to 11.ooc
# real 3m11.302s
mkdir /hive/data/staging/data/hg19
cp -p hg19.2bit /hive/data/staging/data/hg19
cp -p 11.ooc /hive/data/staging/data/hg19
cp -p chrom.sizes /hive/data/staging/data/hg19
mkdir separateChrs
cd separateChrs
grep -v "_" ../chrom.sizes | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.chrOnly.2bit
twoBitInfo hg19.chrOnly.2bit stdout | sort -k2,2nr > chrOnly.chrom.sizes
grep "_hap" ../chrom.sizes | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.hapOnly.2bit
twoBitInfo hg19.hapOnly.2bit stdout | sort -k2,2nr > hapOnly.chrom.sizes
grep "_" ../chrom.sizes | grep -v "_hap" | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.scaffolds.2bit
twoBitInfo hg19.scaffolds.2bit stdout | sort -k2,2nr > scaffolds.chrom.sizes
cp -p *.2bit *.sizes /hive/data/staging/data/hg19
# ask admin to sync this directory: /hive/data/staging/data/hg19/
# to the kluster nodes /scratch/data/hg19/
############################################################################
# running cpgIsland business (DONE - 2009-03-06 - Hiram)
mkdir /hive/data/genomes/hg19/bed/cpgIsland
cd /hive/data/genomes/hg19/bed/cpgIsland
cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
# comment out the following two lines if it compiles cleanly
# some day (there were some other fixups too, adding include lines)
sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
mv tmp.c cpg_lh.c
make
cd ../../
ln -s hg3rdParty/cpgIslands/cpglh.exe
mkdir -p hardMaskedFa
cut -f1 ../../chrom.sizes | while read C
do
echo ${C}
twoBitToFa ../../hg19.2bit:$C stdout \
| maskOutFa stdin hard hardMaskedFa/${C}.fa
done
cut -f1 ../../chrom.sizes > chr.list
cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out line results/$(root1).cpg}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > runOne
#!/bin/csh -fe
./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
mv /scratch/tmp/$1.$$ $2
'_EOF_'
# << happy emacs
gensub2 chr.list single template jobList
para create jobList
para try
para check ... etc
para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 172s 2.86m 0.05h 0.00d 0.000 y
# IO & Wait Time: 1748s 29.14m 0.49h 0.02d 0.000 y
# Average job time: 21s 0.34m 0.01h 0.00d
# Longest finished job: 34s 0.57m 0.01h 0.00d
# Submission to last job: 83s 1.38m 0.02h 0.00d
# Transform cpglh output to bed +
catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed
cd /hive/data/genomes/hg19/bed/cpgIsland
hgLoadBed hg19 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 28226 elements of size 10
# Sorted
# Saving bed.tab
# Loading hg19
############################################################################
# create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram)
mkdir /hive/data/genomes/hg19/bed/gap
cd /hive/data/genomes/hg19/bed/gap
gapToLift hg19 hg19.unBridged.lift -bedFile=unBridged.lift.bed
cp -p hg19.unBridged.lift ../../jkStuff
cp -p hg19.unBridged.lift /hive/data/staging/data/hg19
############################################################################
# AUTO UPDATE GENBANK RUN (DONE - 2009-03-07,13 - Hiram)
# align with latest genbank process.
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add hg19 just after hg18
# hg19 - GRCh37 - Genome Reference Consortium Human Reference 37
# Assembly Accession: GCA_000001405.1
hg19.serverGenome = /hive/data/genomes/hg19/hg19.2bit
hg19.clusterGenome = /scratch/data/hg19/hg19.2bit
hg19.ooc = /scratch/data/hg19/11.ooc
hg19.lift = /scratch/data/hg19/hg19.unBridged.lift
# hg19.hapRegions = /hive/data/genomes/hg19/bed/haplotypePos/haplotypePos.psl
hg19.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter}
hg19.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter}
hg19.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
hg19.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter}
hg19.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
hg19.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter}
hg19.genbank.est.xeno.load = yes
hg19.refseq.mrna.xeno.load = yes
hg19.refseq.mrna.xeno.loadDesc = yes
hg19.mgc = yes
hg19.orfeome = yes
hg19.downloadDir = hg19
# hg19.ccds.ncbiBuild = 36.3
# hg19.upstreamGeneTbl = refGene
# hg19.upstreamMaf = multiz28way
# /hive/data/genomes/hg19/bed/multiz28way/species.lst multiz44way
# /hive/data/genomes/hg19/bed/multiz44way/species.list
hg19.genbank.mrna.blatTargetDb = yes
cvs ci -m "Added hg19." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial hg19 &
# logFile: var/build/logs/2009.03.10-20:28:44.hg19.initalign.log
# real 2761m13.680s
# that ran on the swarm with little interference and no problems
# load database when finished
ssh hgwdev
screen # use screen to manage this long running command
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19 &
# logFile: var/dbload/hgwdev/logs/2009.03.12-21:10:02.dbload.log
# real 369m11.941s
# enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add hg19 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added hg19 - Human - GRCh37" etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# BLATSERVERS ENTRY (DONE - 2009-03-09 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("hg19", "blat13", "17778", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("hg19", "blat13", "17779", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
# Making download files (DONE - 2009-03-13 - Hiram)
cd /hive/data/genomes/hg19
makeDownloads.pl -allowMissedTrfs -noChromRoot hg19 \
> downloads.log 2>&1
############################################################################
# Venter1 chain, net experiment (DONE - Hiram - 2009-03-15)
doBlastzChainNet.pl `pwd`/DEF \
-stop=partition -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > partition.log 2>&1
doBlastzChainNet.pl `pwd`/DEF \
-continue=blastz -stop=blastz -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > blastz.log 2>&1
doBlastzChainNet.pl `pwd`/DEF \
-continue=cat -stop=net -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > net.log 2>&1
real 163m28.438s
# to load, run it in debug, then check the load script
doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -continue=load -stop=load -bigClusterHub=swarm \
-debug -smallClusterHub=swarm -chainMinScore=1000 \
-chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > load.log 2>&1
# and create a synNet for multiz, run in debug, and examine script
# to make sure it works correctly
doBlastzChainNet.pl `pwd`/DEF \
-syntenicNet -continue=syntenicNet -stop=syntenicNet \
-debug -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > synNet.log 2>&1
# real 31m11.216s
############################################################################
# reset position to chr6 haplotype situation
hgsql -e \
'update dbDb set defaultPos="chr6:28343766-33555363" where name="hg19";' \
hgcentraltest
# reset to a smaller range (2009-04-24 - Brooke)
# this is the SOD1 gene, implicated in Lou Gehrig's disease.
hgsql -e \
'update dbDb set defaultPos="chr21:33,031,597-33,041,570" where name="hg19";' \
hgcentraltest
############################################################################
# Self Lastz run (DONE - 2009-03-19 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
cat << '_EOF_'
# human vs human
BLASTZ=lastz
# maximum M allowed with lastz is only 255
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Human Hg19
SEQ2_DIR=/scratch/data/hg19/hg19.2bit
SEQ2_LEN=/scratch/data/hg19/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
# cluster difficulties, finished manually, then:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-continue=cat -workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=swarm > cat.log 2>&1 &
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-continue=load -debug -workhorse=hgwdev \
-stop=load -smallClusterHub=pk -bigClusterHub=swarm > load.debug.log 2>&1 &
# that indicates it would do:
hgLoadChain -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
# adding -normScore
hgLoadChain -normScore -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
############################################################################
# Chimp Lastz run (WORKING - 2009-03-19 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
cd /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
cat << '_EOF_'
# human vs chimp
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/data/panTro2/panTro2.2bit
SEQ2_LEN=/scratch/data/panTro2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
# real 173m22.880s
# cluster problems, continuing after lastz done:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=cat \
-stop=net -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> net.log 2>&1 &
# real 81m20.209s
# continuing with the load and adding syntenicNet
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=load \
-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> load.log 2>&1 &
# real 47m17.871s
# running the swap
ssh swarm
mkdir /hive/data/genomes/panTro2/bed/blastz.hg19.swap
cd /hive/data/genomes/panTro2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-swap /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 723m41.377s
cat fb.panTro2.chainHg19Link.txt
# 2761343871 bases of 2909485072 (94.908%) in intersection
############################################################################
# Creating the pushQ entry (DONE - 2009-03-20 - Hiram)
mkdir /hive/data/genomes/hg19/pushQ
cd /hive/data/genomes/hg19/pushQ
makePushQSql.pl hg19 > hg19.pushQ.sql 2> make.err
# many complaints about the chain and net tables from the haplotype
# experiments, and this table:
# orfeomeGenes
# which is probably in genbank, and these usual ones:
# hg19 does not have seq
# hg19 does not have extFile
############################################################################
# Determine PAR region of X and Y (DONE - 2009-03-20 - Hiram)
mkdir /hive/data/genomes/hg19/bed/parRegion
cd /hive/data/genomes/hg19/bed/parRegion
awk '$5 != "N"' ../../X/chrX.agp | awk '{print $6}' | sort > chrX.cloneList
awk '$5 != "N"' ../../Y/chrY.agp | awk '{print $6}' | sort > chrY.cloneList
comm -12 chrX.cloneList chrY.cloneList > chrXY.par.clone.list
cat chrXY.par.clone.list \
| while read C; do grep "${C}" ../../X/chrX.agp; done \
| sort -k1,1 -k2,2n >> chrX.par.region.agp
cat chrXY.par.clone.list \
| while read C; do grep "${C}" ../../Y/chrY.agp; done \
| sort -k1,1 -k2,2n >> chrY.par.region.agp
awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrY.par.region.agp \
> chrY.par.region.bed
awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrX.par.region.agp \
> chrX.par.region.bed
# use those bed files in custom tracks on hg19 to verify that they
# are two continuous regions with only gaps between these items
# these location extents are: (zero relative)
# chrX 60000 2722842
# chrX 154906585 155260560
# chrY 10000 2649520
# chrY 59034049 59363566
############################################################################
# Gorilla Lastz run (WORKING - 2009-03-21 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
cd /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
cat << '_EOF_'
# human vs gorilla
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Gorilla gorGor1
SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> do.log 2>&1 &
-# XXX running
-Sat Mar 21 22:22:18 PDT 2009
+ cat fb.hg19.chainGorGor1Link.txt
+ # 1723432141 bases of 2897316137 (59.484%) in intersection
+ doRecipBest.pl -buildDir=`pwd` hg19 gorGor1 > rbest.log 2>&1
############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2009-04-02 - Hiram)
ssh pk
mkdir /hive/data/genomes/hg19/bed/linSpecRep
cd /hive/data/genomes/hg19/bed/linSpecRep
# create individual .out files from the master record in ../repeatMasker
mkdir splitOut
cat << '_EOF_' > split.csh
#!/bin/csh -fe
set C = $1
head -3 ../repeatMasker/hg19.clean.out > splitOut/${C}.out
grep "${C} " ../repeatMasker/hg19.clean.out >> splitOut/${C}.out
'_EOF_'
# << happy emacs
cat << '_EOF_' > template
#LOOP
split.csh $(root1) {check out line+ splitOut/$(root1).out}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 ../../chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try ... check ... push ... etc...
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 127s 2.12m 0.04h 0.00d 0.000 y
# IO & Wait Time: 17154s 285.90m 4.76h 0.20d 0.001 y
# Average job time: 186s 3.10m 0.05h 0.00d
# Longest finished job: 224s 3.73m 0.06h 0.00d
# Submission to last job: 280s 4.67m 0.08h 0.00d
# now, we can date and process each of those .out files
# this really should be a single creation of notInOthers
# These four different ones all end up to be the same anyhow
# the notInMouse becomes notInOthers below and the others are removed.
mkdir dateRepeats
cd dateRepeats
cat << '_EOF_' > mkLSR
#!/bin/csh -fe
rm -f $1.out_mus-musculus_rattus_canis-familiaris_bos-taurus
ln -s ../splitOut/$1.out .
/scratch/data/RepeatMasker/DateRepeats \
$1.out -query human -comp mouse -comp rat -comp dog -comp cow
rm $1.out
mkdir -p ../notInMouse ../notInRat ../notInDog ../notInCow
/cluster/bin/scripts/extractRepeats 1 $1.out_mus*-taurus \
> ../notInMouse/$1.out.spec
/cluster/bin/scripts/extractRepeats 2 $1.out_mus*-taurus \
> ../notInRat/$1.out.spec
/cluster/bin/scripts/extractRepeats 3 $1.out_mus*-taurus \
> ../notInDog/$1.out.spec
/cluster/bin/scripts/extractRepeats 4 $1.out_mus*-taurus \
> ../notInCow/$1.out.spec
'_EOF_'
# << happy emacs
chmod +x mkLSR
cat << '_EOF_' > template
#LOOP
./mkLSR $(path1) {check out line+ $(path1).out_mus-musculus_rattus_canis-familiaris_bos-taurus}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../chrom.list single template jobList
para try ... check ... push ... etc...
para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 2441s 40.69m 0.68h 0.03d 0.000 y
# IO & Wait Time: 332s 5.53m 0.09h 0.00d 0.000 y
# Average job time: 30s 0.50m 0.01h 0.00d
# Longest finished job: 125s 2.08m 0.03h 0.00d
# Submission to last job: 454s 7.57m 0.13h 0.01d
done
# these four types of out.spec results all turn out to be identical
# To check identical
cd /hive/data/genomes/hg19/bed/linSpecRep
find . -name "*.out.spec" | \
while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
| sort -k1,1n | sort -t"/" -k3,3 | sed -e "s#./notIn.*/##" \
| sort | uniq -c | less
# You will see they are all a count of 4
# Set them up on scratch data and get to all the kluster nodes:
mkdir /hive/data/staging/data/hg19/lineageSpecificRepeats
cd notInMouse
rsync -a --progress ./ /hive/data/staging/data/hg19/lineageSpecificRepeats
cd ..
mv notInMouse notInOthers
# do not need to keep all of these
rm -fr notInRat notInDog notInCow
# We also need the nibs for blastz runs with lineage specific repeats
mkdir /hive/data/genomes/hg19/bed/nibs
cd /hive/data/genomes/hg19/bed/nibs
cut -f1 ../../chrom.sizes | while read C
do
twoBitToFa -seq=${C} ../../hg19.2bit stdout \
| faToNib -softMask stdin ${C}.nib
echo "${C} done"
done
mkdir /hive/data/staging/data/hg19/nib
rsync -a --progress ./ /hive/data/staging/data/hg19/nib
# Ask cluster-admin to sync /scratch/ filesystem to kluster nodes
#############################################################################
# create gc5Base download file (DONE - 2009-04-24 - Hiram)
cd /hive/data/genomes/hg19/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19 \
/cluster/data/hg19/hg19.2bit | gzip -c > hg19.gc5Base.txt.gz
#############################################################################
# Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram)
mkdir /hive/data/genomes/hg19/bed/ctgPos
cd /hive/data/genomes/hg19/bed/ctgPos
cat << '_EOF_' > mkCtgPos.sh
AGP="/hive/data/genomes/hg19/download/assembled_chromosomes/AGP"
export AGP
for F in `(cd ${AGP}; ls chr*.agp | grep -v ".comp.agp")`
do
C=${F/.agp/}
grep "^CM" "${AGP}/${F}" | awk '$5 != "N"' | awk '
{
printf "%s\t%d\t%s\t%d\t%d\n", $6, $8-$7+1, "'${C}'", $2-1+$7-1, $2-1+$8
}
'
done
'_EOF_'
# << happy emacs
chmod +x mkCtgPos.sh
./mkCtgPos.sh > ctgPos.tab
cat << '_EOF_' > mkRanCtgPos.sh
AGP="/hive/data/genomes/hg19/download/unlocalized_scaffolds/AGP"
export AGP
for F in `(cd ${AGP}; ls chr*.agp)`
do
C=${F/.unlocalized.scaf.agp/}
c=${C/chr/}
export C c
grep "^GL" "${AGP}/${F}" | awk '$5 != "N"' | awk '
BEGIN {
ctgName=""
ctgStart=0
ctgEnd=0
chrom="'${c}'"
ctgNameLower=""
}
{
if (match(ctgName,$1)) {
ctgEnd = $3
} else {
if (length(ctgName) > 0) {
size=ctgEnd - ctgStart
printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower,
ctgStart, ctgEnd
}
ctgStart = $2 - 1
ctgEnd = $3
ctgName = $1
ctgNameLower = tolower($1)
sub(".1$","",ctgNameLower)
}
}
END {
size=ctgEnd - ctgStart
printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower,
ctgStart, ctgEnd
}
'
done
'_EOF_'
# << happy emacs
chmod +x mkRanCtgPos.sh
./mkRanCtgPos.sh >> ctgPos.tab
# fetch .sql definition from hg18
chmod 777 .
hgsqldump --all -c --tab=. hg18 ctgPos
chmod 775 .
hgsql hg19 < ctgPos.sql
hgsql -e 'load data local infile "ctgPos.tab" into table ctgPos;' hg19
#############################################################################
# CLONE ENDS - first step for BACEND/CytoBand tracks
# (DONE - 2009-04-28 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/cloneend/ncbi
cd /hive/data/genomes/hg19/bed/cloneend/ncbi
wget --timestamping \
'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_ends*.mfa.gz'
wget --timestamping \
'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_info*.txt.gz'
cd /hive/data/genomes/hg19/bed/cloneend
# seems like the *.mfa files were split just for convenience
# concatenate
for F in ncbi/*.mfa.gz
do
zcat "${F}"
echo "${F}" 1>&2
done | gzip > all.mfa.gz
# that 1>&2 echos to stderr so you can see the file name and not
# interfere with the pipe stdout output to gzip
# Convert the title line of the all.mfa file
zcat all.mfa.gz \
| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#\.[0-9]|.*##" \
| gzip > cloneEnds.fa.gz
zcat all.mfa | ./convert.pl | gzip > cloneEnds.fa.gz
# make sure nothing got broken:
faSize all.mfa.gz
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
faSize cloneEnds.fa.gz
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
# identical numbers
# you can also carefully check the names:
zcat all.mfa.gz | grep "^>" | awk -F'|' '{print $4}' \
| sed -e "s/\.[0-9]$//" | sort > mfa.names
# should be the same as:
zcat cloneEnds.fa.gz | grep "^>" | sed -e "s/>//" | sort > clone.names
# concatenate the text files, too
bash
for F in ncbi/*.txt.gz
do
zcat "${F}"
echo "${F}" 1>&2
done | gzip > all.txt.gz
# generate cloneEndPairs.txt and cloneEndSingles.txt
zcat all.txt.gz >all.txt
$HOME/kent/src/hg/utils/cloneEndParse.pl all.txt
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 302264 pairs and 203094 singles
# examined all the clone names and all the bac end names in these two
# files and compared with business from all.txt to make sure we properly
# classified all of them correctly. We had 833,173 clone sequences,
# and 501,135 bac end names
# faSplit does not function correctly if given a .gz source file
# AND, we need the unzipped file for sequence loading below
gunzip cloneEnds.fa.gz
# split
mkdir splitdir
cd splitdir
faSplit sequence ../cloneEnds.fa 100 cloneEnds
# Check to ensure no breakage:
cat *.fa | faSize stdin
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
# same numbers as before
# load sequences
ssh hgwdev
mkdir /gbdb/hg19/cloneend
cd /gbdb/hg19/cloneend
ln -s /hive/data/genomes/hg19/bed/cloneend/cloneEnds.fa .
cd /tmp
hgLoadSeq hg19 /gbdb/hg19/cloneend/cloneEnds.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hg19/cloneend/cloneEnds.fa
# 833173 sequences
# Updating seq table
# Advisory lock has been released
# All done
##############################################################################
# BACEND SEQUENCE ALIGNMENTS (WORKING - 2009-04-28 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/bacends/run.blat
cd /hive/data/genomes/hg19/bed/bacends/run.blat
# going to run separate runs for the golden path sequence vs. the
# randoms, haplotypes, chrUn and chrM
partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://; s/:/./" > hg19.list
ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
> bacEnds.list
ssh swarm
cd /hive/data/genomes/hg19/bed/bacends/run.blat
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(path2) {check out line+ psl/$(root1)/$(file1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set target = $1
set query = $2
set result = $3
set partSpec = `echo $target | sed -e "s/\./:/"`
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set dir = $result:h
set chr = `echo $target | sed -e "s/\..*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
# echo $tmpFile
# echo "chr: $chr $start $end -> size: $chrSize, range: $range"
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p $dir
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
rm -f $result
liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
gensub2 hg19.list bacEnds.list template jobList
para create jobList
# 62034 jobs in batch
# these jobs run quickly, limit them to 250 at a time
para try, check, -maxJob=250 push, etc ...
# Completed: 62034 of 62034 jobs
# CPU time in finished jobs: 506023s 8433.72m 140.56h 5.86d 0.016 y
# IO & Wait Time: 175853s 2930.88m 48.85h 2.04d 0.006 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest finished job: 752s 12.53m 0.21h 0.01d
# Submission to last job: 3533s 58.88m 0.98h 0.04d
# combine the alignments
time pslSort dirs raw.psl temp psl/chr*
# 62034 files in 24 dirs
# Got 62034 files 249 files per mid file
# real 81m2.820s
# -rw-rw-r-- 1 13410334441 Apr 29 12:00 raw.psl
# cleanup
rmdir temp
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
# real 5m55.990s
# Processed 106254032 alignments
# -rw-rw-r-- 1 372734361 Apr 29 12:56 bacEnds.psl
wc -l bacEnds.psl
# 2852977 bacEnds.psl
time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
all_bacends bacEnds
# Reading pair file
# Reading psl file
# Creating Pairs
# Writing to files
# real 0m18.851s
# this creates the files:
# -rw-rw-r-- 1 21178741 Apr 29 13:00 bacEnds.pairs
# -rw-rw-r-- 1 5250873 Apr 29 13:00 bacEnds.orphan
# -rw-rw-r-- 1 738045 Apr 29 13:00 bacEnds.short
# -rw-rw-r-- 1 463560 Apr 29 13:00 bacEnds.slop
# -rw-rw-r-- 1 146369 Apr 29 13:00 bacEnds.mismatch
# -rw-rw-r-- 1 3528 Apr 29 13:00 bacEnds.long
# filter and sort
awk '$5 >= 300' bacEnds.pairs | sort -k1,1 -k2,2n > bacEndPairs.bed
awk '$5 >= 300' bacEnds.slop bacEnds.short bacEnds.long \
bacEnds.mismatch bacEnds.orphan | sort -k1,1 -k2,2n > bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
> bacEndPairs.load.psl
############################################################################
# BACEND Randoms SEQUENCE ALIGNMENTS (WORKING - 2009-04-28 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/bacends/run.randoms
cd /hive/data/genomes/hg19/bed/bacends/run.randoms
# this separate run for the randoms, haplotypes, chrUn and chrM
partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
| egrep "random|_hap|chrUn" \
| sed -e "s/.*2bit://; s/:/./" > random.list
cat tParts/*.lst | sed -e "s/.*2bit://; s/:/./" >> random.list
ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
> bacEnds.list
ssh swarm
cd /hive/data/genomes/hg19/bed/bacends/run.randoms
gensub2 random.list bacEnds.list ../run.blat/template jobList
# very similar runOne.csh script as above, but it doesn't need to do
# the lift
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set target = $1
set query = $2
set result = $3
set partSpec = `echo $target | sed -e "s/\./:/"`
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set dir = $result:h
set chr = `echo $target | sed -e "s/\..*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
# echo $tmpFile
# echo "chr: $chr $start $end -> size: $chrSize, range: $range"
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p $dir
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
rm -f $result
mv $tmpFile.psl $result
echo rm -f $tmpFile.lift
'_EOF_'
# << happy emacs
# these jobs run fast, do not let too many of them run
para -maxJob=100 try...check...push
para time
# Completed: 6762 of 6762 jobs
# CPU time in finished jobs: 20357s 339.29m 5.65h 0.24d 0.001 y
# IO & Wait Time: 17839s 297.31m 4.96h 0.21d 0.001 y
# Average job time: 6s 0.09m 0.00h 0.00d
# Longest finished job: 261s 4.35m 0.07h 0.00d
# Submission to last job: 508s 8.47m 0.14h 0.01d
time pslSort dirs raw.psl temp psl/chr*
# 6762 files in 69 dirs
# Got 6762 files 82 files per mid file
# real 6m37.177s
# 37044 files in 98 dirs
# Got 37044 files 192 files per mid file
# real 32m24.804s
# -rw-rw-r-- 1 6487445210 Feb 2 21:08 raw.psl
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl randomEnds.psl randomReps.psr > pslReps.out 2>&1 &
# real 0m5.761s
# Processed 1254273 alignments
# cleanup
rmdir temp
wc -l randomEnds.psl
# 367567 randomEnds.psl
time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose randomEnds.psl \
/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
all_bacends bacEnds
# Reading pair file
# Reading psl file
# Creating Pairs
# Writing to files
# real 0m11.221s
# this creates the files:
# -rw-rw-r-- 1 0 Apr 29 14:53 bacEnds.slop
# -rw-rw-r-- 1 0 Apr 29 14:53 bacEnds.short
# -rw-rw-r-- 1 0 Apr 29 14:53 bacEnds.mismatch
# -rw-rw-r-- 1 0 Apr 29 14:53 bacEnds.long
# -rw-rw-r-- 1 141836 Apr 29 14:53 bacEnds.pairs
# -rw-rw-r-- 1 649907 Apr 29 14:53 bacEnds.orphan
##############################################################################
# BacEnds track - both results loaded together (DONE - 2009-04-29 - Hiram)
ssh hgwdev
cd /hive/data/genomes/hg19/bed/bacends
# filter and sort
awk '$5 >= 300' run.blat/bacEnds.pairs run.randoms/bacEnds.pairs \
| sort -k1,1 -k2,2n > bacEndPairs.bed
awk '$5 >= 300' run.blat/bacEnds.slop run.blat/bacEnds.short \
run.blat/bacEnds.long run.blat/bacEnds.mismatch \
run.blat/bacEnds.orphan run.randoms/bacEnds.slop \
run.randoms/bacEnds.short run.randoms/bacEnds.long \
run.randoms/bacEnds.mismatch run.randoms/bacEnds.orphan \
| sort -k1,1 -k2,2n > bacEndPairsBad.bed
head -5 run.blat/bacEnds.psl > bacEnds.psl
headRest 5 run.blat/bacEnds.psl > t.psl
headRest 5 run.randoms/randomEnds.psl >> t.psl
sort -k14,14 -k16,16n t.psl >> bacEnds.psl
extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
> bacEnds.load.psl
# load them into the database
ssh hgwdev
cd /hive/data/genomes/hg19/bed/bacends
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $4}' bacEndPairs.bed | grep " "
awk '{print $5}' bacEndPairs.bed | sort | uniq -c
# result should be the scores, no extraneous strings:
# 156984 1000
# 195 300
# 316 375
# 297 500
# 1476 750
# edit the file and fix it if it has a bad name.
hgLoadBed -notItemRgb hg19 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 208922 elements of size 11
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -notItemRgb hg19 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 79004 elements of size 11
#hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg19 -table=all_bacends bacEnds.load.psl
# one complaint, there appears to be a bogus insert count in one
# of the blat results:
# < 585 797 67 0 3 2 -63 9 79188 + AQ743980 852 42 846 chr19_gl000208_random 92689 4045 84100 11 14,124,84,496,53,6,20,28,28,10,4, 42,56,180,200,696,750,756,776,804,832,842, 4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
Became:
# > 585 797 67 0 3 2 0 9 79188 + AQ743980 852 42 846 chr19_gl000208_random 92689 4045 84100 11 14,124,84,496,53,6,20,28,28,10,4, 42,56,180,200,696,750,756,776,804,832,842, 4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
hgsql -N -e "select count(*) from all_bacends;" hg19
# 2289275
hgsql -N -e "select count(*) from all_bacends;" hg18
# 1727387
hgsql -N -e "select count(*) from all_bacends;" hg17
# 1729146
nice featureBits hg19 all_bacends
# 230917362 bases of 2897316137 (7.970%) in intersection
nice featureBits hg18 all_bacends
# 227770876 bases of 2881515245 (7.905%) in intersectio
nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection
nice featureBits hg19 bacEndPairs
# 236889607 bases of 2897316137 (8.176%) in intersection
nice featureBits hg18 bacEndPairs
# 162690030 bases of 2881515245 (5.646%) in intersection
nice featureBits hg17 bacEndPairs
# 162099487 bases of 2866216770 (5.656%) in intersection
nice featureBits hg19 bacEndPairsBad
# 38344094 bases of 2897316137 (1.323%) in intersection
nice featureBits hg18 bacEndPairsBad
# 37326990 bases of 2881515245 (1.295%) in intersection
nice featureBits hg17 bacEndPairsBad
# 37437558 bases of 2866216770 (1.306%) in intersection
############################################################################
# STS MARKERS (DONE - 2009-04-30 - 2009-05-06 - Hiram)
mkdir /hive/data/outside/ncbi/sts.2009-04
cd /hive/data/outside/ncbi
ln -s sts.2009-04 sts.11
cd /hive/data/outside/ncbi/sts.2009-04
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# these items are copied in from the previous builds
cp -p /cluster/data/ncbi/sts.10/all.STS.fa ./all.STS.fa.prev
cp -p /cluster/data/ncbi/sts.10/stsInfo2.bed ./stsInfo2.bed.prev
# edit stsInfo2.bed.prev for a
# manual fixup of error that is in the hg18 bed file, replace
# the line for AFM067XA9 to fix bogus long list of aliases to be:
# 22788^IAFM067XA9^I1^IZ66598^I1^IGDB:1221611,^I5^I067XA9,GDB:1221611,W202,Z66598,SWSS2303^I69047^I0^I^ITCTTGGGGTTTAATTGCTTT^ICTTTGCCACAATCTTACACA^I149^IHomo sapiens^I1^I2^I6453,6454,^I0^I^I^I^I0^I0^I^I^I0^I0^IAFM067XA9^Ichr7^I145^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0
# as taken directly out of the hg18.stsInfo2 table which was fixed
# by Bob and Archana
# Convert the title line of the dbSTS.fa file
# Verify that column 3 only contains gb emb dbj
grep "^>" dbSTS.fa | awk -F'|' '{print $3}' | sort | uniq -c
# 39124 dbj
# 57375 emb
# 1212541 gb
# if that is true, this sed will work:
cat dbSTS.fa \
| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#^>gi.[0-9]*.dbj.#>#; s#\.[0-9]|.*##" \
> UniSTS.convert.fa
# get accessions
grep ">" UniSTS.convert.fa | sed -e "s/^>//" | sort > UniSTS.acc
# head and tail that to ensure names are reasonable, odd names would
# show up at the beginning or end
wc -l UniSTS.acc
# 1309040 UniSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
UniSTS.sts UniSTS.aliases UniSTS.convert.fa new
# verify the number of aliases is reasonable:
awk '{print $3}' new.alias | sort | uniq -c | sort -rn | less
# 50 D7S831
# 34 CHLC.GATA2B06.465
# 24 CHLC.GATA11E11
# 23 AFM276ZF5
# 23 AFM273YH9
# 22 SHGC-133043
# ... etc ...
# verify there are no unusually long or short lines:
awk '{printf "%d\n", length($0)}' new.info | sort -n | head -3
# 143
# 144
# 144
awk '{printf "%d\n", length($0)}' new.info | sort -n | tail -3
# 552
# 553
# 644
# check for null in the new files:
grep -i null new.*
# if the new files look good, they can become the set to use:
mv new.info stsInfo2.bed
mv new.primers all.primers
mv new.alias stsAlias.bed
mv new.fa all.STS.fa
# get list of all STS id's in the fasta file
sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id
wc -l all.STS.id
# 100520 total sequences
# in hg18 this was: 93698 total sequences
$HOME/kent/src/hg/stsMarkers/convertPrimerToFA all.primers > all.primers.fa
# check that fasta file for unusual length sequences:
faSize all.primers.fa
# 97815329 bases (83677626 N's 14137703 real 14137703 upper 0 lower) in 317592 sequences in 1 files
# Total size: mean 308.0 sd 279.3 min 40 (dbSTS_144) max 30000 (dbSTS_156892) median 244
# Copy stsInfo2.bed and stsAlias.bed to data directory becuase
# these will be loaded into the database later
mkdir -p /hive/data/genomes/hg19/bed/sts
cp -p stsInfo2.bed /hive/data/genomes/hg19/bed/sts/
cp -p stsAlias.bed /hive/data/genomes/hg19/bed/sts/
# Create sts sequence alignments
mkdir /hive/data/genomes/hg19/bed/sts/split
faSplit sequence all.STS.fa 100 /hive/data/genomes/hg19/bed/sts/split/sts
ssh swarm
mkdir /hive/data/genomes/hg19/bed/sts/run
cd /hive/data/genomes/hg19/bed/sts/run
# going to run separate runs for the golden path sequence vs. the
# randoms, haplotypes, chrUn and chrM
# 40,000,000 chunck sizes, 20,000 overlap
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
ls -1S ../split > sts.list
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec \
../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
# rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
gensub2 hg19.list sts.list template jobList
# these jobs run quickly, allow only 100 at a time
para -maxJob=100 create jobList
# 8367 jobs in batch
para try ... check ... push ... etc
# Completed: 8366 of 8366 jobs
# CPU time in finished jobs: 89744s 1495.74m 24.93h 1.04d 0.003 y
# IO & Wait Time: 25467s 424.44m 7.07h 0.29d 0.001 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest finished job: 53s 0.88m 0.01h 0.00d
# Submission to last job: 1592s 26.53m 0.44h 0.02d
# and, run the randoms as a separate run:
mkdir /hive/data/genomes/hg19/bed/sts/run.randoms
cd /hive/data/genomes/hg19/bed/sts/run.randoms
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep "tParts|random|_hap|chrUn"
cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
ls -1S ../split > sts.list
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec \
../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
/bin/rm -f $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
gensub2 hg19.list sts.list template jobList
# these jobs run quickly, allow only 100 at a time
para -maxJob=100 create jobList
# 6486 jobs in batch
para try ... check ... push ... etc
# Completed: 6486 of 6486 jobs
# CPU time in finished jobs: 2206s 36.77m 0.61h 0.03d 0.000 y
# IO & Wait Time: 16505s 275.08m 4.58h 0.19d 0.001 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest finished job: 21s 0.35m 0.01h 0.00d
# Submission to last job: 601s 10.02m 0.17h 0.01d
# Compile sts sequence results
ssh hgwdev
cd /hive/data/genomes/hg19/bed/sts/run
time pslSort dirs raw.psl temp psl/chr*
# 8366 files in 89 dirs
# Got 8366 files 91 files per mid file
# real 8m50.714s
# -rw-rw-r-- 1 810438277 May 1 11:45 raw.psl
cd /hive/data/genomes/hg19/bed/sts/run.randoms
time pslSort dirs raw.psl temp psl/chr*
# 6486 files in 69 dirs
# Got 6486 files 81 files per mid file
# real 1m42.120s
# -rw-rw-r-- 1 18378188 May 1 11:52 raw.psl
rmdir temp
cd /hive/data/genomes/hg19/bed/sts
cat run*/raw.psl | egrep -v "^$|^psLayout|^match|^ |^-" \
| pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \
stsMarkers.psl /dev/null
# Processed 7412166 alignments
# -rw-rw-r-- 1 12031760 May 1 11:57 stsMarkers.psl
$HOME/kent/src/hg/stsMarkers/extractPslInfo -h stsMarkers.psl
# creates stsMarkers.psl.initial
# -rw-rw-r-- 1 4485053 May 1 12:06 stsMarkers.psl.initial
wc -l stsMarkers.psl.initial
# 101338 stsMarkers.psl.initial
# this command needs a chrom_names file to work correctly with this
# new style of layout for hg19:
cd /hive/data/genomes/hg19
cut -f1 chrom.sizes | sed -e "s/chr//" > chrom_names
cd /hive/data/genomes/hg19/bed/sts
$HOME/kent/src/hg/stsMarkers/findAccession.pl -agp stsMarkers.psl.initial \
/cluster/data/hg19
wc -l stsMarkers.psl.initial.acc
# 101338 stsMarkers.psl.initial.acc
sort -k4,4n stsMarkers.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
# 96472 stsMarkers.found
# out of 100520 total sequences from:
wc -l /hive/data/outside/ncbi/sts.2009-04/all.STS.id
# There are lots of duplicates:
wc -l stsMarkers.final
# 101338 stsMarkers.final
# And a lot of them are just completely haywire:
awk '$3-$2 < 1001' stsMarkers.final | wc -l
# 98382
# filter out markers that are too long
awk '$3-$2 < 1001' stsMarkers.final > stsMarkers.1K.size.filtered
# alignment of primers
ssh swarm
cd /hive/data/outside/ncbi/sts.2009-04
awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
all.primers > all.primers.ispcr
mkdir primerAlign
cd primerAlign
mkdir split
cd split
split -l 5000 ../../all.primers.ispcr primer_
ls > ../primer.list
cd ..
# we need a 10.ooc file for this business
time blat /scratch/data/hg19/hg19.2bit \
/dev/null /dev/null -tileSize=10 -makeOoc=10.ooc -repMatch=1024
# Wrote 146902 overused 10-mers to 10.ooc
# real 19m16.758s
# separate runs for whole genome vs. randoms
mkdir run
cd run
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
-ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list ../primer.list template jobList
para create jobList
# 5696 jobs in batch
para try ... check ... push ... etc
# Completed: 5696 of 5696 jobs
# CPU time in finished jobs: 203899s 3398.32m 56.64h 2.36d 0.006 y
# IO & Wait Time: 22049s 367.48m 6.12h 0.26d 0.001 y
# Average job time: 40s 0.66m 0.01h 0.00d
# Longest finished job: 5314s 88.57m 1.48h 0.06d
# Submission to last job: 5418s 90.30m 1.50h 0.06d
# Estimated complete: 0s 0.00m 0.00h 0.00d
# sort and filter the results
cd psl
pslSort dirs raw.psl temp chr*
# 5696 files in 89 dirs
# Got 5696 files 75 files per mid file
# -rw-rw-r-- 1 456802973 May 4 13:32 raw.psl
cd ..
mkdir filter
pslQuickFilter -minMatch=26 -maxMismatch=5 \
-maxTinsert=5000 -verbose psl/ filter/
# -rw-rw-r-- 1 50302564 May 4 13:35 raw.psl
# And, for the randoms
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
cat tParts/* > hg19.list
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
-ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
'_EOF_'
# << happy emacs
chmod +x runOne.csh
# can not use line+ check here, many of them are empty
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list ../primer.list template jobList
# they run quickly, limit to 100
para -maxJob=100 create jobList
para try ... check ... push ... etc
# Completed: 4416 of 4416 jobs
# CPU time in finished jobs: 1746s 29.09m 0.48h 0.02d 0.000 y
# IO & Wait Time: 11407s 190.12m 3.17h 0.13d 0.000 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest finished job: 8s 0.13m 0.00h 0.00d
# Submission to last job: 147s 2.45m 0.04h 0.00d
# sort and filter the results
cd psl
pslSort dirs raw.psl temp chr*
# 4416 files in 69 dirs
# Got 4416 files 66 files per mid file
rmdir temp
# -rw-rw-r-- 1 9066053 May 4 13:31 raw.psl
# putting the two runs together
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
ln -s ../run/filter/raw.psl run.psl
ln -s ../runRandoms/filter/raw.psl runRandoms.psl
# -rw-rw-r-- 1 50302564 May 4 13:35 run.psl
# -rw-rw-r-- 1 825973 May 4 13:35 runRandoms.psl
cd ..
pslSort dirs primers.psl temp psl
# 2 files in 1 dirs
# Got 2 files 1 files per mid file
# -rw-rw-r-- 1 51128110 May 4 13:39 primers.psl
wc -l primers.psl
# 448107 primers.psl
rmdir temp
pslFilterPrimers primers.psl ../all.primers primers.filter.psl
# creates primers.filter.unlifted.psl.notfound.primers
wc -l primers*
# 237962 primers.filter.psl
# 97191 primers.filter.psl.notfound.primers
# see if ePCR can find some of these notfound
ssh swarm
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
mkdir split
cd split
split -l 5000 ../../primers.filter.psl.notfound.primers primers_
cd ..
ls -1S split > primers.lst
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| grep -v tParts | sed -e "s/.*2bit://;" > hg19.list
cat tParts/* | sed -e "s/.*2bit://;" >> hg19.list
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p epcr/$partSpec
/bin/rm -f $tmpFile.psl
twoBitToFa /scratch/data/hg19/hg19.2bit:$partSpec $tmpFile.fa
/cluster/bin/scripts/runEpcr64 $primer $tmpFile.fa $tmpFile.epcr
/bin/rm -f $result
/bin/mv $tmpFile.epcr $result
rm -f $tmpFile.fa $tmpFile.lift $tmpFile.psl $tmpFile.*
'_EOF_'
# << happy emacs
chmod +x runOne.csh
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line epcr/$(file1)/$(root2).epcr}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list primers.lst template jobList
para create jobList
# 3160 jobs
para try ... check ... push ... etc ...
# Completed: 3160 of 3160 jobs
# CPU time in finished jobs: 86253s 1437.54m 23.96h 1.00d 0.003 y
# IO & Wait Time: 11196s 186.61m 3.11h 0.13d 0.000 y
# Average job time: 31s 0.51m 0.01h 0.00d
# Longest finished job: 89s 1.48m 0.02h 0.00d
# Submission to last job: 237s 3.95m 0.07h 0.00d
find ./epcr -type f | xargs cat > all.epcr
wc -l all.epcr
# 797286 all.epcr
# convert the coordinates from the partitionSequence.pl to a lift file
awk '{print $1}' all.epcr | sort -u > hg19.partSpec.txt
$HOME/kent/src/hg/stsMarkers/liftFromSpec.pl hg19 hg19.partSpec.txt \
> all.epcr.lift
cat all.epcr | sed -e "s/\.\./ /; s/ */\t/g" \
| liftUp -type=.bed stdout all.epcr.lift error stdin \
| awk '
{
printf "%s %d..%d %d %d\n", $1, $2, $3, $4, $5
}
' > all.epcr.lifted
pslFilterPrimers -epcr=all.epcr.lifted -verbose=1 ../primers.psl \
/cluster/home/hiram/bin/x86_64/pslFilterPrimers -epcr=all.epcr.lifted \
-verbose=1 ../primers.psl ../../all.primers epcr.primers.psl
# this took a long time, many hours
# -rw-rw-r-- 1 2785254 May 5 17:28 epcr.not.found
# -rw-rw-r-- 1 27343510 May 5 17:28 epcr.primers.psl
# -rw-rw-r-- 1 1616885 May 5 17:28 epcr.primers.psl.notfound.primers
time ./epcrToHgPsl.pl epcr.not.found ../../all.primers \
time $HOME/kent/src/hg/stsMarkers/epcrToPsl epcr.not.found \
../../all.primers /hive/data/genomes/hg19
# real 69m38.444s
# -rw-rw-r-- 1 0 May 6 14:18 epcr.not.found.nomatch
# -rw-rw-r-- 1 8369138 May 6 15:26 epcr.not.found.psl
# combining everything together now
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign
sort -u primers.filter.psl epcr/epcr.primers.psl epcr/epcr.not.found.psl \
| sort -k15,15 -k17,17n > primers.final.psl
wc -l primers.final.psl
# 310705 primers.final.psl
time $HOME/kent/src/hg/stsMarkers/fixPrimersQueryGaps.pl \
../all.primers primers.final.psl > primers.final.fix.psl
# real 0m19.580s
wc -l primers.final.fix.psl
# 310705 primers.final.fix.psl
# Extract relevant info, make alignments unique, and create final file to
# be merged with full sequence alignments
$HOME/kent/src/hg/stsMarkers/extractPslInfo -h primers.final.fix.psl
# real 0m15.303s
# -rw-rw-r-- 1 15660447 May 6 15:44 primers.final.fix.psl.initial
wc -l primers.final.fix.psl.initial
# 308210 primers.final.fix.psl.initial
$HOME/kent/src/hg/stsMarkers/findAccession.pl -agp \
primers.final.fix.psl.initial /hive/data/genomes/hg19
wc -l primers.final.fix.psl.initial.acc
# 308210 primers.final.fix.psl.initial.acc
$HOME/kent/src/hg/stsMarkers/getStsId ../stsInfo2.bed \
primers.final.fix.psl.initial.acc | sort -k 4n > primers.final
wc -l primers.final
# 308210 primers.final
# There doesn't appear to be any use for this primers.ids list
# except for curiosity. Check the head and tail of this list to
# verify no garbage is in here. There should just be numbers.
awk '{print $4}' primers.final | sort -n | uniq > primers.ids
wc -l primers.ids
# 290961 primers.ids
# Merge primer and sequence files to create final bed file
# Merge (combineSeqPrimerPos) takes about an hour to run
cd /hive/data/genomes/hg19/bed/sts
time $HOME/kent/src/hg/stsMarkers/combineSeqPrimerPos stsMarkers.final \
/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final
# real 0m12.310s
# -rw-rw-r-- 1 15222346 May 6 15:55 stsMarkers_pos.rdb
wc -l stsMarkers_pos.rdb
# 315308 stsMarkers_pos.rdb
time /cluster/bin/scripts/createSTSbed \
/hive/data/outside/ncbi/sts.2009-04/stsInfo2.bed \
stsMarkers_pos.rdb > stsMap.bed
# real 0m31.886s
# -rw-rw-r-- 1 38244880 May 6 16:25 stsMap.bed
wc -l stsMap.bed
# 305914 stsMap.bed
# Set up sequence files
ssh hgwdev
mkdir /gbdb/hg19/sts.11/
ln -s /hive/data/outside/ncbi/sts.11/all.STS.fa \
/gbdb/hg19/sts.11/all.STS.fa
ln -s /hive/data/outside/ncbi/sts.11/all.primers.fa \
/gbdb/hg19/sts.11/all.primers.fa
# Load all files
cd /hive/data/genomes/hg19/bed/sts
hgLoadSeq hg19 /gbdb/hg19/sts.11/all.STS.fa /gbdb/hg19/sts.11/all.primers.fa
# Creating seq.tab file
# Adding /gbdb/hg19/sts.11/all.STS.fa
# 100520 sequences
# Adding /gbdb/hg19/sts.11/all.primers.fa
# 317592 sequences
# Updating seq table
# Advisory lock has been released
# All done
hgsql hg19 < $HOME/kent/src/hg/lib/stsInfo2.sql
hgsql hg19 < $HOME/kent/src/hg/lib/stsAlias.sql
# these files already exist here from previous operations
# cp -p /hive/data/outside/ncbi/sts.11/{stsInfo2.bed,stsAlias.bed} .
hgsql hg19 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgsql hg19 -e 'load data local infile "stsAlias.bed" into table stsAlias'
# a couple minutes for each load above
# filter the stsMap.bed to eliminate items longer than 5,000 bases,
# takes out about 850:
awk '$3-$2 < 5001' stsMap.bed | sort -k1,1 -k2,2n \
> stsMap.filtered.5000.bed
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/stsMap.sql hg19 stsMap \
stsMap.filtered.5000.bed
# Loaded 305064 elements of size 28
ln -s \
/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final.fix.psl \
primers.psl
hgLoadPsl -nobin -table=all_sts_primer hg19 primers.psl
hgLoadPsl -nobin -table=all_sts_seq hg19 stsMarkers.psl
##############################################################################
# FISH CLONES (WORKING - 2009-04-29 - Hiram)
# The STS Marker and BAC End Pairs tracks must be completed prior to
# creating this track.
mkdir /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
cd /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
# Download information from NCBI
# point browser at:
# http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Sequence tag:" to "placed on contig"
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as
# /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
chmod 664 /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
# Unfortunately the format of this hbrc file has changed since
# last time. The columns have been rearranged, and one important
# column is missing, the contig information. So, let's see if we
# can recover the original format by putting this together with
# some other things we have here.
$HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
/hive/data/genomes/hg19/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
2> dbg
# the seq_clone.pmd file was obtained via email from Wonhee Jang
# jang at ncbi.nlm.nih.gov - I have asked for clarification where
# such a file can be fetched without resorting to email.
# Get current clone/accession information
wget --timestamping http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial Fish Clones bed file
ssh kkstore02
mkdir /hive/data/genomes/hg19/bed/fishClones
cd /hive/data/genomes/hg19/bed/fishClones
# Copy previous sts info from fhcrc
cp -p /hive/data/genomes/hg18/bed/fishClones/fhcrc.sts .
# This fhcrc.sts listing doesn't change. It is merely a listing
# of aliases that remain in effect.
# Create cl_acc_gi_len file form cloneend information:
grep -v "^#" /hive/data/genomes/hg19/bed/cloneend/all.txt \
| awk '{gsub(".[0-9]*$", "", $2);
printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len
hgsql -N \
-e "select chrom,chromStart,chromEnd,contig from ctgPos;" hg19 \
| sort -k1,1 -k2,2n > ctgPos.bed
hgsql -N \
-e "select chrom,chromStart,chromEnd,frag,0,strand from gold;" hg19 \
| sort -k1,1 -k2,2n > gold.bed
hgsql -N \
-e "select tName,tStart,tEnd,qName,0,strand from all_bacends;" hg19 \
| sort -k1,1 -k2,2n > all_bacends.bed
hgsql -N \
-e "select chrom,chromStart,chromEnd,name,score,strand from bacEndPairs;" hg19 \
| sort -k1,1 -k2,2n > bacEndPairs.bed
ssh hgwdev
# have to be on hgwdev for this since it is going to read from the
# database. Had to work on this program to get it past what is
# evidently a bad entry in hbrc.fixed where columns of information
# are missing for one clone in particular
time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg19 \
/hive/data/outside/ncbi/fishClones/fishClones.2009-04/fixed.hbrc.txt \
/hive/data/outside/ncbi/fishClones/fishClones.2009-04/clac.out \
./cl_acc_gi_len \
/hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl \
fishClones
# real 2m4.708s
# Reading Fish Clones file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/hbrc.fixed
# reading fishInfo file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# Reading Clone/Acc (clac.out) file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/clac.out
# Reading BAC Ends file ./cl_acc_gi_len
# Reading BAC Ends psl file /hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl
# Reading additional STS Marker links fhcrc.sts
# Determining good positions
# findClonePos: determining positions of fish clones
# Writing output file
# ERROR: at line # 170, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# ERROR: at line # 171, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# Load the track
ssh hgwdev
cd /hive/data/genomes/hg19/bed/fishClones
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
hg19 fishClones fishClones.bed
# Loaded 9461 elements of size 16
##############################################################################
# UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
mkdir /hive/data/genomes/hg19/ensembl
cd /hive/data/genomes/hg19/ensembl
wget --timestamping \
'ftp://ftp.ensembl.org/pub/pre/homo_sapiens/GRCh37/dna/*'
# do not need the repeat masker sequence (although it would be
# interesting to measure to see how it compares)
rm -f *.dna_rm.*
# fortunately we have the same sizes as Ensembl for everything
# (except the haplotypes) and the sizes are unique for each sequence
# so we can relate the names via their sizes
mkdir /hive/data/genomes/hg19/bed/ucscToEnsembl
cd /hive/data/genomes/hg19/bed/ucscToEnsembl
# the toplevel file is a duplicate of everything else
ls /hive/data/genomes/hg19/ensembl/*.fa.gz | grep -v toplevel \
| while read F
do
zcat "${F}"
done | faCount stdin > faCount.txt
cat << '_EOF_' > relateUcscEnsembl.pl
#!/usr/bin/env perl
use strict;
use warnings;
my %ucscChrs; # key is size, value is UCSC chr name
open (FH,"<../../chrom.sizes") or die "can not read ../../chrom.sizes";
while (my $line = <FH>) {
chomp $line;
my ($chr, $size) = split('\s+', $line);
die "'$line\n'duplicate size in ../chrom.sizes" if (exists($ucscChrs{$size})
);
$ucscChrs{$size} = $chr;
}
close (FH);
my %ensemblChrs; # key is size, value is Ensembl chr name
open (FH,"<faCount.txt") or die "can not read faCount.txt";
while (my $line = <FH>) {
next if ($line =~ m/#/);
next if ($line =~ m/total/);
chomp $line;
my ($chr, $size, $rest) = split('\s+', $line, 3);
die "'$line\n'duplicate size in faCount.txt" if (exists($ensemblChrs{$size})
);
$ensemblChrs{$size} = $chr;
}
close (FH);
my %usedUcscChrs;
my %usedEnsemblChrs;
my %ensemblTranslate; # key is Ensembl name, value is UCSC size
foreach my $size (keys %ucscChrs) {
if (exists($ensemblChrs{$size})) {
$usedUcscChrs{$size} = $ucscChrs{$size};
$usedEnsemblChrs{$size} = $ensemblChrs{$size};
printf "%s\t%s\t%d\n", $ucscChrs{$size}, $ensemblChrs{$size}, $size;
} else {
my $ucscName = $ucscChrs{$size};
my $ensemblName = "unknown";
if ($ucscName =~ m/^chr6/) {
$ucscName =~ s/_hap.//;
$ucscName =~ s/chr6_/chr6_mhc_/;
$ensemblName = "HS" . uc($ucscName);
} elsif ($ucscName =~ m/^chr17_/ || $ucscName =~ m/^chr4_/) {
$ucscName =~ s/_.*/_1/;
$ensemblName = "HS" . uc($ucscName);
} elsif ($ucscName =~ m/^chrM/) {
print "# no translation for chrM\n";
} else {
die "unknown UCSC chr name: $ucscName";
}
printf "# ucsc $ucscChrs{$size} -> $ensemblName\n";
$ensemblTranslate{$ensemblName} = $size;
}
}
foreach my $size (keys %ensemblChrs) {
if (!exists($usedEnsemblChrs{$size})) {
my $ensemblName = $ensemblChrs{$size};
if (! exists($ensemblTranslate{$ensemblName})) {
die "can not translate Ensembl name $ensemblName";
} else {
my $ucscSize = $ensemblTranslate{$ensemblName};
printf "%s\t%s\t%d\t%d\n", $ucscChrs{$ucscSize}, $ensemblChrs{$size}
, $ucscSize, $size;
}
}
}
printf "chrM\tMT\n";
'_EOF_'
# << happy emacs
chmod +x relateUcscEnsembl.pl
./relateUcscEnsembl.pl 2>&1 | grep -v "^#" \
| awk '{printf "%s\t%s\n", $1, $2}' | sort > ucscToEnsembl.tab
cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
ucsc varchar(255) not null, # UCSC chromosome name
ensembl varchar(255) not null, # Ensembl chromosome name
#Indices
PRIMARY KEY(ucsc(21))
);
'_EOF_'
hgsql hg19 < ucscToEnsembl.sql
hgsql hg19 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'
awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \
> ensemblLift.tab
cat << '_EOF_' > ensemblLift.sql
# UCSC offset to Ensembl coordinates
CREATE TABLE ensemblLift (
chrom varchar(255) not null, # Ensembl chromosome name
offset int unsigned not null, # offset to add to UCSC position
#Indices
PRIMARY KEY(chrom(15))
);
'_EOF_'
hgsql hg19 < ensemblLift.sql
hgsql hg19 \
-e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift'
##############################################################################
# BLASTZ MOUSE Mm9 (DONE - 2009-05-11 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMm9.2009-05-11
cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-11
cat << '_EOF_' > DEF
# human vs mouse
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0
# QUERY: Mouse Mm9
SEQ2_DIR=/scratch/data/mm9/nib
SEQ2_SMSK=/scratch/data/mm9/notInOthers
SEQ2_LEN=/scratch/data/mm9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/hive/data/genomes/hg19/bed/lastzMm9.2009-05-11
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
+ -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
-XXX - running Mon May 11 13:40:46 PDT 2009
+ # fixed up some bugs in the doBlastzChainNet.pl script, finished
+ # the lastz run manually, then continuing:
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -continue=cat \
+ -noLoadChainSplit \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
+ cat fb.hg19.chainMm9Link.txt
+ # 1022723573 bases of 2897316137 (35.299%) in intersection
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -continue=syntenicNet -syntenicNet \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 &
#########################################################################
# BLASTZ Dog CanFam2 (DONE - 2009-05-11 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-11
cd /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-11
cat << '_EOF_' > DEF
# human vs dog
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0
# QUERY: Dog CanFam2
SEQ2_DIR=/scratch/data/canFam2/nib
SEQ2_LEN=/scratch/data/canFam2/chrom.sizes
SEQ2_SMSK=/scratch/scratch/data/canFam2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=20000000
SEQ2_LAP=10000
BASE=/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-11
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
- -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -noLoadChainSplit \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
-XXX - running Mon May 11 13:40:46 PDT 2009
+ # fixed up some bugs in the doBlastzChainNet.pl script, finished
+ # the lastz run manually, then continuing:
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -continue=cat \
+ -noLoadChainSplit \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
+ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
+ # fixup some data missing problems on encodek, then continuing
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -continue=chainMerge \
+ -noLoadChainSplit \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
+ -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 &
+ # real 91m9.723s
+ cat fb.hg19.chainCanFam2Link.txt
+ # 1532071680 bases of 2897316137 (52.879%) in intersection
+
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -continue=syntenicNet -syntenicNet \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
+ -chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 &
+XXX - running Wed May 13 12:51:14 PDT 2009
+
+#########################################################################
+# BLASTZ Chicken GalGal3 (WORKING - 2009-05-13 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
+ cd /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
+
+ cat << '_EOF_' > DEF
+# human vs chicken
+# Specific settings for chicken (per Webb email to Brian Raney)
+BLASTZ_H=2000
+BLASTZ_Y=3400
+BLASTZ_L=10000
+BLASTZ_K=2200
+BLASTZ_Q=/scratch/data/blastz/HoxD55.q
+BLASTZ_ABRIDGE_REPEATS=1
+
+# TARGET: Human hg19
+SEQ1_DIR=/scratch/data/hg19/nib
+SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+
+# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
+SEQ2_DIR=/scratch/data/galGal3/nib
+SEQ2_LEN=/scratch/data/galGal3/chrom.sizes
+SEQ2_SMSK=/scratch/data/galGal3/linSpecRep
+SEQ2_CHUNK=200000000
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -syntenicNet \
+ -noLoadChainSplit \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
+ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
+XXX - running Wed May 13 10:21:42 PDT 2009
#########################################################################
+# BLASTZ Macaca Mulatta RheMac2 (WORKING - 2009-05-13 - Hiram)
+
+ mkdir /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
+ cd /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
+
+ cat << '_EOF_' > DEF
+# human vs macaca mulatta
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+# other parameters from panTro2 vs hg18 lastz on advice from Webb
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/hg19.2bit
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=0
+SEQ1_IN_CONTIGS=0
+
+# QUERY: Macaca Mulatta RheMac2
+SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit
+SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes
+SEQ2_CHUNK=20000000
+SEQ2_LAP=10000
+SEQ2_IN_CONTIGS=0
+
+BASE=/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -syntenicNet \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ > do.log 2>&1 &
+XXX - running Wed May 13 10:41:20 PDT 2009
+
+#########################################################################
+# BLASTZ Rat Rn4 (WORKING - 2009-05-13 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
+ cd /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
+
+ cat << '_EOF_' > DEF
+# human vs rat
+BLASTZ_ABRIDGE_REPEATS=1
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/nib
+SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=0
+
+# QUERY: Rat Rn4
+SEQ2_DIR=/scratch/data/rn4/nib
+SEQ2_SMSK=/scratch/data/rn4/linSpecRep.notInHuman
+SEQ2_LEN=/scratch/data/rn4/chrom.sizes
+SEQ2_CHUNK=10000000
+SEQ2_LAP=10000
+
+BASE=/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ # establish a screen to control this job
+ screen
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -syntenicNet -noLoadChainSplit \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
+
+##############################################################################
+# BLASTZ Rat Rn4 (WORKING - 2009-05-13 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
+ cd /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
+
+ cat << '_EOF_' > DEF
+# human vs orangutan
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+# other parameters from panTro2 vs hg18 lastz on advice from Webb
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/hg19.2bit
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=0
+SEQ1_IN_CONTIGS=0
+
+# QUERY: Orangutan PonAbe1
+SEQ2_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
+SEQ2_LEN=/scratch/data/ponAbe2/chrom.sizes
+SEQ2_CHUNK=20000000
+SEQ2_LAP=10000
+SEQ2_IN_CONTIGS=0
+
+BASE=/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ # establish a screen to control this job
+ screen
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -syntenicNet \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
+ > do.log 2>&1 &
+XXX - running Wed May 13 12:44:33 PDT 2009
+
+##############################################################################