src/hg/makeDb/doc/hg19.txt 1.23
1.23 2009/06/02 21:43:00 hiram
the swap lastz are done for those genomes that are public on the RR
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/makeDb/doc/hg19.txt 2 Jun 2009 18:35:57 -0000 1.22
+++ src/hg/makeDb/doc/hg19.txt 2 Jun 2009 21:43:00 -0000 1.23
@@ -1,4058 +1,4044 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 37 (February 2009 freeze) aka:
# GRCh37 - Genome Reference Consortium Human Reference 37
# Assembly Accession: GCA_000001405.1
# "$Id$";
#############################################################################
# Download sequence (DONE - 2009-02-04 - Hiram)
mkdir -p /hive/data/genomes/hg19/download
cd /hive/data/genomes/hg19/download
mkdir -p assembled_chromosomes
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=assembled_chromosomes \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/assembled_chromosomes
mkdir -p alternate_loci
for N in 1 2 3 4 5 6 7 8 9
do
wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=alternate_loci \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/ALT_REF_LOCI_${N}
done
mkdir -p unlocalized_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=unlocalized_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unlocalized_scaffolds
mkdir -p unplaced_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=unplaced_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unplaced_scaffolds
mkdir -p placed_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=placed_scaffolds \
-nH --ftp-user=anonymous --ftp-password=hiram@soe.ucsc.edu \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/placed_scaffolds
mkdir ucscChr
cd ucscChr
for F in ../assembled_chromosomes/FASTA/chr*.fa
do
C=`basename $F`
C=${C/.fa}
echo -n "${C} "
H=`head -1 "${F}"`
chrN=`echo $H | sed -e "s/.*Homo sapiens chromosome /chr/; s/, .*//"`
A=`echo $H | sed -e "s/. Homo.*//; s/.*gb.//"`
echo $chrN $A
grep -v "^#" ../assembled_chromosomes/AGP/${chrN}.comp.agp \
| sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
echo ">${chrN}" > ${chrN}.fa
grep -v "^>" ../assembled_chromosomes/FASTA/${chrN}.fa >> ${chrN}.fa
done
rm -f scaffolds.agp
find ../alternate_loci -type f | grep ".agp$" | while read F
do
grep "^GL" $F | sed -e \
"s/^GL000250.1/chr6_apd_hap1/" -e \
"s/^GL000251.1/chr6_cox_hap2/" -e \
"s/^GL000252.1/chr6_dbb_hap3/" -e \
"s/^GL000253.1/chr6_mann_hap4/" -e \
"s/^GL000254.1/chr6_mcf_hap5/" -e \
"s/^GL000255.1/chr6_qbl_hap6/" -e \
"s/^GL000256.1/chr6_ssto_hap7/" -e \
"s/^GL000257.1/chr4_ctg9_hap1/" -e \
"s/^GL000258.1/chr17_ctg5_hap1/"
done > scaffolds.agp
find ../unlocalized_scaffolds -type f | grep ".agp$" \
| while read F
do
C=`basename ${F}`
C=${C/.unlocalized.scaf.agp}
grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/${C}_gl\1_random/"
done >> scaffolds.agp
find ../unplaced_scaffolds -type f | grep ".agp$" \
| while read F
do
grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/chrUn_gl\1/"
done >> scaffolds.agp
rm -f scaffolds.fa
find ../alternate_loci -type f | grep ".fa$" | while read F
do
sed -e \
"s/>.*GL000250.*/>chr6_apd_hap1/" -e \
"s/>.*GL000251.*/>chr6_cox_hap2/" -e \
"s/>.*GL000252.*/>chr6_dbb_hap3/" -e \
"s/>.*GL000253.*/>chr6_mann_hap4/" -e \
"s/>.*GL000254.*/>chr6_mcf_hap5/" -e \
"s/>.*GL000255.*/>chr6_qbl_hap6/" -e \
"s/>.*GL000256.*/>chr6_ssto_hap6/" -e \
"s/>.*GL000257.*/>chr4_ctg9_hap1/" -e \
"s/>.*GL000258.*/>chr17_ctg5_hap1/" ${F}
done > scaffolds.fa
find ../unlocalized_scaffolds -type f | grep ".fa$" | while read F
do
sed -e \
"s/^>.*GL\([0-9]*\).* chromosome \([0-9]*\).*/>chr\2_gl\1_random/" ${F}
done >> scaffolds.fa
find ../unplaced_scaffolds -type f | grep ".fa$" | while read F
do
sed -e "s/.*\(GL[0-9]*\).*/\1/; s/GL/>chrUn_gl/" $F
done >> scaffolds.fa
############################################################################
## Create database (DONE - 2009-03-04 - Hiram)
cd /hive/data/genomes/hg19
cat << '_EOF_' > hg19.config.ra
# Config parameters for makeGenomeDb.pl:
db hg19
scientificName Homo sapiens
commonName Human
assemblyDate Feb. 2009
assemblyLabel GRCh37 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
orderKey 14
mitoAcc NC_001807
fastaFiles /hive/data/genomes/hg19/download/ucscChr/*.fa
agpFiles /hive/data/genomes/hg19/download/ucscChr/*.agp
# qualFiles /dev/null
dbDbSpeciesDir human
taxId 9606
'_EOF_'
# << happy emacs
time makeGenomeDb.pl hg19.config.ra > makeGenomeDb.log 2>&1
# real 14m8.958s
featureBits -countGaps hg19 gap
# 239845127 bases of 3137161264 (7.645%) in intersection
featureBits -noRandom -noHap -countGaps hg19 gap
# 234344806 bases of 3095693983 (7.570%) in intersection
# verify featureBits is properly ignorning haps and randoms:
egrep -v "_" chrom.sizes | awk '{sum+=$2;print sum,$0}'
# 3095693983 chrM 16571
# same total as in featureBits
# much later on, discovered that we needed a chrM definition in the
# agp files, added by hand to hg19/M/chrM.agp and hg19/hg19.agp the line:
# chrM 1 16571 1 F NC001807 1 16571 +
# the spaces there are tabs
############################################################################
# running repeat masker (DONE - 2009-03-05 - Hiram)
screen # use screen to manage this day-long job
mkdir /hive/data/genomes/hg19/bed/repeatMasker
cd /hive/data/genomes/hg19/bed/repeatMasker
time doRepeatMasker.pl -bigClusterHub=swarm -buildDir=`pwd` hg19 \
> do.log 2>&1
# real 525m23.521s
cat faSize.rmsk.txt
# 3137161264 bases (239850802 N's 2897310462 real 1431585691
# upper 1465724771 lower) in 93 sequences in 1 files
# %46.72 masked total, %50.59 masked real
featureBits -countGaps hg19 rmsk
# 1465724774 bases of 3137161264 (46.721%) in intersection
# this is odd, 3 bases more in featureBits than were masked ?
# check it out, make a bed file from the featureBits:
featureBits -countGaps -bed=rmsk.bed hg19 rmsk
# went down a sequence of intersections with this idea, but could
# not get it resolved. It appears there are 75 bases in the rmsk
# table that were not masked in the 2bit file ?
# Later on, realized that featureBits does not count lower case N's
# in the "lower" category, but only in the N's category.
# trying a non-split table:
hgsql -e "show tables;" hg19 | grep _rmsk | while read T
do
hgsql -e "drop table ${T};" hg19
done
hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.fa.out
bad rep range [4385, 4384] line 1348605 of hg19.fa.out
bad rep range [5563, 5562] line 1563988 of hg19.fa.out
bad rep range [4539, 4538] line 3111186 of hg19.fa.out
# featureBits still reports 1465724774 bases in rmsk table
# cleaning the hg19.fa.out file:
cp hg19.fa.out hg19.clean.out
# edit hg19.clean.out and remove the three lines:
# 1467 20.7 1.2 17.6 chr14 35056767 35056794 (72292746) + L1ME1 LINE/L1 4385 4384 (1761) 1120962
# 1943 23.8 5.0 12.6 chr15 65775909 65775924 (36755468) + L1MC4 LINE/L1 5563 5562 (2480) 1299299
# 2463 25.1 5.0 11.6 chr3 121291056 121291083 (76731347) + L1M3 LINE/L1 4539 4538 (1608) 2589267
# reload the table
hgsql -e "drop table rmsk;" hg19
hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.clean.out
# try masking with this clean file:
twoBitMask /hive/data/genomes/hg19/hg19.unmasked.2bit hg19.clean.out \
hg19.clean.2bit
twoBitToFa hg19.clean.2bit stdout | faSize stdin > faSize.clean.txt
cat faSize.clean.txt
# this gives the lower by 75 bases result:
# 3137161264 bases (239850802 N's 2897310462 real 1431585763 upper
# 1465724699 lower) in 93 sequences in 1 files
# %46.72 masked total, %50.59 masked real
featureBits -countGaps hg19 rmsk
# 1465724774 bases of 3137161264 (46.721%) in intersection
# is the countGaps interferring ?
featureBits hg19 rmsk
# 1465724774 bases of 2897316137 (50.589%) in intersection
# nope, lets' see what the .out file has:
grep chr hg19.clean.out | sed -e "s/^ *//" | awk '{print $5,$6-1,$7}' \
| sort -k1,1 -k2,2n > hg19.clean.out.bed
featureBits -countGaps hg19 hg19.clean.out.bed
# 1465724774 bases of 3137161264 (46.721%) in intersection
# is it perhaps not masking N's ?
twoBitToFa hg19.clean.2bit stdout | grep n | less
# that does find some lower case n's, find all N's:
findMotif -strand=+ -motif=gattaca -verbose=4 hg19.clean.2bit \
2> findMotif.out
grep "^#GAP" findMotif.out | sed -e "s/#GAP //" > nLocations.bed
# which cover:
featureBits -countGaps hg19 nLocations.bed
# 251299071 bases of 3137161264 (8.010%) in intersection
# overlapping rmsk business with these N locations:
featureBits -countGaps hg19 hg19.clean.out.bed nLocations.bed
# 6494740 bases of 3137161264 (0.207%) in intersection
# and overlapping with gap:
featureBits -countGaps hg19 gap nLocations.bed
# 239845127 bases of 3137161264 (7.645%) in intersection
############################################################################
# running TRF simple repeats (DONE - 2009-03-05 - Hiram)
screen # use screen to manage this day-long job
mkdir /hive/data/genomes/hg19/bed/simpleRepeat
cd /hive/data/genomes/hg19/bed/simpleRepeat
time doSimpleRepeat.pl -bigClusterHub=pk -workhorse=hgwdev \
-smallClusterHub=pk -buildDir=`pwd` hg19 > do.log 2>&1
# real 33m25.815s
twoBitMask bed/repeatMasker/hg19.clean.2bit \
-add bed/simpleRepeat/trfMask.bed hg19.2bit
twoBitToFa hg19.2bit stdout | faSize stdin > faSize.hg19.2bit.txt
# 3137161264 bases (239850802 N's 2897310462 real 1430387259 upper
# 1466923203 lower) in 93 sequences in 1 files
# %46.76 masked total, %50.63 masked real
############################################################################
# prepare cluster data (DONE - 2009-03-06 - Hiram)
cd /hive/data/genomes/hg19
rm /gbdb/hg19/hg19.2bit
ln -s `pwd`/hg19.2bit /gbdb/hg19/hg19.2bit
time blat hg19.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 30675 overused 11-mers to 11.ooc
# real 3m11.302s
mkdir /hive/data/staging/data/hg19
cp -p hg19.2bit /hive/data/staging/data/hg19
cp -p 11.ooc /hive/data/staging/data/hg19
cp -p chrom.sizes /hive/data/staging/data/hg19
mkdir separateChrs
cd separateChrs
grep -v "_" ../chrom.sizes | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.chrOnly.2bit
twoBitInfo hg19.chrOnly.2bit stdout | sort -k2,2nr > chrOnly.chrom.sizes
grep "_hap" ../chrom.sizes | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.hapOnly.2bit
twoBitInfo hg19.hapOnly.2bit stdout | sort -k2,2nr > hapOnly.chrom.sizes
grep "_" ../chrom.sizes | grep -v "_hap" | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.scaffolds.2bit
twoBitInfo hg19.scaffolds.2bit stdout | sort -k2,2nr > scaffolds.chrom.sizes
cp -p *.2bit *.sizes /hive/data/staging/data/hg19
# ask admin to sync this directory: /hive/data/staging/data/hg19/
# to the kluster nodes /scratch/data/hg19/
############################################################################
# running cpgIsland business (DONE - 2009-03-06 - Hiram)
mkdir /hive/data/genomes/hg19/bed/cpgIsland
cd /hive/data/genomes/hg19/bed/cpgIsland
cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
# comment out the following two lines if it compiles cleanly
# some day (there were some other fixups too, adding include lines)
sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
mv tmp.c cpg_lh.c
make
cd ../../
ln -s hg3rdParty/cpgIslands/cpglh.exe
mkdir -p hardMaskedFa
cut -f1 ../../chrom.sizes | while read C
do
echo ${C}
twoBitToFa ../../hg19.2bit:$C stdout \
| maskOutFa stdin hard hardMaskedFa/${C}.fa
done
cut -f1 ../../chrom.sizes > chr.list
cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out line results/$(root1).cpg}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > runOne
#!/bin/csh -fe
./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
mv /scratch/tmp/$1.$$ $2
'_EOF_'
# << happy emacs
gensub2 chr.list single template jobList
para create jobList
para try
para check ... etc
para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 172s 2.86m 0.05h 0.00d 0.000 y
# IO & Wait Time: 1748s 29.14m 0.49h 0.02d 0.000 y
# Average job time: 21s 0.34m 0.01h 0.00d
# Longest finished job: 34s 0.57m 0.01h 0.00d
# Submission to last job: 83s 1.38m 0.02h 0.00d
# Transform cpglh output to bed +
catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed
cd /hive/data/genomes/hg19/bed/cpgIsland
hgLoadBed hg19 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 28226 elements of size 10
# Sorted
# Saving bed.tab
# Loading hg19
############################################################################
# create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram)
mkdir /hive/data/genomes/hg19/bed/gap
cd /hive/data/genomes/hg19/bed/gap
gapToLift hg19 hg19.unBridged.lift -bedFile=unBridged.lift.bed
cp -p hg19.unBridged.lift ../../jkStuff
cp -p hg19.unBridged.lift /hive/data/staging/data/hg19
############################################################################
# AUTO UPDATE GENBANK RUN (DONE - 2009-03-07,13 - Hiram)
# align with latest genbank process.
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add hg19 just after hg18
# hg19 - GRCh37 - Genome Reference Consortium Human Reference 37
# Assembly Accession: GCA_000001405.1
hg19.serverGenome = /hive/data/genomes/hg19/hg19.2bit
hg19.clusterGenome = /scratch/data/hg19/hg19.2bit
hg19.ooc = /scratch/data/hg19/11.ooc
hg19.lift = /scratch/data/hg19/hg19.unBridged.lift
# hg19.hapRegions = /hive/data/genomes/hg19/bed/haplotypePos/haplotypePos.psl
hg19.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter}
hg19.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter}
hg19.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
hg19.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter}
hg19.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
hg19.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter}
hg19.genbank.est.xeno.load = yes
hg19.refseq.mrna.xeno.load = yes
hg19.refseq.mrna.xeno.loadDesc = yes
hg19.mgc = yes
hg19.orfeome = yes
hg19.downloadDir = hg19
# hg19.ccds.ncbiBuild = 36.3
# hg19.upstreamGeneTbl = refGene
# hg19.upstreamMaf = multiz28way
# /hive/data/genomes/hg19/bed/multiz28way/species.lst multiz44way
# /hive/data/genomes/hg19/bed/multiz44way/species.list
hg19.genbank.mrna.blatTargetDb = yes
cvs ci -m "Added hg19." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial hg19 &
# logFile: var/build/logs/2009.03.10-20:28:44.hg19.initalign.log
# real 2761m13.680s
# that ran on the swarm with little interference and no problems
# load database when finished
ssh hgwdev
screen # use screen to manage this long running command
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19 &
# logFile: var/dbload/hgwdev/logs/2009.03.12-21:10:02.dbload.log
# real 369m11.941s
# enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add hg19 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added hg19 - Human - GRCh37" etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# BLATSERVERS ENTRY (DONE - 2009-03-09 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("hg19", "blat13", "17778", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("hg19", "blat13", "17779", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
# Making download files (DONE - 2009-03-13 - Hiram)
cd /hive/data/genomes/hg19
makeDownloads.pl -allowMissedTrfs -noChromRoot hg19 \
> downloads.log 2>&1
############################################################################
# Venter1 chain, net experiment (DONE - Hiram - 2009-03-15)
doBlastzChainNet.pl `pwd`/DEF \
-stop=partition -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > partition.log 2>&1
doBlastzChainNet.pl `pwd`/DEF \
-continue=blastz -stop=blastz -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > blastz.log 2>&1
doBlastzChainNet.pl `pwd`/DEF \
-continue=cat -stop=net -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > net.log 2>&1
real 163m28.438s
# to load, run it in debug, then check the load script
doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -continue=load -stop=load -bigClusterHub=swarm \
-debug -smallClusterHub=swarm -chainMinScore=1000 \
-chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > load.log 2>&1
# and create a synNet for multiz, run in debug, and examine script
# to make sure it works correctly
doBlastzChainNet.pl `pwd`/DEF \
-syntenicNet -continue=syntenicNet -stop=syntenicNet \
-debug -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > synNet.log 2>&1
# real 31m11.216s
############################################################################
# reset position to chr6 haplotype situation
hgsql -e \
'update dbDb set defaultPos="chr6:28343766-33555363" where name="hg19";' \
hgcentraltest
# reset to a smaller range (2009-04-24 - Brooke)
# this is the SOD1 gene, implicated in Lou Gehrig's disease.
hgsql -e \
'update dbDb set defaultPos="chr21:33,031,597-33,041,570" where name="hg19";' \
hgcentraltest
############################################################################
# Self Lastz run (DONE - 2009-03-19 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
cat << '_EOF_'
# human vs human
BLASTZ=lastz
# maximum M allowed with lastz is only 255
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Human Hg19
SEQ2_DIR=/scratch/data/hg19/hg19.2bit
SEQ2_LEN=/scratch/data/hg19/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
# cluster difficulties, finished manually, then:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-continue=cat -workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=swarm > cat.log 2>&1 &
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-continue=load -debug -workhorse=hgwdev \
-stop=load -smallClusterHub=pk -bigClusterHub=swarm > load.debug.log 2>&1 &
# that indicates it would do:
hgLoadChain -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
# adding -normScore
hgLoadChain -normScore -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
############################################################################
# Chimp Lastz run (DONE - 2009-03-19 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
cd /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
cat << '_EOF_'
# human vs chimp
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/data/panTro2/panTro2.2bit
SEQ2_LEN=/scratch/data/panTro2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
# real 173m22.880s
# cluster problems, continuing after lastz done:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=cat \
-stop=net -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> net.log 2>&1 &
# real 81m20.209s
# continuing with the load and adding syntenicNet
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=load \
-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> load.log 2>&1 &
# real 47m17.871s
cat fb.hg19.chainPanTro2Link.txt
# 2747983350 bases of 2897316137 (94.846%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-05-24
ssh swarm
mkdir /hive/data/genomes/panTro2/bed/blastz.hg19.swap
cd /hive/data/genomes/panTro2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-swap /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 723m41.377s
cat fb.panTro2.chainHg19Link.txt
# 2761343871 bases of 2909485072 (94.908%) in intersection
############################################################################
# Creating the pushQ entry (DONE - 2009-03-20 - Hiram)
mkdir /hive/data/genomes/hg19/pushQ
cd /hive/data/genomes/hg19/pushQ
makePushQSql.pl hg19 > hg19.pushQ.sql 2> make.err
# many complaints about the chain and net tables from the haplotype
# experiments, and this table:
# orfeomeGenes
# which is probably in genbank, and these usual ones:
# hg19 does not have seq
# hg19 does not have extFile
############################################################################
# Determine PAR region of X and Y (DONE - 2009-03-20 - Hiram)
mkdir /hive/data/genomes/hg19/bed/parRegion
cd /hive/data/genomes/hg19/bed/parRegion
awk '$5 != "N"' ../../X/chrX.agp | awk '{print $6}' | sort > chrX.cloneList
awk '$5 != "N"' ../../Y/chrY.agp | awk '{print $6}' | sort > chrY.cloneList
comm -12 chrX.cloneList chrY.cloneList > chrXY.par.clone.list
cat chrXY.par.clone.list \
| while read C; do grep "${C}" ../../X/chrX.agp; done \
| sort -k1,1 -k2,2n >> chrX.par.region.agp
cat chrXY.par.clone.list \
| while read C; do grep "${C}" ../../Y/chrY.agp; done \
| sort -k1,1 -k2,2n >> chrY.par.region.agp
awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrY.par.region.agp \
> chrY.par.region.bed
awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrX.par.region.agp \
> chrX.par.region.bed
# use those bed files in custom tracks on hg19 to verify that they
# are two continuous regions with only gaps between these items
# these location extents are: (zero relative)
# chrX 60000 2722842
# chrX 154906585 155260560
# chrY 10000 2649520
# chrY 59034049 59363566
############################################################################
# Gorilla Lastz run (DONE - 2009-03-21,05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
cd /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
cat << '_EOF_'
# human vs gorilla
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Gorilla gorGor1
SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> do.log 2>&1 &
cat fb.hg19.chainGorGor1Link.txt
# 1723432141 bases of 2897316137 (59.484%) in intersection
doRecipBest.pl -buildDir=`pwd` hg19 gorGor1 > rbest.log 2>&1
############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2009-04-02 - Hiram)
ssh pk
mkdir /hive/data/genomes/hg19/bed/linSpecRep
cd /hive/data/genomes/hg19/bed/linSpecRep
# create individual .out files from the master record in ../repeatMasker
mkdir splitOut
cat << '_EOF_' > split.csh
#!/bin/csh -fe
set C = $1
head -3 ../repeatMasker/hg19.clean.out > splitOut/${C}.out
grep "${C} " ../repeatMasker/hg19.clean.out >> splitOut/${C}.out
'_EOF_'
# << happy emacs
cat << '_EOF_' > template
#LOOP
split.csh $(root1) {check out line+ splitOut/$(root1).out}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 ../../chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try ... check ... push ... etc...
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 127s 2.12m 0.04h 0.00d 0.000 y
# IO & Wait Time: 17154s 285.90m 4.76h 0.20d 0.001 y
# Average job time: 186s 3.10m 0.05h 0.00d
# Longest finished job: 224s 3.73m 0.06h 0.00d
# Submission to last job: 280s 4.67m 0.08h 0.00d
# now, we can date and process each of those .out files
# this really should be a single creation of notInOthers
# These four different ones all end up to be the same anyhow
# the notInMouse becomes notInOthers below and the others are removed.
mkdir dateRepeats
cd dateRepeats
cat << '_EOF_' > mkLSR
#!/bin/csh -fe
rm -f $1.out_mus-musculus_rattus_canis-familiaris_bos-taurus
ln -s ../splitOut/$1.out .
/scratch/data/RepeatMasker/DateRepeats \
$1.out -query human -comp mouse -comp rat -comp dog -comp cow
rm $1.out
mkdir -p ../notInMouse ../notInRat ../notInDog ../notInCow
/cluster/bin/scripts/extractRepeats 1 $1.out_mus*-taurus \
> ../notInMouse/$1.out.spec
/cluster/bin/scripts/extractRepeats 2 $1.out_mus*-taurus \
> ../notInRat/$1.out.spec
/cluster/bin/scripts/extractRepeats 3 $1.out_mus*-taurus \
> ../notInDog/$1.out.spec
/cluster/bin/scripts/extractRepeats 4 $1.out_mus*-taurus \
> ../notInCow/$1.out.spec
'_EOF_'
# << happy emacs
chmod +x mkLSR
cat << '_EOF_' > template
#LOOP
./mkLSR $(path1) {check out line+ $(path1).out_mus-musculus_rattus_canis-familiaris_bos-taurus}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../chrom.list single template jobList
para try ... check ... push ... etc...
para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 2441s 40.69m 0.68h 0.03d 0.000 y
# IO & Wait Time: 332s 5.53m 0.09h 0.00d 0.000 y
# Average job time: 30s 0.50m 0.01h 0.00d
# Longest finished job: 125s 2.08m 0.03h 0.00d
# Submission to last job: 454s 7.57m 0.13h 0.01d
done
# these four types of out.spec results all turn out to be identical
# To check identical
cd /hive/data/genomes/hg19/bed/linSpecRep
find . -name "*.out.spec" | \
while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
| sort -k1,1n | sort -t"/" -k3,3 | sed -e "s#./notIn.*/##" \
| sort | uniq -c | less
# You will see they are all a count of 4
# Set them up on scratch data and get to all the kluster nodes:
mkdir /hive/data/staging/data/hg19/lineageSpecificRepeats
cd notInMouse
rsync -a --progress ./ /hive/data/staging/data/hg19/lineageSpecificRepeats
cd ..
mv notInMouse notInOthers
# do not need to keep all of these
rm -fr notInRat notInDog notInCow
# We also need the nibs for blastz runs with lineage specific repeats
mkdir /hive/data/genomes/hg19/bed/nibs
cd /hive/data/genomes/hg19/bed/nibs
cut -f1 ../../chrom.sizes | while read C
do
twoBitToFa -seq=${C} ../../hg19.2bit stdout \
| faToNib -softMask stdin ${C}.nib
echo "${C} done"
done
mkdir /hive/data/staging/data/hg19/nib
rsync -a --progress ./ /hive/data/staging/data/hg19/nib
# Ask cluster-admin to sync /scratch/ filesystem to kluster nodes
#############################################################################
# create gc5Base download file (DONE - 2009-04-24 - Hiram)
cd /hive/data/genomes/hg19/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19 \
/cluster/data/hg19/hg19.2bit | gzip -c > hg19.gc5Base.txt.gz
#############################################################################
# Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram)
mkdir /hive/data/genomes/hg19/bed/ctgPos
cd /hive/data/genomes/hg19/bed/ctgPos
cat << '_EOF_' > mkCtgPos.sh
AGP="/hive/data/genomes/hg19/download/assembled_chromosomes/AGP"
export AGP
for F in `(cd ${AGP}; ls chr*.agp | grep -v ".comp.agp")`
do
C=${F/.agp/}
grep "^CM" "${AGP}/${F}" | awk '$5 != "N"' | awk '
{
printf "%s\t%d\t%s\t%d\t%d\n", $6, $8-$7+1, "'${C}'", $2-1+$7-1, $2-1+$8
}
'
done
'_EOF_'
# << happy emacs
chmod +x mkCtgPos.sh
./mkCtgPos.sh > ctgPos.tab
cat << '_EOF_' > mkRanCtgPos.sh
AGP="/hive/data/genomes/hg19/download/unlocalized_scaffolds/AGP"
export AGP
for F in `(cd ${AGP}; ls chr*.agp)`
do
C=${F/.unlocalized.scaf.agp/}
c=${C/chr/}
export C c
grep "^GL" "${AGP}/${F}" | awk '$5 != "N"' | awk '
BEGIN {
ctgName=""
ctgStart=0
ctgEnd=0
chrom="'${c}'"
ctgNameLower=""
}
{
if (match(ctgName,$1)) {
ctgEnd = $3
} else {
if (length(ctgName) > 0) {
size=ctgEnd - ctgStart
printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower,
ctgStart, ctgEnd
}
ctgStart = $2 - 1
ctgEnd = $3
ctgName = $1
ctgNameLower = tolower($1)
sub(".1$","",ctgNameLower)
}
}
END {
size=ctgEnd - ctgStart
printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower,
ctgStart, ctgEnd
}
'
done
'_EOF_'
# << happy emacs
chmod +x mkRanCtgPos.sh
./mkRanCtgPos.sh >> ctgPos.tab
# fetch .sql definition from hg18
chmod 777 .
hgsqldump --all -c --tab=. hg18 ctgPos
chmod 775 .
hgsql hg19 < ctgPos.sql
hgsql -e 'load data local infile "ctgPos.tab" into table ctgPos;' hg19
#############################################################################
# CLONE ENDS - first step for BACEND/CytoBand tracks
# (DONE - 2009-04-28 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/cloneend/ncbi
cd /hive/data/genomes/hg19/bed/cloneend/ncbi
wget --timestamping \
'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_ends*.mfa.gz'
wget --timestamping \
'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_info*.txt.gz'
cd /hive/data/genomes/hg19/bed/cloneend
# seems like the *.mfa files were split just for convenience
# concatenate
for F in ncbi/*.mfa.gz
do
zcat "${F}"
echo "${F}" 1>&2
done | gzip > all.mfa.gz
# that 1>&2 echos to stderr so you can see the file name and not
# interfere with the pipe stdout output to gzip
# Convert the title line of the all.mfa file
zcat all.mfa.gz \
| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#\.[0-9]|.*##" \
| gzip > cloneEnds.fa.gz
zcat all.mfa | ./convert.pl | gzip > cloneEnds.fa.gz
# make sure nothing got broken:
faSize all.mfa.gz
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
faSize cloneEnds.fa.gz
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
# identical numbers
# you can also carefully check the names:
zcat all.mfa.gz | grep "^>" | awk -F'|' '{print $4}' \
| sed -e "s/\.[0-9]$//" | sort > mfa.names
# should be the same as:
zcat cloneEnds.fa.gz | grep "^>" | sed -e "s/>//" | sort > clone.names
# concatenate the text files, too
bash
for F in ncbi/*.txt.gz
do
zcat "${F}"
echo "${F}" 1>&2
done | gzip > all.txt.gz
# generate cloneEndPairs.txt and cloneEndSingles.txt
zcat all.txt.gz >all.txt
$HOME/kent/src/hg/utils/cloneEndParse.pl all.txt
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 302264 pairs and 203094 singles
# examined all the clone names and all the bac end names in these two
# files and compared with business from all.txt to make sure we properly
# classified all of them correctly. We had 833,173 clone sequences,
# and 501,135 bac end names
# faSplit does not function correctly if given a .gz source file
# AND, we need the unzipped file for sequence loading below
gunzip cloneEnds.fa.gz
# split
mkdir splitdir
cd splitdir
faSplit sequence ../cloneEnds.fa 100 cloneEnds
# Check to ensure no breakage:
cat *.fa | faSize stdin
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
# same numbers as before
# load sequences
ssh hgwdev
mkdir /gbdb/hg19/cloneend
cd /gbdb/hg19/cloneend
ln -s /hive/data/genomes/hg19/bed/cloneend/cloneEnds.fa .
cd /tmp
hgLoadSeq hg19 /gbdb/hg19/cloneend/cloneEnds.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hg19/cloneend/cloneEnds.fa
# 833173 sequences
# Updating seq table
# Advisory lock has been released
# All done
##############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/bacends/run.blat
cd /hive/data/genomes/hg19/bed/bacends/run.blat
# going to run separate runs for the golden path sequence vs. the
# randoms, haplotypes, chrUn and chrM
partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://; s/:/./" > hg19.list
ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
> bacEnds.list
ssh swarm
cd /hive/data/genomes/hg19/bed/bacends/run.blat
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(path2) {check out line+ psl/$(root1)/$(file1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set target = $1
set query = $2
set result = $3
set partSpec = `echo $target | sed -e "s/\./:/"`
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set dir = $result:h
set chr = `echo $target | sed -e "s/\..*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
# echo $tmpFile
# echo "chr: $chr $start $end -> size: $chrSize, range: $range"
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p $dir
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
rm -f $result
liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
gensub2 hg19.list bacEnds.list template jobList
para create jobList
# 62034 jobs in batch
# these jobs run quickly, limit them to 250 at a time
para try, check, -maxJob=250 push, etc ...
# Completed: 62034 of 62034 jobs
# CPU time in finished jobs: 506023s 8433.72m 140.56h 5.86d 0.016 y
# IO & Wait Time: 175853s 2930.88m 48.85h 2.04d 0.006 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest finished job: 752s 12.53m 0.21h 0.01d
# Submission to last job: 3533s 58.88m 0.98h 0.04d
# combine the alignments
time pslSort dirs raw.psl temp psl/chr*
# 62034 files in 24 dirs
# Got 62034 files 249 files per mid file
# real 81m2.820s
# -rw-rw-r-- 1 13410334441 Apr 29 12:00 raw.psl
# cleanup
rmdir temp
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
# real 5m55.990s
# Processed 106254032 alignments
# -rw-rw-r-- 1 372734361 Apr 29 12:56 bacEnds.psl
wc -l bacEnds.psl
# 2852977 bacEnds.psl
time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
all_bacends bacEnds
# Reading pair file
# Reading psl file
# Creating Pairs
# Writing to files
# real 0m18.851s
# this creates the files:
# -rw-rw-r-- 1 21178741 Apr 29 13:00 bacEnds.pairs
# -rw-rw-r-- 1 5250873 Apr 29 13:00 bacEnds.orphan
# -rw-rw-r-- 1 738045 Apr 29 13:00 bacEnds.short
# -rw-rw-r-- 1 463560 Apr 29 13:00 bacEnds.slop
# -rw-rw-r-- 1 146369 Apr 29 13:00 bacEnds.mismatch
# -rw-rw-r-- 1 3528 Apr 29 13:00 bacEnds.long
# filter and sort
awk '$5 >= 300' bacEnds.pairs | sort -k1,1 -k2,2n > bacEndPairs.bed
awk '$5 >= 300' bacEnds.slop bacEnds.short bacEnds.long \
bacEnds.mismatch bacEnds.orphan | sort -k1,1 -k2,2n > bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
> bacEndPairs.load.psl
############################################################################
# BACEND Randoms SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/bacends/run.randoms
cd /hive/data/genomes/hg19/bed/bacends/run.randoms
# this separate run for the randoms, haplotypes, chrUn and chrM
partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
| egrep "random|_hap|chrUn" \
| sed -e "s/.*2bit://; s/:/./" > random.list
cat tParts/*.lst | sed -e "s/.*2bit://; s/:/./" >> random.list
ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
> bacEnds.list
ssh swarm
cd /hive/data/genomes/hg19/bed/bacends/run.randoms
gensub2 random.list bacEnds.list ../run.blat/template jobList
# very similar runOne.csh script as above, but it doesn't need to do
# the lift
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set target = $1
set query = $2
set result = $3
set partSpec = `echo $target | sed -e "s/\./:/"`
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set dir = $result:h
set chr = `echo $target | sed -e "s/\..*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
# echo $tmpFile
# echo "chr: $chr $start $end -> size: $chrSize, range: $range"
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p $dir
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
rm -f $result
mv $tmpFile.psl $result
echo rm -f $tmpFile.lift
'_EOF_'
# << happy emacs
# these jobs run fast, do not let too many of them run
para -maxJob=100 try...check...push
para time
# Completed: 6762 of 6762 jobs
# CPU time in finished jobs: 20357s 339.29m 5.65h 0.24d 0.001 y
# IO & Wait Time: 17839s 297.31m 4.96h 0.21d 0.001 y
# Average job time: 6s 0.09m 0.00h 0.00d
# Longest finished job: 261s 4.35m 0.07h 0.00d
# Submission to last job: 508s 8.47m 0.14h 0.01d
time pslSort dirs raw.psl temp psl/chr*
# 6762 files in 69 dirs
# Got 6762 files 82 files per mid file
# real 6m37.177s
# 37044 files in 98 dirs
# Got 37044 files 192 files per mid file
# real 32m24.804s
# -rw-rw-r-- 1 6487445210 Feb 2 21:08 raw.psl
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl randomEnds.psl randomReps.psr > pslReps.out 2>&1 &
# real 0m5.761s
# Processed 1254273 alignments
# cleanup
rmdir temp
wc -l randomEnds.psl
# 367567 randomEnds.psl
time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose randomEnds.psl \
/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
all_bacends bacEnds
# Reading pair file
# Reading psl file
# Creating Pairs
# Writing to files
# real 0m11.221s
# this creates the files:
# -rw-rw-r-- 1 0 Apr 29 14:53 bacEnds.slop
# -rw-rw-r-- 1 0 Apr 29 14:53 bacEnds.short
# -rw-rw-r-- 1 0 Apr 29 14:53 bacEnds.mismatch
# -rw-rw-r-- 1 0 Apr 29 14:53 bacEnds.long
# -rw-rw-r-- 1 141836 Apr 29 14:53 bacEnds.pairs
# -rw-rw-r-- 1 649907 Apr 29 14:53 bacEnds.orphan
##############################################################################
# BacEnds track - both results loaded together (DONE - 2009-04-29 - Hiram)
ssh hgwdev
cd /hive/data/genomes/hg19/bed/bacends
# filter and sort
awk '$5 >= 300' run.blat/bacEnds.pairs run.randoms/bacEnds.pairs \
| sort -k1,1 -k2,2n > bacEndPairs.bed
awk '$5 >= 300' run.blat/bacEnds.slop run.blat/bacEnds.short \
run.blat/bacEnds.long run.blat/bacEnds.mismatch \
run.blat/bacEnds.orphan run.randoms/bacEnds.slop \
run.randoms/bacEnds.short run.randoms/bacEnds.long \
run.randoms/bacEnds.mismatch run.randoms/bacEnds.orphan \
| sort -k1,1 -k2,2n > bacEndPairsBad.bed
head -5 run.blat/bacEnds.psl > bacEnds.psl
headRest 5 run.blat/bacEnds.psl > t.psl
headRest 5 run.randoms/randomEnds.psl >> t.psl
sort -k14,14 -k16,16n t.psl >> bacEnds.psl
extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
> bacEnds.load.psl
# load them into the database
ssh hgwdev
cd /hive/data/genomes/hg19/bed/bacends
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $4}' bacEndPairs.bed | grep " "
awk '{print $5}' bacEndPairs.bed | sort | uniq -c
# result should be the scores, no extraneous strings:
# 156984 1000
# 195 300
# 316 375
# 297 500
# 1476 750
# edit the file and fix it if it has a bad name.
hgLoadBed -notItemRgb hg19 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 208922 elements of size 11
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -notItemRgb hg19 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 79004 elements of size 11
#hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg19 -table=all_bacends bacEnds.load.psl
# one complaint, there appears to be a bogus insert count in one
# of the blat results:
# < 585 797 67 0 3 2 -63 9 79188 + AQ743980 852 42 846 chr19_gl000208_random 92689 4045 84100 11 14,124,84,496,53,6,20,28,28,10,4, 42,56,180,200,696,750,756,776,804,832,842, 4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
Became:
# > 585 797 67 0 3 2 0 9 79188 + AQ743980 852 42 846 chr19_gl000208_random 92689 4045 84100 11 14,124,84,496,53,6,20,28,28,10,4, 42,56,180,200,696,750,756,776,804,832,842, 4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
hgsql -N -e "select count(*) from all_bacends;" hg19
# 2289275
hgsql -N -e "select count(*) from all_bacends;" hg18
# 1727387
hgsql -N -e "select count(*) from all_bacends;" hg17
# 1729146
nice featureBits hg19 all_bacends
# 230917362 bases of 2897316137 (7.970%) in intersection
nice featureBits hg18 all_bacends
# 227770876 bases of 2881515245 (7.905%) in intersectio
nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection
nice featureBits hg19 bacEndPairs
# 236889607 bases of 2897316137 (8.176%) in intersection
nice featureBits hg18 bacEndPairs
# 162690030 bases of 2881515245 (5.646%) in intersection
nice featureBits hg17 bacEndPairs
# 162099487 bases of 2866216770 (5.656%) in intersection
nice featureBits hg19 bacEndPairsBad
# 38344094 bases of 2897316137 (1.323%) in intersection
nice featureBits hg18 bacEndPairsBad
# 37326990 bases of 2881515245 (1.295%) in intersection
nice featureBits hg17 bacEndPairsBad
# 37437558 bases of 2866216770 (1.306%) in intersection
############################################################################
# STS MARKERS (DONE - 2009-04-30 - 2009-05-06 - Hiram)
mkdir /hive/data/outside/ncbi/sts.2009-04
cd /hive/data/outside/ncbi
ln -s sts.2009-04 sts.11
cd /hive/data/outside/ncbi/sts.2009-04
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# these items are copied in from the previous builds
cp -p /cluster/data/ncbi/sts.10/all.STS.fa ./all.STS.fa.prev
cp -p /cluster/data/ncbi/sts.10/stsInfo2.bed ./stsInfo2.bed.prev
# edit stsInfo2.bed.prev for a
# manual fixup of error that is in the hg18 bed file, replace
# the line for AFM067XA9 to fix bogus long list of aliases to be:
# 22788^IAFM067XA9^I1^IZ66598^I1^IGDB:1221611,^I5^I067XA9,GDB:1221611,W202,Z66598,SWSS2303^I69047^I0^I^ITCTTGGGGTTTAATTGCTTT^ICTTTGCCACAATCTTACACA^I149^IHomo sapiens^I1^I2^I6453,6454,^I0^I^I^I^I0^I0^I^I^I0^I0^IAFM067XA9^Ichr7^I145^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0
# as taken directly out of the hg18.stsInfo2 table which was fixed
# by Bob and Archana
# Convert the title line of the dbSTS.fa file
# Verify that column 3 only contains gb emb dbj
grep "^>" dbSTS.fa | awk -F'|' '{print $3}' | sort | uniq -c
# 39124 dbj
# 57375 emb
# 1212541 gb
# if that is true, this sed will work:
cat dbSTS.fa \
| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#^>gi.[0-9]*.dbj.#>#; s#\.[0-9]|.*##" \
> UniSTS.convert.fa
# get accessions
grep ">" UniSTS.convert.fa | sed -e "s/^>//" | sort > UniSTS.acc
# head and tail that to ensure names are reasonable, odd names would
# show up at the beginning or end
wc -l UniSTS.acc
# 1309040 UniSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
UniSTS.sts UniSTS.aliases UniSTS.convert.fa new
# verify the number of aliases is reasonable:
awk '{print $3}' new.alias | sort | uniq -c | sort -rn | less
# 50 D7S831
# 34 CHLC.GATA2B06.465
# 24 CHLC.GATA11E11
# 23 AFM276ZF5
# 23 AFM273YH9
# 22 SHGC-133043
# ... etc ...
# verify there are no unusually long or short lines:
awk '{printf "%d\n", length($0)}' new.info | sort -n | head -3
# 143
# 144
# 144
awk '{printf "%d\n", length($0)}' new.info | sort -n | tail -3
# 552
# 553
# 644
# check for null in the new files:
grep -i null new.*
# if the new files look good, they can become the set to use:
mv new.info stsInfo2.bed
mv new.primers all.primers
mv new.alias stsAlias.bed
mv new.fa all.STS.fa
# get list of all STS id's in the fasta file
sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id
wc -l all.STS.id
# 100520 total sequences
# in hg18 this was: 93698 total sequences
$HOME/kent/src/hg/stsMarkers/convertPrimerToFA all.primers > all.primers.fa
# check that fasta file for unusual length sequences:
faSize all.primers.fa
# 97815329 bases (83677626 N's 14137703 real 14137703 upper 0 lower) in 317592 sequences in 1 files
# Total size: mean 308.0 sd 279.3 min 40 (dbSTS_144) max 30000 (dbSTS_156892) median 244
# Copy stsInfo2.bed and stsAlias.bed to data directory becuase
# these will be loaded into the database later
mkdir -p /hive/data/genomes/hg19/bed/sts
cp -p stsInfo2.bed /hive/data/genomes/hg19/bed/sts/
cp -p stsAlias.bed /hive/data/genomes/hg19/bed/sts/
# Create sts sequence alignments
mkdir /hive/data/genomes/hg19/bed/sts/split
faSplit sequence all.STS.fa 100 /hive/data/genomes/hg19/bed/sts/split/sts
ssh swarm
mkdir /hive/data/genomes/hg19/bed/sts/run
cd /hive/data/genomes/hg19/bed/sts/run
# going to run separate runs for the golden path sequence vs. the
# randoms, haplotypes, chrUn and chrM
# 40,000,000 chunck sizes, 20,000 overlap
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
ls -1S ../split > sts.list
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec \
../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
# rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
gensub2 hg19.list sts.list template jobList
# these jobs run quickly, allow only 100 at a time
para -maxJob=100 create jobList
# 8367 jobs in batch
para try ... check ... push ... etc
# Completed: 8366 of 8366 jobs
# CPU time in finished jobs: 89744s 1495.74m 24.93h 1.04d 0.003 y
# IO & Wait Time: 25467s 424.44m 7.07h 0.29d 0.001 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest finished job: 53s 0.88m 0.01h 0.00d
# Submission to last job: 1592s 26.53m 0.44h 0.02d
# and, run the randoms as a separate run:
mkdir /hive/data/genomes/hg19/bed/sts/run.randoms
cd /hive/data/genomes/hg19/bed/sts/run.randoms
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep "tParts|random|_hap|chrUn"
cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
ls -1S ../split > sts.list
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec \
../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
/bin/rm -f $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
gensub2 hg19.list sts.list template jobList
# these jobs run quickly, allow only 100 at a time
para -maxJob=100 create jobList
# 6486 jobs in batch
para try ... check ... push ... etc
# Completed: 6486 of 6486 jobs
# CPU time in finished jobs: 2206s 36.77m 0.61h 0.03d 0.000 y
# IO & Wait Time: 16505s 275.08m 4.58h 0.19d 0.001 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest finished job: 21s 0.35m 0.01h 0.00d
# Submission to last job: 601s 10.02m 0.17h 0.01d
# Compile sts sequence results
ssh hgwdev
cd /hive/data/genomes/hg19/bed/sts/run
time pslSort dirs raw.psl temp psl/chr*
# 8366 files in 89 dirs
# Got 8366 files 91 files per mid file
# real 8m50.714s
# -rw-rw-r-- 1 810438277 May 1 11:45 raw.psl
cd /hive/data/genomes/hg19/bed/sts/run.randoms
time pslSort dirs raw.psl temp psl/chr*
# 6486 files in 69 dirs
# Got 6486 files 81 files per mid file
# real 1m42.120s
# -rw-rw-r-- 1 18378188 May 1 11:52 raw.psl
rmdir temp
cd /hive/data/genomes/hg19/bed/sts
cat run*/raw.psl | egrep -v "^$|^psLayout|^match|^ |^-" \
| pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \
stsMarkers.psl /dev/null
# Processed 7412166 alignments
# -rw-rw-r-- 1 12031760 May 1 11:57 stsMarkers.psl
$HOME/kent/src/hg/stsMarkers/extractPslInfo -h stsMarkers.psl
# creates stsMarkers.psl.initial
# -rw-rw-r-- 1 4485053 May 1 12:06 stsMarkers.psl.initial
wc -l stsMarkers.psl.initial
# 101338 stsMarkers.psl.initial
# this command needs a chrom_names file to work correctly with this
# new style of layout for hg19:
cd /hive/data/genomes/hg19
cut -f1 chrom.sizes | sed -e "s/chr//" > chrom_names
cd /hive/data/genomes/hg19/bed/sts
$HOME/kent/src/hg/stsMarkers/findAccession.pl -agp stsMarkers.psl.initial \
/cluster/data/hg19
wc -l stsMarkers.psl.initial.acc
# 101338 stsMarkers.psl.initial.acc
sort -k4,4n stsMarkers.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
# 96472 stsMarkers.found
# out of 100520 total sequences from:
wc -l /hive/data/outside/ncbi/sts.2009-04/all.STS.id
# There are lots of duplicates:
wc -l stsMarkers.final
# 101338 stsMarkers.final
# And a lot of them are just completely haywire:
awk '$3-$2 < 1001' stsMarkers.final | wc -l
# 98382
# filter out markers that are too long
awk '$3-$2 < 1001' stsMarkers.final > stsMarkers.1K.size.filtered
# alignment of primers
ssh swarm
cd /hive/data/outside/ncbi/sts.2009-04
awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
all.primers > all.primers.ispcr
mkdir primerAlign
cd primerAlign
mkdir split
cd split
split -l 5000 ../../all.primers.ispcr primer_
ls > ../primer.list
cd ..
# we need a 10.ooc file for this business
time blat /scratch/data/hg19/hg19.2bit \
/dev/null /dev/null -tileSize=10 -makeOoc=10.ooc -repMatch=1024
# Wrote 146902 overused 10-mers to 10.ooc
# real 19m16.758s
# separate runs for whole genome vs. randoms
mkdir run
cd run
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
-ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list ../primer.list template jobList
para create jobList
# 5696 jobs in batch
para try ... check ... push ... etc
# Completed: 5696 of 5696 jobs
# CPU time in finished jobs: 203899s 3398.32m 56.64h 2.36d 0.006 y
# IO & Wait Time: 22049s 367.48m 6.12h 0.26d 0.001 y
# Average job time: 40s 0.66m 0.01h 0.00d
# Longest finished job: 5314s 88.57m 1.48h 0.06d
# Submission to last job: 5418s 90.30m 1.50h 0.06d
# Estimated complete: 0s 0.00m 0.00h 0.00d
# sort and filter the results
cd psl
pslSort dirs raw.psl temp chr*
# 5696 files in 89 dirs
# Got 5696 files 75 files per mid file
# -rw-rw-r-- 1 456802973 May 4 13:32 raw.psl
cd ..
mkdir filter
pslQuickFilter -minMatch=26 -maxMismatch=5 \
-maxTinsert=5000 -verbose psl/ filter/
# -rw-rw-r-- 1 50302564 May 4 13:35 raw.psl
# And, for the randoms
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
cat tParts/* > hg19.list
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
-ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
'_EOF_'
# << happy emacs
chmod +x runOne.csh
# can not use line+ check here, many of them are empty
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list ../primer.list template jobList
# they run quickly, limit to 100
para -maxJob=100 create jobList
para try ... check ... push ... etc
# Completed: 4416 of 4416 jobs
# CPU time in finished jobs: 1746s 29.09m 0.48h 0.02d 0.000 y
# IO & Wait Time: 11407s 190.12m 3.17h 0.13d 0.000 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest finished job: 8s 0.13m 0.00h 0.00d
# Submission to last job: 147s 2.45m 0.04h 0.00d
# sort and filter the results
cd psl
pslSort dirs raw.psl temp chr*
# 4416 files in 69 dirs
# Got 4416 files 66 files per mid file
rmdir temp
# -rw-rw-r-- 1 9066053 May 4 13:31 raw.psl
# putting the two runs together
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
ln -s ../run/filter/raw.psl run.psl
ln -s ../runRandoms/filter/raw.psl runRandoms.psl
# -rw-rw-r-- 1 50302564 May 4 13:35 run.psl
# -rw-rw-r-- 1 825973 May 4 13:35 runRandoms.psl
cd ..
pslSort dirs primers.psl temp psl
# 2 files in 1 dirs
# Got 2 files 1 files per mid file
# -rw-rw-r-- 1 51128110 May 4 13:39 primers.psl
wc -l primers.psl
# 448107 primers.psl
rmdir temp
pslFilterPrimers primers.psl ../all.primers primers.filter.psl
# creates primers.filter.unlifted.psl.notfound.primers
wc -l primers*
# 237962 primers.filter.psl
# 97191 primers.filter.psl.notfound.primers
# see if ePCR can find some of these notfound
ssh swarm
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
mkdir split
cd split
split -l 5000 ../../primers.filter.psl.notfound.primers primers_
cd ..
ls -1S split > primers.lst
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| grep -v tParts | sed -e "s/.*2bit://;" > hg19.list
cat tParts/* | sed -e "s/.*2bit://;" >> hg19.list
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p epcr/$partSpec
/bin/rm -f $tmpFile.psl
twoBitToFa /scratch/data/hg19/hg19.2bit:$partSpec $tmpFile.fa
/cluster/bin/scripts/runEpcr64 $primer $tmpFile.fa $tmpFile.epcr
/bin/rm -f $result
/bin/mv $tmpFile.epcr $result
rm -f $tmpFile.fa $tmpFile.lift $tmpFile.psl $tmpFile.*
'_EOF_'
# << happy emacs
chmod +x runOne.csh
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line epcr/$(file1)/$(root2).epcr}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list primers.lst template jobList
para create jobList
# 3160 jobs
para try ... check ... push ... etc ...
# Completed: 3160 of 3160 jobs
# CPU time in finished jobs: 86253s 1437.54m 23.96h 1.00d 0.003 y
# IO & Wait Time: 11196s 186.61m 3.11h 0.13d 0.000 y
# Average job time: 31s 0.51m 0.01h 0.00d
# Longest finished job: 89s 1.48m 0.02h 0.00d
# Submission to last job: 237s 3.95m 0.07h 0.00d
find ./epcr -type f | xargs cat > all.epcr
wc -l all.epcr
# 797286 all.epcr
# convert the coordinates from the partitionSequence.pl to a lift file
awk '{print $1}' all.epcr | sort -u > hg19.partSpec.txt
$HOME/kent/src/hg/stsMarkers/liftFromSpec.pl hg19 hg19.partSpec.txt \
> all.epcr.lift
cat all.epcr | sed -e "s/\.\./ /; s/ */\t/g" \
| liftUp -type=.bed stdout all.epcr.lift error stdin \
| awk '
{
printf "%s %d..%d %d %d\n", $1, $2, $3, $4, $5
}
' > all.epcr.lifted
pslFilterPrimers -epcr=all.epcr.lifted -verbose=1 ../primers.psl \
/cluster/home/hiram/bin/x86_64/pslFilterPrimers -epcr=all.epcr.lifted \
-verbose=1 ../primers.psl ../../all.primers epcr.primers.psl
# this took a long time, many hours
# -rw-rw-r-- 1 2785254 May 5 17:28 epcr.not.found
# -rw-rw-r-- 1 27343510 May 5 17:28 epcr.primers.psl
# -rw-rw-r-- 1 1616885 May 5 17:28 epcr.primers.psl.notfound.primers
time ./epcrToHgPsl.pl epcr.not.found ../../all.primers \
time $HOME/kent/src/hg/stsMarkers/epcrToPsl epcr.not.found \
../../all.primers /hive/data/genomes/hg19
# real 69m38.444s
# -rw-rw-r-- 1 0 May 6 14:18 epcr.not.found.nomatch
# -rw-rw-r-- 1 8369138 May 6 15:26 epcr.not.found.psl
# combining everything together now
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign
sort -u primers.filter.psl epcr/epcr.primers.psl epcr/epcr.not.found.psl \
| sort -k15,15 -k17,17n > primers.final.psl
wc -l primers.final.psl
# 310705 primers.final.psl
time $HOME/kent/src/hg/stsMarkers/fixPrimersQueryGaps.pl \
../all.primers primers.final.psl > primers.final.fix.psl
# real 0m19.580s
wc -l primers.final.fix.psl
# 310705 primers.final.fix.psl
# Extract relevant info, make alignments unique, and create final file to
# be merged with full sequence alignments
$HOME/kent/src/hg/stsMarkers/extractPslInfo -h primers.final.fix.psl
# real 0m15.303s
# -rw-rw-r-- 1 15660447 May 6 15:44 primers.final.fix.psl.initial
wc -l primers.final.fix.psl.initial
# 308210 primers.final.fix.psl.initial
$HOME/kent/src/hg/stsMarkers/findAccession.pl -agp \
primers.final.fix.psl.initial /hive/data/genomes/hg19
wc -l primers.final.fix.psl.initial.acc
# 308210 primers.final.fix.psl.initial.acc
$HOME/kent/src/hg/stsMarkers/getStsId ../stsInfo2.bed \
primers.final.fix.psl.initial.acc | sort -k 4n > primers.final
wc -l primers.final
# 308210 primers.final
# There doesn't appear to be any use for this primers.ids list
# except for curiosity. Check the head and tail of this list to
# verify no garbage is in here. There should just be numbers.
awk '{print $4}' primers.final | sort -n | uniq > primers.ids
wc -l primers.ids
# 290961 primers.ids
# Merge primer and sequence files to create final bed file
# Merge (combineSeqPrimerPos) takes about an hour to run
cd /hive/data/genomes/hg19/bed/sts
time $HOME/kent/src/hg/stsMarkers/combineSeqPrimerPos stsMarkers.final \
/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final
# real 0m12.310s
# -rw-rw-r-- 1 15222346 May 6 15:55 stsMarkers_pos.rdb
wc -l stsMarkers_pos.rdb
# 315308 stsMarkers_pos.rdb
time /cluster/bin/scripts/createSTSbed \
/hive/data/outside/ncbi/sts.2009-04/stsInfo2.bed \
stsMarkers_pos.rdb > stsMap.bed
# real 0m31.886s
# -rw-rw-r-- 1 38244880 May 6 16:25 stsMap.bed
wc -l stsMap.bed
# 305914 stsMap.bed
# Set up sequence files
ssh hgwdev
mkdir /gbdb/hg19/sts.11/
ln -s /hive/data/outside/ncbi/sts.11/all.STS.fa \
/gbdb/hg19/sts.11/all.STS.fa
ln -s /hive/data/outside/ncbi/sts.11/all.primers.fa \
/gbdb/hg19/sts.11/all.primers.fa
# Load all files
cd /hive/data/genomes/hg19/bed/sts
hgLoadSeq hg19 /gbdb/hg19/sts.11/all.STS.fa /gbdb/hg19/sts.11/all.primers.fa
# Creating seq.tab file
# Adding /gbdb/hg19/sts.11/all.STS.fa
# 100520 sequences
# Adding /gbdb/hg19/sts.11/all.primers.fa
# 317592 sequences
# Updating seq table
# Advisory lock has been released
# All done
hgsql hg19 < $HOME/kent/src/hg/lib/stsInfo2.sql
hgsql hg19 < $HOME/kent/src/hg/lib/stsAlias.sql
# these files already exist here from previous operations
# cp -p /hive/data/outside/ncbi/sts.11/{stsInfo2.bed,stsAlias.bed} .
hgsql hg19 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgsql hg19 -e 'load data local infile "stsAlias.bed" into table stsAlias'
# a couple minutes for each load above
# filter the stsMap.bed to eliminate items longer than 5,000 bases,
# takes out about 850:
awk '$3-$2 < 5001' stsMap.bed | sort -k1,1 -k2,2n \
> stsMap.filtered.5000.bed
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/stsMap.sql hg19 stsMap \
stsMap.filtered.5000.bed
# Loaded 305064 elements of size 28
ln -s \
/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final.fix.psl \
primers.psl
hgLoadPsl -nobin -table=all_sts_primer hg19 primers.psl
hgLoadPsl -nobin -table=all_sts_seq hg19 stsMarkers.psl
##############################################################################
# FISH CLONES (WORKING - 2009-04-29 - Hiram)
# The STS Marker and BAC End Pairs tracks must be completed prior to
# creating this track.
mkdir /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
cd /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
# Download information from NCBI
# point browser at:
# http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Sequence tag:" to "placed on contig"
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as
# /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
chmod 664 /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
# Unfortunately the format of this hbrc file has changed since
# last time. The columns have been rearranged, and one important
# column is missing, the contig information. So, let's see if we
# can recover the original format by putting this together with
# some other things we have here.
$HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
/hive/data/genomes/hg19/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
2> dbg
XXX - need to get this seq_clone.pmd from NCBI, maybe Paul Kitts
# the seq_clone.pmd file was obtained via email from Wonhee Jang
# jang at ncbi.nlm.nih.gov - I have asked for clarification where
# such a file can be fetched without resorting to email.
# Get current clone/accession information
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/clone/reports/clac.out
http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial Fish Clones bed file
ssh kkstore02
mkdir /hive/data/genomes/hg19/bed/fishClones
cd /hive/data/genomes/hg19/bed/fishClones
# Copy previous sts info from fhcrc
cp -p /hive/data/genomes/hg18/bed/fishClones/fhcrc.sts .
# This fhcrc.sts listing doesn't change. It is merely a listing
# of aliases that remain in effect.
# Create cl_acc_gi_len file form cloneend information:
grep -v "^#" /hive/data/genomes/hg19/bed/cloneend/all.txt \
| awk '{gsub(".[0-9]*$", "", $2);
printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len
hgsql -N \
-e "select chrom,chromStart,chromEnd,contig from ctgPos;" hg19 \
| sort -k1,1 -k2,2n > ctgPos.bed
hgsql -N \
-e "select chrom,chromStart,chromEnd,frag,0,strand from gold;" hg19 \
| sort -k1,1 -k2,2n > gold.bed
hgsql -N \
-e "select tName,tStart,tEnd,qName,0,strand from all_bacends;" hg19 \
| sort -k1,1 -k2,2n > all_bacends.bed
hgsql -N \
-e "select chrom,chromStart,chromEnd,name,score,strand from bacEndPairs;" hg19 \
| sort -k1,1 -k2,2n > bacEndPairs.bed
ssh hgwdev
# have to be on hgwdev for this since it is going to read from the
# database. Had to work on this program to get it past what is
# evidently a bad entry in hbrc.fixed where columns of information
# are missing for one clone in particular
time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg19 \
/hive/data/outside/ncbi/fishClones/fishClones.2009-04/fixed.hbrc.txt \
/hive/data/outside/ncbi/fishClones/fishClones.2009-04/clac.out \
./cl_acc_gi_len \
/hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl \
fishClones
# real 2m4.708s
# Reading Fish Clones file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/hbrc.fixed
# reading fishInfo file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# Reading Clone/Acc (clac.out) file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/clac.out
# Reading BAC Ends file ./cl_acc_gi_len
# Reading BAC Ends psl file /hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl
# Reading additional STS Marker links fhcrc.sts
# Determining good positions
# findClonePos: determining positions of fish clones
# Writing output file
# ERROR: at line # 170, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# ERROR: at line # 171, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# Load the track
ssh hgwdev
cd /hive/data/genomes/hg19/bed/fishClones
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
hg19 fishClones fishClones.bed
# Loaded 9461 elements of size 16
##############################################################################
# UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
mkdir /hive/data/genomes/hg19/ensembl
cd /hive/data/genomes/hg19/ensembl
wget --timestamping \
'ftp://ftp.ensembl.org/pub/pre/homo_sapiens/GRCh37/dna/*'
# do not need the repeat masker sequence (although it would be
# interesting to measure to see how it compares)
rm -f *.dna_rm.*
# fortunately we have the same sizes as Ensembl for everything
# (except the haplotypes) and the sizes are unique for each sequence
# so we can relate the names via their sizes
mkdir /hive/data/genomes/hg19/bed/ucscToEnsembl
cd /hive/data/genomes/hg19/bed/ucscToEnsembl
# the toplevel file is a duplicate of everything else
ls /hive/data/genomes/hg19/ensembl/*.fa.gz | grep -v toplevel \
| while read F
do
zcat "${F}"
done | faCount stdin > faCount.txt
cat << '_EOF_' > relateUcscEnsembl.pl
#!/usr/bin/env perl
use strict;
use warnings;
my %ucscChrs; # key is size, value is UCSC chr name
open (FH,"<../../chrom.sizes") or die "can not read ../../chrom.sizes";
while (my $line = <FH>) {
chomp $line;
my ($chr, $size) = split('\s+', $line);
die "'$line\n'duplicate size in ../chrom.sizes" if (exists($ucscChrs{$size})
);
$ucscChrs{$size} = $chr;
}
close (FH);
my %ensemblChrs; # key is size, value is Ensembl chr name
open (FH,"<faCount.txt") or die "can not read faCount.txt";
while (my $line = <FH>) {
next if ($line =~ m/#/);
next if ($line =~ m/total/);
chomp $line;
my ($chr, $size, $rest) = split('\s+', $line, 3);
die "'$line\n'duplicate size in faCount.txt" if (exists($ensemblChrs{$size})
);
$ensemblChrs{$size} = $chr;
}
close (FH);
my %usedUcscChrs;
my %usedEnsemblChrs;
my %ensemblTranslate; # key is Ensembl name, value is UCSC size
foreach my $size (keys %ucscChrs) {
if (exists($ensemblChrs{$size})) {
$usedUcscChrs{$size} = $ucscChrs{$size};
$usedEnsemblChrs{$size} = $ensemblChrs{$size};
printf "%s\t%s\t%d\n", $ucscChrs{$size}, $ensemblChrs{$size}, $size;
} else {
my $ucscName = $ucscChrs{$size};
my $ensemblName = "unknown";
if ($ucscName =~ m/^chr6/) {
$ucscName =~ s/_hap.//;
$ucscName =~ s/chr6_/chr6_mhc_/;
$ensemblName = "HS" . uc($ucscName);
} elsif ($ucscName =~ m/^chr17_/ || $ucscName =~ m/^chr4_/) {
$ucscName =~ s/_.*/_1/;
$ensemblName = "HS" . uc($ucscName);
} elsif ($ucscName =~ m/^chrM/) {
print "# no translation for chrM\n";
} else {
die "unknown UCSC chr name: $ucscName";
}
printf "# ucsc $ucscChrs{$size} -> $ensemblName\n";
$ensemblTranslate{$ensemblName} = $size;
}
}
foreach my $size (keys %ensemblChrs) {
if (!exists($usedEnsemblChrs{$size})) {
my $ensemblName = $ensemblChrs{$size};
if (! exists($ensemblTranslate{$ensemblName})) {
die "can not translate Ensembl name $ensemblName";
} else {
my $ucscSize = $ensemblTranslate{$ensemblName};
printf "%s\t%s\t%d\t%d\n", $ucscChrs{$ucscSize}, $ensemblChrs{$size}
, $ucscSize, $size;
}
}
}
printf "chrM\tMT\n";
'_EOF_'
# << happy emacs
chmod +x relateUcscEnsembl.pl
./relateUcscEnsembl.pl 2>&1 | grep -v "^#" \
| awk '{printf "%s\t%s\n", $1, $2}' | sort > ucscToEnsembl.tab
cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
ucsc varchar(255) not null, # UCSC chromosome name
ensembl varchar(255) not null, # Ensembl chromosome name
#Indices
PRIMARY KEY(ucsc(21))
);
'_EOF_'
hgsql hg19 < ucscToEnsembl.sql
hgsql hg19 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'
awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \
> ensemblLift.tab
cat << '_EOF_' > ensemblLift.sql
# UCSC offset to Ensembl coordinates
CREATE TABLE ensemblLift (
chrom varchar(255) not null, # Ensembl chromosome name
offset int unsigned not null, # offset to add to UCSC position
#Indices
PRIMARY KEY(chrom(15))
);
'_EOF_'
hgsql hg19 < ensemblLift.sql
hgsql hg19 \
-e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift'
##############################################################################
# LASTZ MOUSE Mm9 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
cat << '_EOF_' > DEF
# human vs mouse
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Mouse Mm9
SEQ2_DIR=/scratch/data/mm9/nib
SEQ2_SMSK=/scratch/data/mm9/notInOthers
SEQ2_LEN=/scratch/data/mm9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
cat fb.hg19.chainMm9Link.txt
# 1022734273 bases of 2897316137 (35.299%) in intersection
# and the swap
mkdir /hive/data/genomes/mm9/bed/blastz.hg19.swap
cd /hive/data/genomes/mm9/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13/DEF \
-swap -noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 131m58.763s
cat fb.mm9.chainHg19Link.txt
# 1013880568 bases of 2620346127 (38.693%) in intersection
#########################################################################
# LASTZ Dog CanFam2 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
cat << '_EOF_' > DEF
# human vs dog
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2
SEQ2_DIR=/scratch/data/canFam2/nib
SEQ2_LEN=/scratch/data/canFam2/chrom.sizes
SEQ2_SMSK=/scratch/scratch/data/canFam2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
cat fb.hg19.chainCanFam2Link.txt
# 1532073507 bases of 2897316137 (52.879%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/canFam2/bed/blastz.hg19.swap
cd /hive/data/genomes/canFam2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13/DEF \
-noLoadChainSplit -swap \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:19:50 PDT 2009
- # real 723m41.377s
+ # real 200m17.158s
cat fb.canFam2.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 1480018167 bases of 2384996543 (62.055%) in intersection
#########################################################################
# LASTZ Chicken GalGal3 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
cat << '_EOF_' > DEF
# human vs chicken
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/galGal3/nib
SEQ2_LEN=/scratch/data/galGal3/chrom.sizes
SEQ2_SMSK=/scratch/data/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
cat fb.hg19.chainGalGal3Link.txt
# 104053179 bases of 2897316137 (3.591%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/galGal3/bed/blastz.hg19.swap
cd /hive/data/genomes/galGal3/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13/DEF \
-swap \
-noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
-XXX - running Tue Jun 2 11:24:11 PDT 2009
- # real 723m41.377s
+ # real 16m45.090s
cat fb.galGal3.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 91605899 bases of 1042591351 (8.786%) in intersection
+
#########################################################################
# LASTZ Macaca Mulatta RheMac2 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
cat << '_EOF_' > DEF
# human vs macaca mulatta
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Macaca Mulatta RheMac2
SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit
SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 760m22.810s
cat fb.hg19.chainRheMac2Link.txt
# 2397361211 bases of 2897316137 (82.744%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
cd /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13/DEF \
-swap \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> swap.log 2>&1 &
-XXX - running Tue Jun 2 11:16:25 PDT 2009
- # real 723m41.377s
+ # real 83m51.483s
cat fb.rheMac2.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 2313806886 bases of 2646704109 (87.422%) in intersection
#########################################################################
# LASTZ Rat Rn4 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
cat << '_EOF_' > DEF
# human vs rat
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn4
SEQ2_DIR=/scratch/data/rn4/nib
SEQ2_SMSK=/scratch/data/rn4/linSpecRep.notInHuman
SEQ2_LEN=/scratch/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 314m18.227s
cat fb.hg19.chainRn4Link.txt
# 952605822 bases of 2897316137 (32.879%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/rn4/bed/blastz.hg19.swap
cd /hive/data/genomes/rn4/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13/DEF \
-swap -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:21:44 PDT 2009
- # real 723m41.377s
+ # real 188m0.163s
cat fb.rn4.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 947862300 bases of 2571531505 (36.860%) in intersection
##############################################################################
# LASTZ Orangutan PonAbe2 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
cat << '_EOF_' > DEF
# human vs orangutan
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Orangutan PonAbe1
SEQ2_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/scratch/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
cat fb.hg19.chainPonAbe2Link.txt
# 2646687531 bases of 2897316137 (91.350%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
cd /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13/DEF \
-swap \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> swap.log 2>&1 &
-XXX - running Tue Jun 2 11:08:17 PDT 2009
- # real 723m41.377s
+ # real 124m3.610s
cat fb.ponAbe2.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 2772351468 bases of 3093572278 (89.617%) in intersection
##############################################################################
# LASTZ Lamprey PetMar1 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Lamprey
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ2_LIMIT=5
# QUERY: Lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
> do.log 2>&1 &
# real 113m20.116s
cat fb.hg19.chainPetMar1Link.txt
# 31347143 bases of 2897316137 (1.082%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/petMar1/bed/blastz.hg19.swap
cd /hive/data/genomes/petMar1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-swap > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:35:37 PDT 2009
- # real 723m41.377s
+ # real 59m14.813s
cat fb.petMar1.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 26615001 bases of 831696438 (3.200%) in intersection
##############################################################################
# LASTZ Fugu Fr2 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Fugu fr2
# Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
SEQ2_DIR=/scratch/data/fr2/fr2.2bit
SEQ2_LEN=/hive/data/genomes/fr2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/fr2/noUn/fr2.scaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/fr2/noUn/fr2.scaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/fr2/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
> do.log 2>&1 &
# real 5797m9.288s
# had a small problem finishing the fundamental batch run, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
> cat.log 2>&1 &
cat fb.hg19.chainFr2Link.txt
# 49309456 bases of 2897316137 (1.702%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/fr2/bed/blastz.hg19.swap
cd /hive/data/genomes/fr2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
-swap > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:30:44 PDT 2009
- # real 723m41.377s
+ # real 25m8.491s
cat fb.fr2.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 42984130 bases of 393312790 (10.929%) in intersection
+
##############################################################################
# LASTZ Tetraodon TetNig1 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
cat << '_EOF_' > DEF
# human vs tetraodon
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/data/tetNig1/tetNig1.2bit
SEQ2_LEN=/hive/data/genomes/tetNig1/chrom.sizes
SEQ2_CHUNK=410000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 166m19.745s
cat fb.hg19.chainTetNig1Link.txt
# 58038079 bases of 2897316137 (2.003%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
cd /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-swap > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:33:01 PDT 2009
- # real 723m41.377s
+ # real 29m20.968s
cat fb.tetNig1.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 49453375 bases of 342403326 (14.443%) in intersection
##############################################################################
# LASTZ Stickleback GasAcu1 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Stickleback
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# TARGET: Stickleback gasAcu1
SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
SEQ2_LEN=/hive/data/genomes/gasAcu1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 174m40.659s
cat fb.hg19.chainGasAcu1Link.txt
# 55509003 bases of 2897316137 (1.916%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
cd /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-swap > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:08:17 PDT 2009
- # real 723m41.377s
+ # real 29m41.433s
cat fb.gasAcu1.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 49909819 bases of 446627861 (11.175%) in intersection
##############################################################################
# LASTZ Marmoset CalJac1 (DONE - 2009-05-14,22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
cat << '_EOF_' > DEF
# human vs. marmoset
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Marmoset (calJac1)
SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/scratch/data/calJac1/chrom.sizes
SEQ2_LIMIT=200
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 214m16.294s
cat fb.hg19.chainCalJac1Link.txt
# 2053025318 bases of 2897316137 (70.860%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 calJac1 > rbest.log 2>&1 &
# real 97m17.207s
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/calJac1/bed/blastz.hg19.swap
cd /hive/data/genomes/calJac1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14/DEF \
-swap \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> swap.log 2>&1 &
-XXX - running Tue Jun 2 11:17:56 PDT 2009
- # real 723m41.377s
+ # real 162m52.189s
cat fb.calJac1.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 2105959656 bases of 2929139385 (71.897%) in intersection
#########################################################################
# LASTZ Tarsier TarSyr1 (DONE - 2009-05-14,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Tarsier
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Tarsier
SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit
SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1724m48.032s
# need to load the chain table manually:
# mySQL error 1114: The table 'chainTarSyr1Link' is full
cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14/axtChain
wc -l *.tab
# 21882142 chain.tab
# 165017606 link.tab
# 186899748 total
awk '{print length($0)}' link.tab | sort | uniq -c | less
4 23
9 24
27 25
105 26
767 27
1401 28
5020 29
8472 30
24390 31
117666 32
264774 33
776095 34
1632393 35
2672187 36
7125988 37
16831901 38
34905113 39
45218159 40
31570706 41
13746548 42
5868689 43
2460114 44
1118556 45
420826 46
106674 47
36770 48
40719 49
36955 50
19389 51
5571 52
1557 53
61 54
time nice -n +19 hgsql -e "DROP TABLE chainTarSyr1Link;" hg19
cat << '_EOF_' | hgsql hg19
CREATE TABLE chainTarSyr1Link (
bin smallint(5) unsigned NOT NULL default 0,
tName varchar(255) NOT NULL default '',
tStart int(10) unsigned NOT NULL default 0,
tEnd int(10) unsigned NOT NULL default 0,
qStart int(10) unsigned NOT NULL default 0,
chainId int(10) unsigned NOT NULL default 0,
KEY tName (tName(16),bin),
KEY chainId (chainId)
) ENGINE=MyISAM max_rows=166000000 avg_row_length=42 pack_keys=1 CHARSET=latin1;
'_EOF_'
# << happy emacs
time nice -n +19 hgsql -e \
"load data local infile \"link.tab\" into table chainTarSyr1Link;" hg19
# real 157m0.230s
# the running the rest of loadUp.csh after the hgLoadChain
# real 26m8.263s
cat fb.hg19.chainTarSyr1Link.txt
# 1385797066 bases of 2897316137 (47.830%) in intersection
# Continuing:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-continue=download -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> download.log 2>&1 &
# real 48m6.573s
# ran the script on swarm to recover after hive outages
time doRecipBest.pl -buildDir=`pwd` hg19 tarSyr1 > rbest.log 2>&1 &
# real 404m0.201s
time doRecipBest.pl -continue=download -buildDir=`pwd` \
hg19 tarSyr1 > rbest.download.log 2>&1 &
#########################################################################
# LASTZ Bushbaby OtoGar1 (DONE - 2009-05-14,22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Bushbaby
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Bushbaby otoGar1 - single chunk big enough to run largest scaffold
SEQ2_DIR=/scratch/data/otoGar1/otoGar1.rmsk.2bit
SEQ2_LEN=/hive/data/genomes/otoGar1/chrom.sizes
SEQ2_LIMIT=200
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 762m56.055s
cat fb.hg19.chainOtoGar1Link.txt
# 1264492372 bases of 2897316137 (43.644%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 otoGar1 > rbest.log 2>&1 &
# real 271m39.925s
#########################################################################
# LASTZ Mouse lemur MicMur1 (DONE - 2009-05-14,26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Mouse lemur
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Mouse lemur
SEQ2_DIR=/hive/data/genomes/micMur1/bed/repeatMasker/micMur1.rmsk.2bit
SEQ2_LEN=/hive/data/genomes/micMur1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 5429m52.082s
# there is one unusual long running job having trouble
# continuing after finishing the lastz run manually:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-continue=cat -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> cat.log 2>&1 &
# real 388m25.032s
cat fb.hg19.chainMicMur1Link.txt
# 1347792207 bases of 2897316137 (46.519%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 micMur1 > rbest.log 2>&1
# about 4h30m
#########################################################################
# LASTZ Baboon PapHam1 (DONE - 2009-05-20,22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
cd /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
cat << '_EOF_' > DEF
# human vs baboon
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Baboon papHam1
SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit
SEQ2_LEN=/scratch/data/papHam1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# forgot that the synNet was not needed here, use recip best as below
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
cat fb.hg19.chainPapHam1Link.txt
# 2399269031 bases of 2897316137 (82.810%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 papHam1 > rbest.log 2>&1
# real 182m0.276s
#########################################################################
# SGP GENES (DONE - 2009-05-22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/sgpGene
cd /hive/data/genomes/hg19/bed/sgpGene
mkdir download
cd download
for C in `cut -f1 ../../../chrom.sizes`
do
echo $C
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.gtf
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.prot
done
cd ..
cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 sgpGene stdin
# Read 33994 transcripts in 291782 lines in 1 files
# 33994 groups 85 seqs 1 sources 3 feature types
# 33994 gene predictions
nice -n +19 featureBits -enrichment hg19 refGene:CDS sgpGene
# refGene:CDS 1.181%, sgpGene 1.295%, both 1.011%, cover 85.59%, enrich 66.08x
###########################################################################
# GENEID GENE PREDICTIONS (DONE - 2009-05-22 - Hiram)
ssh hgwdev
mkdir /hive/data/genomes/hg19/bed/geneid
cd /hive/data/genomes/hg19/bed/geneid
mkdir download
cd download
for C in `cut -f1 ../../../chrom.sizes`
do
echo $C
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.gtf
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.prot
done
cd ..
cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 geneid stdin
# Read 33428 transcripts in 277332 lines in 1 files
# 33428 groups 92 seqs 1 sources 3 feature types
# 33428 gene predictions
##########################################################################
## 4-Way Multiz for UCSC Genes construction (DONE - 2009-05-22 - Hiram)
ssh hgwdev
mkdir /hive/data/genomes/hg19/bed/multiz4way
cd /hive/data/genomes/hg19/bed/multiz4way
# extract our 4 organisms from the 44-way on hg18:
ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh
/cluster/bin/phast/tree_doctor \
--prune-all-but hg18,mm9,canFam2,rheMac2 44way.nh \
| sed -e "s/hg18/hg19/" > 4way.nh
# this looks like:
cat 4way.nh
(((hg19:0.032973,rheMac2:0.036199):0.109706,mm9:0.352605):0.020666,canFam2:0.193569);
# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a gif image for htdocs/images/phylo/hg19_4way.gif
/cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
# Use this output to create the table below
grep -y hg19 4way.distances.txt | sort -k3,3n
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# featureBits chainLink measures
# chainHg19Link chain linearGap
# distance on hg19 on other minScore
# 1 0.069172 - rhesus rheMac2 (% 82.744) (% xx.xxx) 5000 medium
# 2 0.356914 - dog canFam2 (% 52.879) (% xx.xxx) 3000 medium
# 3 0.495284 - mouse mm9 (% 35.299) (% 38.693) 3000 medium
# using the syntenic nets
cd /cluster/data/hg19/bed/multiz4way
mkdir mafLinks
cd mafLinks
mkdir rheMac2 canFam2 mm9
cd mm9
ln -s ../../../lastz.mm9/mafSynNet/*.maf.gz .
cd ../canFam2
ln -s ../../../lastz.canFam2/mafSynNet/*.maf.gz .
cd ../rheMac2
ln -s ../../../lastz.rheMac2/mafSynNet/*.maf.gz .
# determine what is the newest version of multiz and use that
cd /hive/data/genomes/hg19/bed/multiz4way
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn
# the autoMultiz cluster run
ssh swarm
cd /hive/data/genomes/hg19/bed/multiz4way
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
4way.nh > tmp.nh
echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
mkdir run maf
cd run
# NOTE: you need to set the db and multiz dirname properly in this script
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = hg19
set c = $1
set maf = $2
set binDir = /hive/data/genomes/hg19/bed/multiz4way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg19/bed/multiz4way/mafLinks
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz4way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/hg19/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
# 93 jobs
para try ... check ... push ... etc ...
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 24282s 404.70m 6.75h 0.28d 0.001 y
# IO & Wait Time: 2362s 39.36m 0.66h 0.03d 0.000 y
# Average job time: 286s 4.77m 0.08h 0.00d
# Longest finished job: 2235s 37.25m 0.62h 0.03d
# Submission to last job: 2241s 37.35m 0.62h 0.03d
# combine results into a single file for loading and gbdb reference
cd /hive/data/genomes/hg19/bed/multiz4way
time nice -n +19 catDir maf > multiz4way.maf
# real 3m27.561s
# makes a 8.5 Gb file:
# -rw-rw-r-- 1 9026080732 May 22 11:11 multiz4way.maf
# Load into database
ssh hgwdev
cd /hive/data/genomes/hg19/bed/multiz4way
mkdir /gbdb/hg19/multiz4way
ln -s /hive/data/genomes/hg19/bed/multiz4way/multiz4way.maf \
/gbdb/hg19/multiz4way
# the hgLoadMaf generates huge tmp files, locate them in /scratch/tmp/
cd /scratch/tmp
time nice -n +19 hgLoadMaf hg19 multiz4way
# real 5m31.883s
# Loaded 5788627 mafs in 1 files from /gbdb/hg19/multiz4way
cd /hive/data/genomes/hg19/bed/multiz4way
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 hg19 multiz4waySummary multiz4way.maf
# Created 1238721 summary blocks from 11959676 components
# and 5788627 mafs from multiz4way.maf
# real 6m33.936s
#########################################################################
# LASTZ Medaka OryLat2 (DONE - 2009-05-22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
cd /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
cat << '_EOF_' > DEF
# Human vs. Medaka
# typical parameters for a genome that is distant from human
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ2_LEN=/hive/data/genomes/oryLat2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 124m5.298s
cat fb.hg19.chainOryLat2Link.txt
# 53571737 bases of 2897316137 (1.849%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
cd /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-swap > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:08:17 PDT 2009
- # real 723m41.377s
+ # real 28m35.174s
cat fb.oryLat2.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 46961818 bases of 700386597 (6.705%) in intersection
##############################################################################
# LASTZ Opossum MonDom5 (DONE - 2009-05-23,29 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
cd /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
cat << '_EOF_' > DEF
# human vs. opossum
# settings for more distant organism alignments
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Opossum monDom5
SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# One job took a long time to complete, had to run it manually on
# swarm:
# /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
# /scratch/data/hg19/hg19.2bit:chr19:50000000-59128983 \
# /scratch/data/monDom5/monDom5.2bit:chr4:390000000-420000000 \
# ../DEF \
# ../psl/hg19.2bit:chr19:50000000-59128983/hg19.2bit:chr19:50000000-59128983_monDom5.2bit:chr4:390000000-420000000.psl
# took about 48 hours, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=cat > cat.log 2>&1 &
# real 1508m18.471s == about 25h08m
cat fb.hg19.chainMonDom5Link.txt
# 415997117 bases of 2897316137 (14.358%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
# real 20m29.049s
##############################################################################
# LASTZ Armadillo DasNov2 (DONE - 2009-05-23,28 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
cd /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
cat << '_EOF_' > DEF
# Human vs. Armadillo
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Armadillo
SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit
SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# finished the lastz run manually after hive maintenance outages
# then, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-continue=cat > cat.log 2>&1 &
# real 458m11.304s
cat fb.hg19.chainDasNov2Link.txt
# 971847303 bases of 2897316137 (33.543%) in intersection
time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 dasNov2 \
> rbest.log 2>&1
# time about 6h30m
##############################################################################
# LASTZ Rock Hyrax ProCap1 (DONE - 2009-05-23,26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
cd /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
cat << '_EOF_' > DEF
# Human vs. Rock Hyrax
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Rock Hyrax
SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit
SEQ2_LEN=/scratch/data/proCap1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# Completed: 997438 of 997438 jobs
# CPU time in finished jobs: 32830587s 547176.45m 9119.61h 379.98d 1.041 y
# IO & Wait Time: 9549484s 159158.07m 2652.63h 110.53d 0.303 y
# Average job time: 42s 0.71m 0.01h 0.00d
# Longest finished job: 1953s 32.55m 0.54h 0.02d
# Submission to last job: 67216s 1120.27m 18.67h 0.78d
# finished lastz run manually, then continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=cat > cat.log 2>&1 &
# real 369m1.678s
cat fb.hg19.chainProCap1Link.txt
# 894221652 bases of 2897316137 (30.864%) in intersection
time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 proCap1 \
> rbest.log 2>&1
# real 251m59.549s
##############################################################################
# LASTZ Zebra Finch TaeGut1 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
cat << '_EOF_' > DEF
# human vs Zebra Finch
# distant from Human settings
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebra Finch taeGut1 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-qRepeats=windowmaskerSdust > do.log 2>&1 &
cat fb.hg19.chainTaeGut1Link.txt
# real 192m48.479s
# 101295490 bases of 2897316137 (3.496%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=syntenicNet -qRepeats=windowmaskerSdust > synNet.log 2>&1 &
# real 4m10.261s
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
cd /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26/DEF \
-swap -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-qRepeats=windowmaskerSdust > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:25:44 PDT 2009
- # real 723m41.377s
+ # real real 16m45.080s
cat fb.taeGut1.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 95320369 bases of 1222864691 (7.795%) in intersection
##############################################################################
# LASTZ Lizard AnoCar1 (DONE - 2009-05-30,31 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
cd /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
cat << '_EOF_' > DEF
# human vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Lizard anoCar1
SEQ2_DIR=/scratch/data/anoCar1/anoCar1.2bit
SEQ2_LEN=/scratch/data/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-qRepeats=windowmaskerSdust > do.log 2>&1 &
# real 168m32.016s
cat fb.hg19.chainAnoCar1Link.txt
# 104045950 bases of 2897316137 (3.591%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 anoCar1 > rbest.log 2>&1
# real 45m58.001s
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
cd /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:26:55 PDT 2009
- # real 723m41.377s
+ # real 34m55.857s
cat fb.anoCar1.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 89608316 bases of 1741478929 (5.146%) in intersection
##############################################################################
# LASTZ X. tropicalis XenTro2 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
cat << '_EOF_' > DEF
# human vs X. tropicalis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Lizard anoCar1
SEQ2_DIR=/scratch/data/xenTro2/xenTro2.2bit
SEQ2_LEN=/scratch/data/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1129m11.568s
# finished the lastz run manually after hive difficulties, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=cat > cat.log 2>&1 &
# time about 1h30m
cat fb.hg19.chainXenTro2Link.txt
# 92015242 bases of 2897316137 (3.176%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
cd /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:28:06 PDT 2009
- # real 723m41.377s
+ # real 130m53.860s
cat fb.xenTro2.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 92070065 bases of 1359412157 (6.773%) in intersection
##############################################################################
# LASTZ Zebrafish DanRer5 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
cat << '_EOF_' > DEF
# human vs X. zebrafish
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish danRer5
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/scratch/data/danRer5/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=40
BASE=/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 311m39.817s
cat fb.hg19.chainDanRer5Link.txt
# 74229561 bases of 2897316137 (2.562%) in intersection
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/danRer5/bed/blastz.hg19.swap
cd /hive/data/genomes/danRer5/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
-XXX - running Tue Jun 2 11:31:49 PDT 2009
- # real 723m41.377s
+ # real 26m54.605s
cat fb.danRer5.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 73852780 bases of 1435609608 (5.144%) in intersection
##############################################################################
# LASTZ Platypus OrnAna1 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
cat << '_EOF_' > DEF
# human vs platypus
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Platypus ornAna1
SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 572m18.808s
cat fb.hg19.chainOrnAna1Link.txt
# 220977689 bases of 2897316137 (7.627%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 ornAna1 > rbest.log 2>&1
# time about 1h32m
- # running the swap
+ # running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
cd /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26/DEF \
-swap -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> swap.log 2>&1 &
-XXX - running Tue Jun 2 11:23:00 PDT 2009
- # real 723m41.377s
+ # real 146m52.638s
cat fb.ornAna1.chainHg19Link.txt
- # 2761343871 bases of 2909485072 (94.908%) in intersection
+ # 207415519 bases of 1842236818 (11.259%) in intersection
##############################################################################
# LASTZ Elephant LoxAfr2 (DONE - 2009-05-27,29 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
cd /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
cat << '_EOF_' > DEF
# Human vs. Elephant
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Elephant
SEQ2_DIR=/scratch/data/loxAfr2/loxAfr2.2bit
SEQ2_LEN=/scratch/data/loxAfr2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# time about 3h23m
cat fb.hg19.chainLoxAfr2Link.txt
# 1018502258 bases of 2897316137 (35.153%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr2 > rbest.log 2>&1
# real 322m37.502s
##############################################################################
# LASTZ Tenrec EchTel1 (DONE - 2009-05-27 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
cd /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
cat << '_EOF_' > DEF
# Human vs. Tenrec
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tenrec
SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/data/echTel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1153m34.595s
cat fb.hg19.chainEchTel1Link.txt
# 669856841 bases of 2897316137 (23.120%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 echTel1 > rbest.log 2>&1
# time about 7h13m
##############################################################################
-# LASTZ Tree Shrew TupBel1 (WORKING - 2009-05-27 - Hiram)
+# LASTZ Tree Shrew TupBel1 (DONE - 2009-05-27,06-02 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
cat << '_EOF_' > DEF
# Human vs. Tree Shrew
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tree Shrew
SEQ2_DIR=/scratch/data/tupBel1/tupBel1.2bit
SEQ2_LEN=/scratch/data/tupBel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
> do.log 2>&1 &
# real 811m54.095s
# having trouble with pk, finished manually
# XXX there is one job that is taking forever ...
# finished it in pieces on swarm in a few minutes, like this:
mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
#!/bin/sh
S=100000000
E=101010000
export S E
for I in 0 1 2 3 4 5 6 7 8 9
do
echo $S $E
/usr/bin/time -p /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
/scratch/data/hg19/nib/chr1.nib:chr1:${S}-${E} ../qParts/part019.lst \
../../DEF psl/chr1.nib:chr1:${S}-${E}_part019.lst.psl
nextS=`echo $S | awk '{printf "%d", $1 + 1000000}'`
nextE=`echo $E | awk '{printf "%d", $1 + 1000000}'`
S=$nextS
E=$nextE
done
grep -h "^#" psl/chr* | sort -u > result.psl
grep -h -v "^#" psl/chr* | sort -k14,14 -k16,16n >> result.psl
cp -p result.psl \
../../psl/chr1.nib:chr1:100000000-110010000/chr1.nib:chr1:100000000-110010000_part019.lst.psl
# then, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-continue=cat > cat.log 2>&1 &
# real 212m22.707s
time doRecipBest.pl -buildDir=`pwd` hg19 tupBel1 > rbest.log 2>&1
-XXX - running Tue Jun 2 10:15:24 PDT 2009
+ # time about 4h22m
##############################################################################
# LASTZ Shrew SorAra1 (DONE - 2009-05-28,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
cd /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
cat << '_EOF_' > DEF
# Human vs. Shrew
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Shrew
SEQ2_DIR=/scratch/data/sorAra1/sorAra1.2bit
SEQ2_LEN=/scratch/data/sorAra1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# time about 23h26m
cat fb.hg19.chainSorAra1Link.txt
# 572519288 bases of 2897316137 (19.760%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 sorAra1 > rbest.log 2>&1
# real 251m20.055s
##############################################################################
# LASTZ Rabbit OryCun1 (DONE - 2009-05-28,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
cd /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
cat << '_EOF_' > DEF
# Human vs. Rabbit
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rabbit
SEQ2_DIR=/scratch/data/oryCun1/oryCun1.2bit
SEQ2_LEN=/scratch/data/oryCun1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# time about 23h09m
cat fb.hg19.chainOryCun1Link.txt
# 975693323 bases of 2897316137 (33.676%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 oryCun1 > rbest.log 2>&1
# real 318m1.142s
##############################################################################
# LASTZ Hedgehog EriEur1 (DONE - 2009-05-28,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
cd /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
cat << '_EOF_' > DEF
# Human vs. Hedgehog
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Hedgehog
SEQ2_DIR=/scratch/data/eriEur1/eriEur1.2bit
SEQ2_LEN=/scratch/data/eriEur1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=500
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 2043m33.198s
cat fb.hg19.chainEriEur1Link.txt
# 560965051 bases of 2897316137 (19.362%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 eriEur1 > rbest.log 2>&1
# real 350m17.737s
##############################################################################
# LASTZ Pika OchPri2 (DONE - 2009-05-29,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
cd /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
cat << '_EOF_' > DEF
# Human vs. Pika
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Pika
SEQ2_DIR=/scratch/data/ochPri2/ochPri2.2bit
SEQ2_LEN=/scratch/data/ochPri2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 393m42.569s
cat fb.hg19.chainOchPri2Link.txt
# 804516397 bases of 2897316137 (27.768%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 ochPri2 > rbest.log 2>&1
# real 224m47.979s
##############################################################################
# LASTZ Kangaroo Rat DipOrd1 (DONE - 2009-05-29,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
cd /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
cat << '_EOF_' > DEF
# Human vs. Kangaroo Rat
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Kangaroo Rat
SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit
SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 688m47.595s
time doRecipBest.pl -buildDir=`pwd` hg19 dipOrd1 > rbest.log 2>&1
# real 140m42.014s
##############################################################################