src/hg/makeDb/doc/hg19.txt 1.49
1.49 2009/10/21 18:34:58 hiram
done with phastCons runs for the 46-way
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.48
retrieving revision 1.49
diff -b -B -U 1000000 -r1.48 -r1.49
--- src/hg/makeDb/doc/hg19.txt 16 Oct 2009 17:17:44 -0000 1.48
+++ src/hg/makeDb/doc/hg19.txt 21 Oct 2009 18:34:58 -0000 1.49
@@ -1,6618 +1,7192 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 37 (February 2009 freeze) aka:
# GRCh37 - Genome Reference Consortium Human Reference 37
# Assembly Accession: GCA_000001405.1
# "$Id$";
#############################################################################
# NOTE FOR NEXT HUMAN ASSEMBLY (2009-07-29 - Brooke): hg19 contains the wrong
# sequence for chrM. The accession NC_001807 was replaced in GenBank with
# NC_012920, with the note: "This sequence was removed since the accepted
# reference sequence for the Homo sapiens mitochondrion is the rCRS/Mitomap
# sequence, which is now available as the record NC_012920".
# Also, from http://www.mitomap.org/mitoseq.html:
# "IMPORTANT: Do not use NC_001807 as "the rCRS" as it is an African
# (Yoruban) sequence with over 40 variant nucleotides from the rCRS. As of
# July 8, 2009 it has been removed from GenBank as a reference sequence but
# may be found, if needed, as AF347015, one of 53 African sequence deposited
# in Genbank by Ingman et al in 2001."
# Use NC_012920 for the chrM sequence for the next build!
# Download sequence (DONE - 2009-02-04 - Hiram)
# Download the GRCh37 source sequence from NCBI, one directory per
# assembly unit (assembled chromosomes, alternate loci, unlocalized,
# unplaced, placed scaffolds).
mkdir -p /hive/data/genomes/hg19/download
cd /hive/data/genomes/hg19/download
mkdir -p assembled_chromosomes
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=assembled_chromosomes \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/assembled_chromosomes
# the nine alternate haplotype loci each live in their own ALT_REF_LOCI_N dir
mkdir -p alternate_loci
for N in 1 2 3 4 5 6 7 8 9
do
wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=alternate_loci \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/ALT_REF_LOCI_${N}
done
mkdir -p unlocalized_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=unlocalized_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unlocalized_scaffolds
mkdir -p unplaced_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=unplaced_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unplaced_scaffolds
mkdir -p placed_scaffolds
# use the same placeholder FTP password as the other fetches above
# (this command previously recorded a personal email address)
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=placed_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/placed_scaffolds
# Convert NCBI naming (CM accessions) to UCSC chrN naming for the
# assembled chromosomes, producing a renamed .agp and .fa per chromosome.
mkdir ucscChr
cd ucscChr
for F in ../assembled_chromosomes/FASTA/chr*.fa
do
C=`basename $F`
C=${C/.fa}
echo -n "${C} "
# parse the FASTA header line for the UCSC chrom name and the NCBI accession
H=`head -1 "${F}"`
chrN=`echo $H | sed -e "s/.*Homo sapiens chromosome /chr/; s/, .*//"`
A=`echo $H | sed -e "s/. Homo.*//; s/.*gb.//"`
echo $chrN $A
# rewrite the accession to the chrN name in the component AGP
grep -v "^#" ../assembled_chromosomes/AGP/${chrN}.comp.agp \
| sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
# re-header the FASTA with the UCSC name, keep the sequence lines as-is
echo ">${chrN}" > ${chrN}.fa
grep -v "^>" ../assembled_chromosomes/FASTA/${chrN}.fa >> ${chrN}.fa
done
# Build a single AGP covering all scaffolds, renaming GL accessions to
# UCSC names: alternate loci -> chr*_*_hap*, unlocalized -> chrN_gl*_random,
# unplaced -> chrUn_gl*.  The FASTA renaming below must match these names.
rm -f scaffolds.agp
find ../alternate_loci -type f | grep ".agp$" | while read F
do
grep "^GL" $F | sed -e \
"s/^GL000250.1/chr6_apd_hap1/" -e \
"s/^GL000251.1/chr6_cox_hap2/" -e \
"s/^GL000252.1/chr6_dbb_hap3/" -e \
"s/^GL000253.1/chr6_mann_hap4/" -e \
"s/^GL000254.1/chr6_mcf_hap5/" -e \
"s/^GL000255.1/chr6_qbl_hap6/" -e \
"s/^GL000256.1/chr6_ssto_hap7/" -e \
"s/^GL000257.1/chr4_ctg9_hap1/" -e \
"s/^GL000258.1/chr17_ctg5_hap1/"
done > scaffolds.agp
find ../unlocalized_scaffolds -type f | grep ".agp$" \
| while read F
do
# chromosome name comes from the AGP file name, e.g. chr1.unlocalized.scaf.agp
C=`basename ${F}`
C=${C/.unlocalized.scaf.agp}
grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/${C}_gl\1_random/"
done >> scaffolds.agp
find ../unplaced_scaffolds -type f | grep ".agp$" \
| while read F
do
grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/chrUn_gl\1/"
done >> scaffolds.agp
# Build a single FASTA for all scaffolds, with UCSC names that must match
# the scaffolds.agp renaming above.
rm -f scaffolds.fa
find ../alternate_loci -type f | grep ".fa$" | while read F
do
# NOTE: GL000256 is chr6_ssto_hap7 (was mistakenly written hap6 here,
# disagreeing with the AGP renaming above, which uses hap7)
sed -e \
"s/>.*GL000250.*/>chr6_apd_hap1/" -e \
"s/>.*GL000251.*/>chr6_cox_hap2/" -e \
"s/>.*GL000252.*/>chr6_dbb_hap3/" -e \
"s/>.*GL000253.*/>chr6_mann_hap4/" -e \
"s/>.*GL000254.*/>chr6_mcf_hap5/" -e \
"s/>.*GL000255.*/>chr6_qbl_hap6/" -e \
"s/>.*GL000256.*/>chr6_ssto_hap7/" -e \
"s/>.*GL000257.*/>chr4_ctg9_hap1/" -e \
"s/>.*GL000258.*/>chr17_ctg5_hap1/" ${F}
done > scaffolds.fa
find ../unlocalized_scaffolds -type f | grep ".fa$" | while read F
do
# header carries both the GL accession and the chromosome number
sed -e \
"s/^>.*GL\([0-9]*\).* chromosome \([0-9]*\).*/>chr\2_gl\1_random/" ${F}
done >> scaffolds.fa
find ../unplaced_scaffolds -type f | grep ".fa$" | while read F
do
sed -e "s/.*\(GL[0-9]*\).*/\1/; s/GL/>chrUn_gl/" $F
done >> scaffolds.fa
############################################################################
## Create database (DONE - 2009-03-04 - Hiram)
cd /hive/data/genomes/hg19
# makeGenomeDb.pl config.  NOTE: mitoAcc NC_001807 is the wrong chrM --
# see the warning at the top of this file; use NC_012920 next time.
cat << '_EOF_' > hg19.config.ra
# Config parameters for makeGenomeDb.pl:
db hg19
scientificName Homo sapiens
commonName Human
assemblyDate Feb. 2009
assemblyLabel GRCh37 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
orderKey 14
mitoAcc NC_001807
fastaFiles /hive/data/genomes/hg19/download/ucscChr/*.fa
agpFiles /hive/data/genomes/hg19/download/ucscChr/*.agp
# qualFiles /dev/null
dbDbSpeciesDir human
taxId 9606
'_EOF_'
# << happy emacs
time makeGenomeDb.pl hg19.config.ra > makeGenomeDb.log 2>&1
# real 14m8.958s
featureBits -countGaps hg19 gap
# 239845127 bases of 3137161264 (7.645%) in intersection
featureBits -noRandom -noHap -countGaps hg19 gap
# 234344806 bases of 3095693983 (7.570%) in intersection
# verify featureBits is properly ignoring haps and randoms:
egrep -v "_" chrom.sizes | awk '{sum+=$2;print sum,$0}'
# 3095693983 chrM 16571
# same total as in featureBits
# much later on, discovered that we needed a chrM definition in the
# agp files, added by hand to hg19/M/chrM.agp and hg19/hg19.agp the line:
# chrM 1 16571 1 F NC001807 1 16571 +
# the spaces there are tabs
############################################################################
# running repeat masker (DONE - 2009-03-05 - Hiram)
screen # use screen to manage this day-long job
mkdir /hive/data/genomes/hg19/bed/repeatMasker
cd /hive/data/genomes/hg19/bed/repeatMasker
time doRepeatMasker.pl -bigClusterHub=swarm -buildDir=`pwd` hg19 \
> do.log 2>&1
# real 525m23.521s
cat faSize.rmsk.txt
# 3137161264 bases (239850802 N's 2897310462 real 1431585691
# upper 1465724771 lower) in 93 sequences in 1 files
# %46.72 masked total, %50.59 masked real
featureBits -countGaps hg19 rmsk
# 1465724774 bases of 3137161264 (46.721%) in intersection
# this is odd, 3 bases more in featureBits than were masked ?
# check it out, make a bed file from the featureBits:
featureBits -countGaps -bed=rmsk.bed hg19 rmsk
# went down a sequence of intersections with this idea, but could
# not get it resolved. It appears there are 75 bases in the rmsk
# table that were not masked in the 2bit file ?
# Later on, realized that featureBits does not count lower case N's
# in the "lower" category, but only in the N's category.
# trying a non-split table: drop the per-chrom _rmsk tables first
hgsql -e "show tables;" hg19 | grep _rmsk | while read T
do
hgsql -e "drop table ${T};" hg19
done
hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.fa.out
# hgLoadOut reported (program output, commented so this doc stays sh-clean):
# bad rep range [4385, 4384] line 1348605 of hg19.fa.out
# bad rep range [5563, 5562] line 1563988 of hg19.fa.out
# bad rep range [4539, 4538] line 3111186 of hg19.fa.out
# featureBits still reports 1465724774 bases in rmsk table
# cleaning the hg19.fa.out file:
cp hg19.fa.out hg19.clean.out
# edit hg19.clean.out and remove the three lines:
# 1467 20.7 1.2 17.6 chr14 35056767 35056794 (72292746) + L1ME1 LINE/L1 4385 4384 (1761) 1120962
# 1943 23.8 5.0 12.6 chr15 65775909 65775924 (36755468) + L1MC4 LINE/L1 5563 5562 (2480) 1299299
# 2463 25.1 5.0 11.6 chr3 121291056 121291083 (76731347) + L1M3 LINE/L1 4539 4538 (1608) 2589267
# reload the table
hgsql -e "drop table rmsk;" hg19
hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.clean.out
# try masking with this clean file:
twoBitMask /hive/data/genomes/hg19/hg19.unmasked.2bit hg19.clean.out \
hg19.clean.2bit
twoBitToFa hg19.clean.2bit stdout | faSize stdin > faSize.clean.txt
cat faSize.clean.txt
# this gives the lower by 75 bases result:
# 3137161264 bases (239850802 N's 2897310462 real 1431585763 upper
# 1465724699 lower) in 93 sequences in 1 files
# %46.72 masked total, %50.59 masked real
featureBits -countGaps hg19 rmsk
# 1465724774 bases of 3137161264 (46.721%) in intersection
# is the countGaps interferring ?
featureBits hg19 rmsk
# 1465724774 bases of 2897316137 (50.589%) in intersection
# nope, lets' see what the .out file has:
grep chr hg19.clean.out | sed -e "s/^ *//" | awk '{print $5,$6-1,$7}' \
| sort -k1,1 -k2,2n > hg19.clean.out.bed
featureBits -countGaps hg19 hg19.clean.out.bed
# 1465724774 bases of 3137161264 (46.721%) in intersection
# is it perhaps not masking N's ?
twoBitToFa hg19.clean.2bit stdout | grep n | less
# that does find some lower case n's, find all N's:
findMotif -strand=+ -motif=gattaca -verbose=4 hg19.clean.2bit \
2> findMotif.out
grep "^#GAP" findMotif.out | sed -e "s/#GAP //" > nLocations.bed
# which cover:
featureBits -countGaps hg19 nLocations.bed
# 251299071 bases of 3137161264 (8.010%) in intersection
# overlapping rmsk business with these N locations:
featureBits -countGaps hg19 hg19.clean.out.bed nLocations.bed
# 6494740 bases of 3137161264 (0.207%) in intersection
# and overlapping with gap:
featureBits -countGaps hg19 gap nLocations.bed
# 239845127 bases of 3137161264 (7.645%) in intersection
############################################################################
# running TRF simple repeats (DONE - 2009-03-05 - Hiram)
screen # use screen to manage this day-long job
mkdir /hive/data/genomes/hg19/bed/simpleRepeat
cd /hive/data/genomes/hg19/bed/simpleRepeat
time doSimpleRepeat.pl -bigClusterHub=pk -workhorse=hgwdev \
-smallClusterHub=pk -buildDir=`pwd` hg19 > do.log 2>&1
# real 33m25.815s
# add the TRF mask on top of the RM-cleaned 2bit to produce the final
# masked hg19.2bit (paths are relative to /hive/data/genomes/hg19)
twoBitMask bed/repeatMasker/hg19.clean.2bit \
-add bed/simpleRepeat/trfMask.bed hg19.2bit
twoBitToFa hg19.2bit stdout | faSize stdin > faSize.hg19.2bit.txt
# 3137161264 bases (239850802 N's 2897310462 real 1430387259 upper
# 1466923203 lower) in 93 sequences in 1 files
# %46.76 masked total, %50.63 masked real
############################################################################
# prepare cluster data (DONE - 2009-03-06 - Hiram)
cd /hive/data/genomes/hg19
rm /gbdb/hg19/hg19.2bit
ln -s `pwd`/hg19.2bit /gbdb/hg19/hg19.2bit
# build the 11-mer overused-tile file used by blat/genbank alignments
time blat hg19.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 30675 overused 11-mers to 11.ooc
# real 3m11.302s
mkdir /hive/data/staging/data/hg19
cp -p hg19.2bit /hive/data/staging/data/hg19
cp -p 11.ooc /hive/data/staging/data/hg19
cp -p chrom.sizes /hive/data/staging/data/hg19
# split the genome three ways: primary chroms, haplotypes, scaffolds
mkdir separateChrs
cd separateChrs
grep -v "_" ../chrom.sizes | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.chrOnly.2bit
twoBitInfo hg19.chrOnly.2bit stdout | sort -k2,2nr > chrOnly.chrom.sizes
grep "_hap" ../chrom.sizes | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.hapOnly.2bit
twoBitInfo hg19.hapOnly.2bit stdout | sort -k2,2nr > hapOnly.chrom.sizes
grep "_" ../chrom.sizes | grep -v "_hap" | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.scaffolds.2bit
twoBitInfo hg19.scaffolds.2bit stdout | sort -k2,2nr > scaffolds.chrom.sizes
cp -p *.2bit *.sizes /hive/data/staging/data/hg19
# ask admin to sync this directory: /hive/data/staging/data/hg19/
# to the kluster nodes /scratch/data/hg19/
############################################################################
# running cpgIsland business (DONE - 2009-03-06 - Hiram)
mkdir /hive/data/genomes/hg19/bed/cpgIsland
cd /hive/data/genomes/hg19/bed/cpgIsland
cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
# comment out the following two lines if it compiles cleanly
# some day (there were some other fixups too, adding include lines)
sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
mv tmp.c cpg_lh.c
make
cd ../../
ln -s hg3rdParty/cpgIslands/cpglh.exe
# cpglh wants hard-masked sequence: one hard-masked fa per chrom
mkdir -p hardMaskedFa
cut -f1 ../../chrom.sizes | while read C
do
echo ${C}
twoBitToFa ../../hg19.2bit:$C stdout \
| maskOutFa stdin hard hardMaskedFa/${C}.fa
done
cut -f1 ../../chrom.sizes > chr.list
# parasol batch: one cpglh job per chromosome
cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out line results/$(root1).cpg}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > runOne
#!/bin/csh -fe
./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
mv /scratch/tmp/$1.$$ $2
'_EOF_'
# << happy emacs
gensub2 chr.list single template jobList
para create jobList
para try
para check ... etc
para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs:        172s       2.86m     0.05h    0.00d  0.000 y
# IO & Wait Time:                  1748s      29.14m     0.49h    0.02d  0.000 y
# Average job time:                  21s       0.34m     0.01h    0.00d
# Longest finished job:              34s       0.57m     0.01h    0.00d
# Submission to last job:            83s       1.38m     0.02h    0.00d
# Transform cpglh output to bed +
catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed
cd /hive/data/genomes/hg19/bed/cpgIsland
hgLoadBed hg19 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 28226 elements of size 10
# Sorted
# Saving bed.tab
# Loading hg19
############################################################################
# create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram)
mkdir /hive/data/genomes/hg19/bed/gap
cd /hive/data/genomes/hg19/bed/gap
# the lift file lets the genbank pipeline split chroms at unbridged gaps
gapToLift hg19 hg19.unBridged.lift -bedFile=unBridged.lift.bed
cp -p hg19.unBridged.lift ../../jkStuff
cp -p hg19.unBridged.lift /hive/data/staging/data/hg19
############################################################################
# AUTO UPDATE GENBANK RUN  (DONE - 2009-03-07,13 - Hiram)
# align with latest genbank process.
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add hg19 just after hg18
# (the following lines are genbank.conf file content, not shell commands)
# hg19 - GRCh37 - Genome Reference Consortium Human Reference 37
# Assembly Accession: GCA_000001405.1
hg19.serverGenome = /hive/data/genomes/hg19/hg19.2bit
hg19.clusterGenome = /scratch/data/hg19/hg19.2bit
hg19.ooc = /scratch/data/hg19/11.ooc
hg19.lift = /hive/data/genomes/hg19/jkStuff/hg19.unBridged.lift
# hg19.hapRegions = /hive/data/genomes/hg19/bed/haplotypePos/haplotypePos.psl
hg19.refseq.mrna.native.pslCDnaFilter    = ${finished.refseq.mrna.native.pslCDnaFilter}
hg19.refseq.mrna.xeno.pslCDnaFilter      = ${finished.refseq.mrna.xeno.pslCDnaFilter}
hg19.genbank.mrna.native.pslCDnaFilter   = ${finished.genbank.mrna.native.pslCDnaFilter}
hg19.genbank.mrna.xeno.pslCDnaFilter     = ${finished.genbank.mrna.xeno.pslCDnaFilter}
hg19.genbank.est.native.pslCDnaFilter    = ${finished.genbank.est.native.pslCDnaFilter}
hg19.genbank.est.xeno.pslCDnaFilter      = ${finished.genbank.est.xeno.pslCDnaFilter}
hg19.genbank.est.xeno.load = yes
hg19.refseq.mrna.xeno.load  = yes
hg19.refseq.mrna.xeno.loadDesc = yes
hg19.mgc = yes
hg19.orfeome = yes
hg19.downloadDir = hg19
# hg19.ccds.ncbiBuild = 36.3
# hg19.upstreamGeneTbl = refGene
# hg19.upstreamMaf = multiz28way
# /hive/data/genomes/hg19/bed/multiz28way/species.lst multiz44way
# /hive/data/genomes/hg19/bed/multiz44way/species.list
hg19.genbank.mrna.blatTargetDb = yes
cvs ci -m "Added hg19." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen          #  use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial hg19 &
# logFile: var/build/logs/2009.03.10-20:28:44.hg19.initalign.log
# real 2761m13.680s
# that ran on the swarm with little interference and no problems
# load database when finished
ssh hgwdev
screen	# use screen to manage this long running command
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19 &
# logFile: var/dbload/hgwdev/logs/2009.03.12-21:10:02.dbload.log
# real 369m11.941s
# enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add hg19 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added hg19 - Human - GRCh37" etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# BLATSERVERS ENTRY (DONE - 2009-03-09 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
# two rows: port 17778 with isTrans=1, port 17779 with canPcr=1
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("hg19", "blat13", "17778", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("hg19", "blat13", "17779", "0", "1");' \
hgcentraltest
#	test it with some sequence
############################################################################
# Making download files (DONE - 2009-03-13 - Hiram)
cd /hive/data/genomes/hg19
# builds the goldenPath download hierarchy for this assembly
makeDownloads.pl -allowMissedTrfs -noChromRoot hg19 \
> downloads.log 2>&1
############################################################################
# Venter1 chain, net experiment (DONE - Hiram - 2009-03-15)
# stepwise doBlastzChainNet run: partition, blastz, cat..net, load, synNet
doBlastzChainNet.pl `pwd`/DEF \
-stop=partition -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > partition.log 2>&1
doBlastzChainNet.pl `pwd`/DEF \
-continue=blastz -stop=blastz -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > blastz.log 2>&1
doBlastzChainNet.pl `pwd`/DEF \
-continue=cat -stop=net -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > net.log 2>&1
# real 163m28.438s
# to load, run it in debug, then check the load script
doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -continue=load -stop=load -bigClusterHub=swarm \
-debug -smallClusterHub=swarm -chainMinScore=1000 \
-chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > load.log 2>&1
# and create a synNet for multiz, run in debug, and examine script
# to make sure it works correctly
doBlastzChainNet.pl `pwd`/DEF \
-syntenicNet -continue=syntenicNet -stop=syntenicNet \
-debug -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > synNet.log 2>&1
# real 31m11.216s
############################################################################
# reset position to chr6 haplotype situation
hgsql -e \
'update dbDb set defaultPos="chr6:28343766-33555363" where name="hg19";' \
hgcentraltest
# reset to a smaller range (2009-04-24 - Brooke)
# this is the SOD1 gene, implicated in Lou Gehrig's disease.
hgsql -e \
'update dbDb set defaultPos="chr21:33,031,597-33,041,570" where name="hg19";' \
hgcentraltest
############################################################################
# Self Lastz run (DONE - 2009-03-19 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
# NOTE: the heredoc must be written to the DEF file (the redirect was
# missing here); the doBlastzChainNet.pl commands below read `pwd`/DEF
cat << '_EOF_' > DEF
# human vs human
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Human Hg19
SEQ2_DIR=/scratch/data/hg19/hg19.2bit
SEQ2_LEN=/scratch/data/hg19/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
# cluster difficulties, finished manually, then:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-continue=cat -workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=swarm > cat.log 2>&1 &
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-continue=load -debug -workhorse=hgwdev \
-stop=load -smallClusterHub=pk -bigClusterHub=swarm > load.debug.log 2>&1 &
# that indicates it would do:
hgLoadChain -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
# adding -normScore
hgLoadChain -normScore -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
############################################################################
# Chimp Lastz run (DONE - 2009-03-19 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
cd /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
# NOTE: the heredoc must be written to the DEF file (the redirect was
# missing here); the doBlastzChainNet.pl commands below read `pwd`/DEF
cat << '_EOF_' > DEF
# human vs chimp
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/data/panTro2/panTro2.2bit
SEQ2_LEN=/scratch/data/panTro2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
# real 173m22.880s
# cluster problems, continuing after lastz done:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=cat \
-stop=net -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> net.log 2>&1 &
# real 81m20.209s
# continuing with the load and adding syntenicNet
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=load \
-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> load.log 2>&1 &
# real 47m17.871s
cat fb.hg19.chainPanTro2Link.txt
# 2747983350 bases of 2897316137 (94.846%) in intersection
# running the swap - DONE - 2009-05-24
ssh swarm
mkdir /hive/data/genomes/panTro2/bed/blastz.hg19.swap
cd /hive/data/genomes/panTro2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-swap /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 723m41.377s
cat fb.panTro2.chainHg19Link.txt
# 2761343871 bases of 2909485072 (94.908%) in intersection
############################################################################
# Creating the pushQ entry (DONE - 2009-03-20 - Hiram)
mkdir /hive/data/genomes/hg19/pushQ
cd /hive/data/genomes/hg19/pushQ
makePushQSql.pl hg19 > hg19.pushQ.sql 2> make.err
# many complaints about the chain and net tables from the haplotype
# experiments, and this table:
# orfeomeGenes
# which is probably in genbank, and these usual ones:
# hg19 does not have seq
# hg19 does not have extFile
############################################################################
# Determine PAR region of X and Y (DONE - 2009-03-20 - Hiram)
# PAR is identified as the set of clones shared between the chrX and
# chrY AGP files.
mkdir /hive/data/genomes/hg19/bed/parRegion
cd /hive/data/genomes/hg19/bed/parRegion
awk '$5 != "N"' ../../X/chrX.agp | awk '{print $6}' | sort > chrX.cloneList
awk '$5 != "N"' ../../Y/chrY.agp | awk '{print $6}' | sort > chrY.cloneList
# clones present in both lists
comm -12 chrX.cloneList chrY.cloneList > chrXY.par.clone.list
cat chrXY.par.clone.list \
| while read C; do grep "${C}" ../../X/chrX.agp; done \
| sort -k1,1 -k2,2n >> chrX.par.region.agp
cat chrXY.par.clone.list \
| while read C; do grep "${C}" ../../Y/chrY.agp; done \
| sort -k1,1 -k2,2n >> chrY.par.region.agp
awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrY.par.region.agp \
> chrY.par.region.bed
awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrX.par.region.agp \
> chrX.par.region.bed
# use those bed files in custom tracks on hg19 to verify that they
# are two continuous regions with only gaps between these items
# these location extents are: (zero relative)
# chrX 60000 2722842
# chrX 154906585 155260560
# chrY 10000 2649520
# chrY 59034049 59363566
############################################################################
# Gorilla Lastz run (DONE - 2009-03-21,05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
cd /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
# NOTE: the heredoc must be written to the DEF file (the redirect was
# missing here); the doBlastzChainNet.pl command below reads `pwd`/DEF
cat << '_EOF_' > DEF
# human vs gorilla
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Gorilla gorGor1
SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> do.log 2>&1 &
cat fb.hg19.chainGorGor1Link.txt
# 1723432141 bases of 2897316137 (59.484%) in intersection
doRecipBest.pl -buildDir=`pwd` hg19 gorGor1 > rbest.log 2>&1
############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2009-04-02 - Hiram)
ssh pk
mkdir /hive/data/genomes/hg19/bed/linSpecRep
cd /hive/data/genomes/hg19/bed/linSpecRep
#	create individual .out files from the master record in ../repeatMasker
mkdir splitOut
cat << '_EOF_' > split.csh
#!/bin/csh -fe
set C = $1
head -3 ../repeatMasker/hg19.clean.out > splitOut/${C}.out
grep "${C} " ../repeatMasker/hg19.clean.out >> splitOut/${C}.out
'_EOF_'
# << happy emacs
cat << '_EOF_' > template
#LOOP
split.csh $(root1) {check out line+ splitOut/$(root1).out}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 ../../chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try ... check ... push ... etc...
# Completed: 93 of 93 jobs
# CPU time in finished jobs:        127s       2.12m     0.04h    0.00d  0.000 y
# IO & Wait Time:                 17154s     285.90m     4.76h    0.20d  0.001 y
# Average job time:                 186s       3.10m     0.05h    0.00d
# Longest finished job:             224s       3.73m     0.06h    0.00d
# Submission to last job:           280s       4.67m     0.08h    0.00d
# now, we can date and process each of those .out files
# this really should be a single creation of notInOthers
# These four different ones all end up to be the same anyhow
# the notInMouse becomes notInOthers below and the others are removed.
mkdir dateRepeats
cd dateRepeats
cat << '_EOF_' > mkLSR
#!/bin/csh -fe
rm -f $1.out_mus-musculus_rattus_canis-familiaris_bos-taurus
ln -s ../splitOut/$1.out .
/scratch/data/RepeatMasker/DateRepeats \
$1.out -query human -comp mouse -comp rat -comp dog -comp cow
rm $1.out
mkdir -p ../notInMouse ../notInRat ../notInDog ../notInCow
/cluster/bin/scripts/extractRepeats 1 $1.out_mus*-taurus \
> ../notInMouse/$1.out.spec
/cluster/bin/scripts/extractRepeats 2 $1.out_mus*-taurus \
> ../notInRat/$1.out.spec
/cluster/bin/scripts/extractRepeats 3 $1.out_mus*-taurus \
> ../notInDog/$1.out.spec
/cluster/bin/scripts/extractRepeats 4 $1.out_mus*-taurus \
> ../notInCow/$1.out.spec
'_EOF_'
# << happy emacs
chmod +x mkLSR
cat << '_EOF_' > template
#LOOP
./mkLSR $(path1) {check out line+ $(path1).out_mus-musculus_rattus_canis-familiaris_bos-taurus}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../chrom.list single template jobList
para try ... check ... push ... etc...
para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs:       2441s      40.69m     0.68h    0.03d  0.000 y
# IO & Wait Time:                   332s       5.53m     0.09h    0.00d  0.000 y
# Average job time:                  30s       0.50m     0.01h    0.00d
# Longest finished job:             125s       2.08m     0.03h    0.00d
# Submission to last job:           454s       7.57m     0.13h    0.01d
# NOTE(review): the following 'done' has no matching loop in this section;
# it appears to be a stray completion marker -- confirm before executing
done
# these four types of out.spec results all turn out to be identical
# To check identical
cd /hive/data/genomes/hg19/bed/linSpecRep
find . -name "*.out.spec" | \
while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
| sort -k1,1n | sort -t"/" -k3,3 | sed -e "s#./notIn.*/##" \
| sort | uniq -c | less
# You will see they are all a count of 4
# Set them up on scratch data and get to all the kluster nodes:
mkdir /hive/data/staging/data/hg19/lineageSpecificRepeats
cd notInMouse
rsync -a --progress ./ /hive/data/staging/data/hg19/lineageSpecificRepeats
cd ..
mv notInMouse notInOthers
# do not need to keep all of these
rm -fr notInRat notInDog notInCow
# We also need the nibs for blastz runs with lineage specific repeats
mkdir /hive/data/genomes/hg19/bed/nibs
cd /hive/data/genomes/hg19/bed/nibs
cut -f1 ../../chrom.sizes | while read C
do
twoBitToFa -seq=${C} ../../hg19.2bit stdout \
| faToNib -softMask stdin ${C}.nib
echo "${C} done"
done
mkdir /hive/data/staging/data/hg19/nib
rsync -a --progress ./ /hive/data/staging/data/hg19/nib
# Ask cluster-admin to sync /scratch/ filesystem to kluster nodes
#############################################################################
# create gc5Base download file (DONE - 2009-04-24 - Hiram)
cd /hive/data/genomes/hg19/bed/gc5Base
# 5-base-window GC percent as wiggle ascii, compressed for download
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19 \
/cluster/data/hg19/hg19.2bit | gzip -c > hg19.gc5Base.txt.gz
#############################################################################
# Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram)
mkdir /hive/data/genomes/hg19/bed/ctgPos
cd /hive/data/genomes/hg19/bed/ctgPos
# emit ctgPos rows (contig, size, chrom, chromStart, chromEnd) for the
# assembled chromosomes from their AGP files
cat << '_EOF_' > mkCtgPos.sh
AGP="/hive/data/genomes/hg19/download/assembled_chromosomes/AGP"
export AGP
for F in `(cd ${AGP}; ls chr*.agp | grep -v ".comp.agp")`
do
C=${F/.agp/}
grep "^CM" "${AGP}/${F}" | awk '$5 != "N"' | awk '
{
printf "%s\t%d\t%s\t%d\t%d\n", $6, $8-$7+1, "'${C}'", $2-1+$7-1, $2-1+$8
}
'
done
'_EOF_'
# << happy emacs
chmod +x mkCtgPos.sh
./mkCtgPos.sh > ctgPos.tab
# same for the unlocalized (chr*_gl*_random) scaffolds, merging adjacent
# AGP lines that belong to the same GL contig
cat << '_EOF_' > mkRanCtgPos.sh
AGP="/hive/data/genomes/hg19/download/unlocalized_scaffolds/AGP"
export AGP
for F in `(cd ${AGP}; ls chr*.agp)`
do
C=${F/.unlocalized.scaf.agp/}
c=${C/chr/}
export C c
grep "^GL" "${AGP}/${F}" | awk '$5 != "N"' | awk '
BEGIN {
ctgName=""
ctgStart=0
ctgEnd=0
chrom="'${c}'"
ctgNameLower=""
}
{
if (match(ctgName,$1)) {
ctgEnd = $3
} else {
if (length(ctgName) > 0) {
size=ctgEnd - ctgStart
printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower,
ctgStart, ctgEnd
}
ctgStart = $2 - 1
ctgEnd = $3
ctgName = $1
ctgNameLower = tolower($1)
sub(".1$","",ctgNameLower)
}
}
END {
size=ctgEnd - ctgStart
printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower,
ctgStart, ctgEnd
}
'
done
'_EOF_'
# << happy emacs
chmod +x mkRanCtgPos.sh
./mkRanCtgPos.sh >> ctgPos.tab
# fetch .sql definition from hg18
chmod 777 .
hgsqldump --all -c --tab=. hg18 ctgPos
chmod 775 .
hgsql hg19 < ctgPos.sql
hgsql -e 'load data local infile "ctgPos.tab" into table ctgPos;' hg19
#############################################################################
# CLONE ENDS - first step for BACEND/CytoBand tracks
# (DONE - 2009-04-28 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/cloneend/ncbi
cd /hive/data/genomes/hg19/bed/cloneend/ncbi
wget --timestamping \
'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_ends*.mfa.gz'
wget --timestamping \
'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_info*.txt.gz'
cd /hive/data/genomes/hg19/bed/cloneend
# seems like the *.mfa files were split just for convenience
# concatenate
for F in ncbi/*.mfa.gz
do
zcat "${F}"
echo "${F}" 1>&2
done | gzip > all.mfa.gz
# that 1>&2 echos to stderr so you can see the file name and not
# interfere with the pipe stdout output to gzip
# Convert the title line of the all.mfa file
# (strip the ">gi|NNN|gb|" / ">gi|NNN|emb|" prefix and the ".N|..." suffix,
# leaving the bare accession as the sequence name)
zcat all.mfa.gz \
| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#\.[0-9]|.*##" \
| gzip > cloneEnds.fa.gz
# NOTE(review): the next line looks like a leftover alternative recipe --
# it reads an uncompressed all.mfa (only all.mfa.gz exists at this point)
# and would overwrite the cloneEnds.fa.gz just created; presumably only one
# of the two commands was actually used -- confirm
zcat all.mfa | ./convert.pl | gzip > cloneEnds.fa.gz
# make sure nothing got broken:
faSize all.mfa.gz
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
faSize cloneEnds.fa.gz
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
# identical numbers
# you can also carefully check the names:
zcat all.mfa.gz | grep "^>" | awk -F'|' '{print $4}' \
| sed -e "s/\.[0-9]$//" | sort > mfa.names
# should be the same as:
zcat cloneEnds.fa.gz | grep "^>" | sed -e "s/>//" | sort > clone.names
# concatenate the text files, too
bash
for F in ncbi/*.txt.gz
do
zcat "${F}"
echo "${F}" 1>&2
done | gzip > all.txt.gz
# generate cloneEndPairs.txt and cloneEndSingles.txt
zcat all.txt.gz >all.txt
$HOME/kent/src/hg/utils/cloneEndParse.pl all.txt
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 302264 pairs and 203094 singles
# examined all the clone names and all the bac end names in these two
# files and compared with business from all.txt to make sure we properly
# classified all of them correctly. We had 833,173 clone sequences,
# and 501,135 bac end names
# faSplit does not function correctly if given a .gz source file
# AND, we need the unzipped file for sequence loading below
gunzip cloneEnds.fa.gz
# split
mkdir splitdir
cd splitdir
faSplit sequence ../cloneEnds.fa 100 cloneEnds
# Check to ensure no breakage:
cat *.fa | faSize stdin
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
# same numbers as before
# load sequences
ssh hgwdev
mkdir /gbdb/hg19/cloneend
cd /gbdb/hg19/cloneend
ln -s /hive/data/genomes/hg19/bed/cloneend/cloneEnds.fa .
cd /tmp
hgLoadSeq hg19 /gbdb/hg19/cloneend/cloneEnds.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hg19/cloneend/cloneEnds.fa
# 833173 sequences
# Updating seq table
# Advisory lock has been released
# All done
##############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/bacends/run.blat
cd /hive/data/genomes/hg19/bed/bacends/run.blat
# going to run separate runs for the golden path sequence vs. the
# randoms, haplotypes, chrUn and chrM
partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://; s/:/./" > hg19.list
ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
> bacEnds.list
ssh swarm
cd /hive/data/genomes/hg19/bed/bacends/run.blat
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(path2) {check out line+ psl/$(root1)/$(file1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
# runOne.csh: blat one 2bit partition (chr.start-end) against one cloneEnds
# split file, then lift the partition-relative psl back to chromosome
# coordinates via a one-line lift file written in /scratch/tmp
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set target = $1
set query = $2
set result = $3
set partSpec = `echo $target | sed -e "s/\./:/"`
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set dir = $result:h
set chr = `echo $target | sed -e "s/\..*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
# echo $tmpFile
# echo "chr: $chr $start $end -> size: $chrSize, range: $range"
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p $dir
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
rm -f $result
liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
gensub2 hg19.list bacEnds.list template jobList
para create jobList
# 62034 jobs in batch
# these jobs run quickly, limit them to 250 at a time
para try, check, -maxJob=250 push, etc ...
# Completed: 62034 of 62034 jobs
# CPU time in finished jobs:     506023s    8433.72m   140.56h    5.86d  0.016 y
# IO & Wait Time:                175853s    2930.88m    48.85h    2.04d  0.006 y
# Average job time:                  11s       0.18m     0.00h    0.00d
# Longest finished job:             752s      12.53m     0.21h    0.01d
# Submission to last job:          3533s      58.88m     0.98h    0.04d
# combine the alignments
time pslSort dirs raw.psl temp psl/chr*
# 62034 files in 24 dirs
# Got 62034 files 249 files per mid file
# real    81m2.820s
# -rw-rw-r--   1 13410334441 Apr 29 12:00 raw.psl
# cleanup
rmdir temp
# keep near-best alignments covering most of each read
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
# real    5m55.990s
# Processed 106254032 alignments
# -rw-rw-r--  1 372734361 Apr 29 12:56 bacEnds.psl
wc -l bacEnds.psl
# 2852977 bacEnds.psl
# pair up end alignments into clone placements; classifies into
# pairs / slop / short / long / mismatch / orphan files
time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
all_bacends bacEnds
# Reading pair file
# Reading psl file
# Creating Pairs
# Writing to files
# real    0m18.851s
# this creates the files:
# -rw-rw-r--  1  21178741 Apr 29 13:00 bacEnds.pairs
# -rw-rw-r--  1   5250873 Apr 29 13:00 bacEnds.orphan
# -rw-rw-r--  1    738045 Apr 29 13:00 bacEnds.short
# -rw-rw-r--  1    463560 Apr 29 13:00 bacEnds.slop
# -rw-rw-r--  1    146369 Apr 29 13:00 bacEnds.mismatch
# -rw-rw-r--  1      3528 Apr 29 13:00 bacEnds.long
# filter and sort
awk '$5 >= 300' bacEnds.pairs | sort -k1,1 -k2,2n > bacEndPairs.bed
awk '$5 >= 300' bacEnds.slop bacEnds.short bacEnds.long \
bacEnds.mismatch bacEnds.orphan | sort -k1,1 -k2,2n > bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
> bacEndPairs.load.psl
############################################################################
# BACEND Randoms SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/bacends/run.randoms
cd /hive/data/genomes/hg19/bed/bacends/run.randoms
# this separate run for the randoms, haplotypes, chrUn and chrM
partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
| egrep "random|_hap|chrUn" \
| sed -e "s/.*2bit://; s/:/./" > random.list
cat tParts/*.lst | sed -e "s/.*2bit://; s/:/./" >> random.list
ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
> bacEnds.list
ssh swarm
cd /hive/data/genomes/hg19/bed/bacends/run.randoms
gensub2 random.list bacEnds.list ../run.blat/template jobList
# very similar runOne.csh script as above, but it doesn't need to do
# the lift
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set target = $1
set query = $2
set result = $3
set partSpec = `echo $target | sed -e "s/\./:/"`
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set dir = $result:h
set chr = `echo $target | sed -e "s/\..*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`
# echo $tmpFile
# echo "chr: $chr $start $end -> size: $chrSize, range: $range"
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p $dir
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
rm -f $result
mv $tmpFile.psl $result
echo rm -f $tmpFile.lift
'_EOF_'
# << happy emacs
# NOTE(review): the script's last line echoes the rm command rather than
# running it, so the .lift files are left behind in /scratch/tmp --
# presumably a debugging leftover, harmless since the lift is unused here
# these jobs run fast, do not let too many of them run
para -maxJob=100 try...check...push
para time
# Completed: 6762 of 6762 jobs
# CPU time in finished jobs:      20357s     339.29m     5.65h    0.24d  0.001 y
# IO & Wait Time:                 17839s     297.31m     4.96h    0.21d  0.001 y
# Average job time:                   6s       0.09m     0.00h    0.00d
# Longest finished job:             261s       4.35m     0.07h    0.00d
# Submission to last job:           508s       8.47m     0.14h    0.01d
time pslSort dirs raw.psl temp psl/chr*
# 6762 files in 69 dirs
# Got 6762 files 82 files per mid file
# real    6m37.177s
# 37044 files in 98 dirs
# Got 37044 files 192 files per mid file
# real    32m24.804s
# -rw-rw-r--    1 6487445210 Feb  2 21:08 raw.psl
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl randomEnds.psl randomReps.psr > pslReps.out 2>&1 &
# real    0m5.761s
# Processed 1254273 alignments
# cleanup
rmdir temp
wc -l randomEnds.psl
# 367567 randomEnds.psl
# same pairing parameters as the main run above
time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose randomEnds.psl \
/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
all_bacends bacEnds
# Reading pair file
# Reading psl file
# Creating Pairs
# Writing to files
# real    0m11.221s
# this creates the files:
# -rw-rw-r--  1      0 Apr 29 14:53 bacEnds.slop
# -rw-rw-r--  1      0 Apr 29 14:53 bacEnds.short
# -rw-rw-r--  1      0 Apr 29 14:53 bacEnds.mismatch
# -rw-rw-r--  1      0 Apr 29 14:53 bacEnds.long
# -rw-rw-r--  1 141836 Apr 29 14:53 bacEnds.pairs
# -rw-rw-r--  1 649907 Apr 29 14:53 bacEnds.orphan
##############################################################################
# BacEnds track - both results loaded together (DONE - 2009-04-29 - Hiram)
ssh hgwdev
cd /hive/data/genomes/hg19/bed/bacends
# filter and sort
# keep placements scoring >= 300 from both runs as the good pairs track
awk '$5 >= 300' run.blat/bacEnds.pairs run.randoms/bacEnds.pairs \
| sort -k1,1 -k2,2n > bacEndPairs.bed
awk '$5 >= 300' run.blat/bacEnds.slop run.blat/bacEnds.short \
run.blat/bacEnds.long run.blat/bacEnds.mismatch \
run.blat/bacEnds.orphan run.randoms/bacEnds.slop \
run.randoms/bacEnds.short run.randoms/bacEnds.long \
run.randoms/bacEnds.mismatch run.randoms/bacEnds.orphan \
| sort -k1,1 -k2,2n > bacEndPairsBad.bed
# merge the psl results: keep the 5-line psl header from one file,
# then the sorted union of both alignment sets
head -5 run.blat/bacEnds.psl > bacEnds.psl
headRest 5 run.blat/bacEnds.psl > t.psl
headRest 5 run.randoms/randomEnds.psl >> t.psl
sort -k14,14 -k16,16n t.psl >> bacEnds.psl
extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
> bacEnds.load.psl
# load them into the database
ssh hgwdev
cd /hive/data/genomes/hg19/bed/bacends
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $4}' bacEndPairs.bed | grep " "
awk '{print $5}' bacEndPairs.bed | sort | uniq -c
# result should be the scores, no extraneous strings:
# 156984 1000
#    195 300
#    316 375
#    297 500
#   1476 750
# edit the file and fix it if it has a bad name.
hgLoadBed -notItemRgb hg19 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 208922 elements of size 11
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -notItemRgb hg19 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 79004 elements of size 11
#hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg19 -table=all_bacends bacEnds.load.psl
# one complaint, there appears to be a bogus insert count in one
# of the blat results:
# < 585   797     67      0       3       2       -63     9       79188   +       AQ743980        852     42      846     chr19_gl000208_random   92689   4045    84100   11      14,124,84,496,53,6,20,28,28,10,4,       42,56,180,200,696,750,756,776,804,832,842,      4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
# Became:
# > 585   797     67      0       3       2       0       9       79188   +       AQ743980        852     42      846     chr19_gl000208_random   92689   4045    84100   11      14,124,84,496,53,6,20,28,28,10,4,       42,56,180,200,696,750,756,776,804,832,842,      4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
# sanity-check row counts and coverage against the previous two assemblies
hgsql -N -e "select count(*) from all_bacends;" hg19
#	2289275
hgsql -N -e "select count(*) from all_bacends;" hg18
#	1727387
hgsql -N -e "select count(*) from all_bacends;" hg17
#	1729146
nice featureBits hg19 all_bacends
# 230917362 bases of 2897316137 (7.970%) in intersection
nice featureBits hg18 all_bacends
# 227770876 bases of 2881515245 (7.905%) in intersectio
nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection
nice featureBits hg19 bacEndPairs
# 236889607 bases of 2897316137 (8.176%) in intersection
nice featureBits hg18 bacEndPairs
# 162690030 bases of 2881515245 (5.646%) in intersection
nice featureBits hg17 bacEndPairs
# 162099487 bases of 2866216770 (5.656%) in intersection
nice featureBits hg19 bacEndPairsBad
# 38344094 bases of 2897316137 (1.323%) in intersection
nice featureBits hg18 bacEndPairsBad
# 37326990 bases of 2881515245 (1.295%) in intersection
nice featureBits hg17 bacEndPairsBad
# 37437558 bases of 2866216770 (1.306%) in intersection
############################################################################
# STS MARKERS (DONE - 2009-04-30 - 2009-05-06 - Hiram)
mkdir /hive/data/outside/ncbi/sts.2009-04
cd /hive/data/outside/ncbi
ln -s sts.2009-04 sts.11
cd /hive/data/outside/ncbi/sts.2009-04
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# these items are copied in from the previous builds
cp -p /cluster/data/ncbi/sts.10/all.STS.fa ./all.STS.fa.prev
cp -p /cluster/data/ncbi/sts.10/stsInfo2.bed ./stsInfo2.bed.prev
# edit stsInfo2.bed.prev for a
# manual fixup of error that is in the hg18 bed file, replace
# the line for AFM067XA9 to fix bogus long list of aliases to be:
# 22788^IAFM067XA9^I1^IZ66598^I1^IGDB:1221611,^I5^I067XA9,GDB:1221611,W202,Z66598,SWSS2303^I69047^I0^I^ITCTTGGGGTTTAATTGCTTT^ICTTTGCCACAATCTTACACA^I149^IHomo sapiens^I1^I2^I6453,6454,^I0^I^I^I^I0^I0^I^I^I0^I0^IAFM067XA9^Ichr7^I145^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0
# as taken directly out of the hg18.stsInfo2 table which was fixed
# by Bob and Archana
# Convert the title line of the dbSTS.fa file
# Verify that column 3 only contains gb emb dbj
grep "^>" dbSTS.fa | awk -F'|' '{print $3}' | sort | uniq -c
# 39124 dbj
# 57375 emb
# 1212541 gb
# if that is true, this sed will work:
cat dbSTS.fa \
| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#^>gi.[0-9]*.dbj.#>#; s#\.[0-9]|.*##" \
> UniSTS.convert.fa
# get accessions
grep ">" UniSTS.convert.fa | sed -e "s/^>//" | sort > UniSTS.acc
# head and tail that to ensure names are reasonable, odd names would
# show up at the beginning or end
wc -l UniSTS.acc
# 1309040 UniSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
UniSTS.sts UniSTS.aliases UniSTS.convert.fa new
# verify the number of aliases is reasonable:
awk '{print $3}' new.alias | sort | uniq -c | sort -rn | less
# 50 D7S831
# 34 CHLC.GATA2B06.465
# 24 CHLC.GATA11E11
# 23 AFM276ZF5
# 23 AFM273YH9
# 22 SHGC-133043
# ... etc ...
# verify there are no unusually long or short lines:
awk '{printf "%d\n", length($0)}' new.info | sort -n | head -3
# 143
# 144
# 144
awk '{printf "%d\n", length($0)}' new.info | sort -n | tail -3
# 552
# 553
# 644
# check for null in the new files:
grep -i null new.*
# if the new files look good, they can become the set to use:
mv new.info stsInfo2.bed
mv new.primers all.primers
mv new.alias stsAlias.bed
mv new.fa all.STS.fa
# get list of all STS id's in the fasta file
sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id
wc -l all.STS.id
# 100520 total sequences
# in hg18 this was: 93698 total sequences
$HOME/kent/src/hg/stsMarkers/convertPrimerToFA all.primers > all.primers.fa
# check that fasta file for unusual length sequences:
faSize all.primers.fa
# 97815329 bases (83677626 N's 14137703 real 14137703 upper 0 lower) in 317592 sequences in 1 files
# Total size: mean 308.0 sd 279.3 min 40 (dbSTS_144) max 30000 (dbSTS_156892) median 244
# Copy stsInfo2.bed and stsAlias.bed to data directory becuase
# these will be loaded into the database later
mkdir -p /hive/data/genomes/hg19/bed/sts
cp -p stsInfo2.bed /hive/data/genomes/hg19/bed/sts/
cp -p stsAlias.bed /hive/data/genomes/hg19/bed/sts/
# Create sts sequence alignments
mkdir /hive/data/genomes/hg19/bed/sts/split
faSplit sequence all.STS.fa 100 /hive/data/genomes/hg19/bed/sts/split/sts
ssh swarm
mkdir /hive/data/genomes/hg19/bed/sts/run
cd /hive/data/genomes/hg19/bed/sts/run
# going to run separate runs for the golden path sequence vs. the
# randoms, haplotypes, chrUn and chrM
# 40,000,000 chunck sizes, 20,000 overlap
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
ls -1S ../split > sts.list
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec \
../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
# rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
gensub2 hg19.list sts.list template jobList
# these jobs run quickly, allow only 100 at a time
para -maxJob=100 create jobList
# 8367 jobs in batch
para try ... check ... push ... etc
# Completed: 8366 of 8366 jobs
# CPU time in finished jobs: 89744s 1495.74m 24.93h 1.04d 0.003 y
# IO & Wait Time: 25467s 424.44m 7.07h 0.29d 0.001 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest finished job: 53s 0.88m 0.01h 0.00d
# Submission to last job: 1592s 26.53m 0.44h 0.02d
# and, run the randoms as a separate run:
mkdir /hive/data/genomes/hg19/bed/sts/run.randoms
cd /hive/data/genomes/hg19/bed/sts/run.randoms
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep "tParts|random|_hap|chrUn"
cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
ls -1S ../split > sts.list
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
/scratch/data/hg19/hg19.2bit:$partSpec \
../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
/bin/rm -f $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
gensub2 hg19.list sts.list template jobList
# these jobs run quickly, allow only 100 at a time
para -maxJob=100 create jobList
# 6486 jobs in batch
para try ... check ... push ... etc
# Completed: 6486 of 6486 jobs
# CPU time in finished jobs: 2206s 36.77m 0.61h 0.03d 0.000 y
# IO & Wait Time: 16505s 275.08m 4.58h 0.19d 0.001 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest finished job: 21s 0.35m 0.01h 0.00d
# Submission to last job: 601s 10.02m 0.17h 0.01d
# Compile sts sequence results
ssh hgwdev
cd /hive/data/genomes/hg19/bed/sts/run
time pslSort dirs raw.psl temp psl/chr*
# 8366 files in 89 dirs
# Got 8366 files 91 files per mid file
# real    8m50.714s
# -rw-rw-r--   1 810438277 May  1 11:45 raw.psl
cd /hive/data/genomes/hg19/bed/sts/run.randoms
time pslSort dirs raw.psl temp psl/chr*
# 6486 files in 69 dirs
# Got 6486 files 81 files per mid file
# real    1m42.120s
# -rw-rw-r--   1 18378188 May  1 11:52 raw.psl
rmdir temp
cd /hive/data/genomes/hg19/bed/sts
# merge both raw results, strip psl headers/blank lines, keep near-best
cat run*/raw.psl | egrep -v "^$|^psLayout|^match|^ |^-" \
| pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \
stsMarkers.psl /dev/null
# Processed 7412166 alignments
# -rw-rw-r--  1 12031760 May  1 11:57 stsMarkers.psl
$HOME/kent/src/hg/stsMarkers/extractPslInfo -h stsMarkers.psl
# creates stsMarkers.psl.initial
# -rw-rw-r--  1  4485053 May  1 12:06 stsMarkers.psl.initial
wc -l stsMarkers.psl.initial
#	101338  stsMarkers.psl.initial
#	this command needs a chrom_names file to work correctly with this
#	new style of layout for hg19:
cd /hive/data/genomes/hg19
cut -f1 chrom.sizes | sed -e "s/chr//" > chrom_names
cd /hive/data/genomes/hg19/bed/sts
$HOME/kent/src/hg/stsMarkers/findAccession.pl -agp stsMarkers.psl.initial \
/cluster/data/hg19
wc -l stsMarkers.psl.initial.acc
#	101338  stsMarkers.psl.initial.acc
sort -k4,4n stsMarkers.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
#	96472 stsMarkers.found
#	out of 100520 total sequences from:
wc -l /hive/data/outside/ncbi/sts.2009-04/all.STS.id
#	There are lots of duplicates:
wc -l stsMarkers.final
#	101338 stsMarkers.final
#	And a lot of them are just completely haywire:
awk '$3-$2 < 1001' stsMarkers.final | wc -l
#	98382
#	filter out markers that are too long
awk '$3-$2 < 1001' stsMarkers.final > stsMarkers.1K.size.filtered
# alignment of primers
ssh swarm
cd /hive/data/outside/ncbi/sts.2009-04
# convert all.primers to isPcr input (id, left primer, right primer),
# dropping entries with non-ACGT characters or primers shorter than 11
awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
all.primers > all.primers.ispcr
mkdir primerAlign
cd primerAlign
mkdir split
cd split
split -l 5000 ../../all.primers.ispcr primer_
ls > ../primer.list
cd ..
# we need a 10.ooc file for this business
time blat /scratch/data/hg19/hg19.2bit \
/dev/null /dev/null -tileSize=10 -makeOoc=10.ooc -repMatch=1024
# Wrote 146902 overused 10-mers to 10.ooc
# real    19m16.758s
# separate runs for whole genome vs. randoms
mkdir run
cd run
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep -v "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
# runOne.csh: isPcr one primer split against one 2bit partition, lift
# back to chromosome coordinates
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
-ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
# << happy emacs
chmod +x runOne.csh
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list ../primer.list template jobList
para create jobList
# 5696 jobs in batch
para try ... check ... push ... etc
# Completed: 5696 of 5696 jobs
# CPU time in finished jobs:     203899s    3398.32m    56.64h    2.36d  0.006 y
# IO & Wait Time:                 22049s     367.48m     6.12h    0.26d  0.001 y
# Average job time:                  40s       0.66m     0.01h    0.00d
# Longest finished job:            5314s      88.57m     1.48h    0.06d
# Submission to last job:          5418s      90.30m     1.50h    0.06d
# Estimated complete:                 0s       0.00m     0.00h    0.00d
# sort and filter the results
cd psl
pslSort dirs raw.psl temp chr*
# 5696 files in 89 dirs
# Got 5696 files 75 files per mid file
# -rw-rw-r--  1 456802973 May  4 13:32 raw.psl
cd ..
mkdir filter
pslQuickFilter -minMatch=26 -maxMismatch=5 \
-maxTinsert=5000 -verbose psl/ filter/
# -rw-rw-r-- 1 50302564 May  4 13:35 raw.psl
# And, for the randoms
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| egrep "tParts|random|_hap|chrUn" \
| sed -e "s/.*2bit://;" > hg19.list
# NOTE(review): hg19.list is overwritten twice below; the final version
# keeps the raw tParts contents without the 2bit-prefix stripping --
# presumably the last command records the corrected recipe -- confirm
cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
cat tParts/* > hg19.list
# randoms version: partitions are whole sequences, so no lift needed
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
-ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
'_EOF_'
# << happy emacs
chmod +x runOne.csh
# can not use line+ check here, many of them are empty
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list ../primer.list template jobList
# they run quickly, limit to 100
para -maxJob=100 create jobList
para try ... check ... push ... etc
# Completed: 4416 of 4416 jobs
# CPU time in finished jobs:       1746s      29.09m     0.48h    0.02d  0.000 y
# IO & Wait Time:                 11407s     190.12m     3.17h    0.13d  0.000 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:               8s       0.13m     0.00h    0.00d
# Submission to last job:           147s       2.45m     0.04h    0.00d
# sort and filter the results
cd psl
pslSort dirs raw.psl temp chr*
# 4416 files in 69 dirs
# Got 4416 files 66 files per mid file
rmdir temp
# -rw-rw-r--  1 9066053 May  4 13:31 raw.psl
# putting the two runs together
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
ln -s ../run/filter/raw.psl run.psl
ln -s ../runRandoms/filter/raw.psl runRandoms.psl
# -rw-rw-r-- 1 50302564 May  4 13:35 run.psl
# -rw-rw-r-- 1   825973 May  4 13:35 runRandoms.psl
cd ..
pslSort dirs primers.psl temp psl
# 2 files in 1 dirs
# Got 2 files 1 files per mid file
# -rw-rw-r--  1 51128110 May  4 13:39 primers.psl
wc -l primers.psl
#	448107 primers.psl
rmdir temp
pslFilterPrimers primers.psl ../all.primers primers.filter.psl
# creates primers.filter.unlifted.psl.notfound.primers
wc -l primers*
#  237962 primers.filter.psl
#   97191 primers.filter.psl.notfound.primers
# see if ePCR can find some of these notfound
ssh swarm
mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
mkdir split
cd split
split -l 5000 ../../primers.filter.psl.notfound.primers primers_
cd ..
ls -1S split > primers.lst
partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
| grep -v tParts | sed -e "s/.*2bit://;" > hg19.list
cat tParts/* | sed -e "s/.*2bit://;" >> hg19.list
cat > runOne.csh << '_EOF_'
#!/bin/csh -fe
set partSpec = $1
set primer = split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p epcr/$partSpec
/bin/rm -f $tmpFile.psl
twoBitToFa /scratch/data/hg19/hg19.2bit:$partSpec $tmpFile.fa
/cluster/bin/scripts/runEpcr64 $primer $tmpFile.fa $tmpFile.epcr
/bin/rm -f $result
/bin/mv $tmpFile.epcr $result
rm -f $tmpFile.fa $tmpFile.lift $tmpFile.psl $tmpFile.*
'_EOF_'
# << happy emacs
chmod +x runOne.csh
cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line epcr/$(file1)/$(root2).epcr}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 hg19.list primers.lst template jobList
para create jobList
# 3160 jobs
para try ... check ... push ... etc ...
# Completed: 3160 of 3160 jobs
# CPU time in finished jobs: 86253s 1437.54m 23.96h 1.00d 0.003 y
# IO & Wait Time: 11196s 186.61m 3.11h 0.13d 0.000 y
# Average job time: 31s 0.51m 0.01h 0.00d
# Longest finished job: 89s 1.48m 0.02h 0.00d
# Submission to last job: 237s 3.95m 0.07h 0.00d
find ./epcr -type f | xargs cat > all.epcr
wc -l all.epcr
# 797286 all.epcr
# convert the coordinates from the partitionSequence.pl to a lift file
awk '{print $1}' all.epcr | sort -u > hg19.partSpec.txt
$HOME/kent/src/hg/stsMarkers/liftFromSpec.pl hg19 hg19.partSpec.txt \
> all.epcr.lift
cat all.epcr | sed -e "s/\.\./ /; s/ */\t/g" \
| liftUp -type=.bed stdout all.epcr.lift error stdin \
| awk '
{
printf "%s %d..%d %d %d\n", $1, $2, $3, $4, $5
}
' > all.epcr.lifted
# first attempt (pslFilterPrimers from PATH, superseded by the
# explicit-path invocation below):
# pslFilterPrimers -epcr=all.epcr.lifted -verbose=1 ../primers.psl
/cluster/home/hiram/bin/x86_64/pslFilterPrimers -epcr=all.epcr.lifted \
-verbose=1 ../primers.psl ../../all.primers epcr.primers.psl
# this took a long time, many hours
# -rw-rw-r-- 1 2785254 May 5 17:28 epcr.not.found
# -rw-rw-r-- 1 27343510 May 5 17:28 epcr.primers.psl
# -rw-rw-r-- 1 1616885 May 5 17:28 epcr.primers.psl.notfound.primers
# first attempt (./epcrToHgPsl.pl, superseded by the epcrToPsl
# invocation below):
# time ./epcrToHgPsl.pl epcr.not.found ../../all.primers
time $HOME/kent/src/hg/stsMarkers/epcrToPsl epcr.not.found \
../../all.primers /hive/data/genomes/hg19
# real 69m38.444s
# -rw-rw-r-- 1 0 May 6 14:18 epcr.not.found.nomatch
# -rw-rw-r-- 1 8369138 May 6 15:26 epcr.not.found.psl
# combining everything together now
cd /hive/data/outside/ncbi/sts.2009-04/primerAlign
sort -u primers.filter.psl epcr/epcr.primers.psl epcr/epcr.not.found.psl \
| sort -k15,15 -k17,17n > primers.final.psl
wc -l primers.final.psl
# 310705 primers.final.psl
time $HOME/kent/src/hg/stsMarkers/fixPrimersQueryGaps.pl \
../all.primers primers.final.psl > primers.final.fix.psl
# real 0m19.580s
wc -l primers.final.fix.psl
# 310705 primers.final.fix.psl
# Extract relevant info, make alignments unique, and create final file to
# be merged with full sequence alignments
$HOME/kent/src/hg/stsMarkers/extractPslInfo -h primers.final.fix.psl
# real 0m15.303s
# -rw-rw-r-- 1 15660447 May 6 15:44 primers.final.fix.psl.initial
wc -l primers.final.fix.psl.initial
# 308210 primers.final.fix.psl.initial
$HOME/kent/src/hg/stsMarkers/findAccession.pl -agp \
primers.final.fix.psl.initial /hive/data/genomes/hg19
wc -l primers.final.fix.psl.initial.acc
# 308210 primers.final.fix.psl.initial.acc
$HOME/kent/src/hg/stsMarkers/getStsId ../stsInfo2.bed \
primers.final.fix.psl.initial.acc | sort -k 4n > primers.final
wc -l primers.final
# 308210 primers.final
# There doesn't appear to be any use for this primers.ids list
# except for curiosity. Check the head and tail of this list to
# verify no garbage is in here. There should just be numbers.
awk '{print $4}' primers.final | sort -n | uniq > primers.ids
wc -l primers.ids
# 290961 primers.ids
# Merge primer and sequence files to create final bed file
# Merge (combineSeqPrimerPos) takes about an hour to run
cd /hive/data/genomes/hg19/bed/sts
time $HOME/kent/src/hg/stsMarkers/combineSeqPrimerPos stsMarkers.final \
/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final
# real 0m12.310s
# -rw-rw-r-- 1 15222346 May 6 15:55 stsMarkers_pos.rdb
wc -l stsMarkers_pos.rdb
# 315308 stsMarkers_pos.rdb
time /cluster/bin/scripts/createSTSbed \
/hive/data/outside/ncbi/sts.2009-04/stsInfo2.bed \
stsMarkers_pos.rdb > stsMap.bed
# real 0m31.886s
# -rw-rw-r-- 1 38244880 May 6 16:25 stsMap.bed
wc -l stsMap.bed
# 305914 stsMap.bed
# Set up sequence files
ssh hgwdev
mkdir /gbdb/hg19/sts.11/
ln -s /hive/data/outside/ncbi/sts.11/all.STS.fa \
/gbdb/hg19/sts.11/all.STS.fa
ln -s /hive/data/outside/ncbi/sts.11/all.primers.fa \
/gbdb/hg19/sts.11/all.primers.fa
# Load all files
cd /hive/data/genomes/hg19/bed/sts
hgLoadSeq hg19 /gbdb/hg19/sts.11/all.STS.fa /gbdb/hg19/sts.11/all.primers.fa
# Creating seq.tab file
# Adding /gbdb/hg19/sts.11/all.STS.fa
# 100520 sequences
# Adding /gbdb/hg19/sts.11/all.primers.fa
# 317592 sequences
# Updating seq table
# Advisory lock has been released
# All done
hgsql hg19 < $HOME/kent/src/hg/lib/stsInfo2.sql
hgsql hg19 < $HOME/kent/src/hg/lib/stsAlias.sql
# these files already exist here from previous operations
# cp -p /hive/data/outside/ncbi/sts.11/{stsInfo2.bed,stsAlias.bed} .
hgsql hg19 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgsql hg19 -e 'load data local infile "stsAlias.bed" into table stsAlias'
# a couple minutes for each load above
# filter the stsMap.bed to eliminate items longer than 5,000 bases,
# takes out about 850:
awk '$3-$2 < 5001' stsMap.bed | sort -k1,1 -k2,2n \
> stsMap.filtered.5000.bed
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/stsMap.sql hg19 stsMap \
stsMap.filtered.5000.bed
# Loaded 305064 elements of size 28
ln -s \
/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final.fix.psl \
primers.psl
hgLoadPsl -nobin -table=all_sts_primer hg19 primers.psl
hgLoadPsl -nobin -table=all_sts_seq hg19 stsMarkers.psl
##############################################################################
# FISH CLONES (WORKING - 2009-04-29 - Hiram)
# The STS Marker and BAC End Pairs tracks must be completed prior to
# creating this track.
mkdir /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
cd /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
# Download information from NCBI
# point browser at:
# http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Sequence tag:" to "placed on contig"
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as
# /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
chmod 664 /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
# Unfortunately the format of this hbrc file has changed since
# last time. The columns have been rearranged, and one important
# column is missing, the contig information. So, let's see if we
# can recover the original format by putting this together with
# some other things we have here.
$HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
/hive/data/genomes/hg19/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
2> dbg
# XXX - need to get this seq_clone.pmd from NCBI, maybe Paul Kitts
# the seq_clone.pmd file was obtained via email from Wonhee Jang
# jang at ncbi.nlm.nih.gov - I have asked for clarification where
# such a file can be fetched without resorting to email.
# Get current clone/accession information
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/clone/reports/clac.out
# also available at:
# http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial Fish Clones bed file
ssh kkstore02
mkdir /hive/data/genomes/hg19/bed/fishClones
cd /hive/data/genomes/hg19/bed/fishClones
# Copy previous sts info from fhcrc
cp -p /hive/data/genomes/hg18/bed/fishClones/fhcrc.sts .
# This fhcrc.sts listing doesn't change. It is merely a listing
# of aliases that remain in effect.
# Create cl_acc_gi_len file form cloneend information:
grep -v "^#" /hive/data/genomes/hg19/bed/cloneend/all.txt \
| awk '{gsub(".[0-9]*$", "", $2);
printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len
hgsql -N \
-e "select chrom,chromStart,chromEnd,contig from ctgPos;" hg19 \
| sort -k1,1 -k2,2n > ctgPos.bed
hgsql -N \
-e "select chrom,chromStart,chromEnd,frag,0,strand from gold;" hg19 \
| sort -k1,1 -k2,2n > gold.bed
hgsql -N \
-e "select tName,tStart,tEnd,qName,0,strand from all_bacends;" hg19 \
| sort -k1,1 -k2,2n > all_bacends.bed
hgsql -N \
-e "select chrom,chromStart,chromEnd,name,score,strand from bacEndPairs;" hg19 \
| sort -k1,1 -k2,2n > bacEndPairs.bed
ssh hgwdev
# have to be on hgwdev for this since it is going to read from the
# database. Had to work on this program to get it past what is
# evidently a bad entry in hbrc.fixed where columns of information
# are missing for one clone in particular
time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg19 \
/hive/data/genomes/hg19/bed/ncbiCytoBand/contig/fixed.hbrc.txt \
/hive/data/outside/ncbi/fishClones/fishClones.2009-04/clac.out \
./cl_acc_gi_len \
/hive/data/genomes/hg19/bed/bacends/bacEnds.load.psl \
fishClones
# real 2m4.708s
# Reading Fish Clones file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/hbrc.fixed
# reading fishInfo file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# Reading Clone/Acc (clac.out) file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/clac.out
# Reading BAC Ends file ./cl_acc_gi_len
# Reading BAC Ends psl file /hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl
# Reading additional STS Marker links fhcrc.sts
# Determining good positions
# findClonePos: determining positions of fish clones
# Writing output file
# ERROR: at line # 170, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# ERROR: at line # 171, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# Load the track
ssh hgwdev
cd /hive/data/genomes/hg19/bed/fishClones
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
hg19 fishClones fishClones.bed
# Loaded 9461 elements of size 16
##############################################################################
# CytoBands from Wonhee Jang at NCBI (DONE - 2009-06-10 - Hiram)
mkdir /hive/data/genomes/hg19/bed/ncbiCytoBand
cd /hive/data/genomes/hg19/bed/ncbiCytoBand
# received the following files via email:
ls -ogrt
# -rw-rw-r-- 1 187930 Jun 10 13:53 ideogram
# -rw-rw-r-- 1 672327 Jun 8 09:55 fish.markers.bed
# created cytobands.bed from the ideogram file with:
cat << '_EOF_' > ideoToCytoBand.pl
#!/usr/bin/env perl
# Convert the NCBI "ideogram" file (received via email) into UCSC
# cytoBand lines: chrom, chromStart, chromEnd, name (arm+location), stain.
use strict;
use warnings;

open (FH,"<ideogram") or die "can not read ideogram";
while (my $line = <FH>) {
    next if $line =~ m/^#/;
    chomp $line;
    my ($chr, $arm, $location, $a, $b, $start, $end, $stain) =
        split('\s+',$line);
    # skip sub-band rows whose location ends in a letter (e.g. 11.1a)
    next if ($location =~ m/[a-z]$/);
    # strip DOS carriage returns from the final field
    $stain =~ s/\r//g;
    # NCBI coordinates are 1-based; shift the first band to a 0 start
    $start -= 1 if ($start == 1);
    printf "chr%s\t%d\t%d\t%s%s\t%s\n", $chr, $start, $end, $arm, $location,
        $stain;
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x ideoToCytoBand.pl
./ideoToCytoBand.pl > cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \
hg19 cytoBand cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \
hg19 cytoBandIdeo cytobands.bed
# checking coverage:
featureBits -noRandom -noHap -countGaps hg19 cytoBand
# 3095677412 bases of 3095693983 (99.999%) in intersection
# that is everything except chrM:
echo 3095693983-3095677412 | bc -q
# 16571
##############################################################################
# UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
mkdir /hive/data/genomes/hg19/ensembl
cd /hive/data/genomes/hg19/ensembl
wget --timestamping \
'ftp://ftp.ensembl.org/pub/pre/homo_sapiens/GRCh37/dna/*'
# do not need the repeat masker sequence (although it would be
# interesting to measure to see how it compares)
rm -f *.dna_rm.*
# fortunately we have the same sizes as Ensembl for everything
# (except the haplotypes) and the sizes are unique for each sequence
# so we can relate the names via their sizes
mkdir /hive/data/genomes/hg19/bed/ucscToEnsembl
cd /hive/data/genomes/hg19/bed/ucscToEnsembl
# the toplevel file is a duplicate of everything else
ls /hive/data/genomes/hg19/ensembl/*.fa.gz | grep -v toplevel \
| while read F
do
zcat "${F}"
done | faCount stdin > faCount.txt
cat << '_EOF_' > relateUcscEnsembl.pl
#!/usr/bin/env perl
# Relate UCSC and Ensembl chromosome names via their sequence sizes.
# Every size is unique within each set (enforced by the die statements),
# so an exact size match identifies the same sequence in both sets.
# Output: tab-separated name-translation lines on stdout; special cases
# (haplotypes, chrM) are reported on lines starting with '#'.
use strict;
use warnings;

my %ucscChrs;       # key is size, value is UCSC chr name
open (FH,"<../../chrom.sizes") or die "can not read ../../chrom.sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+', $line);
    die "'$line\n'duplicate size in ../../chrom.sizes"
        if (exists($ucscChrs{$size}));
    $ucscChrs{$size} = $chr;
}
close (FH);

my %ensemblChrs;    # key is size, value is Ensembl chr name
open (FH,"<faCount.txt") or die "can not read faCount.txt";
while (my $line = <FH>) {
    next if ($line =~ m/#/);        # faCount header line
    next if ($line =~ m/total/);    # faCount summary line
    chomp $line;
    my ($chr, $size, $rest) = split('\s+', $line, 3);
    die "'$line\n'duplicate size in faCount.txt"
        if (exists($ensemblChrs{$size}));
    $ensemblChrs{$size} = $chr;
}
close (FH);

my %usedUcscChrs;
my %usedEnsemblChrs;
my %ensemblTranslate;   # key is Ensembl name, value is UCSC size
foreach my $size (keys %ucscChrs) {
    if (exists($ensemblChrs{$size})) {
        # exact size match: same sequence under both names
        $usedUcscChrs{$size} = $ucscChrs{$size};
        $usedEnsemblChrs{$size} = $ensemblChrs{$size};
        printf "%s\t%s\t%d\n", $ucscChrs{$size}, $ensemblChrs{$size}, $size;
    } else {
        # no size match: construct the expected Ensembl haplotype name
        # from the UCSC name, e.g. chr6_cox_hap1 -> HSCHR6_MHC_COX
        my $ucscName = $ucscChrs{$size};
        my $ensemblName = "unknown";
        if ($ucscName =~ m/^chr6/) {
            $ucscName =~ s/_hap.//;
            $ucscName =~ s/chr6_/chr6_mhc_/;
            $ensemblName = "HS" . uc($ucscName);
        } elsif ($ucscName =~ m/^chr17_/ || $ucscName =~ m/^chr4_/) {
            $ucscName =~ s/_.*/_1/;
            $ensemblName = "HS" . uc($ucscName);
        } elsif ($ucscName =~ m/^chrM/) {
            print "# no translation for chrM\n";
        } else {
            die "unknown UCSC chr name: $ucscName";
        }
        printf "# ucsc $ucscChrs{$size} -> $ensemblName\n";
        $ensemblTranslate{$ensemblName} = $size;
    }
}
# report Ensembl sequences whose size matched no UCSC sequence, paired
# with the UCSC haplotype recorded above (both sizes shown)
foreach my $size (keys %ensemblChrs) {
    if (!exists($usedEnsemblChrs{$size})) {
        my $ensemblName = $ensemblChrs{$size};
        if (! exists($ensemblTranslate{$ensemblName})) {
            die "can not translate Ensembl name $ensemblName";
        } else {
            my $ucscSize = $ensemblTranslate{$ensemblName};
            printf "%s\t%s\t%d\t%d\n", $ucscChrs{$ucscSize},
                $ensemblChrs{$size}, $ucscSize, $size;
        }
    }
}
# chrM has no size match (see "no translation for chrM" above), so its
# name equivalence is hard-coded here
printf "chrM\tMT\n";
'_EOF_'
# << happy emacs
chmod +x relateUcscEnsembl.pl
./relateUcscEnsembl.pl 2>&1 | grep -v "^#" \
| awk '{printf "%s\t%s\n", $1, $2}' | sort > ucscToEnsembl.tab
cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
ucsc varchar(255) not null, # UCSC chromosome name
ensembl varchar(255) not null, # Ensembl chromosome name
#Indices
PRIMARY KEY(ucsc(21))
);
'_EOF_'
hgsql hg19 < ucscToEnsembl.sql
hgsql hg19 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'
awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \
> ensemblLift.tab
cat << '_EOF_' > ensemblLift.sql
# UCSC offset to Ensembl coordinates
CREATE TABLE ensemblLift (
chrom varchar(255) not null, # Ensembl chromosome name
offset int unsigned not null, # offset to add to UCSC position
#Indices
PRIMARY KEY(chrom(15))
);
'_EOF_'
hgsql hg19 < ensemblLift.sql
hgsql hg19 \
-e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift'
##############################################################################
# LASTZ MOUSE Mm9 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
cat << '_EOF_' > DEF
# human vs mouse
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Mouse Mm9
SEQ2_DIR=/scratch/data/mm9/nib
SEQ2_SMSK=/scratch/data/mm9/notInOthers
SEQ2_LEN=/scratch/data/mm9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
cat fb.hg19.chainMm9Link.txt
# 1022734273 bases of 2897316137 (35.299%) in intersection
# and the swap
mkdir /hive/data/genomes/mm9/bed/blastz.hg19.swap
cd /hive/data/genomes/mm9/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13/DEF \
-swap -noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 131m58.763s
cat fb.mm9.chainHg19Link.txt
# 1013880568 bases of 2620346127 (38.693%) in intersection
#########################################################################
# LASTZ Dog CanFam2 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
cat << '_EOF_' > DEF
# human vs dog
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2
SEQ2_DIR=/scratch/data/canFam2/nib
SEQ2_LEN=/scratch/data/canFam2/chrom.sizes
SEQ2_SMSK=/scratch/scratch/data/canFam2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
cat fb.hg19.chainCanFam2Link.txt
# 1532073507 bases of 2897316137 (52.879%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/canFam2/bed/blastz.hg19.swap
cd /hive/data/genomes/canFam2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13/DEF \
-noLoadChainSplit -swap \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 200m17.158s
cat fb.canFam2.chainHg19Link.txt
# 1480018167 bases of 2384996543 (62.055%) in intersection
#########################################################################
# LASTZ Chicken GalGal3 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
cat << '_EOF_' > DEF
# human vs chicken
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/galGal3/nib
SEQ2_LEN=/scratch/data/galGal3/chrom.sizes
SEQ2_SMSK=/scratch/data/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
cat fb.hg19.chainGalGal3Link.txt
# 104053179 bases of 2897316137 (3.591%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/galGal3/bed/blastz.hg19.swap
cd /hive/data/genomes/galGal3/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13/DEF \
-swap \
-noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
# real 16m45.090s
cat fb.galGal3.chainHg19Link.txt
# 91605899 bases of 1042591351 (8.786%) in intersection
#########################################################################
# LASTZ Macaca Mulatta RheMac2 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
cat << '_EOF_' > DEF
# human vs macaca mulatta
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Macaca Mulatta RheMac2
SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit
SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 760m22.810s
cat fb.hg19.chainRheMac2Link.txt
# 2397361211 bases of 2897316137 (82.744%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
cd /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13/DEF \
-swap \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 83m51.483s
cat fb.rheMac2.chainHg19Link.txt
# 2313806886 bases of 2646704109 (87.422%) in intersection
#########################################################################
# LASTZ Rat Rn4 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
cat << '_EOF_' > DEF
# human vs rat
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn4
SEQ2_DIR=/scratch/data/rn4/nib
SEQ2_SMSK=/scratch/data/rn4/linSpecRep.notInHuman
SEQ2_LEN=/scratch/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 314m18.227s
cat fb.hg19.chainRn4Link.txt
# 952605822 bases of 2897316137 (32.879%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/rn4/bed/blastz.hg19.swap
cd /hive/data/genomes/rn4/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13/DEF \
-swap -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 188m0.163s
cat fb.rn4.chainHg19Link.txt
# 947862300 bases of 2571531505 (36.860%) in intersection
##############################################################################
# LASTZ Orangutan PonAbe2 (DONE - 2009-05-13 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
cd /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
cat << '_EOF_' > DEF
# human vs orangutan
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Orangutan PonAbe2
SEQ2_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/scratch/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
cat fb.hg19.chainPonAbe2Link.txt
# 2646687531 bases of 2897316137 (91.350%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
cd /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13/DEF \
-swap \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> swap.log 2>&1 &
# real 124m3.610s
cat fb.ponAbe2.chainHg19Link.txt
# 2772351468 bases of 3093572278 (89.617%) in intersection
##############################################################################
# LASTZ Lamprey PetMar1 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Lamprey
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ2_LIMIT=5
# QUERY: Lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
> do.log 2>&1 &
# real 113m20.116s
cat fb.hg19.chainPetMar1Link.txt
# 31347143 bases of 2897316137 (1.082%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/petMar1/bed/blastz.hg19.swap
cd /hive/data/genomes/petMar1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-swap > swap.log 2>&1 &
# real 59m14.813s
cat fb.petMar1.chainHg19Link.txt
# 26615001 bases of 831696438 (3.200%) in intersection
##############################################################################
# LASTZ Fugu Fr2 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Fugu fr2
# Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/scratch/data/fr2/fr2.2bit
SEQ2_LEN=/hive/data/genomes/fr2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/fr2/noUn/fr2.scaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/fr2/noUn/fr2.scaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/fr2/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
> do.log 2>&1 &
# real 5797m9.288s
# had a small problem finishing the fundamental batch run, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
> cat.log 2>&1 &
cat fb.hg19.chainFr2Link.txt
# 49309456 bases of 2897316137 (1.702%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/fr2/bed/blastz.hg19.swap
cd /hive/data/genomes/fr2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
-swap > swap.log 2>&1 &
# real 25m8.491s
cat fb.fr2.chainHg19Link.txt
# 42984130 bases of 393312790 (10.929%) in intersection
##############################################################################
# LASTZ Tetraodon TetNig1 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
cat << '_EOF_' > DEF
# human vs tetraodon
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/data/tetNig1/tetNig1.2bit
SEQ2_LEN=/hive/data/genomes/tetNig1/chrom.sizes
SEQ2_CHUNK=410000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 166m19.745s
cat fb.hg19.chainTetNig1Link.txt
# 58038079 bases of 2897316137 (2.003%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
cd /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-swap > swap.log 2>&1 &
# real 29m20.968s
cat fb.tetNig1.chainHg19Link.txt
# 49453375 bases of 342403326 (14.443%) in intersection
##############################################################################
# LASTZ Stickleback GasAcu1 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Stickleback
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Stickleback gasAcu1
SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
SEQ2_LEN=/hive/data/genomes/gasAcu1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 174m40.659s
cat fb.hg19.chainGasAcu1Link.txt
# 55509003 bases of 2897316137 (1.916%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
cd /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-swap > swap.log 2>&1 &
# real 29m41.433s
cat fb.gasAcu1.chainHg19Link.txt
# 49909819 bases of 446627861 (11.175%) in intersection
##############################################################################
# LASTZ Marmoset CalJac1 (DONE - 2009-05-14,22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
cat << '_EOF_' > DEF
# human vs. marmoset
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Marmoset (calJac1)
SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/scratch/data/calJac1/chrom.sizes
SEQ2_LIMIT=200
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 214m16.294s
cat fb.hg19.chainCalJac1Link.txt
# 2053025318 bases of 2897316137 (70.860%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 calJac1 > rbest.log 2>&1 &
# real 97m17.207s
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/calJac1/bed/blastz.hg19.swap
cd /hive/data/genomes/calJac1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14/DEF \
-swap \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 162m52.189s
cat fb.calJac1.chainHg19Link.txt
# 2105959656 bases of 2929139385 (71.897%) in intersection
#########################################################################
# LASTZ Tarsier TarSyr1 (DONE - 2009-05-14,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Tarsier
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Tarsier
SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit
SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1724m48.032s
# need to load the chain table manually:
# mySQL error 1114: The table 'chainTarSyr1Link' is full
cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14/axtChain
wc -l *.tab
# 21882142 chain.tab
# 165017606 link.tab
# 186899748 total
awk '{print length($0)}' link.tab | sort | uniq -c | less
4 23
9 24
27 25
105 26
767 27
1401 28
5020 29
8472 30
24390 31
117666 32
264774 33
776095 34
1632393 35
2672187 36
7125988 37
16831901 38
34905113 39
45218159 40
31570706 41
13746548 42
5868689 43
2460114 44
1118556 45
420826 46
106674 47
36770 48
40719 49
36955 50
19389 51
5571 52
1557 53
61 54
time nice -n +19 hgsql -e "DROP TABLE chainTarSyr1Link;" hg19
cat << '_EOF_' | hgsql hg19
-- Manually recreate chainTarSyr1Link: the automated hgLoadChain step hit
-- MySQL error 1114 "The table 'chainTarSyr1Link' is full" (see above), so
-- raise max_rows/avg_row_length to accommodate the ~165 million link rows
-- counted in link.tab before reloading the data.
CREATE TABLE chainTarSyr1Link (
bin smallint(5) unsigned NOT NULL default 0,
tName varchar(255) NOT NULL default '',
tStart int(10) unsigned NOT NULL default 0,
tEnd int(10) unsigned NOT NULL default 0,
qStart int(10) unsigned NOT NULL default 0,
chainId int(10) unsigned NOT NULL default 0,
KEY tName (tName(16),bin),
KEY chainId (chainId)
) ENGINE=MyISAM max_rows=166000000 avg_row_length=42 pack_keys=1 CHARSET=latin1;
'_EOF_'
# << happy emacs
time nice -n +19 hgsql -e \
"load data local infile \"link.tab\" into table chainTarSyr1Link;" hg19
# real 157m0.230s
# then running the rest of loadUp.csh after the hgLoadChain
# real 26m8.263s
cat fb.hg19.chainTarSyr1Link.txt
# 1385797066 bases of 2897316137 (47.830%) in intersection
# Continuing:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-continue=download -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> download.log 2>&1 &
# real 48m6.573s
# ran the script on swarm to recover after hive outages
time doRecipBest.pl -buildDir=`pwd` hg19 tarSyr1 > rbest.log 2>&1 &
# real 404m0.201s
time doRecipBest.pl -continue=download -buildDir=`pwd` \
hg19 tarSyr1 > rbest.download.log 2>&1 &
#########################################################################
# LASTZ Bushbaby OtoGar1 (DONE - 2009-05-14,22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Bushbaby
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Bushbaby otoGar1 - single chunk big enough to run largest scaffold
SEQ2_DIR=/scratch/data/otoGar1/otoGar1.rmsk.2bit
SEQ2_LEN=/hive/data/genomes/otoGar1/chrom.sizes
SEQ2_LIMIT=200
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 762m56.055s
cat fb.hg19.chainOtoGar1Link.txt
# 1264492372 bases of 2897316137 (43.644%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 otoGar1 > rbest.log 2>&1 &
# real 271m39.925s
#########################################################################
# LASTZ Mouse lemur MicMur1 (DONE - 2009-05-14,26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
cat << '_EOF_' > DEF
# Human vs. Mouse lemur
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Mouse lemur
SEQ2_DIR=/hive/data/genomes/micMur1/bed/repeatMasker/micMur1.rmsk.2bit
SEQ2_LEN=/hive/data/genomes/micMur1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 5429m52.082s
# there is one unusual long running job having trouble
# continuing after finishing the lastz run manually:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-continue=cat -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> cat.log 2>&1 &
# real 388m25.032s
cat fb.hg19.chainMicMur1Link.txt
# 1347792207 bases of 2897316137 (46.519%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 micMur1 > rbest.log 2>&1
# about 4h30m
#########################################################################
# LASTZ Baboon PapHam1 (DONE - 2009-05-20,22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
cd /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
cat << '_EOF_' > DEF
# human vs baboon
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Baboon papHam1
SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit
SEQ2_LEN=/scratch/data/papHam1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# forgot that the synNet was not needed here, use recip best as below
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
cat fb.hg19.chainPapHam1Link.txt
# 2399269031 bases of 2897316137 (82.810%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 papHam1 > rbest.log 2>&1
# real 182m0.276s
#########################################################################
# SGP GENES (DONE - 2009-05-22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/sgpGene
cd /hive/data/genomes/hg19/bed/sgpGene
mkdir download
cd download
for C in `cut -f1 ../../../chrom.sizes`
do
echo $C
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.gtf
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.prot
done
cd ..
cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 sgpGene stdin
# Read 33994 transcripts in 291782 lines in 1 files
# 33994 groups 85 seqs 1 sources 3 feature types
# 33994 gene predictions
nice -n +19 featureBits -enrichment hg19 refGene:CDS sgpGene
# refGene:CDS 1.181%, sgpGene 1.295%, both 1.011%, cover 85.59%, enrich 66.08x
###########################################################################
# GENEID GENE PREDICTIONS (DONE - 2009-05-22 - Hiram)
ssh hgwdev
mkdir /hive/data/genomes/hg19/bed/geneid
cd /hive/data/genomes/hg19/bed/geneid
mkdir download
cd download
for C in `cut -f1 ../../../chrom.sizes`
do
echo $C
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.gtf
wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.prot
done
cd ..
cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 geneid stdin
# Read 33428 transcripts in 277332 lines in 1 files
# 33428 groups 92 seqs 1 sources 3 feature types
# 33428 gene predictions
##########################################################################
## 4-Way Multiz for UCSC Genes construction (DONE - 2009-05-22 - Hiram)
ssh hgwdev
mkdir /hive/data/genomes/hg19/bed/multiz4way
cd /hive/data/genomes/hg19/bed/multiz4way
# extract our 4 organisms from the 44-way on hg18:
ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh
/cluster/bin/phast/tree_doctor \
--prune-all-but hg18,mm9,canFam2,rheMac2 44way.nh \
| sed -e "s/hg18/hg19/" > 4way.nh
# this looks like:
cat 4way.nh
(((hg19:0.032973,rheMac2:0.036199):0.109706,mm9:0.352605):0.020666,canFam2:0.193569);
# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a gif image for htdocs/images/phylo/hg19_4way.gif
/cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
# Use this output to create the table below
grep -y hg19 4way.distances.txt | sort -k3,3n
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# featureBits chainLink measures
# chainHg19Link chain linearGap
# distance on hg19 on other minScore
# 1 0.069172 - rhesus rheMac2 (% 82.744) (% xx.xxx) 5000 medium
# 2 0.356914 - dog canFam2 (% 52.879) (% xx.xxx) 3000 medium
# 3 0.495284 - mouse mm9 (% 35.299) (% 38.693) 3000 medium
# using the syntenic nets
cd /cluster/data/hg19/bed/multiz4way
mkdir mafLinks
cd mafLinks
mkdir rheMac2 canFam2 mm9
cd mm9
ln -s ../../../lastz.mm9/mafSynNet/*.maf.gz .
cd ../canFam2
ln -s ../../../lastz.canFam2/mafSynNet/*.maf.gz .
cd ../rheMac2
ln -s ../../../lastz.rheMac2/mafSynNet/*.maf.gz .
# determine what is the newest version of multiz and use that
cd /hive/data/genomes/hg19/bed/multiz4way
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn
# the autoMultiz cluster run
ssh swarm
cd /hive/data/genomes/hg19/bed/multiz4way
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
4way.nh > tmp.nh
echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
mkdir run maf
cd run
# NOTE: you need to set the db and multiz dirname properly in this script
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
# autoMultiz - cluster job: build the 4-way multiz maf for one chromosome.
# Usage: autoMultiz <chrom> <outputMafPath>
# Stages each species' pairwise maf into local scratch, runs Penn autoMZ
# over them per tree.nh, then copies the combined maf to <outputMafPath>.
set db = hg19
set c = $1
set maf = $2
set binDir = /hive/data/genomes/hg19/bed/multiz4way/penn
# work in node-local scratch to minimize NFS traffic during the cluster run
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg19/bed/multiz4way/mafLinks
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
# stage one single-coverage maf per species; the reference itself is skipped
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
# no pairwise alignment exists for this chrom: provide an empty maf stub
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
# autoMZ must find multiz/maf_project on PATH, so prepend the penn bin dir
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
# deliver the result where parasol's "check out" expects it, then clean up
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz4way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/hg19/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
# 93 jobs
para try ... check ... push ... etc ...
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 24282s 404.70m 6.75h 0.28d 0.001 y
# IO & Wait Time: 2362s 39.36m 0.66h 0.03d 0.000 y
# Average job time: 286s 4.77m 0.08h 0.00d
# Longest finished job: 2235s 37.25m 0.62h 0.03d
# Submission to last job: 2241s 37.35m 0.62h 0.03d
# combine results into a single file for loading and gbdb reference
cd /hive/data/genomes/hg19/bed/multiz4way
time nice -n +19 catDir maf > multiz4way.maf
# real 3m27.561s
# makes a 8.5 Gb file:
# -rw-rw-r-- 1 9026080732 May 22 11:11 multiz4way.maf
# Load into database
ssh hgwdev
cd /hive/data/genomes/hg19/bed/multiz4way
mkdir /gbdb/hg19/multiz4way
ln -s /hive/data/genomes/hg19/bed/multiz4way/multiz4way.maf \
/gbdb/hg19/multiz4way
# the hgLoadMaf generates huge tmp files, locate them in /scratch/tmp/
cd /scratch/tmp
time nice -n +19 hgLoadMaf hg19 multiz4way
# real 5m31.883s
# Loaded 5788627 mafs in 1 files from /gbdb/hg19/multiz4way
cd /hive/data/genomes/hg19/bed/multiz4way
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 hg19 multiz4waySummary multiz4way.maf
# Created 1238721 summary blocks from 11959676 components
# and 5788627 mafs from multiz4way.maf
# real 6m33.936s
#########################################################################
# LASTZ Medaka OryLat2 (DONE - 2009-05-22 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
cd /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
cat << '_EOF_' > DEF
# Human vs. Medaka
# typical parameters for a genome that is distant from human
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ2_LEN=/hive/data/genomes/oryLat2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 124m5.298s
cat fb.hg19.chainOryLat2Link.txt
# 53571737 bases of 2897316137 (1.849%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
cd /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-swap > swap.log 2>&1 &
# real 28m35.174s
cat fb.oryLat2.chainHg19Link.txt
# 46961818 bases of 700386597 (6.705%) in intersection
##############################################################################
# LASTZ Opossum MonDom5 (DONE - 2009-05-23,29 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
cd /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
cat << '_EOF_' > DEF
# human vs. opossum
# settings for more distant organism alignments
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Opossum monDom5
SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# One job took a long time to complete, had to run it manually on
# swarm:
# /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
# /scratch/data/hg19/hg19.2bit:chr19:50000000-59128983 \
# /scratch/data/monDom5/monDom5.2bit:chr4:390000000-420000000 \
# ../DEF \
# ../psl/hg19.2bit:chr19:50000000-59128983/hg19.2bit:chr19:50000000-59128983_monDom5.2bit:chr4:390000000-420000000.psl
# took about 48 hours, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=cat > cat.log 2>&1 &
# real 1508m18.471s == about 25h08m
cat fb.hg19.chainMonDom5Link.txt
# 415997117 bases of 2897316137 (14.358%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
# real 20m29.049s
mkdir /hive/data/genomes/monDom5/bed/blastz.hg19.swap
cd /hive/data/genomes/monDom5/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap -syntenicNet > swap.log 2>&1 &
# real 297m13.041s
cat fb.monDom5.chainHg19Link.txt
# 406727849 bases of 3501660299 (11.615%) in intersection
##############################################################################
# LASTZ Armadillo DasNov2 (DONE - 2009-05-23,28 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
cd /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
cat << '_EOF_' > DEF
# Human vs. Armadillo
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Armadillo
SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit
SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# finished the lastz run manually after hive maintenance outages
# then, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-continue=cat > cat.log 2>&1 &
# real 458m11.304s
cat fb.hg19.chainDasNov2Link.txt
# 971847303 bases of 2897316137 (33.543%) in intersection
time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 dasNov2 \
> rbest.log 2>&1
# time about 6h30m
##############################################################################
# LASTZ Rock Hyrax ProCap1 (DONE - 2009-05-23,26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
cd /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
cat << '_EOF_' > DEF
# Human vs. Rock Hyrax
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Rock Hyrax
SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit
SEQ2_LEN=/scratch/data/proCap1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# Completed: 997438 of 997438 jobs
# CPU time in finished jobs: 32830587s 547176.45m 9119.61h 379.98d 1.041 y
# IO & Wait Time: 9549484s 159158.07m 2652.63h 110.53d 0.303 y
# Average job time: 42s 0.71m 0.01h 0.00d
# Longest finished job: 1953s 32.55m 0.54h 0.02d
# Submission to last job: 67216s 1120.27m 18.67h 0.78d
# finished lastz run manually, then continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=cat > cat.log 2>&1 &
# real 369m1.678s
cat fb.hg19.chainProCap1Link.txt
# 894221652 bases of 2897316137 (30.864%) in intersection
time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 proCap1 \
> rbest.log 2>&1
# real 251m59.549s
##############################################################################
# LASTZ Zebra Finch TaeGut1 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
cat << '_EOF_' > DEF
# human vs Zebra Finch
# distant from Human settings
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebra Finch taeGut1 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-qRepeats=windowmaskerSdust > do.log 2>&1 &
cat fb.hg19.chainTaeGut1Link.txt
# real 192m48.479s
# 101295490 bases of 2897316137 (3.496%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=syntenicNet -qRepeats=windowmaskerSdust > synNet.log 2>&1 &
# real 4m10.261s
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
cd /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26/DEF \
-swap -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-qRepeats=windowmaskerSdust > swap.log 2>&1 &
# real 16m45.080s
cat fb.taeGut1.chainHg19Link.txt
# 95320369 bases of 1222864691 (7.795%) in intersection
##############################################################################
# LASTZ Lizard AnoCar1 (DONE - 2009-05-30,31 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
cd /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
cat << '_EOF_' > DEF
# human vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Lizard anoCar1
SEQ2_DIR=/scratch/data/anoCar1/anoCar1.2bit
SEQ2_LEN=/scratch/data/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-qRepeats=windowmaskerSdust > do.log 2>&1 &
# real 168m32.016s
cat fb.hg19.chainAnoCar1Link.txt
# 104045950 bases of 2897316137 (3.591%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 anoCar1 > rbest.log 2>&1
# real 45m58.001s
# running syntenic Net 2009-08-27 - Hiram
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet \
-qRepeats=windowmaskerSdust > syntenicNet.log 2>&1 &
# real 6m13.304s
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
cd /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
# real 34m55.857s
cat fb.anoCar1.chainHg19Link.txt
# 89608316 bases of 1741478929 (5.146%) in intersection
##############################################################################
# LASTZ X. tropicalis XenTro2 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
cat << '_EOF_' > DEF
# human vs X. tropicalis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: X. tropicalis xenTro2
SEQ2_DIR=/scratch/data/xenTro2/xenTro2.2bit
SEQ2_LEN=/scratch/data/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1129m11.568s
# finished the lastz run manually after hive difficulties, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=cat > cat.log 2>&1 &
# time about 1h30m
cat fb.hg19.chainXenTro2Link.txt
# 92015242 bases of 2897316137 (3.176%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
cd /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 130m53.860s
cat fb.xenTro2.chainHg19Link.txt
# 92070065 bases of 1359412157 (6.773%) in intersection
##############################################################################
# LASTZ Zebrafish DanRer5 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
cat << '_EOF_' > DEF
# human vs. zebrafish
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish danRer5
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/scratch/data/danRer5/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=40
BASE=/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 311m39.817s
cat fb.hg19.chainDanRer5Link.txt
# 74229561 bases of 2897316137 (2.562%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/danRer5/bed/blastz.hg19.swap
cd /hive/data/genomes/danRer5/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 26m54.605s
cat fb.danRer5.chainHg19Link.txt
# 73852780 bases of 1435609608 (5.144%) in intersection
##############################################################################
# LASTZ Platypus OrnAna1 (DONE - 2009-05-26 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
cd /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
cat << '_EOF_' > DEF
# human vs platypus
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Platypus ornAna1
SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 572m18.808s
cat fb.hg19.chainOrnAna1Link.txt
# 220977689 bases of 2897316137 (7.627%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 ornAna1 > rbest.log 2>&1
# time about 1h32m
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
cd /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26/DEF \
-swap -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 146m52.638s
cat fb.ornAna1.chainHg19Link.txt
# 207415519 bases of 1842236818 (11.259%) in intersection
##############################################################################
# LASTZ Elephant LoxAfr2 (DONE - 2009-05-27,29 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
cd /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
cat << '_EOF_' > DEF
# Human vs. Elephant
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Elephant
SEQ2_DIR=/scratch/data/loxAfr2/loxAfr2.2bit
SEQ2_LEN=/scratch/data/loxAfr2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# time about 3h23m
cat fb.hg19.chainLoxAfr2Link.txt
# 1018502258 bases of 2897316137 (35.153%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr2 > rbest.log 2>&1
# real 322m37.502s
##############################################################################
# LASTZ Tenrec EchTel1 (DONE - 2009-05-27 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
cd /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
cat << '_EOF_' > DEF
# Human vs. Tenrec
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tenrec
SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/data/echTel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1153m34.595s
cat fb.hg19.chainEchTel1Link.txt
# 669856841 bases of 2897316137 (23.120%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 echTel1 > rbest.log 2>&1
# time about 7h13m
##############################################################################
# LASTZ Tree Shrew TupBel1 (DONE - 2009-05-27,06-02 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
cat << '_EOF_' > DEF
# Human vs. Tree Shrew
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tree Shrew
SEQ2_DIR=/scratch/data/tupBel1/tupBel1.2bit
SEQ2_LEN=/scratch/data/tupBel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
> do.log 2>&1 &
# real 811m54.095s
# having trouble with pk, finished manually
# XXX there is one job that is taking forever ...
# finished it in pieces on swarm in a few minutes, like this:
mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
#!/bin/sh
# Manually finish the one stuck blastz job by breaking its 10 Mbp target
# window (chr1:100,000,000-110,010,000) into ten smaller pieces:
# 1,010,000 bp windows advanced 1 Mbp per step (the extra 10,000 bp is
# the overlap, matching SEQ1_LAP=10000 in the DEF file).
S=100000000
E=101010000
export S E
for I in 0 1 2 3 4 5 6 7 8 9
do
echo $S $E
# align one target window against query partition part019, writing the
# piece's psl into ./psl/ with the same naming the cluster run would use
/usr/bin/time -p /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
/scratch/data/hg19/nib/chr1.nib:chr1:${S}-${E} ../qParts/part019.lst \
../../DEF psl/chr1.nib:chr1:${S}-${E}_part019.lst.psl
# advance the window by 1 Mbp (awk used here just for integer arithmetic)
nextS=`echo $S | awk '{printf "%d", $1 + 1000000}'`
nextE=`echo $E | awk '{printf "%d", $1 + 1000000}'`
S=$nextS
E=$nextE
done
grep -h "^#" psl/chr* | sort -u > result.psl
grep -h -v "^#" psl/chr* | sort -k14,14 -k16,16n >> result.psl
cp -p result.psl \
../../psl/chr1.nib:chr1:100000000-110010000/chr1.nib:chr1:100000000-110010000_part019.lst.psl
# then, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-continue=cat > cat.log 2>&1 &
# real 212m22.707s
time doRecipBest.pl -buildDir=`pwd` hg19 tupBel1 > rbest.log 2>&1
# time about 4h22m
##############################################################################
# LASTZ Shrew SorAra1 (DONE - 2009-05-28,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
cd /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
cat << '_EOF_' > DEF
# Human vs. Shrew
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Shrew
SEQ2_DIR=/scratch/data/sorAra1/sorAra1.2bit
SEQ2_LEN=/scratch/data/sorAra1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# time about 23h26m
cat fb.hg19.chainSorAra1Link.txt
# 572519288 bases of 2897316137 (19.760%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 sorAra1 > rbest.log 2>&1
# real 251m20.055s
##############################################################################
# LASTZ Rabbit OryCun1 (DONE - 2009-05-28,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
cd /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
cat << '_EOF_' > DEF
# Human vs. Rabbit
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rabbit
SEQ2_DIR=/scratch/data/oryCun1/oryCun1.2bit
SEQ2_LEN=/scratch/data/oryCun1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# time about 23h09m
cat fb.hg19.chainOryCun1Link.txt
# 975693323 bases of 2897316137 (33.676%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 oryCun1 > rbest.log 2>&1
# real 318m1.142s
##############################################################################
# LASTZ Hedgehog EriEur1 (DONE - 2009-05-28,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
cd /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
cat << '_EOF_' > DEF
# Human vs. Hedgehog
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Hedgehog
SEQ2_DIR=/scratch/data/eriEur1/eriEur1.2bit
SEQ2_LEN=/scratch/data/eriEur1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=500
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
# real 2043m33.198s
cat fb.hg19.chainEriEur1Link.txt
# 560965051 bases of 2897316137 (19.362%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 eriEur1 > rbest.log 2>&1
# real 350m17.737s
##############################################################################
# LASTZ Pika OchPri2 (DONE - 2009-05-29,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
cd /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
cat << '_EOF_' > DEF
# Human vs. Pika
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Pika
SEQ2_DIR=/scratch/data/ochPri2/ochPri2.2bit
SEQ2_LEN=/scratch/data/ochPri2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 393m42.569s
cat fb.hg19.chainOchPri2Link.txt
# 804516397 bases of 2897316137 (27.768%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 ochPri2 > rbest.log 2>&1
# real 224m47.979s
##############################################################################
# LASTZ Kangaroo Rat DipOrd1 (DONE - 2009-05-29,30 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
cd /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
cat << '_EOF_' > DEF
# Human vs. Kangaroo Rat
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Kangaroo Rat
SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit
SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 688m47.595s
time doRecipBest.pl -buildDir=`pwd` hg19 dipOrd1 > rbest.log 2>&1
# real 140m42.014s
##############################################################################
# LIFTOVER TO Hg18 (DONE - 2009-06-04 - Hiram )
mkdir /hive/data/genomes/hg19/bed/blat.hg18.2009-06-04
cd /hive/data/genomes/hg19/bed/blat.hg18.2009-06-04
# -debug run to create run dir, preview scripts...
# verifies files can be found
doSameSpeciesLiftOver.pl -debug hg19 hg18
# Real run:
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
hg19 hg18 > do.log 2>&1
# real 115m26.071s
#############################################################################
# BLASTZ/CHAIN/NET/ETC 11 GENOMES TO HG19 (DONE, Andy 2009-06-06)
ssh hgwdev
cd /hive/data/genomes/hg19/bed
mkdir lastz{SpeTri1,FelCat3,CavPor3,BosTau4,PteVam1,EquCab2,VicPac1,MyoLuc1,TurTru1,ChoHof1}.2009-06-04
ln -s lastzSpeTri1.2009-06-04 lastz.speTri1
ln -s lastzFelCat3.2009-06-04 lastz.felCat3
ln -s lastzCavPor3.2009-06-04 lastz.cavPor3
ln -s lastzBosTau4.2009-06-04 lastz.bosTau4
ln -s lastzPteVam1.2009-06-04 lastz.pteVam1
ln -s lastzEquCab2.2009-06-04 lastz.equCab2
ln -s lastzVicPac1.2009-06-04 lastz.vicPac1
ln -s lastzMyoLuc1.2009-06-04 lastz.myoLuc1
ln -s lastzTurTru1.2009-06-04 lastz.turTru1
ln -s lastzChoHof1.2009-06-04 lastz.choHof1
cat > lastz.speTri1/DEF << 'EOF'
# human vs squirrel
# TARGET: human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: squirrel speTri1
SEQ2_DIR=/hive/data/genomes/speTri1/speTri1.2bit
SEQ2_LEN=/hive/data/genomes/speTri1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastz.speTri1
TMPDIR=/scratch/tmp
EOF
sed 's/speTri1/felCat3/g; s/squirrel/cat/;' lastz.speTri1/DEF | \
sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' \
> lastz.felCat3/DEF
sed 's/speTri1/cavPor3/g; s/squirrel/guinea pig/;' lastz.speTri1/DEF | \
sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/' | \
sed 's/hive\/data\/genomes\/cavPor3/scratch\/data\/cavPor3/' \
> lastz.cavPor3/DEF
sed 's/speTri1/bosTau4/g; s/squirrel/cow/;' lastz.speTri1/DEF | \
sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' \
> lastz.bosTau4/DEF
sed 's/speTri1/pteVam1/g; s/squirrel/megabat/;' lastz.speTri1/DEF | \
sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=2/' \
> lastz.pteVam1/DEF
sed 's/cavPor3/equCab2/g; s/guinea pig/horse/' lastz.cavPor3/DEF | \
sed 's/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' > lastz.equCab2/DEF
sed 's/equCab2/vicPac1/g; s/horse/alpaca/' lastz.equCab2/DEF > lastz.vicPac1/DEF
sed 's/pteVam1/myoLuc1/g; s/megabat/microbat/' lastz.pteVam1/DEF | \
sed 's/SEQ2_LIMIT=3/SEQ2_LIMIT=2/' > lastz.myoLuc1/DEF
sed 's/equCab2/turTru1/g; s/horse/dolphin/' lastz.equCab2/DEF | \
sed 's/SEQ2_LIMIT=3/SEQ2_LIMIT=2/' > lastz.turTru1/DEF
sed 's/equCab2/choHof1/g; s/horse/sloth/' lastz.equCab2/DEF > lastz.choHof1/DEF
cd andy/
for db in speTri1 felCat3 cavPor3 bosTau4 pteVam1 equCab2 vicPac1 myoLuc1 turTru1 choHof1; do
ln -s ../lastz.${db}/DEF ${db}.DEF
done
screen -S speTri1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium speTri1.DEF >& speTri1.do.log
# [detach screen]
#real 2059m30.699s
screen -S felCat3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium felCat3.DEF >& felCat3.do.log
# [detach screen]
#real 1574m47.522s
screen -S bosTau4
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium bosTau4.DEF >& bosTau4.do.log
# [detach screen]
#real 1474m54.655s
screen -S pteVam1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium pteVam1.DEF >& pteVam1.do.log
# [detach screen]
#real 1168m33.923s
screen -S equCab2
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium -syntenicNet equCab2.DEF >& equCab2.do.log
# [detach screen]
#real 1662m56.158s
# (included syntenic net)
screen -S vicPac1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium vicPac1.DEF >& vicPac1.do.log
# [detach screen]
#real 1495m48.173s
screen -S turTru1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium turTru1.DEF >& turTru1.do.log
# [detach screen]
#real 1079m17.234s
screen -S choHof1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium choHof1.DEF >& choHof1.do.log
# [detach screen]
#real 1310m49.287s (script and cluster run stopped after halfway...
# pk was too slow... remaining jobs started on swarm)
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium -continue=cat \
choHof1.DEF >& choHof1.doAfterBlastz.log
#real 257m32.701s
screen -S cavPor3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \
-smallClusterHub=memk -bigClusterHub=pk cavPor3.DEF >& cavPor3.do.log
# [detach screen]
#real 1370m5.258s
# TROUBLE! got to the 'load' step and failed. This one needs a special
# chain table and chainLink table to get loaded.
cd ../lastz.cavPor3/axtChain/
# figure out number of rows and average length
wc -l *.tab
# 27186468 chain.tab
# 240602108 link.tab
randomLines link.tab 10000000 stdout | awk '{print length($0)}' | sort | uniq -c
randomLines chain.tab 1000000 stdout | awk '{print length($0)}' | sort | uniq -c
# about 43 average length for the chainLink and 100 for the chain
sed "s/hgLoadChain.*/hgsqldump hg19 chainSpeTri1Link --no-data --skip-comments | sed \'s\/SpeTri1\/CavPor3\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=241000000 avg_row_length=43 pack_keys=1 CHARSET=latin1\/\' | hgsql hg19 \n\
hgsqldump hg19 chainSpeTri1 --no-data --skip-comments | sed \'s\/SpeTri1\/CavPor3\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=27200000 avg_row_length=100 pack_keys=1 CHARSET=latin1\/\' | hgsql hg19 \n\
hgsql hg19 -e \"load data local infile \'chain.tab\' into table chainCavPor3\"\n\
hgsql hg19 -e \"load data local infile \'link.tab\' into table chainCavPor3Link\"\n\
hgsql hg19 -e \"INSERT into history (ix, startId, endId, who, what, modTime, errata) VALUES(NULL,0,0,\'aamp\',\'Loaded 27186468 chains into cavPor3 chain table manually\', NOW(), NULL)\"\
/" loadUp.csh > manualLoadUp.csh
chmod +x manualLoadUp.csh
time nice -n +19 ./manualLoadUp.csh
# [detach screen]
#real 584m4.093s
cd ../../andy/
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \
-smallClusterHub=memk -bigClusterHub=swarm -continue=download \
cavPor3.DEF >& cavPor3.doAfterLoad.log
#real 5m45.122s
# syntenic nets
screen -r bosTau4
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium -syntenicNet \
-continue=syntenicNet bosTau4.DEF >& bosTau4.syn.log
#real 31m48.545s
# reciprocal best choHof1 and cavPor3
screen -r choHof1
time nice -n +19 doRecipBest.pl -buildDir=/hive/data/genomes/hg19/bed/lastz.choHof1 \
-workhorse=hgwdev hg19 choHof1 >& choHof1.doRecip.log
#real 367m52.993s
screen -r cavPor3
time nice -n +19 doRecipBest.pl -buildDir=/hive/data/genomes/hg19/bed/lastz.cavPor3 \
-workhorse=hgwdev hg19 cavPor3 >& cavPor3.doRecip.log
#real 123m3.795s
# reciprocal best small six genome memk run
screen -S recipRun
mkdir recipRun
cd recipRun/
cat > gsub << 'EOF'
#LOOP
./doRecip.sh $(path1)
#ENDLOOP
EOF
cat > doRecip.sh << 'EOF'
#!/bin/csh -ef
set db = $1
/cluster/bin/scripts/doRecipBest.pl -workhorse=`uname -n` -stop=recipBest -buildDir=/hive/data/genomes/hg19/bed/lastz.$db hg19 $db >& $db.recipBest.log
EOF
chmod +x doRecip.sh
cat > db.lst << 'EOF'
speTri1
vicPac1
myoLuc1
turTru1
pteVam1
felCat3
EOF
ssh memk
cd /hive/data/genomes/hg19/bed/andy/recipRun
gensub2 db.lst single gsub jobList
para create jobList
para push
# finished overnight
exit # to hgwdev
for log in *.recipBest.log; do
db=${log%.recipBest.log};
echo $db;
doRecipBest.pl -workhorse=hgwdev -continue=download \
-buildDir=/hive/data/genomes/hg19/bed/lastz.$db \
hg19 $db >& $db.recipBestDownload.log;
done
# swaps for equCab2, felCat3, bostTau4, cavPor3
cd /hive/data/genomes/hg19/bed/andy
screen -r equCab2
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u01 \
-chainMinScore=3000 -chainLinearGap=medium -swap equCab2.DEF >& equCab2.doSwap.log
# [detach screen]
#real 486m35.206s
screen -r felCat3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u02 \
-chainMinScore=3000 -chainLinearGap=medium -swap felCat3.DEF >& felCat3.doSwap.log
# [detach screen]
#real 463m5.257s
screen -r bosTau4
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u03 \
-chainMinScore=3000 -chainLinearGap=medium -swap bosTau4.DEF >& bosTau4.doSwap.log
# [detach screen]
#real 391m40.132s
screen -r cavPor3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium -swap cavPor3.DEF >& cavPor3.doSwap.log
# [detach screen]
#real 192m39.792s
##########################################################################
# LASTZ Venter's Poodle canFamPoodle1 (DONE - 2009-06-05,10 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
cd /hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
cat << '_EOF_' > DEF
# human vs Venter's poodle
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2
SEQ2_DIR=/scratch/data/canFamPoodle1/canFamPoodle1.2bit
SEQ2_LEN=/scratch/data/canFamPoodle1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LAP=0
SEQ2_LIMIT=600
BASE=/hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noDbNameCheck -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium
# real 5162m58.743s
cat fb.hg19.chainCanFamPoodle1Link.txt
# 898034247 bases of 2897316137 (30.995%) in intersection
# the original canFam2 measured:
# 1532073507 bases of 2897316137 (52.879%) in intersection
time nice -n +19 doRecipBest.pl -buildDir=`pwd` \
hg19 canFamPoodle1 > rbest.log 2>&1 &
# real 811m27.965s
##############################################################################
## 46-Way Multiz (WORKING - 2009-06-09 - Hiram)
mkdir /hive/data/genomes/hg19/bed/multiz46way
cd /hive/data/genomes/hg19/bed/multiz46way
# starting with the 46way tree created from 44 way tree
cat << '_EOF_' > 46way.nh
(((((((((((((((((
((hg19:0.006591,panTro2:0.006639):0.002184,gorGor1:0.009411):0.009942,
ponAbe2:0.018342):0.014256,rheMac2:0.036199):0.021496,papHam1:0.04):0.02,
calJac1:0.066389):0.056911,tarSyr1:0.135169):0.011307,
(micMur1:0.091452,otoGar1:0.128984):0.035463):0.015304,
tupBel1:0.183583):0.004688,(((((mm9:0.083220,rn4:0.090564):0.196605,
dipOrd1:0.209532):0.022555,cavPor3:0.223415):0.009828,
speTri1:0.146894):0.025042,
(oryCun2:0.116009,ochPri2:0.198295):0.100037):0.015355):0.020666,
(((vicPac1:0.105252,(turTru1:0.064182,bosTau4:0.121911):0.025111):0.039691,
((equCab2:0.107726,(felCat3:0.097971,canFam2:0.100888):0.049486):0.006252,
(myoLuc1:0.141155,pteVam1:0.111787):0.033187):0.004179):0.011699,
(eriEur1:0.220580,sorAra1:0.266859):0.056117):0.021065):0.023276,
(((loxAfr3:0.083775,proCap1:0.152633):0.026190,echTel1:0.240221):0.049905,
(dasNov2:0.115179,choHof1:0.096272):0.052373):0.006713):0.132748,
macEug1:0.3):0.1,
monDom5:0.325899):0.072430,ornAna1:0.453916):0.109903,
((galGal3:0.166386,taeGut1:0.170717):0.199763,
anoCar1:0.509545):0.108130):0.166150,xenTro2:0.852482):0.300396,
(((tetNig2:0.224774,fr2:0.205294):0.191836,
(gasAcu1:0.313967,oryLat2:0.478451):0.058404):0.322824,
danRer6:0.731166):0.155214):0.511293,petMar1:0.511293);
'_EOF_'
# << happy emacs
# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a gif image for htdocs/images/phylo/hg19_46way.gif
/cluster/bin/phast/all_dists 46way.nh > 46way.distances.txt
# Use this output to create the table below, with this perl script:
cat << '_EOF_' > sizeStats.pl
#!/usr/bin/env perl
# Summarize each species paired with hg19: phylogenetic distance from
# 46way.distances.txt plus the featureBits chainLink coverage percentage
# on hg19 and, when a swap run exists, on the other genome.  Output is
# the commented table below, sorted by increasing distance.
use strict;
use warnings;
# NOTE(review): "grep -y" is an obsolete synonym for "grep -i"; modern
# GNU grep rejects -y, so this pipeline may need updating on newer hosts.
open (FH, "grep -y hg19 46way.distances.txt | sort -k3,3n|") or
die "can not read 46way.distances.txt";
my $count = 0;
while (my $line = <FH>) {
chomp $line;
# each line: hg19 <otherDb> <distance>
my ($hg19, $D, $dist) = split('\s+', $line);
my $chain = "chain" . ucfirst($D);
# featureBits result file for the hg19-target chain, e.g.
# /hive/data/genomes/hg19/bed/lastz.<db>/fb.hg19.chain<Db>Link.txt
my $B="/hive/data/genomes/hg19/bed/lastz.$D/fb.hg19." .
$chain . "Link.txt";
# field 5 of the featureBits line is "(NN.NNN%)"; strip the parens
my $chainLinkMeasure =
`awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
chomp $chainLinkMeasure;
$chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
$chainLinkMeasure =~ s/\%//;
# the swap measurement only exists if a blastz.hg19.swap run was done
my $swapFile="/hive/data/genomes/${D}/bed/blastz.hg19.swap/fb.${D}.chainHg19Link.txt";
my $swapMeasure = "N/A";
if ( -s $swapFile ) {
$swapMeasure =
`awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
chomp $swapMeasure;
$swapMeasure = 0.0 if (length($swapMeasure) < 1);
$swapMeasure =~ s/\%//;
}
# common-name organism label from the central database
my $orgName=
`hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
chomp $orgName;
if (length($orgName) < 1) {
$orgName="N/A";
}
++$count;
if ($swapMeasure eq "N/A") {
printf "# %02d %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist,
$orgName, $D, $chainLinkMeasure, $swapMeasure
} else {
printf "# %02d %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist,
$orgName, $D, $chainLinkMeasure, $swapMeasure
}
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x ./sizeStats.pl
./sizeStats.pl
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# featureBits chainLink measures
# chainOryLat1Link chain linearGap
# distance on hg19 on other minScore
# 01 0.0132 - Chimp panTro2 (% 94.846) (% 94.908)
# 02 0.0182 - Gorilla gorGor1 (% 59.484) (N/A)
# 03 0.0371 - Orangutan ponAbe2 (% 91.350) (% 89.617)
# 04 0.0692 - Rhesus rheMac2 (% 82.744) (% 87.422)
# 05 0.0945 - Baboon papHam1 (% 82.810) (N/A)
# 06 0.1409 - Marmoset calJac1 (% 70.860) (% 71.897)
# 07 0.2665 - Tarsier tarSyr1 (% 47.830) (N/A)
# 08 0.2696 - Mouse lemur micMur1 (% 46.519) (N/A)
# 09 0.3071 - Bushbaby otoGar1 (% 43.644) (N/A)
# 10 0.3343 - Horse equCab2 (% 57.050) (% 66.774)
# 11 0.3416 - TreeShrew tupBel1 (% 36.156) (N/A)
# 12 0.3451 - Dolphin turTru1 (% 48.398) (N/A)
# 13 0.3500 - Squirrel speTri1 (% 35.713) (N/A)
# 14 0.3611 - Alpaca vicPac1 (% 39.399) (N/A)
# 15 0.3620 - Sloth choHof1 (% 34.377) (N/A)
# 16 0.3653 - Megabat pteVam1 (% 45.414) (N/A)
# 17 0.3732 - Elephant loxAfr3 (% 46.636) (% 42.430)
# 18 0.3740 - Cat felCat3 (% 35.713) (% 61.104)
# 19 0.3769 - Dog canFam2 (% 52.879) (% 62.055)
# 20 0.3809 - Armadillo dasNov2 (% 33.543) (N/A)
# 21 0.3941 - Rabbit oryCun2	(% 44.317) (% 58.405)
# 22 0.3946 - Microbat myoLuc1 (% 33.174) (N/A)
# 23 0.4028 - Cow bosTau4 (% 46.506) (% 50.297)
# 24 0.4363 - Guinea Pig cavPor3 (% 43.680) (N/A)
# 25 0.4421 - Rock hyrax proCap1 (% 30.864) (N/A)
# 26 0.4450 - Kangaroo rat dipOrd1 (% 27.161) (N/A)
# 27 0.4764 - Pika ochPri2 (% 27.768) (N/A)
# 28 0.4811 - Hedgehog eriEur1 (% 19.362) (N/A)
# 29 0.5035 - Tenrec echTel1 (% 23.120) (N/A)
# 30 0.5153 - Mouse mm9 (% 35.299) (% 38.693)
# 31 0.5226 - Rat rn4 (% 32.879) (% 36.860)
# 32 0.5274 - Shrew sorAra1 (% 19.760) (N/A)
# 33 0.6394 - Wallaby macEug1 (% 6.011) (N/A)
# 34 0.7653 - Opossum monDom5 (% 14.358) (N/A)
# 35 0.9657 - Platypus ornAna1 (% 7.627) (% 11.259)
# 36 1.0960 - Chicken galGal3 (% 3.591) (% 8.786)
# 37 1.1003 - Zebra finch taeGut1 (% 3.496) (% 7.795)
# 38 1.2394 - Lizard anoCar1 (% 3.591) (% 5.146)
# 39 1.6403 - X. tropicalis xenTro2 (% 3.176) (% 6.773)
# 40 1.9387 - Stickleback gasAcu1 (% 1.916) (% 11.175)
# 41 1.9634 - Fugu fr2 (% 1.702) (% 10.929)
# 42 1.9746 - Zebrafish danRer6 (% 3.051) (% 6.399)
# 43 1.9829 - Tetraodon tetNig2 (% 1.712) (% 14.194)
# 44 2.1031 - Medaka oryLat2 (% 1.849) (% 6.705)
# 45 2.1108 - Lamprey petMar1 (% 1.082) (% 3.200)
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
46way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.list
cd /hive/data/genomes/hg19/bed/multiz46way
# bash shell syntax here ...
export H=/hive/data/genomes/hg19/bed
mkdir mafLinks
for G in `sed -e "s/hg19 //" species.list`
do
mkdir mafLinks/$G
if [ -s ${H}/lastz.${G}/mafRBestNet/chr1.maf.gz ]; then
echo "$G - recipBest"
ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G
else
if [ -s ${H}/lastz.${G}/mafSynNet/chr1.maf.gz ]; then
echo "$G - synNet"
ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G
else
if [ -s ${H}/lastz.${G}/mafNet/chr1.maf.gz ]; then
echo "$G - mafNet"
ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
else
echo "missing directory lastz.${G}/*Net"
fi
fi
fi
done
# verify the alignment type is correct:
for D in `cat /hive/users/hiram/bigWayHg19/ordered.list`
do
ls -l mafLinks/$D/chr1.maf.gz | awk '{print $NF}'
done
# compare to the list at:
# http://genomewiki.ucsc.edu/index.php/Hg19_Genome_size_statistics
# need to split these things up into smaller pieces for
# efficient kluster run.
cd /hive/data/genomes/hg19/bed/multiz46way
mkdir mafSplit
cd mafSplit
# mafSplitPos splits on gaps or repeat areas that will not have
# any chains, approx 5 Mbp intervals, gaps at least 10,000
mafSplitPos -minGap=10000 hg19 5 stdout | sort -u \
| sort -k1,1 -k2,2n > mafSplit.bed
# There is a splitRegions.pl script here (copied from previous 44way)
# that can create a custom track from this mafSplit.bed file.
# Take a look at that in the browser and see if it looks OK,
# check the number of sections on each chrom to verify none are
# too large. Despite the claim above, it does appear that some
# areas are split where actual chains exist.
# run a small kluster job to split them all
ssh memk
cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
cat << '_EOF_' > runOne
#!/bin/csh -ef
# Split one species/chrom pairwise maf on the genome-wide mafSplit.bed
# breakpoints.  $1 = species db, $2 = chrom.  When the species has no
# maf for this chrom, emit an empty gzipped placeholder hg19_${C}.00.maf
# so the parasol "check out" requirement is still satisfied.
set G = $1
set C = $2
mkdir -p $G
pushd $G > /dev/null
if ( -s ../../mafLinks/${G}/${C}.maf.gz ) then
# remove stale pieces from any previous run before re-splitting
rm -f hg19_${C}.*.maf
mafSplit ../mafSplit.bed hg19_ ../../mafLinks/${G}/${C}.maf.gz
gzip hg19_${C}.*.maf
else
touch hg19_${C}.00.maf
gzip hg19_${C}.00.maf
endif
popd > /dev/null
'_EOF_'
# << happy emacs
chmod +x runOne
cat << '_EOF_' > template
#LOOP
runOne $(root1) $(root2) {check out line $(root1)/hg19_$(root2).00.maf}
#ENDLOOP
'_EOF_'
# << happy emacs
for G in `sed -e "s/hg19 //" ../species.list`
do
echo $G
done > species.list
cut -f 1 ../../../chrom.sizes > chr.list
gensub2 species.list chr.list template jobList
para -ram=8g create jobList
para try ... check ... push ... etc...
# Completed: 4185 of 4185 jobs
# CPU time in finished jobs: 25547s 425.78m 7.10h 0.30d 0.001 y
# IO & Wait Time: 268664s 4477.73m 74.63h 3.11d 0.009 y
# Average job time: 70s 1.17m 0.02h 0.00d
# Longest finished job: 1234s 20.57m 0.34h 0.01d
# Submission to last job: 3048s 50.80m 0.85h 0.04d
# the autoMultiz cluster run
ssh swarm
cd /hive/data/genomes/hg19/bed/multiz46way/
mkdir splitRun
cd splitRun
mkdir maf run
cd run
mkdir penn
cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn
# set the db and pairs directories here
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
# Run penn autoMZ for one split chrom piece: stage every hg19-vs-species
# pairwise maf into a node-local scratch directory, then multiz them all
# together using the 46-way tree.  $1 = chrom piece name, $2 = absolute
# path of the result maf to write back.
set db = hg19
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg19/bed/multiz46way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
# stage one "sing" maf per species; autoMZ needs a file for every
# species in the tree, so write a bare maf header when none exists
# (or when the gzipped placeholder uncompresses to an empty file)
foreach s (`/bin/sed -e "s/ $db//" species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
/bin/zcat $in.gz > $out
if (! -s $out) then
echo "##maf version=1 scoring=autoMZ" > $out
endif
else if (-e $in) then
/bin/ln -s $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
# put the local penn binaries first on PATH for autoMZ's helper programs
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \
> /dev/null
popd > /dev/null
# copy the result back, then clean up the node-local scratch space
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
find ../../mafSplit -type f | grep hg19_ | xargs -L 1 basename \
| sed -e "s/.gz//" | sort -u > chr.part.list
gensub2 chr.part.list single template jobList
para -ram=8g create jobList
# initial run experience suggest some of the big jobs reach 8 Gb
# of memory usage, so, tell parasol to limit the number of jobs per
# node to avoid thrashing
para -ram=8g try
para -ram=8g push
# Completed: 504 of 504 jobs
# CPU time in finished jobs: 1342039s 22367.32m 372.79h 15.53d 0.043 y
# IO & Wait Time: 63835s 1063.91m 17.73h 0.74d 0.002 y
# Average job time: 2789s 46.49m 0.77h 0.03d
# Longest finished job: 12625s 210.42m 3.51h 0.15d
# Submission to last job: 15300s 255.00m 4.25h 0.18d
# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh hgwdev
cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
mkdir ../maf
# the sed edits take out partitioning name information from the comments
# so the multiple parts will condense to smaller number of lines
# this takes almost 2 hours of time, resulting in a bit over 150 Gb,
# almost all chrom files over 1 Gb, up to almost 10 Gb for chr2
# HOWEVER, this is actually not necessary to maintain these comments,
# they are lost during the mafAddIRows
# runOne <chrom>
# Reassemble the split per-part maf files for one chromosome into a
# single ../maf/<chrom>.maf: first header line, then de-duplicated
# comments (with partition names edited out), then all alignment
# records in part order, then the trailing comment line.
# NOTE: create with '>' (not '>>'): appending to an existing runOne
# would duplicate the script body and break it on a re-run.
cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = $1
if ( -s ../maf/${C}.maf.gz ) then
rm -f ../maf/${C}.maf.gz
endif
head -q -n 1 maf/hg19_${C}.*.maf | sort -u > ../maf/${C}.maf
grep -h "^#" maf/hg19_${C}.*.maf | egrep -v "maf version=1|eof maf" | \
sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
| sort -u >> ../maf/${C}.maf
grep -h -v "^#" `ls maf/hg19_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
'_EOF_'
# << happy emacs
chmod +x runOne
# gensub2 template: one runOne job per chromosome; the check verifies
# the output maf exists and is non-empty.
# NOTE: create with '>' (not '>>') so a re-run does not append a
# duplicate LOOP body to an existing template file.
cat << '_EOF_' > template
#LOOP
runOne $(root1) {check out exists+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 ../../../chrom.sizes > chr.list
ssh encodek
cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
gensub2 chr.list single template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 92 of 93 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 412s 6.86m 0.11h 0.00d 0.000 y
# IO & Wait Time: 21187s 353.12m 5.89h 0.25d 0.001 y
# Average job time: 235s 3.91m 0.07h 0.00d
# Longest finished job: 1529s 25.48m 0.42h 0.02d
# Submission to last job: 1542s 25.70m 0.43h 0.02d
# one of the results is completely empty, the grep for results failed
# this file ../maf/chrUn_gl000226.maf only has header comments, no result
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/hg19/multiz46way/maf
cd /hive/data/genomes/hg19/bed/multiz46way/maf
ln -s `pwd`/*.maf /gbdb/hg19/multiz46way/maf
# this generates an immense multiz46way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
# Loaded 33558634 mafs in 93 files from /gbdb/hg19/multiz46way/maf
# real 512m8.053s
# load summary table
time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
| $HOME/bin/$MACHTYPE/hgLoadMafSummary hg19 -minSize=30000 -verbose=2 \
-mergeGap=1500 -maxSize=200000 multiz46waySummary stdin
# real 92m30.700s
# flushSummaryBlocks: output 45 blocks
# Created 8766427 summary blocks from 645238409 components and
# 33558634 mafs from stdin
# blocks too small to be used: 29456
# Loading into hg19 table multiz46waySummary...
# Gap Annotation
# prepare bed files with gap info
mkdir /hive/data/genomes/hg19/bed/multiz46way/anno
cd /hive/data/genomes/hg19/bed/multiz46way/anno
mkdir maf run
# most of these will already exist from previous multiple alignments
# remove the echo from in front of the twoBitInfo command to get them
# to run if this loop appears to be correct
for DB in `cat ../species.list`
do
CDIR="/hive/data/genomes/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `sed -e "s/hg19 //" ../../species.list`
do
echo "${DB} "
ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
# the annotation step requires large memory, run on memk nodes
ssh memk
cd /hive/data/genomes/hg19/bed/multiz46way/anno/run
ls ../../maf | sed -e "s/.maf//" > chr.list
cat << '_EOF_' > template
#LOOP
./anno.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
# anno.csh <chrom>
# Annotate one chromosome's maf with iRows (per-species gap/N status)
# using the nBeds/sizes lists prepared above; output goes to ../maf/.
cat << '_EOF_' > anno.csh
#!/bin/csh -fe
set inMaf = ../../maf/$1.maf
set outMaf = ../maf/$1.maf
# remove any stale result so a partial file from a failed run
# cannot be mistaken for a finished one
rm -f $outMaf
mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/hg19/hg19.2bit $outMaf
'_EOF_'
# << happy emacs
chmod +x anno.csh
gensub2 chr.list single template jobList
para -ram=30g create jobList
# specify lots of ram to get one job per node
para -ram=30g push
#
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 10371s 172.85m 2.88h 0.12d 0.000 y
# IO & Wait Time: 3365s 56.09m 0.93h 0.04d 0.000 y
# Average job time: 148s 2.46m 0.04h 0.00d
# Longest finished job: 1153s 19.22m 0.32h 0.01d
# Submission to last job: 7402s 123.37m 2.06h 0.09d
ssh hgwdev
rm -fr /gbdb/hg19/multiz46way/maf
mkdir /gbdb/hg19/multiz46way/maf
cd /hive/data/genomes/hg19/bed/multiz46way/anno/maf
ln -s `pwd`/*.maf /gbdb/hg19/multiz46way/maf/
# by loading this into the table multiz46way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
# real 113m11.709s
# Loaded 33612571 mafs in 93 files from /gbdb/hg19/multiz46way/maf
XXX - done to here
time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
| hgLoadMafSummary hg19 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz46waySummary stdin
# with the quality annotated mafs, and mem interference on hgwdev:
# Created 8514381 summary blocks from 600504256 components \
# and 33320838 mafs from stdin
# real 169m56.936s
# with the Irow annotations after the multiz fix:
# Created 8514380 summary blocks from 600499937
# components and 33298894 mafs from stdin
# real 184m42.893s
# user 70m44.431s
# sys 8m7.970s
# Created 8514078 summary blocks from 604683213 components
# and 35125649 mafs from stdin
# real 130m55.115s
# user 71m37.409s
# sys 8m5.110s
# by loading this into the table multiz46waySummary, it will replace
# the previously loaded table with the unannotated mafs
# remove the multiz46way*.tab files in this /data/tmp directory
# -rw-rw-r-- 1 1949221892 Nov 15 14:04 multiz46way.tab
# -rw-rw-r-- 1 417994189 Nov 15 20:57 multiz46waySummary.tab
wc -l multiz46way*.tab
# 33964377 multiz46way.tab
# 8514078 multiz46waySummary.tab
# 42478455 total
rm multiz46way*.tab
# create some downloads
mkdir -p /hive/data/genomes/hg19/bed/multiz46way/download/maf
cd /hive/data/genomes/hg19/bed/multiz46way/download/maf
time cp -p ../../anno/maf/chr*.maf .
# real 72m46.514s
# user 0m1.293s
# sys 5m15.981s
time gzip --rsyncable *.maf
# real 185m37.884s
# user 179m51.161s
# sys 3m48.016s
time md5sum *.gz > md5sum.txt
# real 3m59.009s
# user 1m19.338s
# sys 0m18.976s
##############################################################################
# LASTZ Sea Hare aplCal1 (STARTING - 2009-06-08 - Galt)
mkdir /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
cd /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
cat << '_EOF_' > DEF
# Human vs. Sea Hare
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ2_LIMIT=5
# QUERY: Sea Hare aplCal1
SEQ2_DIR=/scratch/data/aplCal1/aplCal1.2bit
SEQ2_LEN=/scratch/data/aplCal1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
# (NOTE I SHOULD NOT HAVE USED -qRepeats=windowmaskerSdust)
screen
time nice +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
>& do.log &
# real ?? about one hour but one job hung
# resuming from failure
# edited loadUp.csh, commenting out the first completed step
# and removing the unneeded -qRepeats=windowmaskerSdust
# from the next step, now run it to complete the load step.
/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/axtChain/loadUp.csh \
>& continue-loadUp.log&
# continue from step 'download'
time nice +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-continue download \
>& continue-download.log &
cat fb.hg19.chainAplCal1Link.txt
# 19675762 bases of 2897316137 (0.679%) in intersection
# running the swap - DONE - 2009-06-02
# (NOTE I SHOULD NOT HAVE USED -qRepeats=windowmaskerSdust)
mkdir /hive/data/genomes/aplCal1/bed/blastz.hg19.swap
cd /hive/data/genomes/aplCal1/bed/blastz.hg19.swap
time nice +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-swap >& swap.log &
# real time not long
# resuming from failure
# edited loadUp.csh, commenting out the first completed step
# and removing the unneeded -tRepeats=windowmaskerSdust
# from the next step, now run it to complete the load step.
/hive/data/genomes/aplCal1/bed/blastz.hg19.swap/axtChain/loadUp.csh \
>& continue-loadUp.log&
time nice +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
-continue download \
-swap >& continue-download.log &
cat fb.aplCal1.chainHg19Link.txt
# 14163455 bases of 619228098 (2.287%) in intersection
#########################################################################
# EXONIPHY Hg19, lifted from hg18 (DONE - 2009-06-19 - Hiram)
# needed for uscsGenes11 building
# create a syntenic liftOver chain file
cd /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06
time nice -n +19 netSyntenic run.chain/hg18.hg19.noClass.net.gz stdout \
| netFilter -syn stdin | netChainSubset -verbose=0 stdin \
run.chain/hg18.hg19.all.chain.gz stdout \
| chainStitchId stdin stdout | gzip -c > hg18.hg19.syn.chain.gz
# memory usage 55492608, utime 3 s/100, stime 3
# real 2m35.613s
# real 5m55.575s
# slightly smaller than the ordinary liftOver chain file:
# -rw-rw-r-- 1 137245 Mar 6 17:37 hg18ToHg19.over.chain.gz
# -rw-rw-r-- 1 96115 Jun 19 14:30 hg18.hg19.syn.chain.gz
# exoniphyHg19.gp is prepared as follows
mkdir /cluster/data/hg19/bed/exoniphy
cd /cluster/data/hg19/bed/exoniphy
hgsql hg18 -e "select * from exoniphy" -N > exoniphyHg18.gp
time nice -n +19 liftOver -genePred exoniphyHg18.gp \
/hive/data/genomes/hg18/bed/blat.hg19.2009-03-06/hg18.hg19.syn.chain.gz \
exoniphyHg19.gp unmapped
wc -l *
# 178162 exoniphyHg18.gp
# 178109 exoniphyHg19.gp
# 106 unmapped
mkdir dump
cd dump
hgsqldump --all -c --tab=. hg18 exoniphy
cd ..
chmod 775 dump
hgsql hg19 < dump/exoniphy.sql
hgsql hg19 \
-e "load data local infile \"exoniphyHg19.gp\" into table exoniphy;"
nice -n +19 featureBits hg19 exoniphy
# 27421336 bases of 2897316137 (0.946%) in intersection
nice -n +19 featureBits hg18 exoniphy
# 27475705 bases of 2881515245 (0.954%) in intersection
#########################################################################
# BIOCYCTABLES NEEDED BY hgGene (DONE - 2009-06-22 - Hiram)
# First register with BioCyc to download their HumanCyc database
# The site will email you the URL for download. Beware, they supply
# a URL to a directory chock a block full of data, almost 7 Gb,
# you only need one file
mkdir /hive/data/outside/bioCyc/090623
cd /hive/data/outside/bioCyc/090623
mkdir download
cd download
wget --timestamping --no-directories --recursive \
"http://bioinformatics.ai.sri.com/ecocyc/dist/flatfiles-52983746/humancyc-flatfiles.tar.Z"
tar xvzf humancyc-flatfiles.tar.Z
mkdir /hive/data/genomes/hg19/bed/bioCyc
cd /hive/data/genomes/hg19/bed/bioCyc
# clean the headers from these files
grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/genes.col \
> genes.tab
# this file isn't consistent in its number of columns
grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/pathways.col \
| awk -F'\t' '{if (140 == NF) { printf "%s\t\t\n", $0; } else { print $0}}' \
> pathways.tab
hgsql hg19 -e 'create database bioCyc090623'
hgLoadSqlTab bioCyc090623 genes ~/src/hg/lib/bioCycGenes.sql ./genes.tab
hgLoadSqlTab bioCyc090623 pathways ~/src/hg/lib/bioCycPathways.sql ./pathways.tab
# Create bioCycMapDesc.tab
hgsql bioCyc090623 -N \
-e 'select UNIQUE_ID, NAME from pathways' | sort -u > bioCycMapDesc.tab
XXX see alternative below
# this kgBioCyc0 thing needs kgXref and other UCSC gene tables to work
# Create bioCycPathway.tab
kgBioCyc0 bioCyc090623 hg19 hg19
hgLoadSqlTab hg19 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab hg19 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
XXX maybe instead do this in the gene build procedure
# from the UCSC genes build procedure
# Do BioCyc Pathways build
mkdir $dir/bioCyc
cd $dir/bioCyc
grep -v '^#' $bioCycPathways > pathways.tab
grep -v '^#' $bioCycGenes > genes.tab
kgBioCyc1 genes.tab pathways.tab $db bioCycPathway.tab bioCycMapDesc.tab
hgLoadSqlTab $tempDb bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab $tempDb bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
##############################################################################
# nscanGene (2009-06-22 markd)
# nscanGene track from WUSTL
cd /cluster/data/hg19/bed/nscan
wget http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19.updated.gtf
wget http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19.readme
wget -r -np -l 1 http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19_proteins
bzip2 hg19.updated.gtf hg19_proteins/*.fa
# load track
gtfToGenePred -genePredExt hg19.updated.gtf.bz2 stdout| hgLoadGenePred -genePredExt hg19 nscanGene stdin
bzcat hg19_proteins/chr*.fa.bz2 | hgPepPred hg19 generic nscanPep stdin
rm *.tab
# validate same number of transcripts and peptides are loaded
hgsql -Ne 'select count(*) from nscanGene' hg19
hgsql -Ne 'select count(*) from nscanPep' hg19
# validate search expression
hgsql -Ne 'select name from nscanGene' hg19 | egrep -v -e '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |wc -l
#########################################################################
# Phylogenetic tree from 46-way (DONE - 2009-06-25,07-07 - Hiram)
# Extract 4-fold degenerate sites based on
# of RefSeq Reviewed, coding
mkdir /hive/data/genomes/hg19/bed/multiz46way/4d
cd /hive/data/genomes/hg19/bed/multiz46way/4d
hgsql hg19 -Ne \
"select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" | cut -f 2-20 \
> refSeqReviewed.gp
wc -l refSeqReviewed.gp
# 14077 refSeqReviewed.gp
genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
wc -l refSeqReviewedNR.gp
# 7951 refSeqReviewedNR.gp
ssh memk
mkdir /hive/data/genomes/hg19/bed/multiz46way/4d/run
cd /hive/data/genomes/hg19/bed/multiz46way/4d/run
mkdir ../mfa
# whole chrom mafs version, using new version of
# uses memory-efficient version of phast, from Melissa Hubisz at Cornell (mjhubisz@gmail.com)
# 4d.csh <chrom> <mafFile> <outMfa>
# Extract 4-fold degenerate sites for one chromosome from the 46-way
# maf, restricted to the non-redundant reviewed RefSeq CDS annotations,
# and write them as a single-tuple fasta alignment for phyloFit.
cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set r = "/hive/data/genomes/hg19/bed/multiz46way"
set c = $1
set infile = $r/maf/$2
set outfile = $3
cd /scratch/tmp
# 'clean' maf
# strip the ".chrom" suffix from source names ("s hg19.chr1" -> "s hg19")
# so msa_view sees plain species identifiers
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
# keep only the gene annotations on this chromosome
awk -v C=$c '$2 == C {print}' $r/4d/refSeqReviewedNR.gp > $c.gp
set PHASTBIN=/cluster/bin/phast.2008-12-18
# --4d --do-cats 3: pull 4d (third codon position) sites into SS format,
# then flatten to tuple-size 1 for aggregation across chromosomes
$PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/$outfile
rm -f $c.gp $c.maf $c.ss
'_EOF_'
# << happy emacs
chmod +x 4d.csh
ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/*.maf | \
egrep -E -v "chrM|chrUn|random|_hap" | sed -e "s#.*multiz46way/maf/##" \
> maf.list
cat << '_EOF_' > template
#LOOP
4d.csh $(root1) {check in line+ $(path1)} {check out line+ mfa/$(root1).mfa}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 maf.list single template stdout | tac > jobList
XXX - ready to go here - 2009-07-06
rm -fr /cluster/data/hg19/bed/multiz46way/4d/mfa
mkdir /cluster/data/hg19/bed/multiz46way/4d/mfa
para create jobList
para try
para check
para push
# combine mfa files
cd ..
sed -e "s/ /,/g" ../species.list > species.lst
/cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \
sed s/"> "/">"/ > 4d.all.mfa
sed -e 's/,macEug1.*//' species.lst > placentals.lst
# XXX this didn't work
/cluster/bin/phast/msa_view --aggregate `cat placentals.lst` mfa/*.mfa | \
sed s/"> "/">"/ > 4d.placentals.mfa
# use phyloFit to create tree model (output is phyloFit.mod)
set PHASTBIN=/cluster/bin/phast.2008-12-18
time $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA \
--subst-mod REV --tree ../tree-commas.nh 4d.all.mfa
# real 111m23.119s
mv phyloFit.mod phyloFit.all.mod
grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.46way.nh
sed -e 's/.*,choHof1,//' species.lst > notPlacentals.list
$PHASTBIN/tree_doctor \
--prune=`cat notPlacentals.list` \
tree_4d.46way.nh > tree_4d.46way.placental.nh
#############################################################################
# phastCons 46-way (WORKING - 2009-09-21 - Hiram)
+ # was unable to split the full chrom MAF files, now working on the
+ # maf files as they were split up during multiz
# split 46way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh memk
mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
+ cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
+ ./splitRegions.pl mafSplit.bed > \
+ /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/region.list
mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/ss
cd /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
-set MAF = /hive/data/genomes/hg19/bed/multiz46way/maf/$c.maf
+set MAF = /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/hg19_$c.maf
set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/cons/ss/$c
rm -fr $WINDOWS
+# set seq = `egrep "${c}"'$' region.list | awk '{printf "-seq=%s -start=%d
+# -end=%d", $1, $2, $3}'`
+set seq = `egrep "${c}"'$' region.list | awk '{printf "-seq=%s", $1}'`
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
-twoBitToFa -seq=$c /hive/data/genomes/hg19/hg19.2bit hg19.$c.fa
-/cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \
+twoBitToFa ${seq} /hive/data/genomes/hg19/hg19.2bit hg19.$c.fa
+set empty = `faSize hg19.$c.fa | egrep " 0 real 0 upper 0 lower|masked total" | wc -l`
+if ( $empty != 2 ) then
+ /cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \
-M hg19.$c.fa -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
+endif
rm -f hg19.$c.fa
popd > /dev/null
-date >> $c.done
+date >> $2
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
- ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
+ ls -1S -r ../../splitRun/maf | sed -e "s/.maf//; s/hg19_//" > maf.list
gensub2 maf.list single template jobList
para -ram=32g create jobList
para try ... check ... etc
+ # XXX - this did not work
# this takes a really long time. memk was down to 2 usable
# machines - got it finished manually on a combination of hgwdevnew CPUs
# and other machines
# Estimate phastCons parameters
# experimented with this as a parasol job on hgwdevnew to try a number
# of SS files. With a command of:
/cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
--tree "(((((((((((((((((hg19,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \
--out-root=$OUT/starting_tree
# running over the input files ../ss/*/*.ss results to
#.../genomes/hg19/bed/multiz46way/cons/startingTree/result/*/starting-tree.mod
# add up the C and G:
find ./result -type f | xargs ls -rt | while read F
do
D=`dirname $F`
echo -n `basename $D`" - "
grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}'
done
# counting number of species seen in the maf file:
find ./result -type f | xargs ls -rt | while read F
do
D=`dirname $F`
echo -n `basename $D`" - "
grep TREE $F | sed -e \
"s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g" | tr ',' '\n' | wc -l
done
# Run phastCons
- # This job is I/O intensive in its output files, thus it is all
- # working over in /scratch/tmp/
+ # This job is I/O intensive in its output files, beware where this
+ # takes place or do not run too many at once.
ssh swarm
mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
cd /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
- # all euarchontogliers placentals
+ # all primates placentals
# doPhast.csh <chrom> <ssFileBase> <expectedLen> <targetCov> <rho>
# Run phastCons on one 10Mb ss window.  The conservation group ("all",
# "primates", "placentals") is taken from the current working directory
# name ($cwd:t), which selects $grp.mod and the optional $grp.non-inf
# species-exclusion list.
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast/x86_64
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/hg19/bed/multiz46way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons
# link the model, ss window, and (if present) the non-informative
# species list into the per-job tmp directory.
# NOTE: the original script linked $grp.mod and $grp.non-inf twice in
# this branch; the second ln -s fails with "File exists" and, under
# csh -e, aborts the job.  The duplicates are removed here.
if (-s $cons/$grp/$grp.non-inf) then
ln -s $cons/$grp/$grp.mod $tmp
ln -s $cons/$grp/$grp.non-inf $tmp
ln -s $ssSrc/ss/$c/$f.ss $tmp
else
ln -s $ssSrc/ss/$c/$f.ss $tmp
ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p pp/$c bed/$c
# NOTE(review): the sleep/touch pair presumably gives NFS time to
# settle before the mv below -- confirm before removing
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
# this template will serve for all runs
# root1 == chrom name, file1 == ss file name without .ss suffix
cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
- ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > ss.list
-
# run for all species
cd /hive/data/genomes/hg19/bed/multiz46way/cons
mkdir -p all
cd all
# Using Kate's .mod tree
cp -p ../../4d/46way.all.mod ./all.mod
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
-XXX - running Tue Jan 13 22:19:21 PST 2009
-# Completed: 322 of 322 jobs
-# CPU time in finished jobs: 47406s 790.10m 13.17h 0.55d 0.002 y
-# IO & Wait Time: 29902s 498.37m 8.31h 0.35d 0.001 y
-# Average job time: 240s 4.00m 0.07h 0.00d
-# Longest finished job: 354s 5.90m 0.10h 0.00d
-# Submission to last job: 536s 8.93m 0.15h 0.01d
+
+# second run on swarm parasol: the failed jobs have empty bed file results
+# Completed: 575 of 580 jobs
+# Crashed: 5 jobs
+# CPU time in finished jobs: 42049s 700.81m 11.68h 0.49d 0.001 y
+# IO & Wait Time: 19735s 328.92m 5.48h 0.23d 0.001 y
+# Average job time: 107s 1.79m 0.03h 0.00d
+# Longest finished job: 267s 4.45m 0.07h 0.00d
+# Submission to last job: 479s 7.98m 0.13h 0.01d
+
+# first run on hgwdev parasol:
+# Completed: 574 of 579 jobs
+# Crashed: 5 jobs
+# CPU time in finished jobs: 53050s 884.17m 14.74h 0.61d 0.002 y
+# IO & Wait Time: 6633s 110.55m 1.84h 0.08d 0.000 y
+# Average job time: 104s 1.73m 0.03h 0.00d
+# Longest finished job: 248s 4.13m 0.07h 0.00d
+# Submission to last job: 4121s 68.68m 1.14h 0.05d
# create Most Conserved track
- cd /hive/data/genomes/hg19/bed/multiz46way/cons
- cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
- awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
- /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+ cut -f1 ../../../../chrom.sizes | while read C
+do
+ ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
+ do
+ cat ${D}/${C}*.bed
+ done | awk 'BEGIN{ ID=1 }{printf "%s\t%d\t%d\t%s.%d\t%d\t%s\n", "'${C}'", $2, $3, "'${C}'", ID, $5, $6; ++ID}'
+done > mostConserved.bed
# ~ 1 minute
# load into database
ssh hgwdev
cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
time nice -n +19 hgLoadBed hg19 phastConsElements46way mostConserved.bed
- # Loaded 4878296 elements of size 5
- # real 2m3.414s
+ # Loaded 5868432 elements of size 6
+ # real 1m14.357s
# Try for 5% overall cov, and 70% CDS cov
- # --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg19 -enrichment refGene:cds phastConsElements46way
- # refGene:cds 1.144%, mostConserved.bed 4.973%,
- # both 0.854%, cover 74.62%, enrich 15.01x
-
- # --rho .31 --expected-length 45 --target-coverage .3
- # refGene:cds 1.144%, phastConsElements46way 4.706%,
- # both 0.824%, cover 72.07%, enrich 15.31x
-
# --rho 0.3 --expected-length 45 --target-coverage 0.3
- featureBits hg19 -enrichment knownGene:cds phastConsElements46way
- # knownGene:cds 1.205%, mostConserved.bed 4.973%,
- # both 0.874%, cover 72.55%, enrich 14.59x
-
- # --rho .31 --expected-length 45 --target-coverage .3
- # knownGene:cds 1.205%, phastConsElements46way 4.706%,
- # both 0.844%, cover 70.05%, enrich 14.88x
-
- featureBits hg19 -enrichment refGene:cds phastConsElements28way
- # refGene:cds 1.144%, phastConsElements28way 4.920%,
- # both 0.858%, cover 74.96%, enrich 15.24x
- featureBits hg19 -enrichment knownGene:cds phastConsElements28way
- # knownGene:cds 1.205%, phastConsElements28way 4.920%,
- # both 0.878%, cover 72.88%, enrich 14.81x
+ # refGene:cds 1.186%, phastConsElements46way 5.621%,
+ # both 0.878%, cover 73.98%, enrich 13.16x
# Create merged posterier probability file and wiggle track data files
cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
- cat << '_EOF_' > gzipAscii.sh
+ mkdir downloads
+ cat << '_EOF_' > phastCat.sh
#!/bin/sh
-TOP=`pwd`
-export TOP
+set -beEu -o pipefail
mkdir -p downloads
-
-for D in pp/chr*
+cut -f1 ../../../../chrom.sizes | while read C
do
- C=${D/pp\/}
- out=downloads/${C}.phastCons46way.wigFix.gz
- echo "${D} > ${C}.phastCons46way.wigFix.gz"
- ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
- gzip > ${out}
+ echo -n "${C} ... working ... "
+ ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+ do
+ cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+ done | gzip > downloads/${C}.phastCons46way.wigFix.gz
+ echo "done"
done
'_EOF_'
# << happy emacs
- chmod +x gzipAscii.sh
- time nice -n +19 ./gzipAscii.sh
- # real 30m7.228s
+ chmod +x phastCat.sh
+ time nice -n +19 ./phastCat.sh
+ # real 30m2.623s
# encode those files into wiggle data
zcat downloads/*.wigFix.gz \
| wigEncode stdin phastCons46way.wig phastCons46way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
- # real 22m54.291s
+ # real 18m37.881s
+ du -hsc *.wi?
+ # 2.7G phastCons46way.wib
+ # 271M phastCons46way.wig
+ # 3.0G total
+
+ # encode into a bigWig file:
+ # (warning wigToBigWig process grows to about 36 Gb)
+ # in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG
+ulimit -v $sizeG
+ zcat downloads/*.wigFix.gz \
+ | wigToBigWig stdin ../../../../chrom.sizes phastCons46way.bw
+ # real 52m36.142s
+# -rw-rw-r-- 1 21667535139 Oct 20 13:59 phastCons46way.bw
+ mkdir /gbdb/hg19/bbi
+ ln -s `pwd`/phastCons46way.bw /gbdb/hg19/bbi
+ # loading bigWig table:
+ hgsql hg19 -e 'drop table if exists phastCons46way; \
+ create table phastCons46way (fileName varchar(255) not null); \
+ insert into phastCons46way values
+ ("/gbdb/hg19/bbi/phastCons46way.bw");'
+ # Using the bigWig file instead of this database table:
# Load gbdb and database with wiggle.
+# ssh hgwdev
+# cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+# ln -s `pwd`/phastCons46way.wib /gbdb/hg19/multiz46way/phastCons46way.wib
+# time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
+# phastCons46way phastCons46way.wig
+
+ # Create histogram to get an overview of all the data
ssh hgwdev
cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
- ln -s `pwd`/phastCons46way.wib /gbdb/hg19/multiz46way/phastCons46way.wib
- time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
- phastCons46way phastCons46way.wig
- # real 1m13.681s
+ time nice -n +19 hgWiggle -doHistogram -db=hg19 \
+ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+ pc46 > histogram.data 2>&1
+ # real 7m37.212s
+
+ # create plot of histogram:
+
+ cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Human Hg19 Histogram phastCons46way track"
+set xlabel " phastCons46way score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+ # << happy emacs
+
+ display histo.png &
+
+ ########################################################################
+ ### Create a phastCons data set for Primates
+
+ # setup primates-only run
+ ssh swarm
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+ # primates-only: exclude all but these for phastCons tree:
+
+ /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
+ --prune-all-but=hg19,panTro2,gorGor1,ponAbe2,rheMac2,papHam1,calJac1,tarSyr1,micMur1,otoGar1 \
+ > primates.mod
+ # and place the removed ones in the non-inf file so phastCons will
+ # truly ignore them:
+ echo "tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun2,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr3,proCap1,echTel1,dasNov2,choHof1,macEug1,monDom5,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig2,fr2,gasAcu1,oryLat2,danRer6,petMar1" \
+ > primates.non-inf
+
+ gensub2 ../run.cons/ss.list single ../run.cons/template jobList
+ para -ram=8g create jobList
+ para try ... check ... push ... etc.
+# Completed: 539 of 580 jobs
+# Crashed: 41 jobs
+# CPU time in finished jobs: 19518s 325.30m 5.42h 0.23d 0.001 y
+# IO & Wait Time: 19782s 329.70m 5.50h 0.23d 0.001 y
+# Average job time: 73s 1.22m 0.02h 0.00d
+# Longest finished job: 157s 2.62m 0.04h 0.00d
+# Submission to last job: 1989s 33.15m 0.55h 0.02d
+
+ # the 41 crashed jobs are due to empty bed file results.
+# bed/chrUn_gl000237.00/chrUn_gl000237.00.1-45866.bed is empty
+# ... etc
+
+ # create Most Conserved track
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+ ../all/bedCat.sh > mostConserved.bed
+ featureBits hg19 mostConserved.bed
+ # 146285948 bases of 2897316137 (5.049%) in intersection
+
+ # load into database
+ ssh hgwdev
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+ time nice -n +19 hgLoadBed hg19 phastConsElements46wayPrimates \
+ mostConserved.bed
+ # Loaded 1109918 elements of size 6
+ # real 0m15.498s
+ # verify coverage
+ featureBits hg19 phastConsElements46wayPrimates
+ # 146285948 bases of 2897316137 (5.049%) in intersection
+
+ # --rho 0.3 --expected-length 45 --target-coverage 0.3
+ featureBits hg19 -enrichment refGene:cds phastConsElements46wayPrimates
+ # refGene:cds 1.186%, phastConsElements46wayPrimates 5.049%,
+ # both 0.771%, cover 64.95%, enrich 12.86x
+
+ featureBits hg19 -enrichment knownGene:cds phastConsElements46wayPrimates
+ # knownGene:cds 1.252%, phastConsElements46wayPrimates 5.049%,
+ # both 0.784%, cover 62.65%, enrich 12.41x
+
+ # Create the downloads .pp files, from which the phastCons wiggle data
+ # is calculated
+ # sort by chromName, chromStart so that items are in numerical order
+ # for wigEncode
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+ mkdir downloads
+ cat << '_EOF_' > phastCat.sh
+#!/bin/sh
+
+mkdir -p downloads
+cut -f1 ../../../../chrom.sizes | while read C
+do
+ echo -n "${C} ... working ... "
+ if [ -d "pp/${C}.00" ]; then
+ ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+ do
+ cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+ done | gzip > downloads/${C}.phastCons46way.primates.wigFix.gz
+ fi
+ echo "done"
+done
+'_EOF_'
+ # << happy emacs
+ chmod +x ./phastCat.sh
+ time nice -n +19 ./phastCat.sh
+ # real 39m47.189s
+
+    # Create merged posterior probability file and wiggle track data files
+ zcat downloads/chr*.wigFix.gz \
+ | wigEncode stdin phastCons46wayPrimates.wig phastCons46wayPrimates.wib
+ # Converted stdin, upper limit 1.00, lower limit 0.00
+ # real 17m20.601s
+
+ # encode to bigWig
+ # (warning wigToBigWig process grows to about 36 Gb)
+ # in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG
+ulimit -v $sizeG
+
+ zcat downloads/*.wigFix.gz \
+ | wigToBigWig stdin ../../../../chrom.sizes phastCons46wayPrimates.bw
+
+ ln -s `pwd`/phastCons46wayPrimates.bw /gbdb/hg19/bbi
+ # loading bigWig table:
+ hgsql hg19 -e 'drop table if exists phastCons46wayPrimates; \
+ create table phastCons46wayPrimates \
+ (fileName varchar(255) not null); \
+ insert into phastCons46wayPrimates values
+ ("/gbdb/hg19/bbi/phastCons46wayPrimates.bw");'
+
+ ## load table with wiggle data
+ ## not done now, using the bigWig file instead
+# ssh hgwdev
+# cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+# ln -s `pwd`/phastCons46wayPrimates.wib \
+# /gbdb/hg19/multiz46way/phastCons46wayPrimates.wib
+# time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
+# phastCons46wayPrimates phastCons46wayPrimates.wig
+ # Instead, temporary load into a table so we can do the histogram
+ ln -s `pwd`/phastCons46wayPrimates.wib /gbdb/hg19/wib/pc46.wib
+ hgLoadWiggle hg19 pc46 phastCons46wayPrimates.wig
# Create histogram to get an overview of all the data
+ time nice -n +19 hgWiggle -doHistogram \
+ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+ -db=hg19 pc46 > histogram.data 2>&1
+ # real 5m30.086s
+
+ # create plot of histogram:
+
+ cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color \
+ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Human Hg19 Histogram phastCons46wayPrimates track"
+set xlabel " phastCons46wayPrimates score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+ # << happy emacs
+
+ display histo.png &
+
+ ########################################################################
+ ### Create a phastCons data set for Placentals
+ # setup placental-only run
+ ssh swarm
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+
+ # placental-only: exclude all but these for phastCons tree:
+ /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
+ --prune-all-but=hg19,panTro2,gorGor1,ponAbe2,rheMac2,papHam1,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun2,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr3,proCap1,echTel1,dasNov2,choHof1 \
+ > placental.mod
+ # and place the removed ones in the non-inf file so phastCons will
+ # truly ignore them:
+ echo "macEug1,monDom5,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig2,fr2,gasAcu1,oryLat2,danRer6,petMar1" \
+ > placental.non-inf
+
+ gensub2 ../run.cons/ss.list single ../run.cons/template jobList
+ para -ram=8g create jobList
+ para try ... check ... push ... etc.
+# Completed: 562 of 580 jobs
+# Crashed: 18 jobs
+# CPU time in finished jobs: 33874s 564.57m 9.41h 0.39d 0.001 y
+# IO & Wait Time: 12493s 208.21m 3.47h 0.14d 0.000 y
+# Average job time: 83s 1.38m 0.02h 0.00d
+# Longest finished job: 193s 3.22m 0.05h 0.00d
+# Submission to last job: 62872s 1047.87m 17.46h 0.73d
+
+ # The crashed jobs produce zero length bed files: e.g.
+ # bed/chrUn_gl000246.00/chrUn_gl000246.00.1-38144.bed is empty
+
+ # create Most Conserved track
+ ../all/bedCat.sh > mostConserved.bed
+
+ # load into database
ssh hgwdev
- cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+ time nice -n +19 hgLoadBed hg19 phastConsElements46wayPlacental \
+ mostConserved.bed
+ # Loaded 4785089 elements of size 6
+ # real 0m58.367s
+ # verify coverage
+ featureBits hg19 phastConsElements46wayPlacental
+ # 146457699 bases of 2897316137 (5.055%) in intersection
+    # (stale hg18 result: 119635433 bases of 2881515245 (4.152%) in intersection)
+
+ # --rho 0.3 --expected-length 45 --target-coverage 0.3
+ featureBits hg19 -enrichment refGene:cds phastConsElements46wayPlacental
+ # refGene:cds 1.186%, phastConsElements46wayPlacental 5.055%,
+ # both 0.847%, cover 71.42%, enrich 14.13x
+ featureBits hg19 -enrichment knownGene:cds phastConsElements46wayPlacental
+ # knownGene:cds 1.252%, phastConsElements46wayPlacental 5.055%,
+ # both 0.865%, cover 69.10%, enrich 13.67x
+
+ # Create the downloads .pp files, from which the phastCons wiggle data
+ # is calculated
+ # sort by chromName, chromStart so that items are in numerical order
+ # for wigEncode
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+ mkdir downloads
+ cat << '_EOF_' > phastCat.sh
+#!/bin/sh
+
+mkdir -p downloads
+cut -f1 ../../../../chrom.sizes | while read C
+do
+ echo -n "${C} ... working ... "
+ if [ -d "pp/${C}.00" ]; then
+ ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+ do
+ cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+ done | gzip > downloads/${C}.phastCons46way.placental.wigFix.gz
+ fi
+ echo "done"
+done
+'_EOF_'
+ # << happy emacs
+ chmod +x ./phastCat.sh
+ time nice -n +19 ./phastCat.sh
+
+    # Create merged posterior probability file and wiggle track data files
+ zcat downloads/chr*.wigFix.gz \
+ | wigEncode stdin phastCons46wayPlacental.wig \
+ phastCons46wayPlacental.wib
+ # Converted stdin, upper limit 1.00, lower limit 0.00
+ # real 14m53.395s
+
+ # encode to bigWig
+ # (warning wigToBigWig process grows to about 36 Gb)
+ # in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG
+ulimit -v $sizeG
+
+ zcat downloads/*.wigFix.gz \
+ | wigToBigWig stdin ../../../../chrom.sizes phastCons46wayPlacental.bw
+ # real 40m55.568s
+
+ ln -s `pwd`/phastCons46wayPlacental.bw /gbdb/hg19/bbi
+ # loading bigWig table:
+ hgsql hg19 -e 'drop table if exists phastCons46wayPlacental; \
+ create table phastCons46wayPlacental \
+ (fileName varchar(255) not null); \
+ insert into phastCons46wayPlacental values
+ ("/gbdb/hg19/bbi/phastCons46wayPlacental.bw");'
+
+
+ ## load table with wiggle data
+ ## no longer load this data, using the bigWig file instead
+# ssh hgwdev
+# cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+# ln -s `pwd`/phastCons46wayPlacental.wib \
+# /gbdb/hg19/multiz46way/phastCons46wayPlacental.wib
+# time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
+# phastCons46wayPlacental phastCons46wayPlacental.wig
+
+ # Instead, temporary load into a table so we can do the histogram
+ ln -s `pwd`/phastCons46wayPlacental.wib /gbdb/hg19/wib/pc46.wib
+ hgLoadWiggle hg19 pc46 phastCons46wayPlacental.wig
+
+ # Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
- -db=hg19 phastCons46way > histogram.data 2>&1
- # real 8m6.841s
+ -db=hg19 pc46 > histogram.data 2>&1
+ # real 8m15.623s
+ hgsql -e "drop table pc46;" hg19
+ rm /gbdb/hg19/wib/pc46.wib
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
-set title " Human Hg18 Histogram phastCons46way track"
-set xlabel " phastCons46way score"
+set title " Human Hg19 Histogram phastCons46wayPlacental track"
+set xlabel " phastCons46wayPlacental score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
#########################################################################
# LASTZ Zebrafish DanRer6 (DONE - 2009-07-08,10 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
cd /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
cat << '_EOF_' > DEF
# human vs zebrafish
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish danRer6
SEQ2_DIR=/scratch/data/danRer6/danRer6.2bit
SEQ2_LEN=/scratch/data/danRer6/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=40
BASE=/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 1678m17.827s
# failed during the chain step due to encodek cluster problems
# finish that manually, then:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=chainMerge > chainMerge.log 2>&1 &
# real 167m6.930s
cat fb.hg19.chainDanRer6Link.txt
# 88391631 bases of 2897316137 (3.051%) in intersection
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/danRer6/bed/blastz.hg19.swap
cd /hive/data/genomes/danRer6/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 183m21.102s
cat fb.danRer6.chainHg19Link.txt
# 96424507 bases of 1506896106 (6.399%) in intersection
##############################################################################
# LASTZ Elephant LoxAfr3 (DONE - 2009-07-21,23 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
cd /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
cat << '_EOF_' > DEF
# Human vs. Elephant
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Elephant
SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit
SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 317m32.664s
# broken when it went to chaining on encodek, finish the chain then:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=chainMerge > chainMerge.log 2>&1 &
# real 217m25.159s
# time about 3h23m
cat fb.hg19.chainLoxAfr3Link.txt
# 1351200080 bases of 2897316137 (46.636%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -continue=syntenicNet -stop=syntenicNet \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> synNet.log 2>&1 &
# real 32m40.554s
time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr3 > rbest.log 2>&1
# real 184m3.435s
mkdir /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
cd /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 220m16.839s
cat fb.loxAfr3.chainHg19Link.txt
# 1323201500 bases of 3118565340 (42.430%) in intersection
##############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
# AGILENT PROBES LIFTED FROM HG18 (DONE, 2009-07-28 Andy)
ssh hgwdev
bash
mkdir /hive/data/genomes/hg19/bed/agilentProbes
cd /hive/data/genomes/hg19/bed/agilentProbes
for table in `echo show tables like \'agilent%\' | hgsql hg18 | tail -n +2 | grep -v Probe`; do
echo $table; echo "select * from $table" | hgsql hg18 | \
tail -n +2 | cut -f2- > ${table}.hg18.bed; liftOver ${table}.hg18.bed \
/gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz ${table}.hg19.{bed,unmapped};
hgLoadBed hg19 $table ${table}.hg19.bed;
echo done with $table;
done
for unmap in *.unmapped; do
table=${unmap%.hg19.unmapped}
grep Deleted -A1 $unmap | grep -v Deleted | grep -v "^--" > agilentProbesHg18Unmapped/${table}.deleted.bed
grep Split -A1 $unmap | grep -v Split | grep -v "^--" > agilentProbesHg18Unmapped/${table}.split.bed
grep Partially -A1 $unmap | grep -v Partially | grep -v "^--" > agilentProbesHg18Unmapped/${table}.partiallyDeleted.bed
done
find agilentProbesHg18Unmapped/ -size 0b | xargs rm
rm *hg18.bed *.unmapped bed.tab
gzip *.bed
tar cfz agilentProbesHg18Unmapped.tar.gz agilentProbesHg18Unmapped
cd /usr/local/apache/htdocs/goldenPath/hg19
mkdir agilentProbes
cd agilentProbes/
ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped beds
ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped.tar.gz
##############################################################################
# LASTZ Tetraodon TetNig2 (DONE - 2009-08-10,11 - Hiram)
    # Note: the date/time stamp on this directory is incorrect;
    # it should be 2009-08-10
mkdir /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
cd /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
cat << '_EOF_' > DEF
# human vs tetraodon
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Tetraodon TetNig2 - single chunk big enough to hold single largest item
SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 220m36.068s
# forgot the qRepeats for tetNig2
rm axtChain/hg19.tetNig2.net
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=load -qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> load.log 2>&1 &
# real 5m53.096s
cat fb.hg19.chainTetNig2Link.txt
# 49611132 bases of 2897316137 (1.712%) in intersection
# running the swap
mkdir /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
cd /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 13m21.591s
# forgot the qRepeats for tetNig2
rm axtChain/tetNig2.hg19.net
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
-continue=load -qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap > load.log 2>&1 &
# real 4m7.559s
cat fb.tetNig2.chainHg19Link.txt
# 42910930 bases of 302314788 (14.194%) in intersection
##############################################################################
# dbSNP BUILD 130 - PROVISIONAL REMAPPING TO BUILD 37 (DONE 8/28/09 angie)
# /hive/data/outside/dbSNP/130/ was already set up during the hg18 run --
# just add hg19 coord files and go from there.
cd /hive/data/outside/dbSNP/130/human/data
alias wg wget --timestamping
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/misc/exchange
# These are provisional files in an ad-hoc format.
wg $ftpSnpDb/README.txt
wg $ftpSnpDb/Remap_36_3_37_1.info
wg $ftpSnpDb/Remap_36_3_37_1.txt.gz
mv README.txt Remap_36_3_37_1_README
zcat Remap_36_3_37_1.txt.gz | wc -l
#18823990
# Use the remapping to transform ../ucscNcbiSnp.bed into one for hg19.
# Useful columns, 1-based: 1=ID, 3=oldChr, 4=oldStart, 5=oldEnd,
# 10=newChr, 11=newStart, 12=newEnd, 13=newLocType, 14=newWeight, 16=newStrand
# For mappings to chr*_random, oldStart and oldEnd are empty -- skip.
# Sort both hg18 snp file and remap file by {rsID,chr,start} to keep them in sync.
mkdir /hive/data/outside/dbSNP/130/human/hg19
cd /hive/data/outside/dbSNP/130/human/hg19
sort -k4n,4n -k1,1 -k2n,2n ../ucscNcbiSnp.bed > /data/tmp/hg18.ucscNcbiSnp.idSorted.bed
zcat ../data/Remap_36_3_37_1.txt.gz \
| sort -t " " -k1n,1n -k3,3 -k4n,4n \
> /data/tmp/Remap_36_3_37_1.txt
perl -we \
'use strict; \
sub nextMap { \
my ($rsId, undef, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, \
$nLocType, $nWt, $nRef, $nStr);\
do { \
($rsId, undef, $oChr, $oStart, $oEnd, undef,undef,undef,undef, \
$nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = split("\t", <>); \
if (defined $nStr) { \
chomp $nStr; $nStr =~ tr/+-/01/; $oChr = "chr$oChr"; $nChr = "chr$nChr"; \
} \
$oStart--; $oEnd--; $nStart--; $nEnd--; # Yep. 0-based closed vs 1-based closed \
} while (defined $nStr && ($oEnd < 0 || $nChr eq "chrUn")); \
return ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, \
$nLocType, $nWt, $nRef, $nStr); \
} # nextMap \
my ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = \
&nextMap(); \
my ($rCount, $oCount, $tCount) = 0; \
open(my $oldF, "/data/tmp/hg18.ucscNcbiSnp.idSorted.bed") || die; \
while (my ($chr, $s, $e, $id, $str, $rn,$obs,$mt,$cn,$vn,$ah,$ahse,$fc,$lt,$wt) = \
split("\t", <$oldF>)) { \
my $thisRCount = 0; \
while (defined $oChr && $chr eq $oChr && $s == $oStart && $e == $oEnd && $id == $rsId) { \
print join("\t", $nChr,$nStart,$nEnd,$id,$nStr,$nRef,$obs,$mt,$cn,$vn,$ah,$ahse,$fc, \
$nLocType,$nWt,$nStart) \
. "\n"; \
($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = \
&nextMap(); \
$thisRCount++; \
} \
if (defined $rsId && $id > $rsId) {warn "Slipped a cog"; last;} \
$tCount += $thisRCount; \
$rCount++ if ($thisRCount > 0); \
$oCount++; \
} \
close($oldF); print STDERR "Replaced $rCount of $oCount inputs ($tCount outputs).\n";' \
/data/tmp/Remap_36_3_37_1.txt \
| sort -k1,1 -k2n,2n -k4,4 \
> /data/tmp/hg19.ucscNcbiSnp.bed
#Replaced 18693260 of 19189750 inputs (18697579 outputs).
#504.562u 27.037s 8:59.57 98.5% 0+0k 0+0io 0pf+0w
wc -l /data/tmp/hg19.ucscNcbiSnp.bed
# 18697579 /data/tmp/hg19.ucscNcbiSnp.bed
# Drum roll please... translate NCBI's encoding into UCSC's, and
# perform a bunch of checks. This is where developer involvement
# is most likely as NCBI extends the encodings used in dbSNP.
cd /hive/data/outside/dbSNP/130/human/hg19
snpNcbiToUcsc /data/tmp/hg19.ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit \
-1000GenomesRsIds=../data/1000GenomesRsIds.txt snp130
#spaces stripped from observed:
#chr12 6093134 6093134 rs41402545
#Line 8049395 of /data/tmp/hg19.ucscNcbiSnp.bed: Encountered something that doesn't fit observedMixedFormat: GCAACTTCA
#count of snps with weight 0 = 0
#count of snps with weight 1 = 17042465
#count of snps with weight 2 = 345274
#count of snps with weight 3 = 1017906
#count of snps with weight 10 = 291934
#Skipped 1496 snp mappings due to errors -- see snp130Errors.bed
#146.837u 9.867s 4:21.63 59.8% 0+0k 0+0io 0pf+0w
# Comparable to hg18.snp130, with some losses due to coord translation, loss of _randoms,
# and 1496 errors (new locType or refNCBI inconsistent with new size).
expr 18697579 - 291934 - 1496
#18404149
# Move hg19.ucscNcbiSnp.bed from fast tmp to slow (today) hive:
gzip /data/tmp/hg19.ucscNcbiSnp.bed
mv /data/tmp/hg19.ucscNcbiSnp.bed.gz hg19.ucscNcbiSnp.bed.gz
    # Will try to reuse hg18.snp130's giant 18G fasta file, not duplicate it.
# Load up main track tables.
cd /hive/data/outside/dbSNP/130/human/hg19
hgLoadBed -tab -tmpDir=/data/tmp -allowStartEqualEnd \
hg19 snp130 -sqlTable=snp130.sql snp130.bed
#Loaded 18404149 elements of size 17
#115.086u 21.663s 2:32:09.98 1.4% 0+0k 0+0io 1pf+0w
#that is freakishly long -- lots happening today w/db move, hive recovery,...
hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
hg19 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
snp130Exceptions.bed
#Loaded 1982828 elements of size 5
#10.500u 0.851s 1:13.42 15.4% 0+0k 0+0io 0pf+0w
hgLoadSqlTab hg19 snp130ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
snp130ExceptionDesc.tab
# Load up sequences *from hg18 file*:
hgLoadSqlTab hg19 snp130Seq ~/kent/src/hg/lib/snpSeq.sql ../snp130Seq.tab
# Put in a link where one would expect to find the track build dir...
ln -s /hive/data/outside/dbSNP/130/human/hg19 /hive/data/genomes/hg19/bed/snp130
# Look at the breakdown of exception categories:
cd /hive/data/outside/dbSNP/130/human/hg19
cut -f 5 snp130Exceptions.bed | sort | uniq -c | sort -nr
#1350217 MultipleAlignments
# 495981 ObservedMismatch
# 37603 ObservedTooLong
# 26855 SingleClassTriAllelic
# 24443 FlankMismatchGenomeShorter
# 17927 SingleClassLongerSpan
# 13685 SingleClassZeroSpan
# 6238 FlankMismatchGenomeLonger
# 3016 DuplicateObserved
# 2851 SingleClassQuadAllelic
# 1777 MixedObserved
# 1264 NamedDeletionZeroSpan
# 508 FlankMismatchGenomeEqual
# 329 NamedInsertionNonzeroSpan
# 121 ObservedContainsIupac
# 11 RefAlleleMismatch
# 2 ObservedWrongFormat
#TODO: go through those above (esp snp130Errors.bed) and send some bug reports to dbSNP.
##############################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 8/31/09 angie)
mkdir /hive/data/genomes/hg19/bed/snp130Ortho
cd /hive/data/genomes/hg19/bed/snp130Ortho
    # Following Heather's lead in snp126orthos, filter SNPs to keep
# only those with class=single, length=1, chrom!~random;
# Exclude those with exceptions MultipleAlignments,
# SingleClassTriAllelic or SingleClassQuadAllelic.
# Unlike snp masking, we do not filter for weight -- don't know why.
awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
/hive/data/outside/dbSNP/130/human/hg19/snp130Exceptions.bed \
| sort -u \
> snp130ExcludeIds.txt
awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
/hive/data/outside/dbSNP/130/human/hg19/snp130.bed \
| grep -vFwf snp130ExcludeIds.txt \
> snp130Simple.bed
#203.193u 9.197s 2:57.40 119.7% 0+0k 0+0io 0pf+0w
wc -l snp130Simple.bed
#12278514 snp130Simple.bed
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
awk 'BEGIN{OFS="\t";} \
{print $1, $2, $3, \
$4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
0, $6;}' \
snp130Simple.bed > snp130ForLiftOver.bed
# Map coords to chimp using liftOver.
# I don't know why chimp took so much longer than macaque... the
# chimp .over has fewer chains and fewer bytes than the macaque .over.
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../snp130ForLiftOver.bed 25000 split/chunk
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
>> jobList
end
ssh swarm
cd /hive/data/genomes/hg19/bed/snp130Ortho/run.liftOChimp
para make jobList
#Completed: 492 of 492 jobs
#CPU time in finished jobs: 51793s 863.22m 14.39h 0.60d 0.002 y
#IO & Wait Time: 3825s 63.75m 1.06h 0.04d 0.000 y
#Average job time: 113s 1.88m 0.03h 0.00d
#Longest finished job: 286s 4.77m 0.08h 0.00d
#Submission to last job: 300s 5.00m 0.08h 0.00d
# Map coords to orangutan using liftOver.
mkdir ../run.liftOPon
cd ../run.liftOPon
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
\{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 492 of 492 jobs
#CPU time in finished jobs: 125656s 2094.26m 34.90h 1.45d 0.004 y
#IO & Wait Time: 5413s 90.22m 1.50h 0.06d 0.000 y
#Average job time: 266s 4.44m 0.07h 0.00d
#Longest finished job: 646s 10.77m 0.18h 0.01d
#Submission to last job: 649s 10.82m 0.18h 0.01d
# Map coords to macaque using liftOver.
mkdir ../run.liftOMac
cd ../run.liftOMac
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
\{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 492 of 492 jobs
#CPU time in finished jobs: 161612s 2693.54m 44.89h 1.87d 0.005 y
#IO & Wait Time: 6218s 103.63m 1.73h 0.07d 0.000 y
#Average job time: 341s 5.69m 0.09h 0.00d
#Longest finished job: 727s 12.12m 0.20h 0.01d
#Submission to last job: 739s 12.32m 0.21h 0.01d
cd /hive/data/genomes/hg19/bed/snp130Ortho
# Concatenate the chimp results, sorting by chimp pos in order to
# efficiently access 2bit sequence in getOrthoSeq. The output of
# that is then sorted by the glommed human info field, so that we
# can use join to combine chimp and macaque results in the next step.
# Ditto for macaque and orangutan. Each command pipe takes ~5 minutes:
sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
| sort > panTro2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
| sort > ponAbe2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
| sort > rheMac2.orthoGlom.txt
wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
# 11428526 panTro2.orthoGlom.txt
# 10861969 ponAbe2.orthoGlom.txt
# 9694237 rheMac2.orthoGlom.txt
# Use the glommed name field as a key to join up chimp and macaque
# allele data. Include glommed name from both files because if only
# file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop
# in the orthoGlom fields from each file, which are in the same order
# as the chimp and macaque columns of snp130OrthoPanTro2RheMac2.
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
| awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
else { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
> tmp.txt
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
tmp.txt rheMac2.orthoGlom.txt \
| perl -wpe 'chomp; \
($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
$glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
split(/\|/, $glomKey); \
$o1Start =~ s/^\?$/0/; $o2Start =~ s/^\?$/0/; $o3Start =~ s/^\?$/0/; \
$o1End =~ s/^\?$/0/; $o2End =~ s/^\?$/0/; $o3End =~ s/^\?$/0/; \
print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
$o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
s/^.*$//;' \
| sort -k1,1 -k2n,2n > snp130OrthoPt2Pa2Rm2.bed
#304.434u 27.118s 4:31.30 122.2% 0+0k 0+0io 0pf+0w
wc -l snp130OrthoPt2Pa2Rm2.bed
#11876029 snp130OrthoPt2Pa2Rm2.bed
cd /hive/data/genomes/hg19/bed/snp130Ortho
hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
-sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
hg19 snp130OrthoPt2Pa2Rm2 snp130OrthoPt2Pa2Rm2.bed
#Loaded 11876029 elements of size 22
#75.442u 8.828s 9:50.27 14.2% 0+0k 0+0io 0pf+0w
# Cleanup fileserver:
cd /hive/data/genomes/hg19/bed/snp130Ortho
gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed &
rm -r run*/split tmp.txt *.orthoGlom.txt
-
##############################################################################
-<<<<<<< hg19.txt
# LASTZ Rabbit OryCun2 (DONE - 2009-08-12 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
cd /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
cat << '_EOF_' > DEF
# Human vs. Rabbit
BLASTZ_M=50
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rabbit at chunk 20,000,000 all but 36 contigs can fit in a single job
SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit
SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/oryCun2/oryCun2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/oryCun2/oryCun2.contigs.sizes
SEQ2_LIFT=/hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 516m41.981s
cat fb.hg19.chainOryCun2Link.txt
# 1283994337 bases of 2897316137 (44.317%) in intersection
# should have run syntenicNet in that first run
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
# about 1 hour
mkdir /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
cd /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-swap -syntenicNet > swap.log 2>&1 &
# real 176m35.932s
cat fb.oryCun2.chainHg19Link.txt
# 1260477501 bases of 2604023284 (48.405%) in intersection
##############################################################################
# running syntenicNet on CavPor3 lastz (DONE - 2009-08-27 - Hiram)
cd /hive/data/genomes/hg19/bed/lastzCavPor3.2009-06-04
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
# about 44 minutes
##############################################################################
# loading the lastz tables on cavPor3 - (DONE - 2009-08-28 - Hiram)
# the chain.tab and link.tab files are left over from the failed load
cd /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain
# find out their sizes, average and total:
awk '{print length($0)}' chain.tab | ave stdin
Q1 92.000000 median 93.000000 Q3 96.000000
average 93.651267
min 64.000000 max 109.000000
count 27186468
total 2546047186.000000
awk '{print length($0)}' link.tab | ave stdin
Q1 45.000000 median 47.000000 Q3 48.000000
average 46.731871
min 22.000000 max 52.000000
count 240602108
total 11243786622.000000
cat << '_EOF_' > chainHg19Link.sql
CREATE TABLE chainHg19Link (
bin smallint(5) unsigned NOT NULL default 0,
tName varchar(255) NOT NULL default '',
tStart int(10) unsigned NOT NULL default 0,
tEnd int(10) unsigned NOT NULL default 0,
qStart int(10) unsigned NOT NULL default 0,
chainId int(10) unsigned NOT NULL default 0,
KEY tName (tName(13),bin),
KEY chainId (chainId)
) ENGINE=MyISAM max_rows=241000000 avg_row_length=50 pack_keys=1 CHARSET=latin1;
'_EOF_'
# << happy emacs
hgsql cavPor3 < chainHg19Link.sql
time hgsql -e \
'load data local infile "link.tab" into table chainHg19Link;' cavPor3
# real 405m15.956s
cd /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain
# and the net tracks were not loaded:
time netClass -verbose=0 -noAr noClass.net cavPor3 hg19 cavPor3.hg19.net
# real 40m25.078s
netFilter -minGap=10 cavPor3.hg19.net \
| hgLoadNet -verbose=0 cavPor3 netHg19 stdin
# real 33m24.972s (plus the featureBits below)
featureBits cavPor3 chainHg19Link > fb.cavPor3.chainHg19Link.txt 2>&1
cat fb.cavPor3.chainHg19Link.txt
# 1279572660 bases of 2663369733 (48.043%) in intersection
##############################################################################
# DBSNP CODING ANNOTATIONS (DONE 9/1/09 angie)
# Repeat the coord-remapping performed for snp130 on the hg18 coding anno table.
cd /hive/data/outside/dbSNP/130/human/hg19
sed -re 's/\trs([0-9]+)\t/\t\1\t/' ../snp130CodingDbSnp.bed \
| sort -k4n,4n -k1,1 -k2n,2n > /data/tmp/hg18.snp130Coding.idSorted.bed
# reuse /data/tmp/Remap_36_3_37_1.txt mapping file created for snp130 above:
perl -we \
'use strict; \
sub nextMap { \
my ($rsId, undef, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd); \
do { \
($rsId, undef, $oChr, $oStart, $oEnd, undef,undef,undef,undef, \
$nChr, $nStart, $nEnd) = split("\t", <>); \
if (defined $nEnd) { \
$oChr = "chr$oChr"; $nChr = "chr$nChr"; \
} \
$oStart--; $oEnd--; $nStart--; $nEnd--; # Yep. 0-based closed vs 1-based closed \
} while (defined $rsId && ($oEnd < 0 || $nChr eq "chrUn")); \
return ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd); \
} # nextMap \
my ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = &nextMap(); \
my ($rCount, $oCount, $tCount) = 0; \
open(my $oldF, "/data/tmp/hg18.snp130Coding.idSorted.bed") || die; \
while (my ($chr, $s, $e, $id, $tx, $frm, $alCount, $funcs, $als, $codons, $peps) = \
split("\t", <$oldF>)) { \
my $thisRCount = 0; \
while (defined $rsId && $rsId < $id) { \
($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = &nextMap(); \
} \
while (defined $oChr && $chr eq $oChr && $s == $oStart && $e == $oEnd && $id == $rsId) { \
print join("\t", $nChr, $nStart, $nEnd, "rs$id", $tx, $frm, \
$alCount, $funcs, $als, $codons, $peps); \
($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = &nextMap(); \
$thisRCount++; \
} \
$tCount += $thisRCount; \
$rCount++ if ($thisRCount > 0); \
$oCount++; \
} \
close($oldF); print STDERR "Replaced $rCount of $oCount inputs ($tCount outputs).\n";' \
/data/tmp/Remap_36_3_37_1.txt \
| sort -k1,1 -k2n,2n -k4,4 \
> /data/tmp/hg19.snp130Coding.bed
#Replaced 197921 of 279815 inputs (198493 outputs).
#160.824u 1.949s 2:43.01 99.8% 0+0k 0+0io 0pf+0w
hgLoadBed hg19 snp130CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
-renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
/data/tmp/hg19.snp130Coding.bed
#Loaded 198493 elements of size 11
mv /data/tmp/hg19.snp130Coding.bed hg19.snp130CodingDbSnp.bed
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
##########################################################################
# BUILD ALLEN BRAIN TRACK (DONE 09/30/09 kent)
# Make the working directory
ssh hgwdev
cd /cluster/data/hg19/bed
mkdir allenBrain
cd allenBrain
# Remap the probe alignments from mm7 to hg19
zcat /gbdb/mm9/liftOver/mm9ToHg19.over.chain.gz \
| pslMap -chainMapFile -swapMap \
/cluster/data/mm9/bed/allenBrain/allenBrainAli.psl stdin stdout \
| sort -k 14,14 -k 16,16n > unscored.psl
pslRecalcMatch unscored.psl /cluster/data/hg19/hg19.2bit \
/cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa allenBrainAli.psl
# Load the database
hgsql hg19 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql hg19 -e 'load data local infile "/cluster/data/mm9/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;'
hgLoadPsl hg19 allenBrainAli.psl
mkdir /gbdb/hg19/allenBrain
ln -s /cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa /gbdb/hg19/allenBrain/allenBrainProbes.fa
hgLoadSeq hg19 /gbdb/hg19/allenBrain/allenBrainProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene hg19 allenBrainAli -type=psl knownGene knownToAllenBrain
############################################################################
+## Annotate 46-way multiple alignment with gene annotations
+## (DONE - 2008-12-08,23 - Hiram)
+ # Gene frames
+ ## survey all genomes to see what type of gene track to use
+ ssh hgwdev
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/frames
+ cd /hive/data/genomes/hg19/bed/multiz46way/frames
+ #
+ # survey all the genomes to find out what kinds of gene tracks they have
+ cat << '_EOF_' > showGenes.csh
+#!/bin/csh -fe
+foreach db (`cat ../species.list`)
+ echo -n "${db}: "
+ set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
+ foreach table ($tables)
+ if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
+ $table == "knownGene" || $table == "xenoRefGene" ) then
+ set count = `hgsql $db -N -e "select count(*) from $table"`
+ echo -n "${table}: ${count}, "
+ endif
+ end
+ set orgName = `hgsql hgcentraltest -N -e \
+ "select scientificName from dbDb where name='$db'"`
+ set orgId = `hgsql hg19 -N -e \
+ "select id from organism where name='$orgName'"`
+ if ($orgId == "") then
+ echo "Mrnas: 0"
+ else
+ set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
+ echo "Mrnas: ${count}"
+ endif
+end
+'_EOF_'
+ # << happy emacs
+ chmod +x ./showGenes.csh
+	# rearrange that output to create three sections:
+ # 1. knownGenes for hg19, mm9, rn4
+ # 2. ensGene for almost everything else
+ # 3. xenoRefGene for calJac1, petMar1, loxAfr3, papHam1, macEug1, oryCun2
+
+ mkdir genes
+ # knownGene
+ for DB in hg19 mm9 rn4
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ echo "panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
+ tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
+ bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
+ proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
+ taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6" \
+ | sed -e "s/ */ /g" > ensGene.list
+
+
+ # ensGene
+ for DB in panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
+ tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
+ bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
+ proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
+ taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ echo "calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2" > xenoRef.list
+
+ # xenoRefGene
+ for DB in calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ # the following single command doesn't work on any 32 Gb computer,
+ # requires much more memory, turn it into a kluster job, see below ...
+
+ # Create this command with this script:
+ cat << '_EOF_' > mkCmd.sh
+#!/bin/sh
+
+echo "time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \\"
+for G in mm9 rn4
+do
+ if [ ! -s genes/${G}.gp.gz ]; then
+ echo "missing genes/${G}.gp.gz"
+ exit 255
+ fi
+ echo -n "${G} genes/${G}.gp.gz "
+done
+echo "\\"
+for D in `sort ensGene.list`
+do
+ if [ ! -s genes/${D}.gp.gz ]; then
+ echo "missing genes/${D}.gp.gz"
+ exit 255
+ fi
+ echo -n "${D} genes/${D}.gp.gz "
+done
+echo "\\"
+for D in `sort xenoRef.list`
+do
+ if [ ! -s genes/${D}.gp.gz ]; then
+ echo "missing genes/${D}.gp.gz"
+ exit 255
+ fi
+ echo -n "${D} genes/${D}.gp.gz "
+done
+echo "\\"
+echo " | gzip > multiz46way.mafFrames.gz) > frames.log 2>&1"
+'_EOF_'
+ # << happy emacs
+ chmod +x ./mkCmd.sh
+
+ time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \
+mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz \
+panTro2 genes/panTro2.gp.gz gorGor1 genes/gorGor1.gp.gz ponAbe2 genes/ponAbe2.gp.gz rheMac2 genes/rheMac2.gp.gz tarSyr1 genes/tarSyr1.gp.gz micMur1 genes/micMur1.gp.gz otoGar1 genes/otoGar1.gp.gz tupBel1 genes/tupBel1.gp.gz dipOrd1 genes/dipOrd1.gp.gz cavPor3 genes/cavPor3.gp.gz speTri1 genes/speTri1.gp.gz ochPri2 genes/ochPri2.gp.gz vicPac1 genes/vicPac1.gp.gz turTru1 genes/turTru1.gp.gz bosTau4 genes/bosTau4.gp.gz equCab2 genes/equCab2.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz myoLuc1 genes/myoLuc1.gp.gz pteVam1 genes/pteVam1.gp.gz eriEur1 genes/eriEur1.gp.gz sorAra1 genes/sorAra1.gp.gz proCap1 genes/proCap1.gp.gz echTel1 genes/echTel1.gp.gz dasNov2 genes/dasNov2.gp.gz choHof1 genes/choHof1.gp.gz monDom5 genes/monDom5.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz taeGut1 genes/taeGut1.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz tetNig2 genes/tetNig2.gp.gz fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat2 genes/oryLat2.gp.gz danRer6 genes/danRer6.gp.gz \
+calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz loxAfr3 genes/loxAfr3.gp.gz papHam1 genes/papHam1.gp.gz macEug1 genes/macEug1.gp.gz oryCun2 genes/oryCun2.gp.gz \
+ | gzip > multiz46way.mafFrames.gz) > frames.log 2>&1
+
+ # that doesn't work on any 32 Gb computer, requires much more memory
+ # turn it into a kluster job
+ ssh swarm
+ cd /hive/data/genomes/hg19/bed/multiz46way/frames
+ cat << '_EOF_' > runOne
+#!/bin/csh -fe
+
+set C = $1
+set G = $2
+
+cat ../maf/${C}.maf | genePredToMafFrames hg19 stdin stdout \
+ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
+'_EOF_'
+ # << happy emacs
+ chmod +x runOne
+
+ ls ../maf | sed -e "s/.maf//" > chr.list
+ ls genes | sed -e "s/.gp.gz//" | grep -v hg19 > gene.list
+
+ cat << '_EOF_' > template
+#LOOP
+runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ mkdir parts
+ gensub2 chr.list gene.list template jobList
+ para -ram=8g create jobList
+ para try ... check ... push
+# Completed: 4185 of 4185 jobs
+# CPU time in finished jobs: 72491s 1208.19m 20.14h 0.84d 0.002 y
+# IO & Wait Time: 1462162s 24369.36m 406.16h 16.92d 0.046 y
+# Average job time: 367s 6.11m 0.10h 0.00d
+# Longest finished job: 3165s 52.75m 0.88h 0.04d
+# Submission to last job: 6364s 106.07m 1.77h 0.07d
+
+ # see what it looks like in terms of number of annotations per DB:
+ find ./parts -type f | while read F
+do
+ zcat ${F}
+done | cut -f4 | sort | uniq -c | sort -n > annotation.survey.txt
+ 79191 rn4
+ 108287 petMar1
+ 139581 gorGor1
+ 140487 taeGut1
+ 143058 choHof1
+ 143233 vicPac1
+ 150073 anoCar1
+ 154462 tarSyr1
+ 163930 sorAra1
+ 164575 galGal3
+ 171191 macEug1
+ 174221 felCat3
+ 175831 dasNov2
+ 177622 ornAna1
+ 190729 eriEur1
+ 192285 tupBel1
+ 198052 speTri1
+ 199639 micMur1
+ 201731 papHam1
+ 201961 panTro2
+ 206170 oryCun2
+ 209327 ponAbe2
+ 209504 otoGar1
+ 210860 rheMac2
+ 212533 proCap1
+ 212848 myoLuc1
+ 213146 dipOrd1
+ 213479 calJac1
+ 215995 echTel1
+ 220341 ochPri2
+ 225132 loxAfr3
+ 226689 turTru1
+ 230903 monDom5
+ 232025 pteVam1
+ 232831 equCab2
+ 236945 cavPor3
+ 238167 bosTau4
+ 239857 mm9
+ 255727 canFam2
+ 316850 xenTro2
+ 359507 danRer6
+ 375156 oryLat2
+ 390076 fr2
+ 426532 gasAcu1
+ 434619 tetNig2
+
+ # load the resulting file
+ ssh hgwdev
+ cd /cluster/data/hg19/bed/multiz46way/frames
+ find ./parts -type f | while read F
+do
+ zcat ${F}
+done | sort -k1,1 -k2,2n | hgLoadMafFrames hg19 multiz46wayFrames stdin
+ # real 5m47.840s
+
+ find ./parts -type f | while read F
+do
+ zcat ${F}
+done | sort -k1,1 -k2,2n > multiz46wayFrames.bed
+
+ featureBits -countGaps hg19 multiz46wayFrames.bed
+ # 62315198 bases of 3107677273 (2.005%) in intersection
+ featureBits -countGaps hg19 multiz28wayFrames
+ # 48236360 bases of 3107677273 (1.552%) in intersection
+
+ # enable the trackDb entries:
+# frames multiz46wayFrames
+# irows on
+ # appears to work OK
+
+#############################################################################
# AFFY U133AB (Done - 2009-09-30 - Jim)
# Align probes
ssh swarm
cd /cluster/data/hg19/bed
mkdir -p affyProbes/affyU133/run
cd affyProbes/affyU133/run
mkdir psl
ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
ls -1 /hive/data/outside/affyProbes/HG-U133AB_all.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst mrna.lst gsub jobList
para create jobList
para try
para check
para push
para time
#Completed: 93 of 93 jobs
#CPU time in finished jobs: 21246s 354.09m 5.90h 0.25d 0.001 y
#IO & Wait Time: 349s 5.82m 0.10h 0.00d 0.000 y
#Average job time: 232s 3.87m 0.06h 0.00d
#Longest finished job: 1650s 27.50m 0.46h 0.02d
#Submission to last job: 1685s 28.08m 0.47h 0.02d
# Do sort, best in genome filter.
# to create affyU133.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyU133.psl /dev/null
rm -r raw.psl psl
# Load probes and alignments into database.
ssh hgwdev
cd /cluster/data/hg19/bed/affyProbes/affyU133
hgLoadPsl hg19 affyU133.psl
hgLoadSeq hg19 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
##########################################################################
# GNF ATLAS 2 (Done - 2009-09-30 - Jim)
# Align probes from GNF1H chip.
ssh swarm
cd /cluster/data/hg19/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
mkdir psl
ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
ls -1 /hive/data/outside/gnf/human/atlas2/gnf1h.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst mrna.lst gsub jobList
para create jobList
para try
para check
para push
para time
#Completed: 93 of 93 jobs
#CPU time in finished jobs: 3299s 54.98m 0.92h 0.04d 0.000 y
#IO & Wait Time: 330s 5.50m 0.09h 0.00d 0.000 y
#Average job time: 39s 0.65m 0.01h 0.00d
#Longest finished job: 370s 6.17m 0.10h 0.00d
#Submission to last job: 477s 7.95m 0.13h 0.01d
# Do sort, best in genome filter
# to create gnf1h.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1h.psl /dev/null
rm -r raw.psl psl
# Load probes and alignments from GNF1H into database.
ssh hgwdev
cd /hive/data/genomes/hg19/bed/geneAtlas2
hgLoadPsl hg19 affyGnf1h.psl
hgLoadSeq hg19 /gbdb/hgFixed/affyProbes/gnf1h.fa
grep -v U133B ../affyProbes/affyU133/affyU133.psl \
| sed -e "s/exemplar://; s/consensus://; s/U133A://" \
| sed -e "s/;//" > affyU133A.psl
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
affyU133A.psl affyGnf1h.psl
# Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio
# Mapped 33186, multiply-mapped 3171, missed 48, unmapped 11510
hgLoadBed hg19 gnfAtlas2 gnfAtlas2.bed
# Loaded 36357 elements of size 15
##########################################################################
# BUILD NIBB IMAGE PROBES (DONE 2009-10-12 JK)
# Make directory on san for cluster job and copy in sequence
ssh swarm
mkdir /hive/data/genomes/hg19/bed/nibbPics
cd /hive/data/genomes/hg19/bed/nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
# Make parasol job dir and sequence list files
mkdir run
cd run
mkdir psl
ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
echo ../nibbImageProbes.fa > mrna.lst
# Create parasol gensub spec file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
# Create parasol batch
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Do para try/push/time etc.
#Completed: 93 of 93 jobs
#CPU time in finished jobs: 8008s 133.47m 2.22h 0.09d 0.000 y
#IO & Wait Time: 364s 6.07m 0.10h 0.00d 0.000 y
#Average job time: 90s 1.50m 0.03h 0.00d
#Longest finished job: 765s 12.75m 0.21h 0.01d
#Submission to last job: 824s 13.73m 0.23h 0.01d
# Make sort and filter
catDir psl | sort -k 10 \
| pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
| sort -k 14,14 -k 16,16n \
| sed 's#/scratch/data/hg19/nib/chr#chr#' \
| sed 's/.nib//' > ../nibbImageProbes.psl
# Make bed file and copy in stuff
ssh hgwdev
cd /hive/data/genomes/hg19/bed/nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
# Load into database
ln -s /cluster/data/hg19/bed/nibbPics/nibbImageProbes.fa /gbdb/hg19/nibbImageProbes.fa
hgLoadSeq hg19 /gbdb/hg19/nibbImageProbes.fa
hgLoadPsl hg19 nibbImageProbes.psl
##########################################################################
# Initial vgProbeTrack run for hg19 (galt 2009-10-15)
# see visiGene.txt make doc
# uses nibbImageProbes and vgProbeTrack utility
# creates vgAllProbes and knownToVisiGene
# 25931
# updates visiGene.vgPrbAliAll.
# creates and runs hgLoadSeq on /gbdb/hg19/visiGene/*.fa
##########################################################################
# make new grp table to match hg18 (DONE 2009-10-01 kuhn)
# to split regulation from expression
# phenDis group is also missing in hg19
# and add one more column: defaultIsClosed
# get the hg18.grp table into hg19
# copy the hg18.grp table into hg19.grpNew and edit
hgsql hg19
CREATE TABLE grpNew SELECT * FROM hg18.grp;
# 24 rows in set (0.00 sec)
DELETE FROM grpNew WHERE name LIKE "encode%";
DELETE FROM grpNew WHERE name LIKE "remc%";
DELETE FROM grpNew WHERE name LIKE "tcga%";
DELETE FROM grpNew WHERE name LIKE "cancer%";
DELETE FROM grpNew WHERE name LIKE "jk%";
# 10 rows in set (0.00 sec)
# move the new table into place quickly
DROP TABLE grp;
RENAME TABLE grpNew TO grp;
#########################################################################
# BUILD OMIM RELATED GENES TRACK (done 2009-10-13 jk)
ssh hgwdev
cd /hive/data/genomes/hg19/bed
mkdir omimGene
cd omimGene
# download the file morbidmap and genemap from OMIM
mkdir omim
cd omim
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/morbidmap
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/genemap
cat genemap|sed -e 's/|/\t/g' > genemap.tab
autoSql ~/src/hg/lib/omimGeneMap.as x
cat x.sql |sed -e 's/PRIMARY KEY(numbering)/KEY(omimId)/' >omimGeneMap.sql
hgLoadSqlTab -warn hg19 omimGeneMap omimGeneMap.sql genemap.tab
# got warning on 3 records, just ignore them
# Warning: load of omimGeneMap did not go as planned: 12216 record(s), 0 row(s)
rm x.c x.h
cd ..
cat omim/morbidmap|sed -e 's/|/\t/g' > mobidmap.tab
autoSql ~/src/hg/lib/omimMorbidMap.as x
cat x.sql |sed -e 's/PRIMARY KEY(description)/KEY(omimId)/' >omimMorbidMap.sql
hgLoadSqlTab -warn hg19 omimMorbidMap omimMorbidMap.sql mobidmap.tab
# get all UCSC genes (from the knownGene table) that cross-reference to a RefSeq gene
# that has a non-empty OMIM ID according to the refLink table. And use OMIM ID as
# the gene name for this new table. Please note the alignId field still holds the KG ID.
hgsql hg19 -N -e \
'select omimId, kg.* from knownGene kg, knownToRefSeq kr, refLink l where omimId != 0 and mrnaAcc=kr.value and kg.name=kr.name ' \
|cut -f 1,3-13 >o1.tab
# collect more OMIM related genes via the MIM external DB links from UniProt
hgsql hg19 -N -e \
'select extAC, kg.* from knownGene kg, kgXref k, proteome.spXref2 p where spId=p.accession and extDB="MIM" and kg.name=kgId ' \
|cut -f 1,3-13 >o2.tab
# concatenate the above two gene sets and remove duplications.
cat o1.tab o2.tab |sort -u >o3.tab
# load the result into a temp table, fanO3
hgLoadSqlTab hg19 fanO3 ~/src/hg/lib/knownGene.sql o3.tab
# while holding onto the OMIM ID, get the canonical gene (via the knownGene, knowIsoforms,
# and knownCanonical tables) that represent a cluster which contains
# initial OMIM gene in the fanO3 table
hgsql hg19 -N -e \
'select f3.name, kg.* from fanO3 f3, knownGene kg, knownCanonical c, knownIsoforms i where f3.alignId=i.transcript and kg.name=c.transcript and c.clusterId=i.clusterId'\
> o4.tab
# first column is the OMIM ID
cut -f 1 o4.tab >j1.tmp
# col 3-13 is the gene structure of the canonical KG
cut -f 3-13 o4.tab >j2.tmp
# stitch them together and remove duplicates, load the result into fanO4 table
paste j1.tmp j2.tmp |sort -u >fanO4.tab
hgLoadSqlTab hg19 fanO4 ~/src/hg/lib/knownGene.sql fanO4.tab
# finally sort the table and create bed 4 file and load it as the omimGene table
hgsql hg19 -N -e 'select chrom, txStart, txEnd, name from fanO4 order by chrom, txStart, txEnd' |sort -u >omimGene.bed
hgLoadBed hg19 omimGene omimGene.bed
# create and load the omimToKnownCanonical table.
hgsql hg19 -N -e 'select name, alignId from fanO4 order by name'\
> omimToKnownCanonical.tab
hgLoadSqlTab hg19 omimToKnownCanonical \
~/src/hg/lib/omimToKnownCanonical.sql omimToKnownCanonical.tab
# The following clean up could be done.
# hgsql hg19 -e 'drop table fanO3'
# hgsql hg19 -e 'drop table fanO4'
# rm j*.tmp
# rm o1.tab o2.tab o3.tab o4.tab
#########################################################################
# BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (in progress 2009-10-14 jk)
# Make the directory to work in
cd /hive/data/genomes/hg19/bed
mkdir hprd
cd hprd
# Download HPRD_XML_070609.tar.gz from www.hprd.org. Unfortunately this
# requires registration, so can't just wget it.
zcat HPRD_XML_070609.tar.gz | tar -xv
# This will create 20000 or more xxxx.xml files under HPRD_XML_070609
# Create hprdToCdna table
echo HPRD_XML_070609/*.xml | xargs grep entry_cdna > j.cdna
cat j.cdna| sed -e 's/\//\t/' | sed -e 's/.xml/\t/' |\
sed -e 's/<entry_cdna>/\t/' | sed -e 's/<\//\t/'| sed -e 's/\./\t/'| cut -f 2,4|\
grep -v None >hprdToCdna.tab
hgsql hg19 <~/src/hg/lib/hprdToCdna.sql
hgsql hg19 -e 'load data local infile "hprdToCdna.tab" into table hprdToCdna'
# Create hprdToUniProt table
echo 'fgrep -H Swiss HPRD_XML_070609/$1.xml' >do1
ls HPRD_XML_070609 >j
cat j |sed -e 's/.xml/\tdo1/g' >jj
cut -f 1 jj >j.2
cut -f 2 jj >j.1
paste j.1 j.2 >doall
chmod +x do*
./doall >j.out
cat j.out|grep SwissProt | sed -e 's/\//\t/' | sed -e 's/.xml/\t/' | \
sed -e 's/Prot>/\t/' | sed -e 's/<\//\t/'| cut -f 2,4|grep -v None >hprdToUniProt.tab
hgsql hg19 <~/src/hg/lib/hprdToUniProt.sql
hgsql hg19 -e 'load data local infile "hprdToUniProt.tab" into table hprdToUniProt'
# build knownToHprd table
hgsql hg19 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=refseq' >j.kg1
hgsql hg19 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2
cat j.kg1 j.kg2 | sed 's/_.//' | sort -u >knownToHprd.tab
wc knownToHprd.tab
hgsql hg19 <~/src/hg/lib/knownToHprd.sql
hgsql hg19 -e 'load data local infile "knownToHprd.tab" into table knownToHprd'
hgsql hg19 -e 'select count(*) from knownToHprd'
# 21,516 records created
# remove temporary files.
rm j*