src/hg/makeDb/doc/hg19.txt 1.8
1.8 2009/04/27 20:11:50 hiram
done with swap of chain/net to panTro2
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 1000000 -r1.7 -r1.8
--- src/hg/makeDb/doc/hg19.txt 24 Apr 2009 23:42:48 -0000 1.7
+++ src/hg/makeDb/doc/hg19.txt 27 Apr 2009 20:11:50 -0000 1.8
@@ -1,815 +1,818 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 37 (February 2009 freeze) aka:
# GRCh37 - Genome Reference Consortium Human Reference 37
# Assembly Accession: GCA_000001405.1
# "$Id$";
#############################################################################
# Download sequence (DONE - 2009-02-04 - Hiram)
mkdir -p /hive/data/genomes/hg19/download
cd /hive/data/genomes/hg19/download
mkdir -p assembled_chromosomes
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=assembled_chromosomes \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/assembled_chromosomes
mkdir -p alternate_loci
for N in 1 2 3 4 5 6 7 8 9
do
wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=alternate_loci \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/ALT_REF_LOCI_${N}
done
mkdir -p unlocalized_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=unlocalized_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unlocalized_scaffolds
mkdir -p unplaced_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=unplaced_scaffolds \
-nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unplaced_scaffolds
mkdir -p placed_scaffolds
wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
--directory-prefix=placed_scaffolds \
-nH --ftp-user=anonymous --ftp-password=hiram@soe.ucsc.edu \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/placed_scaffolds
mkdir ucscChr
cd ucscChr
for F in ../assembled_chromosomes/FASTA/chr*.fa
do
C=`basename $F`
C=${C/.fa}
echo -n "${C} "
H=`head -1 "${F}"`
chrN=`echo $H | sed -e "s/.*Homo sapiens chromosome /chr/; s/, .*//"`
A=`echo $H | sed -e "s/. Homo.*//; s/.*gb.//"`
echo $chrN $A
grep -v "^#" ../assembled_chromosomes/AGP/${chrN}.comp.agp \
| sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
echo ">${chrN}" > ${chrN}.fa
grep -v "^>" ../assembled_chromosomes/FASTA/${chrN}.fa >> ${chrN}.fa
done
rm -f scaffolds.agp
find ../alternate_loci -type f | grep ".agp$" | while read F
do
grep "^GL" $F | sed -e \
"s/^GL000250.1/chr6_apd_hap1/" -e \
"s/^GL000251.1/chr6_cox_hap2/" -e \
"s/^GL000252.1/chr6_dbb_hap3/" -e \
"s/^GL000253.1/chr6_mann_hap4/" -e \
"s/^GL000254.1/chr6_mcf_hap5/" -e \
"s/^GL000255.1/chr6_qbl_hap6/" -e \
"s/^GL000256.1/chr6_ssto_hap7/" -e \
"s/^GL000257.1/chr4_ctg9_hap1/" -e \
"s/^GL000258.1/chr17_ctg5_hap1/"
done > scaffolds.agp
find ../unlocalized_scaffolds -type f | grep ".agp$" \
| while read F
do
C=`basename ${F}`
C=${C/.unlocalized.scaf.agp}
grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/${C}_gl\1_random/"
done >> scaffolds.agp
find ../unplaced_scaffolds -type f | grep ".agp$" \
| while read F
do
grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/chrUn_gl\1/"
done >> scaffolds.agp
rm -f scaffolds.fa
find ../alternate_loci -type f | grep ".fa$" | while read F
do
sed -e \
"s/>.*GL000250.*/>chr6_apd_hap1/" -e \
"s/>.*GL000251.*/>chr6_cox_hap2/" -e \
"s/>.*GL000252.*/>chr6_dbb_hap3/" -e \
"s/>.*GL000253.*/>chr6_mann_hap4/" -e \
"s/>.*GL000254.*/>chr6_mcf_hap5/" -e \
"s/>.*GL000255.*/>chr6_qbl_hap6/" -e \
"s/>.*GL000256.*/>chr6_ssto_hap6/" -e \
"s/>.*GL000257.*/>chr4_ctg9_hap1/" -e \
"s/>.*GL000258.*/>chr17_ctg5_hap1/" ${F}
done > scaffolds.fa
find ../unlocalized_scaffolds -type f | grep ".fa$" | while read F
do
sed -e \
"s/^>.*GL\([0-9]*\).* chromosome \([0-9]*\).*/>chr\2_gl\1_random/" ${F}
done >> scaffolds.fa
find ../unplaced_scaffolds -type f | grep ".fa$" | while read F
do
sed -e "s/.*\(GL[0-9]*\).*/\1/; s/GL/>chrUn_gl/" $F
done >> scaffolds.fa
############################################################################
## Create database (DONE - 2009-03-04 - Hiram)
cd /hive/data/genomes/hg19
cat << '_EOF_' > hg19.config.ra
# Config parameters for makeGenomeDb.pl:
db hg19
scientificName Homo sapiens
commonName Human
assemblyDate Feb. 2009
assemblyLabel GRCh37 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
orderKey 14
mitoAcc NC_001807
fastaFiles /hive/data/genomes/hg19/download/ucscChr/*.fa
agpFiles /hive/data/genomes/hg19/download/ucscChr/*.agp
# qualFiles /dev/null
dbDbSpeciesDir human
taxId 9606
'_EOF_'
# << happy emacs
time makeGenomeDb.pl hg19.config.ra > makeGenomeDb.log 2>&1
# real 14m8.958s
featureBits -countGaps hg19 gap
# 239845127 bases of 3137161264 (7.645%) in intersection
featureBits -noRandom -noHap -countGaps hg19 gap
# 234344806 bases of 3095693983 (7.570%) in intersection
# verify featureBits is properly ignorning haps and randoms:
egrep -v "_" chrom.sizes | awk '{sum+=$2;print sum,$0}'
# 3095693983 chrM 16571
# same total as in featureBits
############################################################################
# running repeat masker (DONE - 2009-03-05 - Hiram)
screen # use screen to manage this day-long job
mkdir /hive/data/genomes/hg19/bed/repeatMasker
cd /hive/data/genomes/hg19/bed/repeatMasker
time doRepeatMasker.pl -bigClusterHub=swarm -buildDir=`pwd` hg19 \
> do.log 2>&1
# real 525m23.521s
cat faSize.rmsk.txt
# 3137161264 bases (239850802 N's 2897310462 real 1431585691
# upper 1465724771 lower) in 93 sequences in 1 files
# %46.72 masked total, %50.59 masked real
featureBits -countGaps hg19 rmsk
# 1465724774 bases of 3137161264 (46.721%) in intersection
# this is odd, 3 bases more in featureBits than were masked ?
# check it out, make a bed file from the featureBits:
featureBits -countGaps -bed=rmsk.bed hg19 rmsk
# went down a sequence of intersections with this idea, but could
# not get it resolved. It appears there are 75 bases in the rmsk
# table that were not masked in the 2bit file ?
# Later on, realized that featureBits does not count lower case N's
# in the "lower" category, but only in the N's category.
# trying a non-split table:
hgsql -e "show tables;" hg19 | grep _rmsk | while read T
do
hgsql -e "drop table ${T};" hg19
done
hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.fa.out
bad rep range [4385, 4384] line 1348605 of hg19.fa.out
bad rep range [5563, 5562] line 1563988 of hg19.fa.out
bad rep range [4539, 4538] line 3111186 of hg19.fa.out
# featureBits still reports 1465724774 bases in rmsk table
# cleaning the hg19.fa.out file:
cp hg19.fa.out hg19.clean.out
# edit hg19.clean.out and remove the three lines:
# 1467 20.7 1.2 17.6 chr14 35056767 35056794 (72292746) + L1ME1 LINE/L1 4385 4384 (1761) 1120962
# 1943 23.8 5.0 12.6 chr15 65775909 65775924 (36755468) + L1MC4 LINE/L1 5563 5562 (2480) 1299299
# 2463 25.1 5.0 11.6 chr3 121291056 121291083 (76731347) + L1M3 LINE/L1 4539 4538 (1608) 2589267
# reload the table
hgsql -e "drop table rmsk;" hg19
hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.clean.out
# try masking with this clean file:
twoBitMask /hive/data/genomes/hg19/hg19.unmasked.2bit hg19.clean.out \
hg19.clean.2bit
twoBitToFa hg19.clean.2bit stdout | faSize stdin > faSize.clean.txt
cat faSize.clean.txt
# this gives the lower by 75 bases result:
# 3137161264 bases (239850802 N's 2897310462 real 1431585763 upper
# 1465724699 lower) in 93 sequences in 1 files
# %46.72 masked total, %50.59 masked real
featureBits -countGaps hg19 rmsk
# 1465724774 bases of 3137161264 (46.721%) in intersection
# is the countGaps interferring ?
featureBits hg19 rmsk
# 1465724774 bases of 2897316137 (50.589%) in intersection
# nope, lets' see what the .out file has:
grep chr hg19.clean.out | sed -e "s/^ *//" | awk '{print $5,$6-1,$7}' \
| sort -k1,1 -k2,2n > hg19.clean.out.bed
featureBits -countGaps hg19 hg19.clean.out.bed
# 1465724774 bases of 3137161264 (46.721%) in intersection
# is it perhaps not masking N's ?
twoBitToFa hg19.clean.2bit stdout | grep n | less
# that does find some lower case n's, find all N's:
findMotif -strand=+ -motif=gattaca -verbose=4 hg19.clean.2bit \
2> findMotif.out
grep "^#GAP" findMotif.out | sed -e "s/#GAP //" > nLocations.bed
# which cover:
featureBits -countGaps hg19 nLocations.bed
# 251299071 bases of 3137161264 (8.010%) in intersection
# overlapping rmsk business with these N locations:
featureBits -countGaps hg19 hg19.clean.out.bed nLocations.bed
# 6494740 bases of 3137161264 (0.207%) in intersection
# and overlapping with gap:
featureBits -countGaps hg19 gap nLocations.bed
# 239845127 bases of 3137161264 (7.645%) in intersection
############################################################################
# running TRF simple repeats (DONE - 2009-03-05 - Hiram)
screen # use screen to manage this day-long job
mkdir /hive/data/genomes/hg19/bed/simpleRepeat
cd /hive/data/genomes/hg19/bed/simpleRepeat
time doSimpleRepeat.pl -bigClusterHub=pk -workhorse=hgwdev \
-smallClusterHub=pk -buildDir=`pwd` hg19 > do.log 2>&1
# real 33m25.815s
twoBitMask bed/repeatMasker/hg19.clean.2bit \
-add bed/simpleRepeat/trfMask.bed hg19.2bit
twoBitToFa hg19.2bit stdout | faSize stdin > faSize.hg19.2bit.txt
# 3137161264 bases (239850802 N's 2897310462 real 1430387259 upper
# 1466923203 lower) in 93 sequences in 1 files
# %46.76 masked total, %50.63 masked real
############################################################################
# prepare cluster data (DONE - 2009-03-06 - Hiram)
cd /hive/data/genomes/hg19
rm /gbdb/hg19/hg19.2bit
ln -s `pwd`/hg19.2bit /gbdb/hg19/hg19.2bit
time blat hg19.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 30675 overused 11-mers to 11.ooc
# real 3m11.302s
mkdir /hive/data/staging/data/hg19
cp -p hg19.2bit /hive/data/staging/data/hg19
cp -p 11.ooc /hive/data/staging/data/hg19
cp -p chrom.sizes /hive/data/staging/data/hg19
mkdir separateChrs
cd separateChrs
grep -v "_" ../chrom.sizes | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.chrOnly.2bit
twoBitInfo hg19.chrOnly.2bit stdout | sort -k2,2nr > chrOnly.chrom.sizes
grep "_hap" ../chrom.sizes | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.hapOnly.2bit
twoBitInfo hg19.hapOnly.2bit stdout | sort -k2,2nr > hapOnly.chrom.sizes
grep "_" ../chrom.sizes | grep -v "_hap" | awk '{print $1}' | while read C
do
twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.scaffolds.2bit
twoBitInfo hg19.scaffolds.2bit stdout | sort -k2,2nr > scaffolds.chrom.sizes
cp -p *.2bit *.sizes /hive/data/staging/data/hg19
# ask admin to sync this directory: /hive/data/staging/data/hg19/
# to the kluster nodes /scratch/data/hg19/
############################################################################
# running cpgIsland business (DONE - 2009-03-06 - Hiram)
mkdir /hive/data/genomes/hg19/bed/cpgIsland
cd /hive/data/genomes/hg19/bed/cpgIsland
cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
# comment out the following two lines if it compiles cleanly
# some day (there were some other fixups too, adding include lines)
sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
mv tmp.c cpg_lh.c
make
cd ../../
ln -s hg3rdParty/cpgIslands/cpglh.exe
mkdir -p hardMaskedFa
cut -f1 ../../chrom.sizes | while read C
do
echo ${C}
twoBitToFa ../../hg19.2bit:$C stdout \
| maskOutFa stdin hard hardMaskedFa/${C}.fa
done
cut -f1 ../../chrom.sizes > chr.list
cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out line results/$(root1).cpg}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > runOne
#!/bin/csh -fe
./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
mv /scratch/tmp/$1.$$ $2
'_EOF_'
# << happy emacs
gensub2 chr.list single template jobList
para create jobList
para try
para check ... etc
para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 172s 2.86m 0.05h 0.00d 0.000 y
# IO & Wait Time: 1748s 29.14m 0.49h 0.02d 0.000 y
# Average job time: 21s 0.34m 0.01h 0.00d
# Longest finished job: 34s 0.57m 0.01h 0.00d
# Submission to last job: 83s 1.38m 0.02h 0.00d
# Transform cpglh output to bed +
catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed
cd /hive/data/genomes/hg19/bed/cpgIsland
hgLoadBed hg19 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 28226 elements of size 10
# Sorted
# Saving bed.tab
# Loading hg18
############################################################################
# create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram)
mkdir /hive/data/genomes/hg19/bed/gap
cd /hive/data/genomes/hg19/bed/gap
gapToLift hg19 hg19.unBridged.lift -bedFile=unBridged.lift.bed
cp -p hg19.unBridged.lift ../../jkStuff
cp -p hg19.unBridged.lift /hive/data/staging/data/hg19
############################################################################
# AUTO UPDATE GENBANK RUN (DONE - 2009-03-07,13 - Hiram)
# align with latest genbank process.
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add hg19 just after hg18
# hg19 - GRCh37 - Genome Reference Consortium Human Reference 37
# Assembly Accession: GCA_000001405.1
hg19.serverGenome = /hive/data/genomes/hg19/hg19.2bit
hg19.clusterGenome = /scratch/data/hg19/hg19.2bit
hg19.ooc = /scratch/data/hg19/11.ooc
hg19.lift = /scratch/data/hg19/hg19.unBridged.lift
# hg19.hapRegions = /hive/data/genomes/hg19/bed/haplotypePos/haplotypePos.psl
hg19.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter}
hg19.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter}
hg19.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
hg19.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter}
hg19.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
hg19.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter}
hg19.genbank.est.xeno.load = yes
hg19.refseq.mrna.xeno.load = yes
hg19.refseq.mrna.xeno.loadDesc = yes
hg19.mgc = yes
hg19.orfeome = yes
hg19.downloadDir = hg19
# hg19.ccds.ncbiBuild = 36.3
# hg19.upstreamGeneTbl = refGene
# hg19.upstreamMaf = multiz28way
# /hive/data/genomes/hg19/bed/multiz28way/species.lst multiz44way
# /hive/data/genomes/hg19/bed/multiz44way/species.list
hg19.genbank.mrna.blatTargetDb = yes
cvs ci -m "Added hg19." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial hg19 &
# logFile: var/build/logs/2009.03.10-20:28:44.hg19.initalign.log
# real 2761m13.680s
# that ran on the swarm with little interference and no problems
# load database when finished
ssh hgwdev
screen # use screen to manage this long running command
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19 &
# logFile: var/dbload/hgwdev/logs/2009.03.12-21:10:02.dbload.log
# real 369m11.941s
# enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add hg19 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added hg19 - Human - GRCh37" etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# BLATSERVERS ENTRY (DONE - 2009-03-09 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("hg19", "blat13", "17778", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("hg19", "blat13", "17779", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
# Making download files (DONE - 2009-03-13 - Hiram)
cd /hive/data/genomes/hg19
makeDownloads.pl -allowMissedTrfs -noChromRoot hg19 \
> downloads.log 2>&1
############################################################################
# Venter1 chain, net experiment (DONE - Hiram - 2009-03-15)
doBlastzChainNet.pl `pwd`/DEF \
-stop=partition -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > partition.log 2>&1
doBlastzChainNet.pl `pwd`/DEF \
-continue=blastz -stop=blastz -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > blastz.log 2>&1
doBlastzChainNet.pl `pwd`/DEF \
-continue=cat -stop=net -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > net.log 2>&1
real 163m28.438s
# to load, run it in debug, then check the load script
doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -continue=load -stop=load -bigClusterHub=swarm \
-debug -smallClusterHub=swarm -chainMinScore=1000 \
-chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > load.log 2>&1
# and create a synNet for multiz, run in debug, and examine script
# to make sure it works correctly
doBlastzChainNet.pl `pwd`/DEF \
-syntenicNet -continue=syntenicNet -stop=syntenicNet \
-debug -bigClusterHub=swarm \
-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
-workhorse=hgwdev -fileServer=hgwdev > synNet.log 2>&1
# real 31m11.216s
############################################################################
# reset position to chr6 haplotype situation
hgsql -e \
'update dbDb set defaultPos="chr6:28343766-33555363" where name="hg19";' \
hgcentraltest
# reset to a smaller range (2009-04-24 - Brooke)
# this is the SOD1 gene, implicated in Lou Gehrig's disease.
hgsql -e \
'update dbDb set defaultPos="chr21:33,031,597-33,041,570" where name="hg19";' \
hgcentraltest
############################################################################
# Self Lastz run (DONE - 2009-03-19 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
cat << '_EOF_'
# human vs human
BLASTZ=lastz
# maximum M allowed with lastz is only 255
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Human Hg19
SEQ2_DIR=/scratch/data/hg19/hg19.2bit
SEQ2_LEN=/scratch/data/hg19/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
# cluster difficulties, finished manually, then:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-continue=cat -workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=swarm > cat.log 2>&1 &
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-continue=load -debug -workhorse=hgwdev \
-stop=load -smallClusterHub=pk -bigClusterHub=swarm > load.debug.log 2>&1 &
# that indicates it would do:
hgLoadChain -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
# adding -normScore
hgLoadChain -normScore -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
############################################################################
# Chimp Lastz run (WORKING - 2009-03-19 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
cd /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
cat << '_EOF_'
# human vs chimp
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/data/panTro2/panTro2.2bit
SEQ2_LEN=/scratch/data/panTro2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
# real 173m22.880s
# cluster problems, continuing after lastz done:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=cat \
-stop=net -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> net.log 2>&1 &
# real 81m20.209s
# continuing with the load and adding syntenicNet
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=load \
-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
-chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> load.log 2>&1 &
# real 47m17.871s
# running the swap
ssh swarm
mkdir /hive/data/genomes/panTro2/bed/blastz.hg19.swap
cd /hive/data/genomes/panTro2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-swap /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \
> swap.log 2>&1 &
+ # real 723m41.377s
+ cat fb.panTro2.chainHg19Link.txt
+ # 2761343871 bases of 2909485072 (94.908%) in intersection
############################################################################
# Creating the pushQ entry (DONE - 2009-03-20 - Hiram)
mkdir /hive/data/genomes/hg19/pushQ
cd /hive/data/genomes/hg19/pushQ
makePushQSql.pl hg19 > hg19.pushQ.sql 2> make.err
# many complaints about the chain and net tables from the haplotype
# experiments, and this table:
# orfeomeGenes
# which is probably in genbank, and these usual ones:
# hg19 does not have seq
# hg19 does not have extFile
############################################################################
# Determine PAR region of X and Y (DONE - 2009-03-20 - Hiram)
mkdir /hive/data/genomes/hg19/bed/parRegion
cd /hive/data/genomes/hg19/bed/parRegion
awk '$5 != "N"' ../../X/chrX.agp | awk '{print $6}' | sort > chrX.cloneList
awk '$5 != "N"' ../../Y/chrY.agp | awk '{print $6}' | sort > chrY.cloneList
comm -12 chrX.cloneList chrY.cloneList > chrXY.par.clone.list
cat chrXY.par.clone.list \
| while read C; do grep "${C}" ../../X/chrX.agp; done \
| sort -k1,1 -k2,2n >> chrX.par.region.agp
cat chrXY.par.clone.list \
| while read C; do grep "${C}" ../../Y/chrY.agp; done \
| sort -k1,1 -k2,2n >> chrY.par.region.agp
awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrY.par.region.agp \
> chrY.par.region.bed
awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrX.par.region.agp \
> chrX.par.region.bed
# use those bed files in custom tracks on hg19 to verify that they
# are two continuous regions with only gaps between these items
# these location extents are: (zero relative)
# chrX 60000 2722842
# chrX 154906585 155260560
# chrY 10000 2649520
# chrY 59034049 59363566
############################################################################
# Gorilla Lastz run (WORKING - 2009-03-21 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
cd /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
cat << '_EOF_'
# human vs gorilla
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
# this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Gorilla gorGor1
SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
screen # use screen to manage this long-running job
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> do.log 2>&1 &
# XXX running
Sat Mar 21 22:22:18 PDT 2009
############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2009-04-02 - Hiram)
ssh pk
mkdir /hive/data/genomes/hg19/bed/linSpecRep
cd /hive/data/genomes/hg19/bed/linSpecRep
# create individual .out files from the master record in ../repeatMasker
mkdir splitOut
cat << '_EOF_' > split.csh
#!/bin/csh -fe
set C = $1
head -3 ../repeatMasker/hg19.clean.out > splitOut/${C}.out
grep "${C} " ../repeatMasker/hg19.clean.out >> splitOut/${C}.out
'_EOF_'
# << happy emacs
cat << '_EOF_' > template
#LOOP
split.csh $(root1) {check out line+ splitOut/$(root1).out}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 ../../chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try ... check ... push ... etc...
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 127s 2.12m 0.04h 0.00d 0.000 y
# IO & Wait Time: 17154s 285.90m 4.76h 0.20d 0.001 y
# Average job time: 186s 3.10m 0.05h 0.00d
# Longest finished job: 224s 3.73m 0.06h 0.00d
# Submission to last job: 280s 4.67m 0.08h 0.00d
# now, we can date and process each of those .out files
# this really should be a single creation of notInOthers
# These four different ones all end up to be the same anyhow
# the notInMouse becomes notInOthers below and the others are removed.
mkdir dateRepeats
cd dateRepeats
cat << '_EOF_' > mkLSR
#!/bin/csh -fe
rm -f $1.out_mus-musculus_rattus_canis-familiaris_bos-taurus
ln -s ../splitOut/$1.out .
/scratch/data/RepeatMasker/DateRepeats \
$1.out -query human -comp mouse -comp rat -comp dog -comp cow
rm $1.out
mkdir -p ../notInMouse ../notInRat ../notInDog ../notInCow
/cluster/bin/scripts/extractRepeats 1 $1.out_mus*-taurus \
> ../notInMouse/$1.out.spec
/cluster/bin/scripts/extractRepeats 2 $1.out_mus*-taurus \
> ../notInRat/$1.out.spec
/cluster/bin/scripts/extractRepeats 3 $1.out_mus*-taurus \
> ../notInDog/$1.out.spec
/cluster/bin/scripts/extractRepeats 4 $1.out_mus*-taurus \
> ../notInCow/$1.out.spec
'_EOF_'
# << happy emacs
chmod +x mkLSR
cat << '_EOF_' > template
#LOOP
./mkLSR $(path1) {check out line+ $(path1).out_mus-musculus_rattus_canis-familiaris_bos-taurus}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../chrom.list single template jobList
para try ... check ... push ... etc...
para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 2441s 40.69m 0.68h 0.03d 0.000 y
# IO & Wait Time: 332s 5.53m 0.09h 0.00d 0.000 y
# Average job time: 30s 0.50m 0.01h 0.00d
# Longest finished job: 125s 2.08m 0.03h 0.00d
# Submission to last job: 454s 7.57m 0.13h 0.01d
done
# these four types of out.spec results all turn out to be identical
# To check identical
cd /hive/data/genomes/hg19/bed/linSpecRep
find . -name "*.out.spec" | \
while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
| sort -k1,1n | sort -t"/" -k3,3 | sed -e "s#./notIn.*/##" \
| sort | uniq -c | less
# You will see they are all a count of 4
# Set them up on scratch data and get to all the kluster nodes:
mkdir /hive/data/staging/data/hg19/lineageSpecificRepeats
cd notInMouse
rsync -a --progress ./ /hive/data/staging/data/hg19/lineageSpecificRepeats
cd ..
mv notInMouse notInOthers
# do not need to keep all of these
rm -fr notInRat notInDog notInCow
# We also need the nibs for blastz runs with lineage specific repeats
mkdir /hive/data/genomes/hg19/bed/nibs
cd /hive/data/genomes/hg19/bed/nibs
cut -f1 ../../chrom.sizes | while read C
do
twoBitToFa -seq=${C} ../../hg19.2bit stdout \
| faToNib -softMask stdin ${C}.nib
echo "${C} done"
done
mkdir /hive/data/staging/data/hg19/nib
rsync -a --progress ./ /hive/data/staging/data/hg19/nib
# Ask cluster-admin to sync /scratch/ filesystem to kluster nodes
#############################################################################
# create gc5Base download file (DONE - 2009-04-24 - Hiram)
cd /hive/data/genomes/hg19/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19 \
/cluster/data/hg19/hg19.2bit | gzip -c > hg19.gc5Base.txt.gz
#############################################################################