src/hg/makeDb/doc/hg19.txt 1.117
1.117 2010/06/07 16:57:12 angie
Regenerated snp131Exceptions, and snp131OrthoPt2Pa2Rm2 which depends on the exceptions. Multiple alignments were flagged even when there were single alignments to a chrom and chrom_hap, which had the effect of excluding haplo regions (even on main chroms) from ortho alignments.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.116
retrieving revision 1.117
diff -b -B -U 4 -r1.116 -r1.117
--- src/hg/makeDb/doc/hg19.txt 2 Jun 2010 23:01:10 -0000 1.116
+++ src/hg/makeDb/doc/hg19.txt 7 Jun 2010 16:57:12 -0000 1.117
@@ -9537,9 +9537,9 @@
#count of snps with weight 2 = 472416
#count of snps with weight 3 = 2536961
#count of snps with weight 10 = 1399430
#Skipped 7 snp mappings due to errors -- see snp131Errors.bed
-#218.985u 11.475s 4:41.38 81.8% 0+0k 0+0io 0pf+0w
+#173.162u 5.982s 7:57.91 37.4% 0+0k 0+0io 3pf+0w
head snp131Errors.bed
#chr13 32953907 32954033 rs80359736 rs80359736 is 126 bases long but refNCBI is different length: CATCATCAGATTTATATTCTCTGTTAACAGAAGGAAAGAGATACAGAATTTATCATCTTGCAACTTCAAAATCTAAAAGTAAATCTGAAAGAGCTAACAT
#chr17 41223118 41223133 rs80359888 Missing observed value (deleted SNP?).
#chr17 41245687 41245900 rs80359886 rs80359886 is 213 bases long but refNCBI is different length: AATATGCCTGGTAGAAGACTTCCTCCTCAGCCTATTCTTTTTAGGTGCTTTTGAATTGTGGATATTTAATTCGAGTTCCATATTGCTTATACTGCTGCTT
@@ -9551,10 +9551,9 @@
# 26033053 snp131.bed
# 22 snp131.sql
# 7 snp131Errors.bed
# 18 snp131ExceptionDesc.tab
-# 4859728 snp131Exceptions.bed
-# 23653726 snp131Seq.tab
+# 4281351 snp131Exceptions.bed
# 8M new snps, lots more exceptions than snp130 (had 2631563)
# Make one big fasta file.
# It's a monster: 18G! Can we split by hashing rsId?
@@ -9584,9 +9583,9 @@
#162.666u 19.611s 8:53.56 34.1% 0+0k 0+0io 0pf+0w
hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
hg19 snp131Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
snp131Exceptions.bed
-#Loaded 4859728 elements of size 5
+#Loaded 4281351 elements of size 5
#32.020u 2.006s 1:22.87 41.0% 0+0k 0+0io 0pf+0w
hgLoadSqlTab hg19 snp131ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
snp131ExceptionDesc.tab
# Load up sequences.
@@ -9599,9 +9598,9 @@
# Look at the breakdown of exception categories:
cd /hive/data/outside/dbSNP/131/human
cut -f 5 snp131Exceptions.bed | sort | uniq -c | sort -nr
-#3666812 MultipleAlignments
+#3088435 MultipleAlignments
# 886159 ObservedMismatch
# 92341 SingleClassTriAllelic
# 70184 SingleClassZeroSpan
# 43319 ObservedTooLong
@@ -9623,9 +9622,13 @@
# Sent a few bug reports to dbSNP
############################################################################
-# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP131 (DONE 4/15/10 angie)
+# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP131 (DONE 6/3/10 angie)
+# First done 4/15/10. Then found that SNPs that appeared on both a main chrom
+# (like chr6) and on a haplo chrom (like chr6_cox_hap2) were being flagged
+# as multiple alignments when they should not be, excluding them from this.
+# Regenerated exceptions, then regenerated this.
mkdir /hive/data/genomes/hg19/bed/snp131Ortho
cd /hive/data/genomes/hg19/bed/snp131Ortho
# Following Heather's lead in snp126orthos, filter SNPs to keep
@@ -9642,9 +9645,10 @@
| grep -vFwf snp131ExcludeIds.txt \
> snp131Simple.bed
#333.829u 11.879s 3:57.31 145.6% 0+0k 0+0io 0pf+0w
wc -l snp131Simple.bed
-#17337248 snp131Simple.bed
+#17784981 snp131Simple.bed
+#with too many SNPs excluded, was: 17337248 snp131Simple.bed
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
awk 'BEGIN{OFS="\t";} \
@@ -9653,10 +9657,8 @@
0, $6;}' \
snp131Simple.bed > snp131ForLiftOver.bed
# Map coords to chimp using liftOver.
- # I don't know why chimp took so much longer than macaque... the
- # chimp .over has fewer chains and fewer bytes than the macaque .over.
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../snp131ForLiftOver.bed 25000 split/chunk
@@ -9666,17 +9668,17 @@
/hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
>> jobList
end
- ssh pk
+ ssh swarm
cd /hive/data/genomes/hg19/bed/snp131Ortho/run.liftOChimp
para make jobList
-#Completed: 694 of 694 jobs
-#CPU time in finished jobs: 114883s 1914.72m 31.91h 1.33d 0.004 y
-#IO & Wait Time: 3183s 53.05m 0.88h 0.04d 0.000 y
-#Average job time: 170s 2.84m 0.05h 0.00d
-#Longest finished job: 460s 7.67m 0.13h 0.01d
-#Submission to last job: 525s 8.75m 0.15h 0.01d
+#Completed: 712 of 712 jobs
+#CPU time in finished jobs: 127853s 2130.88m 35.51h 1.48d 0.004 y
+#IO & Wait Time: 11528s 192.14m 3.20h 0.13d 0.000 y
+#Average job time: 196s 3.26m 0.05h 0.00d
+#Longest finished job: 506s 8.43m 0.14h 0.01d
+#Submission to last job: 676s 11.27m 0.19h 0.01d
# Map coords to orangutan using liftOver.
mkdir ../run.liftOPon
cd ../run.liftOPon
@@ -9689,14 +9691,15 @@
\{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
>> jobList
end
para make jobList
-#Completed: 694 of 694 jobs
-#CPU time in finished jobs: 224315s 3738.58m 62.31h 2.60d 0.007 y
-#IO & Wait Time: 18237s 303.95m 5.07h 0.21d 0.001 y
-#Average job time: 349s 5.82m 0.10h 0.00d
-#Longest finished job: 1010s 16.83m 0.28h 0.01d
-#Submission to last job: 1024s 17.07m 0.28h 0.01d
+# on pk:
+#Completed: 712 of 712 jobs
+#CPU time in finished jobs: 230882s 3848.03m 64.13h 2.67d 0.007 y
+#IO & Wait Time: 3660s 61.00m 1.02h 0.04d 0.000 y
+#Average job time: 329s 5.49m 0.09h 0.00d
+#Longest finished job: 1019s 16.98m 0.28h 0.01d
+#Submission to last job: 1667s 27.78m 0.46h 0.02d
# Map coords to macaque using liftOver.
mkdir ../run.liftOMac
cd ../run.liftOMac
@@ -9709,14 +9712,14 @@
\{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
>> jobList
end
para make jobList
-#Completed: 694 of 694 jobs
-#CPU time in finished jobs: 289558s 4825.96m 80.43h 3.35d 0.009 y
-#IO & Wait Time: 23402s 390.04m 6.50h 0.27d 0.001 y
-#Average job time: 451s 7.52m 0.13h 0.01d
-#Longest finished job: 893s 14.88m 0.25h 0.01d
-#Submission to last job: 906s 15.10m 0.25h 0.01d
+#Completed: 712 of 712 jobs
+#CPU time in finished jobs: 281168s 4686.14m 78.10h 3.25d 0.009 y
+#IO & Wait Time: 22164s 369.39m 6.16h 0.26d 0.001 y
+#Average job time: 426s 7.10m 0.12h 0.00d
+#Longest finished job: 868s 14.47m 0.24h 0.01d
+#Submission to last job: 872s 14.53m 0.24h 0.01d
cd /hive/data/genomes/hg19/bed/snp131Ortho
# Concatenate the chimp results, sorting by chimp pos in order to
# efficiently access 2bit sequence in getOrthoSeq. The output of
@@ -9732,11 +9735,14 @@
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
| sort > rheMac2.orthoGlom.txt
wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
-# 16230258 panTro2.orthoGlom.txt
-# 15535287 ponAbe2.orthoGlom.txt
-# 13996256 rheMac2.orthoGlom.txt
+# 16641106 panTro2.orthoGlom.txt
+# 15796202 ponAbe2.orthoGlom.txt
+# 14289736 rheMac2.orthoGlom.txt
+#was: 16230258 panTro2.orthoGlom.txt
+#was: 15535287 ponAbe2.orthoGlom.txt
+#was: 13996256 rheMac2.orthoGlom.txt
# Use the glommed name field as a key to join up chimp and macaque
# allele data. Include glommed name from both files because if only
# file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop
@@ -9767,19 +9773,20 @@
s/^.*$//;' \
| sort -k1,1 -k2n,2n > snp131OrthoPt2Pa2Rm2.bed
#437.114u 37.309s 6:33.92 120.4% 0+0k 0+0io 0pf+0w
wc -l snp131OrthoPt2Pa2Rm2.bed
-#16842459 snp131OrthoPt2Pa2Rm2.bed
+#17276174 snp131OrthoPt2Pa2Rm2.bed
+#was: 16842459 snp131OrthoPt2Pa2Rm2.bed
hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
-sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
hg19 snp131OrthoPt2Pa2Rm2 snp131OrthoPt2Pa2Rm2.bed
-#Loaded 16842459 elements of size 22
-#104.012u 12.931s 7:21.75 26.4% 0+0k 0+0io 0pf+0w
+#Loaded 17276174 elements of size 22
+#123.287u 13.079s 8:17.88 27.3% 0+0k 0+0io 0pf+0w
# Cleanup:
nice gzip snp131Simple.bed snp131ExcludeIds.txt snp131ForLiftOver.bed
- rm -r run*/split tmp.txt *.orthoGlom.txt
+ rm -r run*/split tmp.txt *.orthoGlom.txt bed.tab
############################################################################
# DBSNP CODING ANNOTATIONS (DONE 5/25/10 angie)