src/hg/makeDb/doc/hg19.txt 1.117

1.117 2010/06/07 16:57:12 angie
Regenerated snp131Exceptions, and snp131OrthoPt2Rm2Pa2 which depends on the exceptions. Multiple alignments were flagged even when there were single alignments to a chrom and chrom_hap, which had the effect of excluding haplo regions (even on main chroms) from ortho alignments.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.116
retrieving revision 1.117
diff -b -B -U 4 -r1.116 -r1.117
--- src/hg/makeDb/doc/hg19.txt	2 Jun 2010 23:01:10 -0000	1.116
+++ src/hg/makeDb/doc/hg19.txt	7 Jun 2010 16:57:12 -0000	1.117
@@ -9537,9 +9537,9 @@
 #count of snps with weight  2 = 472416
 #count of snps with weight  3 = 2536961
 #count of snps with weight 10 = 1399430
 #Skipped 7 snp mappings due to errors -- see snp131Errors.bed
-#218.985u 11.475s 4:41.38 81.8%  0+0k 0+0io 0pf+0w
+#173.162u 5.982s 7:57.91 37.4%   0+0k 0+0io 3pf+0w
     head snp131Errors.bed
 #chr13   32953907        32954033        rs80359736      rs80359736 is 126 bases long but refNCBI is different length: CATCATCAGATTTATATTCTCTGTTAACAGAAGGAAAGAGATACAGAATTTATCATCTTGCAACTTCAAAATCTAAAAGTAAATCTGAAAGAGCTAACAT
 #chr17   41223118        41223133        rs80359888      Missing observed value (deleted SNP?).
 #chr17   41245687        41245900        rs80359886      rs80359886 is 213 bases long but refNCBI is different length: AATATGCCTGGTAGAAGACTTCCTCCTCAGCCTATTCTTTTTAGGTGCTTTTGAATTGTGGATATTTAATTCGAGTTCCATATTGCTTATACTGCTGCTT
@@ -9551,10 +9551,9 @@
 #  26033053 snp131.bed
 #        22 snp131.sql
 #         7 snp131Errors.bed
 #        18 snp131ExceptionDesc.tab
-#   4859728 snp131Exceptions.bed
-#  23653726 snp131Seq.tab
+#   4281351 snp131Exceptions.bed
     # 8M new snps, lots more exceptions than snp130 (had 2631563)
 
     # Make one big fasta file.
     # It's a monster: 18G!  Can we split by hashing rsId?
@@ -9584,9 +9583,9 @@
 #162.666u 19.611s 8:53.56 34.1%  0+0k 0+0io 0pf+0w
     hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
       hg19 snp131Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
       snp131Exceptions.bed
-#Loaded 4859728 elements of size 5
+#Loaded 4281351 elements of size 5
 #32.020u 2.006s 1:22.87 41.0%    0+0k 0+0io 0pf+0w
     hgLoadSqlTab hg19 snp131ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
       snp131ExceptionDesc.tab
     # Load up sequences.
@@ -9599,9 +9598,9 @@
 
     # Look at the breakdown of exception categories:
     cd /hive/data/outside/dbSNP/131/human
     cut -f 5 snp131Exceptions.bed | sort | uniq -c | sort -nr
-#3666812 MultipleAlignments
+#3088435 MultipleAlignments
 # 886159 ObservedMismatch
 #  92341 SingleClassTriAllelic
 #  70184 SingleClassZeroSpan
 #  43319 ObservedTooLong
@@ -9623,9 +9622,13 @@
     # Sent a few bug reports to dbSNP
 
 
 ############################################################################
-# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP131 (DONE 4/15/10 angie)
+# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP131 (DONE 6/3/10 angie)
+# First done 4/15/10.  Then found that SNPs that appeared on both a main chrom
+# (like chr6) and on a haplo chrom (like chr6_cox_hap2) were being flagged
+# as multiple alignments when they should not be, excluding them from this.
+# Regenerated exceptions, then regenerated this.
     mkdir /hive/data/genomes/hg19/bed/snp131Ortho
     cd /hive/data/genomes/hg19/bed/snp131Ortho
 
     # Following Heather's lead in snp126orthos, filter SNPs to keep
@@ -9642,9 +9645,10 @@
     | grep -vFwf snp131ExcludeIds.txt \
       > snp131Simple.bed
 #333.829u 11.879s 3:57.31 145.6% 0+0k 0+0io 0pf+0w
     wc -l snp131Simple.bed
-#17337248 snp131Simple.bed
+#17784981 snp131Simple.bed
+#with too many SNPs excluded, was: 17337248 snp131Simple.bed
 
     # Glom all human info that we need for the final table onto the
     # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
     awk 'BEGIN{OFS="\t";} \
@@ -9653,10 +9657,8 @@
                0, $6;}' \
       snp131Simple.bed > snp131ForLiftOver.bed
 
     # Map coords to chimp using liftOver.
-    # I don't know why chimp took so much longer than macaque... the
-    # chimp .over has fewer chains and fewer bytes than the macaque .over.
     mkdir run.liftOChimp
     cd run.liftOChimp
     mkdir split out
     splitFile ../snp131ForLiftOver.bed 25000 split/chunk
@@ -9666,17 +9668,17 @@
         /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
         \{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
         >> jobList
     end
-    ssh pk
+    ssh swarm
     cd /hive/data/genomes/hg19/bed/snp131Ortho/run.liftOChimp
     para make jobList
-#Completed: 694 of 694 jobs
-#CPU time in finished jobs:     114883s    1914.72m    31.91h    1.33d  0.004 y
-#IO & Wait Time:                  3183s      53.05m     0.88h    0.04d  0.000 y
-#Average job time:                 170s       2.84m     0.05h    0.00d
-#Longest finished job:             460s       7.67m     0.13h    0.01d
-#Submission to last job:           525s       8.75m     0.15h    0.01d
+#Completed: 712 of 712 jobs
+#CPU time in finished jobs:     127853s    2130.88m    35.51h    1.48d  0.004 y
+#IO & Wait Time:                 11528s     192.14m     3.20h    0.13d  0.000 y
+#Average job time:                 196s       3.26m     0.05h    0.00d
+#Longest finished job:             506s       8.43m     0.14h    0.01d
+#Submission to last job:           676s      11.27m     0.19h    0.01d
 
     # Map coords to orangutan using liftOver.
     mkdir ../run.liftOPon
     cd ../run.liftOPon
@@ -9689,14 +9691,15 @@
         \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
         >> jobList
     end
     para make jobList
-#Completed: 694 of 694 jobs
-#CPU time in finished jobs:     224315s    3738.58m    62.31h    2.60d  0.007 y
-#IO & Wait Time:                 18237s     303.95m     5.07h    0.21d  0.001 y
-#Average job time:                 349s       5.82m     0.10h    0.00d
-#Longest finished job:            1010s      16.83m     0.28h    0.01d
-#Submission to last job:          1024s      17.07m     0.28h    0.01d
+# on pk:
+#Completed: 712 of 712 jobs
+#CPU time in finished jobs:     230882s    3848.03m    64.13h    2.67d  0.007 y
+#IO & Wait Time:                  3660s      61.00m     1.02h    0.04d  0.000 y
+#Average job time:                 329s       5.49m     0.09h    0.00d
+#Longest finished job:            1019s      16.98m     0.28h    0.01d
+#Submission to last job:          1667s      27.78m     0.46h    0.02d
 
     # Map coords to macaque using liftOver.
     mkdir ../run.liftOMac
     cd ../run.liftOMac
@@ -9709,14 +9712,14 @@
         \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
         >> jobList
     end
     para make jobList
-#Completed: 694 of 694 jobs
-#CPU time in finished jobs:     289558s    4825.96m    80.43h    3.35d  0.009 y
-#IO & Wait Time:                 23402s     390.04m     6.50h    0.27d  0.001 y
-#Average job time:                 451s       7.52m     0.13h    0.01d
-#Longest finished job:             893s      14.88m     0.25h    0.01d
-#Submission to last job:           906s      15.10m     0.25h    0.01d
+#Completed: 712 of 712 jobs
+#CPU time in finished jobs:     281168s    4686.14m    78.10h    3.25d  0.009 y
+#IO & Wait Time:                 22164s     369.39m     6.16h    0.26d  0.001 y
+#Average job time:                 426s       7.10m     0.12h    0.00d
+#Longest finished job:             868s      14.47m     0.24h    0.01d
+#Submission to last job:           872s      14.53m     0.24h    0.01d
 
     cd /hive/data/genomes/hg19/bed/snp131Ortho
     # Concatenate the chimp results, sorting by chimp pos in order to
     # efficiently access 2bit sequence in getOrthoSeq.  The output of
@@ -9732,11 +9735,14 @@
     sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
     | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
     | sort > rheMac2.orthoGlom.txt
     wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
-#  16230258 panTro2.orthoGlom.txt
-#  15535287 ponAbe2.orthoGlom.txt
-#  13996256 rheMac2.orthoGlom.txt
+#  16641106 panTro2.orthoGlom.txt
+#  15796202 ponAbe2.orthoGlom.txt
+#  14289736 rheMac2.orthoGlom.txt
+#was:  16230258 panTro2.orthoGlom.txt
+#was:  15535287 ponAbe2.orthoGlom.txt
+#was:  13996256 rheMac2.orthoGlom.txt
 
     # Use the glommed name field as a key to join up chimp and macaque
     # allele data.  Include glommed name from both files because if only
     # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
@@ -9767,19 +9773,20 @@
         s/^.*$//;' \
     | sort -k1,1 -k2n,2n > snp131OrthoPt2Pa2Rm2.bed
 #437.114u 37.309s 6:33.92 120.4% 0+0k 0+0io 0pf+0w
     wc -l snp131OrthoPt2Pa2Rm2.bed
-#16842459 snp131OrthoPt2Pa2Rm2.bed
+#17276174 snp131OrthoPt2Pa2Rm2.bed
+#was: 16842459 snp131OrthoPt2Pa2Rm2.bed
 
     hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
       -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
       hg19 snp131OrthoPt2Pa2Rm2 snp131OrthoPt2Pa2Rm2.bed
-#Loaded 16842459 elements of size 22
-#104.012u 12.931s 7:21.75 26.4%  0+0k 0+0io 0pf+0w
+#Loaded 17276174 elements of size 22
+#123.287u 13.079s 8:17.88 27.3%  0+0k 0+0io 0pf+0w
 
     # Cleanup:
     nice gzip snp131Simple.bed snp131ExcludeIds.txt snp131ForLiftOver.bed
-    rm -r run*/split tmp.txt *.orthoGlom.txt
+    rm -r run*/split tmp.txt *.orthoGlom.txt bed.tab
 
 
 ############################################################################
 # DBSNP CODING ANNOTATIONS (DONE 5/25/10 angie)