src/hg/makeDb/doc/hg19.txt 1.114

1.114 2010/05/26 16:17:24 angie
Rebuilt snp131 and snp131CodingDbSnp with corrected function codes from dbSNP.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.113
retrieving revision 1.114
diff -b -B -U 4 -r1.113 -r1.114
--- src/hg/makeDb/doc/hg19.txt	24 May 2010 15:41:40 -0000	1.113
+++ src/hg/makeDb/doc/hg19.txt	26 May 2010 16:17:24 -0000	1.114
@@ -9209,9 +9209,11 @@
 'LOAD DATA LOCAL INFILE "ucscToEnsemblV57.tab" INTO TABLE ucscToEnsembl'
 
 
 ############################################################################
-# dbSNP BUILD 131 (SNP131) (DONE 4/15/10 angie)
+# dbSNP BUILD 131 (SNP131) (DONE 5/25/10 angie)
+# Originally done 4/15/10 -- updated 5/25 with corrected function codes from
+# dbSNP (b131_SNPContigLocusId_37_1.bcp.gz).
     # Set up build directory
     mkdir -p /hive/data/outside/dbSNP/131/{human,shared}
 
     # Get field encodings -- if there are changes or additions to the
@@ -9294,9 +9296,9 @@
 #Celera
 #GRCh37
 #Homo sapiens MT
 #HuRef
-    foreach t (ContigInfo MapInfo)
+    foreach t (ContigInfo MapInfo ContigLocusId)
       zcat $t.gz \
       | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
       | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
       | hgLoadSqlTab -oldTable hg19snp131 $t placeholder stdin
@@ -9383,19 +9385,15 @@
 #Warning 1264 Out of range value adjusted for column 'last_updated_time' at row 2
 #Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 2
 #Warning 1366 Incorrect integer value: '' for column 'map_property' at row 2
     # ... no big deal.
-#*** NOTE FOR NEXT TIME: grep for ref. contigs in ContigLocusId.gz (othw. huge):
-    zcat ContigLocusId.gz \
-    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
-    | hgLoadSqlTab -oldTable hg19snp131 ContigLocusId placeholder stdin
     foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
      echo -n "${t}:\t"
       hgsql -N -B hg19snp131 -e 'select count(*) from '$t
     end
 #ContigInfo:           260
 #ContigLoc:       27500025
-#ContigLocusId:  110690222
+#ContigLocusId:   55347972
 #MapInfo:         23619373
 #SNP:    	  23653729
 
 
@@ -9416,9 +9414,9 @@
                            from ContigLocusId as cli, ContigInfo as ci \
                            where cli.ctg_id = ci.ctg_id;' \
       > ncbiFuncAnnotations.txt
     wc -l ncbiFuncAnnotations.txt
-#33665062 ncbiFuncAnnotations.txt
+#16835438 ncbiFuncAnnotations.txt
     # Ignore function code 8 (cds-reference, just means that some allele matches reference)
     # and glom functions for each SNP id:
     cut -f 1-4,6,11 ncbiFuncAnnotations.txt \
     | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \
@@ -9434,9 +9432,9 @@
                 } \
                 print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n"' \
       > ucscFunc.txt
     wc -l ucscFunc.txt
-#10328243 ucscFunc.txt
+#10328697 ucscFunc.txt
     cat > ucscFunc.sql <<EOF
 CREATE TABLE ucscFunc (
         snp_id int NOT NULL ,
         ctg_id int(10) NOT NULL ,
@@ -9476,9 +9474,8 @@
 
     # Add indices to tables for a big join (5 or 6 minutes):
     hgsql hg19snp131 -e \
       'alter table ContigLoc  add index (ctg_id); \
-       alter table ContigLocusId add index (snp_id); \
        alter table SNP        add index (snp_id); \
        alter table MapInfo    add index (snp_id);'
 
     # Big leftie join to bring together all of the columns that we want in snp131,
@@ -9496,12 +9493,11 @@
         LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
        LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id and uf.ctg_id = cl.ctg_id \
                                 and uf.asn_from = cl.asn_from;' \
       > ucscNcbiSnp.ctg.bed
-#69.323u 12.056s 22:37.20 5.9%   0+0k 0+0io 0pf+0w
+#75.815u 13.622s 32:04.35 4.6%   0+0k 0+0io 0pf+0w
     wc -l ucscNcbiSnp.ctg.bed 
 #27500025 ucscNcbiSnp.ctg.bed
-
     # Use liftUp for everything except mito, then liftOver for mito:
     # There are some weird cases of length=1 but locType=range... in all the cases 
     # that I checked, the length really seems to be 1 so I'm not sure where they got 
     # the locType=range.  Tweak locType in those cases so we can keep those SNPs:
@@ -9520,9 +9516,9 @@
     | liftOver -bedPlus=3 stdin NC_012920ToChrM.over.chain stdout chrM.unmapped \
     | awk -F"\t" 'BEGIN{OFS="\t";} {$3 -= 1; print;}' \
     | sort -k2n,2n \
       > chrMNcbiSnp.bed
-#3.523u 2.265s 0:30.99 18.6%     0+0k 0+0io 0pf+0w
+#3.479u 2.428s 0:53.57 10.9%     0+0k 0+0io 4pf+0w
     # Good, got all but 2 SNPS (rs28693675 and rs55749223, partially deleted / deleted in new)
     cat chrMNcbiSnp.bed >> ucscNcbiSnp.bed
     wc -l ucscNcbiSnp.bed
 #27500023 ucscNcbiSnp.bed
@@ -9555,9 +9551,9 @@
 #         7 snp131Errors.bed
 #        18 snp131ExceptionDesc.tab
 #   4859728 snp131Exceptions.bed
 #  23653726 snp131Seq.tab
-    # 8M new snps, lots more exceptions than snp130 (2631563)
+    # 8M new snps, lots more exceptions than snp130 (had 2631563)
 
     # Make one big fasta file.
     # It's a monster: 18G!  Can we split by hashing rsId?
     zcat rs_fasta/rs_ch*.fas.gz \
@@ -9621,10 +9617,9 @@
 #      1 RefAlleleRevComp
 #      1 ObservedWrongFormat
     # Compared to snp130, nice to see fewer disfunctional locTypes (FlankMismatch*)
     # and singleClassQuadAllelic -- major increases in most others though.
-
-#TODO: go through those and send some bug reports to dbSNP.
+    # Sent a few bug reports to dbSNP
 
 
 ############################################################################
 # ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP131 (DONE 4/15/10 angie)
@@ -9783,8 +9778,63 @@
     nice gzip snp131Simple.bed snp131ExcludeIds.txt snp131ForLiftOver.bed
     rm -r run*/split tmp.txt *.orthoGlom.txt
 
 
+############################################################################
+# DBSNP CODING ANNOTATIONS (DONE 5/25/10 angie)
+# also done 4/16, but found some strange function codes and notified dbSNP;
+# then updated 5/25/10 with corrected function codes (b131_SNPContigLocusId_37_1.bcp.gz).
+# originally done 6/2/09 - redone w/snp131, using mapping locations of dbSNP's func. annos
+    cd /hive/data/outside/dbSNP/131/human
+    awk -F"\t" '$9 != NULL {print;}' ncbiFuncAnnotations.txt \
+    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n | uniq > ncbiCodingAnnotations.txt
+    wc -l ncbiCodingAnnotations.txt
+#928279 ncbiCodingAnnotations.txt
+    # How many & what kinds of function types?
+    cut -f 6 ncbiCodingAnnotations.txt \
+    | sort -n | uniq -c
+#     25 0   (empty fxn_class in ContigLocusId! reported to dbSNP)
+# 168639 3   (coding-synon)
+# 443295 8   (cds-reference -- ignored)
+#   9790 41  (nonsense)
+# 261906 42  (missense)
+#  44624 44  (frameshift)
+    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
+    perl -e  'while (<>) { chomp; \
+                my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
+                if (defined $lastRs && \
+                    ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
+                     $lastTx ne $txId || $lastFrm ne $frm)) { \
+                  if (defined $refRow) { \
+                    $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
+                    $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
+                  } \
+                  print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
+                        "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
+                  $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
+                } \
+                ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
+                    ($rsId, $ctg, $s, $e, $txId, $frm); \
+                $count++; \
+                if ($fxn == 8) { \
+                  $refRow = [$fxn, $nt, $aa, $codon]; \
+                } else { \
+                 $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
+                } \
+              } \
+              if (defined $refRow) { \
+                $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
+                $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
+              } \
+              print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
+                    "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
+      ncbiCodingAnnotations.txt \
+    | liftUp snp131CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
+    hgLoadBed hg19 snp131CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
+      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
+      snp131CodingDbSnp.bed
+#Loaded 443454 elements of size 11
+
 
 ##############################################################################
 #  RE-BUILD sno/miRNA TRACK (DONE - 04-20-2010 - Chin)