src/hg/makeDb/doc/hg19.txt 1.114
1.114 2010/05/26 16:17:24 angie
Rebuilt snp131 and snp131CodingDbSnp with corrected function codes from dbSNP.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.113
retrieving revision 1.114
diff -b -B -U 4 -r1.113 -r1.114
--- src/hg/makeDb/doc/hg19.txt 24 May 2010 15:41:40 -0000 1.113
+++ src/hg/makeDb/doc/hg19.txt 26 May 2010 16:17:24 -0000 1.114
@@ -9209,9 +9209,11 @@
'LOAD DATA LOCAL INFILE "ucscToEnsemblV57.tab" INTO TABLE ucscToEnsembl'
############################################################################
-# dbSNP BUILD 131 (SNP131) (DONE 4/15/10 angie)
+# dbSNP BUILD 131 (SNP131) (DONE 5/25/10 angie)
+# Originally done 4/15/10 -- updated 5/25 with corrected function codes from
+# dbSNP (b131_SNPContigLocusId_37_1.bcp.gz).
# Set up build directory
mkdir -p /hive/data/outside/dbSNP/131/{human,shared}
# Get field encodings -- if there are changes or additions to the
@@ -9294,9 +9296,9 @@
#Celera
#GRCh37
#Homo sapiens MT
#HuRef
- foreach t (ContigInfo MapInfo)
+ foreach t (ContigInfo MapInfo ContigLocusId)
zcat $t.gz \
| egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg19snp131 $t placeholder stdin
@@ -9383,19 +9385,15 @@
#Warning 1264 Out of range value adjusted for column 'last_updated_time' at row 2
#Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 2
#Warning 1366 Incorrect integer value: '' for column 'map_property' at row 2
# ... no big deal.
-#*** NOTE FOR NEXT TIME: grep for ref. contigs in ContigLocusId.gz (othw. huge):
- zcat ContigLocusId.gz \
- | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
- | hgLoadSqlTab -oldTable hg19snp131 ContigLocusId placeholder stdin
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
echo -n "${t}:\t"
hgsql -N -B hg19snp131 -e 'select count(*) from '$t
end
#ContigInfo: 260
#ContigLoc: 27500025
-#ContigLocusId: 110690222
+#ContigLocusId: 55347972
#MapInfo: 23619373
#SNP: 23653729
@@ -9416,9 +9414,9 @@
from ContigLocusId as cli, ContigInfo as ci \
where cli.ctg_id = ci.ctg_id;' \
> ncbiFuncAnnotations.txt
wc -l ncbiFuncAnnotations.txt
-#33665062 ncbiFuncAnnotations.txt
+#16835438 ncbiFuncAnnotations.txt
# Ignore function code 8 (cds-reference, just means that some allele matches reference)
# and glom functions for each SNP id:
cut -f 1-4,6,11 ncbiFuncAnnotations.txt \
| sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \
@@ -9434,9 +9432,9 @@
} \
print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n"' \
> ucscFunc.txt
wc -l ucscFunc.txt
-#10328243 ucscFunc.txt
+#10328697 ucscFunc.txt
cat > ucscFunc.sql <<EOF
CREATE TABLE ucscFunc (
snp_id int NOT NULL ,
ctg_id int(10) NOT NULL ,
@@ -9476,9 +9474,8 @@
# Add indices to tables for a big join (5 or 6 minutes):
hgsql hg19snp131 -e \
'alter table ContigLoc add index (ctg_id); \
- alter table ContigLocusId add index (snp_id); \
alter table SNP add index (snp_id); \
alter table MapInfo add index (snp_id);'
# Big leftie join to bring together all of the columns that we want in snp131,
@@ -9496,12 +9493,11 @@
LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id and uf.ctg_id = cl.ctg_id \
and uf.asn_from = cl.asn_from;' \
> ucscNcbiSnp.ctg.bed
-#69.323u 12.056s 22:37.20 5.9% 0+0k 0+0io 0pf+0w
+#75.815u 13.622s 32:04.35 4.6% 0+0k 0+0io 0pf+0w
wc -l ucscNcbiSnp.ctg.bed
#27500025 ucscNcbiSnp.ctg.bed
-
# Use liftUp for everything except mito, then liftOver for mito:
# There are some weird cases of length=1 but locType=range... in all the cases
# that I checked, the length really seems to be 1 so I'm not sure where they got
# the locType=range. Tweak locType in those cases so we can keep those SNPs:
@@ -9520,9 +9516,9 @@
| liftOver -bedPlus=3 stdin NC_012920ToChrM.over.chain stdout chrM.unmapped \
| awk -F"\t" 'BEGIN{OFS="\t";} {$3 -= 1; print;}' \
| sort -k2n,2n \
> chrMNcbiSnp.bed
-#3.523u 2.265s 0:30.99 18.6% 0+0k 0+0io 0pf+0w
+#3.479u 2.428s 0:53.57 10.9% 0+0k 0+0io 4pf+0w
# Good, got all but 2 SNPS (rs28693675 and rs55749223, partially deleted / deleted in new)
cat chrMNcbiSnp.bed >> ucscNcbiSnp.bed
wc -l ucscNcbiSnp.bed
#27500023 ucscNcbiSnp.bed
@@ -9555,9 +9551,9 @@
# 7 snp131Errors.bed
# 18 snp131ExceptionDesc.tab
# 4859728 snp131Exceptions.bed
# 23653726 snp131Seq.tab
- # 8M new snps, lots more exceptions than snp130 (2631563)
+ # 8M new snps, lots more exceptions than snp130 (had 2631563)
# Make one big fasta file.
# It's a monster: 18G! Can we split by hashing rsId?
zcat rs_fasta/rs_ch*.fas.gz \
@@ -9621,10 +9617,9 @@
# 1 RefAlleleRevComp
# 1 ObservedWrongFormat
# Compared to snp130, nice to see fewer disfunctional locTypes (FlankMismatch*)
# and singleClassQuadAllelic -- major increases in most others though.
-
-#TODO: go through those and send some bug reports to dbSNP.
+ # Sent a few bug reports to dbSNP
############################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP131 (DONE 4/15/10 angie)
@@ -9783,8 +9778,63 @@
nice gzip snp131Simple.bed snp131ExcludeIds.txt snp131ForLiftOver.bed
rm -r run*/split tmp.txt *.orthoGlom.txt
+############################################################################
+# DBSNP CODING ANNOTATIONS (DONE 5/25/10 angie)
+# also done 4/16, but found some strange function codes and notified dbSNP;
+# then updated 5/25/10 with corrected function codes (b131_SNPContigLocusId_37_1.bcp.gz).
+# originally done 6/2/09 - redone w/snp131, using mapping locations of dbSNP's func. annos
+ cd /hive/data/outside/dbSNP/131/human
+ awk -F"\t" '$9 != NULL {print;}' ncbiFuncAnnotations.txt \
+ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n | uniq > ncbiCodingAnnotations.txt
+ wc -l ncbiCodingAnnotations.txt
+#928279 ncbiCodingAnnotations.txt
+ # How many & what kinds of function types?
+ cut -f 6 ncbiCodingAnnotations.txt \
+ | sort -n | uniq -c
+# 25 0 (empty fxn_class in ContigLocusId! reported to dbSNP)
+# 168639 3 (coding-synon)
+# 443295 8 (cds-reference -- ignored)
+# 9790 41 (nonsense)
+# 261906 42 (missense)
+# 44624 44 (frameshift)
+ # Gather up multiple annotation lines into one line per {snp, gene, frame}:
+ perl -e 'while (<>) { chomp; \
+ my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
+ if (defined $lastRs && \
+ ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
+ $lastTx ne $txId || $lastFrm ne $frm)) { \
+ if (defined $refRow) { \
+ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \
+ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \
+ } \
+ print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
+ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
+ $refRow = undef; @rows = (); ($count, $fxns, $nts, $codons, $aas) = (); \
+ } \
+ ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
+ ($rsId, $ctg, $s, $e, $txId, $frm); \
+ $count++; \
+ if ($fxn == 8) { \
+ $refRow = [$fxn, $nt, $aa, $codon]; \
+ } else { \
+ $fxns .= "$fxn,"; $nts .= "$nt,"; $aas .= "$aa,"; $codons .= "$codon,"; \
+ } \
+ } \
+ if (defined $refRow) { \
+ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \
+ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \
+ } \
+ print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
+ "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
+ ncbiCodingAnnotations.txt \
+ | liftUp snp131CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
+ hgLoadBed hg19 snp131CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
+ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
+ snp131CodingDbSnp.bed
+#Loaded 443454 elements of size 11
+
##############################################################################
# RE-BUILD sno/miRNA TRACK (DONE - 04-20-2010 - Chin)