src/hg/makeDb/doc/hg19.txt 1.100
1.100 2010/04/15 22:06:57 angie
Added alternate haplotype contigs to ctgPos. Built snp131.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.99
retrieving revision 1.100
diff -b -B -U 4 -r1.99 -r1.100
--- src/hg/makeDb/doc/hg19.txt 12 Apr 2010 21:32:37 -0000 1.99
+++ src/hg/makeDb/doc/hg19.txt 15 Apr 2010 22:06:57 -0000 1.100
@@ -836,9 +836,9 @@
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19 \
/cluster/data/hg19/hg19.2bit | gzip -c > hg19.gc5Base.txt.gz
#############################################################################
-# Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram)
+# Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram) (Alt. haplotypes added 4/12/10 angie)
mkdir /hive/data/genomes/hg19/bed/ctgPos
cd /hive/data/genomes/hg19/bed/ctgPos
cat << '_EOF_' > mkCtgPos.sh
AGP="/hive/data/genomes/hg19/download/assembled_chromosomes/AGP"
@@ -903,12 +903,33 @@
# fetch .sql definition from hg18
chmod 777 .
hgsqldump --all -c --tab=. hg18 ctgPos
+ # Don't confuse us w/hg18 data:
+ rm ctgPos.txt
chmod 775 .
hgsql hg19 < ctgPos.sql
hgsql -e 'load data local infile "ctgPos.tab" into table ctgPos;' hg19
+ # 4/12/10 (angie): add the alt loci:
+ perl -we 'while (<>) { \
+ next if (/^#/); chomp; @w = split; \
+ $w[0] = lc($w[0]); $w[0] =~ s/^hs//; $w[0] =~ s/_mhc_/_/; $w[0] =~ s/_ctg1$//; \
+ $w[0] =~ s/_apd$/_apd_hap1/; $w[0] =~ s/_cox$/_cox_hap2/; \
+ $w[0] =~ s/_dbb$/_dbb_hap3/; $w[0] =~ s/_mann$/_mann_hap4/; \
+ $w[0] =~ s/_mcf$/_mcf_hap5/; $w[0] =~ s/_qbl$/_qbl_hap6/; \
+ $w[0] =~ s/_ssto$/_ssto_hap7/; \
+ $w[0] =~ s/_1(_ctg\d)/${1}_hap1/; \
+ if ($w[0] eq "chr6_cox_hap2" && $w[8] == 4873745) { $w[8] = 4795371; } # yep, inconsistent \
+ print join("\t", $w[1], $w[8], $w[0], 0, $w[8]) . "\n"; }' \
+ /hive/data/genomes/hg19/download/alternate_loci/*/placed_scaffolds/alt_locus_scaf2primary.pos \
+ >> ctgPos.tab
+ sort -k 3,3 -k4n,4n ctgPos.tab \
+ | hgLoadSqlTab hg19 ctgPos ctgPos.sql stdin
+ # TODO: tell NCBI alternate_loci/ALT_REF_LOCI_2/placed_scaffolds/alt_locus_scaf2primary.pos
+ # has size inconsistent w/AGP, FASTA
+
+
#############################################################################
# CLONE ENDS - first step for BACEND/CytoBand tracks
# (DONE - 2009-04-28 - Hiram)
mkdir -p /hive/data/genomes/hg19/bed/cloneend/ncbi
@@ -9097,5 +9118,423 @@
hgsql hg19 -e 'delete from ucscToEnsembl where ucsc like "%";'
hgsql hg19 -e \
'LOAD DATA LOCAL INFILE "ucscToEnsemblV57.tab" INTO TABLE ucscToEnsembl'
-########################################################################
+
+############################################################################
+# dbSNP BUILD 131 (SNP131) (DONE 4/15/10 angie)
+ # Set up build directory
+ mkdir -p /hive/data/outside/dbSNP/131/{human,shared}
+
+ # Get field encodings -- if there are changes or additions to the
+ # encoding of the corresponding fields, you might need to update
+ # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
+ # hg/lib/snp125Ui.c).
+ cd /hive/data/outside/dbSNP/131/shared
+ alias wg wget --timestamping
+ set ftpShared = ftp://ftp.ncbi.nih.gov/snp/database/shared_data
+ wg $ftpShared/LocTypeCode.bcp.gz
+ wg $ftpShared/SnpClassCode.bcp.gz
+ wg $ftpShared/SnpFunctionCode.bcp.gz
+ wg $ftpShared/SnpValidationCode.bcp.gz
+ # Here is another source -- it is not as up-to-date as the above, but
+ # our encodings (enums and sets in snp131.sql) are named more similar
+ # to those in the 2005 ASN:
+ # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn
+
+ ########################## DOWNLOAD #############################
+ cd /hive/data/outside/dbSNP/131/human
+ mkdir data schema rs_fasta
+ # Get data from NCBI (anonymous FTP)
+ set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
+ wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
+ cd /hive/data/outside/dbSNP/131/human/data
+ # ContigLoc table has coords, orientation, loc_type, and refNCBI allele
+ wg $ftpSnpDb/organism_data/b131_SNPContigLoc_37_1.bcp.gz
+ wg $ftpSnpDb/organism_data/b131_SNPContigLocusId_37_1.bcp.gz
+ wg $ftpSnpDb/organism_data/b131_ContigInfo_37_1.bcp.gz
+ # MapInfo has alignment weights
+ wg $ftpSnpDb/organism_data/b131_SNPMapInfo_37_1.bcp.gz
+ # SNP has univar_id, validation status and heterozygosity
+ wg $ftpSnpDb/organism_data/SNP.bcp.gz
+
+ # Get schema
+ cd /hive/data/outside/dbSNP/131/human/schema
+ wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
+ wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz
+
+ # Get fasta files
+ # using headers of fasta files for molType, class, observed
+ cd /hive/data/outside/dbSNP/131/human/rs_fasta
+ wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
+
+ ########################## LOAD NCBI TABLES #############################
+ # Simplify names of data files -- strip version & extras to get
+ # local canonical table names.
+ cd /hive/data/outside/dbSNP/131/human/data
+ foreach f (*.bcp.gz)
+ set new = `echo $f \
+ | sed -e 's/^b131_SNP//; s/^b131_//; s/_37_1//; s/.bcp//;'`
+ mv $f $new
+ echo $new
+ end
+
+ cd /hive/data/outside/dbSNP/131/human/schema
+ zcat human_9606_table.sql.gz \
+ | perl -we '$/ = "\nGO\n\n\n"; \
+ while (<>) { \
+ next unless /^CREATE TABLE \[(b131_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_37_1)?\]/; \
+ s/b131_(SNP)?//; s/_37_1//; \
+ s/[\[\]]//g; s/GO\n\n/;/; s/smalldatetime/datetime/g; \
+ s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \
+ s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
+ s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \
+ s/(image|varchar\s+\(\d+\))/BLOB/g; \
+ print; \
+ }' \
+ > table.sql
+
+ # load on hgwdev (kolossus disk almost full, no more small cluster mysql5's):
+ hgsql -e 'create database hg19snp131'
+ cd /hive/data/outside/dbSNP/131/human/schema
+ hgsql hg19snp131 < table.sql
+ cd ../data
+
+ # Avoid wasting space by excluding mappings to non-reference contigs (ContigInfo.group_label):
+ zcat ContigInfo.gz | cut -f 12 | uniq | sort -u
+#CRA_TCAGchr7v2
+#Celera
+#GRCh37
+#Homo sapiens MT
+#HuRef
+ foreach t (ContigInfo MapInfo)
+ zcat $t.gz \
+ | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
+ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+ | hgLoadSqlTab -oldTable hg19snp131 $t placeholder stdin
+ end
+
+ # Compare contig list between our ctgPos and reference contigs in ContigInfo.
+ # If they are identical, sweet, we probably have a $db/jkStuff/liftContigs.lft
+ # or similar file to use below. If they are not identical, need to make
+ # lift file using available information.
+ hgsql hg19 -N -B -e 'select contig from ctgPos;' \
+ | sort > /tmp/1
+ # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
+ hgsql hg19snp131 -N -B -e 'select contig_acc from ContigInfo;' | sort > /tmp/2
+ diff /tmp/1 /tmp/2
+ # Doh! Completely different: ctgPos has GL*, ContigInfo has NC_* / NT_*
+ # We will need to generate own liftUp file for N*_* contig IDs.
+
+ # NC_001807 entrez sez "Record removed.This sequence was removed
+ # since the accepted reference sequence for the Homo sapiens
+ # mitochondrion is the rCRS/Mitomap sequence, which is now
+ # available as the record NC_012920".
+ # They align w/gaps on both q & t, so liftUp won't do, we need liftOver:
+ blat -noHead NC_012920.fa /hive/data/genomes/hg19/M/chrM.fa stdout \
+ | axtChain -psl -linearGap=medium stdin -faT NC_012920.fa /hive/data/genomes/hg19/hg19.2bit \
+ NC_012920ToChrM.over.chain
+
+ # NT_004350: entrez sez:
+#COMMENT REFSEQ INFORMATION: Features on this sequence have been produced
+# for build 37 version 1 of the NCBI's genome annotation [see
+# documentation]. The reference sequence is identical to
+# GL000003.1.
+
+ # Using the contigs named in ContigInfo, screen-scrape genbank to get GL ID for contig ID.
+ cp /dev/null contigToGl.txt
+ foreach nt (`hgsql hg19snp131 -N -B -e 'select contig_acc from ContigInfo;'`)
+ wget --quiet -O - 'http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?tool=portal&db=nuccore&val='$nt'&dopt=genbank&sendto=on' \
+ | perl -we 'while (<>) { \
+ if (/^LOCUS/) { \
+ m/^LOCUS\s+'$nt'\s+(\d+) bp/ || die "parse ('$nt'): $_\t"; \
+ $size = $1; \
+ } elsif (/^ (GL\d+)\.\d+\.$/) { \
+ print "'$nt'\t$1\t$size\n"; \
+ } \
+ }' \
+ >> contigToGl.txt
+ end
+ hgsql hg19 -NBe 'select chromStart, SUBSTRING_INDEX(contig, ".", 1), \
+ ctgPos.size, ctgPos.chrom, chromInfo.size \
+ from ctgPos,chromInfo \
+ where ctgPos.chrom=chromInfo.chrom order by contig' \
+ > glToLift.txt
+ sort -k2,2 contigToGl.txt \
+ | join -1 2 -2 2 -t" " -o 1.1,2.1,1.3,1.4,1.5 -a 2 -e MISSING \
+ glToLift.txt - \
+ > /hive/data/genomes/hg19/jkStuff/liftContigs.lft
+
+ # Manually add NC_001807 -> chrM just in case:
+ echo "0 NC_001807 16571 chrM 16571" \
+ >> /hive/data/genomes/hg19/jkStuff/liftContigs.lft
+ # Blat NC_012920 to chrM shows gaps, so we'll need to use liftOver chain created above.
+
+ # Make sure there are no orient != 0 contigs among those selected.
+ hgsql hg19snp131 -NBe \
+ 'select count(*) from ContigInfo where orient != 0;'
+#0
+
+ # ContigLoc is huge, and we want just the reference contig mappings.
+ # So, based on the reference & haplo ctg_id values in ContigInfo,
+ # filter to get just the mappings for those contigs:
+ zcat ContigLoc.gz \
+ | awk '$3 <= 9 || $3 == 6647 || $3 >= 11178' \
+ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+ | hgLoadSqlTab -oldTable hg19snp131 ContigLoc placeholder stdin
+#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 1
+#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 2
+#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 3
+#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 4
+#load of ContigLoc did not go as planned: 27500025 record(s), 0 row(s) skipped, 3273 warning(s) loading /dev/stdin
+ zcat SNP.gz \
+ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+ | hgLoadSqlTab -oldTable hg19snp131 SNP placeholder stdin
+#Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 1
+#Warning 1366 Incorrect integer value: '' for column 'map_property' at row 1
+#Warning 1264 Out of range value adjusted for column 'last_updated_time' at row 2
+#Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 2
+#Warning 1366 Incorrect integer value: '' for column 'map_property' at row 2
+ # ... no big deal.
+#*** NOTE FOR NEXT TIME: grep for ref. contigs in ContigLocusId.gz (othw. huge):
+ zcat ContigLocusId.gz \
+ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+ | hgLoadSqlTab -oldTable hg19snp131 ContigLocusId placeholder stdin
+ foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
+ echo -n "${t}:\t"
+ hgsql -N -B hg19snp131 -e 'select count(*) from '$t
+ end
+#ContigInfo: 260
+#ContigLoc: 27500025
+#ContigLocusId: 110690222
+#MapInfo: 23619373
+#SNP: 23653729
+
+
+ #################### EXTRACT INFO FROM NCBI TABLES ####################
+ # Glom each SNP's function codes together and load up a new hg19Snp131 table.
+ # Also extract NCBI's annotations of coding SNPs' effects on translation.
+ # We extract ContigLocusId info only for reference assembly mapping.
+ # Some SNP's functional annotations are for an alternate assembly, so we will
+ # have no NCBI functional annotations to display for those (but our own are
+ # available).
+ cd /hive/data/outside/dbSNP/131/human
+ # Add indices to tables for a big join (5 or 6 minutes):
+ hgsql hg19snp131 -e \
+ 'alter table ContigInfo add index (ctg_id); \
+ alter table ContigLocusId add index (ctg_id);'
+ hgsql hg19snp131 -NBe 'select snp_id, ci.contig_acc, asn_from, asn_to, mrna_acc, \
+ fxn_class, reading_frame, allele, residue, codon, cli.ctg_id \
+ from ContigLocusId as cli, ContigInfo as ci \
+ where cli.ctg_id = ci.ctg_id;' \
+ > ncbiFuncAnnotations.txt
+ wc -l ncbiFuncAnnotations.txt
+#33665062 ncbiFuncAnnotations.txt
+ # Ignore function code 8 (cds-reference, just means that some allele matches reference)
+ # and glom functions for each SNP id:
+ cut -f 1-4,6,11 ncbiFuncAnnotations.txt \
+ | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \
+ | perl -we 'while (<>) { chomp; \
+ ($id, undef, $s, $e, $f, $c) = split; \
+ if (defined $prevId && $id == $prevId && $c == $prevC && $s == $prevS) { \
+ $prevFunc .= "$f," unless ($f == 8); \
+ } else { \
+ print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if (defined $prevId); \
+ $prevFunc = ($f == 8) ? "" : "$f,"; \
+ } \
+ ($prevId, $prevC, $prevS, $prevE) = ($id, $c, $s, $e); \
+ } \
+ print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n"' \
+ > ucscFunc.txt
+ wc -l ucscFunc.txt
+#10328243 ucscFunc.txt
+ cat > ucscFunc.sql <<EOF
+CREATE TABLE ucscFunc (
+ snp_id int NOT NULL ,
+ ctg_id int(10) NOT NULL ,
+ asn_from int(10) NOT NULL ,
+ asn_to int(10) NOT NULL ,
+ fxn_class varchar(255) NOT NULL ,
+ INDEX snp_id (snp_id),
+ INDEX ctg_id (ctg_id)
+);
+EOF
+ hgLoadSqlTab hg19snp131 ucscFunc{,.sql,.txt}
+
+ # Extract observed allele, molType and snp class from FASTA headers gnl
+ # 4/13: found some inconsistent headers in rs_chPAR.fas.gz vs. other rs_ch*,
+ # reported to dbSNP, Lon said that rs_chPAR.fas.gz snuck in from build 130!
+ rm /hive/data/outside/dbSNP/131/human/rs_fasta/rs_chPAR.fas.gz
+ zcat /hive/data/outside/dbSNP/131/human/rs_fasta/rs_ch*.fas.gz \
+ | grep '^>gnl' \
+ | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
+ | sort -nu \
+ > ucscGnl.txt
+#520.305u 74.766s 7:02.48 140.8% 0+0k 0+0io 0pf+0w
+ wc -l ucscGnl.txt
+#23653726 ucscGnl.txt
+ cut -f 1 ucscGnl.txt | uniq | wc -l
+#23653726
+ cat > ucscGnl.sql <<EOF
+CREATE TABLE ucscGnl (
+ snp_id int NOT NULL ,
+ observed varchar(255) NOT NULL,
+ molType varchar(255) NOT NULL,
+ class varchar(255) NULL ,
+ INDEX snp_id (snp_id)
+);
+EOF
+ hgLoadSqlTab hg19snp131 ucscGnl{,.sql,.txt}
+
+ # Add indices to tables for a big join (5 or 6 minutes):
+ hgsql hg19snp131 -e \
+ 'alter table ContigLoc add index (ctg_id); \
+ alter table ContigLocusId add index (snp_id); \
+ alter table SNP add index (snp_id); \
+ alter table MapInfo add index (snp_id);'
+
+ # Big leftie join to bring together all of the columns that we want in snp131,
+ # using all of the available joining info:
+ hgsql hg19snp131 -NBe \
+ 'SELECT ci.contig_acc, cl.asn_from, cl.asn_to, cl.snp_id, cl.orientation, cl.allele, \
+ ug.observed, ug.molType, ug.class, \
+ s.validation_status, s.avg_heterozygosity, s.het_se, \
+ uf.fxn_class, cl.loc_type, mi.weight, cl.phys_pos_from \
+ FROM \
+ ((((ContigLoc as cl JOIN ContigInfo as ci \
+ ON cl.ctg_id = ci.ctg_id) \
+ LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
+ LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
+ LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
+ LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id and uf.ctg_id = cl.ctg_id \
+ and uf.asn_from = cl.asn_from;' \
+ > ucscNcbiSnp.ctg.bed
+#69.323u 12.056s 22:37.20 5.9% 0+0k 0+0io 0pf+0w
+ wc -l ucscNcbiSnp.ctg.bed
+#27500025 ucscNcbiSnp.ctg.bed
+
+ # Use liftUp for everything except mito, then liftOver for mito:
+ # There are some weird cases of length=1 but locType=range... in all the cases
+ # that I checked, the length really seems to be 1 so I'm not sure where they got
+ # the locType=range. Tweak locType in those cases so we can keep those SNPs:
+ grep -vw ^NC_012920 ucscNcbiSnp.ctg.bed \
+ | awk -F"\t" 'BEGIN{OFS="\t";} $2 == $3 && $14 == 1 {$14=2; numTweaked++;} {print;} \
+ END{print numTweaked, "single-base, locType=range, tweaked locType" > "/dev/stderr";}' \
+ | liftUp ucscNcbiSnp.bed \
+ /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
+#2535 single-base, locType=range, tweaked locType
+#392.182u 27.358s 7:20.66 95.2% 0+0k 0+0io 0pf+0w
+ # For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver
+ # doesn't deal with 0-base items. Fake out phys_pos_from to 0 because many coords
+ # will differ, oh well.
+ grep -w NC_012920 ucscNcbiSnp.ctg.bed \
+ | awk -F"\t" 'BEGIN{OFS="\t";} {$3 += 1; $16 = 0; print;}' \
+ | liftOver -bedPlus=3 stdin NC_012920ToChrM.over.chain stdout chrM.unmapped \
+ | awk -F"\t" 'BEGIN{OFS="\t";} {$3 -= 1; print;}' \
+ | sort -k2n,2n \
+ > chrMNcbiSnp.bed
+#3.523u 2.265s 0:30.99 18.6% 0+0k 0+0io 0pf+0w
+ # Good, got all but 2 SNPS (rs28693675 and rs55749223, partially deleted / deleted in new)
+ cat chrMNcbiSnp.bed >> ucscNcbiSnp.bed
+ wc -l ucscNcbiSnp.bed
+#27500023 ucscNcbiSnp.bed
+
+ # Translate NCBI's encoding into UCSC's, and perform a bunch of checks.
+ # This is where developer involvement is most likely as NCBI extends the
+ # encodings used in dbSNP.
+ cd /hive/data/outside/dbSNP/131/human/
+ snpNcbiToUcsc ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit snp131
+#spaces stripped from observed:
+#chr12 6093134 6093134 rs41402545
+#count of snps with weight 0 = 67535
+#count of snps with weight 1 = 23023681
+#count of snps with weight 2 = 472416
+#count of snps with weight 3 = 2536961
+#count of snps with weight 10 = 1399430
+#Skipped 7 snp mappings due to errors -- see snp131Errors.bed
+#218.985u 11.475s 4:41.38 81.8% 0+0k 0+0io 0pf+0w
+ head snp131Errors.bed
+#chr13 32953907 32954033 rs80359736 rs80359736 is 126 bases long but refNCBI is different length: CATCATCAGATTTATATTCTCTGTTAACAGAAGGAAAGAGATACAGAATTTATCATCTTGCAACTTCAAAATCTAAAAGTAAATCTGAAAGAGCTAACAT
+#chr17 41223118 41223133 rs80359888 Missing observed value (deleted SNP?).
+#chr17 41245687 41245900 rs80359886 rs80359886 is 213 bases long but refNCBI is different length: AATATGCCTGGTAGAAGACTTCCTCCTCAGCCTATTCTTTTTAGGTGCTTTTGAATTGTGGATATTTAATTCGAGTTCCATATTGCTTATACTGCTGCTT
+#chr17 41245687 41245900 rs80359886 Missing observed value (deleted SNP?).
+#chr17 41276085 41276094 rs80359887 Missing observed value (deleted SNP?).
+#chrM 308 310 rs66492218 Unexpected coords for locType "between" (3) -- expected NCBI's chrEnd = chrStart+1.
+#chrM 308 310 rs66492218 rs66492218 is 2 bases long but refNCBI is different length: -
+ wc -l snp*
+# 26033053 snp131.bed
+# 22 snp131.sql
+# 7 snp131Errors.bed
+# 18 snp131ExceptionDesc.tab
+# 4859728 snp131Exceptions.bed
+# 23653726 snp131Seq.tab
+ # 8M new snps, lots more exceptions than snp130 (2631563)
+
+ # Make one big fasta file.
+ # It's a monster: 18G! Can we split by hashing rsId?
+ zcat rs_fasta/rs_ch*.fas.gz \
+ | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
+ > snp131.fa
+ # Check for duplicates.
+ grep ^\>rs snp131.fa | sort > /data/tmp/seqHeaders
+ wc -l /data/tmp/seqHeaders
+#23653726 /data/tmp/seqHeaders
+ uniq /data/tmp/seqHeaders | wc -l
+#23653726
+ # Use hgLoadSeq to generate .tab output for sequence file offsets,
+ # and keep only the columns that we need: acc and file_offset.
+ # Index it and translate to snpSeq table format.
+ hgLoadSeq -test placeholder snp131.fa
+#23653726 sequences
+#128.364u 25.531s 10:52.02 23.6% 0+0k 0+0io 0pf+0w
+ cut -f 2,6 seq.tab > snp131Seq.tab
+ rm seq.tab
+
+ # Load up main track tables.
+ cd /hive/data/outside/dbSNP/131/human
+ hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
+ hg19 snp131 -sqlTable=snp131.sql snp131.bed
+#Loaded 26033053 elements of size 17
+#162.666u 19.611s 8:53.56 34.1% 0+0k 0+0io 0pf+0w
+ hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
+ hg19 snp131Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
+ snp131Exceptions.bed
+#Loaded 4859728 elements of size 5
+#32.020u 2.006s 1:22.87 41.0% 0+0k 0+0io 0pf+0w
+ hgLoadSqlTab hg19 snp131ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
+ snp131ExceptionDesc.tab
+ # Load up sequences.
+ mkdir -p /gbdb/hg19/snp
+ ln -s /hive/data/outside/dbSNP/131/human/snp131.fa /gbdb/hg19/snp/snp131.fa
+ hgLoadSqlTab hg19 snp131Seq ~/kent/src/hg/lib/snpSeq.sql snp131Seq.tab
+
+ # Put in a link where one would expect to find the track build dir...
+ ln -s /hive/data/outside/dbSNP/131/human /hive/data/genomes/hg19/bed/snp131
+
+ # Look at the breakdown of exception categories:
+ cd /hive/data/outside/dbSNP/131/human
+ cut -f 5 snp131Exceptions.bed | sort | uniq -c | sort -nr
+#3666812 MultipleAlignments
+# 886159 ObservedMismatch
+# 92341 SingleClassTriAllelic
+# 70184 SingleClassZeroSpan
+# 43319 ObservedTooLong
+# 25745 MixedObserved
+# 22606 SingleClassLongerSpan
+# 19681 SingleClassQuadAllelic
+# 15245 FlankMismatchGenomeShorter
+# 9808 DuplicateObserved
+# 4463 NamedDeletionZeroSpan
+# 2040 FlankMismatchGenomeLonger
+# 802 ObservedContainsIupac
+# 317 NamedInsertionNonzeroSpan
+# 142 FlankMismatchGenomeEqual
+# 62 RefAlleleMismatch
+# 1 RefAlleleRevComp
+# 1 ObservedWrongFormat
+ # Compared to snp130, nice to see fewer disfunctional locTypes (FlankMismatch*)
+ # and singleClassQuadAllelic -- major increases in most others though.
+
+#TODO: go through those and send some bug reports to dbSNP.
+
+
+############################################################################