src/hg/makeDb/doc/hg19.txt 1.100

1.100 2010/04/15 22:06:57 angie
Added alternate haplotype contigs to ctgPos. Built snp131.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.99
retrieving revision 1.100
diff -b -B -U 4 -r1.99 -r1.100
--- src/hg/makeDb/doc/hg19.txt	12 Apr 2010 21:32:37 -0000	1.99
+++ src/hg/makeDb/doc/hg19.txt	15 Apr 2010 22:06:57 -0000	1.100
@@ -836,9 +836,9 @@
     hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19 \
         /cluster/data/hg19/hg19.2bit | gzip -c > hg19.gc5Base.txt.gz
 
 #############################################################################
-# Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram)
+# Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram) (Alt. haplotypes added 4/12/10 angie)
     mkdir /hive/data/genomes/hg19/bed/ctgPos
     cd /hive/data/genomes/hg19/bed/ctgPos
     cat << '_EOF_' > mkCtgPos.sh
 AGP="/hive/data/genomes/hg19/download/assembled_chromosomes/AGP"
@@ -903,12 +903,33 @@
 
     #	fetch .sql definition from hg18
     chmod 777 .
     hgsqldump --all -c --tab=. hg18 ctgPos
+    # Don't confuse us w/hg18 data:
+    rm ctgPos.txt
     chmod 775 .
     hgsql hg19 < ctgPos.sql
     hgsql -e 'load data local infile "ctgPos.tab" into table ctgPos;' hg19
 
+    # 4/12/10 (angie): add the alt loci:
+    perl -we 'while (<>) { \
+                next if (/^#/); chomp; @w = split; \
+                $w[0] = lc($w[0]); $w[0] =~ s/^hs//; $w[0] =~ s/_mhc_/_/;  $w[0] =~ s/_ctg1$//; \
+                $w[0] =~ s/_apd$/_apd_hap1/; $w[0] =~ s/_cox$/_cox_hap2/; \
+                $w[0] =~ s/_dbb$/_dbb_hap3/; $w[0] =~ s/_mann$/_mann_hap4/; \
+                $w[0] =~ s/_mcf$/_mcf_hap5/; $w[0] =~ s/_qbl$/_qbl_hap6/; \
+                $w[0] =~ s/_ssto$/_ssto_hap7/; \
+                $w[0] =~ s/_1(_ctg\d)/${1}_hap1/; \
+                if ($w[0] eq "chr6_cox_hap2" && $w[8] == 4873745) { $w[8] = 4795371; } # yep, inconsistent \
+                print join("\t", $w[1], $w[8], $w[0], 0, $w[8]) . "\n"; }' \
+    /hive/data/genomes/hg19/download/alternate_loci/*/placed_scaffolds/alt_locus_scaf2primary.pos \
+      >> ctgPos.tab
+    sort -k 3,3 -k4n,4n ctgPos.tab \
+    | hgLoadSqlTab hg19 ctgPos ctgPos.sql stdin
+    # TODO: tell NCBI alternate_loci/ALT_REF_LOCI_2/placed_scaffolds/alt_locus_scaf2primary.pos
+    # has size inconsistent w/AGP, FASTA
+
+
 #############################################################################
 # CLONE ENDS - first step for BACEND/CytoBand tracks
 #	(DONE - 2009-04-28 - Hiram)
     mkdir -p /hive/data/genomes/hg19/bed/cloneend/ncbi
@@ -9097,5 +9118,423 @@
     hgsql hg19 -e 'delete from ucscToEnsembl where ucsc like "%";'
     hgsql hg19 -e \
 'LOAD DATA LOCAL INFILE "ucscToEnsemblV57.tab" INTO TABLE ucscToEnsembl'
 
-######################################################################## 
+
+############################################################################
+# dbSNP BUILD 131 (SNP131) (DONE 4/15/10 angie)
+    # Set up build directory
+    mkdir -p /hive/data/outside/dbSNP/131/{human,shared}
+
+    # Get field encodings -- if there are changes or additions to the
+    # encoding of the corresponding fields, you might need to update
+    # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
+    # hg/lib/snp125Ui.c).
+    cd /hive/data/outside/dbSNP/131/shared
+    alias wg wget --timestamping
+    set ftpShared = ftp://ftp.ncbi.nih.gov/snp/database/shared_data
+    wg $ftpShared/LocTypeCode.bcp.gz
+    wg $ftpShared/SnpClassCode.bcp.gz
+    wg $ftpShared/SnpFunctionCode.bcp.gz
+    wg $ftpShared/SnpValidationCode.bcp.gz
+    # Here is another source -- it is not as up-to-date as the above, but
+    # our encodings (enums and sets in snp131.sql) are named more similar
+    # to those in the 2005 ASN:
+    # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn
+
+    ########################## DOWNLOAD #############################
+    cd /hive/data/outside/dbSNP/131/human
+    mkdir data schema rs_fasta
+    # Get data from NCBI (anonymous FTP)
+    set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
+    wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
+    cd /hive/data/outside/dbSNP/131/human/data
+    # ContigLoc table has coords, orientation, loc_type, and refNCBI allele
+    wg $ftpSnpDb/organism_data/b131_SNPContigLoc_37_1.bcp.gz
+    wg $ftpSnpDb/organism_data/b131_SNPContigLocusId_37_1.bcp.gz
+    wg $ftpSnpDb/organism_data/b131_ContigInfo_37_1.bcp.gz
+    # MapInfo has alignment weights
+    wg $ftpSnpDb/organism_data/b131_SNPMapInfo_37_1.bcp.gz
+    # SNP has univar_id, validation status and heterozygosity
+    wg $ftpSnpDb/organism_data/SNP.bcp.gz
+
+    # Get schema
+    cd /hive/data/outside/dbSNP/131/human/schema
+    wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
+    wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz
+
+    # Get fasta files
+    # using headers of fasta files for molType, class, observed
+    cd /hive/data/outside/dbSNP/131/human/rs_fasta
+    wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
+
+    ########################## LOAD NCBI TABLES #############################
+    # Simplify names of data files -- strip version & extras to get
+    # local canonical table names.
+    cd /hive/data/outside/dbSNP/131/human/data
+    foreach f (*.bcp.gz)
+      set new = `echo $f \
+                 | sed -e 's/^b131_SNP//; s/^b131_//; s/_37_1//; s/.bcp//;'`
+      mv $f $new
+      echo $new
+    end
+
+    cd /hive/data/outside/dbSNP/131/human/schema
+    zcat human_9606_table.sql.gz \
+    | perl -we '$/ = "\nGO\n\n\n"; \
+        while (<>) { \
+          next unless /^CREATE TABLE \[(b131_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_37_1)?\]/; \
+          s/b131_(SNP)?//; s/_37_1//; \
+          s/[\[\]]//g;  s/GO\n\n/;/;  s/smalldatetime/datetime/g; \
+          s/ON PRIMARY//g;  s/COLLATE//g;  s/Latin1_General_BIN//g; \
+          s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
+          s/nvarchar/varchar/g;  s/set quoted/--set quoted/g; \
+          s/(image|varchar\s+\(\d+\))/BLOB/g; \
+          print; \
+        }' \
+      > table.sql
+
+    # load on hgwdev (kolossus disk almost full, no more small cluster mysql5's):
+    hgsql -e 'create database hg19snp131'
+    cd /hive/data/outside/dbSNP/131/human/schema
+    hgsql hg19snp131 < table.sql
+    cd ../data
+
+    # Avoid wasting space by excluding mappings to non-reference contigs (ContigInfo.group_label):
+    zcat ContigInfo.gz | cut -f 12 | uniq | sort -u
+#CRA_TCAGchr7v2
+#Celera
+#GRCh37
+#Homo sapiens MT
+#HuRef
+    foreach t (ContigInfo MapInfo)
+      zcat $t.gz \
+      | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
+      | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+      | hgLoadSqlTab -oldTable hg19snp131 $t placeholder stdin
+    end
+
+    # Compare contig list between our ctgPos and reference contigs in ContigInfo.
+    # If they are identical, sweet, we probably have a $db/jkStuff/liftContigs.lft
+    # or similar file to use below.  If they are not identical, need to make 
+    # lift file using available information.
+    hgsql hg19 -N -B -e 'select contig from ctgPos;' \
+    | sort > /tmp/1
+    # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
+    hgsql hg19snp131 -N -B -e 'select contig_acc from ContigInfo;' | sort > /tmp/2
+    diff /tmp/1 /tmp/2
+    # Doh!  Completely different: ctgPos has GL*, ContigInfo has NC_* / NT_*
+    # We will need to generate own liftUp file for N*_* contig IDs.
+
+    # NC_001807 entrez sez "Record removed.This sequence was removed
+    # since the accepted reference sequence for the Homo sapiens
+    # mitochondrion is the rCRS/Mitomap sequence, which is now
+    # available as the record NC_012920".  
+    # They align w/gaps on both q & t, so liftUp won't do, we need liftOver:
+    blat -noHead NC_012920.fa /hive/data/genomes/hg19/M/chrM.fa stdout \
+    | axtChain -psl -linearGap=medium stdin -faT NC_012920.fa /hive/data/genomes/hg19/hg19.2bit \
+        NC_012920ToChrM.over.chain
+
+    # NT_004350: entrez sez:
+#COMMENT     REFSEQ INFORMATION: Features on this sequence have been produced
+#            for build 37 version 1 of the NCBI's genome annotation [see
+#            documentation].   The reference sequence is identical to
+#            GL000003.1.
+
+    # Using the contigs named in ContigInfo, screen-scrape genbank to get GL ID for contig ID.
+    cp /dev/null contigToGl.txt
+    foreach nt (`hgsql hg19snp131 -N -B -e 'select contig_acc from ContigInfo;'`)
+      wget --quiet -O - 'http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?tool=portal&amp;db=nuccore&amp;val='$nt'&amp;dopt=genbank&amp;sendto=on' \
+      | perl -we 'while (<>) { \
+                    if (/^LOCUS/) { \
+                      m/^LOCUS\s+'$nt'\s+(\d+) bp/ || die "parse ('$nt'): $_\t"; \
+		      $size = $1; \
+                    } elsif (/^            (GL\d+)\.\d+\.$/) { \
+                      print "'$nt'\t$1\t$size\n"; \
+		    } \
+                  }' \
+        >> contigToGl.txt
+    end
+    hgsql hg19 -NBe 'select chromStart, SUBSTRING_INDEX(contig, ".", 1), \
+                       ctgPos.size, ctgPos.chrom, chromInfo.size \
+                       from ctgPos,chromInfo \
+                       where ctgPos.chrom=chromInfo.chrom order by contig' \
+      > glToLift.txt
+    sort -k2,2 contigToGl.txt \
+    | join -1 2 -2 2 -t"	" -o 1.1,2.1,1.3,1.4,1.5 -a 2 -e MISSING \
+        glToLift.txt - \
+      > /hive/data/genomes/hg19/jkStuff/liftContigs.lft
+
+    # Manually add NC_001807 -> chrM just in case:
+    echo "0	NC_001807	16571	chrM	16571" \
+      >> /hive/data/genomes/hg19/jkStuff/liftContigs.lft
+    # Blat NC_012920 to chrM shows gaps, so we'll need to use liftOver chain created above.
+
+    # Make sure there are no orient != 0 contigs among those selected.
+    hgsql hg19snp131 -NBe \
+      'select count(*) from ContigInfo where orient != 0;'
+#0
+
+    # ContigLoc is huge, and we want just the reference contig mappings.
+    # So, based on the reference & haplo ctg_id values in ContigInfo,
+    # filter to get just the mappings for those contigs:
+    zcat ContigLoc.gz \
+    | awk '$3 <= 9 || $3 == 6647 || $3 >= 11178' \
+    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+    | hgLoadSqlTab -oldTable hg19snp131 ContigLoc placeholder stdin
+#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 1
+#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 2
+#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 3
+#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 4
+#load of ContigLoc did not go as planned: 27500025 record(s), 0 row(s) skipped, 3273 warning(s) loading /dev/stdin
+    zcat SNP.gz \
+    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+    | hgLoadSqlTab -oldTable hg19snp131 SNP placeholder stdin
+#Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 1
+#Warning 1366 Incorrect integer value: '' for column 'map_property' at row 1
+#Warning 1264 Out of range value adjusted for column 'last_updated_time' at row 2
+#Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 2
+#Warning 1366 Incorrect integer value: '' for column 'map_property' at row 2
+    # ... no big deal.
+#*** NOTE FOR NEXT TIME: grep for ref. contigs in ContigLocusId.gz (othw. huge):
+    zcat ContigLocusId.gz \
+    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+    | hgLoadSqlTab -oldTable hg19snp131 ContigLocusId placeholder stdin
+    foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
+     echo -n "${t}:\t"
+      hgsql -N -B hg19snp131 -e 'select count(*) from '$t
+    end
+#ContigInfo:           260
+#ContigLoc:       27500025
+#ContigLocusId:  110690222
+#MapInfo:         23619373
+#SNP:    	  23653729
+
+
+    #################### EXTRACT INFO FROM NCBI TABLES ####################
+    # Glom each SNP's function codes together and load up a new hg19Snp131 table.
+    # Also extract NCBI's annotations of coding SNPs' effects on translation.
+    # We extract ContigLocusId info only for reference assembly mapping.
+    # Some SNP's functional annotations are for an alternate assembly, so we will
+    # have no NCBI functional annotations to display for those (but our own are 
+    # available).
+    cd /hive/data/outside/dbSNP/131/human
+    # Add indices to tables for a big join (5 or 6 minutes):
+    hgsql hg19snp131 -e \
+      'alter table ContigInfo add index (ctg_id); \
+       alter table ContigLocusId add index (ctg_id);'
+    hgsql hg19snp131 -NBe 'select snp_id, ci.contig_acc, asn_from, asn_to, mrna_acc, \
+                           fxn_class, reading_frame, allele, residue, codon, cli.ctg_id \
+                           from ContigLocusId as cli, ContigInfo as ci \
+                           where cli.ctg_id = ci.ctg_id;' \
+      > ncbiFuncAnnotations.txt
+    wc -l ncbiFuncAnnotations.txt
+#33665062 ncbiFuncAnnotations.txt
+    # Ignore function code 8 (cds-reference, just means that some allele matches reference)
+    # and glom functions for each SNP id:
+    cut -f 1-4,6,11 ncbiFuncAnnotations.txt \
+    | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \
+    | perl -we 'while (<>) { chomp; \
+                  ($id, undef, $s, $e, $f, $c) = split; \
+                  if (defined $prevId && $id == $prevId && $c == $prevC && $s == $prevS) { \
+                    $prevFunc .= "$f," unless ($f == 8); \
+                  } else { \
+                    print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if (defined $prevId); \
+                    $prevFunc = ($f == 8) ? "" : "$f,"; \
+                  } \
+                  ($prevId, $prevC, $prevS, $prevE) = ($id, $c, $s, $e); \
+                } \
+                print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n"' \
+      > ucscFunc.txt
+    wc -l ucscFunc.txt
+#10328243 ucscFunc.txt
+    cat > ucscFunc.sql <<EOF
+CREATE TABLE ucscFunc (
+        snp_id int NOT NULL ,
+        ctg_id int(10) NOT NULL ,
+        asn_from int(10) NOT NULL ,
+        asn_to int(10) NOT NULL ,
+        fxn_class varchar(255) NOT NULL ,
+        INDEX snp_id (snp_id),
+        INDEX ctg_id (ctg_id)
+);
+EOF
+    hgLoadSqlTab hg19snp131 ucscFunc{,.sql,.txt}
+
+    # Extract observed allele, molType and snp class from FASTA headers gnl
+    # 4/13: found some inconsistent headers in rs_chPAR.fas.gz vs. other rs_ch*,
+    # reported to dbSNP, Lon said that rs_chPAR.fas.gz snuck in from build 130!
+    rm /hive/data/outside/dbSNP/131/human/rs_fasta/rs_chPAR.fas.gz
+    zcat /hive/data/outside/dbSNP/131/human/rs_fasta/rs_ch*.fas.gz \
+    | grep '^>gnl' \
+    | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
+    | sort -nu \
+      > ucscGnl.txt
+#520.305u 74.766s 7:02.48 140.8% 0+0k 0+0io 0pf+0w
+    wc -l ucscGnl.txt
+#23653726 ucscGnl.txt
+    cut -f 1 ucscGnl.txt | uniq | wc -l
+#23653726
+    cat > ucscGnl.sql <<EOF
+CREATE TABLE ucscGnl (
+        snp_id int NOT NULL ,
+        observed varchar(255) NOT NULL,
+        molType varchar(255) NOT NULL,
+        class varchar(255) NULL ,
+        INDEX snp_id (snp_id)
+);
+EOF
+    hgLoadSqlTab hg19snp131 ucscGnl{,.sql,.txt}
+
+    # Add indices to tables for a big join (5 or 6 minutes):
+    hgsql hg19snp131 -e \
+      'alter table ContigLoc  add index (ctg_id); \
+       alter table ContigLocusId add index (snp_id); \
+       alter table SNP        add index (snp_id); \
+       alter table MapInfo    add index (snp_id);'
+
+    # Big leftie join to bring together all of the columns that we want in snp131,
+    # using all of the available joining info:
+    hgsql hg19snp131 -NBe \
+     'SELECT ci.contig_acc, cl.asn_from, cl.asn_to, cl.snp_id, cl.orientation, cl.allele, \
+             ug.observed, ug.molType, ug.class, \
+             s.validation_status, s.avg_heterozygosity, s.het_se, \
+             uf.fxn_class, cl.loc_type, mi.weight, cl.phys_pos_from \
+      FROM \
+      ((((ContigLoc as cl JOIN ContigInfo as ci \
+               ON cl.ctg_id = ci.ctg_id) \
+          LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
+         LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
+        LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
+       LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id and uf.ctg_id = cl.ctg_id \
+                                and uf.asn_from = cl.asn_from;' \
+      > ucscNcbiSnp.ctg.bed
+#69.323u 12.056s 22:37.20 5.9%   0+0k 0+0io 0pf+0w
+    wc -l ucscNcbiSnp.ctg.bed 
+#27500025 ucscNcbiSnp.ctg.bed
+
+    # Use liftUp for everything except mito, then liftOver for mito:
+    # There are some weird cases of length=1 but locType=range... in all the cases 
+    # that I checked, the length really seems to be 1 so I'm not sure where they got 
+    # the locType=range.  Tweak locType in those cases so we can keep those SNPs:
+    grep -vw ^NC_012920 ucscNcbiSnp.ctg.bed \
+    | awk -F"\t" 'BEGIN{OFS="\t";}  $2 == $3 && $14 == 1 {$14=2; numTweaked++;}  {print;} \
+           END{print numTweaked, "single-base, locType=range, tweaked locType" > "/dev/stderr";}' \
+    | liftUp ucscNcbiSnp.bed \
+      /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
+#2535    single-base, locType=range, tweaked locType
+#392.182u 27.358s 7:20.66 95.2%  0+0k 0+0io 0pf+0w
+    # For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver
+    # doesn't deal with 0-base items.  Fake out phys_pos_from to 0 because many coords
+    # will differ, oh well.
+    grep -w NC_012920 ucscNcbiSnp.ctg.bed \
+    | awk -F"\t" 'BEGIN{OFS="\t";} {$3 += 1; $16 = 0; print;}' \
+    | liftOver -bedPlus=3 stdin NC_012920ToChrM.over.chain stdout chrM.unmapped \
+    | awk -F"\t" 'BEGIN{OFS="\t";} {$3 -= 1; print;}' \
+    | sort -k2n,2n \
+      > chrMNcbiSnp.bed
+#3.523u 2.265s 0:30.99 18.6%     0+0k 0+0io 0pf+0w
+    # Good, got all but 2 SNPS (rs28693675 and rs55749223, partially deleted / deleted in new)
+    cat chrMNcbiSnp.bed >> ucscNcbiSnp.bed
+    wc -l ucscNcbiSnp.bed
+#27500023 ucscNcbiSnp.bed
+
+    # Translate NCBI's encoding into UCSC's, and perform a bunch of checks.
+    # This is where developer involvement is most likely as NCBI extends the 
+    # encodings used in dbSNP.
+    cd /hive/data/outside/dbSNP/131/human/
+    snpNcbiToUcsc ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit snp131
+#spaces stripped from observed:
+#chr12   6093134 6093134 rs41402545
+#count of snps with weight  0 = 67535
+#count of snps with weight  1 = 23023681
+#count of snps with weight  2 = 472416
+#count of snps with weight  3 = 2536961
+#count of snps with weight 10 = 1399430
+#Skipped 7 snp mappings due to errors -- see snp131Errors.bed
+#218.985u 11.475s 4:41.38 81.8%  0+0k 0+0io 0pf+0w
+    head snp131Errors.bed
+#chr13   32953907        32954033        rs80359736      rs80359736 is 126 bases long but refNCBI is different length: CATCATCAGATTTATATTCTCTGTTAACAGAAGGAAAGAGATACAGAATTTATCATCTTGCAACTTCAAAATCTAAAAGTAAATCTGAAAGAGCTAACAT
+#chr17   41223118        41223133        rs80359888      Missing observed value (deleted SNP?).
+#chr17   41245687        41245900        rs80359886      rs80359886 is 213 bases long but refNCBI is different length: AATATGCCTGGTAGAAGACTTCCTCCTCAGCCTATTCTTTTTAGGTGCTTTTGAATTGTGGATATTTAATTCGAGTTCCATATTGCTTATACTGCTGCTT
+#chr17   41245687        41245900        rs80359886      Missing observed value (deleted SNP?).
+#chr17   41276085        41276094        rs80359887      Missing observed value (deleted SNP?).
+#chrM    308     310     rs66492218      Unexpected coords for locType "between" (3) -- expected NCBI's chrEnd = chrStart+1.
+#chrM    308     310     rs66492218      rs66492218 is 2 bases long but refNCBI is different length: -
+    wc -l snp*
+#  26033053 snp131.bed
+#        22 snp131.sql
+#         7 snp131Errors.bed
+#        18 snp131ExceptionDesc.tab
+#   4859728 snp131Exceptions.bed
+#  23653726 snp131Seq.tab
+    # 8M new snps, lots more exceptions than snp130 (2631563)
+
+    # Make one big fasta file.
+    # It's a monster: 18G!  Can we split by hashing rsId?
+    zcat rs_fasta/rs_ch*.fas.gz \
+    | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
+      > snp131.fa
+    # Check for duplicates.
+    grep ^\>rs snp131.fa | sort > /data/tmp/seqHeaders
+    wc -l /data/tmp/seqHeaders
+#23653726 /data/tmp/seqHeaders
+    uniq /data/tmp/seqHeaders | wc -l
+#23653726
+    # Use hgLoadSeq to generate .tab output for sequence file offsets,
+    # and keep only the columns that we need: acc and file_offset.
+    # Index it and translate to snpSeq table format.
+    hgLoadSeq -test placeholder snp131.fa
+#23653726 sequences
+#128.364u 25.531s 10:52.02 23.6% 0+0k 0+0io 0pf+0w
+    cut -f 2,6 seq.tab > snp131Seq.tab
+    rm seq.tab
+
+    # Load up main track tables.
+    cd /hive/data/outside/dbSNP/131/human
+    hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
+      hg19 snp131 -sqlTable=snp131.sql snp131.bed
+#Loaded 26033053 elements of size 17
+#162.666u 19.611s 8:53.56 34.1%  0+0k 0+0io 0pf+0w
+    hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
+      hg19 snp131Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
+      snp131Exceptions.bed
+#Loaded 4859728 elements of size 5
+#32.020u 2.006s 1:22.87 41.0%    0+0k 0+0io 0pf+0w
+    hgLoadSqlTab hg19 snp131ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
+      snp131ExceptionDesc.tab
+    # Load up sequences.
+    mkdir -p /gbdb/hg19/snp
+    ln -s /hive/data/outside/dbSNP/131/human/snp131.fa /gbdb/hg19/snp/snp131.fa
+    hgLoadSqlTab hg19 snp131Seq ~/kent/src/hg/lib/snpSeq.sql snp131Seq.tab
+
+    # Put in a link where one would expect to find the track build dir...
+    ln -s /hive/data/outside/dbSNP/131/human /hive/data/genomes/hg19/bed/snp131
+
+    # Look at the breakdown of exception categories:
+    cd /hive/data/outside/dbSNP/131/human
+    cut -f 5 snp131Exceptions.bed | sort | uniq -c | sort -nr
+#3666812 MultipleAlignments
+# 886159 ObservedMismatch
+#  92341 SingleClassTriAllelic
+#  70184 SingleClassZeroSpan
+#  43319 ObservedTooLong
+#  25745 MixedObserved
+#  22606 SingleClassLongerSpan
+#  19681 SingleClassQuadAllelic
+#  15245 FlankMismatchGenomeShorter
+#   9808 DuplicateObserved
+#   4463 NamedDeletionZeroSpan
+#   2040 FlankMismatchGenomeLonger
+#    802 ObservedContainsIupac
+#    317 NamedInsertionNonzeroSpan
+#    142 FlankMismatchGenomeEqual
+#     62 RefAlleleMismatch
+#      1 RefAlleleRevComp
+#      1 ObservedWrongFormat
+    # Compared to snp130, nice to see fewer disfunctional locTypes (FlankMismatch*)
+    # and singleClassQuadAllelic -- major increases in most others though.
+
+#TODO: go through those and send some bug reports to dbSNP.
+
+
+############################################################################