src/hg/makeDb/doc/hg18.txt 1.363

1.363 2009/05/26 22:13:50 angie
Added snp130 basic tables.
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.362
retrieving revision 1.363
diff -b -B -U 4 -r1.362 -r1.363
--- src/hg/makeDb/doc/hg18.txt	22 May 2009 22:26:21 -0000	1.362
+++ src/hg/makeDb/doc/hg18.txt	26 May 2009 22:13:50 -0000	1.363
@@ -22344,8 +22344,492 @@
     rm -r run*/split tmp.txt *.orthoGlom.txt
 
 
 ############################################################################
+# dbSNP BUILD 130 (DONE 5/22/09 angie)
+    # Set up build directory
+    mkdir -p /hive/data/outside/dbSNP/130/{human,shared}
+
+    # Get field encodings -- if there are changes or additions to the
+    # encoding of the corresponding fields, you might need to update
+    # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
+    # hg/lib/snp125Ui.c).
+    cd /hive/data/outside/dbSNP/130/shared
+    alias wg wget --timestamping
+    set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
+    wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz
+    wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz
+    wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz
+    wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz
+    # Here is another source -- it is not as up-to-date as the above, but
+    # our encodings (enums and sets in snp130.sql) are named more similarly
+    # to those in the 2005 ASN:
+    # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn
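+    # (Optional sanity check, not required for the build: the .bcp files are
+    # plain tab-separated dumps, so a quick look shows whether dbSNP added
+    # any new codes that snpNcbiToUcsc would need to handle.)
+    zcat SnpFunctionCode.bcp.gz | head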
+
+    ########################## DOWNLOAD #############################
+    cd /hive/data/outside/dbSNP/130/human
+    mkdir data schema rs_fasta
+    # Get data from NCBI (anonymous FTP)
+    wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
+    cd /hive/data/outside/dbSNP/130/human/data
+    # ContigLoc table has coords, orientation, loc_type, and refNCBI allele
+    wg $ftpSnpDb/organism_data/b130_SNPContigLoc_36_3.bcp.gz
+    wg $ftpSnpDb/organism_data/b130_SNPContigLocusId_36_3.bcp.gz
+    wg $ftpSnpDb/organism_data/b130_ContigInfo_36_3.bcp.gz
+    # MapInfo has alignment weights
+    wg $ftpSnpDb/organism_data/b130_SNPMapInfo_36_3.bcp.gz
+    # SNP has univar_id, validation status and heterozygosity
+    wg $ftpSnpDb/organism_data/SNP.bcp.gz
+
+    # Get schema
+    cd /hive/data/outside/dbSNP/130/human/schema
+    wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
+    wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz
+
+    # Get fasta files
+    # using headers of fasta files for molType, class, observed
+    cd /hive/data/outside/dbSNP/130/human/rs_fasta
+    wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
+
+    # Get 1000 Genomes IDs (unfortunately not in validation field as Sol suggested)
+    cd /hive/data/outside/dbSNP/130/human/data
+    wg -O 1000Genomes_README ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/1000Genomes/ReadMe.txt
+    wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/1000Genomes/B130_1000G_RsClusterReport.txt.gz
+    zcat B130_1000G_RsClusterReport.txt.gz | wc -l
+#7512342
+    # Make a uniquified list of only the numeric portion of the assigned rs IDs:
+    zcat B130_1000G_RsClusterReport.txt.gz \
+    | cut -d, -f 3 | sed -e 's/^rs//' \
+    | sort -nu > 1000GenomesRsIds.txt
+    wc -l 1000GenomesRsIds.txt
+#5611085 1000GenomesRsIds.txt
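+    # (Sanity check: the cut -d, -f 3 above assumes a comma-separated report
+    # with the rs ID in the third field; a quick peek at the first few lines
+    # confirms the layout.)
+    zcat B130_1000G_RsClusterReport.txt.gz | head -3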
+
+    ########################## LOAD NCBI TABLES #############################
+    # Simplify names of data files -- strip version & extras to get
+    # local canonical table names.
+    cd /hive/data/outside/dbSNP/130/human/data
+    foreach f (*.bcp.gz)
+      set new = `echo $f \
+                 | sed -e 's/^b130_SNP//; s/^b130_//; s/_36_3//; s/.bcp//;'`
+      mv $f $new
+      echo $new
+    end
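+    # Applying that sed to the downloaded names gives the local table names:
+    #   b130_ContigInfo_36_3.bcp.gz       -> ContigInfo.gz
+    #   b130_SNPContigLoc_36_3.bcp.gz     -> ContigLoc.gz
+    #   b130_SNPContigLocusId_36_3.bcp.gz -> ContigLocusId.gz
+    #   b130_SNPMapInfo_36_3.bcp.gz       -> MapInfo.gz
+    #   SNP.bcp.gz                        -> SNP.gz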
+
+    cd /hive/data/outside/dbSNP/130/human/schema
+    zcat human_9606_table.sql.gz \
+    | perl -we '$/ = "\nGO\n\n\n"; \
+        while (<>) { \
+          next unless /^CREATE TABLE \[(b130_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_3)?\]/; \
+          s/b130_(SNP)?//; s/_36_3//; \
+          s/[\[\]]//g;  s/GO\n\n/;/;  s/smalldatetime/datetime/g; \
+          s/ON PRIMARY//g;  s/COLLATE//g;  s/Latin1_General_BIN//g; \
+          s/IDENTITY \(1, 1\) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
+          s/nvarchar/varchar/g;  s/set quoted/--set quoted/g; \
+          s/(image|varchar\s+\(\d+\))/BLOB/g; \
+          print; \
+        }' \
+      > table.sql
+
+    # load on hgwdev (kolossus disk almost full, no more small cluster mysql5's):
+    hgsql '' -e 'create database hg18snp130'
+    cd /hive/data/outside/dbSNP/130/human/schema
+    hgsql hg18snp130 < table.sql
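+    # Quick check that all five tables survived the schema translation:
+    hgsql hg18snp130 -e 'show tables'
+    # (expect ContigInfo, ContigLoc, ContigLocusId, MapInfo and SNP)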
+    cd ../data
+
+    # Avoid wasting space by excluding mappings to non-reference contigs:
+    foreach t (ContigInfo MapInfo)
+      zcat $t.gz \
+      | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
+      | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+      | hgLoadSqlTab -oldTable hg18snp130 $t placeholder stdin
+    end
+#load of ContigInfo did not go as planned: 379 record(s), 0 row(s) skipped, 88 warning(s) loading /dev/stdin
+    # Checked ContigInfo visually, looks OK.
+    # Compare contig list between our ctgPos and reference contigs in
+    # ContigInfo:
+    ssh hgwdev-10 hgsql hg18 -N -B -e '"select contig from ctgPos;"' \
+    | sort > /tmp/1
+    hgsql hg18snp130 -NBe 'select distinct(group_label) from ContigInfo'
+    # --> reference, c5_H2, c6_COX, c6_QBL, c22_H2, DR53
+    # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
+    hgsql hg18snp130 -N -B -e 'select contig_acc from ContigInfo \
+        where group_label in \
+        ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2
+    diff /tmp/1 /tmp/2
+    # No diff.
+    # Make sure there are no orient != 0 contigs among those selected.
+    hgsql hg18snp130 -NBe \
+      'select count(*) from ContigInfo where orient != 0 and \
+         group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");'
+#0
+
+    # ContigLoc is huge, and we want just the reference contig mappings.
+    # So, based on the reference & haplo ctg_id values in ContigInfo,
+    # filter to get just the mappings for those contigs:
+    zcat ContigLoc.gz \
+    | awk '$3 <= 377 || $3 == 7015' \
+    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+    | hgLoadSqlTab -oldTable hg18snp130 ContigLoc placeholder stdin
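+    # (The ctg_id cutoffs in the awk above were read off from ContigInfo;
+    # one way to re-derive them:
+    #   hgsql hg18snp130 -NBe 'select group_label, min(ctg_id), max(ctg_id) \
+    #       from ContigInfo group by group_label'  )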
+    zcat SNP.gz \
+    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+    | hgLoadSqlTab -oldTable hg18snp130 SNP placeholder stdin
+    zcat ContigLocusId.gz \
+    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
+    | hgLoadSqlTab -oldTable hg18snp130 ContigLocusId placeholder stdin
+    # There were some warnings (many cleared up by the perl substitution)
+    # but no rows were dropped.  In mysql5, 'show warnings' after a manual 'load data'
+    # complains about missing values (OK when e.g. position is not known).
+    foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
+      echo -n "${t}:\t"
+      hgsql -N -B hg18snp130 -e 'select count(*) from '$t
+    end
+#ContigInfo:     379
+#ContigLoc:      19189750
+#ContigLocusId:  11790054
+#MapInfo:        17928700
+#SNP:            17804034
+
+
+    #################### EXTRACT INFO FROM NCBI TABLES ####################
+    # Glom each SNP's function codes together and load them up as a new
+    # hg18snp130 table.  Also extract NCBI's annotations of coding SNPs'
+    # effects on translation.
+    # ContigLocusId includes contig_acc and asn_{from,to}, but those are
+    # not comprehensive!  If a SNP has been mapped to multiple contigs,
+    # one is randomly selected, and if it is not a reference contig, we
+    # miss out when we restrict by contig.  We may end up getting a few
+    # spurious functional annotations from mappings to other assemblies,
+    # but them's the breaks.
+    cd /hive/data/outside/dbSNP/130/human
+    hgsql hg18snp130 -NBe 'select snp_id, mrna_acc, fxn_class, \
+                           reading_frame, allele, residue, codon from ContigLocusId' \
+      > ncbiFuncAnnotations.txt
+    cut -f 1,3 ncbiFuncAnnotations.txt \
+    | sort -u -k1,1 -k2,2n  \
+    | awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
+            else { if (prevId) {print prevId "\t" prevFunc;} \
+                                prevFunc = $2 ","; }} \
+           {prevId = $1;} \
+           END {print prevId "\t" prevFunc;}' \
+      > ucscFunc.txt
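+    # (Toy illustration of the glomming with made-up IDs: sorted input pairs
+    #   11 3 / 11 8 / 12 42
+    # come out as one line per snp_id with a comma-terminated code list:
+    #   11 3,8, / 12 42,  )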
+
+    wc -l ucscFunc.txt
+#7344853 ucscFunc.txt
+    cat > ucscFunc.sql <<EOF
+CREATE TABLE ucscFunc (
+        snp_id int NOT NULL ,
+        fxn_class varchar(255) NOT NULL ,
+        INDEX snp_id (snp_id)
+);
+EOF
+    hgLoadSqlTab hg18snp130 ucscFunc{,.sql,.txt}
+
+    # Extract observed allele, molType and SNP class from the gnl FASTA headers:
+    zcat /hive/data/outside/dbSNP/130/human/rs_fasta/rs_ch*.fas.gz \
+    | grep '^>gnl' \
+    | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
+    | sort -n \
+      > ucscGnl.txt
+#407.555u 57.499s 4:32.89 170.4% 0+0k 0+0io 0pf+0w
+    wc -l ucscGnl.txt
+#17804034 ucscGnl.txt
+    cut -f 1 ucscGnl.txt | uniq | wc -l
+#17804034
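+    # (Shape of header that the regex above expects -- values here are
+    # hypothetical:
+    #   >gnl|dbSNP|rs12345 ...|mol="genomic"|class=1|alleles="A/G"|build=130
+    # parses to:  12345, A/G, genomic, 1 -- i.e. snp_id, observed, molType,
+    # class, matching the ucscGnl columns below.)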
+    cat > ucscGnl.sql <<EOF
+CREATE TABLE ucscGnl (
+        snp_id int NOT NULL ,
+        observed varchar(255) NOT NULL,
+        molType varchar(255) NOT NULL,
+        class varchar(255) NULL ,
+        INDEX snp_id (snp_id)
+);
+EOF
+    hgLoadSqlTab hg18snp130 ucscGnl{,.sql,.txt}
+
+    # Add indices to tables for a big join (5 or 6 minutes):
+    hgsql hg18snp130 -e \
+      'alter table ContigLoc  add index (ctg_id); \
+       alter table ContigInfo add index (ctg_id); \
+       alter table ContigLoc  add index (snp_id); \
+       alter table SNP        add index (snp_id); \
+       alter table MapInfo    add index (snp_id);'
+
+    # Big leftie join to bring together all of the columns that we want in snp130,
+    # using all of the available joining info:
+    hgsql hg18snp130 -NBe \
+     'SELECT ci.contig_acc, cl.asn_from, cl.asn_to, cl.snp_id, cl.orientation, cl.allele, \
+             ug.observed, ug.molType, ug.class, \
+             s.validation_status, s.avg_heterozygosity, s.het_se, \
+             uf.fxn_class, cl.loc_type, mi.weight, cl.phys_pos_from \
+      FROM \
+      ((((ContigLoc as cl JOIN ContigInfo as ci \
+               ON cl.ctg_id = ci.ctg_id and \
+                  ci.group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")) \
+          LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
+         LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
+        LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
+       LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id;' \
+      > ucscNcbiSnp.ctg.bed
+#on a not-so busy hgwdev: 80.735u 36.958s 8:54.76 22.0%   0+0k 0+0io 0pf+0w
+#on a very busy hgwdev:   78.753u 41.304s 30:19.77 6.5%   0+0k 0+0io 0pf+0w
+    wc -l ucscNcbiSnp.ctg.bed 
+#19189750 ucscNcbiSnp.ctg.bed
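+    # Same row count as ContigLoc, as expected: each filtered mapping
+    # yields exactly one output row from the left joins.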
+    liftUp ucscNcbiSnp.bed \
+      /cluster/data/hg18/jkStuff/liftContigs.lft warn \
+      ucscNcbiSnp.ctg.bed
+#116.027u 7.078s 2:27.93 83.2%   0+0k 0+0io 0pf+0w
+
+    # Drum roll please... translate NCBI's encoding into UCSC's, and
+    # perform a bunch of checks.  This is where developer involvement
+    # is most likely as NCBI extends the encodings used in dbSNP.
+    cd /hive/data/outside/dbSNP/130/human/
+    snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \
+      -1000GenomesRsIds=data/1000GenomesRsIds.txt snp130
+#spaces stripped from observed:
+#chr12   5963395 5963395 rs41402545
+#Line 8106609 of ucscNcbiSnp.bed: Encountered something that doesn't fit observedMixedFormat: GCAACTTCA
+#count of snps with weight  0 = 74828
+#count of snps with weight  1 = 17254041
+#count of snps with weight  2 = 389501
+#count of snps with weight  3 = 1189989
+#count of snps with weight 10 = 281391
+#Found no errors.
+#143.111u 14.313s 3:15.18 80.6%  0+0k 0+0io 0pf+0w
+    wc -l snp*
+#  18833531 snp130.bed
+#        22 snp130.sql
+#         0 snp130Errors.bed
+#        18 snp130ExceptionDesc.tab
+#   2631563 snp130Exceptions.bed
+    # More SNPs but 0 errors and a bit fewer exceptions than snp129, cool!
+
+    # Make one big fasta file.
+    # It's a monster: 18G!  Can we split by hashing rsId?
+    zcat rs_fasta/rs_ch*.fas.gz \
+    | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
+      > snp130.fa
+    # Check for duplicates.
+    grep ^\>rs snp130.fa | sort > /scratch/tmp/seqHeaders
+    wc -l /scratch/tmp/seqHeaders
+#17804034 /scratch/tmp/seqHeaders
+    uniq /scratch/tmp/seqHeaders | wc -l
+#17804034
+    # Use hgLoadSeq to generate .tab output for sequence file offsets,
+    # and keep only the columns that we need: acc and file_offset.
+    # Index it and translate to snpSeq table format.
+    time hgLoadSeq -test placeholder snp130.fa
+#107.748u 24.338s 6:58.50 31.5%  0+0k 0+0io 0pf+0w
+    cut -f 2,6 seq.tab > snp130Seq.tab
+    rm seq.tab
+
+    # Load up main track tables.
+    cd /hive/data/outside/dbSNP/130/human
+    time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
+      hg18 snp130 -sqlTable=snp130.sql snp130.bed
+#Loaded 18833531 elements of size 17
+#107.246u 11.546s 10:49.23 18.2% 0+0k 0+0io 0pf+0w
+    time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
+      hg18 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
+      snp130Exceptions.bed
+#15.255u 1.257s 1:11.11 23.2%    0+0k 0+0io 0pf+0w
+    sed -e 's/snp125/snp130/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
+      > snp130ExceptionDesc.sql
+    hgLoadSqlTab hg18 snp130ExceptionDesc snp130ExceptionDesc.sql \
+      snp130ExceptionDesc.tab
+    # Load up sequences.
+    sed -e 's/snpSeq/snp130Seq/' ~/kent/src/hg/lib/snpSeq.sql \
+      > snp130Seq.sql
+    mkdir -p /gbdb/hg18/snp
+    ln -s /hive/data/outside/dbSNP/130/human/snp130.fa /gbdb/hg18/snp/snp130.fa
+    time nice hgLoadSqlTab hg18 snp130Seq snp130Seq.sql snp130Seq.tab
+#0.001u 0.004s 3:41.13 0.0%      0+0k 0+0io 0pf+0w
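+    # (Optional spot check: pull one row's file offset and make sure it
+    # seeks to a matching ">rs..." header in /gbdb/hg18/snp/snp130.fa:
+    #   hgsql hg18 -NBe 'select * from snp130Seq limit 1'  )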
+
+    # Put in a link where one would expect to find the track build dir...
+    ln -s /hive/data/outside/dbSNP/130/human /cluster/data/hg18/bed/snp130
+
+    # Look at the breakdown of exception categories:
+    cd /hive/data/outside/dbSNP/130/human
+    cut -f 5 snp130Exceptions.bed | sort | uniq -c | sort -nr
+#1960737 MultipleAlignments
+# 519222 ObservedMismatch
+#  38444 ObservedTooLong
+#  32069 SingleClassTriAllelic
+#  26351 FlankMismatchGenomeShorter
+#  19089 SingleClassLongerSpan
+#  15441 SingleClassZeroSpan
+#   6583 FlankMismatchGenomeLonger
+#   4108 DuplicateObserved
+#   3627 SingleClassQuadAllelic
+#   3473 MixedObserved
+#   1369 NamedDeletionZeroSpan
+#    547 FlankMismatchGenomeEqual
+#    355 NamedInsertionNonzeroSpan
+#    136 ObservedContainsIupac
+#      8 ObservedWrongFormat
+#      4 RefAlleleMismatch
+
+#TODO: go through those above and send some bug reports to dbSNP.
+
+
+#######################################################################
+# ORTHOLOGOUS ALLELES IN CHIMP, ORANGUTAN AND MACAQUE FOR SNP130 (DONE 5/15/09 angie)
+    mkdir /hive/data/genomes/hg18/bed/snp130Ortho
+    cd /hive/data/genomes/hg18/bed/snp130Ortho
+
+    # Following Heather's lead in snp126orthos, filter SNPs to keep
+    # only those with class=single, length=1, chrom!~random;
+    # exclude those with exceptions MultipleAlignments,
+    # SingleClassTriAllelic or SingleClassQuadAllelic.
+    # Unlike SNP masking, we do not filter for weight -- don't know why.
+    awk '$5 ~ /^(MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic)$/ {print $4;}' \
+      /hive/data/outside/dbSNP/130/human/snp130Exceptions.bed \
+    | sort -u \
+      > snp130ExcludeIds.txt
+    awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
+      /hive/data/outside/dbSNP/130/human/snp130.bed \
+    | grep -vFwf snp130ExcludeIds.txt \
+      > snp130Simple.bed
+#182.396u 12.388s 2:10.30 149.4% 0+0k 0+0io 0pf+0w
+    wc -l snp130Simple.bed
+#12141377 snp130Simple.bed
+
+    # Glom all human info that we need for the final table onto the
+    # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
+    awk 'BEGIN{OFS="\t";} \
+        {print $1, $2, $3, \
+               $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
+               0, $6;}' \
+      snp130Simple.bed > snp130ForLiftOver.bed
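+    # (Example of one glommed name, with made-up values:
+    #   rs12345|chr1|10000|10001|A/G|A|+
+    # -- everything after the rs ID just rides through liftOver as baggage.)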
+
+    # Map coords to chimp using liftOver.
+    # I don't know why chimp took so much longer than macaque... the
+    # chimp .over has fewer chains and fewer bytes than the macaque .over.
+    mkdir run.liftOChimp
+    cd run.liftOChimp
+    mkdir split out
+    splitFile ../snp130ForLiftOver.bed 25000 split/chunk
+    cp /dev/null jobList
+    foreach f (split/chunk*)
+      echo liftOver $f \
+        /hive/data/genomes/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \
+        \{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \
+        >> jobList
+    end
+    ssh pk
+    cd /hive/data/genomes/hg18/bed/snp130Ortho/run.liftOChimp
+    para make jobList
+#Completed: 486 of 486 jobs
+#CPU time in finished jobs:      76679s    1277.99m    21.30h    0.89d  0.002 y
+#IO & Wait Time:                  1828s      30.46m     0.51h    0.02d  0.000 y
+#Average job time:                 162s       2.69m     0.04h    0.00d
+#Longest finished job:             486s       8.10m     0.14h    0.01d
+#Submission to last job:           513s       8.55m     0.14h    0.01d
+
+    # Map coords to orangutan using liftOver.
+    mkdir ../run.liftOPon
+    cd ../run.liftOPon
+    mkdir out
+    ln -s ../run.liftOChimp/split .
+    cp /dev/null jobList
+    foreach f (split/chunk*)
+      echo liftOver $f \
+        /hive/data/genomes/hg18/bed/liftOver/hg18ToPonAbe2.over.chain.gz \
+        \{check out exists out/ponAbe2.$f:t.bed\} out/hg18.$f:t.unmapped \
+        >> jobList
+    end
+    para make jobList
+#Completed: 486 of 486 jobs
+#CPU time in finished jobs:     165378s    2756.31m    45.94h    1.91d  0.005 y
+#IO & Wait Time:                  2614s      43.56m     0.73h    0.03d  0.000 y
+#Average job time:                 346s       5.76m     0.10h    0.00d
+#Longest finished job:            1017s      16.95m     0.28h    0.01d
+#Submission to last job:          1051s      17.52m     0.29h    0.01d
+
+    # Map coords to macaque using liftOver.
+    mkdir ../run.liftOMac
+    cd ../run.liftOMac
+    mkdir out
+    ln -s ../run.liftOChimp/split .
+    cp /dev/null jobList
+    foreach f (split/chunk*)
+      echo liftOver $f \
+        /hive/data/genomes/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \
+        \{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \
+        >> jobList
+    end
+    para make jobList
+#Completed: 486 of 486 jobs
+#CPU time in finished jobs:       4068s      67.80m     1.13h    0.05d  0.000 y
+#IO & Wait Time:                  1944s      32.40m     0.54h    0.02d  0.000 y
+#Average job time:                  12s       0.21m     0.00h    0.00d
+#Longest finished job:              38s       0.63m     0.01h    0.00d
+#Submission to last job:           126s       2.10m     0.04h    0.00d
+
+    cd /hive/data/genomes/hg18/bed/snp130Ortho
+    # Concatenate the chimp results, sorting by chimp pos in order to
+    # efficiently access 2bit sequence in getOrthoSeq.  The output of
+    # that is then sorted by the glommed human info field, so that we
+    # can use join to combine the results across species in the next step.
+    # Ditto for orangutan and macaque.  Each command pipe takes ~5 minutes:
+    sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
+    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
+    | sort > panTro2.orthoGlom.txt
+    sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
+    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
+    | sort > ponAbe2.orthoGlom.txt
+    sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
+    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
+    | sort > rheMac2.orthoGlom.txt
+    wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
+#  11318466 panTro2.orthoGlom.txt
+#  10976821 ponAbe2.orthoGlom.txt
+#   9702063 rheMac2.orthoGlom.txt
+
+    # Use the glommed name field as a key to join up chimp and orangutan
+    # allele data.  Include the glommed name from both files because if
+    # only file 2 has a line for the key, then field 1.1 is empty.  Then
+    # plop in the orthoGlom fields from each file, which are in the same
+    # order as the chimp, orangutan and macaque columns of
+    # snp130OrthoPt2Pa2Rm2.
+    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
+      -a 1 -a 2 -e '?' \
+      panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
+    | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
+            else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
+      > tmp.txt
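+    # (Toy illustration with a made-up key K found only in the orangutan
+    # file: "join -a 2 -e '?'" emits
+    #   ? K ? ? ? ? ? <5 ponAbe2 fields>
+    # and the awk then picks $2, the non-"?" copy of the key.)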
+    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
+      -a 1 -a 2 -e '?' \
+      tmp.txt rheMac2.orthoGlom.txt \
+    | perl -wpe 'chomp; \
+        ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
+         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
+         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
+        $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
+        ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
+          split(/\|/, $glomKey); \
+        $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
+        $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
+        print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
+                         $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
+                         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
+                         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
+        s/^.*$//;' \
+    | sort -k1,1 -k2n,2n > snp130OrthoPt2Pa2Rm2.bed
+#300.357u 31.419s 4:33.00 121.5% 0+0k 0+0io 0pf+0w
+    wc -l snp130OrthoPt2Pa2Rm2.bed
+#11797184 snp130OrthoPt2Pa2Rm2.bed
+
+    cd /hive/data/genomes/hg18/bed/snp130Ortho
+    hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
+      -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
+      hg18 snp130OrthoPt2Pa2Rm2 snp130OrthoPt2Pa2Rm2.bed
+#Loaded 11797184 elements of size 22
+#83.624u 9.627s 10:19.26 15.0%   0+0k 0+0io 0pf+0w
+
+    # Cleanup fileserver:
+    cd /hive/data/genomes/hg18/bed/snp130Ortho
+    nice gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed
+    rm -r run*/split tmp.txt *.orthoGlom.txt
+
+
+############################################################################
 # TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from: