src/hg/makeDb/doc/hg19.txt 1.36

1.36 2009/09/01 04:27:30 angie
Added snp130: dbSNP's provisional mapping from b36 to GRCh37 coords. Also added orthologous allele table for snp130.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.35
retrieving revision 1.36
diff -b -B -U 4 -r1.35 -r1.36
--- src/hg/makeDb/doc/hg19.txt	11 Aug 2009 17:16:53 -0000	1.35
+++ src/hg/makeDb/doc/hg19.txt	1 Sep 2009 04:27:30 -0000	1.36
@@ -5490,5 +5490,307 @@
     #	real    4m7.559s
     cat fb.tetNig2.chainHg19Link.txt 
     #	42910930 bases of 302314788 (14.194%) in intersection
 
+
+##############################################################################
+# dbSNP BUILD 130 - PROVISIONAL REMAPPING TO BUILD 37 (DONE 8/28/09 angie)
+    # /hive/data/outside/dbSNP/130/ was already set up during the hg18 run --
+    # just add hg19 coord files and go from there.
+    cd /hive/data/outside/dbSNP/130/human/data
+    alias wg wget --timestamping
+    set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/misc/exchange
+    # These are provisional files in an ad-hoc format.
+    wg $ftpSnpDb/README.txt
+    wg $ftpSnpDb/Remap_36_3_37_1.info
+    wg $ftpSnpDb/Remap_36_3_37_1.txt.gz
+    mv README.txt Remap_36_3_37_1_README
+    zcat Remap_36_3_37_1.txt.gz | wc -l
+#18823990
+
+    # Use the remapping to transform ../ucscNcbiSnp.bed into one for hg19.
+    # Useful columns, 1-based: 1=ID, 3=oldChr, 4=oldStart, 5=oldEnd,
+    # 10=newChr, 11=newStart, 12=newEnd, 13=newLocType, 14=newWeight, 16=newStrand
+    # For mappings to chr*_random, oldStart and oldEnd are empty -- skip.
+    # Sort both hg18 snp file and remap file by {rsID,chr,start} to keep them in sync.
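+    # (Optional sanity peek, not part of the original run; assumes the
+    # 1-based column layout described above.)  Print just the fields we
+    # will use from the first few remap rows:
+    zcat Remap_36_3_37_1.txt.gz | head -3 \
+    | awk -F"\t" '{print $1, $3, $4, $5, $10, $11, $12, $13, $14, $16;}'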
+    mkdir /hive/data/outside/dbSNP/130/human/hg19
+    cd /hive/data/outside/dbSNP/130/human/hg19
+    sort -k4n,4n -k1,1 -k2n,2n ../ucscNcbiSnp.bed > /data/tmp/hg18.ucscNcbiSnp.idSorted.bed
+    zcat ../data/Remap_36_3_37_1.txt.gz \
+    | sort -t "	" -k1n,1n -k3,3 -k4n,4n \
+      > /data/tmp/Remap_36_3_37_1.txt
+    perl -we \
+      'use strict; \
+       sub nextMap { \
+         my ($rsId, undef, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, \
+             $nLocType, $nWt, $nRef, $nStr);\
+         do { \
+           ($rsId, undef, $oChr, $oStart, $oEnd, undef,undef,undef,undef, \
+               $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = split("\t", <>); \
+           if (defined $nStr) { \
+             chomp $nStr; $nStr =~ tr/+-/01/; $oChr = "chr$oChr";  $nChr = "chr$nChr"; \
+           } \
+           $oStart--;  $oEnd--;  $nStart--;  $nEnd--;  # Yep. 0-based closed vs 1-based closed \
+         } while (defined $nStr && ($oEnd < 0 || $nChr eq "chrUn")); \
+         return ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, \
+                 $nLocType, $nWt, $nRef, $nStr); \
+       } # nextMap \
+       my ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = \
+         &nextMap(); \
+       my ($rCount, $oCount, $tCount) = (0, 0, 0); \
+       open(my $oldF, "/data/tmp/hg18.ucscNcbiSnp.idSorted.bed") || die; \
+       while (my ($chr, $s, $e, $id, $str, $rn,$obs,$mt,$cn,$vn,$ah,$ahse,$fc,$lt,$wt) = \
+              split("\t", <$oldF>)) { \
+         my $thisRCount = 0; \
+         while (defined $oChr && $chr eq $oChr && $s == $oStart && $e == $oEnd && $id == $rsId) { \
+           print join("\t", $nChr,$nStart,$nEnd,$id,$nStr,$nRef,$obs,$mt,$cn,$vn,$ah,$ahse,$fc, \
+                            $nLocType,$nWt,$nStart) \
+                      . "\n"; \
+           ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = \
+             &nextMap(); \
+           $thisRCount++; \
+         } \
+         if (defined $rsId && $id > $rsId) {warn "Slipped a cog"; last;} \
+         $tCount += $thisRCount; \
+         $rCount++ if ($thisRCount > 0); \
+         $oCount++; \
+       } \
+       close($oldF);  print STDERR "Replaced $rCount of $oCount inputs ($tCount outputs).\n";' \
+      /data/tmp/Remap_36_3_37_1.txt \
+    | sort -k1,1 -k2n,2n -k4,4 \
+    > /data/tmp/hg19.ucscNcbiSnp.bed
+#Replaced 18693260 of 19189750 inputs (18697579 outputs).
+#504.562u 27.037s 8:59.57 98.5%  0+0k 0+0io 0pf+0w
+    wc -l /data/tmp/hg19.ucscNcbiSnp.bed
+#  18697579 /data/tmp/hg19.ucscNcbiSnp.bed
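+    # (Optional check, not part of the original run.)  Verify the sort order
+    # produced above; -c makes sort check instead of sort, and -s disables
+    # the last-resort whole-line comparison so only the named keys matter:
+    sort -c -s -k1,1 -k2n,2n /data/tmp/hg19.ucscNcbiSnp.bed && echo in order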
+
+    # Drum roll please... translate NCBI's encoding into UCSC's, and
+    # perform a bunch of checks.  This is where developer involvement
+    # is most likely as NCBI extends the encodings used in dbSNP.
+    cd /hive/data/outside/dbSNP/130/human/hg19
+    snpNcbiToUcsc /data/tmp/hg19.ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit \
+      -1000GenomesRsIds=../data/1000GenomesRsIds.txt snp130
+#spaces stripped from observed:
+#chr12   6093134 6093134 rs41402545
+#Line 8049395 of /data/tmp/hg19.ucscNcbiSnp.bed: Encountered something that doesn't fit observedMixedFormat: GCAACTTCA
+#count of snps with weight  0 = 0
+#count of snps with weight  1 = 17042465
+#count of snps with weight  2 = 345274
+#count of snps with weight  3 = 1017906
+#count of snps with weight 10 = 291934
+#Skipped 1496 snp mappings due to errors -- see snp130Errors.bed
+#146.837u 9.867s 4:21.63 59.8%   0+0k 0+0io 0pf+0w
+    # Comparable to hg18.snp130, with some losses due to coord translation, loss of _randoms,
+    # and 1496 errors (new locType or refNCBI inconsistent with new size).
+    expr 18697579 - 291934 - 1496
+#18404149
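+    # (Optional, not part of the original run; assumes snp130Errors.bed uses
+    # the same bed4+ layout as snp130Exceptions.bed, with the reason in
+    # column 5.)  Skim the most common error reasons:
+    cut -f 5 snp130Errors.bed | sort | uniq -c | sort -nr | head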
+
+    # Move hg19.ucscNcbiSnp.bed from fast tmp to slow (today) hive:
+    gzip /data/tmp/hg19.ucscNcbiSnp.bed
+    mv /data/tmp/hg19.ucscNcbiSnp.bed.gz hg19.ucscNcbiSnp.bed.gz
+
+    # Will try to reuse hg18.snp130's giant 18G fasta file rather than duplicate it.
+
+    # Load up main track tables.
+    cd /hive/data/outside/dbSNP/130/human/hg19
+    hgLoadBed -tab -tmpDir=/data/tmp -allowStartEqualEnd \
+      hg19 snp130 -sqlTable=snp130.sql snp130.bed
+#Loaded 18404149 elements of size 17
+#115.086u 21.663s 2:32:09.98 1.4%        0+0k 0+0io 1pf+0w
+#that is freakishly long -- lots happening today w/db move, hive recovery,...
+    hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
+      hg19 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
+      snp130Exceptions.bed
+#Loaded 1982828 elements of size 5
+#10.500u 0.851s 1:13.42 15.4%    0+0k 0+0io 0pf+0w
+    hgLoadSqlTab hg19 snp130ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
+      snp130ExceptionDesc.tab
+    # Load up sequences *from hg18 file*:
+    hgLoadSqlTab hg19 snp130Seq ~/kent/src/hg/lib/snpSeq.sql ../snp130Seq.tab
+
+    # Put in a link where one would expect to find the track build dir...
+    ln -s /hive/data/outside/dbSNP/130/human/hg19 /hive/data/genomes/hg19/bed/snp130
+
+    # Look at the breakdown of exception categories:
+    cd /hive/data/outside/dbSNP/130/human/hg19
+    cut -f 5 snp130Exceptions.bed | sort | uniq -c | sort -nr
+#1350217 MultipleAlignments
+# 495981 ObservedMismatch
+#  37603 ObservedTooLong
+#  26855 SingleClassTriAllelic
+#  24443 FlankMismatchGenomeShorter
+#  17927 SingleClassLongerSpan
+#  13685 SingleClassZeroSpan
+#   6238 FlankMismatchGenomeLonger
+#   3016 DuplicateObserved
+#   2851 SingleClassQuadAllelic
+#   1777 MixedObserved
+#   1264 NamedDeletionZeroSpan
+#    508 FlankMismatchGenomeEqual
+#    329 NamedInsertionNonzeroSpan
+#    121 ObservedContainsIupac
+#     11 RefAlleleMismatch
+#      2 ObservedWrongFormat
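+    # (Optional, not part of the original run.)  The same breakdown can be
+    # pulled from the loaded table:
+    hgsql hg19 -e 'select exception, count(*) from snp130Exceptions group by exception order by count(*) desc;'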
+
+#TODO: go through those above (esp snp130Errors.bed) and send some bug reports to dbSNP.
+
+
+##############################################################################
+# ORTHOLOGOUS ALLELES IN CHIMP, ORANGUTAN AND MACAQUE FOR SNP130 (DONE 8/31/09 angie)
+    mkdir /hive/data/genomes/hg19/bed/snp130Ortho
+    cd /hive/data/genomes/hg19/bed/snp130Ortho
+
+    # Following Heather's lead in snp126orthos, filter SNPs to keep
+    # only those with class=single, length=1, chrom!~random;
+    # Exclude those with exceptions MultipleAlignments,
+    # SingleClassTriAllelic or SingleClassQuadAllelic.
+    # Unlike snp masking, we do not filter for weight -- don't know why.
+    awk '$5 ~ /^(MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic)$/ {print $4;}' \
+      /hive/data/outside/dbSNP/130/human/hg19/snp130Exceptions.bed \
+    | sort -u \
+      > snp130ExcludeIds.txt
+    awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
+      /hive/data/outside/dbSNP/130/human/hg19/snp130.bed \
+    | grep -vFwf snp130ExcludeIds.txt \
+      > snp130Simple.bed
+#203.193u 9.197s 2:57.40 119.7%  0+0k 0+0io 0pf+0w
+    wc -l snp130Simple.bed
+#12278514 snp130Simple.bed
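+    # (Optional sanity check, not part of the original run.)  Confirm that no
+    # excluded rsIds slipped through the grep -vFwf above; both inputs are
+    # sorted, so expect 0:
+    cut -f 4 snp130Simple.bed | sort -u \
+    | comm -12 - snp130ExcludeIds.txt | wc -l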
+
+    # Glom all human info that we need for the final table onto the
+    # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
+    awk 'BEGIN{OFS="\t";} \
+        {print $1, $2, $3, \
+               $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
+               0, $6;}' \
+      snp130Simple.bed > snp130ForLiftOver.bed
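+    # (Optional, not part of the original run.)  Eyeball one record to make
+    # sure the rsId|chr|start|end|obs|ref|strand glom came through:
+    head -1 snp130ForLiftOver.bed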
+    # Map coords to chimp using liftOver.  Of the three runs below, chimp
+    # finished fastest, in line with the chimp .over having fewer chains
+    # and fewer bytes than the macaque .over.
+    mkdir run.liftOChimp
+    cd run.liftOChimp
+    mkdir split out
+    splitFile ../snp130ForLiftOver.bed 25000 split/chunk
+    cp /dev/null jobList
+    foreach f (split/chunk*)
+      echo liftOver $f \
+        /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
+        \{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
+        >> jobList
+    end
+    ssh swarm
+    cd /hive/data/genomes/hg19/bed/snp130Ortho/run.liftOChimp
+    para make jobList
+#Completed: 492 of 492 jobs
+#CPU time in finished jobs:      51793s     863.22m    14.39h    0.60d  0.002 y
+#IO & Wait Time:                  3825s      63.75m     1.06h    0.04d  0.000 y
+#Average job time:                 113s       1.88m     0.03h    0.00d
+#Longest finished job:             286s       4.77m     0.08h    0.00d
+#Submission to last job:           300s       5.00m     0.08h    0.00d
+
+    # Map coords to orangutan using liftOver.
+    mkdir ../run.liftOPon
+    cd ../run.liftOPon
+    mkdir out
+    ln -s ../run.liftOChimp/split .
+    cp /dev/null jobList
+    foreach f (split/chunk*)
+      echo liftOver $f \
+        /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
+        \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
+        >> jobList
+    end
+    para make jobList
+#Completed: 492 of 492 jobs
+#CPU time in finished jobs:     125656s    2094.26m    34.90h    1.45d  0.004 y
+#IO & Wait Time:                  5413s      90.22m     1.50h    0.06d  0.000 y
+#Average job time:                 266s       4.44m     0.07h    0.00d
+#Longest finished job:             646s      10.77m     0.18h    0.01d
+#Submission to last job:           649s      10.82m     0.18h    0.01d
+
+    # Map coords to macaque using liftOver.
+    mkdir ../run.liftOMac
+    cd ../run.liftOMac
+    mkdir out
+    ln -s ../run.liftOChimp/split .
+    cp /dev/null jobList
+    foreach f (split/chunk*)
+      echo liftOver $f \
+        /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
+        \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
+        >> jobList
+    end
+    para make jobList
+#Completed: 492 of 492 jobs
+#CPU time in finished jobs:     161612s    2693.54m    44.89h    1.87d  0.005 y
+#IO & Wait Time:                  6218s     103.63m     1.73h    0.07d  0.000 y
+#Average job time:                 341s       5.69m     0.09h    0.00d
+#Longest finished job:             727s      12.12m     0.20h    0.01d
+#Submission to last job:           739s      12.32m     0.21h    0.01d
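+    # (Optional, not part of the original run.)  Compare lifted vs. dropped
+    # counts per species, e.g. for chimp; liftOver's unmapped output pairs a
+    # "#reason" comment line with each record it could not lift:
+    cat ../run.liftOChimp/out/panTro2.chunk*.bed | wc -l
+    grep -hv '^#' ../run.liftOChimp/out/hg19.chunk*.unmapped | wc -l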
+
+    cd /hive/data/genomes/hg19/bed/snp130Ortho
+    # Concatenate the chimp results, sorting by chimp pos in order to
+    # efficiently access 2bit sequence in getOrthoSeq.  The output of
+    # that is then sorted by the glommed human info field, so that we
+    # can use join to combine chimp and macaque results in the next step.
+    # Ditto for macaque and orangutan.  Each command pipe takes ~5 minutes:
+    sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
+    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
+    | sort > panTro2.orthoGlom.txt
+    sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
+    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
+    | sort > ponAbe2.orthoGlom.txt
+    sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
+    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
+    | sort > rheMac2.orthoGlom.txt
+    wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
+#  11428526 panTro2.orthoGlom.txt
+#  10861969 ponAbe2.orthoGlom.txt
+#   9694237 rheMac2.orthoGlom.txt
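+    # (Optional, not part of the original run.)  Each orthoGlom line starts
+    # with the glommed human key, followed by the ortho chr, start, end,
+    # allele and strand consumed by the joins below:
+    head -1 panTro2.orthoGlom.txt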
+
+    # Use the glommed name field as a key to join up chimp and orangutan
+    # allele data.  Include the glommed name from both files because if a
+    # key appears in only one file, the other file's name field (1.1 or
+    # 2.1) comes out as the -e placeholder "?".  Then plop in the orthoGlom
+    # fields from each file, which are in the same order as the chimp,
+    # orangutan and macaque columns of snp130OrthoPt2Pa2Rm2.
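+    # (Illustration only, not part of the original run.)  With -o, -a and
+    # -e '?', a key present in only one file still yields a full output row;
+    # e.g. given sorted toy files with a key and one value per line,
+    #   join -o '1.1 2.1 1.2 2.2' -a 1 -a 2 -e '?' 1.txt 2.txt
+    # prints "key ? val1 ?" for a key only in 1.txt and "? key ? val2" for a
+    # key only in 2.txt -- hence the awk below picks whichever of $1/$2 is
+    # not "?".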
+    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
+      -a 1 -a 2 -e '?' \
+      panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
+    | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
+            else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
+      > tmp.txt
+    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
+      -a 1 -a 2 -e '?' \
+      tmp.txt rheMac2.orthoGlom.txt \
+    | perl -wpe 'chomp; \
+        ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
+         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
+         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
+        $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
+        ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
+          split(/\|/, $glomKey); \
+        $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
+        $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
+        print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
+                         $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
+                         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
+                         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
+        s/^.*$//;' \
+    | sort -k1,1 -k2n,2n > snp130OrthoPt2Pa2Rm2.bed
+#304.434u 27.118s 4:31.30 122.2% 0+0k 0+0io 0pf+0w
+    wc -l snp130OrthoPt2Pa2Rm2.bed
+#11876029 snp130OrthoPt2Pa2Rm2.bed
+
+    cd /hive/data/genomes/hg19/bed/snp130Ortho
+    hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
+      -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
+      hg19 snp130OrthoPt2Pa2Rm2 snp130OrthoPt2Pa2Rm2.bed
+#Loaded 11876029 elements of size 22
+#75.442u 8.828s 9:50.27 14.2%    0+0k 0+0io 0pf+0w
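+    # (Optional, not part of the original run.)  Spot-check coordinates of
+    # the loaded positional table:
+    checkTableCoords hg19 snp130OrthoPt2Pa2Rm2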
+
+    # Cleanup fileserver:
+    cd /hive/data/genomes/hg19/bed/snp130Ortho
+    gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed &
+    rm -r run*/split tmp.txt *.orthoGlom.txt
+
+
 ##############################################################################