src/hg/makeDb/doc/hg19.txt 1.101

1.101 2010/04/16 17:47:00 angie
Added snp131OrthoPt2Pa2Rm2 (mappings to other primate reference assemblies).
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.100
retrieving revision 1.101
diff -b -B -U 4 -r1.100 -r1.101
--- src/hg/makeDb/doc/hg19.txt	15 Apr 2010 22:06:57 -0000	1.100
+++ src/hg/makeDb/doc/hg19.txt	16 Apr 2010 17:47:00 -0000	1.101
@@ -9537,4 +9537,162 @@
 #TODO: go through those and send some bug reports to dbSNP.
 
 
 ############################################################################
+# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP131 (DONE 4/15/10 angie)
+    mkdir /hive/data/genomes/hg19/bed/snp131Ortho
+    cd /hive/data/genomes/hg19/bed/snp131Ortho
+
+    # Following Heather's lead in snp126orthos, filter SNPs to to keep
+    # only those with class=single, length=1, chrom!~random;
+    # Exclude those with exceptions MultipleAlignments,
+    # SingleClassTriAllelic or SingleClassQuadAllelic.
+    # Unlike snp masking, we do not filter for weight -- don't know why.
+    awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
+      /hive/data/outside/dbSNP/131/human/snp131Exceptions.bed \
+    | sort -u \
+      > snp131ExcludeIds.txt
+    awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
+      /hive/data/outside/dbSNP/131/human/snp131.bed \
+    | grep -vFwf snp131ExcludeIds.txt \
+      > snp131Simple.bed
+#333.829u 11.879s 3:57.31 145.6% 0+0k 0+0io 0pf+0w
+    wc -l snp131Simple.bed
+#17337248 snp131Simple.bed
+
+    # Glom all human info that we need for the final table onto the
+    # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
+    awk 'BEGIN{OFS="\t";} \
+        {print $1, $2, $3, \
+               $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
+               0, $6;}' \
+      snp131Simple.bed > snp131ForLiftOver.bed
+
+    # Map coords to chimp using liftOver.
+    # I don't know why chimp took so much longer than macaque... the
+    # chimp .over has fewer chains and fewer bytes than the macaque .over.
+    mkdir run.liftOChimp
+    cd run.liftOChimp
+    mkdir split out
+    splitFile ../snp131ForLiftOver.bed 25000 split/chunk
+    cp /dev/null jobList
+    foreach f (split/chunk*)
+      echo liftOver $f \
+        /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
+        \{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
+        >> jobList
+    end
+    ssh pk
+    cd /hive/data/genomes/hg19/bed/snp131Ortho/run.liftOChimp
+    para make jobList
+#Completed: 694 of 694 jobs
+#CPU time in finished jobs:     114883s    1914.72m    31.91h    1.33d  0.004 y
+#IO & Wait Time:                  3183s      53.05m     0.88h    0.04d  0.000 y
+#Average job time:                 170s       2.84m     0.05h    0.00d
+#Longest finished job:             460s       7.67m     0.13h    0.01d
+#Submission to last job:           525s       8.75m     0.15h    0.01d
+
+    # Map coords to orangutan using liftOver.
+    mkdir ../run.liftOPon
+    cd ../run.liftOPon
+    mkdir out
+    ln -s ../run.liftOChimp/split .
+    cp /dev/null jobList
+    foreach f (split/chunk*)
+      echo liftOver $f \
+        /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
+        \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
+        >> jobList
+    end
+    para make jobList
+#Completed: 694 of 694 jobs
+#CPU time in finished jobs:     224315s    3738.58m    62.31h    2.60d  0.007 y
+#IO & Wait Time:                 18237s     303.95m     5.07h    0.21d  0.001 y
+#Average job time:                 349s       5.82m     0.10h    0.00d
+#Longest finished job:            1010s      16.83m     0.28h    0.01d
+#Submission to last job:          1024s      17.07m     0.28h    0.01d
+
+    # Map coords to macaque using liftOver.
+    mkdir ../run.liftOMac
+    cd ../run.liftOMac
+    mkdir out
+    ln -s ../run.liftOChimp/split .
+    cp /dev/null jobList
+    foreach f (split/chunk*)
+      echo liftOver $f \
+        /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
+        \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
+        >> jobList
+    end
+    para make jobList
+#Completed: 694 of 694 jobs
+#CPU time in finished jobs:     289558s    4825.96m    80.43h    3.35d  0.009 y
+#IO & Wait Time:                 23402s     390.04m     6.50h    0.27d  0.001 y
+#Average job time:                 451s       7.52m     0.13h    0.01d
+#Longest finished job:             893s      14.88m     0.25h    0.01d
+#Submission to last job:           906s      15.10m     0.25h    0.01d
+
+    cd /hive/data/genomes/hg19/bed/snp131Ortho
+    # Concatenate the chimp results, sorting by chimp pos in order to
+    # efficiently access 2bit sequence in getOrthoSeq.  The output of
+    # that is then sorted by the glommed human info field, so that we
+    # can use join to combine chimp and macaque results in the next step.
+    # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
+    sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
+    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
+    | sort > panTro2.orthoGlom.txt
+    sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
+    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
+    | sort > ponAbe2.orthoGlom.txt
+    sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
+    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
+    | sort > rheMac2.orthoGlom.txt
+    wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
+#  16230258 panTro2.orthoGlom.txt
+#  15535287 ponAbe2.orthoGlom.txt
+#  13996256 rheMac2.orthoGlom.txt
+
+    # Use the glommed name field as a key to join up chimp and macaque
+    # allele data.  Include glommed name from both files because if only
+    # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
+    # in the orthoGlom fields from each file, which are in the same order
+    # as the chimp and macaque columns of snp131OrthoPanTro2RheMac2.
+    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
+      -a 1 -a 2 -e '?' \
+      panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
+    | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
+            else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
+      > tmp.txt
+    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
+      -a 1 -a 2 -e '?' \
+      tmp.txt rheMac2.orthoGlom.txt \
+    | perl -wpe 'chomp; \
+        ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
+         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
+         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
+        $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
+        ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
+          split(/\|/, $glomKey); \
+        $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
+        $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
+        print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
+                         $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
+                         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
+                         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
+        s/^.*$//;' \
+    | sort -k1,1 -k2n,2n > snp131OrthoPt2Pa2Rm2.bed
+#437.114u 37.309s 6:33.92 120.4% 0+0k 0+0io 0pf+0w
+    wc -l snp131OrthoPt2Pa2Rm2.bed
+#16842459 snp131OrthoPt2Pa2Rm2.bed
+
+    hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
+      -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
+      hg19 snp131OrthoPt2Pa2Rm2 snp131OrthoPt2Pa2Rm2.bed
+#Loaded 16842459 elements of size 22
+#104.012u 12.931s 7:21.75 26.4%  0+0k 0+0io 0pf+0w
+
+    # Cleanup:
+    nice gzip snp131Simple.bed snp131ExcludeIds.txt snp131ForLiftOver.bed
+    rm -r run*/split tmp.txt *.orthoGlom.txt
+
+
+############################################################################