src/hg/makeDb/doc/hg19.txt 1.94

1.94 2010/03/31 14:58:03 angie
DGV v9 (first time DGV is in hg19). Also a bit more info about how Belinda mapped over the personal genome variants track from hg18.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.93
retrieving revision 1.94
diff -b -B -U 4 -r1.93 -r1.94
--- src/hg/makeDb/doc/hg19.txt	29 Mar 2010 22:27:51 -0000	1.93
+++ src/hg/makeDb/doc/hg19.txt	31 Mar 2010 14:58:03 -0000	1.94
@@ -8462,11 +8462,78 @@
 
 #########################################################################
 # PERSONAL GENOME VARIANTS (DONE 12/29/09 giardine)
 
-# liftOver track files in /hive/data/genomes/hg18/bed/pgSnp/
+    # This is Angie's attempt to reconstruct Belinda's steps:
+    mkdir /hive/data/genomes/hg19/bed/pgSnpLiftOver
+    cd /hive/data/genomes/hg19/bed/pgSnpLiftOver
+    # liftOver track files in /hive/data/genomes/hg18/bed/pgSnp/
+    set hg18Dir = /hive/data/genomes/hg18/bed/pgSnp
+    set chainFile = /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz
+
+    #*** Are we losing insertions here?:
+    foreach f (NA12878.pgSnp NA12891.pgSnp NA12892.pgSnp NA19240.pgSnp \
+               pgWatson.bed pgYri3.txt pgSnpYh.txt)
+      liftOver $hg18Dir/$f $chainFile $f:r.hg19.pgSnp{,.unmapped}
+    end
+    liftOver $hg18Dir/koref.sub.pgSnp $chainFile koref.hg19.pgSnp{,.unmapped}
+    # Why pgVenter2?
+    liftOver $hg18Dir/pgVenter.bed $chainFile pgVenter2.hg19.pgSnp{,.unmapped}
+ 
+    # remove variants that are homozygous matches to hg19
+cat > addRefNt.pl <<'_EOF_'
+#!/usr/bin/perl -w
+use strict;
+
+my $build = 'hg19';
+my $nib = "/hive/data/genomes/hg19/nib/";
+my $nibFrag = "nibFrag";
+
+while (<>) {
+   chomp;
+   my @f = split(/\t/);
+   my $ref = '';
+   if ($f[1] eq $f[2]) {
+      $ref = '.'; #insertion nothing in ref
+   }else {
+      open(NIB, "$nibFrag $nib$f[0].nib $f[1] $f[2] + stdout |")
+         or die "Couldn't run $nibFrag, $!\n";
+      while(<NIB>) {
+         chomp;
+         if (/^>/) { next; }
+         $ref .= $_;
+      }
+      close NIB or die "Couldn't close $nibFrag, $!\n";
+   }
+   #splice(@f, 3, 0, uc($ref));
+   #print join("\t", @f), "\n";
+   print join("\t", @f), "\t", uc($ref), "\n";
+}
+'_EOF_'
+    # << emacs
+    chmod a+x addRefNt.pl
+    foreach f (*.pgSnp)
+      addRefNt.pl $f \
+      | perl -wpe '@w=split; s/^.*\n$// if ($w[3] eq $w[6]); s/\t\w+$//;' \
+        > $f:r.filtered.pgSnp
+    end
+#TODO: complete attempt to reverse-engineer Belinda's work
+    BelindasFix pgYri3.hg19.filtered.pgSnp > pgYri3.hg19.filtered.fixed.pgSnp
+
+    # Load into db:
+    foreach i (NA12878 NA12891 NA12892 NA19240 Watson)
+      hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
+        hg19 pg$i $i.hg19.filtered.pgSnp
+    end
+    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
+      hg19 pgVenter pgVenter2.hg19.filtered.pgSnp
+    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
+      hg19 pgYh1 pgSnpYh1.hg19.filtered.pgSnp
+    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
+      hg19 pgSjk koref.hg19.filtered.pgSnp
+    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
+      hg19 pgYoruban3 pgYri3.hg19.filtered.fixed.pgSnp
 
-# remove variants that are homozygous matches to hg19
 
 ########################################################################
 # CRG MAPABILITY (2010-01-19 - 2010-01-28, hartera, DONE)
 # Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca
@@ -8716,8 +8783,45 @@
       'INSERT into targetDb values("hg19Kg", "UCSC Genes", \
          "hg19", "kgTargetAli", "", "", \
          "/gbdb/hg19/targetDb/kgTargetSeq.2bit", 1, now(), "");'
 
+########################################################################
+# DGV V9 (DATABASE OF GENOMIC VARIANTS) (DONE 3/26/10 angie)
+    ssh hgwdev
+    mkdir /hive/data/genomes/hg19/bed/dgv.v9
+    cd /hive/data/genomes/hg19/bed/dgv.v9
+    wget --timestamping \
+      http://projects.tcag.ca/variation/downloads/variation.hg19.v9.mar.2010.txt
+    wget --timestamping \
+      http://projects.tcag.ca/variation/downloads/indel.hg19.v9.mar.2010.txt
+    # shuffle fields into bed8+
+    foreach f (*.v9.*.txt)
+      tail -n +2 $f \
+      | perl -wpe 'chomp; \
+        ($id, $landmark, $chr, $start, $end, $varType, \
+         undef, undef, undef, $ref, $pmid, $method, \
+         $gain, $loss, undef, undef, $sample) = split("\t"); \
+        $id =~ s/^Variation_//; \
+        $start-- unless ($start == 0); \
+        $landmark = "" if ($landmark =~ /^chr.*\d\.\.\d/); \
+        $rgb = ($varType =~ /^Inv/) ? "200,0,200" : "0,200,0"; \
+        if ($gain ne "" || $loss ne "") { \
+          $gain =~ s/^(NA)? ?$/0/;  $loss =~ s/^(NA)? ?$/0/; \
+          $rgb = "200,0,0" if ($gain > 0 && $loss == 0); \
+          $rgb = "0,0,200" if ($loss > 0 && $gain == 0); \
+        } \
+        $_ = join("\t", $chr, $start, $end, $id, 0, "+", \
+                  $start, $start, $rgb, $landmark, $varType, \
+                  $ref, $pmid, $method, $sample) . "\n";' \
+          > $f:r.bed
+    end
+    hgLoadBed hg19 dgv *.bed \
+      -sqlTable=$HOME/kent/src/hg/lib/dgv.sql -tab
+#Loaded 85689 elements of size 15
+      hgsql hg19 -NBe 'select count(distinct(pubMedId)) from dgv;'
+#38
+
+
 #######################################################################
 # felCatV17e Cat BLASTZ/CHAIN/NET (DONE  - 2010-03-22 - Chin)
     screen # use a screen to manage this multi-day job
     mkdir /hive/data/genomes/hg19/bed/lastzFelCatV17e.2010-03-22