src/hg/makeDb/doc/mm9.txt 1.90

1.90 2009/02/18 03:07:03 angie
Updated komp (now called IKMC) w/new data from Carol Bult.
Index: src/hg/makeDb/doc/mm9.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/mm9.txt,v
retrieving revision 1.89
retrieving revision 1.90
diff -b -B -U 4 -r1.89 -r1.90
--- src/hg/makeDb/doc/mm9.txt	30 Jan 2009 00:09:27 -0000	1.89
+++ src/hg/makeDb/doc/mm9.txt	18 Feb 2009 03:07:03 -0000	1.90
@@ -8611,56 +8611,73 @@
 rm bed.tab
 
 
 #########################################################################
-# KOMP (KNOCKOUT MOUSE PROJECT) (DONE 10/21/08 angie)
+# KOMP (KNOCKOUT MOUSE PROJECT) (DONE 2/12/09 angie)
+# (First done 10/21/08 w/files emailed from Carol Bult 10/18; redone below w/files emailed 2/12.)
     ssh hgwdev
-    mkdir /hive/data/genomes/mm9/bed/komp
-    cd /hive/data/genomes/mm9/bed/komp
-    # Save files emailed from Carol Bult 10/18 as 
-    # CSD_GFF_10182008.txt and Regeneron_GFF_10172008.txt.
-    # Make bed9 (would be bed4, but using itemRgb):
-    perl -wpe 'chomp; @w = split("\t"); \
-      if ($w[3] eq "") { warn "$_\n";  s/^.*//; \
-                         next; } # Some lines have no coords. \
-      $color = ($w[9] eq "Yellow") ? "255,215,0" : \
-               ($w[9] eq "Green")  ? "0,240,0" : \
-               ($w[9] eq "Blue")   ? "0,0,200" : "0,0,0"; \
-      $w[3]--; \
-      $_ = join("\t", $w[0], $w[3], $w[4], $w[10], \
-                0, ".", $w[3], $w[3], $color) . "\n";' \
-      CSD_GFF_10182008.txt Regeneron_GFF_10172008.txt > komp.bed
-#chr10   .       CSD                     .       .       .       MGI:3645882;In Progress Yellow  EG633971
-#chr11   .       CSD                     .       .       .       MGI:3650936;Withdrawn/Problematic       Black   OTTMUSG00000002671
-#chr2    .       CSD                     .       .       .       MGI:3702423;Withdrawn/Problematic       Black   OTTMUSG00000016613
-#chr2    .       CSD                     .       .       .       MGI:3706568;Withdrawn/Problematic       Black   OTTMUSG00000016226
-#chr2    .       CSD                     .       .       .       MGI:1913626;In Progress Yellow  3100002L24Rik
-#chr2    .       CSD                     .       .       .       MGI:3649840;Withdrawn/Problematic       Black   OTTMUSG00000016576
-#chr4    .       CSD                     .       .       .       MGI:3645499;Withdrawn/Problematic       Black   EG433762
-#chr8    .       CSD                     .       .       .       MGI:3642269;Withdrawn/Problematic       Black   OTTMUSG00000018948
-#chr8    .       CSD                     .       .       .       MGI:3647176;Withdrawn/Problematic       Black   EG654453
-#chr11   .       Regeneron                       .       .       .       MGI:1890761;Not Started/On Hold Blue    Sox16
-#chr5    .       Regeneron                       .       .       .       MGI:3648938;Reagent(s) Available        Green   Pvrig
-#chr8    .       Regeneron                       .       .       .       MGI:2387432;Not Started/On Hold Blue    Trim61
+    mkdir -p /hive/data/genomes/mm9/bed/komp/2009_02
+    cd /hive/data/genomes/mm9/bed/komp/2009_02
+    # Save files emailed from Carol Bult 2/12 as 
+    # csd_gff_021109.gz and regeneron_gff_021109.gz
+    # Make bed12 with itemRgb:
+    zcat *.gz \
+    | perl -we \
+      'while (<>) { \
+         s/\r?\n$//; \
+         ($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \
+         if ($s eq "") { warn "$_\n";  s/^.*//; next; } # Some lines have no coords. \
+         $col = ($col eq "Yellow") ? "255,215,0" : \
+                ($col eq "Green")  ? "0,240,0" : \
+                ($col eq "Blue")   ? "0,0,200" : "0,0,0"; \
+         $s--; \
+         my $geneId = join("|", $chr, $ctr, $n, $id); \
+         push @{$geneBlks{$geneId}}, [$s, $e, $col]; \
+      } \
+      warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \
+      foreach my $geneId (keys %geneBlks) { \
+         my @blks = @{$geneBlks{$geneId}}; \
+         my ($chrom, $center, $name) = split(/\|/, $geneId); \
+         my $blkCount = @blks; \
+         @blks = sort {$a->[0] <=> $b->[0]} @blks; \
+         my $chromStart = $blks[0]->[0]; \
+         my $chromEnd = $blks[$blkCount-1]->[1]; \
+         my $color = $blks[0]->[2]; \
+         my $blkStarts = ""; \
+         my $blkSizes = ""; \
+         foreach my $blk (@blks) { \
+           my ($start, $end, $col) = @{$blk}; \
+           $blkStarts .= ($start - $chromStart) . ","; \
+           $blkSizes  .= ($end - $start) . ","; \
+           if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \
+         } \
+        print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \
+                   $chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \
+      }' \
+    | sort -k 1,1 -k 2n,2n > komp.bed
+#Got 16665 genes.
+    # No stderr empty-coord warnings this time (no unmapped items).
     # Make an alias-style table with associated info (MGI ID and status):
-    perl -wpe 'chomp; @w = split("\t"); \
+    zcat *.gz \
+    | perl -wpe 's/\r?\n$//; @w = split("\t"); \
       if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \
       $w[8] =~ m/^(MGI:\d+);\s*(\w.*)/ || die; \
       ($mgi, $status) = ($1, $2); \
       $_ = "$w[10]\t$mgi,$w[2],$status\n";' \
-      CSD_GFF_10182008.txt Regeneron_GFF_10172008.txt \
-    | sort > kompExtra.tab
+    | sort -u > kompExtra.tab
     # Load 'em up:
     hgLoadBed mm9 komp komp.bed
-#Loaded 9049 elements of size 9
+#Loaded 16665 elements of size 12
     sed -e 's/genericAlias/kompExtra/' $HOME/kent/src/hg/lib/genericAlias.sql \
       > kompExtra.sql
     hgLoadSqlTab mm9 kompExtra kompExtra.sql kompExtra.tab
     checkTableCoords -verbose=2 mm9 komp
-    # No output, good.
+#mm9.komp item Tekt3 chr11:62887195-62896116: blocks 3 and 4 overlap.
+#mm9.komp has 1 records with overlapping blocks.
+    # Carol is talking to the Sanger folks about that one; I think we can waive it.
+
     runJoiner.csh mm9 komp
-#Checking keys on database mm9
-# mm9.kompExtra.name - hits 9049 of 9049 ok
+# mm9.kompExtra.name - hits 16665 of 16665 ok
 
 
 #########################################################################
 ### Affy MOE430 version 2 (DONE - 2008-09-25,10-02 - Hiram)