src/hg/makeDb/doc/mm9.txt 1.105

1.105 2009/08/20 22:51:18 angie
Updates for MGI: RepTranscript, Allele, Phenotype, and IKMC.
Index: src/hg/makeDb/doc/mm9.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/mm9.txt,v
retrieving revision 1.104
retrieving revision 1.105
diff -b -B -U 4 -r1.104 -r1.105
--- src/hg/makeDb/doc/mm9.txt	4 Aug 2009 21:40:18 -0000	1.104
+++ src/hg/makeDb/doc/mm9.txt	20 Aug 2009 22:51:18 -0000	1.105
@@ -8231,13 +8231,14 @@
 # loaded by Belinda Giardine, in same manner as hg18 ORegAnno track
 
 
 ############################################################################
-# JAX/MGI TRACKS (DONE 6/11/09 angie)
-# Previously done 4/24/09 in /hive/data/genomes/mm9/bed/jax/2009_06 (not pushed)
+# JAX/MGI TRACKS (DONE 8/20/09 angie)
+# Previously done 6/11/09 in /hive/data/genomes/mm9/bed/jax/2009_06 (pushed)
+# Previously done 4/24/09 in /hive/data/genomes/mm9/bed/jax/2009_04 (not pushed)
 # Previously done 9/24/08 in /cluster/data/mm9/bed/jax/2008_09
-    mkdir -p /hive/data/genomes/mm9/bed/jax/2009_06
-    cd /hive/data/genomes/mm9/bed/jax/2009_06
+    mkdir -p /hive/data/genomes/mm9/bed/jax/2009_08
+    cd /hive/data/genomes/mm9/bed/jax/2009_08
     wget ftp://ftp.informatics.jax.org/pub/gbrowse/\*
     wget ftp://ftp.informatics.jax.org/pub/reports/MGI_PhenotypicAllele.rpt
 
     # Jax Rep Transcript track
@@ -8246,9 +8247,9 @@
     # -- aliases ~ MGI:\d+
     # Use simple perl script to uniquify transcript names and make alias.tab.
     # (Copied /hive/data/genomes/mm8/bed/jax/2007_07/parseRepTranscript.pl and
     # modified to tweak a regex for tweaked name NR_027008_Gt(ROSA)26Sor_1)
-    ./parseRepTranscript.pl SEQ_RepTransGenomic_rpt.gff \
+    ../2009_06/parseRepTranscript.pl SEQ_RepTransGenomic_rpt.gff \
     | sed -e 's/^/chr/; s/chrMT/chrM/;' \
       > jaxRepTranscript.gff
 
     # Jax Allele track
@@ -8282,32 +8283,34 @@
     rm -f jaxPhenotype.bed jaxPhenotypeAlias.tab fixJaxPhenotype.sql
     foreach f (MP_*.gff)
       set type = `echo $f:t:r \
         | perl -wpe 's/MP_[0-9]*_//; s/[_-](\w)/\u$1/g; s/^(\w)/\u$1/; \
-                    s@AdiposeTissue@Adipose@; \
-                    s@BehaviorNeurological@Behavior@; \
-                    s@CardiovascularSystem@Cardiovascular@; \
-                    s@DigestiveAlimentary@Digestive@; \
-                    s@EndocrineExocrineGland@Gland@; \
-                    s@GrowthSize@Growth Size@; \
-                    s@HearingEar@Hearing/Ear@; \
-                    s@HematopoieticSystem@Hematopoietic@; \
-                    s@HomeostasisMetabolism@Homeostasis@; \
-                    s@ImmuneSystem@Immune@; \
-                    s@LethalityEmbryonicPerinatal@Embryonic Lethal@; \
-                    s@LethalityPostnatal@Postnatal Lethal@; \
-                    s@LifeSpanPostWeaningAging@Life Span@; \
-                    s@LimbsDigitsTail@Limbs and Tail@; \
-                    s@LiverBiliarySystem@Liver and Bile@; \
-                    s@NervousSystem@Nervous System@; \
-                    s@RenalUrinarySystem@Renal/Urinary@; \
-                    s@ReproductiveSystem@Reproductive@; \
-                    s@RespiratorySystem@Respiratory@; \
-                    s@SkinCoatNails@Skin/Coat/Nails@; \
-                    s@TasteOlfaction@Taste/Smell@; \
-                    s@TouchVibrissae@Touch@; \
-                    s@Tumorigenesis@Tumorigenesis@; \
-                    s@VisionEye@Vision/Eye@;'`
+                    s@AdiposeTissue@Adipose@ || \
+                    s@BehaviorNeurological@Behavior@ || \
+                    s@CardiovascularSystem@Cardiovascular@ || \
+                    s@DigestiveAlimentary@Digestive@ || \
+                    s@EndocrineExocrineGland@Gland@ || \
+                    s@GrowthSize@Growth Size@ || \
+                    s@HearingEar@Hearing/Ear@ || \
+                    s@HematopoieticSystem@Hematopoietic@ || \
+                    s@HomeostasisMetabolism@Homeostasis@ || \
+                    s@ImmuneSystem@Immune@ || \
+                    s@LethalityEmbryonicPerinatal@Embryonic Lethal@ || \
+                    s@LethalityPostnatal@Postnatal Lethal@ || \
+                    s@LifeSpanPostWeaningAging@Life Span@ || \
+                    s@LimbsDigitsTail@Limbs and Tail@ || \
+                    s@LiverBiliarySystem@Liver and Bile@ || \
+                    s@NervousSystem@Nervous System@ || \
+                    s@RenalUrinarySystem@Renal/Urinary@ || \
+                    s@ReproductiveSystem@Reproductive@ || \
+                    s@RespiratorySystem@Respiratory@ || \
+                    s@SkinCoatNails@Skin/Coat/Nails@ || \
+                    s@TasteOlfaction@Taste/Smell@ || \
+                    s@TouchVibrissae@Touch@ || \
+                    s@Tumorigenesis@Tumorigenesis@ || \
+                    s@VisionEye@Vision/Eye@ || \
+                    m/^(?:Craniofacial|Cellular|Embryogenesis|Muscle|Normal|Other|Pigmentation|Skeleton)$/ || \
+                    die "Unrec $_";'`
       echo $type
       /hive/data/genomes/mm8/bed/jax/2006_10/parsePhenotype.pl $f \
       | ldHgGene mm9 placeholder stdin -nobin -out=stdout \
       | /cluster/bin/scripts/genePredToBed \
@@ -8319,8 +8322,11 @@
 
     # Jax QTL track
     # QTL*.gff --> jaxQtl2 (or 3?)... but we're missing MIT SSLP marker
     # and CM distance for 2, or those plus flanking markers for 3...
+    cmp MGI_QTL.gff ../2009_06/MGI_QTL.gff
+    # cmp printed nothing (file is identical to the 2009_06 copy), so skip this part:
+    if (0) then
     perl -wpe 'chomp; s/\s*$//; \
       ($c, undef, undef, $start, $end, undef, $strand, undef, $info) = \
         split("\t"); \
       if ($info =~ /QTL (\S+); Dbxref "(MGI:\d+)"; Alias .*; Note "([^"]+)"/) { \
@@ -8329,8 +8335,9 @@
       if ($start > $end) { $tmp = $end; $end = $start; $start = $tmp; } \
       $start-- unless $start == 0; \
       s/^.*$/chr$c\t$start\t$end\t$name\t1000\t$strand\t\t$mgiID\t$desc\t0.0\n/;' \
       MGI_QTL.gff > jaxQtl.bed
+    endif
 
     # Extract phenotype-allele relationships:
     # Make a file for the one code not already in a filename:
     cp /dev/null MP_0003012_no_phenotypic_analysis
@@ -8343,47 +8350,37 @@
 
     # Load tables
     # jaxRepTranscript
     ldHgGene mm9 jaxRepTranscript jaxRepTranscript.gff
-#35313 gene predictions
+#35505 gene predictions
     hgsql mm9 < fixJaxRepTranscript.sql
-    sed -e 's/genericAlias/jaxRepTranscriptAlias/g' \
-      ~/kent/src/hg/lib/genericAlias.sql > jaxRepTranscriptAlias.sql 
     hgLoadSqlTab mm9 jaxRepTranscriptAlias \
-      jaxRepTranscriptAlias.sql jaxRepTranscriptAlias.tab
+      ~/kent/src/hg/lib/genericAlias.sql jaxRepTranscriptAlias.tab
     checkTableCoords mm9 jaxRepTranscript
     # jaxAllele
     hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \
       mm9 jaxAllele jaxAllele.bed
-#Loaded 15526 elements of size 13
+#Loaded 15904 elements of size 13
     # fixJaxAllele.sql is empty so don't need to do this:
     # hgsql mm9 < fixJaxAllele.sql
     hgLoadSqlTab mm9 jaxAlleleInfo \
       ~/kent/src/hg/lib/jaxAlleleInfo.sql jaxAlleleInfo.tab
     # jaxPhenotype
     hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \
       -tab mm9 jaxPhenotype jaxPhenotype.bed
-#Loaded 32061 elements of size 13
+#Loaded 32922 elements of size 13
     # fixJaxPhenotype.sql is empty so don't need to do this:
     # hgsql mm9 < fixJaxPhenotype.sql
-    sed -e 's/genericAlias/jaxPhenotypeAlias/' \
-      ~/kent/src/hg/lib/genericAlias.sql > jaxPhenotypeAlias.sql
     hgLoadSqlTab mm9 jaxPhenotypeAlias \
-      jaxPhenotypeAlias.sql jaxPhenotypeAlias.tab
+      ~/kent/src/hg/lib/genericAlias.sql jaxPhenotypeAlias.tab
     # jaxQtl
-    hgLoadBed -tab -notItemRgb -noBin \
-      -sqlTable=$HOME/kent/src/hg/lib/jaxQtl.sql \
-      mm9 jaxQtl jaxQtl.bed
-    checkTableCoords -verbose=2 mm9 jaxQtl
-#Loaded 1892 elements of size 10
-#mm9.jaxQtl item Ath13 chr14:51915898-165887941: chromEnd > chromSize 125194864
-#mm9.jaxQtl item Ity2 chr11:145756703-145756947: chromEnd > chromSize 121843856
-    perl -wpe 's/^(\w+)\t(\d+)$/ \
-      delete from jaxQtl where chrom="$1" and chromStart >= $2; \
-      update jaxQtl set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \
-      ../../../chrom.sizes \
-    | hgsql mm9
+    cmp MGI_QTL.gff ../2009_06/MGI_QTL.gff 
+    # No output ==> file is identical to the 2009_06 copy (no data change), so skip the load:
+#    hgLoadBed -tab -notItemRgb -noBin \
+#      -sqlTable=$HOME/kent/src/hg/lib/jaxQtl.sql \
+#      mm9 jaxQtl jaxQtl.bed
     checkTableCoords -verbose=2 mm9 jaxQtl
+    # No output, good.
     # phenotype-allele relationships
     hgLoadSqlTab mm9 jaxAllelePheno \
       ~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab
 
@@ -8614,19 +8611,20 @@
 rm bed.tab
 
 
 #########################################################################
-# KOMP (KNOCKOUT MOUSE PROJECT) (DONE 5/7/09 angie)
+# KOMP/IKMC (KNOCKOUT MOUSE PROJECT became Int'l Knockout Mouse Consortium) (DONE 7/24/09 angie)
+# done 5/7/09 w/files emailed from Carol Bult 5/7
 # done 2/12/09 w/files emailed from Carol Bult 2/12
 # done 10/21/08 w/files emailed from Carol Bult 10/18
     ssh hgwdev
-    mkdir -p /hive/data/genomes/mm9/bed/komp/2009_05
-    cd /hive/data/genomes/mm9/bed/komp/2009_05
-    # Save files emailed from Carol Bult 5/7 as 
-    # ucsc.gff.zip
-    unzip ucsc.gff.zip
+    mkdir -p /hive/data/genomes/mm9/bed/komp/2009_07
+    cd /hive/data/genomes/mm9/bed/komp/2009_07
+    # Save file emailed from Carol Bult 7/24 as
+    # 20090724_ikmc.gff.gz
     # Make bed12 with itemRgb:
-    perl -we \
+    zcat 20090724_ikmc.gff.gz \
+    | perl -we \
       'while (<>) { \
          s/\r?\n$//; \
          ($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \
          if ($s eq "") { warn "$_\n";  s/^.*//; next; } # Some lines have no coords. \
@@ -8635,9 +8633,9 @@
                 ($col eq "Blue")   ? "0,0,200" : "0,0,0"; \
          $s--; \
          $id =~ s/^MGI:\d+; (\w+); .*/$1/ || die "Cant parse id \"$id\""; \
          my $geneId = join("|", $chr, $ctr, "${n}_$id"); \
-         push @{$geneBlks{$geneId}}, [$s, $e, $col]; \
+         push @{$geneBlks{$geneId}}, [$s, $e, $col] unless $e <= 0; \
       } \
       warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \
       foreach my $geneId (keys %geneBlks) { \
          my @blks = @{$geneBlks{$geneId}}; \
@@ -8656,35 +8654,40 @@
            if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \
          } \
         print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \
                    $chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \
-      }' ucsc.gff \
+      }' \
     | sort -k 1,1 -k 2n,2n > komp.bed
-#Got 26142 genes.
+#Got 32185 genes.
     # No stderr empty-coord warnings this time (no unmapped items).
     # Make an alias-style table with associated info (MGI ID and status):
-    perl -wpe 's/\r?\n$//; @w = split("\t"); \
+    zcat 20090724_ikmc.gff.gz \
+    | perl -wpe 's/\r?\n$//; @w = split("\t"); \
       if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \
+      if ($w[4] <= 0) { s/^.*//; next; } # A few lines have end=0. \
       $w[8] =~ m/^(MGI:\d+); (\w+); (\w.*)/ || die; \
       ($mgi, $designId, $status) = ($1, $2, $3); \
-      $_ = "$w[10]_$designId\t$mgi,$w[2],$status\n";' ucsc.gff \
+      $_ = "$w[10]_$designId\t$mgi,$w[2],$status\n";' \
     | sort -u > kompExtra.tab
     # Load 'em up:
     hgLoadBed mm9 komp komp.bed
-#Loaded 26142 elements of size 12
-    sed -e 's/genericAlias/kompExtra/' $HOME/kent/src/hg/lib/genericAlias.sql \
-      > kompExtra.sql
-    hgLoadSqlTab mm9 kompExtra kompExtra.sql kompExtra.tab
+#Loaded 32185 elements of size 12
+    hgLoadSqlTab mm9 kompExtra $HOME/kent/src/hg/lib/genericAlias.sql kompExtra.tab
     checkTableCoords -verbose=2 mm9 komp
 #mm9.komp item Tekt3_41479 chr11:62887195-62896116: blocks 3 and 4 overlap.
 #mm9.komp item Tekt3_41478 chr11:62887195-62896116: blocks 3 and 4 overlap.
 #mm9.komp item Tekt3_41477 chr11:62887195-62896116: blocks 3 and 4 overlap.
 #mm9.komp item Tekt3_41476 chr11:62887195-62896116: blocks 3 and 4 overlap.
-#mm9.komp has 4 records with overlapping blocks.
-    # Carol talked to the Sanger folks about that one... we can waive like last time.
+#mm9.komp item Cntn5_44827 chr9:10008998-10019351: blocks 1 and 2 overlap.
+    # Carol talked to the Sanger folks about those... please waive.
+
+    # NOTE FOR NEXT TIME: Carol noticed some very long items and is asking
+    # Sanger about them.  Here's how to check it ourselves next time:
+    hgsql mm9 -e 'select name, (chromEnd-chromStart) as length from komp \
+                  where chromEnd - chromStart > 1000000 order by length desc;'
 
     runJoiner.csh mm9 komp
-# mm9.kompExtra.name - hits 26142 of 26142 ok
+# mm9.kompExtra.name - hits 32185 of 32185 ok
 
 
 #########################################################################
 ### Affy MOE430 version 2 (DONE - 2008-09-25,10-02 - Hiram)