58f8eb6283281844e5a236c36026675369139720
angie
  Mon Jun 20 14:28:42 2011 -0700
Track #1091 (MGI/Jackson Lab Tracks update): got new data from BobSinclair at MGI.  There are a couple new phenotypes, that apparently
have subsumed a few of the old phenotypes; I added links to MGI's
Mammalian Phenotype Browser pages from our details page's table of
phenotypes.  The files from MGI include both old and new phenotypes.
Finally moved the 5+ year old perl scripts into the source tree
instead of inlining them (or not) in mm9.txt.

diff --git src/hg/jaxMgi/parseAllele.pl src/hg/jaxMgi/parseAllele.pl
new file mode 100755
index 0000000..d8d0f67
--- /dev/null
+++ src/hg/jaxMgi/parseAllele.pl
@@ -0,0 +1,112 @@
+#!/usr/bin/perl -w
+use strict;
+# Parse this particular flavor of GFF3 (read from the files named on the
+# command line, or from stdin) into GFF1 on stdout, with the allele source
+# appended to the name after a "||" separator (for later parsing back out
+# into a bed+ field).
+# Also, *append* to an info file (jaxAlleleInfo.tab: name, MGI ID, source)
+# and a fixit SQL file (fixJaxAllele.sql: updates on table jaxAllele that set
+# a uniquified transcript name back to the original name).
+# Add uniquifying suffix to transcript names when necessary.
+# Rely on the fact that an mRNA line always immediately
+# precedes the exon lines.
+
+# Suffixes that may follow a name in column 9 as "_<suffix>"; they are
+# stripped before the name is parsed.
+my $sourceSuffixes = "Induced|Other|Spontaneous|Targeted|Transgenic";
+# Accepted GFF column 2 (source) values and the label substituted for each.
+my %gffSource = ( "AL_IND" => "Induced", "AL_OTHER" => "Other", "AL_SPON" => "Spontaneous",
+                  "AL_TARG" => "Targeted", "AL_TRANS" => "Transgenic",
+                  "MGI_DNA_GTRAP" => "GeneTrappedDna", "MGI_RNA_GTRAP" => "GeneTrappedRna", );
+
+my $alias = "jaxAlleleInfo.tab";
+my $fixit = "fixJaxAllele.sql";
+open(my $infoFh, '>>', $alias) || die "Cant open $alias for appending: $!\n";
+open(my $sqlFh, '>>', $fixit) || die "Cant open $fixit for appending: $!\n";
+# %txNameIndx: number of times each transcript name has been seen so far.
+# $tweakedName: uniquified name to give the exons of the current transcript,
+# or undef when the current transcript keeps its original name.
+my (%txNameIndx, $tweakedName);
+while (<>) {
+  chomp;
+  s/\s*$//;
+  my ($chr, $source, $type, $start, $end, undef, $strand, undef, $info) = split("\t");
+  die "Unrecognized GFF source $source" if (! defined $gffSource{$source});
+  # Normalize the sequence name: MT -> M; Chr -> chr, or else prefix "chr"
+  # when the name does not already begin with "c".
+  $chr =~ s/MT/M/;  $chr =~ s/Chr/chr/ || $chr =~ s/^([^c])/chr$1/;
+  $source = $gffSource{$source};
+  $info =~ s/Lexicon Genetics/LG/;  $info =~ s/BayGenomics/BG/;
+  # A 'Note "N-RACE"' attribute adds its digit as a suffix on the source label.
+  if ($info =~ /Note "(\d)-RACE"/) {
+    $source .= $1;
+  }
+  if ($type eq "mRNA" || $type eq "genetrap_DNA") {
+    my ($name, $mgiID);
+    $info =~ s/^([^;]+)_($sourceSuffixes);/$1;/;
+    if ($info =~ /^mRNA ([\w.&\/ -]+(\<[\w\(\). ,\*\/+-]+(\>|m1Nt))?); Dbxref "(MGI:\d+)";/) {
+      ($name, $mgiID) = ($1, $4);
+      $name =~ s/ /_/g;
+      if ($name =~ /\<.*[^\>]$/) {
+        print STDERR "Missing > for mRNA name $name\n";
+        $name = $name . ">";
+      }
+    } elsif ($info =~ /^Genetrap_DNA ([\w.&\/ -]+); Dbxref "(MGI:\d+)"/) {
+      ($name, $mgiID) = ($1, $2);
+      $name =~ s/ /_/g;
+      # A Genetrap_DNA record is written out right away as a single exon.
+      # NOTE(review): this prints the original $name even when the name is a
+      # repeat, while the SQL below refers to the suffixed name -- confirm.
+      print "$chr\tMGI\texon\t$start\t$end\t.\t$strand\t.\t$name\|\|$source\n";
+    } else {
+      die "parse, mRNA/Genetrap_DNA line $.:\n$info\n";
+    }
+    if (defined $txNameIndx{$name}) {
+      # Repeated name: suffix it with the number of earlier occurrences and
+      # record SQL that sets the suffixed name back to the original.
+      $tweakedName = $name . "_" . $txNameIndx{$name};
+      print {$sqlFh} "update jaxAllele set name = \"$name\" " .
+                     "where name = \"$tweakedName\";\n";
+    } else {
+      # First occurrence: keep the name and write its info row, with the
+      # source label expanded into a more readable form.
+      undef $tweakedName;
+      $source =~ s/GeneTrapped([DR])na/Gene trapped $1NA/;
+      $source =~ s/([35])$/ ($1'-RACE)/;
+      print {$infoFh} "$name\t$mgiID\t$source\n";
+    }
+    $txNameIndx{$name}++;
+  } elsif ($type =~ /^exon$/) {
+    my ($name);
+    $info =~ s/_($sourceSuffixes)$//;
+    if ($info =~ /^mRNA ([\w.&\/ -]+(\<[\w\(\). ,\*\/+-]+(\>|m1Nt))?)$/) {
+      $name = $1;
+      $name =~ s/ /_/g;
+      if ($name =~ /\<.*[^\>]$/) {
+        $name = $name . ">";
+      }
+    } elsif ($info =~ /^mRNA ([\w.&\/ -]+);/) {
+      $name = $1;
+      $name =~ s/ /_/g;
+    } else {
+      die "parse, exon line $.:\n$info\n";
+    }
+    if (defined $tweakedName) {
+      # Sanity check: the pending uniquified name must be this exon's name
+      # plus "_<digits>".  \Q..\E makes the name match literally; names can
+      # contain regex metacharacters such as . * + ( ).
+      if ($tweakedName !~ /^\Q$name\E_\d+$/) {
+        die "tweakedName $tweakedName does not start with name $name and " .
+            " have a numeric suffix like expected";
+      }
+      $name = $tweakedName;
+    }
+    print "$chr\tMGI\t$type\t$start\t$end\t.\t$strand\t.\t$name\|\|$source\n";
+  } elsif ($type ne "gene") {
+    # Only "gene" lines may be ignored; any other type is an error.
+    die "unrecognized type $type, line $.";
+  }
+}
+# Buffered write errors only surface at close, so check it.
+close($infoFh) || die "Cant close $alias: $!\n";
+close($sqlFh) || die "Cant close $fixit: $!\n";