58f8eb6283281844e5a236c36026675369139720 angie Mon Jun 20 14:28:42 2011 -0700 Track #1091 (MGI/Jackson Lab Tracks update): got new data from BobSinclair at MGI. There are a couple new phenotypes, that apparently have subsumed a few of the old phenotypes; I added links to MGI's Mammalian Phenotype Browser pages from our details page's table of phenotypes. The files from MGI include both old and new phenotypes. Finally moved the 5+ year old perl scripts into the source tree instead of inlining them (or not) in mm9.txt. diff --git src/hg/jaxMgi/parseRepTranscript.pl src/hg/jaxMgi/parseRepTranscript.pl new file mode 100755 index 0000000..8898aa2 --- /dev/null +++ src/hg/jaxMgi/parseRepTranscript.pl @@ -0,0 +1,68 @@ +#!/usr/bin/perl -w +# Parse this particular flavor of GFF3 into GFF1 (stdout) +# plus an association file (alias.tab) and a fixit SQL file (fixit.sql). + +use strict; + +# Keep track of transcript names; our GFF-parsing code requires unique +# transcript names but non-unique ones are used here. Add uniquifying +# suffix. Rely on the fact that a REP_transcript line always immediately +# precedes the REP_exon lines. +my %txNameIndx; +my $tweakedName; + +my $alias = "jaxRepTranscriptAlias.tab"; +my $fixit = "fixJaxRepTranscript.sql"; +open(OUT, ">$alias") || die "Cant open $alias for writing: $!\n"; +open(SQL, ">$fixit") || die "Cant open $fixit for writing: $!\n"; +while (<>) { + chomp; + s/\s*$//; + my ($chr, undef, $type, $start, $end, undef, $strand, undef, $info) = + split("\t"); + if ($type eq "mRNA") { + my ($name1, $latter) = split(/_MGI:/, $info); + $name1 =~ s/^mRNA // + || die "line parse, $type line $.:\n$info\n"; + $latter =~ m/^(\d+)_([\w.\(\)\/-]+);/ + || die "latter parse, $type line $.:\n$latter\n"; + my ($mgiID, $name2) = ($1, $2); + my $name = $name1 . "_" . $name2; + if (defined $txNameIndx{$name}) { + $tweakedName = $name . "_" . $txNameIndx{$name}; + print SQL "update jaxRepTranscript set name = \"$name\" " . + "where name = \"$tweakedName\";\n"; + } else { + undef $tweakedName; + print OUT "$name\tMGI:$mgiID\n"; + } + $txNameIndx{$name}++; + } elsif ($type eq "exon" || $type eq "CDS") { + $type = "exon" if ($type eq "CDS"); # They set cdsStart=txStart, cdsEnd=txEnd! even for N.C. + my ($name1, $latter) = split(/_MGI:/, $info); + $name1 =~ s/^mRNA // + || die "line parse, $type line $.:\n$info\n"; + $latter =~ m/^(\d+)_([\w.\(\)\/-]+)$/ + || die "latter parse, $type line $.:\n$latter\n"; + my ($mgiID, $name2) = ($1, $2); + my $name = $name1 . "_" . $name2; + if (defined $tweakedName) { + my $quotedName = $name; $quotedName =~ s/\(/\\(/g; $quotedName =~ s/\)/\\)/g; + if ($tweakedName !~ /^${quotedName}_\d+$/) { + die "tweakedName $tweakedName does not match /^${quotedName}_\\d+\$/ like expected"; + } + $name = $tweakedName; + } + $chr =~ s/MT/M/; + if (/^Chr/) { + $chr =~ s/^Chr/chr/; + } elsif (!/^chr/) { + $chr = "chr$chr"; + } + print "$chr\tMGI\t$type\t$start\t$end\t.\t$strand\t.\t$name\n"; + } elsif ($type ne "gene") { + die "unrecognized type $type, line $."; + } +} +close(OUT); +close(SQL);