d78f2f29016298f39d54b536b0b51a9ff8ffbff9
hiram
  Mon Feb 12 15:07:39 2024 -0800
procedure to add selective annotations to ncbiRefSeq tables refs #32902

diff --git src/hg/makeDb/doc/mm39/gffToLink.pl src/hg/makeDb/doc/mm39/gffToLink.pl
new file mode 100755
index 0000000..ab27fb1
--- /dev/null
+++ src/hg/makeDb/doc/mm39/gffToLink.pl
@@ -0,0 +1,113 @@
+#!/usr/bin/env perl
+
+# Convert the 'ID=gene-' rows of a GFF-derived tab-separated file into rows
+# for the 19-column table layout described after __END__, written to stdout.
+# usage: gffToLink.pl [file.tsv]   (default: genome.gff.NC_005089.1.tsv)
+
+use strict;
+use warnings;
+
+my $gffFile = @ARGV ? $ARGV[0] : "genome.gff.NC_005089.1.tsv";
+# attribute tags whose value is copied unchanged into the named column
+my %columnOf = ("gene" => "name", "product" => "product",
+  "protein_id" => "protAcc", "gbkey" => "gbkey", "Note" => "note");
+
+# Read the file directly instead of through a 'grep' pipe: opening a pipe
+# succeeds even when grep can not read the file, so a missing input would
+# go unnoticed (empty output, zero exit status).
+open (my $fh, "<", $gffFile) or die "can not read $gffFile: $!";
+while (my $line = <$fh>) {
+  next unless $line =~ m/ID=gene-/;     # same filter as grep 'ID=gene-'
+  chomp $line;
+  my @field = split(/\t/, $line);
+  unless (defined $field[8]) { warn "skipping line $. of $gffFile: no attribute column\n"; next; }
+  my %row = map { $_ => "" }
+    qw(id name product mrnaAcc protAcc genbank hgnc gbkey note);
+  foreach my $tagVal (split(';', $field[8])) {
+    my ($tag, $value) = split('=', $tagVal, 2);
+    next unless defined $value;         # empty entry or tag without '='
+    if ($tag eq "ID") {
+      $value =~ s/cds-//;
+      $value =~ s/gene-//;
+      $row{id} = $value;
+      $row{mrnaAcc} = $value;           # the stripped ID is also used as mrnaAcc
+    } elsif (exists $columnOf{$tag}) {
+      $row{$columnOf{$tag}} = $value;
+    } elsif ($tag eq "Dbxref") {
+      foreach my $xref (split(',', $value)) {
+        my ($db, $acc) = split(':', $xref, 2);
+        next unless defined $acc;       # cross reference without 'db:' prefix
+        if ($db eq "GenBank") {
+          $row{genbank} = $acc;
+        } elsif ($db eq "HGNC") {
+          $acc =~ s/HGNC://;            # Dbxref has the form HGNC:HGNC:nnnn
+          $row{hgnc} = $acc;
+        }
+      }
+    }
+  }
+  # 19 columns in table order; status is always "Unknown", unset ones are empty
+  print join("\t", $row{id}, "Unknown", @row{qw(name product mrnaAcc protAcc)}, "", "",
+    @row{qw(hgnc genbank)}, "", $row{gbkey}, "", "", "", "", $row{note}, "", ""), "\n";
+}
+close ($fh) or die "error closing $gffFile: $!";
+
+__END__
+  my $id = "";
+  my $name = "";
+  my $product = "";
+  my $mrnaAcc = "";
+  my $protAcc = "";
+  my $genbank = "";
+  my $hgnc = "";
+  my $gbkey = "";
+  my $note = "";
+
+NC_012920.1     ID=cds-YP_003024026.1
+ID      cds-YP_003024026.1
+Parent  rna-ND1
+Dbxref  GenBank:YP_003024026.1,GeneID:4535,HGNC:HGNC:7455,MIM:516000
+Name    YP_003024026.1
+Note    TAA stop codon is completed by the addition of 3' A residues to the mRNA
+gbkey   CDS
+gene    ND1
+product NADH dehydrogenase subunit 1
+protein_id      YP_003024026.1
+transl_except   (pos:4261..4262%2Caa:TERM)
+transl_table    2
+
+
+chrMT.catchUp.refLink.tab
+fromNcbiRefLink.txt
+fromNcbiRefSeq.txt
+genome.gff.YP_.tsv
+ncbiRefSeqLink.desc.txt
+someData.sh
+twoTable.data.txt
+
+NC_012920.1	RefSeq	CDS	3307	4262	.	+	0	ID=cds-YP_003024026.1;Parent=rna-ND1;Dbxref=GenBank:YP_003024026.1,GeneID:4535,HGNC:HGNC:7455,MIM:516000;Name=YP_003024026.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=ND1;product=NADH dehydrogenase subunit 1;protein_id=YP_003024026.1;transl_except=(pos:4261..4262%2Caa:TERM);transl_table=2
+
++--------------+--------------+------+-----+---------+-------+
+| Field        | Type         | Null | Key | Default | Extra |
++--------------+--------------+------+-----+---------+-------+
+| id           | varchar(255) | NO   | PRI | NULL    |       |
+| status       | varchar(255) | NO   |     | NULL    |       |
+| name         | varchar(255) | NO   | MUL | NULL    |       |
+| product      | varchar(255) | NO   |     | NULL    |       |
+| mrnaAcc      | varchar(255) | NO   | MUL | NULL    |       |
+| protAcc      | varchar(255) | NO   | MUL | NULL    |       |
+| locusLinkId  | varchar(255) | NO   |     | NULL    |       |
+| omimId       | varchar(255) | NO   |     | NULL    |       |
+| hgnc         | varchar(255) | NO   |     | NULL    |       |
+| genbank      | varchar(255) | NO   |     | NULL    |       |
+| pseudo       | varchar(255) | NO   |     | NULL    |       |
+| gbkey        | varchar(255) | NO   |     | NULL    |       |
+| source       | varchar(255) | NO   |     | NULL    |       |
+| gene_biotype | varchar(255) | NO   |     | NULL    |       |
+| gene_synonym | varchar(255) | NO   |     | NULL    |       |
+| ncrna_class  | varchar(255) | NO   |     | NULL    |       |
+| note         | longblob     | NO   |     | NULL    |       |
+| description  | longblob     | NO   |     | NULL    |       |
+| externalId   | varchar(255) | NO   |     | NULL    |       |
++--------------+--------------+------+-----+---------+-------+
+