6da76acba7f617920cc7eaa161b7d987362be78f
markd
  Wed Feb 14 11:29:46 2018 -0800
hide patent GenBank mRNA sequences by default based on accession prefix

diff --git src/hg/lib/genbank.c src/hg/lib/genbank.c
index 553c1f8..149efee 100644
--- src/hg/lib/genbank.c
+++ src/hg/lib/genbank.c
@@ -387,15 +387,57 @@
 if (geneStrand == '+')
     {
     genomeCds.startComplete = startComplete;
     genomeCds.endComplete = endComplete;
     }
 else
     {
     genomeCds.startComplete = endComplete;
     genomeCds.endComplete = startComplete;
     }
 if (psl->strand[1] == '-')
     reverseIntRange(&genomeCds.start, &genomeCds.end, psl->tSize);
 return genomeCds;
 }
 
+/* NULL-terminated list of accession prefixes indicating patent sequences,
+ * taken from: https://www.ncbi.nlm.nih.gov/Sequin/acc.html
+ */
+static char *genbankPatentPrefixes[] =
+{
+    "E", "BD", "DD", "DI", "DJ", "DL", "DM", "FU", "FV", "FW", "FZ", "GB",
+    "HV", "HW", "HZ", "LF", "LG", "LV", "LX", "LY", "LZ", "MA", "MB", "A",
+    "AX", "CQ", "CS", "FB", "GM", "GN", "HA", "HB", "HC", "HD", "HH", "HI",
+    "JA", "JB", "JC", "JD", "JE", "LP", "LQ", "MP", "MQ", "MR", "MS", "I",
+    "AR", "DZ", "EA", "GC", "GP", "GV", "GX", "GY", "GZ", "HJ", "HK", "HL",
+    "KH", "MI", NULL
+};
+
+static struct hash *genbankPatentPrefixesHash = NULL;  /* NULL until built on first lookup */
+
+static void makeGenbenkPatentPrefixHash(void)
+/* Fill genbankPatentPrefixesHash with every entry of the prefix table. */
+{
+char **prefix;
+genbankPatentPrefixesHash = hashNew(0);
+for (prefix = genbankPatentPrefixes; *prefix != NULL; prefix++)
+    hashAddInt(genbankPatentPrefixesHash, *prefix, TRUE);
+}
+
+boolean isGenbenkPatentAccession(char *acc)
+/* Does this accession start with a prefix allocated to patent sequences?
+ * Returns FALSE for NULL, over-long, or digit-less input. */
+{
+if (acc == NULL || strlen(acc) >= GENBANK_ACC_BUFSZ)
+    return FALSE;  // missing or too big, shouldn't happen
+if (genbankPatentPrefixesHash == NULL)
+    makeGenbenkPatentPrefixHash();
+// keep only the prefix: truncate the copy at the first digit
+char accbuf[GENBANK_ACC_BUFSZ];
+safecpy(accbuf, sizeof(accbuf), acc);
+char *numPtr = skipToNumeric(accbuf);
+// skipToNumeric stops at the terminating NUL (not NULL) when no digit exists
+if (numPtr == NULL || *numPtr == '\0')
+    return FALSE;  // doesn't look like genbank acc
+*numPtr = '\0';
+return hashLookup(genbankPatentPrefixesHash, accbuf) != NULL;
+}