f2cc86e3506c2d5fefe00dbe85e7f05f0f33f43f
jcasper
  Wed Mar 6 11:33:33 2024 -0800
Updates for new uniProt import, refs #30476

diff --git src/hg/protein/spToDb/spToDb.c src/hg/protein/spToDb/spToDb.c
index f55d509..c368789 100644
--- src/hg/protein/spToDb/spToDb.c
+++ src/hg/protein/spToDb/spToDb.c
@@ -375,110 +375,105 @@
 	break;
 	}
     stripChar(line, ' ');
     dyStringAppend(dy, line);
     }
 spr->seq = lmCloneString(lm, dy->string);
 }
 
 static void spParseFeature(struct lineFile *lf, char *line, 
 	struct lm *lm, struct dyString *dy, struct spRecord *spr)
 /* Parse feature into record and hang it on spr. */
 {
 struct spFeature *feat;
 char *class = nextWord(&line);
 char *start = nextWord(&line);
-char *end = nextWord(&line);
+char *end;
+if (strchr(start, '.'))
+    {
+    end = strrchr(start, '.')+1;
+    }
+else
+    {
+    end = start;
+    }
+
+
+// NB - we no longer expect content on the first FT line after the coordinates
 char *type = skipLeadingSpaces(line);
+if (type != NULL && type[0] != 0)
+    errAbort("Unexpected content at end of FT line %d of %s", lf->lineIx, lf->fileName);
+
 char *FTId;
 char c;
-if (end == NULL || end[0] == 0)
-    errAbort("Short FT line %d of %s", lf->lineIx, lf->fileName);
 lmAllocVar(lm, feat);
 feat->class = lmCloneString(lm, class);
 c = *start;
 if (c == '<')
     {
     feat->softEndBits |= spFeatureStartLess;
     start += 1;
     }
 else if (c == '?')
     {
     feat->softEndBits |= spFeatureStartFuzzy;
     start += 1;
     }
 c = *end;
 if (c == '>')
     {
     feat->softEndBits |= spFeatureEndGreater;
     end += 1;
     }
 else if (c == '?')
     {
     feat->softEndBits |= spFeatureEndFuzzy;
     end += 1;
     }
 feat->start = atoi(start)-1;
 feat->end = atoi(end);
 
-if (type != NULL)
-    {
 /* Looks like multi-line type. */
-    dyStringClear(dy);
-    dyStringAppend(dy, type);
-    dyStringAppend(dy, " ");  /* space between lines */
+// NB: the description and ID are now read from /note= and /id= qualifiers on the continuation lines
 while (lineFileNext(lf, &line, NULL))
     {
     char *sig = "FT    ";	/* Extra space after FT */
     if (!startsWith(sig, line))
         {
         lineFileReuse(lf);
         break;
         }
     line = skipLeadingSpaces(line+strlen(sig));
-        FTId= strstr(line, "/FTId=");
-    	if (FTId != NULL)
-    	    {
-    	    FTId = lmCloneString(lm, FTId+6);
-	    stripLastPeriod(FTId);
-	    feat->FTId = FTId;
-	    }
-        else
-	    {
-	    dyStringAppend(dy, line);
-            dyStringAppend(dy, " "); /* space between lines */
-	    }
-	}
-    eraseTrailingSpaces(dy->string); /* remove last space */
-    stripLastPeriod(dy->string);
-    feat->type = lmCloneString(lm, dy->string);
-    }
-else
-    {
-    lineFileNext(lf, &line, NULL);
-    FTId= strstr(line, "/FTId=");
+    FTId= strstr(line, "/id=");
     if (FTId != NULL)
         {
-    	FTId = lmCloneString(lm, FTId+6);
-	stripLastPeriod(FTId);
+        FTId+=4;
+        stripEnclosingDoubleQuotes(FTId);
+        FTId = lmCloneString(lm, FTId);
         feat->FTId = FTId;
         }
-    else
+    char *typeStr = strstr(line, "/note=");
+    if (typeStr != NULL)
         {
-	lineFileReuse(lf);
+        typeStr += 6;
+        typeStr = stripEnclosingDoubleQuotes(typeStr);
+        type = lmCloneString(lm, typeStr);
 	}
+    // TODO: other qualifiers (ligand, ligand_id, ligand_label, ligand_note) are currently ignored
     }
+feat->type = type;
+
 slAddHead(&spr->featureList, feat);
 }
 
 static void parseProteinEvidence(struct lineFile *lf, struct lm *lm, struct dyString *dy, 
 	struct spRecord *spr)
 /* Parse protein evidence line[s] assuming these are already grouped into dy.  At this
  * stage they will be of form:
  *     ID: evidence description; ID: evidence description;
  * where ID is a number.   There should be a 1-1 correnspondence between ID's and
  * evidence descriptions, but we check that later, not here, since we're just parsing
  * a single one here.  The return value is setting the spProtEvList field of spr. */
 {
 char *s = dy->string;
 for (;;)
     {
@@ -671,35 +666,39 @@
 	spr->organelle = lmCloneString(lm, dy->string);
 	}
     else if (startsWith("OX", type))
         {
 	/* Taxon(s) of species with this protein. */
 	char *s;
 	struct spTaxon *taxon;
 	groupLine(lf, type, line, dy);
 	s = dy->string;
 	if (!startsWith(taxonSig, s))
 	    errAbort("Don't understand OX line %d of %s. Expecting %s", 
 	    	lf->lineIx, lf->fileName, taxonSig);
 	s += taxonSigLen;
 	while ((word = nextWord(&s)) != NULL)
 	    {
+            // The OX line sometimes contains non-numeric words; skip any token for which atoi() returns 0
+            if (atoi(word) != 0)
+                {
                 lmAllocVar(lm, taxon);
                 taxon->id = atoi(word);
                 slAddHead(&spr->taxonList, taxon);
                 }
 	    }
+	}
     else if (startsWith("OH", type))
         {
 	char *s;
 	struct spTaxon *host;
 	/* Pathogen host relationship. */
 	groupLine(lf, type, line, dy);
 	s = dy->string;
 	if (!startsWith(taxonSig, s))
 	    errAbort("Don't understand OH line %d of %s. Expecting %s", 
 	    	lf->lineIx, lf->fileName, taxonSig);
 	while ((s = stringIn(taxonSig, s)) != NULL)
 	     {
 	     s += taxonSigLen;
 	     lmAllocVar(lm, host);
 	     host->id = atoi(s);