f2cc86e3506c2d5fefe00dbe85e7f05f0f33f43f jcasper Wed Mar 6 11:33:33 2024 -0800 Updates for new uniProt import, refs #30476 diff --git src/hg/protein/spToDb/spToDb.c src/hg/protein/spToDb/spToDb.c index f55d509..c368789 100644 --- src/hg/protein/spToDb/spToDb.c +++ src/hg/protein/spToDb/spToDb.c @@ -375,110 +375,105 @@ break; } stripChar(line, ' '); dyStringAppend(dy, line); } spr->seq = lmCloneString(lm, dy->string); } static void spParseFeature(struct lineFile *lf, char *line, struct lm *lm, struct dyString *dy, struct spRecord *spr) /* Parse feature into record and hang it on spr. */ { struct spFeature *feat; char *class = nextWord(&line); char *start = nextWord(&line); -char *end = nextWord(&line); +char *end; +if (strchr(start, '.')) + { + end = strrchr(start, '.')+1; + } +else + { + end = start; + } + + +// NB - we no longer expect content on the first FT line after the coordinates char *type = skipLeadingSpaces(line); +if (type != NULL && type[0] != 0) + errAbort("Unexpected content at end of FT line %d of %s", lf->lineIx, lf->fileName); + char *FTId; char c; -if (end == NULL || end[0] == 0) - errAbort("Short FT line %d of %s", lf->lineIx, lf->fileName); lmAllocVar(lm, feat); feat->class = lmCloneString(lm, class); c = *start; if (c == '<') { feat->softEndBits |= spFeatureStartLess; start += 1; } else if (c == '?') { feat->softEndBits |= spFeatureStartFuzzy; start += 1; } c = *end; if (c == '>') { feat->softEndBits |= spFeatureEndGreater; end += 1; } else if (c == '?') { feat->softEndBits |= spFeatureEndFuzzy; end += 1; } feat->start = atoi(start)-1; feat->end = atoi(end); -if (type != NULL) - { /* Looks like multi-line type. */ - dyStringClear(dy); - dyStringAppend(dy, type); - dyStringAppend(dy, " "); /* space between lines */ +// NB: looks like they're always multi-line type now while (lineFileNext(lf, &line, NULL)) { char *sig = "FT "; /* Extra space after FT */ if (!startsWith(sig, line)) { lineFileReuse(lf); break; } line = skipLeadingSpaces(line+strlen(sig)); - FTId= strstr(line, "/FTId="); - if (FTId != NULL) - { - FTId = lmCloneString(lm, FTId+6); - stripLastPeriod(FTId); - feat->FTId = FTId; - } - else - { - dyStringAppend(dy, line); - dyStringAppend(dy, " "); /* space between lines */ - } - } - eraseTrailingSpaces(dy->string); /* remove last space */ - stripLastPeriod(dy->string); - feat->type = lmCloneString(lm, dy->string); - } -else - { - lineFileNext(lf, &line, NULL); - FTId= strstr(line, "/FTId="); + FTId= strstr(line, "/id="); if (FTId != NULL) { - FTId = lmCloneString(lm, FTId+6); - stripLastPeriod(FTId); + FTId+=4; + stripEnclosingDoubleQuotes(FTId); + FTId = lmCloneString(lm, FTId); feat->FTId = FTId; } - else + char *typeStr = strstr(line, "/note="); + if (typeStr != NULL) { - lineFileReuse(lf); + typeStr += 6; + typeStr = stripEnclosingDoubleQuotes(typeStr); + type = lmCloneString(lm, typeStr); } + // not sure what to do with other info like ligand, ligand_id, ligand_label, and ligand_note } +feat->type = type; + slAddHead(&spr->featureList, feat); } static void parseProteinEvidence(struct lineFile *lf, struct lm *lm, struct dyString *dy, struct spRecord *spr) /* Parse protein evidence line[s] assuming these are already grouped into dy. At this * stage they will be of form: * ID: evidence description; ID: evidence description; * where ID is a number. There should be a 1-1 correnspondence between ID's and * evidence descriptions, but we check that later, not here, since we're just parsing * a single one here. The return value is setting the spProtEvList field of spr. */ { char *s = dy->string; for (;;) { @@ -671,35 +666,39 @@ spr->organelle = lmCloneString(lm, dy->string); } else if (startsWith("OX", type)) { /* Taxon(s) of species with this protein. */ char *s; struct spTaxon *taxon; groupLine(lf, type, line, dy); s = dy->string; if (!startsWith(taxonSig, s)) errAbort("Don't understand OX line %d of %s. Expecting %s", lf->lineIx, lf->fileName, taxonSig); s += taxonSigLen; while ((word = nextWord(&s)) != NULL) { + // Sometimes there are words in this line, skip those + if (atoi(word) != 0) + { lmAllocVar(lm, taxon); taxon->id = atoi(word); slAddHead(&spr->taxonList, taxon); } } + } else if (startsWith("OH", type)) { char *s; struct spTaxon *host; /* Pathogen host relationship. */ groupLine(lf, type, line, dy); s = dy->string; if (!startsWith(taxonSig, s)) errAbort("Don't understand OH line %d of %s. Expecting %s", lf->lineIx, lf->fileName, taxonSig); while ((s = stringIn(taxonSig, s)) != NULL) { s += taxonSigLen; lmAllocVar(lm, host); host->id = atoi(s);