66a1a82f72f9f3295bb4007b1b5681013976a694 angie Wed Mar 20 15:20:27 2024 -0700 Overhaul struct sampleMetadata: instead of one struct member per anticipated column, use an array of column values and a shared array of column names. As we support more species, the available metadata gets more divergent so this needs to be more general. We still need to make aggregation of attributes on branches more generic / config-driven. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 7875023..b7333a3 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -734,244 +734,154 @@ for (vpn = variantPath; vpn != NULL; vpn = vpn->next) { if (vpn != variantPath) fprintf(f, " > "); struct singleNucChange *snc; for (snc = vpn->sncList; snc != NULL; snc = snc->next) { if (snc != vpn->sncList) fprintf(f, ", "); fprintf(f, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); } } } static struct hash *getSampleMetadata(char *metadataFile) -/* If config.ra defines a metadataFile, load its contents into a hash indexed by EPI ID and return; - * otherwise return NULL. */ +/* If config.ra defines a metadataFile, load its contents into a hash indexed by name (and INSDC + * nucleotide accession sans dot if present) and return the hash; otherwise return NULL. */ { struct hash *sampleMetadata = NULL; if (isNotEmpty(metadataFile) && fileExists(metadataFile)) { sampleMetadata = hashNew(0); struct lineFile *lf = lineFileOpen(metadataFile, TRUE); int headerWordCount = 0; char **headerWords = NULL; char *line; // Check for header line if (lineFileNext(lf, &line, NULL)) { if (startsWithWord("strain", line)) { char *headerLine = cloneString(line); headerWordCount = chopString(headerLine, "\t", NULL, 0); AllocArray(headerWords, headerWordCount); chopString(headerLine, "\t", headerWords, headerWordCount); } else errAbort("Missing header line from metadataFile %s", metadataFile); } - int strainIx = stringArrayIx("strain", headerWords, headerWordCount); - int epiIdIx = stringArrayIx("gisaid_epi_isl", headerWords, headerWordCount); + // Look for genbank_accession in header line so we can index by that as well int genbankIx = stringArrayIx("genbank_accession", headerWords, headerWordCount); - int dateIx = stringArrayIx("date", headerWords, headerWordCount); - int authorIx = stringArrayIx("authors", headerWords, headerWordCount); - int nCladeIx = stringArrayIx("Nextstrain_clade", headerWords, headerWordCount); - int gCladeIx = stringArrayIx("GISAID_clade", headerWords, headerWordCount); - int lineageIx = stringArrayIx("pangolin_lineage", headerWords, headerWordCount); - if (lineageIx < 0) - lineageIx = stringArrayIx("pango_lineage", headerWords, headerWordCount); - int countryIx = stringArrayIx("country", headerWords, headerWordCount); - int divisionIx = stringArrayIx("division", headerWords, headerWordCount); - int locationIx = stringArrayIx("location", headerWords, headerWordCount); - int countryExpIx = stringArrayIx("country_exposure", headerWords, headerWordCount); - int divExpIx = stringArrayIx("division_exposure", headerWords, headerWordCount); - int origLabIx = stringArrayIx("originating_lab", headerWords, headerWordCount); - int subLabIx = stringArrayIx("submitting_lab", headerWords, headerWordCount); - int regionIx = stringArrayIx("region", headerWords, headerWordCount); - int nCladeUsherIx = stringArrayIx("Nextstrain_clade_usher", headerWords, headerWordCount); - int lineageUsherIx = stringArrayIx("pango_lineage_usher", headerWords, headerWordCount); - int authorsIx = stringArrayIx("authors", headerWords, headerWordCount); - int pubsIx = stringArrayIx("publications", headerWords, headerWordCount); - int nLineageIx = stringArrayIx("Nextstrain_lineage", headerWords, headerWordCount); - int gnCladeIx = stringArrayIx("goya_nextclade", headerWords, headerWordCount); - int rnCladeIx = stringArrayIx("GCC_nextclade", headerWords, headerWordCount); - int guCladeIx = stringArrayIx("goya_usher", headerWords, headerWordCount); - int ruCladeIx = stringArrayIx("GCC_usher", headerWords, headerWordCount); - int rtCladeIx = stringArrayIx("GCC_assigned_2023-11", headerWords, headerWordCount); while (lineFileNext(lf, &line, NULL)) { - char *words[headerWordCount]; - int wordCount = chopTabs(line, words); - lineFileExpectWords(lf, headerWordCount, wordCount); struct sampleMetadata *met; AllocVar(met); - if (strainIx >= 0) - met->strain = cloneString(words[strainIx]); - if (epiIdIx >= 0) - met->epiId = cloneString(words[epiIdIx]); - if (genbankIx >= 0 && !sameString("?", words[genbankIx])) - met->gbAcc = cloneString(words[genbankIx]); - if (dateIx >= 0) - met->date = cloneString(words[dateIx]); - if (authorIx >= 0) - met->author = cloneString(words[authorIx]); - if (nCladeIx >= 0) - met->nClade = cloneString(words[nCladeIx]); - if (gCladeIx >= 0) - met->gClade = cloneString(words[gCladeIx]); - if (lineageIx >= 0) - met->lineage = cloneString(words[lineageIx]); - if (countryIx >= 0) - met->country = cloneString(words[countryIx]); - if (divisionIx >= 0) - met->division = cloneString(words[divisionIx]); - if (locationIx >= 0) - met->location = cloneString(words[locationIx]); - if (countryExpIx >= 0) - met->countryExp = cloneString(words[countryExpIx]); - if (divExpIx >= 0) - met->divExp = cloneString(words[divExpIx]); - if (origLabIx >= 0) - met->origLab = cloneString(words[origLabIx]); - if (subLabIx >= 0) - met->subLab = cloneString(words[subLabIx]); - if (regionIx >= 0) - met->region = cloneString(words[regionIx]); - if (nCladeUsherIx >= 0) - met->nCladeUsher = cloneString(words[nCladeUsherIx]); - if (lineageUsherIx >= 0) - met->lineageUsher = cloneString(words[lineageUsherIx]); - if (authorsIx >= 0) - met->authors = cloneString(words[authorsIx]); - if (pubsIx >= 0) - met->pubs = cloneString(words[pubsIx]); - if (nLineageIx >= 0) - met->nLineage = cloneString(words[nLineageIx]); - // For RSV, use lineage for GCC clades and nClade for Goya clades. - // This is getting ugly and we really should specify metadata columns in config.ra files. - if (gnCladeIx >= 0) - met->nClade = cloneString(words[gnCladeIx]); - if (rnCladeIx >= 0) - met->lineage = cloneString(words[rnCladeIx]); - if (guCladeIx >= 0) - met->nCladeUsher = cloneString(words[guCladeIx]); - if (ruCladeIx >= 0) - met->lineageUsher = cloneString(words[ruCladeIx]); - // Uglier still, use gClade to store GCC designations because it's left over. - if (rtCladeIx >= 0) - met->gClade = cloneString(words[rtCladeIx]); - // If epiId and/or genbank ID is included, we'll probably be using that to look up items. - if (epiIdIx >= 0 && !isEmpty(words[epiIdIx])) - hashAdd(sampleMetadata, words[epiIdIx], met); - if (genbankIx >= 0 && !isEmpty(words[genbankIx]) && !sameString("?", words[genbankIx])) - { - if (strchr(words[genbankIx], '.')) + met->columnCount = headerWordCount; + met->columnNames = headerWords; + AllocArray(met->columnValues, headerWordCount); + char *lineCopy = cloneString(line); + int wordCount = chopByChar(lineCopy, '\t', met->columnValues, headerWordCount); + lineFileExpectWords(lf, headerWordCount, wordCount); + if (genbankIx >= 0 && !isEmpty(met->columnValues[genbankIx]) && + !sameString("?", met->columnValues[genbankIx])) + { + if (strchr(met->columnValues[genbankIx], '.')) { // Index by versionless accession - char copy[strlen(words[genbankIx])+1]; - safecpy(copy, sizeof copy, words[genbankIx]); + char copy[strlen(met->columnValues[genbankIx])+1]; + safecpy(copy, sizeof copy, met->columnValues[genbankIx]); char *dot = strchr(copy, '.'); *dot = '\0'; hashAdd(sampleMetadata, copy, met); } else - hashAdd(sampleMetadata, words[genbankIx], met); + hashAdd(sampleMetadata, met->columnValues[genbankIx], met); } - if (strainIx >= 0 && !isEmpty(words[strainIx])) - hashAdd(sampleMetadata, words[strainIx], met); + hashAdd(sampleMetadata, met->columnValues[0], met); } lineFileClose(&lf); } return sampleMetadata; } -char *epiIdFromSampleName(char *sampleId) -/* If an EPI_ISL_# ID is present somewhere in sampleId, extract and return it, otherwise NULL. */ -{ -char *epiId = cloneString(strstr(sampleId, "EPI_ISL_")); -if (epiId) - { - char *p = epiId + strlen("EPI_ISL_"); - while (isdigit(*p)) - p++; - *p = '\0'; - } -return epiId; -} - char *gbIdFromSampleName(char *sampleId) /* If a GenBank accession is present somewhere in sampleId, extract and return it, otherwise NULL. */ { char *gbId = NULL; regmatch_t substrs[2]; // If it's preceded by anything, make sure it's a | so we ignore isolate names that glom on the // reference's GenBank accession in them (e.g. ..._MN908947.3/2022|OQ070230.1). if (regexMatchSubstr(sampleId, "^(.*\\|)?([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs))) { // Make sure there is a word boundary at the end of the match too if (!isalnum(sampleId[substrs[1].rm_eo])) gbId = cloneStringZ(sampleId+substrs[1].rm_so, substrs[1].rm_eo - substrs[1].rm_so); } return gbId; } struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ { struct sampleMetadata *met = NULL; if (sampleMetadata == NULL) return NULL; -char *epiId = epiIdFromSampleName(sampleId); -if (epiId) - met = hashFindVal(sampleMetadata, epiId); +met = hashFindVal(sampleMetadata, sampleId); +if (met) + return met; if (!met) { char *gbId = gbIdFromSampleName(sampleId); if (gbId) met = hashFindVal(sampleMetadata, gbId); } -if (!met) - met = hashFindVal(sampleMetadata, sampleId); if (!met && strchr(sampleId, '|')) { char copy[strlen(sampleId)+1]; safecpy(copy, sizeof copy, sampleId); char *words[4]; int wordCount = chopString(copy, "|", words, ArraySize(words)); if (isNotEmpty(words[0])) met = hashFindVal(sampleMetadata, words[0]); if (met == NULL && wordCount > 1 && isNotEmpty(words[1])) met = hashFindVal(sampleMetadata, words[1]); } // If it's one of our collapsed node names, dig out the example name and try that. if (!met && isdigit(sampleId[0]) && strstr(sampleId, "_from_")) { char *eg = strstr(sampleId, "_eg_"); if (eg) met = hashFindVal(sampleMetadata, eg+strlen("_eg_")); } return met; } static char *lineageForSample(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ { char *lineage = NULL; struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId); if (met) - lineage = met->lineage ? met->lineage : met->nLineage; + { + int ix; + if ((ix = stringArrayIx("pangolin_lineage", met->columnNames, met->columnCount)) >= 0 || + (ix = stringArrayIx("pango_lineage", met->columnNames, met->columnCount)) >= 0 || + (ix = stringArrayIx("Nextstrain_lineage", met->columnNames, met->columnCount)) >= 0 || + (ix = stringArrayIx("GCC_nextclade", met->columnNames, met->columnCount)) >= 0) + lineage = met->columnNames[ix]; + } return lineage; } static void printLineageLink(char *lineage, char *db) /* Print lineage with link to outbreak.info or pango-designation issue if appropriate. * Caller must handle case of empty/NULL lineage. */ { boolean isSarsCov2 = sameString(db, "wuhCor1"); if (isEmpty(lineage)) errAbort("programmer error:printLineageLink called with empty lineage"); else if (isSarsCov2 && startsWith("proposed", lineage)) printf("%s", lineage+strlen("proposed"), lineage); else if (isSarsCov2 && differentString(lineage, "None") && differentString(lineage, "Unassigned")) {