0e66591e44d669ef83fa016069b1ceb283c01800 angie Thu Jul 15 12:40:43 2021 -0700 Support more flexible ID search: accessions without dot-versions and isolate names. When loading FASTA, detect sequence names that are numeric or already in the tree and add a prefix to prevent usher from rejecting the sequences. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index fb8b18e..20cb86f 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -2356,78 +2356,116 @@ *retMetadataFile = treeChoices->metadataFiles[0]; *retSource = treeChoices->sources[0]; *retAliasFile = treeChoices->aliasFiles[0]; } } else { // Fall back on old settings *retProtobufPath = getUsherAssignmentsPath(db, TRUE); *retMetadataFile = phyloPlaceDbSettingPath(db, "metadataFile"); *retSource = "GISAID"; *retAliasFile = NULL; } } -static void addNameAndComponents(struct hash *nameHash, char *fullName) -/* Add entries to nameHash mapping fullName to itself, and components of fullName to fullName. */ +static void addModVersion(struct hash *nameHash, char *id, char *fullName) +/* Map id to fullName. If id has .version, strip that (modifying id!) and map versionless id + * to fullName. */ +{ +hashAdd(nameHash, id, fullName); +char *p = strchr(id, '.'); +if (p) + { + *p = '\0'; + hashAdd(nameHash, id, fullName); + } +} + +static void maybeAddIsolate(struct hash *nameHash, char *name, char *fullName) +/* If name looks like country/isolate/year and isolate looks sufficiently distinctive + * then also map isolate to fullName. */ +{ +regmatch_t substrs[2]; +if (regexMatchSubstr(name, "^[A-Za-z _]+/(.*)/[0-9]{4}$", substrs, ArraySize(substrs))) + { + char isolate[substrs[1].rm_eo - substrs[1].rm_so + 1]; + regexSubstringCopy(name, substrs[1], isolate, sizeof isolate); + if (! isAllDigits(isolate) && + (regexMatch(isolate, "[A-Za-z0-9]{4}") || + regexMatch(isolate, "[A-Za-z0-9][A-Za-z0-9]+[-_][A-Za-z0-9][A-Za-z0-9]+"))) + { + hashAdd(nameHash, isolate, fullName); + } + } +} +static void addNameMaybeComponents(struct hash *nameHash, char *fullName, boolean addComponents) +/* Add entries to nameHash mapping fullName to itself, and components of fullName to fullName. + * If addComponents and fullName is |-separated name|ID|date, add name and ID to nameHash. */ { char *fullNameHashStored = hashStoreName(nameHash, fullName); // Now that we have hash storage for fullName, make it point to itself. struct hashEl *hel = hashLookup(nameHash, fullName); if (hel == NULL) errAbort("Can't look up '%s' right after adding it", fullName); hel->val = fullNameHashStored; +if (addComponents) + { char *words[4]; char copy[strlen(fullName)+1]; safecpy(copy, sizeof copy, fullName); int wordCount = chopString(copy, "|", words, ArraySize(words)); if (wordCount == 3) { // name|ID|date hashAdd(nameHash, words[0], fullNameHashStored); - hashAdd(nameHash, words[1], fullNameHashStored); + maybeAddIsolate(nameHash, words[0], fullNameHashStored); + addModVersion(nameHash, words[1], fullNameHashStored); } else if (wordCount == 2) { // ID|date - hashAdd(nameHash, words[0], fullNameHashStored); + addModVersion(nameHash, words[0], fullNameHashStored); + // ID might be a COG-UK country/isolate/year + maybeAddIsolate(nameHash, words[0], fullNameHashStored); + } } } -static void rAddLeafNames(struct phyloTree *node, struct hash *condensedNodes, struct hash *nameHash) +static void rAddLeafNames(struct phyloTree *node, struct hash *condensedNodes, struct hash *nameHash, + boolean addComponents) /* Recursively descend tree, adding leaf node names to nameHash (including all names of condensed * leaf nodes). Also map components of full names (country/isolate/year|ID|date) to full names. */ { if (node->numEdges == 0) { char *leafName = node->ident->name; struct slName *nodeList = hashFindVal(condensedNodes, leafName); if (nodeList) { struct slName *sample; for (sample = nodeList; sample != NULL; sample = sample->next) - addNameAndComponents(nameHash, sample->name); + addNameMaybeComponents(nameHash, sample->name, addComponents); } else - addNameAndComponents(nameHash, leafName); + addNameMaybeComponents(nameHash, leafName, addComponents); } else { int i; for (i = 0; i < node->numEdges; i++) - rAddLeafNames(node->edges[i], condensedNodes, nameHash); + rAddLeafNames(node->edges[i], condensedNodes, nameHash, addComponents); } } static void addAliases(struct hash *nameHash, char *aliasFile) /* If there is an aliasFile, then add its mappings of ID/alias to full tree name to nameHash. */ { if (isNotEmpty(aliasFile) && fileExists(aliasFile)) { struct lineFile *lf = lineFileOpen(aliasFile, TRUE); int missCount = 0; char *missExample = NULL; char *line; while (lineFileNextReal(lf, &line)) { char *words[3]; @@ -2438,37 +2476,38 @@ hashAdd(nameHash, words[0], fullName); else { missCount++; if (missExample == NULL) missExample = cloneString(words[1]); } } lineFileClose(&lf); if (missCount > 0) fprintf(stderr, "aliasFile %s: %d values in second column were not found in tree, " "e.g. '%s'", aliasFile, missCount, missExample); } } -static struct hash *getTreeNames(struct mutationAnnotatedTree *bigTree, char *aliasFile) -/* Make a hash of full names of leaves of bigTree; also map components of those names to the - * full names in case the user gives us partial names. */ +static struct hash *getTreeNames(struct mutationAnnotatedTree *bigTree, char *aliasFile, + boolean addComponents) +/* Make a hash of full names of leaves of bigTree; if addComponents, also map components of those + * names to the full names in case the user gives us partial names/IDs. */ { int nodeCount = bigTree->nodeHash->elCount; struct hash *nameHash = hashNew(digitsBaseTwo(nodeCount) + 3); -rAddLeafNames(bigTree->tree, bigTree->condensedNodes, nameHash); +rAddLeafNames(bigTree->tree, bigTree->condensedNodes, nameHash, addComponents); addAliases(nameHash, aliasFile); return nameHash; } static char *matchName(struct hash *nameHash, char *name) /* Look for a possibly partial name or ID provided by the user in nameHash. Return the result, * possibly NULL. If the full name doesn't match, try components of the name. */ { name = trimSpaces(name); // GISAID fasta headers all have hCoV-19/country/isolate/year|EPI_ISL_#|date; strip the hCoV-19 // because Nextstrain strips it in nextmeta/nextfasta download files, and so do I when building // UCSC's tree. if (startsWithNoCase("hCoV-19/", name)) name += strlen("hCoV-19/"); char *match = hashFindVal(nameHash, name); @@ -2512,31 +2551,31 @@ safecpy(copy, sizeof copy, name); stripChar(copy, ' '); match = hashFindVal(nameHash, copy); } return match; } static struct slName *readSampleIds(struct lineFile *lf, struct mutationAnnotatedTree *bigTree, char *aliasFile) /* Read a file of sample names/IDs from the user; typically these will not be exactly the same * as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt * to find component matches if an exact match isn't found. */ { struct slName *sampleIds = NULL; struct slName *unmatched = NULL; -struct hash *nameHash = getTreeNames(bigTree, aliasFile); +struct hash *nameHash = getTreeNames(bigTree, aliasFile, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { // If tab-sep or comma-sep, just try first word in line char *tab = strchr(line, '\t'); if (tab) *tab = '\0'; else { char *comma = strchr(line, ','); if (comma) *comma = '\0'; } char *match = matchName(nameHash, line); if (match) @@ -2594,31 +2633,32 @@ if (! bigTree) { warn("Problem parsing %s; can't make subtree subtracks.", protobufPath); } lineFileCarefulNewlines(lf); struct slName **maskSites = getProblematicSites(db); struct dnaSeq *refGenome = hChromSeq(db, chrom, 0, chromSize); boolean isFasta = FALSE; boolean subtreesOnly = FALSE; struct seqInfo *seqInfoList = NULL; if (lfLooksLikeFasta(lf)) { boolean *informativeBases = informativeBasesFromTree(bigTree->tree, maskSites); struct slPair *failedSeqs; struct slPair *failedPsls; - vcfTn = vcfFromFasta(lf, db, refGenome, informativeBases, maskSites, + struct hash *treeNames = getTreeNames(bigTree, NULL, FALSE); + vcfTn = vcfFromFasta(lf, db, refGenome, informativeBases, maskSites, treeNames, &sampleIds, &seqInfoList, &failedSeqs, &failedPsls, &startTime); if (failedSeqs) { puts("<p>"); struct slPair *fail; for (fail = failedSeqs; fail != NULL; fail = fail->next) printf("%s<br>\n", fail->name); puts("</p>"); } if (failedPsls) { puts("<p>"); struct slPair *fail; for (fail = failedPsls; fail != NULL; fail = fail->next) printf("%s<br>\n", fail->name); @@ -2808,21 +2848,17 @@ puts(" (JSON file)</a>"); } puts("</ul>"); if (!subtreesOnly) { // Notify in opposite order of custom track creation. puts("<h3>Custom tracks for viewing in the Genome Browser</h3>"); printf("<p>Added custom track of uploaded samples.</p>\n"); if (subtreeCount > 0 && subtreeCount <= MAX_SUBTREE_CTS) printf("<p>Added %d subtree custom track%s.</p>\n", subtreeCount, (subtreeCount > 1 ? "s" : "")); ctFile = urlFromTn(ctTn); } } -else - { - warn("No subtree output from usher.\n"); - } return ctFile; }