0f5e8ee9ef8d65cf207610e97d1ce7c368623df5 angie Wed Jan 27 15:13:20 2021 -0800 qBaseInsert and tBaseInsert may include sequence skipped on both sides, e.g. due to NNN's. Count actual inserted and deleted bases more carefully. refs #26868 diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 2a44636..6f64fe4 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -1,1671 +1,1694 @@ /* Place SARS-CoV-2 sequences in phylogenetic tree using usher program. */ /* Copyright (C) 2020 The Regents of the University of California */ #include "common.h" #include "bigBed.h" #include "cheapcgi.h" #include "errCatch.h" #include "fa.h" #include "genePred.h" #include "hCommon.h" #include "hash.h" #include "hgConfig.h" #include "htmshell.h" #include "hui.h" #include "iupac.h" #include "jsHelper.h" #include "linefile.h" #include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" #include "psl.h" #include "ra.h" #include "regexHelper.h" #include "trashDir.h" #include "vcf.h" // Globals: static boolean measureTiming = FALSE; // wuhCor1-specific: char *chrom = "NC_045512v2"; int chromSize = 29903; // Parameter constants: int maxGenotypes = 100; // Upper limit on number of samples user can upload at once. boolean showBestNodePaths = FALSE; boolean showParsimonyScore = FALSE; char *phyloPlaceDbSetting(char *db, char *settingName) /* Return a setting from hgPhyloPlaceData//config.ra or NULL if not found. */ { static struct hash *configHash = NULL; static char *configDb = NULL; if (!sameOk(db, configDb)) { char configFile[1024]; safef(configFile, sizeof configFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", db); if (fileExists(configFile)) { configHash = raReadSingle(configFile); configDb = cloneString(db); } } if (sameOk(db, configDb)) return cloneString(hashFindVal(configHash, settingName)); return NULL; } char *phyloPlaceDbSettingPath(char *db, char *settingName) /* Return path to a file named by a setting from hgPhyloPlaceData//config.ra, * or NULL if not found. (Append hgPhyloPlaceData// to the beginning of relative path) */ { char *fileName = phyloPlaceDbSetting(db, settingName); if (isNotEmpty(fileName) && fileName[0] != '/' && !fileExists(fileName)) { struct dyString *dy = dyStringCreate(PHYLOPLACE_DATA_DIR "/%s/%s", db, fileName); if (fileExists(dy->string)) return dyStringCannibalize(&dy); else return NULL; } return fileName; } char *getUsherPath(boolean abortIfNotFound) /* Return hgPhyloPlaceData/usher if it exists, else NULL. Do not free the returned value. */ { char *usherPath = PHYLOPLACE_DATA_DIR "/usher"; if (fileExists(usherPath)) return usherPath; else if (abortIfNotFound) errAbort("Missing required file %s", usherPath); return NULL; } char *getUsherAssignmentsPath(char *db, boolean abortIfNotFound) /* If /config.ra specifies the file for use by usher --load-assignments and the file exists, * return the path, else NULL. Do not free the returned value. */ { char *usherAssignmentsPath = phyloPlaceDbSettingPath(db, "usherAssignmentsFile"); if (isNotEmpty(usherAssignmentsPath) && fileExists(usherAssignmentsPath)) return usherAssignmentsPath; else if (abortIfNotFound) errAbort("Missing required file %s", usherAssignmentsPath); return NULL; } //#*** This needs to go in a lib so CGIs know whether to include it in the menu. needs better name. boolean hgPhyloPlaceEnabled() /* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */ { char *cfgSetting = cfgOption("hgPhyloPlaceEnabled"); boolean isEnabled = (isNotEmpty(cfgSetting) && differentWord(cfgSetting, "off") && differentWord(cfgSetting, "no")); return (isEnabled && hDbExists("wuhCor1")); } static void addPathIfNecessary(struct dyString *dy, char *db, char *fileName) /* If fileName exists, copy it into dy, else try hgPhyloPlaceData//fileName */ { dyStringClear(dy); if (fileExists(fileName)) dyStringAppend(dy, fileName); else dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s", db, fileName); } struct treeChoices *loadTreeChoices(char *db) /* If /config.ra specifies a treeChoices file, load it up, else return NULL. */ { struct treeChoices *treeChoices = NULL; char *filename = phyloPlaceDbSettingPath(db, "treeChoices"); if (isNotEmpty(filename) && fileExists(filename)) { AllocVar(treeChoices); int maxChoices = 128; AllocArray(treeChoices->protobufFiles, maxChoices); AllocArray(treeChoices->metadataFiles, maxChoices); AllocArray(treeChoices->sources, maxChoices); AllocArray(treeChoices->descriptions, maxChoices); struct lineFile *lf = lineFileOpen(filename, TRUE); char *line; while (lineFileNextReal(lf, &line)) { char *words[5]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, 4, wordCount); if (treeChoices->count >= maxChoices) { warn("File %s has too many lines, only showing first %d phylogenetic tree choices", filename, maxChoices); break; } struct dyString *dy = dyStringNew(0); addPathIfNecessary(dy, db, words[0]); treeChoices->protobufFiles[treeChoices->count] = cloneString(dy->string); addPathIfNecessary(dy, db, words[1]); treeChoices->metadataFiles[treeChoices->count] = dyStringCannibalize(&dy); treeChoices->sources[treeChoices->count] = cloneString(words[2]); treeChoices->descriptions[treeChoices->count] = cloneString(words[3]); treeChoices->count++; } lineFileClose(&lf); } return treeChoices; } static char *urlFromTn(struct tempName *tn) /* Make a full URL to a trash file that our net.c code will be able to follow, for when we can't * just leave it up to the user's web browser to do the right thing with "../". */ { struct dyString *dy = dyStringCreate("%s%s", hLocalHostCgiBinUrl(), tn->forHtml); return dyStringCannibalize(&dy); } void reportTiming(int *pStartTime, char *message) /* Print out a report to stderr of how much time something took. */ { if (measureTiming) { int now = clock1000(); fprintf(stderr, "%dms to %s\n", now - *pStartTime, message); *pStartTime = now; } } static boolean lfLooksLikeFasta(struct lineFile *lf) /* Peek at file to see if it looks like FASTA, i.e. begins with a >header. */ { boolean hasFastaHeader = FALSE; char *line; if (lineFileNext(lf, &line, NULL)) { if (line[0] == '>') hasFastaHeader = TRUE; lineFileReuse(lf); } return hasFastaHeader; } static void rInformativeBasesFromTree(struct phyloTree *node, boolean *informativeBases) /* For each variant associated with a non-leaf node, set informativeBases[chromStart]. */ { if (node->numEdges > 0) { if (node->priv) { struct singleNucChange *snc, *sncs = node->priv; for (snc = sncs; snc != NULL; snc = snc->next) informativeBases[snc->chromStart] = TRUE; } int i; for (i = 0; i < node->numEdges; i++) rInformativeBasesFromTree(node->edges[i], informativeBases); } } static boolean *informativeBasesFromTree(struct phyloTree *bigTree, struct slName **maskSites) /* Return an array indexed by reference position with TRUE at positions that have a non-leaf * variant in bigTree. */ { boolean *informativeBases; AllocArray(informativeBases, chromSize); if (bigTree) { rInformativeBasesFromTree(bigTree, informativeBases); int i; for (i = 0; i < chromSize; i++) { struct slName *maskedReasons = maskSites[i]; if (maskedReasons && informativeBases[i]) { warn("protobuf tree contains masked mutation at %d (%s)", i+1, maskedReasons->name); informativeBases[i] = FALSE; } } } return informativeBases; } static boolean lfLooksLikeVcf(struct lineFile *lf) /* Peek at file to see if it looks like VCF, i.e. begins with a ##fileformat=VCF header. */ { boolean hasVcfHeader = FALSE; char *line; if (lineFileNext(lf, &line, NULL)) { if (startsWith("##fileformat=VCF", line)) hasVcfHeader = TRUE; lineFileReuse(lf); } return hasVcfHeader; } static struct tempName *checkAndSaveVcf(struct lineFile *lf, struct dnaSeq *refGenome, struct slName **maskSites, struct seqInfo **retSeqInfoList, struct slName **retSampleIds) /* Save the contents of lf to a trash file. If it has a reasonable number of genotype columns * with recognizable genotypes, and the coordinates seem to be in range, then return the path * to the trash file. Otherwise complain and return NULL. */ { struct tempName *tn; AllocVar(tn); trashDirFile(tn, "ct", "ct_pp", ".vcf"); FILE *f = mustOpen(tn->forCgi, "w"); struct seqInfo *seqInfoList = NULL; struct slName *sampleIds = NULL; struct errCatch *errCatch = errCatchNew(); if (errCatchStart(errCatch)) { char *line; int lineSize; int sampleCount = 0; while (lineFileNext(lf, &line, &lineSize)) { if (startsWith("#CHROM\t", line)) { //#*** TODO: if the user uploads a sample with the same ID as one already in the //#*** saved assignment file, then usher will ignore it! //#*** Better check for that and warn the user. int colCount = chopTabs(line, NULL); if (colCount == 1) { lineFileAbort(lf, "VCF requires tab-separated columns, but no tabs found"); } sampleCount = colCount - VCF_NUM_COLS_BEFORE_GENOTYPES; if (sampleCount < 1 || sampleCount > maxGenotypes) { if (sampleCount < 1) lineFileAbort(lf, "VCF header #CHROM line has %d columns; expecting at least %d " "columns including sample IDs for genotype columns", colCount, 10); else lineFileAbort(lf, "VCF header #CHROM line defines %d samples but only up to %d " "are supported", sampleCount, maxGenotypes); } char lineCopy[lineSize+1]; safecpy(lineCopy, sizeof lineCopy, line); char *words[colCount]; chopTabs(lineCopy, words); struct hash *uniqNames = hashNew(0); int i; for (i = VCF_NUM_COLS_BEFORE_GENOTYPES; i < colCount; i++) { if (hashLookup(uniqNames, words[i])) lineFileAbort(lf, "VCF sample names in #CHROM line must be unique, but '%s' " "appears more than once", words[i]); hashAdd(uniqNames, words[i], NULL); slNameAddHead(&sampleIds, words[i]); struct seqInfo *si; AllocVar(si); si->seq = cloneDnaSeq(refGenome); si->seq->name = cloneString(words[i]); slAddHead(&seqInfoList, si); } slReverse(&seqInfoList); slReverse(&sampleIds); hashFree(&uniqNames); fputs(line, f); fputc('\n', f); } else if (line[0] == '#') { fputs(line, f); fputc('\n', f); } else { if (sampleCount < 1) { lineFileAbort(lf, "VCF header did not include #CHROM line defining sample IDs for " "genotype columns"); } int colCount = chopTabs(line, NULL); int genotypeCount = colCount - VCF_NUM_COLS_BEFORE_GENOTYPES; if (genotypeCount != sampleCount) { lineFileAbort(lf, "VCF header defines %d samples but there are %d genotype columns", sampleCount, genotypeCount); } char *words[colCount]; chopTabs(line, words); //#*** TODO: check that POS is sorted int pos = strtol(words[1], NULL, 10); if (pos > chromSize) { lineFileAbort(lf, "VCF POS value %d exceeds size of reference sequence (%d)", pos, chromSize); } // make sure REF value (if given) matches reference genome int chromStart = pos - 1; struct slName *maskedReasons = maskSites[chromStart]; char *ref = words[3]; if (strlen(ref) != 1) { // Not an SNV -- skip it. //#*** should probably report or at least count these... continue; } char refBase = toupper(refGenome->dna[chromStart]); if (ref[0] == '*' || ref[0] == '.') ref[0] = refBase; else if (ref[0] != refBase) lineFileAbort(lf, "VCF REF value at position %d is '%s', expecting '%c' " "(or '*' or '.')", pos, ref, refBase); char altStrCopy[strlen(words[4])+1]; safecpy(altStrCopy, sizeof altStrCopy, words[4]); char *alts[strlen(words[4])+1]; chopCommas(altStrCopy, alts); //#*** Would be nice to trim out indels from ALT column -- but that would require //#*** adjusting genotype codes below. struct seqInfo *si = seqInfoList; int i; for (i = VCF_NUM_COLS_BEFORE_GENOTYPES; i < colCount; i++, si = si->next) { if (words[i][0] != '.' && !isdigit(words[i][0])) { lineFileAbort(lf, "VCF genotype columns must contain numeric allele codes; " "can't parse '%s'", words[i]); } else { if (words[i][0] == '.') { si->seq->dna[chromStart] = 'n'; si->nCountMiddle++; } else { int alIx = atol(words[i]); if (alIx > 0) { char *alt = alts[alIx-1]; if (strlen(alt) == 1) { si->seq->dna[chromStart] = alt[0]; struct singleNucChange *snc = sncNew(chromStart, ref[0], '\0', alt[0]); if (maskedReasons) { slAddHead(&si->maskedSncList, snc); slAddHead(&si->maskedReasonsList, slRefNew(maskedReasons)); } else { if (isIupacAmbiguous(alt[0])) si->ambigCount++; slAddHead(&si->sncList, snc); } } } } } } if (!maskedReasons) { fputs(chrom, f); for (i = 1; i < colCount; i++) { fputc('\t', f); fputs(words[i], f); } fputc('\n', f); } } } } errCatchEnd(errCatch); carefulClose(&f); if (errCatch->gotError) { warn("%s", errCatch->message->string); unlink(tn->forCgi); freez(&tn); } errCatchFree(&errCatch); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) slReverse(&si->sncList); *retSeqInfoList = seqInfoList; *retSampleIds = sampleIds; return tn; } static void displaySampleMuts(struct placementInfo *info) { printf("

Differences from the reference genome " "(" "NC_045512.2): "); if (info->sampleMuts == NULL) printf("(None; identical to reference)"); else { struct slName *sln; for (sln = info->sampleMuts; sln != NULL; sln = sln->next) { if (sln != info->sampleMuts) printf(", "); printf("%s", sln->name); } } puts("

"); } static void variantPathPrint(struct variantPathNode *variantPath) /* Print out a variantPath; print nodeName only if non-numeric * (i.e. a sample ID not internal node) */ { struct variantPathNode *vpn; for (vpn = variantPath; vpn != NULL; vpn = vpn->next) { if (vpn != variantPath) printf(" > "); if (!isAllDigits(vpn->nodeName)) printf("%s: ", vpn->nodeName); struct singleNucChange *snc; for (snc = vpn->sncList; snc != NULL; snc = snc->next) { if (snc != vpn->sncList) printf(", "); printf("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); } } } static void displayVariantPath(struct variantPathNode *variantPath, char *sampleId) /* Display mutations on the path to this sample. */ { printf("

Mutations along the path from the root of the phylogenetic tree to %s:
", sampleId); if (variantPath) { variantPathPrint(variantPath); puts("
"); } else puts("(None; your sample was placed at the root of the phylogenetic tree)"); puts("

"); } static boolean isInternalNodeName(char *nodeName, int minNewNode) /* Return TRUE if nodeName looks like an internal node ID from the protobuf tree, i.e. is numeric * and less than minNewNode. */ { return isAllDigits(nodeName) && (atoi(nodeName) < minNewNode); } static struct variantPathNode *findLastInternalNode(struct variantPathNode *variantPath, int minNewNode) /* Return the last node in variantPath with a numeric name less than minNewNode, or NULL. */ { if (!variantPath || !isInternalNodeName(variantPath->nodeName, minNewNode)) return NULL; while (variantPath->next && isInternalNodeName(variantPath->next->nodeName, minNewNode)) variantPath = variantPath->next; if (variantPath && isInternalNodeName(variantPath->nodeName, minNewNode)) return variantPath; return NULL; } static int mutCountCmp(const void *a, const void *b) /* Compare number of mutations of phyloTree nodes a and b. */ { const struct phyloTree *nodeA = *(struct phyloTree * const *)a; const struct phyloTree *nodeB = *(struct phyloTree * const *)b; return slCount(nodeA->priv) - slCount(nodeB->priv); } static struct slName *findNearestNeighbor(struct mutationAnnotatedTree *bigTree, char *sampleId, struct variantPathNode *variantPath) /* Use the sequence of mutations in variantPath to find sampleId's parent node in bigTree, * then look for most similar leaf sibling(s). */ { struct slName *neighbors = NULL; int bigTreeINodeCount = phyloCountInternalNodes(bigTree->tree); int minNewNode = bigTreeINodeCount + 1; // 1-based struct variantPathNode *lastOldNode = findLastInternalNode(variantPath, minNewNode); struct phyloTree *node = lastOldNode ? hashFindVal(bigTree->nodeHash, lastOldNode->nodeName) : bigTree->tree; if (lastOldNode && !node) errAbort("Can't find last internal node for sample %s", sampleId); // Look for a leaf kid with no mutations relative to the parent, should be closest. if (node->numEdges == 0) { struct slName *nodeList = hashFindVal(bigTree->condensedNodes, node->ident->name); if (nodeList) slNameAddHead(&neighbors, nodeList->name); else slNameAddHead(&neighbors, node->ident->name); } else { int leafCount = 0; int i; for (i = 0; i < node->numEdges; i++) { struct phyloTree *kid = node->edges[i]; if (kid->numEdges == 0) { leafCount++; struct singleNucChange *kidMuts = kid->priv; if (!kidMuts) { struct slName *nodeList = hashFindVal(bigTree->condensedNodes, kid->ident->name); if (nodeList) slNameAddHead(&neighbors, nodeList->name); else slNameAddHead(&neighbors, kid->ident->name); break; } } } if (neighbors == NULL && leafCount) { // Pick the leaf with the fewest mutations. struct phyloTree *leafKids[leafCount]; int leafIx = 0; for (i = 0; i < node->numEdges; i++) { struct phyloTree *kid = node->edges[i]; if (kid->numEdges == 0) leafKids[leafIx++] = kid; } qsort(leafKids, leafCount, sizeof(leafKids[0]), mutCountCmp); neighbors = slNameNew(leafKids[0]->ident->name); } } return neighbors; } static void printVariantPathNoNodeNames(struct variantPathNode *variantPath) /* Print out variant path with no node names (even if non-numeric) */ { struct variantPathNode *vpn; for (vpn = variantPath; vpn != NULL; vpn = vpn->next) { if (vpn != variantPath) printf(" > "); struct singleNucChange *snc; for (snc = vpn->sncList; snc != NULL; snc = snc->next) { if (snc != vpn->sncList) printf(", "); printf("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); } } } static struct hash *getSampleMetadata(char *metadataFile) /* If config.ra defines a metadataFile, load its contents into a hash indexed by EPI ID and return; * otherwise return NULL. */ { struct hash *sampleMetadata = NULL; if (isNotEmpty(metadataFile) && fileExists(metadataFile)) { sampleMetadata = hashNew(0); struct lineFile *lf = lineFileOpen(metadataFile, TRUE); int headerWordCount = 0; char **headerWords = NULL; char *line; // Check for header line if (lineFileNext(lf, &line, NULL)) { if (startsWithWord("strain", line)) { char *headerLine = cloneString(line); headerWordCount = chopString(headerLine, "\t", NULL, 0); AllocArray(headerWords, headerWordCount); chopString(headerLine, "\t", headerWords, headerWordCount); } else errAbort("Missing header line from metadataFile %s", metadataFile); } int strainIx = stringArrayIx("strain", headerWords, headerWordCount); int epiIdIx = stringArrayIx("gisaid_epi_isl", headerWords, headerWordCount); int genbankIx = stringArrayIx("genbank_accession", headerWords, headerWordCount); int dateIx = stringArrayIx("date", headerWords, headerWordCount); int authorIx = stringArrayIx("authors", headerWords, headerWordCount); int nCladeIx = stringArrayIx("Nextstrain_clade", headerWords, headerWordCount); int gCladeIx = stringArrayIx("GISAID_clade", headerWords, headerWordCount); int lineageIx = stringArrayIx("pangolin_lineage", headerWords, headerWordCount); int countryIx = stringArrayIx("country", headerWords, headerWordCount); int divisionIx = stringArrayIx("division", headerWords, headerWordCount); int locationIx = stringArrayIx("location", headerWords, headerWordCount); int countryExpIx = stringArrayIx("country_exposure", headerWords, headerWordCount); int divExpIx = stringArrayIx("division_exposure", headerWords, headerWordCount); int origLabIx = stringArrayIx("originating_lab", headerWords, headerWordCount); int subLabIx = stringArrayIx("submitting_lab", headerWords, headerWordCount); int regionIx = stringArrayIx("region", headerWords, headerWordCount); while (lineFileNext(lf, &line, NULL)) { char *words[headerWordCount]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, headerWordCount, wordCount); struct sampleMetadata *met; AllocVar(met); if (strainIx >= 0) met->strain = cloneString(words[strainIx]); if (epiIdIx >= 0) met->epiId = cloneString(words[epiIdIx]); if (genbankIx >= 0 && !sameString("?", words[genbankIx])) met->gbAcc = cloneString(words[genbankIx]); if (dateIx >= 0) met->date = cloneString(words[dateIx]); if (authorIx >= 0) met->author = cloneString(words[authorIx]); if (nCladeIx >= 0) met->nClade = cloneString(words[nCladeIx]); if (gCladeIx >= 0) met->gClade = cloneString(words[gCladeIx]); if (lineageIx >= 0) met->lineage = cloneString(words[lineageIx]); if (countryIx >= 0) met->country = cloneString(words[countryIx]); if (divisionIx >= 0) met->division = cloneString(words[divisionIx]); if (locationIx >= 0) met->location = cloneString(words[locationIx]); if (countryExpIx >= 0) met->countryExp = cloneString(words[countryExpIx]); if (divExpIx >= 0) met->divExp = cloneString(words[divExpIx]); if (origLabIx >= 0) met->origLab = cloneString(words[origLabIx]); if (subLabIx >= 0) met->subLab = cloneString(words[subLabIx]); if (regionIx >= 0) met->region = cloneString(words[regionIx]); // If epiId and/or genbank ID is included, we'll probably be using that to look up items. if (epiIdIx >= 0 && !isEmpty(words[epiIdIx])) hashAdd(sampleMetadata, words[epiIdIx], met); if (genbankIx >= 0 && !isEmpty(words[genbankIx]) && !sameString("?", words[genbankIx])) { if (strchr(words[genbankIx], '.')) { // Index by versionless accession char copy[strlen(words[genbankIx])+1]; safecpy(copy, sizeof copy, words[genbankIx]); char *dot = strchr(copy, '.'); *dot = '\0'; hashAdd(sampleMetadata, copy, met); } else hashAdd(sampleMetadata, words[genbankIx], met); } if (strainIx >= 0 && !isEmpty(words[strainIx])) hashAdd(sampleMetadata, words[strainIx], met); } lineFileClose(&lf); } return sampleMetadata; } char *epiIdFromSampleName(char *sampleId) /* If an EPI_ISL_# ID is present somewhere in sampleId, extract and return it, otherwise NULL. */ { char *epiId = cloneString(strstr(sampleId, "EPI_ISL_")); if (epiId) { char *p = epiId + strlen("EPI_ISL_"); while (isdigit(*p)) p++; *p = '\0'; } return epiId; } char *gbIdFromSampleName(char *sampleId) /* If a GenBank accession is present somewhere in sampleId, extract and return it, otherwise NULL. */ { char *gbId = NULL; regmatch_t substrs[2]; if (regexMatchSubstr(sampleId, "([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs))) { // Make sure there are word boundaries around the match if ((substrs[1].rm_so == 0 || !isalnum(sampleId[substrs[1].rm_so-1])) && !isalnum(sampleId[substrs[1].rm_eo])) gbId = cloneStringZ(sampleId+substrs[1].rm_so, substrs[1].rm_eo - substrs[1].rm_so); } return gbId; } struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ { struct sampleMetadata *met = NULL; char *epiId = epiIdFromSampleName(sampleId); if (epiId) met = hashFindVal(sampleMetadata, epiId); if (!met) { char *gbId = gbIdFromSampleName(sampleId); if (gbId) met = hashFindVal(sampleMetadata, gbId); } if (!met) met = hashFindVal(sampleMetadata, sampleId); if (!met && strchr(sampleId, '|')) { char copy[strlen(sampleId)+1]; safecpy(copy, sizeof copy, sampleId); char *words[4]; int wordCount = chopString(copy, "|", words, ArraySize(words)); if (isNotEmpty(words[0])) met = hashFindVal(sampleMetadata, words[0]); if (met == NULL && wordCount > 1 && isNotEmpty(words[1])) met = hashFindVal(sampleMetadata, words[1]); } return met; } static char *lineageForSample(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ { char *lineage = NULL; struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId); if (met) lineage = met->lineage; return lineage; } static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree, char *source, struct placementInfo *info, struct hash *sampleMetadata) /* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */ { if (bigTree) { printf("

Nearest neighboring %s sequence already in phylogenetic tree: ", source); struct slName *neighbors = findNearestNeighbor(bigTree, info->sampleId, info->variantPath); struct slName *neighbor; for (neighbor = neighbors; neighbor != NULL; neighbor = neighbor->next) { if (neighbor != neighbors) printf(", "); printf("%s", neighbor->name); char *lineage = lineageForSample(sampleMetadata, neighbor->name); if (isNotEmpty(lineage)) printf(": lineage %s", lineage); } puts("

"); } } static void displayBestNodes(struct placementInfo *info, struct hash *sampleMetadata) /* Show the node(s) most closely related to sample. */ { if (info->bestNodeCount == 1) printf("

The placement in the tree is unambiguous; " "there are no other parsimony-optimal placements in the phylogenetic tree.

\n"); else if (info->bestNodeCount > 1) printf("

This placement is not the only parsimony-optimal placement in the tree; " "%d other placements exist.

\n", info->bestNodeCount - 1); if (showBestNodePaths && info->bestNodes) { if (info->bestNodeCount != slCount(info->bestNodes)) errAbort("Inconsistent bestNodeCount (%d) and number of bestNodes (%d)", info->bestNodeCount, slCount(info->bestNodes)); if (info->bestNodeCount > 1) printf("

used for placement: "); if (differentString(info->bestNodes->name, "?") && !isAllDigits(info->bestNodes->name)) printf("%s ", info->bestNodes->name); printVariantPathNoNodeNames(info->bestNodes->variantPath); struct bestNodeInfo *bn; for (bn = info->bestNodes->next; bn != NULL; bn = bn->next) { printf("\n
"); if (differentString(bn->name, "?") && !isAllDigits(bn->name)) printf("%s ", bn->name); printVariantPathNoNodeNames(bn->variantPath); char *lineage = lineageForSample(sampleMetadata, bn->name); if (isNotEmpty(lineage)) printf(": lineage %s", lineage); } if (info->bestNodeCount > 1) puts("

"); puts("

"); } } static int placementInfoRefCmpSampleMuts(const void *va, const void *vb) /* Compare slRef->placementInfo->sampleMuts lists. Shorter lists first. Using alpha sort * to distinguish between different sampleMuts contents arbitrarily; the purpose is to * clump samples with identical lists. */ { struct slRef * const *rra = va; struct slRef * const *rrb = vb; struct placementInfo *pa = (*rra)->val; struct placementInfo *pb = (*rrb)->val; int diff = slCount(pa->sampleMuts) - slCount(pb->sampleMuts); if (diff == 0) { struct slName *slnA, *slnB; for (slnA = pa->sampleMuts, slnB = pb->sampleMuts; slnA != NULL; slnA = slnA->next, slnB = slnB->next) { diff = strcmp(slnA->name, slnB->name); if (diff != 0) break; } } return diff; } static struct slRef *getPlacementRefList(struct slName *sampleIds, struct hash *samplePlacements) /* Look up sampleIds in samplePlacements and return ref list of placements. */ { struct slRef *placementRefs = NULL; struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) { struct placementInfo *info = hashFindVal(samplePlacements, sample->name); if (!info) errAbort("getPlacementRefList: can't find placement info for sample '%s'", sample->name); slAddHead(&placementRefs, slRefNew(info)); } slReverse(&placementRefs); return placementRefs; } static int countIdentical(struct slRef *placementRefs) /* Return the number of placements that have identical sampleMuts lists. */ { int clumpCount = 0; struct slRef *ref; for (ref = placementRefs; ref != NULL; ref = ref->next) { clumpCount++; if (ref->next == NULL || placementInfoRefCmpSampleMuts(&ref, &ref->next)) break; } return clumpCount; } static void asciiTree(struct phyloTree *node, char *indent, boolean isLast) /* Until we can make a real graphic, at least print an ascii tree. */ { if (isNotEmpty(indent) || isNotEmpty(node->ident->name)) { if (node->ident->name && !isAllDigits(node->ident->name)) printf("%s %s\n", indent, node->ident->name); } int indentLen = strlen(indent); char indentForKids[indentLen+1]; safecpy(indentForKids, sizeof indentForKids, indent); if (indentLen >= 4) { if (isLast) safecpy(indentForKids+indentLen - 4, 4 + 1, " "); else safecpy(indentForKids+indentLen - 4, 4 + 1, "| "); } if (node->numEdges > 0) { char kidIndent[strlen(indent)+5]; safef(kidIndent, sizeof kidIndent, "%s%s", indentForKids, "+---"); int i; for (i = 0; i < node->numEdges; i++) asciiTree(node->edges[i], kidIndent, (i == node->numEdges - 1)); } } static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements, struct phyloTree *subtree, struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree, char *source) /* Report how each sample fits into the big tree. */ { // Sort sample placements by sampleMuts so we can group identical samples. struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements); slSort(&placementRefs, placementInfoRefCmpSampleMuts); int relatedCount = slCount(placementRefs); int clumpSize = countIdentical(placementRefs); if (clumpSize < relatedCount && relatedCount > 2) { // Not all of the related sequences are identical, so they will be broken down into // separate "clumps". List all related samples first to avoid confusion. puts("

");
     asciiTree(subtree, "", TRUE);
     puts("

"); } struct slRef *refsToGo = placementRefs; while ((clumpSize = countIdentical(refsToGo)) > 0) { struct slRef *ref = refsToGo; struct placementInfo *info = ref->val; if (clumpSize > 1) { // Sort identical samples alphabetically: struct slName *sortedSamples = NULL; int i; for (i = 0, ref = refsToGo; ref != NULL && i < clumpSize; ref = ref->next, i++) { info = ref->val; slNameAddHead(&sortedSamples, info->sampleId); } slNameSort(&sortedSamples); printf("%d identical samples:\n

%s\n", sln->name); puts("

"); } else { printf("%s\n", info->sampleId); ref = ref->next; } refsToGo = ref; displaySampleMuts(info); if (info->imputedBases) { puts("

Base values imputed by parsimony:\n

%d: %s\n", bv->chromStart+1, bv->val); puts("

"); puts("

"); } displayVariantPath(info->variantPath, clumpSize == 1 ? info->sampleId : "samples"); displayBestNodes(info, sampleMetadata); if (!showBestNodePaths) displayNearestNeighbors(bigTree, source, info, sampleMetadata); if (showParsimonyScore && info->parsimonyScore > 0) printf("

Parsimony score added by your sample: %d

\n", info->parsimonyScore); //#*** TODO: explain parsimony score } } struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds) /* Prune all descendants of node that have no leaf descendants in sampleIds. */ { if (node->numEdges) { struct phyloTree *prunedKids = NULL; int i; for (i = 0; i < node->numEdges; i++) { struct phyloTree *kidIntersected = phyloPruneToIds(node->edges[i], sampleIds); if (kidIntersected) slAddHead(&prunedKids, kidIntersected); } int kidCount = slCount(prunedKids); assert(kidCount <= node->numEdges); if (kidCount > 1) { slReverse(&prunedKids); // There is no phyloTreeFree, but if we ever add one, should use it here. node->numEdges = kidCount; struct phyloTree *kid; for (i = 0, kid = prunedKids; i < kidCount; i++, kid = kid->next) { node->edges[i] = kid; kid->parent = node; } } else return prunedKids; } else if (! (node->ident->name && slNameInList(sampleIds, node->ident->name))) node = NULL; return node; } static struct subtreeInfo *subtreeInfoForSample(struct subtreeInfo *subtreeInfoList, char *name, int *retIx) /* Find the subtree that contains sample name and set *retIx to its index in the list. * If we can't find it, return NULL and set *retIx to -1. */ { struct subtreeInfo *ti; int ix; for (ti = subtreeInfoList, ix = 0; ti != NULL; ti = ti->next, ix++) if (slNameInList(ti->subtreeUserSampleIds, name)) break; if (ti == NULL) ix = -1; *retIx = ix; return ti; } //#*** TODO: Replace temporary host with nextstrain.org when feature request is released //#*** https://github.com/nextstrain/nextstrain.org/pull/216 static char *nextstrainHost() /* Until the new /fetch/ function is live on nextstrain.org, get their temporary staging host * from an hg.conf param, or NULL if missing. */ { return cfgOption("nextstrainHost"); } static char *nextstrainUrlFromTn(struct tempName *jsonTn) /* Return a link to Nextstrain to view an annotated subtree. */ { char *jsonUrlForNextstrain = urlFromTn(jsonTn); char *protocol = strstr(jsonUrlForNextstrain, "://"); if (protocol) jsonUrlForNextstrain = protocol + strlen("://"); struct dyString *dy = dyStringCreate("%s/fetch/%s", nextstrainHost(), jsonUrlForNextstrain); return dyStringCannibalize(&dy); } static void makeNextstrainButton(char *idBase, int ix, struct tempName *jsonTns[]) /* Make a button to view results in Nextstrain. idBase is a short string and * ix is 0-based subtree number. */ { char buttonId[256]; safef(buttonId, sizeof buttonId, "%s%d", idBase, ix+1); char buttonLabel[256]; safef(buttonLabel, sizeof buttonLabel, "view subtree %d in Nextstrain", ix+1); char *nextstrainUrl = nextstrainUrlFromTn(jsonTns[ix]); struct dyString *js = dyStringCreate("window.open('%s');", nextstrainUrl); cgiMakeOnClickButton(buttonId, js->string, buttonLabel); dyStringFree(&js); freeMem(nextstrainUrl); } static void makeButtonRow(struct tempName *jsonTns[], int subtreeCount, boolean isFasta) /* Russ's suggestion: row of buttons at the top to view results in GB, Nextstrain, Nextclade. */ { puts("

"); cgiMakeButton("submit", "view in Genome Browser"); if (nextstrainHost()) { int ix; for (ix = 0; ix < subtreeCount; ix++) { printf(" "); makeNextstrainButton("viewNextstrainTopRow", ix, jsonTns); } } if (0 && isFasta) { printf(" "); struct dyString *js = dyStringCreate("window.open('https://master.clades.nextstrain.org/" "?input-fasta=%s');", "needATn"); //#*** TODO: save FASTA to file cgiMakeOnClickButton("viewNextclade", js->string, "view sequences in Nextclade"); } puts("

"); } #define TOOLTIP(text) " " static void printSummaryHeader(boolean isFasta) /* Print the summary table header row with tooltips explaining columns. */ { puts(""); if (isFasta) puts("Fasta Sequence\n" "Size" TOOLTIP("Length of uploaded sequence in bases, excluding runs of N bases at " "beginning and/or end") "\n#Ns" TOOLTIP("Number of 'N' bases in uploaded sequence, excluding runs of N bases at " "beginning and/or end") ""); else puts("VCF Sample\n" "#Ns" TOOLTIP("Number of no-call variants for this sample in uploaded VCF, " "i.e. '.' used in genotype column") ""); puts("#Mixed" TOOLTIP("Number of IUPAC ambiguous bases, e.g. 'R' for 'A or G'") ""); if (isFasta) puts("Bases aligned" TOOLTIP("Number of bases aligned to reference NC_045512.2 Wuhan/Hu-1, including " "matches and mismatches") - "\nInsertions" + "\nInserted bases" TOOLTIP("Number of bases in aligned portion of uploaded sequence that are not present in " "reference NC_045512.2 Wuhan/Hu-1") - "\nDeletions" + "\nDeleted bases" TOOLTIP("Number of bases in reference NC_045512.2 Wuhan/Hu-1 that are not " "present in aligned portion of uploaded sequence") ""); puts("#SNVs used for placement" TOOLTIP("Number of single-nucleotide variants in uploaded sample " "(does not include N's or mixed bases) used by UShER to place sample " "in phylogenetic tree") "\n#Masked SNVs" TOOLTIP("Number of single-nucleotide variants in uploaded sample that are masked " "(not used for placement) because they occur at known " "Problematic Sites") "\nNeighboring sample in tree" TOOLTIP("A sample already in the tree that is a child of the node at which the uploaded " "sample was placed, to give an example of a closely related sample") "\nLineage of neighbor" TOOLTIP("The " "Pangolin lineage assigned to the nearest neighboring sample already in the tree") "\n#Imputed values for mixed bases" TOOLTIP("If the uploaded sequence contains mixed/ambiguous bases, then UShER may assign " "values based on maximum parsimony") "\n#Maximally parsimonious placements" TOOLTIP("Number of potential placements in the tree with minimal parsimony score; " "the higher the number, the less confident the placement") "\nParsimony score" TOOLTIP("Number of mutations/changes that must be added to the tree when placing the " "uploaded sample; the higher the number, the more diverged the sample") "\nSubtree number" TOOLTIP("Sequence number of subtree that contains this sample") ""); } static char *qcClassForIntMin(int n, int minExc, int minGood, int minMeh, int minBad) /* Return {qcExcellent, qcGood, qcMeh, qcBad or qcFail} depending on how n compares to the * thresholds. Don't free result. */ { if (n >= minExc) return "qcExcellent"; else if (n >= minGood) return "qcGood"; else if (n >= minMeh) return "qcMeh"; else if (n >= minBad) return "qcBad"; else return "qcFail"; } static char *qcClassForLength(int length) /* Return qc class for length of sequence. */ { return qcClassForIntMin(length, 29750, 29500, 29000, 28000); } static char *qcClassForIntMax(int n, int maxExc, int maxGood, int maxMeh, int maxBad) /* Return {qcExcellent, qcGood, qcMeh, qcBad or qcFail} depending on how n compares to the * thresholds. Don't free result. */ { if (n <= maxExc) return "qcExcellent"; else if (n <= maxGood) return "qcGood"; else if (n <= maxMeh) return "qcMeh"; else if (n <= maxBad) return "qcBad"; else return "qcFail"; } static char *qcClassForNs(int nCount) /* Return qc class for #Ns in sample. */ { return qcClassForIntMax(nCount, 0, 5, 20, 100); } static char *qcClassForMixed(int mixedCount) /* Return qc class for #ambiguous bases in sample. */ { return qcClassForIntMax(mixedCount, 0, 5, 20, 100); } static char *qcClassForIndel(int indelCount) /* Return qc class for #inserted or deleted bases. */ { return qcClassForIntMax(indelCount, 0, 2, 5, 10); } static char *qcClassForSNVs(int snvCount) /* Return qc class for #SNVs in sample. */ { return qcClassForIntMax(snvCount, 10, 20, 30, 40); } static char *qcClassForMaskedSNVs(int maskedCount) /* Return qc class for #SNVs at problematic sites. */ { return qcClassForIntMax(maskedCount, 0, 1, 2, 3); } static char *qcClassForImputedBases(int imputedCount) /* Return qc class for #ambiguous bases for which UShER imputed values based on placement. */ { return qcClassForMixed(imputedCount); } static char *qcClassForPlacements(int placementCount) /* Return qc class for number of equally parsimonious placements. */ { return qcClassForIntMax(placementCount, 1, 2, 3, 4); } static char *qcClassForPScore(int parsimonyScore) /* Return qc class for parsimonyScore. */ { return qcClassForIntMax(parsimonyScore, 0, 2, 5, 10); } static void printTooltip(char *text) /* Print a tooltip with explanatory text. */ { printf(TOOLTIP("%s"), text); } static void appendExcludingNs(struct dyString *dy, struct seqInfo *si) /* Append a note to dy about how many N bases and start and/or end are excluded from statistic. */ { dyStringAppend(dy, "excluding "); if (si->nCountStart) dyStringPrintf(dy, "%d N bases at start", si->nCountStart); if (si->nCountStart && si->nCountEnd) dyStringAppend(dy, " and "); if (si->nCountEnd) dyStringPrintf(dy, "%d N bases at end", si->nCountEnd); } static void summarizeSequences(struct seqInfo *seqInfoList, boolean isFasta, struct usherResults *ur, struct tempName *jsonTns[], struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree) /* Show a table with composition & alignment stats for each sequence that passed basic QC. */ { if (seqInfoList) { puts(""); printSummaryHeader(isFasta); puts(""); struct dyString *dy = dyStringNew(0); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) { puts(""); printf(""); } else printf("", qcClassForLength(si->seq->size), si->seq->size); } printf(""); if (isFasta) { struct psl *psl = si->psl; if (psl) { int aliCount = psl->match + psl->misMatch + psl->repMatch; printf(""); } else printf("", qcClassForLength(0)); } int snvCount = slCount(si->sncList) - alignedAmbigCount; printf(""); struct placementInfo *pi = hashFindVal(ur->samplePlacements, si->seq->name); if (pi) { struct slName *neighbor = findNearestNeighbor(bigTree, pi->sampleId, pi->variantPath); char *lineage = neighbor ? lineageForSample(sampleMetadata, neighbor->name) : "?"; printf("", neighbor ? replaceChars(neighbor->name, "|", " | ") : "?", lineage ? lineage : "?"); int imputedCount = slCount(pi->imputedBases); printf(""); } else printf(""); int ix; struct subtreeInfo *ti = subtreeInfoForSample(ur->subtreeInfoList, si->seq->name, &ix); if (ix < 0) //#*** Probably an error. printf(""); else { printf(""); } puts(""); } puts("

%s", replaceChars(si->seq->name, "\|", " \| ")); if (isFasta) { if (si->nCountStart \|\| si->nCountEnd) { int effectiveLength = si->seq->size - (si->nCountStart + si->nCountEnd); dyStringClear(dy); dyStringPrintf(dy, "%d ", effectiveLength); appendExcludingNs(dy, si); dyStringPrintf(dy, " (original size %d)", si->seq->size); printf("	%d", qcClassForLength(effectiveLength), effectiveLength); printTooltip(dy->string); printf("	%d	%d", qcClassForNs(si->nCountMiddle), si->nCountMiddle); if (si->nCountStart \|\| si->nCountEnd) { dyStringClear(dy); dyStringPrintf(dy, "%d Ns ", si->nCountMiddle); appendExcludingNs(dy, si); printTooltip(dy->string); } printf("	%d ", qcClassForMixed(si->ambigCount), si->ambigCount); int alignedAmbigCount = 0; if (si->ambigCount > 0) { dyStringClear(dy); struct singleNucChange *snc; for (snc = si->sncList; snc != NULL; snc = snc->next) { if (isIupacAmbiguous(snc->newBase)) { dyStringAppendSep(dy, ", "); dyStringPrintf(dy, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase); alignedAmbigCount++; } } if (isEmpty(dy->string)) dyStringAppend(dy, "(Masked or not aligned to reference)"); else if (alignedAmbigCount != si->ambigCount) dyStringPrintf(dy, " (%d masked or not aligned to reference)", si->ambigCount - alignedAmbigCount); printTooltip(dy->string); } printf("	%d ", qcClassForLength(aliCount), aliCount); dyStringClear(dy); dyStringPrintf(dy, "bases %d - %d align to reference bases %d - %d", psl->qStart+1, psl->qEnd, psl->tStart+1, psl->tEnd); printTooltip(dy->string); + int insBases = 0, insCount = 0, delBases = 0, delCount = 0; + if (psl->qBaseInsert \|\| psl->tBaseInsert) + { + // Tally up actual insertions and deletions; ignore skipped N bases. + int ix; + for (ix = 0; ix < psl->blockCount - 1; ix++) + { + int qGapStart = psl->qStarts[ix] + psl->blockSizes[ix]; + int qGapEnd = psl->qStarts[ix+1]; + int qGapLen = qGapEnd - qGapStart; + int tGapStart = psl->tStarts[ix] + psl->blockSizes[ix]; + int tGapEnd = psl->tStarts[ix+1]; + int tGapLen = tGapEnd - tGapStart; + if (qGapLen > tGapLen) + { + insCount++; + insBases += qGapLen - tGapLen; + } + else if (tGapLen > qGapLen) + { + delCount++; + delBases += tGapLen - qGapLen; + } + } + } printf("	%d ", - qcClassForIndel(psl->qBaseInsert), psl->qBaseInsert); - if (psl->qBaseInsert) + qcClassForIndel(insBases), insBases); + if (insBases) { dyStringClear(dy); - dyStringPrintf(dy, "%d bases in %d locations", - psl->qBaseInsert, psl->qNumInsert); + dyStringPrintf(dy, "%d bases in %d locations", insBases, insCount); printTooltip(dy->string); } printf("	%d ", - qcClassForIndel(psl->tBaseInsert), psl->tBaseInsert); - if (psl->tBaseInsert) + qcClassForIndel(delBases), delBases); + if (delBases) { dyStringClear(dy); - dyStringPrintf(dy, "%d bases in %d locations", - psl->tBaseInsert, psl->tNumInsert); + dyStringPrintf(dy, "%d bases in %d locations", delBases, delCount); printTooltip(dy->string); } printf("	not alignable			%d", qcClassForSNVs(snvCount), snvCount); if (snvCount > 0) { dyStringClear(dy); struct singleNucChange *snc; for (snc = si->sncList; snc != NULL; snc = snc->next) { if (!isIupacAmbiguous(snc->newBase)) { dyStringAppendSep(dy, ", "); dyStringPrintf(dy, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase); } } printTooltip(dy->string); } int maskedCount = slCount(si->maskedSncList); printf("	%d", qcClassForMaskedSNVs(maskedCount), maskedCount); if (maskedCount > 0) { dyStringClear(dy); struct singleNucChange snc; struct slRef reasonsRef; for (snc = si->maskedSncList, reasonsRef = si->maskedReasonsList; snc != NULL; snc = snc->next, reasonsRef = reasonsRef->next) { dyStringAppendSep(dy, ", "); struct slName reasonList = reasonsRef->val, reason; replaceChar(reasonList->name, '_', ' '); dyStringPrintf(dy, "%c%d%c (%s", snc->refBase, snc->chromStart+1, snc->newBase, reasonList->name); for (reason = reasonList->next; reason != NULL; reason = reason->next) { replaceChar(reason->name, '_', ' '); dyStringPrintf(dy, ", %s", reason->name); } dyStringAppendC(dy, ')'); } printTooltip(dy->string); } printf("	%s	%s	%d", qcClassForImputedBases(imputedCount), imputedCount); if (imputedCount > 0) { dyStringClear(dy); struct baseVal *bv; for (bv = pi->imputedBases; bv != NULL; bv = bv->next) { dyStringAppendSep(dy, ", "); dyStringPrintf(dy, "%d: %s", bv->chromStart+1, bv->val); } printTooltip(dy->string); } printf("	%d", qcClassForPlacements(pi->bestNodeCount), pi->bestNodeCount); printf("	%d", qcClassForPScore(pi->parsimonyScore), pi->parsimonyScore); printf("	n/a	n/a	n/a	n/a	n/a	n/a	%d", ix+1); if (ti && nextstrainHost()) { char *nextstrainUrl = nextstrainUrlFromTn(jsonTns[ix]); printf(" (view in Nextstrain )", nextstrainUrl); } printf("

"); } } static struct slName **getProblematicSites(char *db) /* If config.ra specfies maskFile them return array of lists (usually NULL) of reasons that * masking is recommended, one per position in genome; otherwise return NULL. */ { struct slName **pSites = NULL; char *pSitesFile = phyloPlaceDbSettingPath(db, "maskFile"); if (isNotEmpty(pSitesFile) && fileExists(pSitesFile)) { AllocArray(pSites, chromSize); struct bbiFile *bbi = bigBedFileOpen(pSitesFile); struct lm *lm = lmInit(0); struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm); for (bb = bbList; bb != NULL; bb = bb->next) { char *extra = bb->rest; char *reason = nextWord(&extra); int i; for (i = bb->start; i < bb->end; i++) slNameAddHead(&pSites[i], reason); } bigBedFileClose(&bbi); } return pSites; } static int subTreeInfoUserSampleCmp(const void *pa, const void *pb) /* Compare subtreeInfo by number of user sample IDs (highest number first). */ { struct subtreeInfo *tiA = *(struct subtreeInfo **)pa; struct subtreeInfo *tiB = *(struct subtreeInfo **)pb; return slCount(tiB->subtreeUserSampleIds) - slCount(tiA->subtreeUserSampleIds); } char *phyloPlaceSamples(struct lineFile *lf, char *db, char *defaultProtobuf, boolean doMeasureTiming, int subtreeSize, int fontHeight) /* Given a lineFile that contains either FASTA or VCF, prepare VCF for usher; * if that goes well then run usher, report results, make custom track files * and return the top-level custom track file; otherwise return NULL. */ { char *ctFile = NULL; measureTiming = doMeasureTiming; int startTime = clock1000(); struct tempName *vcfTn = NULL; struct slName *sampleIds = NULL; char *usherPath = getUsherPath(TRUE); char *usherAssignmentsPath = NULL; char *source = NULL; char *metadataFile = NULL; struct treeChoices *treeChoices = loadTreeChoices(db); if (treeChoices) { usherAssignmentsPath = defaultProtobuf; if (isEmpty(usherAssignmentsPath)) usherAssignmentsPath = treeChoices->protobufFiles[0]; int i; for (i = 0; i < treeChoices->count; i++) if (sameString(treeChoices->protobufFiles[i], usherAssignmentsPath)) { metadataFile = treeChoices->metadataFiles[i]; source = treeChoices->sources[i]; break; } if (i == treeChoices->count) { usherAssignmentsPath = treeChoices->protobufFiles[0]; metadataFile = treeChoices->metadataFiles[0]; source = treeChoices->sources[0]; } } else { // Fall back on old settings usherAssignmentsPath = getUsherAssignmentsPath(db, TRUE); metadataFile = phyloPlaceDbSettingPath(db, "metadataFile"); source = "GISAID"; } struct mutationAnnotatedTree *bigTree = parseParsimonyProtobuf(usherAssignmentsPath); reportTiming(&startTime, "parse protobuf file"); if (! bigTree) { warn("Problem parsing %s; can't make subtree subtracks.", usherAssignmentsPath); } lineFileCarefulNewlines(lf); struct slName **maskSites = getProblematicSites(db); struct dnaSeq *refGenome = hChromSeq(db, chrom, 0, chromSize); boolean isFasta = FALSE; struct seqInfo *seqInfoList = NULL; if (lfLooksLikeFasta(lf)) { boolean *informativeBases = informativeBasesFromTree(bigTree->tree, maskSites); struct slPair *failedSeqs; struct slPair *failedPsls; vcfTn = vcfFromFasta(lf, db, refGenome, informativeBases, maskSites, &sampleIds, &seqInfoList, &failedSeqs, &failedPsls, &startTime); if (failedSeqs) { puts("

"); struct slPair *fail; for (fail = failedSeqs; fail != NULL; fail = fail->next) printf("%s
\n", fail->name); puts("

"); } if (failedPsls) { puts("

"); struct slPair *fail; for (fail = failedPsls; fail != NULL; fail = fail->next) printf("%s
\n", fail->name); puts("

"); } if (seqInfoList == NULL) printf("

Sorry, could not align any sequences to reference well enough to place in " "the phylogenetic tree.

\n"); isFasta = TRUE; } else if (lfLooksLikeVcf(lf)) { vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds); reportTiming(&startTime, "check uploaded VCF"); } else { if (isNotEmpty(lf->fileName)) warn("Sorry, can't recognize your file %s as FASTA or VCF.\n", lf->fileName); else warn("Sorry, can't recognize your uploaded data as FASTA or VCF.\n"); } lineFileClose(&lf); if (vcfTn) { struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi, subtreeSize, sampleIds, bigTree->condensedNodes, &startTime); if (results->subtreeInfoList) { int subtreeCount = slCount(results->subtreeInfoList); // Sort subtrees by number of user samples (largest first). slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp); // Make Nextstrain/auspice JSON file for each subtree. char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile"); struct hash *sampleMetadata = getSampleMetadata(metadataFile); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json"); treeToAuspiceJson(ti, db, refGenome, bigGenePredFile, sampleMetadata, jsonTns[ix]->forCgi, source); } puts("

"); makeButtonRow(jsonTns, subtreeCount, isFasta); printf("

If you have metadata you wish to display, click a 'view subtree in Nextstrain' " "button, and then you can drag on a CSV file to " "add it to the tree view." "

\n"); summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, bigTree); reportTiming(&startTime, "write summary table (including reading in lineages)"); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("

Subtree %d: ", ix+1); if (subtreeUserSampleCount > 1) printf("%d related samples", subtreeUserSampleCount); else if (subtreeCount > 1) printf("Unrelated sample"); printf("

\n"); makeNextstrainButton("viewNextstrainSub", ix, jsonTns); puts("
"); // Make a sub-subtree with only user samples for display: struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi); subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds); describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, subtree, sampleMetadata, bigTree, source); } reportTiming(&startTime, "describe placements"); // Make custom tracks for uploaded samples and subtree(s). struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree, source, fontHeight, &startTime); // Offer big tree w/new samples for download puts("

Downloads

"); puts("

SARS-CoV-2 phylogenetic tree " "with your samples (Newick file)\n", results->bigTreePlusTn->forHtml); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { printf("
Subtree with %s", ti->subtreeTn->forHtml, ti->subtreeUserSampleIds->name); struct slName *sln; for (sln = ti->subtreeUserSampleIds->next; sln != NULL; sln = sln->next) printf(", %s", sln->name); puts(" (Newick file)"); printf("
Auspice JSON for subtree with %s", jsonTns[ix]->forHtml, ti->subtreeUserSampleIds->name); for (sln = ti->subtreeUserSampleIds->next; sln != NULL; sln = sln->next) printf(", %s", sln->name); puts(" (JSON file)"); } puts("

"); // Notify in opposite order of custom track creation. puts("

Custom tracks for viewing in the Genome Browser

"); printf("

Added custom track of uploaded samples.

\n"); printf("

Added %d subtree custom track%s.

\n", subtreeCount, (subtreeCount > 1 ? "s" : "")); ctFile = urlFromTn(ctTn); } else { warn("No subtree output from usher.\n"); } } return ctFile; }