afafa0301ea4b14fbe1fbd5aa379c5351ecd640d angie Tue Aug 2 19:41:44 2022 -0700 Add support for non-wuhCor1 genomes (e.g. monkeypox GenArk hub). * Search in hgPhyloPlaceData for config.ra files, taking assembly name (minus hub prefix) from directory name. * Add a menu input to the main page for switching between supported genomes if there are more than one. * Replace hardcoded values or global vars with dnaSeq attributes, assembly metadata queries or new config.ra settings. * Separate out SARS-CoV-2-specific help text like GISAID/CNCB descriptions. * Support metadata columns for GenBank-specific stuff & Nextstrain lineages (for MPXV). * also a little refactoring in runUsher in preparation for supporting usher server mode: parse new placement info file so we don't have to parse that data form usher stderr output. TODO: update Nextstrain/Auspice JSON output to use appropriate metadata columns and support monkeypox genes. diff --git src/hg/hgPhyloPlace/writeCustomTracks.c src/hg/hgPhyloPlace/writeCustomTracks.c index 08ca4be..1ee0bde 100644 --- src/hg/hgPhyloPlace/writeCustomTracks.c +++ src/hg/hgPhyloPlace/writeCustomTracks.c @@ -1,40 +1,34 @@ /* Write custom tracks: one per subtree and one with user's uploaded samples. */ /* Copyright (C) 2020 The Regents of the University of California */ #include "common.h" #include "hash.h" +#include "hdb.h" #include "iupac.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" #include "trashDir.h" #include "vcf.h" -// wuhCor1-specific: -char *geneTrack = "ncbiGeneBGP"; -int maxVcfRows = 30000; - -// Globals -extern char *chrom; -extern int chromSize; - // Parameter constants: int minSamplesForOwnTree = 3; // If user uploads at least this many samples, show tree for them. -static char ***imputedBasesByPosition(struct hash *samplePlacements, struct slName *sampleIds) +static char ***imputedBasesByPosition(struct hash *samplePlacements, struct slName *sampleIds, + int chromSize) /* Unpack imputedBases into an array of (a few) arrays indexed by 0-based position and sample index, * for speedy retrieval while rewriting user VCF. */ { char ***impsByPos = needMem(chromSize * sizeof(char **)); int sampleCount = slCount(sampleIds); struct slName *sample; int ix; for (ix = 0, sample = sampleIds; sample != NULL; ix++, sample = sample->next) { struct placementInfo *info = hashFindVal(samplePlacements, sample->name); if (!info) errAbort("imputedBasesByPosition: can't find placementInfo for sample '%s'", sample->name); struct baseVal *bv; for (bv = info->imputedBases; bv != NULL; bv = bv->next) { @@ -48,45 +42,45 @@ static boolean anyImputedBases(struct hash *samplePlacements, struct slName *sampleIds) /* Return TRUE if any sample has at least one imputed base. */ { struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) { struct placementInfo *info = hashFindVal(samplePlacements, sample->name); if (info && info->imputedBases) return TRUE; } return FALSE; } struct tempName *userVcfWithImputedBases(struct tempName *userVcfTn, struct hash *samplePlacements, - struct slName *sampleIds) + struct slName *sampleIds, int chromSize) /* If usher imputed the values of ambiguous bases in user's VCF, then swap in * the imputed values and write a new VCF to use as the custom track. Otherwise just use * the user's VCF for the custom track. */ { struct tempName *ctVcfTn = userVcfTn; if (anyImputedBases(samplePlacements, sampleIds)) { AllocVar(ctVcfTn); trashDirFile(ctVcfTn, "ct", "imputed", ".vcf"); FILE *f = mustOpen(ctVcfTn->forCgi, "w"); struct lineFile *lf = lineFileOpen(userVcfTn->forCgi, TRUE); int sampleCount = slCount(sampleIds); int colCount = VCF_NUM_COLS_BEFORE_GENOTYPES + sampleCount; - char ***impsByPos = imputedBasesByPosition(samplePlacements, sampleIds); + char ***impsByPos = imputedBasesByPosition(samplePlacements, sampleIds, chromSize); char *line; int size; while (lineFileNext(lf, &line, &size)) { if (line[0] == '#') fputs(line, f); else { char *words[colCount]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, colCount, wordCount); int chromStart = atoi(words[1]) - 1; if (impsByPos[chromStart]) { verbose(3, "%d ref=%s alt=%s\n", chromStart+1, words[3], words[4]); @@ -195,76 +189,76 @@ tree = phyloPruneToIds(tree, sampleIds); FILE *f = mustOpen(sampleTreeFile, "w"); phyloPrintTree(tree, f); carefulClose(&f); return tree; } static int heightForSampleCount(int fontHeight, int count) /* Return a height sufficient for this many labels stacked vertically. */ { return (fontHeight + 1) * count; } static struct phyloTree *addSampleOnlyCustomTrack(FILE *ctF, struct tempName *vcfTn, char *bigTreePlusFile, struct slName *sampleIds, - int fontHeight, int *pStartTime) + char *geneTrack, int fontHeight, int *pStartTime) /* Make custom track with uploaded VCF. If there are enough samples to make a non-trivial tree, * then make the tree by pruning down the big tree plus samples to only the uploaded samples. */ { struct phyloTree *sampleTree = NULL; fprintf(ctF, "track name=uploadedSamples description='Uploaded sample genotypes' " "type=vcf visibility=pack " "hapClusterEnabled=on hapClusterHeight=%d ", heightForSampleCount(fontHeight, slCount(sampleIds))); if (slCount(sampleIds) >= minSamplesForOwnTree) { struct tempName sampleTreeTn; trashDirFile(&sampleTreeTn, "ct", "uploadedSamples", ".nwk"); sampleTree = uploadedSamplesTree(sampleTreeTn.forCgi, bigTreePlusFile, sampleIds); fprintf(ctF, "hapClusterMethod='treeFile %s' ", sampleTreeTn.forCgi); } else fprintf(ctF, "hapClusterMethod=fileOrder "); if (isNotEmpty(geneTrack)) fprintf(ctF, "hapClusterColorBy=function geneTrack=%s", geneTrack); fputc('\n', ctF); struct lineFile *vcfLf = lineFileOpen(vcfTn->forCgi, TRUE); char *line; while (lineFileNext(vcfLf, &line, NULL)) fprintf(ctF, "%s\n",line); lineFileClose(&vcfLf); reportTiming(pStartTime, "add custom track for uploaded samples"); return sampleTree; } -static struct vcfFile *parseUserVcf(char *userVcfFile, int *pStartTime) +static struct vcfFile *parseUserVcf(char *userVcfFile, int chromSize, int *pStartTime) /* Read in user VCF file and parse genotypes. */ { -struct vcfFile *userVcf = vcfFileMayOpen(userVcfFile, chrom, 0, chromSize, 0, maxVcfRows, TRUE); +struct vcfFile *userVcf = vcfFileMayOpen(userVcfFile, NULL, 0, chromSize, 0, chromSize, TRUE); if (! userVcf) return NULL; reportTiming(pStartTime, "read userVcf"); struct vcfRecord *rec; for (rec = userVcf->records; rec != NULL; rec = rec->next) vcfParseGenotypesGtOnly(rec); reportTiming(pStartTime, "parse userVcf genotypes"); return userVcf; } static void writeSubtreeTrackLine(FILE *ctF, struct subtreeInfo *ti, int subtreeNum, char *source, - int fontHeight) + char *geneTrack, int fontHeight) /* Write a custom track line to ctF for one subtree. */ { fprintf(ctF, "track name=subtree%d", subtreeNum); // Keep the description from being too long in order to avoid buffer overflow in hgTrackUi // (and limited space for track title in hgTracks) int descLen = fprintf(ctF, " description='Subtree %d: uploaded sample%s %s", subtreeNum, (slCount(ti->subtreeUserSampleIds) > 1 ? "s" : ""), ti->subtreeUserSampleIds->name); struct slName *id; for (id = ti->subtreeUserSampleIds->next; id != NULL; id = id->next) { if (descLen > 100) { fprintf(ctF, " and %d other samples", slCount(id)); break; @@ -455,32 +449,32 @@ genotypes[ix] = '.'; else { int altIx; for (altIx = 0; altIx < altCount; altIx++) if (altAlleles[altIx] == sampleAl) break; if (altIx == altCount) errAbort("altsToGenotypes: sample %d allele '%c' not found in array of %d alts.", ix, sampleAl, altCount); genotypes[ix] = '1' + altIx; } } } -static void baseAllelesToVcf(char *refBases, char **sampleBases, int baseCount, int sampleCount, - FILE *f) +static void baseAllelesToVcf(char *refBases, char **sampleBases, char *chrom, int baseCount, + int sampleCount, FILE *f) /* Write VCF rows with per-sample genotypes from refBases and sampleBases. */ { int chromStart; for (chromStart = 0; chromStart < baseCount; chromStart++) { char ref = refBases[chromStart]; if (ref != '\0') { // This position has a variant. Tally up alternate alleles and counts and determine // allele indices to use when printing out genotypes. int maxAlts = 16; char altAlleles[maxAlts]; int altCounts[maxAlts]; int noCallCount; int altCount = tallyAlts(sampleBases[chromStart], ref, sampleCount, @@ -499,88 +493,96 @@ for (altIx = 1; altIx < altCount; altIx++) fprintf(f, ",%c", toupper(altAlleles[altIx])); fputs("\t.\t.\t", f); fprintf(f, "AC=%d", altCounts[0]); for (altIx = 1; altIx < altCount; altIx++) fprintf(f, ",%d", altCounts[altIx]); fprintf(f, ";AN=%d\tGT", sampleCount - noCallCount); int gtIx; for (gtIx = 0; gtIx < sampleCount; gtIx++) fprintf(f, "\t%c", genotypes[gtIx]); fputc('\n', f); } } } -static void writeSubtreeVcf(FILE *f, struct hash *sampleToIx, +static void writeSubtreeVcf(FILE *f, struct hash *sampleToIx, char *chrom, int chromSize, struct vcfFile *userVcf, struct mutationAnnotatedTree *bigTree) /* Write rows of VCF with genotypes for samples in sampleToIx (with order determined by sampleToIx) * with some samples found in userVcf and the rest found in bigTree which has been annotated * with single-nucleotide variants. */ { char refBases[chromSize]; memset(refBases, 0, sizeof refBases); char *sampleBases[chromSize]; memset(sampleBases, 0, sizeof sampleBases); treeToBaseAlleles(bigTree->tree, bigTree->condensedNodes, refBases, sampleBases, sampleToIx, NULL); vcfToBaseAlleles(userVcf, refBases, sampleBases, sampleToIx); int sampleCount = sampleToIx->elCount; -baseAllelesToVcf(refBases, sampleBases, chromSize, sampleCount, f); +baseAllelesToVcf(refBases, sampleBases, chrom, chromSize, sampleCount, f); int i; for (i = 0; i < chromSize; i++) freeMem(sampleBases[i]); } static void addSubtreeCustomTracks(FILE *ctF, char *userVcfFile, struct subtreeInfo *subtreeInfoList, - struct hash *samplePlacements, + struct hash *samplePlacements, char *chrom, int chromSize, struct mutationAnnotatedTree *bigTree, - char *source, int fontHeight, int *pStartTime) + char *source, char *geneTrack, int fontHeight, int *pStartTime) /* For each subtree trashfile, write VCF+treeFile custom track text to ctF. */ { -struct vcfFile *userVcf = parseUserVcf(userVcfFile, pStartTime); +struct vcfFile *userVcf = parseUserVcf(userVcfFile, chromSize, pStartTime); if (! userVcf) { warn("Problem parsing VCF file with user variants '%s'; can't make subtree subtracks.", userVcfFile); return; } if (!bigTree) return; struct subtreeInfo *ti; int subtreeNum; for (subtreeNum = 1, ti = subtreeInfoList; ti != NULL; ti = ti->next, subtreeNum++) { - writeSubtreeTrackLine(ctF, ti, subtreeNum, source, fontHeight); + writeSubtreeTrackLine(ctF, ti, subtreeNum, source, geneTrack, fontHeight); writeVcfHeader(ctF, ti->subtreeNameList); - writeSubtreeVcf(ctF, ti->subtreeIdToIx, userVcf, bigTree); + writeSubtreeVcf(ctF, ti->subtreeIdToIx, chrom, chromSize, userVcf, bigTree); fputc('\n', ctF); } vcfFileFree(&userVcf); reportTiming(pStartTime, "write subtree custom tracks"); } -struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur, +struct tempName *writeCustomTracks(char *db, struct tempName *vcfTn, struct usherResults *ur, struct slName *sampleIds, struct mutationAnnotatedTree *bigTree, - char *source, int fontHeight, struct phyloTree **retSampleTree, - int *pStartTime) + char *source, int fontHeight, + struct phyloTree **retSampleTree, int *pStartTime) /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */ { -struct tempName *ctVcfTn = userVcfWithImputedBases(vcfTn, ur->samplePlacements, sampleIds); +char *chrom = hDefaultChrom(db); +int chromSize = hChromSize(db, chrom); +char *geneTrack = phyloPlaceDbSetting(db, "geneTrack"); +// Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra. +if (isEmpty(geneTrack)) + geneTrack = sameString(db, "wuhCor1") ? "ncbiGeneBGP" : "ncbiGene"; +struct tempName *ctVcfTn = userVcfWithImputedBases(vcfTn, ur->samplePlacements, sampleIds, + chromSize); struct tempName *ctTn; AllocVar(ctTn); trashDirFile(ctTn, "ct", "ct_pp", ".ct"); FILE *ctF = mustOpen(ctTn->forCgi, "w"); int subtreeCount = slCount(ur->subtreeInfoList); if (subtreeCount <= MAX_SUBTREE_CTS) - addSubtreeCustomTracks(ctF, ctVcfTn->forCgi, ur->subtreeInfoList, ur->samplePlacements, bigTree, - source, fontHeight, pStartTime); + addSubtreeCustomTracks(ctF, ctVcfTn->forCgi, ur->subtreeInfoList, ur->samplePlacements, + chrom, chromSize, bigTree, source, geneTrack, fontHeight, pStartTime); else printf("<p>Subtree custom tracks are added when there are at most %d subtrees, " "but %d subtrees were found.</p>\n", MAX_SUBTREE_CTS, subtreeCount); struct phyloTree *sampleTree = addSampleOnlyCustomTrack(ctF, ctVcfTn, ur->bigTreePlusTn->forCgi, - sampleIds, fontHeight, pStartTime); + sampleIds, geneTrack, fontHeight, + pStartTime); if (retSampleTree) *retSampleTree = sampleTree; carefulClose(&ctF); return ctTn; }