9c4b7225d91f5180fcddbecd843b71c4e6503688 angie Tue Mar 30 10:39:22 2021 -0700 Add subtree JSON URLs to single-subtree JSON, using Auspice's new url attribute. Allow up to 1000 VCF samples and extend timeout to 15 minutes. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 80ed662..0583fc3 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -23,31 +23,31 @@ #include "pipeline.h" #include "psl.h" #include "ra.h" #include "regexHelper.h" #include "trashDir.h" #include "vcf.h" // Globals: static boolean measureTiming = FALSE; // wuhCor1-specific: char *chrom = "NC_045512v2"; int chromSize = 29903; // Parameter constants: -int maxGenotypes = 100; // Upper limit on number of samples user can upload at once. +int maxGenotypes = 1000; // Upper limit on number of samples user can upload at once. boolean showBestNodePaths = FALSE; boolean showParsimonyScore = FALSE; char *phyloPlaceDbSetting(char *db, char *settingName) /* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */ { static struct hash *configHash = NULL; static char *configDb = NULL; if (!sameOk(db, configDb)) { char configFile[1024]; safef(configFile, sizeof configFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", db); if (fileExists(configFile)) { @@ -633,30 +633,32 @@ headerWordCount = chopString(headerLine, "\t", NULL, 0); AllocArray(headerWords, headerWordCount); chopString(headerLine, "\t", headerWords, headerWordCount); } else errAbort("Missing header line from metadataFile %s", metadataFile); } int strainIx = stringArrayIx("strain", headerWords, headerWordCount); int epiIdIx = stringArrayIx("gisaid_epi_isl", headerWords, headerWordCount); int genbankIx = stringArrayIx("genbank_accession", headerWords, headerWordCount); int dateIx = stringArrayIx("date", headerWords, headerWordCount); int authorIx = stringArrayIx("authors", headerWords, headerWordCount); int nCladeIx = stringArrayIx("Nextstrain_clade", headerWords, headerWordCount); int gCladeIx = stringArrayIx("GISAID_clade", headerWords, headerWordCount); int lineageIx = stringArrayIx("pangolin_lineage", headerWords, headerWordCount); + if (lineageIx < 0) + lineageIx = stringArrayIx("pango_lineage", headerWords, headerWordCount); int countryIx = stringArrayIx("country", headerWords, headerWordCount); int divisionIx = stringArrayIx("division", headerWords, headerWordCount); int locationIx = stringArrayIx("location", headerWords, headerWordCount); int countryExpIx = stringArrayIx("country_exposure", headerWords, headerWordCount); int divExpIx = stringArrayIx("division_exposure", headerWords, headerWordCount); int origLabIx = stringArrayIx("originating_lab", headerWords, headerWordCount); int subLabIx = stringArrayIx("submitting_lab", headerWords, headerWordCount); int regionIx = stringArrayIx("region", headerWords, headerWordCount); while (lineFileNext(lf, &line, NULL)) { char *words[headerWordCount]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, headerWordCount, wordCount); struct sampleMetadata *met; AllocVar(met); @@ -2175,57 +2177,64 @@ warn("Sorry, can't recognize your uploaded data as FASTA or VCF.\n"); } lineFileClose(&lf); if (vcfTn) { fflush(stdout); int seqCount = slCount(seqInfoList); // Don't make smaller subtrees when a large number of sequences are uploaded. if (seqCount > MAX_SEQ_DETAILS) subtreeSize = 0; struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi, subtreeSize, sampleIds, bigTree->condensedNodes, &startTime); if (results->singleSubtreeInfo) { + puts("<p></p>"); readQcThresholds(db); int subtreeCount = slCount(results->subtreeInfoList); // Sort subtrees by number of user samples (largest first). slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp); // Make Nextstrain/auspice JSON file for each subtree. char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile"); struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, refGenome); struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize); struct hash *sampleMetadata = getSampleMetadata(metadataFile); - struct tempName *singleSubtreeJsonTn; - AllocVar(singleSubtreeJsonTn); - trashDirFile(singleSubtreeJsonTn, "ct", "singleSubtreeAuspice", ".json"); - treeToAuspiceJson(results->singleSubtreeInfo, db, geneInfoList, gSeqWin, sampleMetadata, - singleSubtreeJsonTn->forCgi, source); + struct hash *sampleUrls = hashNew(0); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); char subtreeName[512]; safef(subtreeName, sizeof(subtreeName), "subtreeAuspice%d", ix+1); trashDirFile(jsonTns[ix], "ct", subtreeName, ".json"); - treeToAuspiceJson(ti, db, geneInfoList, gSeqWin, sampleMetadata, jsonTns[ix]->forCgi, - source); + treeToAuspiceJson(ti, db, geneInfoList, gSeqWin, sampleMetadata, NULL, + jsonTns[ix]->forCgi, source); + // Add a link for every sample to this subtree, so the single-subtree JSON can + // link to subtree JSONs + char *subtreeUrl = nextstrainUrlFromTn(jsonTns[ix]); + struct slName *sample; + for (sample = ti->subtreeUserSampleIds; sample != NULL; sample = sample->next) + hashAdd(sampleUrls, sample->name, subtreeUrl); } - puts("<p></p>"); + struct tempName *singleSubtreeJsonTn; + AllocVar(singleSubtreeJsonTn); + trashDirFile(singleSubtreeJsonTn, "ct", "singleSubtreeAuspice", ".json"); + treeToAuspiceJson(results->singleSubtreeInfo, db, geneInfoList, gSeqWin, sampleMetadata, + sampleUrls, singleSubtreeJsonTn->forCgi, source); struct subtreeInfo *subtreeInfoForButtons = results->subtreeInfoList; if (seqCount > MAX_SEQ_DETAILS || subtreeCount > MAX_SUBTREE_BUTTONS) subtreeInfoForButtons = NULL; makeButtonRow(singleSubtreeJsonTn, jsonTns, subtreeInfoForButtons, subtreeSize, isFasta); printf("<p>If you have metadata you wish to display, click a 'view subtree in " "Nextstrain' button, and then you can drag on a CSV file to " "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>." "</p>\n"); // Make custom tracks for uploaded samples and subtree(s). struct phyloTree *sampleTree = NULL; struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree, source, fontHeight, &sampleTree, &startTime); // Make a sample summary TSV file and accumulate S gene changes