e267c9ec78e654975dbcd0bb8d5b4bf393187269 angie Thu Mar 4 15:39:32 2021 -0800 Add ZIP file of subtree JSON & Newick files for download. Add subtree numbers to the filenames so it's not all trashDir soup. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index cf2966e..ea6177b 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -8,30 +8,31 @@ #include "errCatch.h" #include "fa.h" #include "genePred.h" #include "hCommon.h" #include "hash.h" #include "hgConfig.h" #include "htmshell.h" #include "hui.h" #include "iupac.h" #include "jsHelper.h" #include "linefile.h" #include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" +#include "pipeline.h" #include "psl.h" #include "ra.h" #include "regexHelper.h" #include "trashDir.h" #include "vcf.h" // Globals: static boolean measureTiming = FALSE; // wuhCor1-specific: char *chrom = "NC_045512v2"; int chromSize = 29903; // Parameter constants: int maxGenotypes = 100; // Upper limit on number of samples user can upload at once. @@ -1977,62 +1978,94 @@ struct aaMutInfo *ami = sChanges[ix]; int sampleCount = slCount(ami->sampleIds); fprintf(f, "S:%s\t%d\t%f", ami->name, sampleCount, (double)sampleCount / (double)totalSampleCount); slReverse(&ami->sampleIds); fprintf(f, "\t%s", ami->sampleIds->name); struct slName *sample; for (sample = ami->sampleIds->next; sample != NULL; sample = sample->next) fprintf(f, ",%s", sample->name); fputc('\n', f); } carefulClose(&f); return tsvTn; } +static struct tempName *makeSubtreeZipFile(struct usherResults *results, struct tempName *jsonTns[], + struct tempName *singleSubtreeJsonTn, int *pStartTime) +/* Make a zip archive file containing all of the little subtree Newick and JSON files so + * user doesn't have to click on each one. */ +{ +struct tempName *zipTn; +AllocVar(zipTn); +trashDirFile(zipTn, "ct", "usher_subtrees", ".zip"); +int subtreeCount = slCount(results->subtreeInfoList); +char *cmd[10 + 2*(subtreeCount+1)]; +char **cmds[] = { cmd, NULL }; +int cIx = 0, sIx = 0; +cmd[cIx++] = "zip"; +cmd[cIx++] = "-j"; +cmd[cIx++] = zipTn->forCgi; +cmd[cIx++] = singleSubtreeJsonTn->forCgi; +cmd[cIx++] = results->singleSubtreeInfo->subtreeTn->forCgi; +struct subtreeInfo *ti; +for (ti = results->subtreeInfoList; ti != NULL; ti = ti->next, sIx++) + { + cmd[cIx++] = jsonTns[sIx]->forCgi; + cmd[cIx++] = ti->subtreeTn->forCgi; + } +cmd[cIx++] = NULL; +struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, NULL); +pipelineClose(&pl); +reportTiming(pStartTime, "make subtree zipfile"); +return zipTn; +} + static struct slName **getProblematicSites(char *db) /* If config.ra specfies maskFile them return array of lists (usually NULL) of reasons that * masking is recommended, one per position in genome; otherwise return NULL. */ { struct slName **pSites = NULL; char *pSitesFile = phyloPlaceDbSettingPath(db, "maskFile"); if (isNotEmpty(pSitesFile) && fileExists(pSitesFile)) { AllocArray(pSites, chromSize); struct bbiFile *bbi = bigBedFileOpen(pSitesFile); struct lm *lm = lmInit(0); struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm); for (bb = bbList; bb != NULL; bb = bb->next) { char *extra = bb->rest; char *reason = nextWord(&extra); int i; for (i = bb->start; i < bb->end; i++) slNameAddHead(&pSites[i], reason); } bigBedFileClose(&bbi); } return pSites; } -static void downloadsRow(char *treeFile, char *sampleSummaryFile, char *spikeSummaryFile) +static void downloadsRow(char *treeFile, char *sampleSummaryFile, char *spikeSummaryFile, + char *subtreeZipFile) /* Make a row of quick download file links, to appear between the button row & big summary table. */ { printf("<p><b>Downloads:</b> | "); printf("<a href='%s' download>Global phylogenetic tree with your sequences</a> | ", treeFile); printf("<a href='%s' download>TSV summary of sequences and placements</a> | ", sampleSummaryFile); printf("<a href='%s' download>TSV summary of Spike mutations</a> | ", spikeSummaryFile); +printf("<a href='%s' download>ZIP file of subtree JSON and Newick files</a> | ", subtreeZipFile); puts("</p>"); } static int subTreeInfoUserSampleCmp(const void *pa, const void *pb) /* Compare subtreeInfo by number of user sample IDs (highest number first). */ { struct subtreeInfo *tiA = *(struct subtreeInfo **)pa; struct subtreeInfo *tiB = *(struct subtreeInfo **)pb; return slCount(tiB->subtreeUserSampleIds) - slCount(tiA->subtreeUserSampleIds); } char *phyloPlaceSamples(struct lineFile *lf, char *db, char *defaultProtobuf, boolean doMeasureTiming, int subtreeSize, int fontHeight) /* Given a lineFile that contains either FASTA or VCF, prepare VCF for usher; * if that goes well then run usher, report results, make custom track files @@ -2147,55 +2180,60 @@ char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile"); struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, refGenome); struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize); struct hash *sampleMetadata = getSampleMetadata(metadataFile); struct tempName *singleSubtreeJsonTn; AllocVar(singleSubtreeJsonTn); trashDirFile(singleSubtreeJsonTn, "ct", "singleSubtreeAuspice", ".json"); treeToAuspiceJson(results->singleSubtreeInfo, db, geneInfoList, gSeqWin, sampleMetadata, singleSubtreeJsonTn->forCgi, source); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); - trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json"); + char subtreeName[512]; + safef(subtreeName, sizeof(subtreeName), "subtreeAuspice%d", ix+1); + trashDirFile(jsonTns[ix], "ct", subtreeName, ".json"); treeToAuspiceJson(ti, db, geneInfoList, gSeqWin, sampleMetadata, jsonTns[ix]->forCgi, source); } puts("<p></p>"); int subtreeButtonCount = subtreeCount; if (seqCount > MAX_SEQ_DETAILS || subtreeCount > MAX_SUBTREE_BUTTONS) subtreeButtonCount = 0; makeButtonRow(singleSubtreeJsonTn, jsonTns, subtreeButtonCount, isFasta); printf("<p>If you have metadata you wish to display, click a 'view subtree in " "Nextstrain' button, and then you can drag on a CSV file to " "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>." "</p>\n"); // Make custom tracks for uploaded samples and subtree(s). struct phyloTree *sampleTree = NULL; struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree, source, fontHeight, &sampleTree, &startTime); // Make a sample summary TSV file and accumulate S gene changes struct hash *spikeChanges = hashNew(0); struct tempName *tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoList, geneInfoList, gSeqWin, spikeChanges, &startTime); struct tempName *sTsvTn = writeSpikeChangeSummary(spikeChanges, slCount(sampleIds)); - downloadsRow(results->bigTreePlusTn->forHtml, tsvTn->forHtml, sTsvTn->forHtml); + struct tempName *zipTn = makeSubtreeZipFile(results, jsonTns, singleSubtreeJsonTn, + &startTime); + downloadsRow(results->bigTreePlusTn->forHtml, tsvTn->forHtml, sTsvTn->forHtml, + zipTn->forHtml); if (seqCount <= MAX_SEQ_DETAILS) { summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, bigTree, refGenome); reportTiming(&startTime, "write summary table (including reading in lineages)"); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<h3>Subtree %d: ", ix+1); if (subtreeUserSampleCount > 1) printf("%d related samples", subtreeUserSampleCount); else if (subtreeCount > 1) printf("Unrelated sample"); printf("</h3>\n"); @@ -2212,30 +2250,34 @@ else printf("<p>(Skipping details and subtrees; " "you uploaded %d sequences, and details/subtrees are shown only when " "you upload at most %d sequences.)</p>\n", seqCount, MAX_SEQ_DETAILS); // Offer big tree w/new samples for download puts("<h3>Downloads</h3>"); puts("<ul>"); printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree " "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml); printf("<li><a href='%s' download>TSV summary of sequences and placements</a>\n", tsvTn->forHtml); printf("<li><a href='%s' download>TSV summary of S (Spike) gene changes</a>\n", sTsvTn->forHtml); + printf("<li><a href='%s' download>ZIP archive of subtree Newick and JSON files</a>\n", + zipTn->forHtml); + // For now, leave in the individual links so I don't break anybody's pipeline that's + // scraping this page... for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml, ti->subtreeUserSampleIds->name); if (subtreeUserSampleCount > 10) printf(" and %d other samples", subtreeUserSampleCount - 1); else { struct slName *sln; for (sln = ti->subtreeUserSampleIds->next; sln != NULL; sln = sln->next) printf(", %s", sln->name); } puts(" (Newick file)</a>"); printf("<li><a href='%s' download>Auspice JSON for subtree with %s",