7e58340888377874edaad1dbc5174e20295f890c angie Mon Feb 22 14:17:33 2021 -0800 Support upload of more sequences, add TSV file summarizing sample variants and placements. Requested by Joe de Risi (UCSF). Increase timeout to 10 minutes; make TSV with each sample's ID, nuc muts, AA muts, imputed bases and path from root to sample. Also use Yatish's new -K subtree algorithm in usher: one subtree encompassing all uploaded samples, plus the specified number of samples randomly selected from the rest of the tree. Don't show every single sample name in the title because there can be 1000 samples in the same subtree now. :) diff --git src/hg/hgPhyloPlace/writeCustomTracks.c src/hg/hgPhyloPlace/writeCustomTracks.c index 6e98f88..0f16bb1 100644 --- src/hg/hgPhyloPlace/writeCustomTracks.c +++ src/hg/hgPhyloPlace/writeCustomTracks.c @@ -174,75 +174,79 @@ int i; fputs(words[0], f); for (i = 1; i < colCount; i++) { fputc('\t', f); fputs(words[i], f); } } fputc('\n', f); } carefulClose(&f); } return ctVcfTn; } -static void uploadedSamplesTree(char *sampleTreeFile, char *bigTreeFile, struct slName *sampleIds) +static struct phyloTree *uploadedSamplesTree(char *sampleTreeFile, char *bigTreeFile, + struct slName *sampleIds) /* Read in tree from bigTreeFile, prune all nodes that have no leaf descendants in sampleIds, * save pruned tree to sampleTreeFile. */ { struct phyloTree *tree = phyloOpenTree(bigTreeFile); tree = phyloPruneToIds(tree, sampleIds); FILE *f = mustOpen(sampleTreeFile, "w"); phyloPrintTree(tree, f); carefulClose(&f); +return tree; } static int heightForSampleCount(int fontHeight, int count) /* Return a height sufficient for this many labels stacked vertically. */ { return (fontHeight + 1) * count; } -static void addSampleOnlyCustomTrack(FILE *ctF, struct tempName *vcfTn, char *bigTreePlusFile, - struct slName *sampleIds, +static struct phyloTree *addSampleOnlyCustomTrack(FILE *ctF, struct tempName *vcfTn, + char *bigTreePlusFile, struct slName *sampleIds, int fontHeight, int *pStartTime) /* Make custom track with uploaded VCF. If there are enough samples to make a non-trivial tree, * then make the tree by pruning down the big tree plus samples to only the uploaded samples. */ { +struct phyloTree *sampleTree = NULL; fprintf(ctF, "track name=uploadedSamples description='Uploaded sample genotypes' " "type=vcf visibility=pack " "hapClusterEnabled=on hapClusterHeight=%d ", heightForSampleCount(fontHeight, slCount(sampleIds))); if (slCount(sampleIds) >= minSamplesForOwnTree) { struct tempName sampleTreeTn; trashDirFile(&sampleTreeTn, "ct", "uploadedSamples", ".nwk"); - uploadedSamplesTree(sampleTreeTn.forCgi, bigTreePlusFile, sampleIds); + sampleTree = uploadedSamplesTree(sampleTreeTn.forCgi, bigTreePlusFile, sampleIds); fprintf(ctF, "hapClusterMethod='treeFile %s' ", sampleTreeTn.forCgi); } else fprintf(ctF, "hapClusterMethod=fileOrder "); if (isNotEmpty(geneTrack)) fprintf(ctF, "hapClusterColorBy=function geneTrack=%s", geneTrack); fputc('\n', ctF); struct lineFile *vcfLf = lineFileOpen(vcfTn->forCgi, TRUE); char *line; while (lineFileNext(vcfLf, &line, NULL)) fprintf(ctF, "%s\n",line); lineFileClose(&vcfLf); reportTiming(pStartTime, "add custom track for uploaded samples"); +return sampleTree; } static struct vcfFile *parseUserVcf(char *userVcfFile, int *pStartTime) /* Read in user VCF file and parse genotypes. */ { struct vcfFile *userVcf = vcfFileMayOpen(userVcfFile, chrom, 0, chromSize, 0, maxVcfRows, TRUE); if (! userVcf) return NULL; reportTiming(pStartTime, "read userVcf"); struct vcfRecord *rec; for (rec = userVcf->records; rec != NULL; rec = rec->next) vcfParseGenotypesGtOnly(rec); reportTiming(pStartTime, "parse userVcf genotypes"); return userVcf; } @@ -540,30 +544,39 @@ return; struct subtreeInfo *ti; for (ti = subtreeInfoList; ti != NULL; ti = ti->next) { writeSubtreeTrackLine(ctF, ti, source, fontHeight); writeVcfHeader(ctF, ti->subtreeNameList); writeSubtreeVcf(ctF, ti->subtreeIdToIx, userVcf, bigTree); fputc('\n', ctF); } vcfFileFree(&userVcf); reportTiming(pStartTime, "write subtree custom tracks"); } struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur, struct slName *sampleIds, struct phyloTree *bigTree, - char *source, int fontHeight, int *pStartTime) + char *source, int fontHeight, struct phyloTree **retSampleTree, + int *pStartTime) /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */ { struct tempName *ctVcfTn = userVcfWithImputedBases(vcfTn, ur->samplePlacements, sampleIds); struct tempName *ctTn; AllocVar(ctTn); trashDirFile(ctTn, "ct", "ct_pp", ".ct"); FILE *ctF = mustOpen(ctTn->forCgi, "w"); +int subtreeCount = slCount(ur->subtreeInfoList); +if (subtreeCount <= MAX_SUBTREE_CTS) addSubtreeCustomTracks(ctF, ctVcfTn->forCgi, ur->subtreeInfoList, ur->samplePlacements, bigTree, source, fontHeight, pStartTime); -addSampleOnlyCustomTrack(ctF, ctVcfTn, ur->bigTreePlusTn->forCgi, sampleIds, fontHeight, - pStartTime); +else + printf("

Subtree custom tracks are added when there are at most %d subtrees, " + "but %d subtrees were found.

\n", + MAX_SUBTREE_CTS, subtreeCount); +struct phyloTree *sampleTree = addSampleOnlyCustomTrack(ctF, ctVcfTn, ur->bigTreePlusTn->forCgi, + sampleIds, fontHeight, pStartTime); +if (retSampleTree) + *retSampleTree = sampleTree; carefulClose(&ctF); return ctTn; }