491395482b817cfe82cbdef4dd7f8ee0723c1a4a angie Wed Mar 23 21:01:49 2011 -0700 Feature #2822, #2823 (VCF customFactory + track handler):Added a new track type, vcfTabix, with handlers in hgTracks and hgc and a customFactory. It is a new bigDataUrl type of track; the remote VCF file must be compressed and indexed by tabix, so like BAM a separate index file is required. If the VCF file has genotypes, then each sample's two haplotypes are graphed in a line, with one line per sample. Otherwise, alleles and counts (if available) are drawn using Belinda's pgSnp methods. The source code has to be compiled with USE_TABIX=1 (which is automatically set for us by common.mk when it finds the local installation) in order for the CGIs to recognize the track type. diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c new file mode 100644 index 0000000..00f1aff --- /dev/null +++ src/hg/hgTracks/vcfTrack.c @@ -0,0 +1,429 @@ +/* vcfTrack -- handlers for Variant Call Format data. */ + +#include "common.h" +#include "bigWarn.h" +#include "dystring.h" +#include "errCatch.h" +#include "hdb.h" +#include "hgTracks.h" +#include "pgSnp.h" +#include "trashDir.h" +#include "vcf.h" +#if (defined USE_TABIX && defined KNETFILE_HOOKS) +#include "knetUdc.h" +#include "udc.h" +#endif//def USE_TABIX && KNETFILE_HOOKS + +#ifdef USE_TABIX + +//#*** TODO: use trackDb/cart setting or something +static boolean boringBed = FALSE; +static boolean doHapGraphDisplay = TRUE; +static boolean sortByGenotype = TRUE; + +static struct bed4 *vcfFileToBed4(struct vcfFile *vcff) +/* Convert vcff's records to bed4; don't free vcff until you're done with bed4 + * because bed4 contains pointers into vcff's records' chrom and name. */ +{ +struct bed4 *bedList = NULL; +struct vcfRecord *rec; +for (rec = vcff->records; rec != NULL; rec = rec->next) + { + struct bed4 *bed; + AllocVar(bed); + bed->chrom = rec->chrom; + bed->chromStart = rec->chromStart; + bed->chromEnd = rec->chromEnd; + bed->name = rec->name; + slAddHead(&bedList, bed); + } +slReverse(&bedList); +return bedList; +} + +#define VCF_MAX_ALLELE_LEN 80 + +static struct pgSnp *vcfFileToPgSnp(struct vcfFile *vcff) +/* Convert vcff's records to pgSnp; don't free vcff until you're done with pgSnp + * because it contains pointers into vcff's records' chrom. */ +{ +struct pgSnp *pgsList = NULL; +struct vcfRecord *rec; +struct dyString *dy = dyStringNew(0); +for (rec = vcff->records; rec != NULL; rec = rec->next) + { + struct pgSnp *pgs; + AllocVar(pgs); + pgs->chrom = rec->chrom; + pgs->chromStart = rec->chromStart; + pgs->chromEnd = rec->chromEnd; + // Build up slash-separated allele string from rec->ref + rec->alt: + dyStringClear(dy); + dyStringAppend(dy, rec->ref); + int altCount, i; + if (sameString(rec->alt, ".")) + altCount = 0; + else + { + char *words[64]; // we're going to truncate anyway if there are this many alleles! + char copy[VCF_MAX_ALLELE_LEN+1]; + strncpy(copy, rec->alt, VCF_MAX_ALLELE_LEN); + copy[VCF_MAX_ALLELE_LEN] = '\0'; + altCount = chopCommas(copy, words); + for (i = 0; i < altCount && dy->stringSize < VCF_MAX_ALLELE_LEN; i++) + dyStringPrintf(dy, "/%s", words[i]); + if (i < altCount) + altCount = i; + } + pgs->name = cloneStringZ(dy->string, dy->stringSize+1); + pgs->alleleCount = altCount + 1; + // Build up comma-sep list of per-allele counts, if available: + dyStringClear(dy); + int refAlleleCount = 0; + boolean gotAltCounts = FALSE; + for (i = 0; i < rec->infoCount; i++) + if (sameString(rec->infoElements[i].key, "AN")) + { + refAlleleCount = rec->infoElements[i].values[0].datInt; + break; + } + for (i = 0; i < rec->infoCount; i++) + if (sameString(rec->infoElements[i].key, "AC")) + { + int alCounts[64]; + int j; + gotAltCounts = (rec->infoElements[i].count > 0); + for (j = 0; j < rec->infoElements[i].count; j++) + { + int ac = rec->infoElements[i].values[j].datInt; + if (j < altCount) + alCounts[1+j] = ac; + refAlleleCount -= ac; + } + if (gotAltCounts) + { + while (j++ < altCount) + alCounts[1+j] = -1; + alCounts[0] = refAlleleCount; + if (refAlleleCount >= 0) + dyStringPrintf(dy, "%d", refAlleleCount); + else + dyStringAppend(dy, "-1"); + for (j = 0; j < altCount; j++) + if (alCounts[1+j] >= 0) + dyStringPrintf(dy, ",%d", alCounts[1+j]); + else + dyStringAppend(dy, ",-1"); + } + break; + } + if (refAlleleCount > 0 && !gotAltCounts) + dyStringPrintf(dy, "%d", refAlleleCount); + pgs->alleleFreq = cloneStringZ(dy->string, dy->stringSize+1); + // Build up comma-sep list... supposed to be per-allele quality scores but I think + // the VCF spec only gives us one BQ... for the reference position? should ask. + dyStringClear(dy); + for (i = 0; i < rec->infoCount; i++) + if (sameString(rec->infoElements[i].key, "BQ")) + { + float qual = rec->infoElements[i].values[0].datFloat; + dyStringPrintf(dy, "%.1f", qual); + int j; + for (j = 0; j < altCount; j++) + dyStringPrintf(dy, ",%.1f", qual); + break; + } + pgs->alleleScores = cloneStringZ(dy->string, dy->stringSize+1); + slAddHead(&pgsList, pgs); + } +slReverse(&pgsList); +return pgsList; +} + +INLINE char *hapIxToAllele(int hapIx, char *refAllele, char *altAlleles[]) +/* Look up allele by index into reference allele and alternate allele(s). */ +{ +return (hapIx == 0) ? refAllele : altAlleles[hapIx-1]; +} + +INLINE Color colorFromGt(struct vcfGenotype *gt, int ploidIx, char *refAllele, + char *altAlleles[], int altCount) +/* Color allele by base. */ +{ +int hapIx = ploidIx ? gt->hapIxB : gt->hapIxA; +char *allele = hapIxToAllele(hapIx, refAllele, altAlleles); +if (gt->isHaploid && hapIx > 0) + return shadesOfGray[5]; +// Copying pgSnp color scheme here, using first base of allele which is not ideal for multibase +// but allows us to simplify it to 5 colors: +else if (allele[0] == 'A') + return MG_RED; +else if (allele[0] == 'C') + return MG_BLUE; +else if (allele[0] == 'G') + return darkGreenColor; +else if (allele[0] == 'T') + return MG_MAGENTA; +else + return shadesOfGray[5]; +} + +INLINE int drawOneGenotype(struct vcfGenotype *gt, char *ref, char *altAlleles[], int altCount, + struct hvGfx *hvg, int x1, int y, int w, + int itemHeight, int lineHeight) +/* Draw a base-colored box for each haplotype in genotype, with a separating line + * (black for phased genotypes, gray for unphased). Return the new y offset. */ +{ +int ploidIx, ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc. +for (ploidIx = 0; ploidIx < ploidy; ploidIx++) + { + if (ploidIx > 0 && gt->isHaploid) + break; + Color color = colorFromGt(gt, ploidIx, ref, altAlleles, altCount); + hvGfxBox(hvg, x1, y, w, itemHeight, color); + if (ploidIx < ploidy-1) + hvGfxLine(hvg, x1, y+itemHeight, x1+w-1, y+itemHeight, + (gt->isPhased ? MG_BLACK : MG_GRAY)); + y += lineHeight; + } +return y; +} + +static void vcfHapGraphDrawGtOrdered(struct track *tg, int gtOrder[], + struct hvGfx *hvg, int xOff, int yOff, int width, + MgFont *font, Color color, enum trackVisibility vis) +/* Draw per-individual haplotypes in gtOrder (maybe phased, maybe not) from a VCF + * file that contains genotypes. */ +{ +const int lineHeight = tg->lineHeight; +const int itemHeight = tg->heightPer; +const double scale = scaleForPixels(width); +struct dyString *tmp = dyStringNew(0); +const struct vcfFile *vcff = tg->extraUiData; +struct vcfRecord *rec; +for (rec = vcff->records; rec != NULL; rec = rec->next) + { + vcfParseGenotypes(rec); + dyStringClear(tmp); + dyStringAppend(tmp, rec->alt); + char *altAlleles[256]; + int altCount = chopCommas(tmp->string, altAlleles); + int x1 = round((double)(rec->chromStart-winStart)*scale) + xOff; + int x2 = round((double)(rec->chromEnd-winStart)*scale) + xOff; + int w = x2-x1; + if (w < 1) + w = 1; + int y = yOff; + int gtIx; + for (gtIx = 0; gtIx < vcff->genotypeCount; gtIx++) + { + struct vcfGenotype *gt = &(rec->genotypes[gtOrder[gtIx]]); + y = drawOneGenotype(gt, rec->ref, altAlleles, altCount, + hvg, x1, y, w, itemHeight, lineHeight); + } + //#*** TODO: pgSnp-like mouseover text? + mapBoxHgcOrHgGene(hvg, rec->chromStart, rec->chromEnd, x1, yOff, w, tg->height, tg->track, + rec->name, rec->name, FALSE, TRUE, NULL); + } +if (vis == tvPack && withLeftLabels) + { + int y = yOff, ploidy = 2, gtIx; + for (gtIx = 0; gtIx < vcff->genotypeCount; gtIx++) + { + hvGfxTextRight(hvg, leftLabelX, y, leftLabelWidth-1, itemHeight*ploidy, + color, font, vcff->genotypeIds[gtOrder[gtIx]]); + y += lineHeight * ploidy; + } + } +} + +/* For qsorting genotypes-in-window: */ +struct gtIxAndStr + { + int ix; // index into vcfRecord's array of vcfGenotypes + struct dyString *dy; // Concatenation of all genotypes in window + }; + +static void gtIxAndStrInit(struct gtIxAndStr *gt, int ix) +/* Initialize the already-allocated gt. */ +{ +gt->ix = ix; +gt->dy = dyStringNew(0); +} + +static int gtIxAndStrCmp(const void *el1, const void *el2) +/* Comparator for qsort */ +{ +const struct gtIxAndStr *gt1 = el1; +const struct gtIxAndStr *gt2 = el2; +int diff = strcmp(gt1->dy->string, gt2->dy->string); +if (diff == 0) + diff = gt1->ix - gt2->ix; +return diff; +} + +static void vcfHapGraphDraw(struct track *tg, int seqStart, int seqEnd, + struct hvGfx *hvg, int xOff, int yOff, int width, + MgFont *font, Color color, enum trackVisibility vis) +/* Draw per-individual haplotypes (maybe phased, maybe not) from a VCF + * file that contains genotypes. Sort samples by genotypes-in-view if specified. */ +{ +const struct vcfFile *vcff = tg->extraUiData; +const int gtCount = vcff->genotypeCount; +int *gtOrder, i; +AllocArray(gtOrder, gtCount); +if (sortByGenotype) + { + // Step through records to build up per-sample arrays of genotypes-in-window: + struct dyString *tmp = dyStringNew(0); + struct gtIxAndStr *gtIxStrs; + AllocArray(gtIxStrs, gtCount); + for (i = 0; i < gtCount; i++) + gtIxAndStrInit(&(gtIxStrs[i]), i); + struct vcfRecord *rec; + for (rec = vcff->records; rec != NULL; rec = rec->next) + { + vcfParseGenotypes(rec); + dyStringClear(tmp); + dyStringAppend(tmp, rec->alt); + char *altAlleles[256]; + (void)chopCommas(tmp->string, altAlleles); + for (i = 0; i < gtCount; i++) + { + struct vcfGenotype *gt = &(rec->genotypes[i]); + dyStringPrintf(gtIxStrs[i].dy, "%s/%s,", + hapIxToAllele(gt->hapIxA, rec->ref, altAlleles), + hapIxToAllele(gt->hapIxB, rec->ref, altAlleles)); + } + } + // Sort samples by genotypes-in-window --> ordering of genotype Ids + qsort(gtIxStrs, gtCount, sizeof(gtIxStrs[0]), gtIxAndStrCmp); + for (i = 0; i < gtCount; i++) + gtOrder[i] = gtIxStrs[i].ix; + } +else + { + // Draw genotypes in order of appearance in VCF file: + for (i = 0; i < gtCount; i++) + gtOrder[i] = i; + } +vcfHapGraphDrawGtOrdered(tg, gtOrder, hvg, xOff, yOff, width, font, color, vis); +} + +static int vcfHapGraphTotalHeight(struct track *tg, enum trackVisibility vis) +/* Return height of haplotype graph (2 * #samples * lineHeight); + * 2 because we're assuming diploid genomes here, no XXY, tetraploid etc. */ +{ +// Should we make it single-height when on chrY? +const struct vcfFile *vcff = tg->extraUiData; +int ploidy = 2; +tg->height = ploidy * vcff->genotypeCount * tg->lineHeight; +return tg->height; +} + +static char *vcfHapGraphTrackName(struct track *tg, void *item) +/* If someone asks for itemName/mapItemName, just send name of track like wiggle. */ +{ +return tg->track; +} + +static void vcfHapGraphOverloadMethods(struct track *tg, struct vcfFile *vcff) +/* If we confirm at load time that we can draw a haplotype graph, use + * this to overwrite the methods for the rest of execution: */ +{ +tg->heightPer = (tg->visibility == tvSquish) ? (tl.fontHeight/4) : (tl.fontHeight / 2); +tg->lineHeight = tg->heightPer + 1; +tg->drawItems = vcfHapGraphDraw; +tg->totalHeight = vcfHapGraphTotalHeight; +tg->itemHeight = tgFixedItemHeight; +tg->itemName = vcfHapGraphTrackName; +tg->mapItemName = vcfHapGraphTrackName; +tg->itemStart = tgItemNoStart; +tg->itemEnd = tgItemNoEnd; +tg->mapsSelf = TRUE; +tg->extraUiData = vcff; +} + +static void vcfTabixLoadItems(struct track *tg) +/* Load items in window from VCF file using its tabix index file. */ +{ +struct sqlConnection *conn = hAllocConnTrack(database, tg->tdb); +// TODO: may need to handle per-chrom files like bam, maybe fold bamFileNameFromTable into this:: +char *fileOrUrl = bbiNameFromSettingOrTable(tg->tdb, conn, tg->table); +hFreeConn(&conn); +int vcfMaxErr = 100; +struct vcfFile *vcff = NULL; +/* protect against temporary network error */ +struct errCatch *errCatch = errCatchNew(); +if (errCatchStart(errCatch)) + { + vcff = vcfTabixFileMayOpen(fileOrUrl, chromName, winStart, winEnd, vcfMaxErr); + } +errCatchEnd(errCatch); +if (errCatch->gotError) + { + if (isNotEmpty(errCatch->message->string)) + tg->networkErrMsg = cloneString(errCatch->message->string); + tg->drawItems = bigDrawWarning; + tg->totalHeight = bigWarnTotalHeight; + } +errCatchFree(&errCatch); +if (vcff != NULL) + { + if (boringBed) + tg->items = vcfFileToBed4(vcff); + else if (doHapGraphDisplay && vcff->genotypeCount > 0 && + (tg->visibility == tvPack || tg->visibility == tvSquish)) + vcfHapGraphOverloadMethods(tg, vcff); + else + { + tg->items = vcfFileToPgSnp(vcff); + /* base coloring/display decision on count of items */ + tg->customInt = slCount(tg->items); + } + // Don't vcfFileFree here -- we are using its string pointers! + } +} + +void vcfTabixMethods(struct track *track) +/* Methods for VCF + tabix files. */ +{ +if (boringBed == TRUE) + bedMethods(track); +else + pgSnpMethods(track); +track->loadItems = vcfTabixLoadItems; +track->canPack = TRUE; +} + +#else // no USE_TABIX: + +// If code was not built with USE_TABIX=1, but there are vcfTabix tracks, display a message +// in place of the tracks (instead of annoying "No track handler" warning messages). + +static void drawUseVcfTabixWarning(struct track *tg, int seqStart, int seqEnd, struct hvGfx *hvg, + int xOff, int yOff, int width, MgFont *font, Color color, + enum trackVisibility vis) +/* Draw a message saying that the code needs to be built with USE_TABIX=1. */ +{ +char message[512]; +safef(message, sizeof(message), + "Get tabix from samtools.sourceforge.net and recompile kent/src with USE_TABIX=1"); +Color yellow = hvGfxFindRgb(hvg, &undefinedYellowColor); +hvGfxBox(hvg, xOff, yOff, width, tg->heightPer, yellow); +hvGfxTextCentered(hvg, xOff, yOff, width, tg->heightPer, MG_BLACK, font, message); +} + +void vcfTabixMethods(struct track *track) +/* Methods for VCF alignment files, in absence of tabix lib. */ +{ +#if (defined USE_TABIX && defined KNETFILE_HOOKS) +knetUdcInstall(); +if (udcCacheTimeout() < 300) + udcSetCacheTimeout(300); +#endif//def USE_TABIX && KNETFILE_HOOKS +messageLineMethods(track); +track->drawItems = drawUseVcfTabixWarning; +} + +#endif // no USE_TABIX