491395482b817cfe82cbdef4dd7f8ee0723c1a4a angie Wed Mar 23 21:01:49 2011 -0700 Feature #2822, #2823 (VCF customFactory + track handler):Added a new track type, vcfTabix, with handlers in hgTracks and hgc and a customFactory. It is a new bigDataUrl type of track; the remote VCF file must be compressed and indexed by tabix, so like BAM a separate index file is required. If the VCF file has genotypes, then each sample's two haplotypes are graphed in a line, with one line per sample. Otherwise, alleles and counts (if available) are drawn using Belinda's pgSnp methods. The source code has to be compiled with USE_TABIX=1 (which is automatically set for us by common.mk when it finds the local installation) in order for the CGIs to recognize the track type. diff --git src/hg/hgc/vcfClick.c src/hg/hgc/vcfClick.c new file mode 100644 index 0000000..7dd415c --- /dev/null +++ src/hg/hgc/vcfClick.c @@ -0,0 +1,244 @@ +/* vcfTrack -- handlers for Variant Call Format data. */ + +#ifdef USE_TABIX + +#include "common.h" +#include "dystring.h" +#include "errCatch.h" +#include "hCommon.h" +#include "hdb.h" +#include "hgc.h" +#if (defined USE_TABIX && defined KNETFILE_HOOKS) +#include "knetUdc.h" +#include "udc.h" +#endif//def USE_TABIX && KNETFILE_HOOKS +#include "pgSnp.h" +#include "trashDir.h" +#include "vcf.h" + + +#define NA "n/a" + +static void printListWithDescriptions(struct vcfFile *vcff, char *str, char *sep, struct vcfInfoDef *infoDefs) +/* Given a VCF field, its separator char and a list of vcfInfoDefs, print out a list + * of values with descriptions if descriptions are available. */ +{ +char *copy = cloneString(str); +char *words[256]; +int i, wordCount = chopString(copy, sep, words, ArraySize(words)); +for (i = 0; i < wordCount; i++) + { + if (i > 0) + printf(", "); + char *key = words[i]; + const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key); + if (def != NULL) + printf("%s (%s)", key, def->description); + else + printf("%s", key); + } +printf("
\n"); +} + +static void vcfAltAlleleDetails(struct vcfRecord *rec) +/* If VCF header specifies any symbolic alternate alleles, pull in descriptions. */ +{ +printf("Alternate allele(s): "); +if (sameString(rec->alt, ".")) + { + printf(NA"
\n"); + return; + } +struct vcfFile *vcff = rec->file; +printListWithDescriptions(vcff, rec->alt, ",", vcff->altDefs); +} + +static void vcfFilterDetails(struct vcfRecord *rec) +/* If VCF header specifies any filters, pull in descriptions. */ +{ +if (sameString(rec->filter, "PASS")) + printf("Filter: "); +else + printf("Filter failures: "); +struct vcfFile *vcff = rec->file; +if (sameString(rec->filter, ".")) + printf(NA"
\n"); +else + printListWithDescriptions(vcff, rec->filter, ",", vcff->filterDefs); +} + +static void vcfInfoDetails(struct vcfRecord *rec) +/* Expand info keys to descriptions, then print out keys and values. */ +{ +if (rec->infoCount == 0) + return; +struct vcfFile *vcff = rec->file; +puts("INFO column annotations:
"); +puts(""); +int i; +for (i = 0; i < rec->infoCount; i++) + { + struct vcfInfoElement *el = &(rec->infoElements[i]); + const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, el->key); + if (def == NULL) + continue; + printf("\n"); + } +puts("
%s: ", el->key); + int j; + enum vcfInfoType type = def->type; + if (type == vcfInfoFlag && el->count == 0) + printf("Yes"); // no values, so we can't call vcfPrintDatum... + // However, if this is older VCF, type vcfInfoFlag might have a value. + for (j = 0; j < el->count; j++) + { + if (j > 0) + printf(", "); + vcfPrintDatum(stdout, el->values[j], type); + } + if (def != NULL) + printf(" %s", def->description); + else + printf(""); + printf("
"); +} + +static char *hapFromIx(char *ref, char *altAlleles[], unsigned char altAlCount, unsigned char hapIx) +/* Look up the allele specified by hapIx: 0 = ref, 1 & up = offset index into altAlleles */ +{ +if (hapIx == 0) + return ref; +else if (hapIx-1 < altAlCount) + return altAlleles[hapIx-1]; +else + errAbort("hapFromIx: index %d is out of range (%d alleles specified)", hapIx, altAlCount+1); +return NULL; +} + +static void vcfGenotypesDetails(struct vcfRecord *rec) +/* Print genotypes in some kind of table... */ +{ +struct vcfFile *vcff = rec->file; +if (vcff->genotypeCount == 0) + return; +static struct dyString *tmp1 = NULL, *tmp2 = NULL; +if (tmp1 == NULL) + { + tmp1 = dyStringNew(0); + tmp2 = dyStringNew(0); + } +// TODO: make this a collapsible section +vcfParseGenotypes(rec); +dyStringClear(tmp1); +dyStringAppend(tmp1, rec->format); +enum vcfInfoType formatTypes[256]; +char *formatKeys[256]; +int formatCount = chopString(tmp1->string, ":", formatKeys, ArraySize(formatKeys)); +puts("
Genotype info key:
"); +int i; +for (i = 0; i < formatCount; i++) + { + if (sameString(formatKeys[i], vcfGtGenotype)) + continue; + const struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, formatKeys[i]); + char *desc = def ? def->description : "not described in VCF header"; + printf("  %s: %s
\n", formatKeys[i], desc); + formatTypes[i] = def->type; + } +hTableStart(); +puts("Sample IDGenotypePhased?"); +for (i = 0; i < formatCount; i++) + { + if (sameString(formatKeys[i], vcfGtGenotype)) + continue; + printf("%s", formatKeys[i]); + } +puts("\n"); +dyStringClear(tmp2); +dyStringAppend(tmp2, rec->alt); +char *altAlleles[256]; +unsigned char altCount = chopCommas(tmp2->string, altAlleles); +for (i = 0; i < vcff->genotypeCount; i++) + { + struct vcfGenotype *gt = &(rec->genotypes[i]); + char *hapA = hapFromIx(rec->ref, altAlleles, altCount, gt->hapIxA); + char *hapB = gt->isHaploid ? NA : hapFromIx(rec->ref, altAlleles, altCount, gt->hapIxB); + char sep = gt->isPhased ? '|' : '/'; + char *phasing = gt->isHaploid ? NA : gt->isPhased ? "Y" : "n"; + printf("%s%s%c%s%s", vcff->genotypeIds[i], + hapA, sep, hapB, phasing); + int j; + for (j = 0; j < formatCount; j++) + { + if (sameString(formatKeys[j], vcfGtGenotype)) + continue; + printf(""); + struct vcfInfoElement *el = &(gt->infoElements[j]); + int k; + for (k = 0; k < el->count; k++) + { + if (k > 0) + printf(", "); + vcfPrintDatum(stdout, el->values[k], formatTypes[j]); + } + printf(""); + } + puts(""); + } +hTableEnd(); +} + +static void vcfRecordDetails(struct vcfRecord *rec) +/* Display the contents of a single line of VCF. */ +{ +printf("Name: %s
\n", rec->name); +printPosOnChrom(seqName, rec->chromStart, rec->chromEnd, NULL, FALSE, rec->name); +printf("Reference allele: %s
\n", rec->ref); +vcfAltAlleleDetails(rec); +if (rec->qual != 0.0) + printf("Call quality: %.1f
\n", rec->qual); +vcfFilterDetails(rec); +vcfInfoDetails(rec); +vcfGenotypesDetails(rec); +} + +void doVcfTabixDetails(struct trackDb *tdb, char *item) +/* Show details of an alignment from a VCF file compressed and indexed by tabix. */ +{ +#if (defined USE_TABIX && defined KNETFILE_HOOKS) +knetUdcInstall(); +if (udcCacheTimeout() < 300) + udcSetCacheTimeout(300); +#endif//def USE_TABIX && KNETFILE_HOOKS +int start = cartInt(cart, "o"); +int end = cartInt(cart, "t"); +struct sqlConnection *conn = hAllocConnTrack(database, tdb); +// TODO: will need to handle per-chrom files like bam, maybe fold bamFileNameFromTable into this:: +char *fileOrUrl = bbiNameFromSettingOrTable(tdb, conn, tdb->table); +hFreeConn(&conn); +int vcfMaxErr = 100; +struct vcfFile *vcff = NULL; +/* protect against temporary network error */ +struct errCatch *errCatch = errCatchNew(); +if (errCatchStart(errCatch)) + { + vcff = vcfTabixFileMayOpen(fileOrUrl, seqName, start, end, vcfMaxErr); + } +errCatchEnd(errCatch); +if (errCatch->gotError) + { + if (isNotEmpty(errCatch->message->string)) + warn("%s", errCatch->message->string); + } +errCatchFree(&errCatch); +if (vcff != NULL) + { + struct vcfRecord *rec; + for (rec = vcff->records; rec != NULL; rec = rec->next) + if (rec->chromStart == start && rec->chromEnd == end) // in pgSnp mode, don't get name + vcfRecordDetails(rec); + } +} + + +#endif // no USE_TABIX