2c0f74b4fbfefc533d27bf3ccdc18e4ecfdecae7 angie Fri Oct 5 17:07:18 2012 -0700 Follow-up to ce70491d:1. Move hgTracks left-base-of-indel trimming code up to vcf.[ch] for sharing w/hgc. 2. Correct chromStart in hgTracks mapBox links to hgc when we have trimmed a left base. 3. In hgc, abbreviate long sequences (e.g. 40kb deletion) and show trimmed left base in parentheses for consistency with VCF file (and sometimes INFO fields that use left-inclusive coords/seqs). 4. In pgSnpFromVcfRecord, don't truncate long alleles because hgTracks and hgc do their own abbreviating. diff --git src/hg/hgc/vcfClick.c src/hg/hgc/vcfClick.c index 6ee70bc..753368d 100644 --- src/hg/hgc/vcfClick.c +++ src/hg/hgc/vcfClick.c @@ -1,351 +1,399 @@ /* vcfTrack -- handlers for Variant Call Format data. */ #ifdef USE_TABIX #include "common.h" #include "dystring.h" #include "errCatch.h" #include "hCommon.h" #include "hdb.h" #include "hgc.h" #include "htmshell.h" #include "jsHelper.h" #if (defined USE_TABIX && defined KNETFILE_HOOKS) #include "knetUdc.h" #include "udc.h" #endif//def USE_TABIX && KNETFILE_HOOKS #include "pgSnp.h" #include "trashDir.h" #include "vcf.h" #include "vcfUi.h" #define NA "<em>n/a</em>" static void printKeysWithDescriptions(struct vcfFile *vcff, int wordCount, char **words, struct vcfInfoDef *infoDefs) /* Given an array of keys, print out a list of values with * descriptions if descriptions are available. */ { int i; for (i = 0; i < wordCount; i++) { if (i > 0) printf(", "); char *key = words[i]; const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key); if (def != NULL) printf("%s (%s)", htmlEncode(key), def->description); else printf("%s", htmlEncode(key)); } printf("<BR>\n"); } -static void vcfAltAlleleDetails(struct vcfRecord *rec) +static void vcfAltAlleleDetails(struct vcfRecord *rec, char **displayAls) /* If VCF header specifies any symbolic alternate alleles, pull in descriptions. */ { printf("<B>Alternate allele(s):</B> "); if (rec->alleleCount < 2 || sameString(rec->alleles[1], ".")) { printf(NA"<BR>\n"); return; } struct vcfFile *vcff = rec->file; -printKeysWithDescriptions(vcff, rec->alleleCount-1, &(rec->alleles[1]), vcff->altDefs); +printKeysWithDescriptions(vcff, rec->alleleCount-1, &(displayAls[1]), vcff->altDefs); } static void vcfQualDetails(struct vcfRecord *rec) /* If VCF header specifies a quality/confidence score (not "."), print it out. */ { printf("<B>Quality/confidence score:</B> %s<BR>\n", sameString(rec->qual, ".") ? NA : rec->qual); } static void vcfFilterDetails(struct vcfRecord *rec) /* If VCF header specifies any filters, pull in descriptions. */ { if (rec->filterCount == 0 || sameString(rec->filters[0], ".")) printf("<B>Filter:</B> "NA"<BR>\n"); else if (rec->filterCount == 1 && sameString(rec->filters[0], "PASS")) printf("<B>Filter:</B> PASS<BR>\n"); else { printf("<B>Filter failures:</B> "); printf("<font style='font-weight: bold; color: #FF0000;'>\n"); struct vcfFile *vcff = rec->file; printKeysWithDescriptions(vcff, rec->filterCount, rec->filters, vcff->filterDefs); printf("</font>\n"); } } static void vcfInfoDetails(struct vcfRecord *rec) /* Expand info keys to descriptions, then print out keys and values. */ { if (rec->infoCount == 0) return; struct vcfFile *vcff = rec->file; puts("<B>INFO column annotations:</B><BR>"); puts("<TABLE border=0 cellspacing=0 cellpadding=0>"); int i; for (i = 0; i < rec->infoCount; i++) { struct vcfInfoElement *el = &(rec->infoElements[i]); const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, el->key); if (def == NULL) continue; printf("<TR><TD align=\"right\"><B>%s:</B></TD><TD> ", el->key); int j; enum vcfInfoType type = def->type; if (type == vcfInfoFlag && el->count == 0) printf("Yes"); // no values, so we can't call vcfPrintDatum... // However, if this is older VCF, type vcfInfoFlag might have a value. for (j = 0; j < el->count; j++) { if (j > 0) printf(", "); if (el->missingData[j]) printf("."); else vcfPrintDatum(stdout, el->values[j], type); } if (def != NULL) printf("</TD><TD> %s", def->description); else printf("</TD><TD>"); printf("</TD></TR>\n"); } puts("</TABLE>"); } +static void vcfGenotypeTable(struct vcfRecord *rec, char *track, char **displayAls) +/* Put the table containing details about each genotype into a collapsible section. */ +{ +static struct dyString *tmp1 = NULL; +if (tmp1 == NULL) + tmp1 = dyStringNew(0); +jsBeginCollapsibleSection(cart, track, "genotypes", "Detailed genotypes", FALSE); +dyStringClear(tmp1); +dyStringAppend(tmp1, rec->format); +struct vcfFile *vcff = rec->file; +enum vcfInfoType formatTypes[256]; +char *formatKeys[256]; +int formatCount = chopString(tmp1->string, ":", formatKeys, ArraySize(formatKeys)); +puts("<B>Genotype info key:</B><BR>"); +int i; +for (i = 0; i < formatCount; i++) + { + if (sameString(formatKeys[i], vcfGtGenotype)) + continue; + const struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, formatKeys[i]); + char *desc = def ? def->description : "<em>not described in VCF header</em>"; + printf(" <B>%s:</B> %s<BR>\n", formatKeys[i], desc); + formatTypes[i] = def->type; + } +hTableStart(); +puts("<TR><TH>Sample ID</TH><TH>Genotype</TH><TH>Phased?</TH>"); +for (i = 0; i < formatCount; i++) + { + if (sameString(formatKeys[i], vcfGtGenotype)) + continue; + printf("<TH>%s</TH>", formatKeys[i]); + } +puts("</TR>\n"); +for (i = 0; i < vcff->genotypeCount; i++) + { + struct vcfGenotype *gt = &(rec->genotypes[i]); + char *hapA = ".", *hapB = "."; + if (gt->hapIxA >= 0) + hapA = displayAls[(unsigned char)gt->hapIxA]; + if (gt->isHaploid) + hapB = ""; + else if (gt->hapIxB >= 0) + hapB = displayAls[(unsigned char)gt->hapIxB]; + char sep = gt->isHaploid ? ' ' : gt->isPhased ? '|' : '/'; + char *phasing = gt->isHaploid ? NA : gt->isPhased ? "Y" : "n"; + printf("<TR><TD>%s</TD><TD>%s%c%s</TD><TD>%s</TD>", vcff->genotypeIds[i], + hapA, sep, hapB, phasing); + int j; + for (j = 0; j < gt->infoCount; j++) + { + if (sameString(formatKeys[j], vcfGtGenotype)) + continue; + printf("<TD>"); + struct vcfInfoElement *el = &(gt->infoElements[j]); + int k; + for (k = 0; k < el->count; k++) + { + if (k > 0) + printf(", "); + if (el->missingData[k]) + printf("."); + else + vcfPrintDatum(stdout, el->values[k], formatTypes[j]); + } + printf("</TD>"); + } + puts("</TR>"); + } +hTableEnd(); +jsEndCollapsibleSection(); +} + static void ignoreEm(char *format, va_list args) /* Ignore warnings from genotype parsing -- when there's one, there * are usually hundreds more just like it. */ { } -static void vcfGenotypesDetails(struct vcfRecord *rec, char *track) -/* Print genotypes in some kind of table... */ +static void vcfGenotypesDetails(struct vcfRecord *rec, char *track, char **displayAls) +/* Print summary of allele and genotype frequency, plus collapsible section + * with table of genotype details. */ { struct vcfFile *vcff = rec->file; if (vcff->genotypeCount == 0) return; -static struct dyString *tmp1 = NULL; -if (tmp1 == NULL) - tmp1 = dyStringNew(0); +// Wrapper table for collapsible section: +puts("<TABLE>"); pushWarnHandler(ignoreEm); vcfParseGenotypes(rec); popWarnHandler(); // Tally genotypes and alleles for summary: int refs = 0, alts = 0, unks = 0; int refRefs = 0, refAlts = 0, altAlts = 0, gtUnk = 0, gtOther = 0, phasedGts = 0; int i; for (i = 0; i < vcff->genotypeCount; i++) { struct vcfGenotype *gt = &(rec->genotypes[i]); if (gt->isPhased) phasedGts++; if (gt->hapIxA == 0) refs++; else if (gt->hapIxA > 0) alts++; else unks++; if (!gt->isHaploid) { if (gt->hapIxB == 0) refs++; else if (gt->hapIxB > 0) alts++; else unks++; if (gt->hapIxA == 0 && gt->hapIxB == 0) refRefs++; else if (gt->hapIxA == 1 && gt->hapIxB == 1) altAlts++; else if ((gt->hapIxA == 1 && gt->hapIxB == 0) || (gt->hapIxA == 0 && gt->hapIxB == 1)) refAlts++; else if (gt->hapIxA < 0 || gt->hapIxB < 0) gtUnk++; else gtOther++; } } printf("<B>Genotype count:</B> %d", vcff->genotypeCount); if (differentString(seqName, "chrY")) printf(" (%d phased)", phasedGts); else printf(" (haploid)"); puts("<BR>"); int totalAlleles = refs + alts + unks; double refAf = (double)refs/totalAlleles; double altAf = (double)alts/totalAlleles; printf("<B>Alleles:</B> %s: %d (%.3f%%); %s: %d (%.3f%%)", - rec->alleles[0], refs, 100*refAf, rec->alleles[1], alts, 100*altAf); + displayAls[0], refs, 100*refAf, displayAls[1], alts, 100*altAf); if (unks > 0) printf("; unknown: %d (%.3f%%)", unks, 100 * (double)unks/totalAlleles); puts("<BR>"); // Should be a better way to detect haploid chromosomes than comparison with "chrY": if (vcff->genotypeCount > 1 && differentString(seqName, "chrY")) { printf("<B>Genotypes:</B> %s/%s: %d (%.3f%%); %s/%s: %d (%.3f%%); %s/%s: %d (%.3f%%)", - rec->alleles[0], rec->alleles[0], refRefs, 100*(double)refRefs/vcff->genotypeCount, - rec->alleles[0], rec->alleles[1], refAlts, 100*(double)refAlts/vcff->genotypeCount, - rec->alleles[1], rec->alleles[1], altAlts, 100*(double)altAlts/vcff->genotypeCount); + displayAls[0], displayAls[0], refRefs, 100*(double)refRefs/vcff->genotypeCount, + displayAls[0], displayAls[1], refAlts, 100*(double)refAlts/vcff->genotypeCount, + displayAls[1], displayAls[1], altAlts, 100*(double)altAlts/vcff->genotypeCount); if (gtUnk > 0) printf("; unknown: %d (%.3f%%)", gtUnk, 100*(double)gtUnk/vcff->genotypeCount); if (gtOther > 0) printf("; other: %d (%.3f%%)", gtOther, 100*(double)gtOther/vcff->genotypeCount); printf("<BR>\n"); if (rec->alleleCount == 2) printf("<B>Hardy-Weinberg equilibrium:</B> " "P(%s/%s) = %.3f%%; P(%s/%s) = %.3f%%; P(%s/%s) = %.3f%%<BR>", - rec->alleles[0], rec->alleles[0], 100*refAf*refAf, - rec->alleles[0], rec->alleles[1], 100*2*refAf*altAf, - rec->alleles[1], rec->alleles[1], 100*altAf*altAf); + displayAls[0], displayAls[0], 100*refAf*refAf, + displayAls[0], displayAls[1], 100*2*refAf*altAf, + displayAls[1], displayAls[1], 100*altAf*altAf); } -jsBeginCollapsibleSection(cart, track, "genotypes", "Detailed genotypes", FALSE); -dyStringClear(tmp1); -dyStringAppend(tmp1, rec->format); -enum vcfInfoType formatTypes[256]; -char *formatKeys[256]; -int formatCount = chopString(tmp1->string, ":", formatKeys, ArraySize(formatKeys)); -puts("<B>Genotype info key:</B><BR>"); -for (i = 0; i < formatCount; i++) - { - if (sameString(formatKeys[i], vcfGtGenotype)) - continue; - const struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, formatKeys[i]); - char *desc = def ? def->description : "<em>not described in VCF header</em>"; - printf(" <B>%s:</B> %s<BR>\n", formatKeys[i], desc); - formatTypes[i] = def->type; - } -hTableStart(); -puts("<TR><TH>Sample ID</TH><TH>Genotype</TH><TH>Phased?</TH>"); -for (i = 0; i < formatCount; i++) - { - if (sameString(formatKeys[i], vcfGtGenotype)) - continue; - printf("<TH>%s</TH>", formatKeys[i]); - } -puts("</TR>\n"); -for (i = 0; i < vcff->genotypeCount; i++) - { - struct vcfGenotype *gt = &(rec->genotypes[i]); - char *hapA = ".", *hapB = "."; - if (gt->hapIxA >= 0) - hapA = rec->alleles[(unsigned char)gt->hapIxA]; - if (gt->isHaploid) - hapB = ""; - else if (gt->hapIxB >= 0) - hapB = rec->alleles[(unsigned char)gt->hapIxB]; - char sep = gt->isHaploid ? ' ' : gt->isPhased ? '|' : '/'; - char *phasing = gt->isHaploid ? NA : gt->isPhased ? "Y" : "n"; - printf("<TR><TD>%s</TD><TD>%s%c%s</TD><TD>%s</TD>", vcff->genotypeIds[i], - hapA, sep, hapB, phasing); - int j; - for (j = 0; j < gt->infoCount; j++) - { - if (sameString(formatKeys[j], vcfGtGenotype)) - continue; - printf("<TD>"); - struct vcfInfoElement *el = &(gt->infoElements[j]); - int k; - for (k = 0; k < el->count; k++) - { - if (k > 0) - printf(", "); - if (el->missingData[k]) - printf("."); - else - vcfPrintDatum(stdout, el->values[k], formatTypes[j]); - } - printf("</TD>"); - } - puts("</TR>"); - } -hTableEnd(); -jsEndCollapsibleSection(); +vcfGenotypeTable(rec, track, displayAls); +puts("</TABLE>"); } static void pgSnpCodingDetail(struct vcfRecord *rec) /* Translate rec into pgSnp (with proper chrom name) and call Belinda's * coding effect predictor from pgSnp details. */ { char *genePredTable = "knownGene"; if (hTableExists(database, genePredTable)) { struct pgSnp *pgs = pgSnpFromVcfRecord(rec); if (!sameString(rec->chrom, seqName)) // rec->chrom might be missing "chr" prefix: pgs->chrom = seqName; printSeqCodDisplay(database, pgs, genePredTable); } } +static void abbreviateLongSeq(char *seqIn, int endLength, struct dyString *dy) +/* If seqIn is longer than 2*endLength plus abbreviation fudge, abbreviate it + * to its first endLength bases, ellipsis that says how many bases are skipped, + * and its last endLength bases; add result to dy. */ +{ +int threshold = 2*endLength + 30; +int seqInLen = strlen(seqIn); +if (seqInLen > threshold) + { + dyStringAppendN(dy, seqIn, endLength); + int skippedLen = seqInLen-2*endLength; + dyStringPrintf(dy, "...<%d bases>...%s", + skippedLen, seqIn+seqInLen-endLength); + } +else + dyStringAppend(dy, seqIn); +} + +static void makeDisplayAlleles(struct vcfRecord *rec, boolean showLeftBase, char leftBase, + int endLength, char **displayAls) +/* If necessary, show the left base that we trimmed and/or abbreviate long sequences. */ +{ +int i; +for (i = 0; i < rec->alleleCount; i++) + { + struct dyString *dy = dyStringNew(128); + if (showLeftBase) + dyStringPrintf(dy, "(%c)", leftBase); + abbreviateLongSeq(rec->alleles[i], endLength, dy); + displayAls[i] = dy->string; // leak some mem + } +} + static void vcfRecordDetails(struct trackDb *tdb, struct vcfRecord *rec) /* Display the contents of a single line of VCF, assumed to be from seqName * (using seqName instead of rec->chrom because rec->chrom might lack "chr"). */ { printf("<B>Name:</B> %s<BR>\n", rec->name); printCustomUrl(tdb, rec->name, TRUE); static char *formName = "vcfCfgHapCenter"; printf("<FORM NAME=\"%s\" ACTION=\"%s\">\n", formName, hgTracksName()); cartSaveSession(cart); vcfCfgHaplotypeCenter(cart, tdb, tdb->track, FALSE, rec->file, rec->name, seqName, rec->chromStart, formName); printf("</FORM>\n"); +char leftBase = rec->alleles[0][0]; +unsigned int vcfStart = vcfRecordTrimIndelLeftBase(rec); +boolean showLeftBase = (rec->chromStart == vcfStart+1); +char *displayAls[rec->alleleCount]; +makeDisplayAlleles(rec, showLeftBase, leftBase, 20, displayAls); printPosOnChrom(seqName, rec->chromStart, rec->chromEnd, NULL, FALSE, rec->name); -printf("<B>Reference allele:</B> %s<BR>\n", rec->alleles[0]); -vcfAltAlleleDetails(rec); +printf("<B>Reference allele:</B> %s<BR>\n", displayAls[0]); +vcfAltAlleleDetails(rec, displayAls); vcfQualDetails(rec); vcfFilterDetails(rec); vcfInfoDetails(rec); pgSnpCodingDetail(rec); -// Wrapper table for collapsible section: -puts("<TABLE>"); -vcfGenotypesDetails(rec, tdb->track); -puts("</TABLE>"); +makeDisplayAlleles(rec, showLeftBase, leftBase, 5, displayAls); +vcfGenotypesDetails(rec, tdb->track, displayAls); } void doVcfTabixDetails(struct trackDb *tdb, char *item) /* Show details of an alignment from a VCF file compressed and indexed by tabix. */ { #if (defined USE_TABIX && defined KNETFILE_HOOKS) knetUdcInstall(); if (udcCacheTimeout() < 300) udcSetCacheTimeout(300); #endif//def USE_TABIX && KNETFILE_HOOKS int start = cartInt(cart, "o"); int end = cartInt(cart, "t"); struct sqlConnection *conn = hAllocConnTrack(database, tdb); // TODO: will need to handle per-chrom files like bam, maybe fold bamFileNameFromTable into this:: char *fileOrUrl = bbiNameFromSettingOrTableChrom(tdb, conn, tdb->table, seqName); hFreeConn(&conn); int vcfMaxErr = -1; struct vcfFile *vcff = NULL; /* protect against temporary network error */ struct errCatch *errCatch = errCatchNew(); if (errCatchStart(errCatch)) { vcff = vcfTabixFileMayOpen(fileOrUrl, seqName, start, end, vcfMaxErr, -1); } errCatchEnd(errCatch); if (errCatch->gotError) { if (isNotEmpty(errCatch->message->string)) warn("%s", errCatch->message->string); } errCatchFree(&errCatch); if (vcff != NULL) { struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) if (rec->chromStart == start && rec->chromEnd == end) // in pgSnp mode, don't get name vcfRecordDetails(tdb, rec); } else printf("Sorry, unable to open %s<BR>\n", fileOrUrl); } #endif // no USE_TABIX