519e0946826199d1d9792fa8df5972843fce021c angie Tue Aug 9 14:39:38 2011 -0700 Feature #2821 (VCF parser): improved representation of alleles:parse ref and comma-sep'd alt allele string into count and array inside record, so callers don't all have to parse the comma-sep'd alternate allele string. diff --git src/hg/hgc/vcfClick.c src/hg/hgc/vcfClick.c index 1ad1a7c..a26df9d 100644 --- src/hg/hgc/vcfClick.c +++ src/hg/hgc/vcfClick.c @@ -29,51 +29,41 @@ int i; for (i = 0; i < wordCount; i++) { if (i > 0) printf(", "); char *key = words[i]; const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key); if (def != NULL) printf("%s (%s)", htmlEncode(key), def->description); else printf("%s", htmlEncode(key)); } printf("
\n"); } -static void printListWithDescriptions(struct vcfFile *vcff, char *str, char *sep, struct vcfInfoDef *infoDefs) -/* Given a VCF field, its separator char and a list of vcfInfoDefs, print out a list - * of values with descriptions if descriptions are available. */ -{ -char *copy = cloneString(str); -char *words[256]; -int wordCount = chopString(copy, sep, words, ArraySize(words)); -printKeysWithDescriptions(vcff, wordCount, words, infoDefs); -} - static void vcfAltAlleleDetails(struct vcfRecord *rec) /* If VCF header specifies any symbolic alternate alleles, pull in descriptions. */ { printf("Alternate allele(s): "); -if (sameString(rec->alt, ".")) +if (rec->alleleCount < 2 || sameString(rec->alleles[1], ".")) { printf(NA"
\n"); return; } struct vcfFile *vcff = rec->file; -printListWithDescriptions(vcff, rec->alt, ",", vcff->altDefs); +printKeysWithDescriptions(vcff, rec->alleleCount-1, &(rec->alleles[1]), vcff->altDefs); } static void vcfQualDetails(struct vcfRecord *rec) /* If VCF header specifies a quality/confidence score (not "."), print it out. */ { printf("Quality/confidence score: %s
\n", sameString(rec->qual, ".") ? NA : rec->qual); } static void vcfFilterDetails(struct vcfRecord *rec) /* If VCF header specifies any filters, pull in descriptions. */ { if (rec->filterCount == 0 || sameString(rec->filters[0], ".")) printf("Filter: "NA"
\n"); else if (rec->filterCount == 1 && sameString(rec->filters[0], "PASS")) printf("Filter: PASS
\n"); @@ -111,125 +101,113 @@ for (j = 0; j < el->count; j++) { if (j > 0) printf(", "); vcfPrintDatum(stdout, el->values[j], type); } if (def != NULL) printf(" %s", def->description); else printf(""); printf("\n"); } puts(""); } -static char *hapFromIx(char *ref, char *altAlleles[], unsigned char altAlCount, unsigned char hapIx) -/* Look up the allele specified by hapIx: 0 = ref, 1 & up = offset index into altAlleles */ -{ -if (hapIx == 0) - return ref; -else if (hapIx-1 < altAlCount) - return altAlleles[hapIx-1]; -else - errAbort("hapFromIx: index %d is out of range (%d alleles specified)", hapIx, altAlCount+1); -return NULL; -} - static void vcfGenotypesDetails(struct vcfRecord *rec, char *track) /* Print genotypes in some kind of table... */ { struct vcfFile *vcff = rec->file; if (vcff->genotypeCount == 0) return; -static struct dyString *tmp1 = NULL, *tmp2 = NULL; +static struct dyString *tmp1 = NULL; if (tmp1 == NULL) - { tmp1 = dyStringNew(0); - tmp2 = dyStringNew(0); - } vcfParseGenotypes(rec); // Tally genotypes and alleles for summary: -// Note: this lumps all alternate alleles together. -int refs = 0, alts = 0, refRefs = 0, refAlts = 0, altAlts = 0, phasedGts = 0; +int refs = 0, alts = 0, refRefs = 0, refAlts = 0, altAlts = 0, gtOther = 0, phasedGts = 0; int i; for (i = 0; i < vcff->genotypeCount; i++) { struct vcfGenotype *gt = &(rec->genotypes[i]); if (gt->isPhased) phasedGts++; if (gt->hapIxA == 0) refs++; else alts++; if (!gt->isHaploid) { if (gt->hapIxB == 0) refs++; else alts++; if (gt->hapIxA == 0 && gt->hapIxB == 0) refRefs++; - else if (gt->hapIxA != 0 && gt->hapIxB != 0) + else if (gt->hapIxA == 1 && gt->hapIxB == 1) altAlts++; - else + else if ((gt->hapIxA == 1 && gt->hapIxB == 0) || + (gt->hapIxA == 0 && gt->hapIxB == 1)) refAlts++; + else + gtOther++; } } printf("Genotype count: %d (%d phased)
\n", vcff->genotypeCount, phasedGts); printf("Alleles: %s: %d (%.2f); %s: %d (%.2f)
\n", - rec->ref, refs, (double)refs/(2*vcff->genotypeCount), - rec->alt, alts, (double)alts/(2*vcff->genotypeCount)); + rec->alleles[0], refs, (double)refs/(2*vcff->genotypeCount), + rec->alleles[1], alts, (double)alts/(2*vcff->genotypeCount)); if (vcff->genotypeCount > 1) - printf("Genotypes: %s/%s: %d (%.2f); %s/%s: %d (%.2f); %s/%s: %d (%.2f)
\n", - rec->ref, rec->ref, refRefs, (double)refRefs/vcff->genotypeCount, - rec->ref, rec->alt, refAlts, (double)refAlts/vcff->genotypeCount, - rec->alt, rec->alt, altAlts, (double)altAlts/vcff->genotypeCount); + { + printf("Genotypes: %s/%s: %d (%.2f); %s/%s: %d (%.2f); %s/%s: %d (%.2f)", + rec->alleles[0], rec->alleles[0], refRefs, (double)refRefs/vcff->genotypeCount, + rec->alleles[0], rec->alleles[1], refAlts, (double)refAlts/vcff->genotypeCount, + rec->alleles[1], rec->alleles[1], altAlts, (double)altAlts/vcff->genotypeCount); + if (gtOther > 0) + printf("; other: %d (%.2f)", gtOther, (double)gtOther/vcff->genotypeCount); + printf("
\n"); + } jsBeginCollapsibleSection(cart, track, "genotypes", "Detailed genotypes", FALSE); dyStringClear(tmp1); dyStringAppend(tmp1, rec->format); enum vcfInfoType formatTypes[256]; char *formatKeys[256]; int formatCount = chopString(tmp1->string, ":", formatKeys, ArraySize(formatKeys)); puts("Genotype info key:
"); for (i = 0; i < formatCount; i++) { if (sameString(formatKeys[i], vcfGtGenotype)) continue; const struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, formatKeys[i]); char *desc = def ? def->description : "not described in VCF header"; printf("  %s: %s
\n", formatKeys[i], desc); formatTypes[i] = def->type; } hTableStart(); puts("Sample IDGenotypePhased?"); for (i = 0; i < formatCount; i++) { if (sameString(formatKeys[i], vcfGtGenotype)) continue; printf("%s", formatKeys[i]); } puts("\n"); -dyStringClear(tmp2); -dyStringAppend(tmp2, rec->alt); -char *altAlleles[256]; -unsigned char altCount = chopCommas(tmp2->string, altAlleles); for (i = 0; i < vcff->genotypeCount; i++) { struct vcfGenotype *gt = &(rec->genotypes[i]); - char *hapA = hapFromIx(rec->ref, altAlleles, altCount, gt->hapIxA); - char *hapB = gt->isHaploid ? NA : hapFromIx(rec->ref, altAlleles, altCount, gt->hapIxB); + char *hapA = rec->alleles[gt->hapIxA]; + char *hapB = gt->isHaploid ? NA : rec->alleles[gt->hapIxB]; char sep = gt->isPhased ? '|' : '/'; char *phasing = gt->isHaploid ? NA : gt->isPhased ? "Y" : "n"; printf("%s%s%c%s%s", vcff->genotypeIds[i], hapA, sep, hapB, phasing); int j; for (j = 0; j < formatCount; j++) { if (sameString(formatKeys[j], vcfGtGenotype)) continue; printf(""); struct vcfInfoElement *el = &(gt->infoElements[j]); int k; for (k = 0; k < el->count; k++) { if (k > 0) @@ -257,31 +235,31 @@ printSeqCodDisplay(database, pgs); } } static void vcfRecordDetails(struct trackDb *tdb, struct vcfRecord *rec) /* Display the contents of a single line of VCF, assumed to be from seqName * (using seqName instead of rec->chrom because rec->chrom might lack "chr"). */ { printf("Name: %s
\n", rec->name); printCustomUrl(tdb, rec->name, TRUE); static char *formName = "vcfCfgHapCenter"; printf("
\n", formName, hgTracksName()); vcfCfgHaplotypeCenter(cart, tdb, rec->file, rec->name, seqName, rec->chromStart, formName); printf("
\n"); printPosOnChrom(seqName, rec->chromStart, rec->chromEnd, NULL, FALSE, rec->name); -printf("Reference allele: %s
\n", rec->ref); +printf("Reference allele: %s
\n", rec->alleles[0]); vcfAltAlleleDetails(rec); vcfQualDetails(rec); vcfFilterDetails(rec); vcfInfoDetails(rec); pgSnpCodingDetail(rec); // Wrapper table for collapsible section: puts(""); vcfGenotypesDetails(rec, tdb->track); puts("
"); } void doVcfTabixDetails(struct trackDb *tdb, char *item) /* Show details of an alignment from a VCF file compressed and indexed by tabix. */ { #if (defined USE_TABIX && defined KNETFILE_HOOKS)