66c9381654739f32b29ad55e3f1188a138b12b66 angie Mon Jul 23 08:44:19 2012 -0700 Track #7964 (1000 Genomes Phase 1): added recently-released chrY.Fixed bugs with display of haploid genotype data. 1000 Genomes' chrY has tons of missing calls, and it turns out that clustering results are much better if missing info is completely omitted from distance measure -- although the tree calls things identical that clearly aren't. diff --git src/hg/hgc/vcfClick.c src/hg/hgc/vcfClick.c index 8a8eb68..6ee70bc 100644 --- src/hg/hgc/vcfClick.c +++ src/hg/hgc/vcfClick.c @@ -123,69 +123,88 @@ } static void vcfGenotypesDetails(struct vcfRecord *rec, char *track) /* Print genotypes in some kind of table... */ { struct vcfFile *vcff = rec->file; if (vcff->genotypeCount == 0) return; static struct dyString *tmp1 = NULL; if (tmp1 == NULL) tmp1 = dyStringNew(0); pushWarnHandler(ignoreEm); vcfParseGenotypes(rec); popWarnHandler(); // Tally genotypes and alleles for summary: -int refs = 0, alts = 0, refRefs = 0, refAlts = 0, altAlts = 0, gtOther = 0, phasedGts = 0; +int refs = 0, alts = 0, unks = 0; +int refRefs = 0, refAlts = 0, altAlts = 0, gtUnk = 0, gtOther = 0, phasedGts = 0; int i; for (i = 0; i < vcff->genotypeCount; i++) { struct vcfGenotype *gt = &(rec->genotypes[i]); if (gt->isPhased) phasedGts++; if (gt->hapIxA == 0) refs++; else if (gt->hapIxA > 0) alts++; + else + unks++; if (!gt->isHaploid) { if (gt->hapIxB == 0) refs++; else if (gt->hapIxB > 0) alts++; + else + unks++; if (gt->hapIxA == 0 && gt->hapIxB == 0) refRefs++; else if (gt->hapIxA == 1 && gt->hapIxB == 1) altAlts++; else if ((gt->hapIxA == 1 && gt->hapIxB == 0) || (gt->hapIxA == 0 && gt->hapIxB == 1)) refAlts++; + else if (gt->hapIxA < 0 || gt->hapIxB < 0) + gtUnk++; else gtOther++; } } -printf("<B>Genotype count:</B> %d (%d phased)<BR>\n", vcff->genotypeCount, phasedGts); -double refAf = (double)refs/(2*vcff->genotypeCount); -double altAf = (double)alts/(2*vcff->genotypeCount); -printf("<B>Alleles:</B> %s: %d (%.3f%%); %s: %d (%.3f%%)<BR>\n", +printf("<B>Genotype count:</B> %d", vcff->genotypeCount); +if (differentString(seqName, "chrY")) + printf(" (%d phased)", phasedGts); +else + printf(" (haploid)"); +puts("<BR>"); +int totalAlleles = refs + alts + unks; +double refAf = (double)refs/totalAlleles; +double altAf = (double)alts/totalAlleles; +printf("<B>Alleles:</B> %s: %d (%.3f%%); %s: %d (%.3f%%)", rec->alleles[0], refs, 100*refAf, rec->alleles[1], alts, 100*altAf); -if (vcff->genotypeCount > 1) +if (unks > 0) + printf("; unknown: %d (%.3f%%)", unks, 100 * (double)unks/totalAlleles); +puts("<BR>"); +// Should be a better way to detect haploid chromosomes than comparison with "chrY": +if (vcff->genotypeCount > 1 && differentString(seqName, "chrY")) { printf("<B>Genotypes:</B> %s/%s: %d (%.3f%%); %s/%s: %d (%.3f%%); %s/%s: %d (%.3f%%)", rec->alleles[0], rec->alleles[0], refRefs, 100*(double)refRefs/vcff->genotypeCount, rec->alleles[0], rec->alleles[1], refAlts, 100*(double)refAlts/vcff->genotypeCount, rec->alleles[1], rec->alleles[1], altAlts, 100*(double)altAlts/vcff->genotypeCount); + if (gtUnk > 0) + printf("; unknown: %d (%.3f%%)", gtUnk, 100*(double)gtUnk/vcff->genotypeCount); if (gtOther > 0) printf("; other: %d (%.3f%%)", gtOther, 100*(double)gtOther/vcff->genotypeCount); printf("<BR>\n"); if (rec->alleleCount == 2) printf("<B>Hardy-Weinberg equilibrium:</B> " "P(%s/%s) = %.3f%%; P(%s/%s) = %.3f%%; P(%s/%s) = %.3f%%<BR>", rec->alleles[0], rec->alleles[0], 100*refAf*refAf, rec->alleles[0], rec->alleles[1], 100*2*refAf*altAf, rec->alleles[1], rec->alleles[1], 100*altAf*altAf); } jsBeginCollapsibleSection(cart, track, "genotypes", "Detailed genotypes", FALSE); dyStringClear(tmp1); dyStringAppend(tmp1, rec->format); enum vcfInfoType formatTypes[256]; char *formatKeys[256]; @@ -204,34 +223,34 @@ puts("<TR><TH>Sample ID</TH><TH>Genotype</TH><TH>Phased?</TH>"); for (i = 0; i < formatCount; i++) { if (sameString(formatKeys[i], vcfGtGenotype)) continue; printf("<TH>%s</TH>", formatKeys[i]); } puts("</TR>\n"); for (i = 0; i < vcff->genotypeCount; i++) { struct vcfGenotype *gt = &(rec->genotypes[i]); char *hapA = ".", *hapB = "."; if (gt->hapIxA >= 0) hapA = rec->alleles[(unsigned char)gt->hapIxA]; if (gt->isHaploid) - hapB = NA; + hapB = ""; else if (gt->hapIxB >= 0) hapB = rec->alleles[(unsigned char)gt->hapIxB]; - char sep = gt->isPhased ? '|' : '/'; + char sep = gt->isHaploid ? ' ' : gt->isPhased ? '|' : '/'; char *phasing = gt->isHaploid ? NA : gt->isPhased ? "Y" : "n"; printf("<TR><TD>%s</TD><TD>%s%c%s</TD><TD>%s</TD>", vcff->genotypeIds[i], hapA, sep, hapB, phasing); int j; for (j = 0; j < gt->infoCount; j++) { if (sameString(formatKeys[j], vcfGtGenotype)) continue; printf("<TD>"); struct vcfInfoElement *el = &(gt->infoElements[j]); int k; for (k = 0; k < el->count; k++) { if (k > 0) printf(", ");