c693f56c78b1d5a56c53bf7473ff850446c4f1f4 angie Mon Oct 17 15:47:46 2011 -0700 Feature #3710 (vcfTabix UI options): added filtering on minor allelefrequency (actually, 1 - major allele frequency), if INFO column contains the AF tag, or AC and AN tags. Also, in vcfTrack.c, fixed a bug in multi-allele abbreviation code in vcfFileToPgSnp. diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index 14ab5c6..dd49567 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -7,31 +7,31 @@ #include "hacTree.h" #include "hdb.h" #include "hgTracks.h" #include "pgSnp.h" #include "trashDir.h" #include "vcf.h" #include "vcfUi.h" #if (defined USE_TABIX && defined KNETFILE_HOOKS) #include "knetUdc.h" #include "udc.h" #endif//def USE_TABIX && KNETFILE_HOOKS #ifdef USE_TABIX static boolean getMinQual(struct trackDb *tdb, double *retMinQual, boolean compositeLevel) -/* Return TRUE and set retMinQual id cart contains minimum QUAL filter */ +/* Return TRUE and set retMinQual if cart contains minimum QUAL filter */ { char cartVar[512]; safef(cartVar, sizeof(cartVar), "%s."VCF_APPLY_MIN_QUAL_VAR, tdb->track); if (cartUsualBooleanClosestToHome(cart, tdb, compositeLevel, VCF_APPLY_MIN_QUAL_VAR, VCF_DEFAULT_APPLY_MIN_QUAL)) { if (retMinQual != NULL) *retMinQual = cartUsualDoubleClosestToHome(cart, tdb, compositeLevel, VCF_MIN_QUAL_VAR, VCF_DEFAULT_MIN_QUAL); return TRUE; } return FALSE; } static boolean minQualFail(struct vcfRecord *record, double minQual) @@ -58,86 +58,165 @@ return TRUE; } return FALSE; } static boolean filterColumnFail(struct vcfRecord *record, struct slName *filterValues) /* Return TRUE if record's FILTER column value(s) matches one of filterValues (from cart). */ { int i; for (i = 0; i < record->filterCount; i++) if (slNameInList(filterValues, record->filters[i])) return TRUE; return FALSE; } +static boolean getMinFreq(struct trackDb *tdb, double *retMinFreq, boolean compositeLevel) +/* Return TRUE and set retMinFreq if cart contains nonzero minimum minor allele frequency. */ +{ +char cartVar[512]; +//#*** is there an ExistsClosestToHome? +safef(cartVar, sizeof(cartVar), "%s."VCF_MIN_ALLELE_FREQ_VAR, tdb->track); +if (cartVarExists(cart, cartVar)) + { + double minFreq = cartUsualDoubleClosestToHome(cart, tdb, compositeLevel, + VCF_MIN_ALLELE_FREQ_VAR, VCF_DEFAULT_MIN_ALLELE_FREQ); + if (minFreq > 0) + { + if (retMinFreq != NULL) + *retMinFreq = minFreq; + return TRUE; + } + } +return FALSE; +} + +static boolean minFreqFail(struct vcfRecord *record, double minFreq) +/* Return TRUE if record's INFO include AF (alternate allele frequencies) or AC+AN + * (alternate allele counts and total count of observed alleles) and the minor allele + * frequency < minFreq -- or rather, major allele frequency > (1 - minFreq) because + * variants with > 2 alleles might have some significant minor frequencies along with + * tiny minor frequencies). */ +{ +struct vcfFile *vcff = record->file; +boolean gotInfo = FALSE; +double refFreq = 1.0; +double maxAltFreq = 0.0; +int i; +const struct vcfInfoElement *afEl = vcfRecordFindInfo(record, "AF"); +const struct vcfInfoDef *afDef = vcfInfoDefForKey(vcff, "AF"); +if (afEl != NULL && afDef != NULL && afDef->type == vcfInfoFloat) + { + // If INFO includes alt allele freqs, use them directly. + gotInfo = TRUE; + for (i = 0; i < afEl->count; i++) + { + double altFreq = afEl->values[i].datFloat; + refFreq -= altFreq; + if (altFreq > maxAltFreq) + maxAltFreq = altFreq; + } + } +else + { + // Calculate alternate allele freqs from AC and AN: + const struct vcfInfoElement *acEl = vcfRecordFindInfo(record, "AC"); + const struct vcfInfoDef *acDef = vcfInfoDefForKey(vcff, "AC"); + const struct vcfInfoElement *anEl = vcfRecordFindInfo(record, "AN"); + const struct vcfInfoDef *anDef = vcfInfoDefForKey(vcff, "AN"); + if (acEl != NULL && acDef != NULL && acDef->type == vcfInfoInteger && + anEl != NULL && anDef != NULL && anDef->type == vcfInfoInteger && anEl->count == 1) + { + gotInfo = TRUE; + int totalCount = anEl->values[0].datFloat; + for (i = 0; i < acEl->count; i++) + { + int altCount = acEl->values[i].datFloat; + double altFreq = (double)altCount / totalCount; + refFreq -= altFreq; + if (altFreq < maxAltFreq) + maxAltFreq = altFreq; + } + } + } +if (gotInfo) + { + double majorAlFreq = max(refFreq, maxAltFreq); + if (majorAlFreq > (1.0 - minFreq)) + return TRUE; + } +return FALSE; +} + static void filterRecords(struct vcfFile *vcff, struct trackDb *tdb) /* If a filter is specified in the cart, remove any records that don't pass filter. */ { boolean compositeLevel = isNameAtCompositeLevel(tdb, tdb->track); -double minQual = 0; +double minQual = VCF_DEFAULT_MIN_QUAL; struct slName *filterValues = NULL; +double minFreq = VCF_DEFAULT_MIN_ALLELE_FREQ; boolean gotQualFilter = getMinQual(tdb, &minQual, compositeLevel); boolean gotFilterFilter = getFilterValues(tdb, &filterValues, compositeLevel); -if (! (gotQualFilter || gotFilterFilter) ) +boolean gotMinFreqFilter = getMinFreq(tdb, &minFreq, compositeLevel); +if (! (gotQualFilter || gotFilterFilter || gotMinFreqFilter) ) return; struct vcfRecord *rec, *nextRec, *newList = NULL; for (rec = vcff->records; rec != NULL; rec = nextRec) { nextRec = rec->next; if (! ((gotQualFilter && minQualFail(rec, minQual)) || - (gotFilterFilter && filterColumnFail(rec, filterValues))) ) + (gotFilterFilter && filterColumnFail(rec, filterValues)) || + (gotMinFreqFilter && minFreqFail(rec, minFreq)) )) slAddHead(&newList, rec); } slReverse(&newList); vcff->records = newList; } static struct pgSnp *vcfFileToPgSnp(struct vcfFile *vcff, struct trackDb *tdb) /* Convert vcff's records to pgSnp; don't free vcff until you're done with pgSnp * because it contains pointers into vcff's records' chrom. */ { struct pgSnp *pgsList = NULL; struct vcfRecord *rec; int maxLen = 33; +int maxAlCount = 5; for (rec = vcff->records; rec != NULL; rec = rec->next) { struct pgSnp *pgs = pgSnpFromVcfRecord(rec); // Insertion sequences can be quite long; abbreviate here for display. int len = strlen(pgs->name); if (len > maxLen) { - if (strchr(pgs->name, '/') != NULL) - { - char *copy = cloneString(pgs->name); - char *allele[8]; - int cnt = chopByChar(copy, '/', allele, pgs->alleleCount); - int maxAlLen = maxLen / pgs->alleleCount; + int maxAlLen = maxLen / min(rec->alleleCount, maxAlCount); pgs->name[0] = '\0'; int i; - for (i = 0; i < cnt; i++) + for (i = 0; i < rec->alleleCount; i++) { if (i > 0) safencat(pgs->name, len+1, "/", 1); - if (strlen(allele[i]) > maxAlLen-3) - strcpy(allele[i]+maxAlLen-3, "..."); - safencat(pgs->name, len+1, allele[i], maxAlLen); + if (i >= maxAlCount) + { + safecat(pgs->name, len+1, "..."); + pgs->alleleCount = maxAlCount; + break; } + if (strlen(rec->alleles[i]) > maxAlLen-3) + strcpy(rec->alleles[i]+maxAlLen-3, "..."); + safencat(pgs->name, len+1, rec->alleles[i], maxAlLen); } - else - strcpy(pgs->name+maxLen-3, "..."); } slAddHead(&pgsList, pgs); } slReverse(&pgsList); return pgsList; } // Center-weighted alpha clustering of haplotypes -- see Redmine #3711, #2823 note 7 // It might be nice to use an allele-frequency representation here instead of [ACGTN] strings // with "N" for missing info or differences, but keep it simple. struct cwaExtraData /* Helper data for hacTree clustering of haplotypes by center-weighted alpha distance */ {