dacecbaf00b2193872633dde175c53d263922ff9 angie Wed May 11 16:18:38 2011 -0700 Feature #2823 (VCF track handler): request from alpha-test user CarlosBorroto: call Belinda's coding variant functional prediction function as on the pgSnp details page. diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index 02c2a01..2c6994d 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -9,123 +9,39 @@ #include "hgTracks.h" #include "pgSnp.h" #include "trashDir.h" #include "vcf.h" #if (defined USE_TABIX && defined KNETFILE_HOOKS) #include "knetUdc.h" #include "udc.h" #endif//def USE_TABIX && KNETFILE_HOOKS #ifdef USE_TABIX //#*** TODO: use trackDb/cart setting or something static boolean doHapClusterDisplay = TRUE; static boolean colorHapByRefAlt = TRUE; -#define VCF_MAX_ALLELE_LEN 80 - static struct pgSnp *vcfFileToPgSnp(struct vcfFile *vcff) /* Convert vcff's records to pgSnp; don't free vcff until you're done with pgSnp * because it contains pointers into vcff's records' chrom. */ { struct pgSnp *pgsList = NULL; struct vcfRecord *rec; -struct dyString *dy = dyStringNew(0); for (rec = vcff->records; rec != NULL; rec = rec->next) { - struct pgSnp *pgs; - AllocVar(pgs); - pgs->chrom = rec->chrom; - pgs->chromStart = rec->chromStart; - pgs->chromEnd = rec->chromEnd; - // Build up slash-separated allele string from rec->ref + rec->alt: - dyStringClear(dy); - dyStringAppend(dy, rec->ref); - int altCount, i; - if (sameString(rec->alt, ".")) - altCount = 0; - else - { - char *words[64]; // we're going to truncate anyway if there are this many alleles! - char copy[VCF_MAX_ALLELE_LEN+1]; - strncpy(copy, rec->alt, VCF_MAX_ALLELE_LEN); - copy[VCF_MAX_ALLELE_LEN] = '\0'; - altCount = chopCommas(copy, words); - for (i = 0; i < altCount && dy->stringSize < VCF_MAX_ALLELE_LEN; i++) - dyStringPrintf(dy, "/%s", words[i]); - if (i < altCount) - altCount = i; - } - pgs->name = cloneStringZ(dy->string, dy->stringSize+1); - pgs->alleleCount = altCount + 1; - // Build up comma-sep list of per-allele counts, if available: - dyStringClear(dy); - int refAlleleCount = 0; - boolean gotAltCounts = FALSE; - for (i = 0; i < rec->infoCount; i++) - if (sameString(rec->infoElements[i].key, "AN")) - { - refAlleleCount = rec->infoElements[i].values[0].datInt; - break; - } - for (i = 0; i < rec->infoCount; i++) - if (sameString(rec->infoElements[i].key, "AC")) - { - int alCounts[64]; - int j; - gotAltCounts = (rec->infoElements[i].count > 0); - for (j = 0; j < rec->infoElements[i].count; j++) - { - int ac = rec->infoElements[i].values[j].datInt; - if (j < altCount) - alCounts[1+j] = ac; - refAlleleCount -= ac; - } - if (gotAltCounts) - { - while (j++ < altCount) - alCounts[1+j] = -1; - alCounts[0] = refAlleleCount; - if (refAlleleCount >= 0) - dyStringPrintf(dy, "%d", refAlleleCount); - else - dyStringAppend(dy, "-1"); - for (j = 0; j < altCount; j++) - if (alCounts[1+j] >= 0) - dyStringPrintf(dy, ",%d", alCounts[1+j]); - else - dyStringAppend(dy, ",-1"); - } - break; - } - if (refAlleleCount > 0 && !gotAltCounts) - dyStringPrintf(dy, "%d", refAlleleCount); - pgs->alleleFreq = cloneStringZ(dy->string, dy->stringSize+1); - // Build up comma-sep list... supposed to be per-allele quality scores but I think - // the VCF spec only gives us one BQ... for the reference position? should ask. - dyStringClear(dy); - for (i = 0; i < rec->infoCount; i++) - if (sameString(rec->infoElements[i].key, "BQ")) - { - float qual = rec->infoElements[i].values[0].datFloat; - dyStringPrintf(dy, "%.1f", qual); - int j; - for (j = 0; j < altCount; j++) - dyStringPrintf(dy, ",%.1f", qual); - break; - } - pgs->alleleScores = cloneStringZ(dy->string, dy->stringSize+1); + struct pgSnp *pgs = pgSnpFromVcfRecord(rec); slAddHead(&pgsList, pgs); } slReverse(&pgsList); return pgsList; } // Center-weighted alpha clustering of haplotypes -- see Redmine #3711, #2823 note 7 // It might be nice to use an allele-frequency representation here instead of [ACGTN] strings // with "N" for missing info or differences, but keep it simple. struct cwaExtraData /* Helper data for hacTree clustering of haplotypes by center-weighted alpha distance */ { int center; // index from which each point's contribution to distance is to be weighted @@ -144,47 +60,55 @@ struct hapCluster *next; // hacTree wants slList of items unsigned short *refCounts; // per-variant count of reference alleles observed unsigned short *unkCounts; // per-variant count of unknown (or unphased het) alleles unsigned short leafCount; // number of leaves under this node (or 1 if leaf) unsigned short gtHapIx; // if leaf, (genotype index << 1) + hapIx (0 or 1 for diploid) }; INLINE boolean isRef(const struct hapCluster *c, int varIx) // Return TRUE if the leaves of cluster have at least as many reference alleles // as alternate alleles for variant varIx. { unsigned short altCount = c->leafCount - c->refCounts[varIx] - c->unkCounts[varIx]; return (c->refCounts[varIx] >= altCount); } +INLINE boolean hasUnk(const struct hapCluster *c, int varIx) +// Return TRUE if at least one haplotype in this cluster has an unknown/unphased value at varIx. +{ +return (c->unkCounts[varIx] > 0); +} + static double cwaDistance(const struct slList *item1, const struct slList *item2, void *extraData) /* Center-weighted alpha sequence distance function for hacTree clustering of haplotype seqs */ // This is inner-loop so I am not doing defensive checks. Caller must ensure: // 1. kids's sequences' lengths are both equal to helper->len // 2. 0 <= helper->center <= len // 3. 0.0 < helper->alpha <= 1.0 { const struct hapCluster *kid1 = (const struct hapCluster *)item1; const struct hapCluster *kid2 = (const struct hapCluster *)item2; struct cwaExtraData *helper = extraData; double distance = 0; double weight = 1; // start at center: alpha to the 0th power int i; for (i=helper->center; i >= 0; i--) { if (isRef(kid1, i) != isRef(kid2, i)) distance += weight; + else if (hasUnk(kid1, i) != hasUnk(kid2, i)) + distance += weight/2; weight *= helper->alpha; } weight = helper->alpha; // start at center+1: alpha to the 1st power for (i=helper->center+1; i < helper->len; i++) { if (isRef(kid1, i) != isRef(kid2, i)) distance += weight; weight *= helper->alpha; } return distance; } static struct hapCluster *lmHapCluster(struct cwaExtraData *helper) /* Use localMem to allocate a new cluster of the given len. */ {