519e0946826199d1d9792fa8df5972843fce021c angie Tue Aug 9 14:39:38 2011 -0700 Feature #2821 (VCF parser): improved representation of alleles:parse ref and comma-sep'd alt allele string into count and array inside record, so callers don't all have to parse the comma-sep'd alternate allele string. diff --git src/hg/lib/pgSnp.c src/hg/lib/pgSnp.c index 40b44e6..bdb716e 100644 --- src/hg/lib/pgSnp.c +++ src/hg/lib/pgSnp.c @@ -678,108 +678,121 @@ item->alleleFreq = cloneString(row[5]); char pattern[128]; safef(pattern, sizeof(pattern), "^[0-9]+(,[0-9]+){%d}$", item->alleleCount - 1); if (! regexMatchNoCase(row[5], pattern)) lineFileAbort(lf, "invalid allele frequency, %s with count of %d", row[5], item->alleleCount); /* scores, comma separated list of numbers with above # of items */ item->alleleScores = cloneString(row[6]); safef(pattern, sizeof(pattern), "^[0-9.]+(,[0-9.]+){%d}$", item->alleleCount - 1); if (! regexMatchNoCase(row[6], pattern)) lineFileAbort(lf, "invalid allele scores, %s with count of %d", row[6], item->alleleCount); return item; } #define VCF_MAX_ALLELE_LEN 80 -struct pgSnp *pgSnpFromVcfRecord(struct vcfRecord *rec) -/* Convert VCF rec to pgSnp; don't free rec->file (vcfFile) until - * you're done with pgSnp because pgSnp points to rec->chrom. */ +static char *alleleCountsFromVcfRecord(struct vcfRecord *rec, int alDescCount) +/* Build up comma-sep list of per-allele counts, if available, up to alDescCount + * which may be less than rec->alleleCount: */ { static struct dyString *dy = NULL; if (dy == NULL) dy = dyStringNew(0); else dyStringClear(dy); -struct pgSnp *pgs; -AllocVar(pgs); -pgs->chrom = rec->chrom; -pgs->chromStart = rec->chromStart; -pgs->chromEnd = rec->chromEnd; -// Build up slash-separated allele string from rec->ref + rec->alt: -dyStringAppend(dy, rec->ref); -int altCount, i; -if (sameString(rec->alt, ".")) - altCount = 0; -else - { - char *words[VCF_MAX_ALLELE_LEN/2]; - char copy[VCF_MAX_ALLELE_LEN+1]; - strncpy(copy, rec->alt, VCF_MAX_ALLELE_LEN); - copy[VCF_MAX_ALLELE_LEN] = '\0'; - altCount = chopCommas(copy, words); - for (i = 0; i < altCount && dy->stringSize < VCF_MAX_ALLELE_LEN; i++) - dyStringPrintf(dy, "/%s", words[i]); - if (i < altCount) - altCount = i; - } -pgs->name = cloneStringZ(dy->string, dy->stringSize+1); -pgs->alleleCount = altCount + 1; -// Build up comma-sep list of per-allele counts, if available: dyStringClear(dy); -int refAlleleCount = 0; -boolean gotAltCounts = FALSE; +int alCounts[VCF_MAX_ALLELE_LEN]; +boolean gotTotalCount = FALSE, gotAltCounts = FALSE; +int i; for (i = 0; i < rec->infoCount; i++) if (sameString(rec->infoElements[i].key, "AN")) { - refAlleleCount = rec->infoElements[i].values[0].datInt; + gotTotalCount = TRUE; + // Set ref allele to total count, subtract alt counts below. + alCounts[0] = rec->infoElements[i].values[0].datInt; break; } for (i = 0; i < rec->infoCount; i++) if (sameString(rec->infoElements[i].key, "AC")) { - int alCounts[64]; + if (rec->infoElements[i].count > 0) + { + gotAltCounts = TRUE; int j; - gotAltCounts = (rec->infoElements[i].count > 0); - for (j = 0; j < rec->infoElements[i].count; j++) + for (j = 0; j < rec->infoElements[i].count && j < alDescCount-1; j++) { int ac = rec->infoElements[i].values[j].datInt; - if (j < altCount) alCounts[1+j] = ac; - refAlleleCount -= ac; + if (gotTotalCount) + alCounts[0] -= ac; } - if (gotAltCounts) - { - while (j++ < altCount) + while (j++ < alDescCount-1) alCounts[1+j] = -1; - alCounts[0] = refAlleleCount; - if (refAlleleCount >= 0) - dyStringPrintf(dy, "%d", refAlleleCount); + if (gotTotalCount) + dyStringPrintf(dy, "%d", alCounts[0]); else dyStringAppend(dy, "-1"); - for (j = 0; j < altCount; j++) - if (alCounts[1+j] >= 0) - dyStringPrintf(dy, ",%d", alCounts[1+j]); + for (j = 1; j < alDescCount; j++) + if (alCounts[j] >= 0) + dyStringPrintf(dy, ",%d", alCounts[j]); else dyStringAppend(dy, ",-1"); } break; } -if (refAlleleCount > 0 && !gotAltCounts) - dyStringPrintf(dy, "%d", refAlleleCount); -pgs->alleleFreq = cloneStringZ(dy->string, dy->stringSize+1); +if (gotTotalCount && !gotAltCounts) + dyStringPrintf(dy, "%d", alCounts[0]); +return cloneStringZ(dy->string, dy->stringSize+1); +} + +struct pgSnp *pgSnpFromVcfRecord(struct vcfRecord *rec) +/* Convert VCF rec to pgSnp; don't free rec->file (vcfFile) until + * you're done with pgSnp because pgSnp points to rec->chrom. */ +{ +static struct dyString *dy = NULL; +if (dy == NULL) + dy = dyStringNew(0); +else + dyStringClear(dy); +struct pgSnp *pgs; +AllocVar(pgs); +pgs->chrom = rec->chrom; +pgs->chromStart = rec->chromStart; +pgs->chromEnd = rec->chromEnd; +// Build up slash-separated allele string from rec->alleles, starting with ref allele: +dyStringAppend(dy, rec->alleles[0]); +int alCount = rec->alleleCount, i; +if (rec->alleleCount == 2 && sameString(rec->alleles[1], ".")) + // ignore N/A alternate allele + alCount = 1; +else if (rec->alleleCount >= 2) + { + // append /-sep'd alternate alleles, unless/until it gets too long: + for (i = 1; i < rec->alleleCount; i++) + { + if ((dy->stringSize + 1 + strlen(rec->alleles[i])) > VCF_MAX_ALLELE_LEN) + break; + dyStringPrintf(dy, "/%s", rec->alleles[i]); + } + if (i < rec->alleleCount) + alCount = i; + } +pgs->name = cloneStringZ(dy->string, dy->stringSize+1); +pgs->alleleCount = alCount; +pgs->alleleFreq = alleleCountsFromVcfRecord(rec, alCount); // Build up comma-sep list... supposed to be per-allele quality scores but I think // the VCF spec only gives us one BQ... for the reference position? should ask. dyStringClear(dy); for (i = 0; i < rec->infoCount; i++) if (sameString(rec->infoElements[i].key, "BQ")) { float qual = rec->infoElements[i].values[0].datFloat; dyStringPrintf(dy, "%.1f", qual); int j; - for (j = 0; j < altCount; j++) + for (j = 1; j < rec->alleleCount; j++) dyStringPrintf(dy, ",%.1f", qual); break; } pgs->alleleScores = cloneStringZ(dy->string, dy->stringSize+1); return pgs; }