675a334117c7a98be59035fdf620ac8ca764fd1e angie Thu Jan 18 12:11:33 2018 -0800 isAllNt ignores last base in string so beware; also, isAllNt returns true for "-" (translates it to 'n') but that means the empty string so exclude it from base-trimming. diff --git src/lib/vcf.c src/lib/vcf.c index a2acfa7..183bb82 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -749,38 +749,47 @@ { char *words[VCF_MAX_COLUMNS]; int wordCount; if ((wordCount = lineFileChopTab(vcff->lf, words)) <= 0) return NULL; wordCount = checkWordCount(vcff, words, wordCount); return vcfRecordFromRow(vcff, words); } static boolean allelesHavePaddingBase(char **alleles, int alleleCount) /* Examine alleles to see if they either a) all start with the same base or * b) include a symbolic or 0-length allele. In either of those cases, there * must be an initial padding base that we'll need to trim from non-symbolic * alleles. */ { +if (sameString(alleles[0], "-")) + return FALSE; boolean hasPaddingBase = TRUE; char firstBase = '\0'; -if (isAllNt(alleles[0], strlen(alleles[0]))) +if (isAllNt(alleles[0], strlen(alleles[0]) + +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1 firstBase = alleles[0][0]; int i; for (i = 1; i < alleleCount; i++) { - if (isAllNt(alleles[i], strlen(alleles[i]))) + if (sameString(alleles[i], "-")) + { + hasPaddingBase = FALSE; + break; + } + else if (isAllNt(alleles[i], strlen(alleles[i]) + +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1 { if (firstBase == '\0') firstBase = alleles[i][0]; if (alleles[i][0] != firstBase) // Different first base implies unpadded alleles. hasPaddingBase = FALSE; } else if (sameString(alleles[i], "<X>") || sameString(alleles[i], "<*>")) { // Special case for samtools mpileup "<X>" or gVCF "<*>" (no alternate allele observed) -- // being symbolic doesn't make this an indel and ref base is not necessarily padded. hasPaddingBase = FALSE; } else { @@ -804,31 +813,32 @@ * record in hgc -- so return the original chromStart. */ { unsigned int chromStartOrig = rec->chromStart; struct vcfFile *vcff = rec->file; if (rec->alleleCount > 1) { boolean hasPaddingBase = allelesHavePaddingBase(rec->alleles, rec->alleleCount); if (hasPaddingBase) { rec->chromStart++; int i; for (i = 0; i < rec->alleleCount; i++) { if (rec->alleles[i][1] == '\0') rec->alleles[i] = vcfFilePooledStr(vcff, "-"); - else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i]))) + else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i]) + +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1 rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1); else // don't trim first character of symbolic allele rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]); } } } return chromStartOrig; } static boolean allEndsGEStartsAndIdentical(char **starts, char **ends, int count) /* Given two arrays with <count> elements, return true if all strings in ends[] are * greater than or equal to the corresponding strings in starts[], and all ends[] * have the same char. */ { int i; @@ -838,31 +848,33 @@ if (ends[i] < starts[i] || ends[i][0] != refEnd) return FALSE; } return TRUE; } static int countIdenticalBasesRight(char **alleles, int alCount) /* Return the number of bases that are identical at the end of each allele (usually 0). */ { char *alleleEnds[alCount]; int i; for (i = 0; i < alCount; i++) { int alLen = strlen(alleles[i]); // If any allele is symbolic, don't try to trim. - if (!isAllNt(alleles[i], alLen)) + if (sameString(alleles[i], "-") || + !isAllNt(alleles[i], alLen + +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1 return 0; alleleEnds[i] = alleles[i] + alLen-1; } int trimmedBases = 0; while (allEndsGEStartsAndIdentical(alleles, alleleEnds, alCount)) { trimmedBases++; // Trim identical last base of alleles and move alleleEnds[] items back. for (i = 0; i < alCount; i++) alleleEnds[i]--; } return trimmedBases; } unsigned int vcfRecordTrimAllelesRight(struct vcfRecord *rec) @@ -1454,31 +1466,32 @@ // VCF reference allele gets its own column: char *refAllele = words[3]; char *altAlleles = words[4]; // Make a vcfRecord-like allele array (ref in [0], alts after) so we can check for padding base: int alCount = 1 + countChars(altAlleles, ',') + 1; char *alleles[alCount]; alleles[0] = refAllele; char altAlCopy[strlen(altAlleles)+1]; safecpy(altAlCopy, sizeof(altAlCopy), altAlleles); chopByChar(altAlCopy, ',', &(alleles[1]), alCount-1); int i; if (allelesHavePaddingBase(alleles, alCount)) { // Skip padding base (unless we have a symbolic allele): for (i = 0; i < alCount; i++) - if (isAllNt(alleles[i], strlen(alleles[i]))) + if (isAllNt(alleles[i], strlen(alleles[i]) + +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1 alleles[i]++; } // Having dealt with left padding base, now look for identical bases on the right: int trimmedBases = countIdenticalBasesRight(alleles, alCount); // Build a /-separated allele string, trimming bases on the right if necessary: dyStringClear(dy); for (i = 0; i < alCount; i++) { char *allele = alleles[i]; if (!sameString(allele, ".")) { if (i > 0) dyStringAppendC(dy, '/'); if (allele[trimmedBases] == '\0') dyStringAppendC(dy, '-');