ad3a176457ad1d21a0fedc47f349ec3484e751f1 angie Fri Feb 9 16:27:51 2018 -0800 Cleaning up some old ugliness about the size parameter to isAllDna and isAllNt. hgc's printSnpAlignment code that parsed snpNNN.fa was using lineSize as length but lineSize is length+1. Then isAllDna was written with "i<size-1" as the loop test instead of "i < size". I didn't fix that properly when I separated out isAllNt from isAllDna. Later, I (re?)discovered that isAllNt needed length+1 as its size and just added some FIXME comments. Thanks Brian R for prodding me to actually fix it. refs #20895 diff --git src/lib/vcf.c src/lib/vcf.c index f980d12..246faaf 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -764,43 +764,41 @@ } static boolean allelesHavePaddingBase(char **alleles, int alleleCount) /* Examine alleles to see if they either a) all start with the same base or * b) include a symbolic or 0-length allele. In either of those cases, there * must be an initial padding base that we'll need to trim from non-symbolic * alleles. */ { if (sameString(alleles[0], "-")) return FALSE; else if (noAltAllele(alleles, alleleCount)) // Don't trim assertion of no change (ref == alt) return FALSE; boolean hasPaddingBase = TRUE; char firstBase = '\0'; -if (isAllNt(alleles[0], strlen(alleles[0]) - +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1 +if (isAllNt(alleles[0], strlen(alleles[0]))) firstBase = alleles[0][0]; int i; for (i = 1; i < alleleCount; i++) { if (sameString(alleles[i], "-")) { hasPaddingBase = FALSE; break; } - else if (isAllNt(alleles[i], strlen(alleles[i]) - +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1 + else if (isAllNt(alleles[i], strlen(alleles[i]))) { if (firstBase == '\0') firstBase = alleles[i][0]; if (alleles[i][0] != firstBase) // Different first base implies unpadded alleles. hasPaddingBase = FALSE; } else if (sameString(alleles[i], "<X>") || sameString(alleles[i], "<*>")) { // Special case for samtools mpileup "<X>" or gVCF "<*>" (no alternate allele observed) -- // being symbolic doesn't make this an indel and ref base is not necessarily padded. hasPaddingBase = FALSE; } else { @@ -824,32 +822,31 @@ * record in hgc -- so return the original chromStart. */ { unsigned int chromStartOrig = rec->chromStart; struct vcfFile *vcff = rec->file; if (rec->alleleCount > 1) { boolean hasPaddingBase = allelesHavePaddingBase(rec->alleles, rec->alleleCount); if (hasPaddingBase) { rec->chromStart++; int i; for (i = 0; i < rec->alleleCount; i++) { if (rec->alleles[i][1] == '\0') rec->alleles[i] = vcfFilePooledStr(vcff, "-"); - else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i]) - +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1 + else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i]))) rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1); else // don't trim first character of symbolic allele rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]); } } } return chromStartOrig; } static boolean allEndsGEStartsAndIdentical(char **starts, char **ends, int count) /* Given two arrays with <count> elements, return true if all strings in ends[] are * greater than or equal to the corresponding strings in starts[], and all ends[] * have the same char. */ { int i; @@ -863,32 +860,31 @@ } static int countIdenticalBasesRight(char **alleles, int alCount) /* Return the number of bases that are identical at the end of each allele (usually 0). */ { if (noAltAllele(alleles, alCount)) // Don't trim assertion of no change (ref == alt) return 0; char *alleleEnds[alCount]; int i; for (i = 0; i < alCount; i++) { int alLen = strlen(alleles[i]); // If any allele is symbolic, don't try to trim. if (sameString(alleles[i], "-") || - !isAllNt(alleles[i], alLen - +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1 + !isAllNt(alleles[i], alLen)) return 0; alleleEnds[i] = alleles[i] + alLen-1; } int trimmedBases = 0; while (allEndsGEStartsAndIdentical(alleles, alleleEnds, alCount)) { trimmedBases++; // Trim identical last base of alleles and move alleleEnds[] items back. for (i = 0; i < alCount; i++) alleleEnds[i]--; } return trimmedBases; } unsigned int vcfRecordTrimAllelesRight(struct vcfRecord *rec) @@ -1480,32 +1476,31 @@ // VCF reference allele gets its own column: char *refAllele = words[3]; char *altAlleles = words[4]; // Make a vcfRecord-like allele array (ref in [0], alts after) so we can check for padding base: int alCount = 1 + countChars(altAlleles, ',') + 1; char *alleles[alCount]; alleles[0] = refAllele; char altAlCopy[strlen(altAlleles)+1]; safecpy(altAlCopy, sizeof(altAlCopy), altAlleles); chopByChar(altAlCopy, ',', &(alleles[1]), alCount-1); int i; if (allelesHavePaddingBase(alleles, alCount)) { // Skip padding base (unless we have a symbolic allele): for (i = 0; i < alCount; i++) - if (isAllNt(alleles[i], strlen(alleles[i]) - +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1 + if (isAllNt(alleles[i], strlen(alleles[i]))) alleles[i]++; } // Having dealt with left padding base, now look for identical bases on the right: int trimmedBases = countIdenticalBasesRight(alleles, alCount); // Build a /-separated allele string, trimming bases on the right if necessary: dyStringClear(dy); if (noAltAllele(alleles, alCount)) alCount = 1; for (i = 0; i < alCount; i++) { char *allele = alleles[i]; if (!sameString(allele, ".")) { if (i != 0) {