6212068bffa294be01c27e5e34f7b51def22f25a angie Tue Apr 12 17:58:51 2022 -0700 Treat '-' bases as 'N' when counting differing bases between ref and input seq for -maxDiff. diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c index 0ff67ae..e182c11 100644 --- src/hg/utils/faToVcf/faToVcf.c +++ src/hg/utils/faToVcf/faToVcf.c @@ -106,42 +106,42 @@ if (hashLookup(excludedSeqs, seq->name)) excludeCount++; else slAddHead(&newList, seq); } hashFree(&excludedSeqs); slReverse(&newList); sequences = newList; verbose(2, "Excluded %d sequences named in %s (%d sequences remaining including reference)\n", excludeCount, excludeFile, slCount(sequences)); } return sequences; } static int countDiffs(struct dnaSeq *ref, struct dnaSeq *seq) -/* Return the number of bases that differ between ref and seq ignoring 'N'. */ +/* Return the number of positions at which seq differs from ref, comparing bases case-insensitively. Positions where either base is 'N', or where the seq base is '-', are not counted. errAborts if ref and seq differ in size. NOTE(review): a '-' in ref is not skipped, only a '-' in seq -- confirm this asymmetry is intended. */ { if (ref->size != seq->size) errAbort("countDiffs: expecting equally sized sequences but %s size %d != %s size %d", ref->name, ref->size, seq->name, seq->size); int diffs = 0; int i; for (i = 0; i < ref->size; i++) { char refBase = toupper(ref->dna[i]); char seqBase = toupper(seq->dna[i]); - if (refBase != 'N' && seqBase != 'N' && seqBase != refBase) + if (refBase != 'N' && seqBase != 'N' && seqBase != '-' && seqBase != refBase) diffs++; } return diffs; } static struct dnaSeq *filterMaxDiff(struct dnaSeq *sequences) /* If -maxDiff was passed in, remove any sequences with more than that number of differences * from the reference (ignoring Ns but not IUPAC ambiguous bases). */ { int maxDiff = optionInt("maxDiff", 0); if (maxDiff > 0) { int excludeCount = 0; struct dnaSeq *ref = sequences; struct dnaSeq *newList = NULL, *seq, *nextSeq = NULL;