0f5e8ee9ef8d65cf207610e97d1ce7c368623df5 angie Wed Jan 27 15:13:20 2021 -0800 qBaseInsert and tBaseInsert may include sequence skipped on both sides, e.g. due to NNN's. Count actual inserted and deleted bases more carefully. refs #26868 diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 2a44636..6f64fe4 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -1108,34 +1108,34 @@ "beginning and/or end") ""); else puts("VCF Sample\n" "#Ns" TOOLTIP("Number of no-call variants for this sample in uploaded VCF, " "i.e. '.' used in genotype column") ""); puts("#Mixed" TOOLTIP("Number of IUPAC ambiguous bases, e.g. 'R' for 'A or G'") ""); if (isFasta) puts("Bases aligned" TOOLTIP("Number of bases aligned to reference NC_045512.2 Wuhan/Hu-1, including " "matches and mismatches") - "\nInsertions" + "\nInserted bases" TOOLTIP("Number of bases in aligned portion of uploaded sequence that are not present in " "reference NC_045512.2 Wuhan/Hu-1") - "\nDeletions" + "\nDeleted bases" TOOLTIP("Number of bases in reference NC_045512.2 Wuhan/Hu-1 that are not " "present in aligned portion of uploaded sequence") ""); puts("#SNVs used for placement" TOOLTIP("Number of single-nucleotide variants in uploaded sample " "(does not include N's or mixed bases) used by UShER to place sample " "in phylogenetic tree") "\n#Masked SNVs" TOOLTIP("Number of single-nucleotide variants in uploaded sample that are masked " "(not used for placement) because they occur at known " "Problematic Sites") "\nNeighboring sample in tree" TOOLTIP("A sample already in the tree that is a child of the node at which the uploaded " "sample was placed, to give an example of a closely related sample") @@ -1323,46 +1323,69 @@ si->ambigCount - alignedAmbigCount); printTooltip(dy->string); } printf(""); if (isFasta) { struct psl *psl = si->psl; if (psl) { int aliCount = psl->match + psl->misMatch + psl->repMatch; printf("%d ", qcClassForLength(aliCount), aliCount); dyStringClear(dy); dyStringPrintf(dy, "bases %d - %d align to reference bases %d - %d", psl->qStart+1, psl->qEnd, psl->tStart+1, psl->tEnd); printTooltip(dy->string); + int insBases = 0, insCount = 0, delBases = 0, delCount = 0; + if (psl->qBaseInsert || psl->tBaseInsert) + { + // Tally up actual insertions and deletions; ignore skipped N bases. + int ix; + for (ix = 0; ix < psl->blockCount - 1; ix++) + { + int qGapStart = psl->qStarts[ix] + psl->blockSizes[ix]; + int qGapEnd = psl->qStarts[ix+1]; + int qGapLen = qGapEnd - qGapStart; + int tGapStart = psl->tStarts[ix] + psl->blockSizes[ix]; + int tGapEnd = psl->tStarts[ix+1]; + int tGapLen = tGapEnd - tGapStart; + if (qGapLen > tGapLen) + { + insCount++; + insBases += qGapLen - tGapLen; + } + else if (tGapLen > qGapLen) + { + delCount++; + delBases += tGapLen - qGapLen; + } + } + } printf("%d ", - qcClassForIndel(psl->qBaseInsert), psl->qBaseInsert); - if (psl->qBaseInsert) + qcClassForIndel(insBases), insBases); + if (insBases) { dyStringClear(dy); - dyStringPrintf(dy, "%d bases in %d locations", - psl->qBaseInsert, psl->qNumInsert); + dyStringPrintf(dy, "%d bases in %d locations", insBases, insCount); printTooltip(dy->string); } printf("%d ", - qcClassForIndel(psl->tBaseInsert), psl->tBaseInsert); - if (psl->tBaseInsert) + qcClassForIndel(delBases), delBases); + if (delBases) { dyStringClear(dy); - dyStringPrintf(dy, "%d bases in %d locations", - psl->tBaseInsert, psl->tNumInsert); + dyStringPrintf(dy, "%d bases in %d locations", delBases, delCount); printTooltip(dy->string); } printf(""); } else printf(" not alignable ", qcClassForLength(0)); } int snvCount = slCount(si->sncList) - alignedAmbigCount; printf("%d", qcClassForSNVs(snvCount), snvCount); if (snvCount > 0) { dyStringClear(dy); struct singleNucChange *snc; for (snc = si->sncList; snc != NULL; snc = snc->next)