198c9b8daecc44fbda6a6494c566c723920f030a lrnassar Wed Mar 11 18:25:21 2026 -0700 Fixing a few hundred clear typos with the help of Claude. Some are less important in code comments, but majority of them are in user-facing places. I manually approved 60%+ of the changes and didn't see any that were an incorrect suggestion, at worst it was potentially uncessesary, like a code comment having cant instead of can't. No RM. diff --git src/hg/utils/pslMismatchGapToBed/pslMismatchGapToBed.c src/hg/utils/pslMismatchGapToBed/pslMismatchGapToBed.c index 017e5211cdb..b2eab00bf69 100644 --- src/hg/utils/pslMismatchGapToBed/pslMismatchGapToBed.c +++ src/hg/utils/pslMismatchGapToBed/pslMismatchGapToBed.c @@ -1,590 +1,590 @@ /* pslMismatchGapToBed - Find mismatches and short indels between target and query (e.g. genome & transcript). */ #include "common.h" #include "basicBed.h" #include "fa.h" #include "indelShift.h" #include "linefile.h" #include "hash.h" #include "hdb.h" #include "hgHgvs.h" #include "options.h" #include "psl.h" #include "txAliDiff.h" // Arbitrary cutoff for minimum plausible intron size #define MIN_INTRON 45 // Different color options #define MISMATCH_COLOR 0xff0000 //red #define SHORTGAP_COLOR 0xffa500 //orange #define SHIFTYGAP_COLOR 0xffa500 //orange #define DOUBLEGAP_COLOR 0x888888 //grey #define QSKIPPED_COLOR 0x800080 //purple // Command line option default static char *cdsFile = NULL; static char *db = NULL; static char *ignoreQNamePrefix = NULL; void usage() /* Explain usage and exit. */ { errAbort( "pslMismatchGapToBed - Find mismatches and too-short-to-be-intron indels (<%d bases)\n" " between target and query (intended for comparing genome & transcripts)\n" "usage:\n" " pslMismatchGapToBed in.psl target.2bit query.fa[.gz] outBaseName\n" "options:\n" " -cdsFile=X Get Genbank CDS strings from file X\n" " -db=X Get RefSeq chromosome accessions for target from database X\n" " -ignoreQNamePrefix=X Ignore PSL rows whose qName starts with X\n" , MIN_INTRON); } // Command line validation table static struct optionSpec options[] = { {"cdsFile", OPTION_STRING}, {"db", OPTION_STRING}, {"ignoreQNamePrefix", OPTION_STRING}, {NULL, 0}, }; static void dnaSeqCopyMaybeRc(struct dnaSeq *seq, int start, int len, boolean isRc, char *buf, size_t bufSize) /* Copy len bases of seq into buf. If isRc, reverse-complement the sequence. */ { safencpy(buf, bufSize, seq->dna+start, len); if (isRc) reverseComplement(buf, len); } static void addCoordCols(struct txAliDiff *newDiff, char *acc, int start, int end) /* Fill out the txName, txStart, and txEnd fields of a txAliDiff. */ { newDiff->txName = cloneString(acc); newDiff->txStart = start; newDiff->txEnd = end; } static void compareExonBases(struct psl *psl, int ix, struct seqWindow *gSeqWin, struct dnaSeq *txSeq, struct genbankCds *cds, char *chromAcc, struct txAliDiff **differencesList) /* Compare exon ix sequence from genome and transcript to find mismatched bases. */ { boolean isRc = (pslQStrand(psl) == '-'); int blockSize = psl->blockSizes[ix]; char exonT[blockSize+1]; seqWindowCopy(gSeqWin, psl->tStarts[ix], blockSize, exonT, sizeof(exonT)); char exonQ[blockSize+1]; int qExonStart = isRc ? psl->qSize - psl->qStarts[ix] - blockSize : psl->qStarts[ix]; dnaSeqCopyMaybeRc(txSeq, qExonStart, blockSize, isRc, exonQ, sizeof(exonQ)); static struct dyString *namePlus = NULL; if (namePlus == NULL) namePlus = dyStringNew(0); int jx; for (jx = 0; jx < blockSize; jx++) { if (exonT[jx] != exonQ[jx]) { struct txAliDiff *newMismatch = NULL; AllocVar(newMismatch); int tStart = psl->tStarts[ix] + jx; char *tRef = exonT + jx; char *tAlt = exonQ + jx; int len = 1; while (jx < blockSize - 1 && exonT[jx+len] != exonQ[jx+len]) len++; char qRef[len+1]; char qAlt[len+1]; safencpy(qRef, sizeof(qRef), tAlt, len); safencpy(qAlt, sizeof(qAlt), tRef, len); int qStart = psl->qStarts[ix] + jx; if (isRc) { int qEnd = qStart + len; qStart = psl->qSize - qEnd; reverseComplement(qRef, len); reverseComplement(qAlt, len); } dyStringClear(namePlus); dyStringPrintf(namePlus, "Mismatch: %s>%s", qRef, qAlt); newMismatch->name = cloneString(namePlus->string); newMismatch->strand[0] = psl->strand[0]; addCoordCols(newMismatch, psl->qName, qStart, qStart+len); // Make HGVS g. struct bed3 variantBed; variantBed.chrom = psl->tName; variantBed.chromStart = tStart; variantBed.chromEnd = tStart+len; char alt[len+1]; safencpy(alt, sizeof(alt), tAlt, len); char *hgvsG = hgvsGFromVariant(gSeqWin, &variantBed, alt, chromAcc, TRUE); newMismatch->hgvsG = cloneString(hgvsG); // Make HGVS c./n. char gRef[len+1]; safencpy(gRef, sizeof(gRef), tRef, len); struct vpTx vpTx; vpTx.txName = psl->qName; vpTx.start.region = vpTx.end.region = vpExon; vpTx.start.aliBlkIx = vpTx.end.aliBlkIx = ix; vpTx.start.txOffset = qStart; vpTx.start.gOffset = tStart; vpTx.end.txOffset = qStart+len; vpTx.end.gOffset = tStart+len; vpTx.gRef = qAlt; vpTx.gAlt = qRef; vpTx.txRef = qRef; vpTx.txAlt = qAlt; char *hgvsCN; if (cds) hgvsCN = hgvsCFromVpTx(&vpTx, gSeqWin, psl, cds, txSeq, TRUE); else hgvsCN = hgvsNFromVpTx(&vpTx, gSeqWin, psl, txSeq, TRUE); newMismatch->hgvsCN = cloneString(hgvsCN); newMismatch->chrom = cloneString(psl->tName); newMismatch->chromStart = tStart; newMismatch->chromEnd = tStart + len; newMismatch->thickStart = tStart; newMismatch->thickEnd = tStart + len; newMismatch->reserved = MISMATCH_COLOR; newMismatch->hgvsN = ""; newMismatch->hgvsPosCN = ""; slAddHead(differencesList, newMismatch); freeMem(hgvsG); jx += (len - 1); } } } static void addCNRange(struct dyString *dy, int start, int end, struct genbankCds *cds) /* Append an HGVS c. or n. position or range to dy. */ { if (cds) { dyStringPrintf(dy, "c."); char cdsPrefix[2]; cdsPrefix[0] = '\0'; uint cdsCoord = hgvsTxToCds(start, cds, TRUE, cdsPrefix); dyStringPrintf(dy, "%s%u", cdsPrefix, cdsCoord); if (end > start) { cdsCoord = hgvsTxToCds(end, cds, FALSE, cdsPrefix); dyStringPrintf(dy, "_%s%u", cdsPrefix, cdsCoord); } } else { dyStringPrintf(dy, "n.%d", start+1); if (end > start+1) dyStringPrintf(dy, "_%d", end); } } static void checkIntrons(struct psl *psl, int ix, struct seqWindow *gSeqWin, struct dnaSeq *txSeq, struct genbankCds *cds, char *chromAcc, struct txAliDiff **differencesList) /* Examine each intron to see if it's too short to be plausible. Report too-short introns * and determine whether they are ambiguously placed, i.e. if they could be shifted left * and/or right on genome and transcript (see also vpExpandIndelGaps). If not too short, * check for skipped query bases. */ { boolean isRc = (pslQStrand(psl) == '-'); static struct dyString *namePlus = NULL; if (namePlus == NULL) namePlus = dyStringNew(0); if (ix < psl->blockCount - 1) { int blockSize = psl->blockSizes[ix]; int tGapStart = psl->tStarts[ix] + blockSize; int tGapEnd = psl->tStarts[ix+1]; int tGapLen = tGapEnd - tGapStart; int qGapStart = isRc ? psl->qSize - psl->qStarts[ix+1] : psl->qStarts[ix] + blockSize; int qGapEnd = isRc ? psl->qSize - psl->qStarts[ix] - blockSize : psl->qStarts[ix+1]; int qGapLen = qGapEnd - qGapStart; if (tGapLen < MIN_INTRON) { // See if it is possible to shift the gap left and/or right on the genome. uint gStartL = tGapStart; uint gEndL = tGapEnd; uint gStartR = gStartL, gEndR = gEndL; char txCpyL[qGapLen+1], txCpyR[qGapLen+1]; dnaSeqCopyMaybeRc(txSeq, qGapStart, qGapLen, isRc, txCpyL, sizeof(txCpyL)); safencpy(txCpyR, sizeof(txCpyR), txCpyL, qGapLen); int shiftL = indelShift(gSeqWin, &gStartL, &gEndL, txCpyL, INDEL_SHIFT_NO_MAX, isdLeft); int shiftR = indelShift(gSeqWin, &gStartR, &gEndR, txCpyR, INDEL_SHIFT_NO_MAX, isdRight); if (shiftL || shiftR) { if (blockSize < shiftL) warn("pslMismatchGapToBed: %s gapIx %d shifted left %d bases, but previous block " "size is only %d; report to NCBI", psl->qName, ix, shiftL, blockSize); if (psl->blockSizes[ix+1] < shiftR) warn("pslMismatchGapToBed: %s gapIx %d shifted right %d bases, but next block " "size is only %d; report to NCBI", psl->qName, ix, shiftR, psl->blockSizes[ix+1]); struct txAliDiff *shiftyItem = NULL; AllocVar(shiftyItem); shiftyItem->chrom = cloneString(psl->tName); shiftyItem->chromStart = tGapStart - shiftL; shiftyItem->chromEnd = tGapEnd + shiftR; shiftyItem->reserved = SHIFTYGAP_COLOR; shiftyItem->hgvsG = ""; shiftyItem->hgvsN = ""; shiftyItem->hgvsCN = ""; dyStringClear(namePlus); dyStringPrintf(namePlus, "Shifted Gap shift: L%d,R%d", shiftL, shiftR); shiftyItem->name = cloneString(namePlus->string); int qStart = qGapStart - (isRc ? shiftR : shiftL); int qEnd = qGapEnd + (isRc ? shiftL : shiftR); addCoordCols(shiftyItem, psl->qName, qStart, qEnd); shiftyItem->gSkipped = tGapLen; shiftyItem->txSkipped = qGapLen; shiftyItem->shiftL = shiftL; shiftyItem->shiftR = shiftR; dyStringClear(namePlus); dyStringPrintf(namePlus, "%s:", psl->qName); addCNRange(namePlus, qStart, qEnd, cds); shiftyItem->hgvsPosCN = cloneString(namePlus->string); shiftyItem->strand[0] = psl->strand[0]; int thickSize = tGapLen; if (qGapLen > thickSize) thickSize = qGapLen; if (isRc) { shiftyItem->thickStart = shiftyItem->chromStart; shiftyItem->thickEnd = shiftyItem->chromStart + thickSize; if (shiftyItem->thickEnd > shiftyItem->chromEnd) shiftyItem->thickEnd = shiftyItem->chromEnd; } else { shiftyItem->thickStart = shiftyItem->chromEnd - thickSize; if (shiftyItem->thickStart < shiftyItem->chromStart) shiftyItem->thickStart = shiftyItem->chromStart; shiftyItem->thickEnd = shiftyItem->chromEnd; } slAddHead(differencesList, shiftyItem); } struct txAliDiff *shortItem = NULL; AllocVar(shortItem); dyStringClear(namePlus); dyStringPrintf(namePlus, "Short Gap skip: G%d,T%d", tGapLen, qGapLen); shortItem->name = cloneString(namePlus->string); shortItem->chrom = cloneString(psl->tName); shortItem->chromStart = tGapStart; shortItem->chromEnd = tGapEnd; shortItem->thickStart = tGapStart; shortItem->thickEnd = tGapEnd; shortItem->reserved = SHORTGAP_COLOR; shortItem->strand[0] = psl->strand[0]; shortItem->hgvsPosCN = ""; shortItem->hgvsCN = ""; // Make HGVS g. struct bed3 variantBed; variantBed.chrom = psl->tName; variantBed.chromStart = gStartR; variantBed.chromEnd = gEndR; char *hgvsG = hgvsGFromVariant(gSeqWin, &variantBed, txCpyR, chromAcc, TRUE); // Make HGVS c./n. variantBed.chromStart = tGapStart; variantBed.chromEnd = tGapEnd; char tRef[tGapLen+1]; seqWindowCopy(gSeqWin, variantBed.chromStart, tGapLen, tRef, sizeof(tRef)); struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &variantBed, tRef, psl, txSeq); char *hgvsCN; if (cds) hgvsCN = hgvsCFromVpTx(vpTx, gSeqWin, psl, cds, txSeq, TRUE); else hgvsCN = hgvsNFromVpTx(vpTx, gSeqWin, psl, txSeq, TRUE); addCoordCols(shortItem, psl->qName, vpTx->start.txOffset, vpTx->end.txOffset); shortItem->gSkipped = tGapLen; shortItem->txSkipped = qGapLen; shortItem->hgvsG = cloneString(hgvsG); shortItem->hgvsN = cloneString(hgvsCN); slAddHead(differencesList, shortItem); freeMem(hgvsG); vpTxFree(&vpTx); } else if (qGapLen > 0) { // Not too short, but skips transcript bases struct txAliDiff *doubleItem = NULL; AllocVar(doubleItem); doubleItem->chrom = cloneString(psl->tName); doubleItem->chromStart = tGapStart; doubleItem->chromEnd = tGapEnd; doubleItem->reserved = DOUBLEGAP_COLOR; doubleItem->hgvsG = ""; doubleItem->hgvsN = ""; doubleItem->hgvsPosCN = ""; dyStringClear(namePlus); dyStringPrintf(namePlus, "Double Gap: G%d,T%d", tGapLen, qGapLen); doubleItem->name = cloneString(namePlus->string); doubleItem->strand[0] = psl->strand[0]; addCoordCols(doubleItem, psl->qName, qGapStart, qGapEnd); doubleItem->gSkipped = tGapLen; doubleItem->txSkipped = qGapLen; dyStringClear(namePlus); dyStringPrintf(namePlus, "%s:", psl->qName); addCNRange(namePlus, qGapStart, qGapEnd, cds); dyStringAppend(namePlus, "del"); doubleItem->hgvsCN = cloneString(namePlus->string); doubleItem->thickStart = tGapStart; doubleItem->thickEnd = tGapEnd; slAddHead(differencesList, doubleItem); } } } static void checkUnalignedTxSeq(struct psl *psl, struct seqWindow *gSeqWin, struct dnaSeq *txSeq, struct genbankCds *cds, struct txAliDiff **differencesList) /* Check for skipped q bases before alignment start and non-polyA skipped q bases after end * of alignment. */ { boolean isRc = (pslQStrand(psl) == '-'); static struct dyString *namePlus = NULL; if (namePlus == NULL) namePlus = dyStringNew(0); if (psl->qStarts[0] != 0) { struct txAliDiff *newItem = NULL; AllocVar(newItem); newItem->chrom = cloneString(psl->tName); newItem->chromStart = psl->tStart; newItem->chromEnd = psl->tStart; newItem->thickStart = psl->tStart; newItem->thickEnd = psl->tStart; newItem->reserved = QSKIPPED_COLOR; newItem->hgvsG = ""; newItem->hgvsN = ""; newItem->hgvsPosCN = ""; if (isRc) { // Is it polyA? Using the same heuristic as in hgTracks/cds.c, consider it close // enough if it's within 3 bases of the number of unaligned bases. int polyASize = tailPolyASizeLoose(txSeq->dna, txSeq->size); boolean isPolyA = (polyASize > 0 && polyASize + 3 >= psl->qStarts[0]); if (! isPolyA) { // Unaligned transcript sequence at end int qSkipStart = psl->qSize - psl->qStarts[0]; dyStringClear(namePlus); dyStringPrintf(namePlus, "Transcript Skip: last%ddel", psl->qStarts[0]); newItem->name = cloneString(namePlus->string); dyStringClear(namePlus); newItem->strand[0] = psl->strand[0]; addCoordCols(newItem, psl->qName, qSkipStart, psl->qSize); dyStringPrintf(namePlus, "%s:", psl->qName); addCNRange(namePlus, qSkipStart, psl->qSize, cds); dyStringAppend(namePlus, "del"); newItem->hgvsCN = cloneString(namePlus->string); slAddHead(differencesList, newItem); } } else { // Unaligned transcript sequence at start dyStringClear(namePlus); dyStringPrintf(namePlus, "Transcript Skip: first%ddel", psl->qStarts[0]); newItem->name = cloneString(namePlus->string); dyStringClear(namePlus); newItem->strand[0] = psl->strand[0]; addCoordCols(newItem, psl->qName, 0, psl->qStarts[0]); dyStringPrintf(namePlus, "%s:", psl->qName); addCNRange(namePlus, 0, psl->qStarts[0], cds); dyStringAppend(namePlus, "del"); newItem->hgvsCN = cloneString(namePlus->string); slAddHead(differencesList, newItem); } } // Unlike psl->qStarts[], psl->qStart and psl->qEnd are not reversed; for my sanity, keep it // all on genomic + until making tx coords. int lastIx = psl->blockCount - 1; int qEnd = psl->qStarts[lastIx] + psl->blockSizes[lastIx]; int qSkipped = psl->qSize - qEnd; if (qSkipped > 0) { struct txAliDiff *newItem = NULL; AllocVar(newItem); newItem->chrom = cloneString(psl->tName); newItem->chromStart = psl->tEnd; newItem->chromEnd = psl->tEnd; newItem->thickStart = psl->tEnd; newItem->thickEnd = psl->tEnd; newItem->reserved = QSKIPPED_COLOR; newItem->hgvsG = ""; newItem->hgvsN = ""; newItem->hgvsPosCN = ""; if (isRc) { // Unaligned transcript sequence at start dyStringClear(namePlus); dyStringPrintf(namePlus, "Transcript Skip: first%ddel", qSkipped); newItem->name = cloneString(namePlus->string); dyStringClear(namePlus); newItem->strand[0] = psl->strand[0]; addCoordCols(newItem, psl->qName, 0, qSkipped); dyStringPrintf(namePlus, "%s:", psl->qName); addCNRange(namePlus, 0, qSkipped, cds); dyStringAppend(namePlus, "del"); newItem->hgvsCN = cloneString(namePlus->string); slAddHead(differencesList, newItem); } else { // Is it polyA? Using the same heuristic as in hgTracks/cds.c, consider it close // enough if it's within 3 bases of the number of unaligned bases. int polyASize = tailPolyASizeLoose(txSeq->dna, txSeq->size); boolean isPolyA = (polyASize > 0 && polyASize + 3 >= qSkipped); if (! isPolyA) { // Unaligned transcript sequence at end dyStringClear(namePlus); dyStringPrintf(namePlus, "Transcript Skip: last%ddel", qSkipped); newItem->name = cloneString(namePlus->string); dyStringClear(namePlus); newItem->strand[0] = psl->strand[0]; addCoordCols(newItem, psl->qName, qEnd, psl->qSize); dyStringPrintf(namePlus, "%s:", psl->qName); addCNRange(namePlus, qEnd, psl->qSize, cds); dyStringAppend(namePlus, "del"); newItem->hgvsCN = cloneString(namePlus->string); slAddHead(differencesList, newItem); } } } } static void checkTranscript(struct psl *psl, struct seqWindow *gSeqWin, struct dnaSeq *txSeq, struct genbankCds *cds, struct txAliDiff **differencesList) /* Build up lists of mismatches and non-exon, non-polyA indels between genome and transcript. */ { if (psl->qSize != txSeq->size) errAbort("%s psl->qSize is %d but txSeq->size is %d", psl->qName, psl->qSize, txSeq->size); int fudge = 1024; int winStart = (psl->tStart > fudge) ? psl->tStart - fudge : 0; int winEnd = psl->tEnd + fudge; gSeqWin->fetch(gSeqWin, psl->tName, winStart, winEnd); touppers(txSeq->dna); char *chromAcc = db ? hRefSeqAccForChrom(db, psl->tName) : psl->tName; int ix; for (ix = 0; ix < psl->blockCount; ix++) { compareExonBases(psl, ix, gSeqWin, txSeq, cds, chromAcc, differencesList); checkIntrons(psl, ix, gSeqWin, txSeq, cds, chromAcc, differencesList); } checkUnalignedTxSeq(psl, gSeqWin, txSeq, cds, differencesList); } static void sortAndDumpBedPlus(char *outBaseName, struct txAliDiff *diffList) -/* Sort the mostly-backwards bed list and and dump to file named outBaseName.bed. */ +/* Sort the mostly-backwards bed list and dump to file named outBaseName.bed. */ { char fileName[PATH_LEN]; safef(fileName, sizeof(fileName), "%s.bed", outBaseName); FILE *f = mustOpen(fileName, "w"); slReverse(&diffList); slSort(&diffList, bedCmp); struct txAliDiff *diffItem; for (diffItem = diffList; diffItem != NULL; diffItem = diffItem->next) { txAliDiffOutput(diffItem, f, '\t', '\n'); } carefulClose(&f); } static struct hash *hashDnaSeqFromFa(char *faFile) /* Return a hash that maps sequence names from faFile to dnaSeq records. */ { struct hash *hash = hashNew(17); verbose(2, "Reading query fasta from %s\n", faFile); struct dnaSeq *allSeqs = faReadAllDna(faFile), *seq; verbose(2, "Got %d sequences from %s.\n", slCount(allSeqs), faFile); for (seq = allSeqs; seq != NULL; seq = seq->next) hashAdd(hash, seq->name, seq); return hash; } static struct hash *hashCdsFromFile(char *cdsFile) /* Return a hash that maps sequence names from cdsFile to struct genbankCds *, * or NULL if cdsFile is NULL. */ { struct hash *hash = NULL; if (cdsFile) { verbose(2, "Reading Genbank CDS strings from %s\n", cdsFile); hash = hashNew(17); int count = 0; struct lineFile *lf = lineFileOpen(cdsFile, TRUE); char *line = NULL; while (lineFileNextReal(lf, &line)) { char *words[2]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, 2, wordCount); struct genbankCds *cds; AllocVar(cds); if (! genbankCdsParse(words[1], cds)) lineFileAbort(lf, "Unable to parse '%s' as Genbank CDS string", words[1]); hashAdd(hash, words[0], cds); count++; } lineFileClose(&lf); verbose(2, "Got %d CDS strings from %s.\n", count, cdsFile); } return hash; } void pslMismatchGapToBed(char *pslFile, char *targetTwoBit, char *queryFa, char *outBaseName) /* pslMismatchGapToBed - Find mismatches and short indels between target and query * (e.g. genome & transcript). */ { struct txAliDiff *differencesList = NULL; struct seqWindow *gSeqWin = twoBitSeqWindowNew(targetTwoBit, NULL, 0, 0); struct hash *qSeqHash = hashDnaSeqFromFa(queryFa); struct hash *cdsHash = hashCdsFromFile(cdsFile); struct lineFile *pslLf = pslFileOpen(pslFile); struct psl *psl; while ((psl = pslNext(pslLf)) != NULL) { if (ignoreQNamePrefix == NULL || !startsWith(ignoreQNamePrefix, psl->qName)) { struct dnaSeq *txSeq = hashFindVal(qSeqHash, psl->qName); if (txSeq) { struct genbankCds *cds = cdsHash ? hashFindVal(cdsHash, psl->qName) : NULL; checkTranscript(psl, gSeqWin, txSeq, cds, &differencesList); } else { errAbort("No sequence found for psl->qName = '%s' - exiting.", psl->qName); } pslFree(&psl); } } lineFileClose(&pslLf); sortAndDumpBedPlus(outBaseName, differencesList); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); cdsFile = optionVal("cdsFile", cdsFile); db = optionVal("db", db); ignoreQNamePrefix = optionVal("ignoreQNamePrefix", ignoreQNamePrefix); pslMismatchGapToBed(argv[1], argv[2], argv[3], argv[4]); return 0; }