7106942562801f0a7a7c9dc4c101c9f7e8afc1a8 angie Fri May 26 15:55:20 2023 -0700 User reported in MLQ #31322 that axtChain gave fewer results for fasta inputs than for 2bit inputs. It turned out to be an almost 20-year-old bug that we never noticed because we always use 2bit: loadFaSeq mimiced the strand-change detection logic in loadIfNewSeq but ignored the fact that loadIfNewSeq gets a fresh new sequence on the + strand every time the name changes, and only compares to the previous value of strand when the sequence name has not changed. Instead of unsuccessfully mimicing that code, just let the caller reverse-complement the sequence when strand is - and then restore it to + strand after use. diff --git src/hg/mouseStuff/axtChain/axtChain.c src/hg/mouseStuff/axtChain/axtChain.c index 6f54d68..fe63006 100644 --- src/hg/mouseStuff/axtChain/axtChain.c +++ src/hg/mouseStuff/axtChain/axtChain.c @@ -142,54 +142,42 @@ verbose(1, "Loaded %d bases of %s from %s\n", seq->size, newName, seqPath); } else { snprintf(fileName, sizeof(fileName), "%s/%s.nib", seqPath, newName); *pSeq = seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName); verbose(1, "Loaded %d bases in %s\n", seq->size, fileName); } *pName = newName; *pStrand = strand; if (strand == '-') reverseComplement(seq->dna, seq->size); } } -static void loadFaSeq(struct hash *faHash, char *newName, char strand, - char **pName, struct dnaSeq **pSeq, char *pStrand, char *fastaFileName) -/* retrieve sequence from hash. Reverse complement - * if necessary. */ +static void loadFaSeq(struct hash *faHash, char *newName, + char **pName, struct dnaSeq **pSeq, char *fastaFileName) +/* retrieve sequence from hash. Leave reverse complementing (and restoration to original strand) + * up to caller. */ { struct dnaSeq *seq; -if (sameString(newName, *pName)) - { - if (strand != *pStrand) - { - seq = *pSeq; - reverseComplement(seq->dna, seq->size); - *pStrand = strand; - } - } -else +if (differentString(newName, *pName)) { *pName = newName; *pSeq = seq = hashFindVal(faHash, newName); if (NULL == seq) errAbort("ERROR: can not find sequence name '%s' from fasta file '%s'\n", newName, fastaFileName); - *pStrand = strand; - if (strand == '-') - reverseComplement(seq->dna, seq->size); verbose(1, "Loaded %d bases from %s fa\n", seq->size, newName); } } void removeExactOverlaps(struct cBlock **pBoxList) /* Remove from list blocks that start in exactly the same * place on both coordinates. */ { struct cBlock *newList = NULL, *b, *next, *last = NULL; slSort(pBoxList, cBlockCmpBoth); for (b = *pBoxList; b != NULL; b = next) { next = b->next; if (last != NULL && b->qStart == last->qStart && b->tStart == last->tStart) { @@ -393,87 +381,94 @@ { struct hash *pairHash = newHash(0); /* Hash keyed by qSeq<strand>tSeq */ struct seqPair *spList = NULL, *sp; FILE *f = mustOpen(chainOut, "w"); char *qName = "", *tName = ""; struct dnaSeq *qSeq = NULL, *tSeq = NULL; char qStrand = 0, tStrand = 0; struct chain *chainList = NULL, *chain; FILE *details = NULL; struct dnaSeq *seq = NULL; struct hash *qFaHash = newHash(0); struct hash *tFaHash = newHash(0); FILE *faF; boolean qIsTwoBit = twoBitIsFile(qNibDir); boolean tIsTwoBit = twoBitIsFile(tNibDir); +boolean qIsFa = optionExists("faQ"); +boolean tIsFa = optionExists("faT"); axtScoreSchemeDnaWrite(scoreScheme, f, "axtChain"); if (detailsName != NULL) details = mustOpen(detailsName, "w"); /* Read input file and divide alignments into various parts. */ if (optionExists("psl")) spList = readPslBlocks(axtIn, pairHash, f); else spList = readAxtBlocks(axtIn, pairHash, f); -if (optionExists("faQ")) +if (qIsFa) { faF = mustOpen(qNibDir, "r"); verbose(1, "reading query fasta sequence from '%s'\n", qNibDir); while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq)) hashAdd(qFaHash, seq->name, seq); fclose(faF); } -if (optionExists("faT")) +if (tIsFa) { faF = mustOpen(tNibDir, "r"); verbose(1, "reading target fasta sequence from '%s'\n", tNibDir); while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq)) hashAdd(tFaHash, seq->name, seq); fclose(faF); } for (sp = spList; sp != NULL; sp = sp->next) { slReverse(&sp->blockList); removeExactOverlaps(&sp->blockList); verbose(1, "%d blocks after duplicate removal\n", slCount(sp->blockList)); - if (optionExists("faQ")) + if (qIsFa) { assert (qFaHash != NULL); - loadFaSeq(qFaHash, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand, qNibDir); + loadFaSeq(qFaHash, sp->qName, &qName, &qSeq, qNibDir); + qStrand = sp->qStrand; + if (sp->qStrand == '-') + reverseComplement(qSeq->dna, qSeq->size); } else { if (! qIsTwoBit && ! isDirectory(qNibDir)) errAbort("given qNibDir argument: '%s' is not a 2bit file or a directory\n", qNibDir); loadIfNewSeq(qNibDir, qIsTwoBit, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand); } - if (optionExists("faT")) + if (tIsFa) { assert (tFaHash != NULL); - loadFaSeq(tFaHash, sp->tName, '+', &tName, &tSeq, &tStrand, tNibDir); + loadFaSeq(tFaHash, sp->tName, &tName, &tSeq, tNibDir); } else { if (! tIsTwoBit && ! isDirectory(tNibDir)) errAbort("given tNibDir argument: '%s' is not a 2bit file or a directory\n", tNibDir); loadIfNewSeq(tNibDir, tIsTwoBit, sp->tName, '+', &tName, &tSeq, &tStrand); } chainPair(sp, qSeq, tSeq, &chainList, details); + if (qIsFa && sp->qStrand == '-') + reverseComplement(qSeq->dna, qSeq->size); } slSort(&chainList, chainCmpScore); for (chain = chainList; chain != NULL; chain = chain->next) { assert(chain->qStart == chain->blockList->qStart && chain->tStart == chain->blockList->tStart); chainWrite(chain, f); } carefulClose(&f); } struct gapCalc *gapCalcReadOrDefault(char *fileName) /* Return gaps from file, or default if fileName is NULL. */ {