e640dce06376856492a074bcc8676267e8943584 angie Mon Jun 4 15:18:44 2018 -0700 lrgReconstructSequence also needed updates to handle partial LRG mappings. sorta refs #13359 #17877 diff --git src/hg/lib/lrg.c src/hg/lib/lrg.c index f72dd50..ffb13f3 100644 --- src/hg/lib/lrg.c +++ src/hg/lib/lrg.c @@ -325,61 +325,74 @@ void lrgDiffFreeList(struct lrgDiff **pDiff) /* Free up a list of parsed diffs. */ { if (pDiff == NULL || *pDiff == NULL) return; struct lrgDiff *diff; for (diff = *pDiff; diff != NULL; diff = diff->next) { freeMem(diff->chromSeq); freeMem(diff->lrgSeq); } slFreeList(pDiff); } +static boolean lrgNameParseRange(char *lrgName, int lrgSize, char *buf, size_t bufSize, + int *retStart, int *retEnd) +/* If lrgName is of the format name:start-end, write the name into buf, set retStart and retEnd + * and return TRUE. Otherwise leave inputs alone and return FALSE. */ +{ +if (strchr(lrgName, ':')) + { + // Parse lrgStart and lrgEnd out of name field (LRG_*:start-end) + safecpy(buf, bufSize, lrgName); + char *p = strchr(buf, ':'); + // Chop at : (after name) + *p++ = '\0'; + char *num = p; + p = strchr(num, '-'); + if (p == NULL) + errAbort("Error parsing LRG name:start-end '%s' -- no '-' following ':'", lrgName); + *p++ = '\0'; + *retStart = atoi(num) - 1; + num = p; + *retEnd = atoi(num); + if (*retStart < 0 || *retStart >= *retEnd || *retEnd < 0 || *retEnd > lrgSize) + errAbort("lrgNameParseRange: got bad coords (%d, %d) from '%s' size %d", + *retStart, *retEnd, lrgName, lrgSize); + return TRUE; + } +return FALSE; +} + static int calcBlockSize(uint nextTStart, uint nextQStart, struct psl *psl, int blockIx) { int tLen = nextTStart - psl->tStarts[blockIx]; int qLen = nextQStart - psl->qStarts[blockIx]; return min(tLen, qLen); } struct psl *lrgToPsl(struct lrg *lrg, uint chromSize) /* Use lrg's mismatches and indels to make a PSL. */ { struct lrgDiff *mismatches = lrgParseMismatches(lrg), *indels = lrgParseIndels(lrg); int lrgStart = 0, lrgEnd = lrg->lrgSize; char *lrgName = lrg->name; char lrgNameBuf[strlen(lrgName)+1]; -if (strchr(lrgName, ':')) - { - // Parse lrgStart and lrgEnd out of name field (LRG_*:start-end) - safecpy(lrgNameBuf, sizeof(lrgNameBuf), lrgName); - char *p = strchr(lrgNameBuf, ':'); - // Chop at : (after name) - *p++ = '\0'; - char *num = p; - p = strchr(num, '-'); - if (p == NULL) - errAbort("Error parsing LRG name:start-end '%s' -- no '-' following ':'", lrg->name); - *p++ = '\0'; - lrgStart = atoi(num) - 1; - num = p; - lrgEnd = atoi(num); +if (lrgNameParseRange(lrgName, lrg->lrgSize, lrgNameBuf, sizeof(lrgNameBuf), &lrgStart, &lrgEnd)) lrgName = lrgNameBuf; - } int blockCount = slCount(indels) + 1; unsigned opts = 0; struct psl *psl = pslNew(lrgName, lrg->lrgSize, lrgStart, lrgEnd, lrg->chrom, chromSize, lrg->chromStart, lrg->chromEnd, lrg->strand, blockCount, opts); psl->blockCount = blockCount; psl->misMatch = slCount(mismatches); boolean isRc = (lrg->strand[0] == '-'); // Translate gap coords from indels into block coords: psl->qStarts[0] = isRc ? (lrg->lrgSize - lrgEnd) : lrgStart; psl->tStarts[0] = lrg->chromStart; int alignedBaseCount = 0; int blockIx = 1; struct lrgDiff *diff; for (diff = indels; diff != NULL; diff = diff->next) @@ -419,70 +432,97 @@ /* Return the sum of insertions of lrg into reference sequence, ignoring deletions. * That is the max headroom that we will need while shifting bases around. */ { int sumIns = 0; struct lrgDiff *diff; for (diff = indels; diff != NULL; diff = diff->next) sumIns += (diff->lrgEnd - diff->lrgStart); return sumIns; } struct dnaSeq *lrgReconstructSequence(struct lrg *lrg, char *db) /* Use genomic sequence, lrg->mismatches and lrg->indels to reconstruct LRG sequence */ { struct lrgDiff *diff, *indels = lrgParseIndels(lrg); int refSize = lrg->chromEnd - lrg->chromStart; -// Fetch genomic sequence plus extra headroom for insertions (if any): -int seqSize = refSize + sumInsertions(indels); -struct dnaSeq *lrgSeq = hDnaFromSeq(db, lrg->chrom, lrg->chromStart, lrg->chromStart+seqSize, - dnaUpper); -char *lrgSeqDna = lrgSeq->dna; -if (seqSize > refSize) - zeroBytes(lrgSeqDna+refSize, seqSize-refSize); +// Fetch genomic sequence: +struct dnaSeq *lrgSeq = hDnaFromSeq(db, lrg->chrom, lrg->chromStart, lrg->chromEnd, dnaUpper); boolean isRc = (lrg->strand[0] == '-'); if (isRc) - reverseComplement(lrgSeqDna, refSize); -// Go through indels backwards w.r.t. genome assembly so we move sequence to the right + reverseComplement(lrgSeq->dna, refSize); +// If this was only a partial mapping of the LRG, pad N's at the beginning and/or end to get +// the correct size. +int lrgStart = 0, lrgEnd = lrg->lrgSize; +char lrgNameBuf[strlen(lrg->name)+1]; +lrgNameParseRange(lrg->name, lrg->lrgSize, lrgNameBuf, sizeof(lrgNameBuf), &lrgStart, &lrgEnd); +// Replace lrgSeq->dna with a larger version that has room for genomic sequence plus inserted bases +// as well as possible padding at start and end for partial LRG mappings +int addedBases = sumInsertions(indels); +int paddedSize = refSize + lrgStart + (lrg->lrgSize - lrgEnd) + addedBases + 1; +char *lrgSeqDna = needMem(paddedSize); +if (lrgStart > 0) + memset(lrgSeqDna, 'N', lrgStart); +// Copy in the genomic data +safencpy(lrgSeqDna+lrgStart, paddedSize-lrgStart, lrgSeq->dna, refSize); +refSize += lrgStart; +// If the LRG has inserted bases, zero out some extra bytes after genomic seq +if (addedBases) + zeroBytes(lrgSeqDna+refSize, addedBases); +freeMem(lrgSeq->dna); +lrgSeq->dna = lrgSeqDna; +// Go through indels backwards w.r.t. LRG so we move sequence to the right // while not changing coords to the left: if (!isRc) slReverse(&indels); for (diff = indels; diff != NULL; diff = diff->next) { int lrgLen = diff->lrgEnd - diff->lrgStart; int refLen = diff->chromEnd - diff->chromStart; - int refStart = diff->chromStart - lrg->chromStart; + int refStart = diff->chromStart - lrg->chromStart + lrgStart; if (isRc) refStart = refSize - (refStart + refLen); if (lrgLen > refLen) { // LRG inserts sequence: shift the rest of the sequence to the right int insSize = lrgLen - refLen; - int moveSize = seqSize - refStart - insSize; + int moveSize = refSize + addedBases - refStart - insSize; memmove(lrgSeqDna+refStart+insSize, lrgSeqDna+refStart, moveSize); } else { // LRG deletes sequence: shift the rest of the sequence to the left int delSize = refLen - lrgLen; - int moveSize = seqSize - refStart - delSize + 1; // '\0' at end too + int moveSize = refSize + addedBases - refStart - delSize + 1; // '\0' at end too memmove(lrgSeqDna+refStart, lrgSeqDna+refStart+delSize, moveSize); } // If there is LRG sequence, copy it in. if (lrgLen > 0) memcpy(lrgSeqDna+refStart, diff->lrgSeq, lrgLen); } +int len = strlen(lrgSeqDna); +if (len != lrgEnd) + errAbort("lrgReconstructSequence: expected sequence length to be lrgEnd %d, got %d", + lrgEnd, len); // Now that indels have been resolved, use LRG coords to substitute mismatches: struct lrgDiff *mismatches = lrgParseMismatches(lrg); for (diff = mismatches; diff != NULL; diff = diff->next) { if (lrgSeqDna[diff->lrgStart] != diff->chromSeq[0]) errAbort("Mismatch with unexpected assembly sequence: expected '%c' on %c strand, " "got '%c'", diff->chromSeq[0], lrg->strand[0], lrgSeqDna[diff->lrgStart]); int size = diff->lrgEnd - diff->lrgStart; memcpy(lrgSeqDna+diff->lrgStart, diff->lrgSeq, size); } -if (strlen(lrgSeqDna) != lrg->lrgSize) - errAbort("maybeGetSeqUpper: Error applying LRG indels for '%s'", lrg->name); +// Add padding N's at end if applicable +if (lrgEnd < lrg->lrgSize) + { + memset(lrgSeqDna+lrgEnd, 'N', lrg->lrgSize - lrgEnd); + lrgSeqDna[lrg->lrgSize] = '\0'; + } +len = strlen(lrgSeqDna); +if (len != lrg->lrgSize) + errAbort("lrgReconstructSequence: Error applying LRG indels for '%s': expected size %d, got %d", + lrg->name, lrg->lrgSize, len); lrgSeq->size = lrg->lrgSize; return lrgSeq; }