e640dce06376856492a074bcc8676267e8943584
angie
  Mon Jun 4 15:18:44 2018 -0700
lrgReconstructSequence also needed updates to handle partial LRG mappings.  sorta refs #13359 #17877

diff --git src/hg/lib/lrg.c src/hg/lib/lrg.c
index f72dd50..ffb13f3 100644
--- src/hg/lib/lrg.c
+++ src/hg/lib/lrg.c
@@ -325,61 +325,74 @@
 
 void lrgDiffFreeList(struct lrgDiff **pDiff)
 /* Free up a list of parsed diffs. */
 {
 if (pDiff == NULL || *pDiff == NULL)
     return;
 struct lrgDiff *diff;
 for (diff = *pDiff;  diff != NULL;  diff = diff->next)
     {
     freeMem(diff->chromSeq);
     freeMem(diff->lrgSeq);
     }
 slFreeList(pDiff);
 }
 
+static boolean lrgNameParseRange(char *lrgName, int lrgSize, char *buf, size_t bufSize,
+                                 int *retStart, int *retEnd)
+/* If lrgName is of the format name:start-end, write the name into buf, set retStart and retEnd
+ * and return TRUE.  Otherwise leave inputs alone and return FALSE. */
+{
+if (strchr(lrgName, ':'))
+    {
+    // Parse lrgStart and lrgEnd out of name field (LRG_*:start-end)
+    safecpy(buf, bufSize, lrgName);
+    char *p = strchr(buf, ':');
+    // Chop at : (after name)
+    *p++ = '\0';
+    char *num = p;
+    p = strchr(num, '-');
+    if (p == NULL)
+        errAbort("Error parsing LRG name:start-end '%s' -- no '-' following ':'", lrgName);
+    *p++ = '\0';
+    *retStart = atoi(num) - 1;
+    num = p;
+    *retEnd = atoi(num);
+    if (*retStart < 0 || *retStart >= *retEnd || *retEnd < 0 || *retEnd > lrgSize)
+        errAbort("lrgNameParseRange: got bad coords (%d, %d) from '%s' size %d",
+                 *retStart, *retEnd, lrgName, lrgSize);
+    return TRUE;
+    }
+return FALSE;
+}
+
 static int calcBlockSize(uint nextTStart, uint nextQStart, struct psl *psl, int blockIx)
 {
 int tLen = nextTStart - psl->tStarts[blockIx];
 int qLen = nextQStart - psl->qStarts[blockIx];
 return min(tLen, qLen);
 }
 
 struct psl *lrgToPsl(struct lrg *lrg, uint chromSize)
 /* Use lrg's mismatches and indels to make a PSL. */
 {
 struct lrgDiff *mismatches = lrgParseMismatches(lrg), *indels = lrgParseIndels(lrg);
 int lrgStart = 0, lrgEnd = lrg->lrgSize;
 char *lrgName = lrg->name;
 char lrgNameBuf[strlen(lrgName)+1];
-if (strchr(lrgName, ':'))
-    {
-    // Parse lrgStart and lrgEnd out of name field (LRG_*:start-end)
-    safecpy(lrgNameBuf, sizeof(lrgNameBuf), lrgName);
-    char *p = strchr(lrgNameBuf, ':');
-    // Chop at : (after name)
-    *p++ = '\0';
-    char *num = p;
-    p = strchr(num, '-');
-    if (p == NULL)
-        errAbort("Error parsing LRG name:start-end '%s' -- no '-' following ':'", lrg->name);
-    *p++ = '\0';
-    lrgStart = atoi(num) - 1;
-    num = p;
-    lrgEnd = atoi(num);
+if (lrgNameParseRange(lrgName, lrg->lrgSize, lrgNameBuf, sizeof(lrgNameBuf), &lrgStart, &lrgEnd))
     lrgName = lrgNameBuf;
-    }
 int blockCount = slCount(indels) + 1;
 unsigned opts = 0;
 struct psl *psl = pslNew(lrgName, lrg->lrgSize, lrgStart, lrgEnd,
                          lrg->chrom, chromSize, lrg->chromStart, lrg->chromEnd,
                          lrg->strand, blockCount, opts);
 psl->blockCount = blockCount;
 psl->misMatch = slCount(mismatches);
 boolean isRc = (lrg->strand[0] == '-');
 // Translate gap coords from indels into block coords:
 psl->qStarts[0] = isRc ? (lrg->lrgSize - lrgEnd) : lrgStart;
 psl->tStarts[0] = lrg->chromStart;
 int alignedBaseCount = 0;
 int blockIx = 1;
 struct lrgDiff *diff;
 for (diff = indels;  diff != NULL;  diff = diff->next)
@@ -419,70 +432,97 @@
 /* Return the sum of insertions of lrg into reference sequence, ignoring deletions.
  * That is the max headroom that we will need while shifting bases around. */
 {
 int sumIns = 0;
 struct lrgDiff *diff;
 for (diff = indels;  diff != NULL;  diff = diff->next)
     sumIns += (diff->lrgEnd - diff->lrgStart);
 return sumIns;
 }
 
 struct dnaSeq *lrgReconstructSequence(struct lrg *lrg, char *db)
 /* Use genomic sequence, lrg->mismatches and lrg->indels to reconstruct LRG sequence */
 {
 struct lrgDiff *diff, *indels = lrgParseIndels(lrg);
 int refSize = lrg->chromEnd - lrg->chromStart;
-// Fetch genomic sequence plus extra headroom for insertions (if any):
-int seqSize = refSize + sumInsertions(indels);
-struct dnaSeq *lrgSeq = hDnaFromSeq(db, lrg->chrom, lrg->chromStart, lrg->chromStart+seqSize,
-				    dnaUpper);
-char *lrgSeqDna = lrgSeq->dna;
-if (seqSize > refSize)
-    zeroBytes(lrgSeqDna+refSize, seqSize-refSize);
+// Fetch genomic sequence:
+struct dnaSeq *lrgSeq = hDnaFromSeq(db, lrg->chrom, lrg->chromStart, lrg->chromEnd, dnaUpper);
 boolean isRc = (lrg->strand[0] == '-');
 if (isRc)
-    reverseComplement(lrgSeqDna, refSize);
-// Go through indels backwards w.r.t. genome assembly so we move sequence to the right
+    reverseComplement(lrgSeq->dna, refSize);
+// If this was only a partial mapping of the LRG, pad N's at the beginning and/or end to get
+// the correct size.
+int lrgStart = 0, lrgEnd = lrg->lrgSize;
+char lrgNameBuf[strlen(lrg->name)+1];
+lrgNameParseRange(lrg->name, lrg->lrgSize, lrgNameBuf, sizeof(lrgNameBuf), &lrgStart, &lrgEnd);
+// Replace lrgSeq->dna with a larger version that has room for genomic sequence plus inserted bases
+// as well as possible padding at start and end for partial LRG mappings
+int addedBases = sumInsertions(indels);
+int paddedSize = refSize + lrgStart + (lrg->lrgSize - lrgEnd) + addedBases + 1;
+char *lrgSeqDna = needMem(paddedSize);
+if (lrgStart > 0)
+    memset(lrgSeqDna, 'N', lrgStart);
+// Copy in the genomic data
+safencpy(lrgSeqDna+lrgStart, paddedSize-lrgStart, lrgSeq->dna, refSize);
+refSize += lrgStart;
+// If the LRG has inserted bases, zero out some extra bytes after genomic seq
+if (addedBases)
+    zeroBytes(lrgSeqDna+refSize, addedBases);
+freeMem(lrgSeq->dna);
+lrgSeq->dna = lrgSeqDna;
+// Go through indels backwards w.r.t. LRG so we move sequence to the right
 // while not changing coords to the left:
 if (!isRc)
     slReverse(&indels);
 for (diff = indels;  diff != NULL;  diff = diff->next)
     {
     int lrgLen = diff->lrgEnd - diff->lrgStart;
     int refLen = diff->chromEnd - diff->chromStart;
-    int refStart = diff->chromStart - lrg->chromStart;
+    int refStart = diff->chromStart - lrg->chromStart + lrgStart;
     if (isRc)
 	refStart = refSize - (refStart + refLen);
     if (lrgLen > refLen)
 	{
 	// LRG inserts sequence: shift the rest of the sequence to the right
 	int insSize = lrgLen - refLen;
-	int moveSize = seqSize - refStart - insSize;
+	int moveSize = refSize + addedBases - refStart - insSize;
 	memmove(lrgSeqDna+refStart+insSize, lrgSeqDna+refStart, moveSize);
 	}
     else
 	{
 	// LRG deletes sequence: shift the rest of the sequence to the left
 	int delSize = refLen - lrgLen;
-	int moveSize = seqSize - refStart - delSize + 1; // '\0' at end too
+	int moveSize = refSize + addedBases - refStart - delSize + 1; // '\0' at end too
 	memmove(lrgSeqDna+refStart, lrgSeqDna+refStart+delSize, moveSize);
 	}
     // If there is LRG sequence, copy it in.
     if (lrgLen > 0)
 	memcpy(lrgSeqDna+refStart, diff->lrgSeq, lrgLen);
     }
+int len = strlen(lrgSeqDna);
+if (len != lrgEnd)
+    errAbort("lrgReconstructSequence: expected sequence length to be lrgEnd %d, got %d",
+             lrgEnd, len);
 // Now that indels have been resolved, use LRG coords to substitute mismatches:
 struct lrgDiff *mismatches = lrgParseMismatches(lrg);
 for (diff = mismatches;  diff != NULL;  diff = diff->next)
     {
     if (lrgSeqDna[diff->lrgStart] != diff->chromSeq[0])
 	errAbort("Mismatch with unexpected assembly sequence: expected '%c' on %c strand, "
 		 "got '%c'", diff->chromSeq[0], lrg->strand[0], lrgSeqDna[diff->lrgStart]);
     int size = diff->lrgEnd - diff->lrgStart;
     memcpy(lrgSeqDna+diff->lrgStart, diff->lrgSeq, size);
     }
-if (strlen(lrgSeqDna) != lrg->lrgSize)
-    errAbort("maybeGetSeqUpper: Error applying LRG indels for '%s'", lrg->name);
+// Add padding N's at end if applicable
+if (lrgEnd < lrg->lrgSize)
+    {
+    memset(lrgSeqDna+lrgEnd, 'N', lrg->lrgSize - lrgEnd);
+    lrgSeqDna[lrg->lrgSize] = '\0';
+    }
+len = strlen(lrgSeqDna);
+if (len != lrg->lrgSize)
+    errAbort("lrgReconstructSequence: Error applying LRG indels for '%s': expected size %d, got %d",
+             lrg->name, lrg->lrgSize, len);
 lrgSeq->size = lrg->lrgSize;
 return lrgSeq;
 }