d7ff43fdbd408928d1d0b66a1f791e4635032c47
markd
  Sun Feb 20 08:44:04 2022 -0800
add program to take repeat masker .align.tsv files and convert to PSL for use with pslMap

diff --git src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
new file mode 100644
index 0000000..9f67c94
--- /dev/null
+++ src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
@@ -0,0 +1,586 @@
+/* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */
+#include "common.h"
+#include "linefile.h"
+#include "rmskAlign.h"
+#include "psl.h"
+#include "options.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "rmskAlignToPsl - convert repeatmasker alignments to PSLs\n"
+  "\n"
+  "usage:\n"
+  "   rmskAlignToPsl rmskAlignTab rmskPslFile\n"
+  "\n"
+  "  -dump - print alignments to stdout for debugging purposes\n"
+  "\n"
+  "This convert *.fa.align.tsv file, created by\n"
+  "RepeatMasker/util/rmToUCSCTables.pl into a PSL file.\n"
+  "Non-TE Repeats without consensus sequence are not included.\n"
+  );
+}
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+   {"dump", OPTION_BOOLEAN},
+   {NULL, 0},
+};
+
+static boolean dump = FALSE;
+
+/*
+ * rmskAlign notes
+ * 
+ *  - genoStart - genoEnd are zero-base, half-open
+ *  - repStart, repEnd - are one-based
+ *  - rmskAlign records are multi-part, grouped by id.  
+ *  - A zero-repeat length record in the middle of group
+ *    will have repStart == repEnd, appearing to have a length of 1.
+ *    It appears these all have a alignment starting with '-'
+ * 
+ * Alignment string format:
+ *
+ * from hg/hgc/rmskJoinedClick.c
+ * Print RepeatMasker alignment data stored in RM's cAlign format.
+ * The format is basically a lightly compressed diff format where
+ * the query and subject are merged into one squence line. The
+ * runs of exact matching sequences are interrupted by either
+ * single base substitutions annotated as queryBase "/" subjectBase,
+ * insertions in the subject annotated as "+ACTAT+", or deletions
+ * in the query annotated as "-ACTTTG-".
+ *
+ * Alignment parts must be decoded separately and then concatenated.
+ * as blocks starting with a '-' or '+' maybe terminated by the end of the string
+ *
+ * Alignments parts may overlap and must then be split.  A given id might
+ * have multiple repNames and are split into multiple PSL.
+ */
+
+static unsigned getGenomeSize(struct rmskAlign *alignParts)
+/* get the length or the chromosome */
+{
+return alignParts->genoEnd + alignParts->genoLeft;
+}
+
+static unsigned getRepeatSize(struct rmskAlign *alignParts)
+/* Get the length of repeat sequence.  This is not all that easy. When a
+ * repeat is split, CrossMatch reports the same repLeft for other alignments,
+ * at least in some cases.  RepeatMasker compensates for this when creating
+ * the .out. However, rmToUCSCTables.pl doesn't correct for this.  So we just
+ * take the largest length from all alignments.
+ *
+ * Example .align problem:
+ * 103 20.51 0.00 9.86 chr1 147061 147161 (248809261) C AluJr#SINE/Alu (146) 166 70 [m_b3s356i8] 1
+ * 103 20.51 0.00 9.86 chr1 147481 147535 (248808887) C AluJr#SINE/Alu (146) 69 25 [m_b3s356i8] 1
+ */
+{
+unsigned size = 0;
+for (struct rmskAlign *rmskAlign = alignParts; rmskAlign != NULL; rmskAlign = rmskAlign->next)
+    size = max(rmskAlign->repEnd + rmskAlign->repLeft, size);
+return size;
+}
+
+static void rmskAlignToPairwise(struct rmskAlign *rmskAlign,
+                                char *tStr, char *qStr)
+/* convert a rmsk alignment string to a pairwise string to print.  Output
+ * tStr/qStr should be allocated to the total alignment string from all parts
+ * plus one. */
+{
+/// see 'Alignment string format' above
+char *t = tStr, *q = qStr;
+char *a = rmskAlign->alignment;
+while (*a != '\0')
+    {
+    if (*a == '+')
+        {
+        // query insert
+        a++;
+        while ((*a != '+') && (*a != '\0'))
+            {
+            *t++ = '-';
+            *q++ = *a;
+            a++;
+            }
+        }
+    else if (*a == '-')
+        {
+        // target insert
+        a++;
+        while ((*a != '-') && (*a != '\0'))
+            {
+            *t++ = *a;
+            *q++ = '-';
+            a++;
+            }
+        }
+    else
+        {
+        // aligned
+        if (*(a+1) == '/')
+            {
+            *t++ = *a++;
+            a++;  // skip '/'
+            *q++ = *a;
+            }
+        else
+            {
+            *t++ = *a;
+            *q++ = *a;
+            }
+        }
+    if (*a == '\0')
+        break;
+    a++;
+    }
+*t = *q = '\0';
+assert(strlen(tStr) == strlen(qStr));
+assert(strlen(tStr) <= strlen(rmskAlign->alignment));
+assert(strlen(qStr) <= strlen(rmskAlign->alignment));
+}
+
+static void rmskAlignPrint(struct rmskAlign *alignPart,
+                           struct rmskAlign *alignParts,
+                           FILE *fh)
+/* print out pairwise rmskAlign with alignment string decoded
+ * for debugging */
+{
+int alnLen = strlen(alignPart->alignment);
+char tStr[alnLen + 1];
+char qStr[alnLen + 1];
+rmskAlignToPairwise(alignPart, tStr, qStr);
+
+rmskAlignTabOut(alignPart, fh);
+fprintf(fh, "    cAlign: %s\n", alignPart->alignment);
+char *fmt = "    %-8s %8d %8d |%d| [%s]: %s\n";
+fprintf(fh, fmt, alignPart->genoName, alignPart->genoStart, alignPart->genoEnd,
+        alignPart->genoEnd + alignPart->genoLeft,  "+", tStr);
+fprintf(fh, fmt, alignPart->repName, alignPart->repStart, alignPart->repEnd,
+        getRepeatSize(alignParts), alignPart->strand, qStr);
+}
+
+static void rmskAlignListPrint(struct rmskAlign *alignParts,
+                               FILE *fh)
+/* print out alignments in list */
+{
+for (struct rmskAlign *ap = alignParts; ap != NULL; ap = ap->next)
+    rmskAlignPrint(ap, alignParts, fh);
+}
+
+static boolean shouldConvert(struct rmskAlign *rmskAlign)
+/* does this repeat type have a consensus and should be converted? */
+{
+return !sameString(rmskAlign->repClass, "Simple_repeat");
+}
+
+static struct rmskAlign **groupRmskAligns(struct rmskAlign **rmskAlignsPtr,
+                                          unsigned *maxAlignIdPtr)
+/* build array of alignment by ids */
+{
+unsigned maxAlignId = 0;
+for (struct rmskAlign *ra = *rmskAlignsPtr; ra != NULL; ra = ra->next)
+    maxAlignId = max(maxAlignId, ra->id);
+
+struct rmskAlign **rmskAlignsById = AllocN(struct rmskAlign *, maxAlignId + 1);
+for (struct rmskAlign *ra = slPopHead(rmskAlignsPtr); ra != NULL; ra = slPopHead(rmskAlignsPtr))
+    slAddTail((&rmskAlignsById[ra->id]), ra);  // keep in genomic order
+
+*maxAlignIdPtr = maxAlignId;
+return rmskAlignsById;
+}
+
+int rmskAlignGenomeCmp(const void *va, const void *vb)
+/* sort in ascending genomic location  */
+{
+const struct rmskAlign *a = *((struct rmskAlign **)va);
+const struct rmskAlign *b = *((struct rmskAlign **)vb);
+int diff = strcmp(a->genoName, b->genoName);
+if (diff != 0)
+    return diff;
+return a->genoStart - b->genoStart;
+}
+
+static boolean rmskLinear(struct rmskAlign *prev,
+                          struct rmskAlign *next)
+/* are the two alignments linear? assumes genomic sort */
+{
+// rmskAlign 1-based and not in strand-specific order
+if (prev->strand[0] == '+')
+    return next->repStart >= prev->repEnd;
+else
+    return prev->repEnd > next->repStart;
+}
+
+static boolean rmskOverlap(struct rmskAlign *rm1,
+                           struct rmskAlign *rm2)
+/* do they overlap in genomic or repeat coordinates */
+{
+return ((rm1->genoStart < rm2->genoEnd) && (rm1->genoEnd > rm2->genoStart))
+    || ((rm1->repStart - 1 < rm2->repEnd) && (rm1->repEnd > rm2->repStart - 1));
+}
+
+static boolean anyOverlap(struct rmskAlign *alignNext,
+                          struct rmskAlign *alignParts)
+/* Check if the next entry overlaps any of the parts in genomic
+ * or repeat */
+{
+for (struct rmskAlign *ap = alignParts; ap != NULL; ap = ap->next)
+    {
+    if (rmskOverlap(ap, alignNext))
+        return TRUE;
+    }
+return FALSE;
+}
+
+static boolean poppedAlignProcess(struct rmskAlign *nextAlign,
+                                  struct rmskAlign **alignPartsPtr,
+                                  struct rmskAlign **skippedAlignsPtr)
+/* Do something with alignment from a group, either keep or add to skped list.
+, Return FALSE if we should not look for any more for this PSL */
+{
+struct rmskAlign *prevAlign = *alignPartsPtr;  // previous saved
+if (!(sameString(nextAlign->repName, prevAlign->repName)
+      && sameString(nextAlign->strand, prevAlign->strand)))
+    {
+    // different name, strand not, can't be in same PSL
+    slAddHead(skippedAlignsPtr, nextAlign);
+    return TRUE;
+    }
+if ((!rmskLinear(prevAlign, nextAlign)) || anyOverlap(nextAlign, *alignPartsPtr))
+    {
+    // hit an overlap or non-linear, give up on this PSL
+    slAddHead(skippedAlignsPtr, nextAlign);
+    return FALSE;
+    }
+
+slAddHead(alignPartsPtr, nextAlign);
+return TRUE;
+}
+
+static struct rmskAlign *popAlignParts(struct rmskAlign **rmskAlignsPtr)
+/* Remove alignments from a id group that can be turned into a PSL.  Stop if
+ * an overlapping alignments, name mismatch, strand mismatch, or non-linear
+ * alignment is found. These are returned in the next call.  Alignments in a
+ * give repeat masker alignment set may not be linear in some cases.
+ * Results will in genomic order (as must be input). */
+{
+if (*rmskAlignsPtr == NULL)
+    return NULL;
+
+// first part
+struct rmskAlign *alignParts = slPopHead(rmskAlignsPtr);
+struct rmskAlign *skippedAligns = NULL;
+
+// find other parts
+while (*rmskAlignsPtr != NULL)
+    {
+    struct rmskAlign *nextAlign = slPopHead(rmskAlignsPtr);
+    if (!poppedAlignProcess(nextAlign, &alignParts, &skippedAligns))
+        break;
+    }
+
+// restore skipped
+slReverse(&skippedAligns);
+*rmskAlignsPtr = slCat(skippedAligns, *rmskAlignsPtr);
+
+slReverse(&alignParts);
+return alignParts;
+}
+
+struct blkCoord
+/* coordinates and count of bases in a block or gap from alignment string */
+{
+    struct blkCoord *next;
+    int qStart;           // query coords
+    int qSize;            // zero if tInsert
+    int tStart;           // target coords
+    int tSize;            // zero if qInsert
+    unsigned matches;     // match count
+    unsigned mismatches;  // mismatch count
+};
+
+static struct blkCoord *blkCoordNew(int qStart, int tStart)
+/* allocate a new blkCoord object */
+{
+struct blkCoord *cnts;
+AllocVar(cnts);
+cnts->qStart = qStart;
+cnts->tStart = tStart;
+return cnts;
+}
+
+static struct blkCoord *blkCoordNewAdd(struct blkCoord **bcList,
+                                       int qStart, int tStart)
+/* allocate a new blkCoord object and add to the list */
+{
+struct blkCoord *cnts = blkCoordNew(qStart, tStart);
+slAddHead(bcList, cnts);
+return cnts;
+}
+
+static void blkCoordListPrint(struct blkCoord *blkCoords,
+                              FILE *fh)
+/* print a list of blkCoord for debugging */
+{
+fprintf(fh, "Blocks:\n");
+int i = 0;
+for (struct blkCoord *blk = blkCoords; blk != NULL; blk = blk->next, i++)
+    fprintf(fh, "\t%d %d-%d [%d] <=> %d-%d [%d]: %u %u\n", i,
+            blk->qStart, blk->qStart + blk->qSize, blk->qSize,
+            blk->tStart, blk->tStart + blk->tSize, blk->tSize,
+            blk->matches, blk->mismatches);
+}
+
+static struct blkCoord *parseCAlign(struct rmskAlign *rmskAlign,
+                                    unsigned repSize)
+/* Parse an alignment into list of counts. Warn and return NULL if the parse
+ * alignment is invalid */
+{
+struct blkCoord *blkCoords = NULL;
+struct blkCoord *cur = NULL;
+int tNext = rmskAlign->genoStart;
+int tEnd = rmskAlign->genoEnd;
+int qNext = rmskAlign->repStart - 1;
+int qEnd = rmskAlign->repEnd;
+if (rmskAlign->strand[0] == '-')
+    reverseIntRange(&qNext, &qEnd, repSize);
+char *a = rmskAlign->alignment;
+while (*a != '\0')
+    {
+    if (*a == '+')
+        {
+        // query insert
+        a++;
+        cur = blkCoordNewAdd(&blkCoords, qNext, tNext);
+        while ((*a != '+') && (*a != '\0'))
+            {
+            qNext++;
+            cur->qSize++;
+            a++;
+            }
+        cur = NULL;
+        }
+    else if (*a == '-')
+        {
+        // target insert
+        a++;
+        cur = blkCoordNewAdd(&blkCoords, qNext, tNext);
+        while ((*a != '-') && (*a != '\0'))
+            {
+            tNext++;
+            cur->tSize++;
+            a++;
+            }
+        cur = NULL;
+        }
+    else
+        {
+        // aligned
+        if (cur == NULL)
+            cur = blkCoordNewAdd(&blkCoords, qNext, tNext);
+        if (*(a+1) == '/')
+            {
+            cur->mismatches++;
+            a += 2;
+            }
+        else
+            {
+            cur->matches++;
+            }
+        cur->qSize++;
+        cur->tSize++;
+        tNext++;
+        qNext++;
+        }
+    if (*a == '\0')
+        break;
+    a++;
+    }
+slReverse(&blkCoords);
+if ((tNext > tEnd) || (qNext > qEnd))
+    {
+    fprintf(stderr, "Note: can't convert %s:%d-%d to %s:%d-%d, id %d, parsed alignment length exceeds bounds\n",
+            rmskAlign->repName, rmskAlign->repStart, rmskAlign->repEnd,
+            rmskAlign->genoName, rmskAlign->genoStart, rmskAlign->genoEnd, rmskAlign->id);
+    slFreeList(&blkCoords);
+    return NULL;
+    }
+
+return blkCoords;
+}
+
+static void trimLeadingIndels(struct blkCoord **blkCoordsPtr)
+/* drop blocks blocks with indels at the start */
+{
+struct blkCoord *blkCoords = *blkCoordsPtr;
+while ((blkCoords != NULL) && ((blkCoords->qSize == 0) || (blkCoords->tSize == 0)))
+    freeMem(slPopHead(&blkCoords));
+*blkCoordsPtr = blkCoords;
+}
+
+static struct blkCoord *parseCAligns(struct rmskAlign *alignParts)
+/* parses alignment strings from all parts to go into a PSL  */
+{
+unsigned repSize = getRepeatSize(alignParts);
+
+// build list
+struct blkCoord *blkCoords = NULL;
+for (struct rmskAlign *alignPart = alignParts; alignPart != NULL; alignPart = alignPart->next)
+    {
+    struct blkCoord *blkCoordsPart = parseCAlign(alignPart, repSize);
+    if (blkCoordsPart == NULL)
+        {
+        slFreeList(&blkCoords);
+        return NULL;
+        }
+    blkCoords = slCat(blkCoords, blkCoordsPart);
+    }
+
+// trim leading and trailing indels
+trimLeadingIndels(&blkCoords);
+slReverse(&blkCoords);
+trimLeadingIndels(&blkCoords);
+slReverse(&blkCoords);
+return blkCoords;
+}
+
+static void addPslBlocks(struct blkCoord *blkCoords,
+                         struct psl *psl,
+                         int *blockSpacePtr)
+/* add blocks to PSL */
+{
+for (struct blkCoord *blk = blkCoords; blk != NULL; blk = blk->next)
+    {
+    if (blk->qSize == 0)
+        {
+        psl->tNumInsert++;
+        psl->tBaseInsert += blk->tSize;
+        }
+    else if (blk->tSize == 0)
+        {
+        psl->qNumInsert++;
+        psl->qBaseInsert += blk->qSize;
+        }
+    else
+        {
+        assert(blk->qSize == blk->tSize);
+        psl->match += blk->matches;
+        psl->repMatch += blk->matches;
+        psl->misMatch += blk->mismatches;
+        if (psl->blockCount >= *blockSpacePtr)
+            pslGrow(psl, blockSpacePtr);
+        psl->tStarts[psl->blockCount] = blk->tStart;
+        psl->qStarts[psl->blockCount] = blk->qStart;
+        psl->blockSizes[psl->blockCount] = blk->tSize;
+        psl->blockCount += 1;
+        }
+    }
+}
+
+static struct psl *convertBlocksToPsl(struct rmskAlign *alignParts,
+                                      struct blkCoord *blkCoords)
+/* create a PSL from a repeat masker alignment */
+{
+/* must use coordinates from blocks, as end-indels have been trimmed */
+struct blkCoord *blkCountN = slLastEl(blkCoords);
+
+unsigned qSize = getRepeatSize(alignParts);
+unsigned qStart = blkCoords->qStart;
+unsigned qEnd = blkCountN->qStart + blkCountN->qSize;
+if (alignParts->strand[0] == '-')
+    reverseUnsignedRange(&qStart, &qEnd, qSize);
+
+unsigned tSize = getGenomeSize(alignParts);
+unsigned tStart = blkCoords->tStart;
+unsigned tEnd = blkCountN->tStart + blkCountN->tSize;
+
+
+int blockSpace = slCount(blkCoords);
+struct psl *psl = pslNew(alignParts->repName, qSize, qStart, qEnd,
+                         alignParts->genoName, tSize, tStart, tEnd, 
+                         alignParts->strand, blockSpace, 0);
+addPslBlocks(blkCoords, psl, &blockSpace);
+pslComputeInsertCounts(psl);
+slFreeList(&blkCoords);
+return psl;
+}
+
+static struct psl *convertToPsl(struct rmskAlign *alignParts)
+/* create a PSL from a repeat masker alignment, return NULL if fails */
+{
+struct blkCoord *blkCoords = parseCAligns(alignParts);
+if (blkCoords == NULL)
+    return NULL;
+
+if (dump)
+    blkCoordListPrint(blkCoords, stderr);
+
+return convertBlocksToPsl(alignParts, blkCoords);
+}
+
+static struct psl *alignToPsl(struct rmskAlign *alignParts)
+/* convert and output one set of alignment parts */
+{
+struct psl* psl = convertToPsl(alignParts);
+if ((psl != NULL) && (pslCheck("rmskAlign", stderr, psl) != 0))
+    {
+    fprintf(stderr, "Invalid PSL created\n");
+    pslTabOut(psl, stderr);
+    rmskAlignListPrint(alignParts, stderr);
+    errAbort("BUG invalid PSL created");
+    }
+return psl;
+}
+
+static void convertAlignGroup(struct rmskAlign **rmskAlignGroupPtr,
+                              FILE* outFh)
+/* convert an alignment group */
+{
+struct rmskAlign *alignParts;
+while ((alignParts = popAlignParts(rmskAlignGroupPtr)) != NULL)
+    {
+    if (dump)
+        rmskAlignListPrint(alignParts, stderr);
+
+    if (shouldConvert(alignParts))
+        {
+        struct psl *psl = alignToPsl(alignParts);
+        if (psl != NULL)
+            {
+            pslTabOut(psl, outFh);
+            pslFree(&psl);
+            }
+        }
+    }
+}
+
+
+static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile)
+/* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */
+{
+// load all, so we can join ones split by other insertions by id
+// don't bother freeing
+struct rmskAlign *rmskAligns = rmskAlignLoadAllByTab(rmskAlignFile);
+slSort(&rmskAligns, rmskAlignGenomeCmp);
+
+// get counts parts for each id
+unsigned maxAlignId;
+struct rmskAlign **rmskAlignGroups = groupRmskAligns(&rmskAligns, &maxAlignId);
+
+FILE* outFh = mustOpen(rmskPslFile, "w");
+for (unsigned id = 0; id <= maxAlignId; id++)
+    convertAlignGroup(&(rmskAlignGroups[id]), outFh);
+carefulClose(&outFh);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 3)
+    usage();
+dump = optionExists("dump");
+rmskAlignToPsl(argv[1], argv[2]);
+return 0;
+}