57d31ee9aede2bf610907285b05af8d9cbfe9bec
markd
  Sun Mar 13 10:45:03 2022 -0700
added ability to override incorrect sizes often found in RepeatMaster alignments

diff --git src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
index ebb039d..2c72ad4 100644
--- src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
+++ src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
@@ -1,44 +1,49 @@
 /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */
 #include "common.h"
 #include "linefile.h"
 #include "rmskAlign.h"
 #include "bigRmskAlignBed.h"
 #include "psl.h"
+#include "hash.h"
 #include "options.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "rmskAlignToPsl - convert repeatmasker alignments to PSLs\n"
   "\n"
   "usage:\n"
   "   rmskAlignToPsl rmskAlignTab rmskPslFile\n"
   "\n"
   "  -bigRmsk - input is the text version of bigRmskAlignBed files.\n"
+  "  -repSizes=tab - two column tab file with repeat name and size.\n"
+  "   Sometimes the repeat sizes are incorrect in the align file.\n"
+  "   Obtain the sizes from this file instead.\n"
   "  -dump - print alignments to stdout for debugging purposes\n"
   "\n"
   "This convert *.fa.align.tsv file, created by\n"
   "RepeatMasker/util/rmToUCSCTables.pl into a PSL file.\n"
   "Non-TE Repeats without consensus sequence are not included.\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"bigRmsk", OPTION_BOOLEAN},
+   {"repSizes", OPTION_STRING},
    {"dump", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 static boolean bigRmsk = FALSE;
 static boolean dump = FALSE;
 
 /*
  * rmskAlign notes
  * 
  *  - genoStart - genoEnd are zero-base, half-open
  *  - repStart, repEnd - are one-based
  *  - rmskAlign records are multi-part, grouped by id.  
  *  - A zero-repeat length record in the middle of group
  *    will have repStart == repEnd, appearing to have a length of 1.
@@ -50,54 +55,79 @@
  * Print RepeatMasker alignment data stored in RM's cAlign format.
  * The format is basically a lightly compressed diff format where
  * the query and subject are merged into one squence line. The
  * runs of exact matching sequences are interrupted by either
  * single base substitutions annotated as queryBase "/" subjectBase,
  * insertions in the subject annotated as "+ACTAT+", or deletions
  * in the query annotated as "-ACTTTG-".
  *
  * Alignment parts must be decoded separately and then concatenated.
  * as blocks starting with a '-' or '+' maybe terminated by the end of the string
  *
  * Alignments parts may overlap and must then be split.  A given id might
  * have multiple repNames and are split into multiple PSL.
  */
 
+
+static struct hash* loadRepSizes(char *repSizesFile)
+/* load repeat sizes file into a hash */
+{
+struct hash* repSizes = hashNew(20);
+struct lineFile* lf = lineFileOpen(repSizesFile, TRUE);
+char *row[2];
+while (lineFileNextRowTab(lf, row, ArraySize(row)))
+    hashAddInt(repSizes, row[0], lineFileNeedNum(lf, row, 1));
+lineFileClose(&lf);
+return repSizes;
+}
+
 static unsigned getGenomeSize(struct rmskAlign *alignParts)
 /* get the length or the chromosome */
 {
 return alignParts->genoEnd + alignParts->genoLeft;
 }
 
 static unsigned getRepeatSize(struct rmskAlign *alignParts)
 /* Get the length of repeat sequence.  This is not all that easy. When a
  * repeat is split, CrossMatch reports the same repLeft for other alignments,
  * at least in some cases.  RepeatMasker compensates for this when creating
  * the .out. However, rmToUCSCTables.pl doesn't correct for this.  So we just
  * take the largest length from all alignments.
  *
  * Example .align problem:
  * 103 20.51 0.00 9.86 chr1 147061 147161 (248809261) C AluJr#SINE/Alu (146) 166 70 [m_b3s356i8] 1
  * 103 20.51 0.00 9.86 chr1 147481 147535 (248808887) C AluJr#SINE/Alu (146) 69 25 [m_b3s356i8] 1
+ *
+ * Still this doesn't always get the correct size.
  */
 {
 unsigned size = 0;
 for (struct rmskAlign *rmskAlign = alignParts; rmskAlign != NULL; rmskAlign = rmskAlign->next)
     size = max(rmskAlign->repEnd + rmskAlign->repLeft, size);
 return size;
 }
 
+static unsigned lookupRepeatSize(struct rmskAlign *alignParts,
+                                 struct hash* repSizes)
+/* get repeat size from supplied table in available, otherwise from
+ * alignments */
+{
+return (repSizes != NULL)
+    ? hashIntVal(repSizes, alignParts->repName)
+    : getRepeatSize(alignParts);
+}
+
 static void rmskAlignToPairwise(struct rmskAlign *rmskAlign,
                                 char *tStr, char *qStr)
 /* convert a rmsk alignment string to a pairwise string to print.  Output
  * tStr/qStr should be allocated to the total alignment string from all parts
  * plus one. */
 {
 /// see 'Alignment string format' above
 char *t = tStr, *q = qStr;
 char *a = rmskAlign->alignment;
 while (*a != '\0')
     {
     if (*a == '+')
         {
         // query insert
         a++;
@@ -466,34 +496,35 @@
     return NULL;
     }
 
 return blkCoords;
 }
 
 static void trimLeadingIndels(struct blkCoord **blkCoordsPtr)
 /* drop blocks blocks with indels at the start */
 {
 struct blkCoord *blkCoords = *blkCoordsPtr;
 while ((blkCoords != NULL) && ((blkCoords->qSize == 0) || (blkCoords->tSize == 0)))
     freeMem(slPopHead(&blkCoords));
 *blkCoordsPtr = blkCoords;
 }
 
-static struct blkCoord *parseCAligns(struct rmskAlign *alignParts)
+static struct blkCoord *parseCAligns(struct rmskAlign *alignParts,
+                                     struct hash* repSizes)
 /* parses alignment strings from all parts to go into a PSL  */
 {
-unsigned repSize = getRepeatSize(alignParts);
+unsigned repSize = lookupRepeatSize(alignParts, repSizes);
 
 // build list
 struct blkCoord *blkCoords = NULL;
 for (struct rmskAlign *alignPart = alignParts; alignPart != NULL; alignPart = alignPart->next)
     {
     struct blkCoord *blkCoordsPart = parseCAlign(alignPart, repSize);
     if (blkCoordsPart == NULL)
         {
         slFreeList(&blkCoords);
         return NULL;
         }
     blkCoords = slCat(blkCoords, blkCoordsPart);
     }
 
 // trim leading and trailing indels
@@ -526,121 +557,128 @@
         assert(blk->qSize == blk->tSize);
         psl->match += blk->matches;
         psl->repMatch += blk->matches;
         psl->misMatch += blk->mismatches;
         if (psl->blockCount >= *blockSpacePtr)
             pslGrow(psl, blockSpacePtr);
         psl->tStarts[psl->blockCount] = blk->tStart;
         psl->qStarts[psl->blockCount] = blk->qStart;
         psl->blockSizes[psl->blockCount] = blk->tSize;
         psl->blockCount += 1;
         }
     }
 }
 
 static struct psl *convertBlocksToPsl(struct rmskAlign *alignParts,
-                                      struct blkCoord *blkCoords)
+                                      struct blkCoord *blkCoords,
+                                      struct hash* repSizes)
 /* create a PSL from a repeat masker alignment */
 {
 /* must use coordinates from blocks, as end-indels have been trimmed */
 struct blkCoord *blkCountN = slLastEl(blkCoords);
 
-unsigned qSize = getRepeatSize(alignParts);
+unsigned qSize = lookupRepeatSize(alignParts, repSizes);
 unsigned qStart = blkCoords->qStart;
 unsigned qEnd = blkCountN->qStart + blkCountN->qSize;
 if (alignParts->strand[0] == '-')
     reverseUnsignedRange(&qStart, &qEnd, qSize);
 
 unsigned tSize = getGenomeSize(alignParts);
 unsigned tStart = blkCoords->tStart;
 unsigned tEnd = blkCountN->tStart + blkCountN->tSize;
 
 
 int blockSpace = slCount(blkCoords);
 struct psl *psl = pslNew(alignParts->repName, qSize, qStart, qEnd,
                          alignParts->genoName, tSize, tStart, tEnd, 
                          alignParts->strand, blockSpace, 0);
 addPslBlocks(blkCoords, psl, &blockSpace);
 pslComputeInsertCounts(psl);
 slFreeList(&blkCoords);
 return psl;
 }
 
-static struct psl *convertToPsl(struct rmskAlign *alignParts)
+static struct psl *convertToPsl(struct rmskAlign *alignParts,
+                                struct hash* repSizes)
 /* create a PSL from a repeat masker alignment, return NULL if fails */
 {
-struct blkCoord *blkCoords = parseCAligns(alignParts);
+struct blkCoord *blkCoords = parseCAligns(alignParts, repSizes);
 if (blkCoords == NULL)
     return NULL;
 
 if (dump)
     blkCoordListPrint(blkCoords, stderr);
 
-return convertBlocksToPsl(alignParts, blkCoords);
+return convertBlocksToPsl(alignParts, blkCoords, repSizes);
 }
 
-static struct psl *alignToPsl(struct rmskAlign *alignParts)
+static struct psl *alignToPsl(struct rmskAlign *alignParts, struct hash* repSizes)
 /* convert and output one set of alignment parts */
 {
-struct psl* psl = convertToPsl(alignParts);
+struct psl* psl = convertToPsl(alignParts, repSizes);
 if ((psl != NULL) && (pslCheck("rmskAlign", stderr, psl) != 0))
     {
     fprintf(stderr, "Invalid PSL created\n");
     pslTabOut(psl, stderr);
     rmskAlignListPrint(alignParts, stderr);
     errAbort("BUG invalid PSL created");
     }
 return psl;
 }
 
 static void convertAlignGroup(struct rmskAlign **rmskAlignGroupPtr,
+                              struct hash* repSizes,
                               FILE* outFh)
 /* convert an alignment group */
 {
 struct rmskAlign *alignParts;
 while ((alignParts = popAlignParts(rmskAlignGroupPtr)) != NULL)
     {
     if (dump)
         rmskAlignListPrint(alignParts, stderr);
 
     if (shouldConvert(alignParts))
         {
-        struct psl *psl = alignToPsl(alignParts);
+        struct psl *psl = alignToPsl(alignParts, repSizes);
         if (psl != NULL)
             {
             pslTabOut(psl, outFh);
             pslFree(&psl);
             }
         }
     }
 }
 
 
-static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile)
+static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile,
+                           struct hash* repSizes)
 /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */
 {
 // load all, so we can join ones split by other insertions by id
 // don't bother freeing
 struct rmskAlign **rmskAlignGroups = NULL;
 unsigned maxAlignId = 0;
 if (bigRmsk)
     rmskAlignGroups = loadBigRmskAlign(rmskAlignFile, &maxAlignId);
 else
     rmskAlignGroups = loadRmskAlign(rmskAlignFile, &maxAlignId);
 
 FILE* outFh = mustOpen(rmskPslFile, "w");
 for (unsigned id = 0; id <= maxAlignId; id++)
-    convertAlignGroup(&(rmskAlignGroups[id]), outFh);
+    convertAlignGroup(&(rmskAlignGroups[id]), repSizes, outFh);
 carefulClose(&outFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 bigRmsk = optionExists("bigRmsk");
 dump = optionExists("dump");
-rmskAlignToPsl(argv[1], argv[2]);
+struct hash* repSizes = NULL;
+if (optionExists("repSizes"))
+    repSizes = loadRepSizes(optionVal("repSizes", NULL));
+rmskAlignToPsl(argv[1], argv[2], repSizes);
 return 0;
 }