47b962229a3ebcd05377a52db7d389d48a5f38fa
markd
  Fri Mar 25 09:08:31 2022 -0700
added option to specify expected repeat sizes and discard PSLs that don't match

diff --git src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
index 1cf25e6..1ad1c7d 100644
--- src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
+++ src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
@@ -5,41 +5,47 @@
 #include "bigRmskAlignBed.h"
 #include "psl.h"
 #include "hash.h"
 #include "options.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "rmskAlignToPsl - convert repeatmasker alignments to PSLs\n"
   "\n"
   "usage:\n"
   "   rmskAlignToPsl rmskAlignTab rmskPslFile\n"
   "\n"
   "  -bigRmsk - input is the text version of bigRmskAlignBed files.\n"
+  "  -repSizes=tab - two column tab file with repeat name and size.\n"
+  "   Sometimes the repeat sizes are incorrect in the align file.\n"
+  "   If a repeat alignment doesn't match the size here or is not\n"
+  "   in the file it will be discarded.\n"
+  "   \n"
   "  -dump - print alignments to stdout for debugging purposes\n"
   "\n"
   "This convert *.fa.align.tsv file, created by\n"
   "RepeatMasker/util/rmToUCSCTables.pl into a PSL file.\n"
   "Non-TE Repeats without consensus sequence are not included.\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"bigRmsk", OPTION_BOOLEAN},
+   {"repSizes", OPTION_STRING},
    {"dump", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 static boolean bigRmsk = FALSE;
 static boolean dump = FALSE;
 
 /*
  * rmskAlign notes
  * 
  *  - genoStart - genoEnd are zero-base, half-open
  *  - repStart, repEnd - are one-based
  *  - rmskAlign records are multi-part, grouped by id.  
  *  - A zero-repeat length record in the middle of group
  *    will have repStart == repEnd, appearing to have a length of 1.
@@ -52,30 +58,42 @@
  * The format is basically a lightly compressed diff format where
  * the query and subject are merged into one squence line. The
  * runs of exact matching sequences are interrupted by either
  * single base substitutions annotated as queryBase "/" subjectBase,
  * insertions in the subject annotated as "+ACTAT+", or deletions
  * in the query annotated as "-ACTTTG-".
  *
  * Alignment parts must be decoded separately and then concatenated.
  * as blocks starting with a '-' or '+' maybe terminated by the end of the string
  *
  * Alignments parts may overlap and must then be split.  A given id might
  * have multiple repNames and are split into multiple PSL.
  */
 
 
+static struct hash* loadRepSizes(char *repSizesFile)
+/* load repeat sizes file into a hash */
+{
+struct hash* repSizes = hashNew(20);
+struct lineFile* lf = lineFileOpen(repSizesFile, TRUE);
+char *row[2];
+while (lineFileNextRowTab(lf, row, ArraySize(row)))
+    hashAddInt(repSizes, row[0], lineFileNeedNum(lf, row, 1));
+lineFileClose(&lf);
+return repSizes;
+}
+
 static unsigned getGenomeSize(struct rmskAlign *alignParts)
 /* get the length or the chromosome */
 {
 return alignParts->genoEnd + alignParts->genoLeft;
 }
 
 static unsigned getRepeatSize(struct rmskAlign *alignParts)
 /* Get the length of repeat sequence.  This is not all that easy. When a
  * repeat is split, CrossMatch reports the same repLeft for other alignments,
  * at least in some cases.  RepeatMasker compensates for this when creating
  * the .out. However, rmToUCSCTables.pl doesn't correct for this.  So we just
  * take the largest length from all alignments.
  *
  * Example .align problem:
  * 103 20.51 0.00 9.86 chr1 147061 147161 (248809261) C AluJr#SINE/Alu (146) 166 70 [m_b3s356i8] 1
@@ -609,47 +627,86 @@
         rmskAlignListPrint(alignParts, stderr);
 
     if (shouldConvert(alignParts))
         {
         struct psl *psl = alignToPsl(alignParts);
         if (psl != NULL)
             {
             pslTabOut(psl, outFh);
             pslFree(&psl);
             }
         }
     }
 }
 
 
-static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile)
+static boolean checkSizeWithExpected(struct rmskAlign* rmskAlignGroup,
+                                     struct hash *repSizes,
+                                     struct hash *repSizeWarned)
+/* Check if we have an expected repSize for this and they match query.  If not, warn
+ * for first occurrence.  These are skipped */
+{
+if (hashLookup(repSizes, rmskAlignGroup->repName) == NULL)
+    {
+    if (hashLookup(repSizeWarned, rmskAlignGroup->repName) == NULL)
+        {
+        fprintf(stderr, "Warning: '%s' expected not found, skipping\n", rmskAlignGroup->repName);
+        hashAddInt(repSizeWarned, rmskAlignGroup->repName, TRUE);
+        }
+    return FALSE;
+    }
+int gotRepSize = getRepeatSize(rmskAlignGroup);
+int expectRepSize = hashIntVal(repSizes, rmskAlignGroup->repName);
+if (gotRepSize != expectRepSize)
+    {
+    if (hashLookup(repSizeWarned, rmskAlignGroup->repName) == NULL)
+        {
+        fprintf(stderr, "Warning: '%s' size %d does not match expected size %d, skipping\n",
+                rmskAlignGroup->repName, gotRepSize, expectRepSize);
+        hashAddInt(repSizeWarned, rmskAlignGroup->repName, TRUE);
+        }
+    return FALSE;
+    }
+return TRUE;
+}
+
+static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile,
+                           struct hash* repSizes)
 /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */
 {
 // load all, so we can join ones split by other insertions by id
 // don't bother freeing
 struct rmskAlign **rmskAlignGroups = NULL;
 unsigned maxAlignId = 0;
 if (bigRmsk)
     rmskAlignGroups = loadBigRmskAlign(rmskAlignFile, &maxAlignId);
 else
     rmskAlignGroups = loadRmskAlign(rmskAlignFile, &maxAlignId);
 
+struct hash* repSizeWarned = hashNew(12);  // don't warn multiple times on same sequence
+
 FILE* outFh = mustOpen(rmskPslFile, "w");
 for (unsigned id = 0; id <= maxAlignId; id++)
     {
     if (rmskAlignGroups[id] != NULL)
+        {
+        if ((repSizes == NULL) || checkSizeWithExpected(rmskAlignGroups[id], repSizes, repSizeWarned))
             convertAlignGroup(&(rmskAlignGroups[id]), outFh);
         }
+    }
 carefulClose(&outFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 bigRmsk = optionExists("bigRmsk");
 dump = optionExists("dump");
-rmskAlignToPsl(argv[1], argv[2]);
+struct hash* repSizes = NULL;
+if (optionExists("repSizes"))
+    repSizes = loadRepSizes(optionVal("repSizes", NULL));
+rmskAlignToPsl(argv[1], argv[2], repSizes);
 return 0;
 }