d99300bbd4e6250c0a72144745b3383a973adb14
markd
  Sun Mar 13 12:32:04 2022 -0700
drop queries not in repSizes file rather than generating an error

diff --git src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
index fecc67b..8dcc404 100644
--- src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
+++ src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
@@ -7,31 +7,32 @@
 #include "hash.h"
 #include "options.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "rmskAlignToPsl - convert repeatmasker alignments to PSLs\n"
   "\n"
   "usage:\n"
   "   rmskAlignToPsl rmskAlignTab rmskPslFile\n"
   "\n"
   "  -bigRmsk - input is the text version of bigRmskAlignBed files.\n"
   "  -repSizes=tab - two column tab file with repeat name and size.\n"
   "   Sometimes the repeat sizes are incorrect in the align file.\n"
-  "   Obtain the sizes from this file instead.\n"
+  "   Obtain the sizes from this file instead.  Alignments not found\n"
+  "   in file are dropped.\n"
   "  -dump - print alignments to stdout for debugging purposes\n"
   "\n"
   "This convert *.fa.align.tsv file, created by\n"
   "RepeatMasker/util/rmToUCSCTables.pl into a PSL file.\n"
   "Non-TE Repeats without consensus sequence are not included.\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"bigRmsk", OPTION_BOOLEAN},
    {"repSizes", OPTION_STRING},
    {"dump", OPTION_BOOLEAN},
    {NULL, 0},
 };
@@ -584,34 +585,37 @@
 unsigned tSize = getGenomeSize(alignParts);
 unsigned tStart = blkCoords->tStart;
 unsigned tEnd = blkCountN->tStart + blkCountN->tSize;
 
 
 int blockSpace = slCount(blkCoords);
 struct psl *psl = pslNew(alignParts->repName, qSize, qStart, qEnd,
                          alignParts->genoName, tSize, tStart, tEnd, 
                          alignParts->strand, blockSpace, 0);
 addPslBlocks(blkCoords, psl, &blockSpace);
 pslComputeInsertCounts(psl);
 slFreeList(&blkCoords);
 return psl;
 }
 
+
+
 static struct psl *convertToPsl(struct rmskAlign *alignParts,
                                 struct hash* repSizes)
 /* create a PSL from a repeat masker alignment, return NULL if fails */
 {
+
 struct blkCoord *blkCoords = parseCAligns(alignParts, repSizes);
 if (blkCoords == NULL)
     return NULL;
 
 if (dump)
     blkCoordListPrint(blkCoords, stderr);
 
 return convertBlocksToPsl(alignParts, blkCoords, repSizes);
 }
 
 static struct psl *alignToPsl(struct rmskAlign *alignParts, struct hash* repSizes)
 /* convert and output one set of alignment parts */
 {
 struct psl* psl = convertToPsl(alignParts, repSizes);
 if ((psl != NULL) && (pslCheck("rmskAlign", stderr, psl) != 0))
@@ -636,46 +640,73 @@
         rmskAlignListPrint(alignParts, stderr);
 
     if (shouldConvert(alignParts))
         {
         struct psl *psl = alignToPsl(alignParts, repSizes);
         if (psl != NULL)
             {
             pslTabOut(psl, outFh);
             pslFree(&psl);
             }
         }
     }
 }
 
 
+static boolean checkMissingSize(struct rmskAlign* rmskAlignGroup,
+                                struct hash *repSizes,
+                                struct hash *repSizeWarned)
+/* Return TRUE if repSizes is NULL or contains rmskAlignGroup->repName.  Otherwise return FALSE,
+ * warning on stderr unless repName is already in repSizeWarned; the name is then recorded there. */
+{
+if (repSizes != NULL)
+    {
+    if (hashLookup(repSizes, rmskAlignGroup->repName) == NULL)
+        {
+        if (hashLookup(repSizeWarned, rmskAlignGroup->repName) == NULL)  // no warning issued for this name yet
+            {
+            fprintf(stderr, "Warning: '%s' size not found for query, skipping\n", rmskAlignGroup->repName);
+            hashAddInt(repSizeWarned, rmskAlignGroup->repName, TRUE);  // remember it so later lookups stay quiet
+            }
+        return FALSE;
+        }
+    }
+return TRUE;
+}
+
 static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile,
                            struct hash* repSizes)
 /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */
 {
 // load all, so we can join ones split by other insertions by id
 // don't bother freeing
 struct rmskAlign **rmskAlignGroups = NULL;
 unsigned maxAlignId = 0;
 if (bigRmsk)
     rmskAlignGroups = loadBigRmskAlign(rmskAlignFile, &maxAlignId);
 else
     rmskAlignGroups = loadRmskAlign(rmskAlignFile, &maxAlignId);
 
+struct hash* repSizeWarned = hashNew(12);  // repNames already warned about; limits the missing-size warning to once per name
+
 FILE* outFh = mustOpen(rmskPslFile, "w");
 for (unsigned id = 0; id <= maxAlignId; id++)
+    {
+    if ((rmskAlignGroups[id] != NULL)  // skip ids with no alignments loaded
+        && checkMissingSize(rmskAlignGroups[id], repSizes, repSizeWarned))  // and groups whose repName has no repSizes entry
         convertAlignGroup(&(rmskAlignGroups[id]), repSizes, outFh);
+    }
 carefulClose(&outFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 bigRmsk = optionExists("bigRmsk");
 dump = optionExists("dump");
 struct hash* repSizes = NULL;
 if (optionExists("repSizes"))
     repSizes = loadRepSizes(optionVal("repSizes", NULL));
 rmskAlignToPsl(argv[1], argv[2], repSizes);