c6eed4550b6684cc7cde7779b9084a72573b5d83
markd
  Fri Mar 11 19:54:00 2022 -0800
added ability to read ascii version of bigRmskAlign files

diff --git src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
index 9f67c94..ebb039d 100644
--- src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
+++ src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c
@@ -1,45 +1,49 @@
 /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */
 #include "common.h"
 #include "linefile.h"
 #include "rmskAlign.h"
+#include "bigRmskAlignBed.h"
 #include "psl.h"
 #include "options.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "rmskAlignToPsl - convert repeatmasker alignments to PSLs\n"
   "\n"
   "usage:\n"
   "   rmskAlignToPsl rmskAlignTab rmskPslFile\n"
   "\n"
+  "  -bigRmsk - input is the text version of bigRmskAlignBed files.\n"
   "  -dump - print alignments to stdout for debugging purposes\n"
   "\n"
   "This convert *.fa.align.tsv file, created by\n"
   "RepeatMasker/util/rmToUCSCTables.pl into a PSL file.\n"
   "Non-TE Repeats without consensus sequence are not included.\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
+   {"bigRmsk", OPTION_BOOLEAN},
    {"dump", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
+static boolean bigRmsk = FALSE;
 static boolean dump = FALSE;
 
 /*
  * rmskAlign notes
  * 
  *  - genoStart - genoEnd are zero-base, half-open
  *  - repStart, repEnd - are one-based
  *  - rmskAlign records are multi-part, grouped by id.  
  *  - A zero-repeat length record in the middle of group
  *    will have repStart == repEnd, appearing to have a length of 1.
  *    It appears these all have a alignment starting with '-'
  * 
  * Alignment string format:
  *
  * from hg/hgc/rmskJoinedClick.c
@@ -162,57 +166,112 @@
 
 static void rmskAlignListPrint(struct rmskAlign *alignParts,
                                FILE *fh)
 /* print out alignments in list */
 {
 for (struct rmskAlign *ap = alignParts; ap != NULL; ap = ap->next)
     rmskAlignPrint(ap, alignParts, fh);
 }
 
 static boolean shouldConvert(struct rmskAlign *rmskAlign)
 /* does this repeat type have a consensus and should be converted? */
 {
 return !sameString(rmskAlign->repClass, "Simple_repeat");
 }
 
-static struct rmskAlign **groupRmskAligns(struct rmskAlign **rmskAlignsPtr,
+static struct rmskAlign **groupRmskAligns(struct rmskAlign *rmskAligns,
                                           unsigned *maxAlignIdPtr)
 /* build array of alignment by ids */
 {
 unsigned maxAlignId = 0;
-for (struct rmskAlign *ra = *rmskAlignsPtr; ra != NULL; ra = ra->next)
+for (struct rmskAlign *ra = rmskAligns; ra != NULL; ra = ra->next)
     maxAlignId = max(maxAlignId, ra->id);
 
 struct rmskAlign **rmskAlignsById = AllocN(struct rmskAlign *, maxAlignId + 1);
-for (struct rmskAlign *ra = slPopHead(rmskAlignsPtr); ra != NULL; ra = slPopHead(rmskAlignsPtr))
+for (struct rmskAlign *ra = slPopHead(&rmskAligns); ra != NULL; ra = slPopHead(&rmskAligns))
     slAddTail((&rmskAlignsById[ra->id]), ra);  // keep in genomic order
 
 *maxAlignIdPtr = maxAlignId;
 return rmskAlignsById;
 }
 
 int rmskAlignGenomeCmp(const void *va, const void *vb)
 /* sort in ascending genomic location  */
 {
 const struct rmskAlign *a = *((struct rmskAlign **)va);
 const struct rmskAlign *b = *((struct rmskAlign **)vb);
 int diff = strcmp(a->genoName, b->genoName);
 if (diff != 0)
     return diff;
 return a->genoStart - b->genoStart;
 }
 
+static struct rmskAlign **loadRmskAlign(char *rmskAlignFile,
+                                        unsigned *maxAlignIdPtr)
+/* load rmskAlign file into an array of lists by indexed by alignment id */
+{
+struct rmskAlign *rmskAligns = rmskAlignLoadAllByTab(rmskAlignFile);
+slSort(&rmskAligns, rmskAlignGenomeCmp);
+
+return groupRmskAligns(rmskAligns, maxAlignIdPtr);
+}
+
+static struct rmskAlign *bigRmskAlignToRmskAlign(struct bigRmskAlignBed* bigRmskAlign)
+/* convert a bigRmskAlign record to an rmskAlign record */
+{
+struct rmskAlign *rmskAlign;
+AllocVar(rmskAlign);
+
+rmskAlign->swScore = bigRmskAlign->score;
+rmskAlign->milliDiv = 1000 * bigRmskAlign->percSubst;
+rmskAlign->milliDel = 1000 * bigRmskAlign->percDel;
+rmskAlign->milliIns = 1000 * bigRmskAlign->percIns;
+rmskAlign->genoName = cloneString(bigRmskAlign->chrom);
+rmskAlign->genoStart = bigRmskAlign->chromStart;
+rmskAlign->genoEnd = bigRmskAlign->chromEnd;
+rmskAlign->genoLeft = bigRmskAlign->chromRemain;
+rmskAlign->strand[0] = bigRmskAlign->strand[0];
+rmskAlign->repName = cloneString(bigRmskAlign->repName);
+rmskAlign->repClass = cloneString(bigRmskAlign->repType);
+rmskAlign->repFamily = cloneString(bigRmskAlign->repSubtype);
+rmskAlign->repStart = bigRmskAlign->repStart;
+rmskAlign->repEnd = bigRmskAlign->repEnd;
+rmskAlign->repLeft = bigRmskAlign->repRemain;
+rmskAlign->id = bigRmskAlign->id;
+rmskAlign->alignment = cloneString(bigRmskAlign->calignData);
+
+return rmskAlign;
+}
+
+static struct rmskAlign **loadBigRmskAlign(char *bigRmskAlignFile,
+                                           unsigned *maxAlignIdPtr)
+/* load bigRmskAlignBed file into an array of lists by indexed by alignment id */
+{
+struct bigRmskAlignBed *bigRmskAligns = bigRmskAlignBedLoadAllByTab(bigRmskAlignFile);
+struct rmskAlign *rmskAligns = NULL;
+struct bigRmskAlignBed *bigRmskAlign;
+
+for (bigRmskAlign = slPopHead(&bigRmskAligns); bigRmskAlign != NULL; bigRmskAlign = slPopHead(&bigRmskAligns))
+    {
+    slAddHead(&rmskAligns, bigRmskAlignToRmskAlign(bigRmskAlign));
+    bigRmskAlignBedFree(&bigRmskAlign);
+    }
+
+slSort(&rmskAligns, rmskAlignGenomeCmp);
+return groupRmskAligns(rmskAligns, maxAlignIdPtr);
+}
+
 static boolean rmskLinear(struct rmskAlign *prev,
                           struct rmskAlign *next)
 /* are the two alignments linear? assumes genomic sort */
 {
 // rmskAlign 1-based and not in strand-specific order
 if (prev->strand[0] == '+')
     return next->repStart >= prev->repEnd;
 else
     return prev->repEnd > next->repStart;
 }
 
 static boolean rmskOverlap(struct rmskAlign *rm1,
                            struct rmskAlign *rm2)
 /* do they overlap in genomic or repeat coordinates */
 {
@@ -549,38 +608,39 @@
         if (psl != NULL)
             {
             pslTabOut(psl, outFh);
             pslFree(&psl);
             }
         }
     }
 }
 
 
 static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile)
 /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */
 {
 // load all, so we can join ones split by other insertions by id
 // don't bother freeing
-struct rmskAlign *rmskAligns = rmskAlignLoadAllByTab(rmskAlignFile);
-slSort(&rmskAligns, rmskAlignGenomeCmp);
-
-// get counts parts for each id
-unsigned maxAlignId;
-struct rmskAlign **rmskAlignGroups = groupRmskAligns(&rmskAligns, &maxAlignId);
+struct rmskAlign **rmskAlignGroups = NULL;
+unsigned maxAlignId = 0;
+if (bigRmsk)
+    rmskAlignGroups = loadBigRmskAlign(rmskAlignFile, &maxAlignId);
+else
+    rmskAlignGroups = loadRmskAlign(rmskAlignFile, &maxAlignId);
 
 FILE* outFh = mustOpen(rmskPslFile, "w");
 for (unsigned id = 0; id <= maxAlignId; id++)
     convertAlignGroup(&(rmskAlignGroups[id]), outFh);
 carefulClose(&outFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
+bigRmsk = optionExists("bigRmsk");
 dump = optionExists("dump");
 rmskAlignToPsl(argv[1], argv[2]);
 return 0;
 }