68684c7c01875a771889d1fd03354f58bae34427
braney
  Tue Oct 31 15:01:59 2023 -0700
rename bedCollect to chainArrangeCollect in response to code review

diff --git src/hg/utils/chainArrangeCollect/chainArrangeCollect.c src/hg/utils/chainArrangeCollect/chainArrangeCollect.c
new file mode 100644
index 0000000..2564085
--- /dev/null
+++ src/hg/utils/chainArrangeCollect/chainArrangeCollect.c
@@ -0,0 +1,111 @@
+/* chainArrangeCollect - collect overlapping beds into a single bed. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "basicBed.h"
+#include "dystring.h"
+
+void usage()
+/* Report the command synopsis and option list through errAbort, then exit. */
+{
+static char *helpText =
+  "chainArrangeCollect - collect overlapping beds into a chainArrange.as structure\n"
+  "usage:\n"
+  "   chainArrangeCollect input.bed output.bed\n"
+  "note: input beds need to be sorted with bedSort\n"
+  "options:\n"
+  "   -exact       overlapping blocks must be exactly the same range and score\n";
+
+/* The text is passed as an argument rather than as the format itself. */
+errAbort("%s", helpText);
+}
+
+boolean exact;  // set from the -exact flag in main(); when true, beds are grouped only if chrom, start, end and score all match
+
+/* Command line validation table, handed to optionInit() in main().
+ * The only option declared is the boolean flag "exact". */
+static struct optionSpec options[] = {
+   {"exact", OPTION_BOOLEAN},
+   {NULL, 0},
+};
+
+static void outBed(FILE *f, struct bed *bed, struct hash *nameHash)
+/* Write one collected region to f as a single space-separated line.
+ *   f        - open output stream
+ *   bed      - region to report: chrom/chromStart/chromEnd give the span and
+ *              score is written as the trailing sizeQuery column.  The bed
+ *              itself is not modified.
+ *   nameHash - hash holding the names collected for this region, listed via
+ *              hashSlNameFromHash()
+ * Columns written: chrom, start, end, "arr<N>" (N increases by one per call),
+ * number of names, "+", start, end, 0, comma-separated names, sizeQuery. */
+{
+static int count = 0;   // running id for the arrN label; persists across calls
+struct slName *names = hashSlNameFromHash(nameHash);
+int nameCount = slCount(names);
+struct dyString *dy = newDyString(100);
+struct slName *name;
+
+// Walk with a separate cursor so the list head stays available for freeing.
+for (name = names; name != NULL; name = name->next)
+    {
+    dyStringAppend(dy, name->name);
+    if (name->next != NULL)
+        dyStringAppend(dy, ",");
+    }
+
+// we're actually not outputting chainArrange structure because the label is coming
+// from an external program currently
+fprintf(f, "%s %d %d arr%d %d + %d %d 0 %s %d\n", bed->chrom, bed->chromStart, bed->chromEnd,
+    count++, nameCount, bed->chromStart, bed->chromEnd, dy->string, bed->score);
+
+// Release the per-call scratch data so memory use does not grow with the
+// number of lines written.
+dyStringFree(&dy);
+slFreeList(&names);
+}
+
+static boolean startsNewGroup(struct bed *group, struct bed *bed)
+/* Return TRUE when bed cannot be folded into the group headed by 'group'.
+ * A different chrom always starts a new group.  In exact mode a bed joins
+ * only when start, end and score all equal the group's; otherwise it joins
+ * when it starts before the group's current end. */
+{
+if (differentString(group->chrom, bed->chrom))
+    return TRUE;
+if (exact)
+    return (group->chromStart != bed->chromStart)
+        || (group->chromEnd != bed->chromEnd)
+        || (group->score != bed->score);
+return group->chromEnd <= bed->chromStart;
+}
+
+void chainArrangeCollect(char *inFile, char *outFile)
+/* chainArrangeCollect - collect overlapping beds into a single bed.
+ *   inFile  - bed file to read; the usage text requires it to be sorted
+ *             with bedSort
+ *   outFile - file to write, one line per collected group (see outBed)
+ * Each group is headed by its first bed.  In the default mode the head's
+ * chromEnd is extended to cover every bed folded into it.  With the global
+ * 'exact' set, only beds with identical range and score are grouped.
+ * An empty input produces an empty output file. */
+{
+struct bed *allBeds = bedLoadAll(inFile);
+FILE *f = mustOpen(outFile, "w");
+
+// With no beds there is no first group to seed from.
+if (allBeds == NULL)
+    {
+    carefulClose(&f);
+    return;
+    }
+
+// The head's score is kept as loaded: exact mode compares it against later
+// beds, and outBed reports it as sizeQuery.
+struct bed *bed, *group = allBeds;
+struct hash *nameHash = newHash(0);
+hashStore(nameHash, group->name);
+
+for (bed = group->next; bed != NULL; bed = bed->next)
+    {
+    if (startsNewGroup(group, bed))
+        {
+        outBed(f, group, nameHash);
+
+        freeHash(&nameHash);
+        nameHash = newHash(0);
+        group = bed;
+        }
+    else if (bed->chromEnd > group->chromEnd)
+        {
+        // Only reachable in the default mode; exact-mode members share the head's end.
+        group->chromEnd = bed->chromEnd;
+        }
+    hashStore(nameHash, bed->name);
+    }
+
+// The last group is still pending when the input runs out, in both modes.
+outBed(f, group, nameHash);
+
+freeHash(&nameHash);
+carefulClose(&f);
+}
+
+int main(int argc, char *argv[])
+/* Parse options, insist on exactly two positional arguments, and run the collector. */
+{
+optionInit(&argc, argv, options);
+if (argc != 3)
+    usage();
+
+char *inFile = argv[1];
+char *outFile = argv[2];
+
+exact = optionExists("exact");
+chainArrangeCollect(inFile, outFile);
+return 0;
+}