46dc535d3f532f80ada63859bc948ad4af0a72d3
angie
  Mon Jul 8 10:27:37 2019 -0700
Rewrote bedJoinTabOffset in C to handle very large files.  Renamed script to bedJoinTabOffset.py .  refs #23283
The new C version does not yet support any of the script's command line options.

diff --git src/utils/bedJoinTabOffset/bedJoinTabOffset.c src/utils/bedJoinTabOffset/bedJoinTabOffset.c
new file mode 100644
index 0000000..a00b1b2
--- /dev/null
+++ src/utils/bedJoinTabOffset/bedJoinTabOffset.c
@@ -0,0 +1,134 @@
+/* bedJoinTabOffset - For each row of a BED file, append the file offset and length of the line
+ * in a tab-separated text file whose first column matches the BED row's name (4th column). */
+#include "common.h"
+#include "linefile.h"
+#include "localmem.h"
+#include "hash.h"
+#include "obscure.h"
+#include "options.h"
+
+void usage()
+/* Explain usage and exit: hands the synopsis text to errAbort (NOTE(review): presumably errAbort does not return -- confirm). */
+{
+errAbort(
+  "usage:\n"
+  "   bedJoinTabOffset inTabFile inBedFile outBedFile\n"
+  "Given a bed file and tab file where each have a column with matching values:\n"
+  "first get the value of column0, the offset and line length from inTabFile.\n"
+  "Then go over the bed file, use the name field and append its offset and length\n"
+  "to the bed file as two separate fields.  Write the new bed file to outBed.\n"
+//  "options:\n"        (disabled placeholder: the options[] table in this file declares no option names)
+//  "   -xxx=XXX\n"
+  );
+}
+
+/* Command line validation table, passed to optionInit in main.  It holds only the {NULL, 0} entry, so no option names are declared. */
+static struct optionSpec options[] = {
+   {NULL, 0},   // NOTE(review): presumably the end-of-list marker expected by optionInit -- confirm
+};
+
+struct offsetLen
+/* File offset and length of one inTabFile line; stored as the value in the hash built by parseTabFile, keyed by the line's name. */
+    {
+    off_t offset;  // Value of lineFileTell() recorded for the line in parseTabFile
+    size_t len;    // strlen() of the line as returned by lineFileNextReal, taken before the name is cut off at the first tab
+    };
+
+struct nameOffsetLen
+/* Temporary singly-linked list node used by parseTabFile: one {name, offset, len} per inTabFile line, kept until the hash is built. */
+    {
+    struct nameOffsetLen *next;    // next node in the list (nodes are added with slAddHead)
+    char *name;                    // text of the line up to (not including) the first tab; whole line if it has no tab
+    struct offsetLen offLen;       // offset and length of line, copied into a separately allocated offsetLen for the hash
+    };
+
+static struct hash *parseTabFile(struct lineFile *lf)
+/* Pass 1: read every line that lineFileNextReal yields from lf into a temporary list of {name, offset, len} and count them.
+ * Pass 2: make a hash sized from that count and store name -> struct offsetLen (each value from AllocVar).  Returns the hash. */
+{
+verbose(2, "Reading tab-sep file %s\n", lf->fileName);
+struct nameOffsetLen *nolList = NULL;
+struct lm *lm = lmInit(0);   // scratch pool for the list nodes and name strings; released by lmCleanup before returning
+int itemCount = 0;
+char *line;
+while (lineFileNextReal(lf, &line))
+    {
+    struct nameOffsetLen *nol;
+    lmAllocVar(lm, nol);
+    nol->offLen.offset = lineFileTell(lf);   // NOTE(review): assumes lineFileTell reports the start of the line just returned -- confirm
+    nol->offLen.len = strlen(line);          // measured before the tab is overwritten below, so it is the full returned line's length
+    char *t = strchr(line, '\t');
+    if (t)
+        *t = '\0';                           // truncate in place: the name is the text before the first tab
+    nol->name = lmCloneString(lm, line);
+    slAddHead(&nolList, nol);                // head insertion: NOTE(review): list presumably ends up in reverse file order
+    itemCount++;
+    }
+int hashSize = digitsBaseTwo(itemCount);     // NOTE(review): presumably bits needed to represent itemCount; check hashNew accepts it for 0 or very large counts
+verboseTime(2, "Done reading %s; %d items, hash size %d", lf->fileName, itemCount, hashSize);
+struct hash *nameToOffsetLen = hashNew(hashSize);
+struct nameOffsetLen *nol;
+for (nol = nolList;  nol != NULL;  nol = nol->next)
+    {
+    struct offsetLen *ol;
+    AllocVar(ol);                            // allocated outside lm, so the value is not released by lmCleanup below
+    ol->offset = nol->offLen.offset;
+    ol->len = nol->offLen.len;
+    hashAdd(nameToOffsetLen, nol->name, ol); // NOTE(review): nol->name is released by lmCleanup below, so hashAdd must keep its own copy of the key -- confirm
+    }
+lmCleanup(&lm);
+return nameToOffsetLen;
+}
+
+void bedJoinTabOffset(char *inTabFile, char *inBedFile, char *outBedFile)
+/* Index inTabFile by name with parseTabFile, then copy inBedFile to outBedFile line by line.  Each data row gets the offset and
+ * length stored for its 4th tab-separated field appended as two extra tab-separated columns; blank and '#' lines are copied as-is. */
+{
+verboseTimeInit();
+struct lineFile *tLf = lineFileOpen(inTabFile, TRUE);
+struct lineFile *bLf = lineFileOpen(inBedFile, TRUE);
+FILE *outF = mustOpen(outBedFile, "w");
+struct hash *nameToOffsetLen = parseTabFile(tLf);
+lineFileClose(&tLf);                    // the tab file is not read again after the hash is built
+struct dyString *dy = dyStringNew(0);   // reusable scratch copy of each bed line, so splitting does not touch 'line'
+verboseTime(2, "Done making hash; reading bed input %s", inBedFile);
+char *line;
+int size;                               // filled in by lineFileNext but not otherwise read in this function
+while (lineFileNext(bLf, &line, &size))
+    {
+    char *postSpace = skipLeadingSpaces(line);
+    if (isEmpty(postSpace) || postSpace[0] == '#')
+        fprintf(outF, "%s\n", line);    // blank or '#' line: written out without the two extra columns
+    else
+        {
+        dyStringClear(dy);
+        dyStringAppend(dy, line);
+        char *words[5];
+        int wordCount = chopTabs(dy->string, words);   // splits the scratch copy (NOTE(review): presumably in place, at most 5 fields -- confirm)
+        if (wordCount < 4)
+            lineFileAbort(bLf, "Expected at least 4 words but got %d (%s).", wordCount, line);   // code below relies on this not returning
+        char *name = words[3];          // 4th tab-separated field is the lookup key
+        struct offsetLen *ol = hashFindVal(nameToOffsetLen, name);
+        if (ol == NULL)
+            lineFileAbort(bLf, "Unable to find corresponding line in %s for name '%s'",
+                          inTabFile, name);
+        fprintf(outF, "%s\t%lld\t%lld\n",               // prints the original line, not the chopped copy
+                line, (long long)ol->offset, (long long)ol->len);
+        }
+    }
+lineFileClose(&bLf);
+carefulClose(&outF);
+freeHashAndVals(&nameToOffsetLen);      // NOTE(review): presumably frees the hash and the offsetLen values allocated in parseTabFile -- confirm
+dyStringFree(&dy);
+verboseTime(2, "Done writing bed output %s.", outBedFile);
+}
+
+int main(int argc, char *argv[])
+/* Process command line: after optionInit, require exactly three positional arguments (inTabFile, inBedFile, outBedFile). */
+{
+optionInit(&argc, argv, options);   // NOTE(review): takes &argc, presumably so recognized options can be removed from argv -- confirm
+if (argc != 4)                      // program name plus three arguments
+    usage();
+bedJoinTabOffset(argv[1], argv[2], argv[3]);
+return 0;
+}