474877488f16b5c6888313e4d25dc4b89c7a6896
braney
  Thu Aug 17 15:44:18 2023 -0700
a utility to find inversions and duplications in a set of chains

diff --git src/hg/utils/chainArrange/chainArrange.c src/hg/utils/chainArrange/chainArrange.c
new file mode 100644
index 0000000..78811a4
--- /dev/null
+++ src/hg/utils/chainArrange/chainArrange.c
@@ -0,0 +1,158 @@
+/* chainArrange - output inversions and duplications from chain. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "chain.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "chainArrange - output inversions and duplications from chain\n"
+  "usage:\n"
+  "   chainArrange file.chain label inversions.bed dups.bed\n"
+  "options:\n"
+  "   -xxx=XXX\n"
+  );
+}
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+   {NULL, 0},
+};
+
+
+int chainCmp(const void *va, const void *vb)
+/* Compare to sort based on start. */
+{
+const struct chain *a = *((struct chain **)va);
+const struct chain *b = *((struct chain **)vb);
+int ret = strcmp(a->qName, b->qName);
+if (ret)
+    return ret;
+ret = a->tStart - b->tStart;
+if (ret)
+    return ret;
+
+return  a->qStart - b->qStart;
+
+}
+
+static boolean isInversion(struct chain *chain, struct chain *enclosingChain)
+{
+if (chain->qStrand == enclosingChain->qStrand)
+    return FALSE;
+struct cBlock *cb = enclosingChain->blockList;
+struct cBlock *prevCb = NULL;
+for(; cb; prevCb = cb, cb = cb->next)
+    {
+    if (cb->tStart >= chain->tEnd)
+        {
+        if (prevCb->tEnd <= chain->tStart)
+            {
+            int qStart =  chain->qSize - chain->qEnd;
+            int qEnd =  chain->qSize - chain->qStart;
+            if ((cb->qStart >= qEnd) &&
+                (prevCb->qEnd <= qStart))
+                {
+                return TRUE;
+                }
+            }
+        break;
+        }
+    }
+return FALSE;
+}
+
+static void parseChrom(struct chain *chains,  char *label, FILE *inversions, FILE *duplications)
+{
+slSort(&chains, chainCmp);
+
+struct chain *chain;
+char *lastName = "";
+//int tStart, tEnd, qStart, qEnd;
+int  tEnd=0, qStart=0, qEnd=0;
+struct chain *enclosingChain = NULL;
+for(chain = chains; chain ; chain = chain->next)
+    {
+    // is this chain from a new qName or outside the "current" one
+    if (strcmp(lastName, chain->qName) || (chain->tStart > tEnd))
+        {
+        // new qName or disjoint chain, initialize variables
+        enclosingChain = chain;
+        lastName = chain->qName;
+        //tStart = chain->chromStart;
+        tEnd = chain->tEnd;
+        qStart = chain->qStart;
+        qEnd = chain->qEnd;
+        }
+    else
+        {
+        // is chain inside the current chain on both target and query?
+        if ((chain->tEnd <= tEnd) &&
+            (chain->qStart >= qStart) &&
+            (chain->qEnd <= qEnd))
+                {
+                //chainTabOut(chain, f);
+                if (isInversion(chain, enclosingChain))
+                    fprintf(inversions, "%s %d %d %s\n", chain->tName, chain->tStart, chain->tEnd, label);
+                else
+                    fprintf(duplications, "%s %d %d %s\n", chain->tName, chain->tStart, chain->tEnd, label);
+                }
+
+        }
+    }
+}
+
+struct chainHead
+{
+struct chainHead *next;
+struct chain *chains;
+};
+
+struct chainHead *readChains(char *inChain)
+{
+struct lineFile *lf = lineFileOpen(inChain, TRUE);
+struct hash *hash = newHash(0);
+struct chainHead *chainHead, *chainHeads = NULL;
+struct chain *chain;
+
+while ((chain = chainRead(lf)) != NULL)
+    {
+    if ((chainHead = hashFindVal(hash, chain->tName)) == NULL)
+        {
+        AllocVar(chainHead);
+        slAddHead(&chainHeads, chainHead);
+        hashAdd(hash, chain->tName, chainHead);
+        }
+
+    slAddHead(&chainHead->chains,  chain);
+    }
+
+return chainHeads;
+}
+
+void chainArrange(char *inChain, char *label, char *inversions, char *duplications)
+/* chainArrange - output a set of rearrangement breakpoints. */
+{
+struct chainHead *chainHeads = readChains(inChain);
+FILE *inversionsF = mustOpen(inversions, "w");
+FILE *duplicationsF = mustOpen(duplications, "w");
+
+for (; chainHeads != NULL; chainHeads = chainHeads->next)
+    {
+    struct chain  *chains = chainHeads->chains;
+    parseChrom(chains,  label, inversionsF, duplicationsF);
+    }
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 5)
+    usage();
+chainArrange(argv[1], argv[2], argv[3], argv[4]);
+return 0;
+}