95df32e007a5dbe0e8bfe718fbb46f970e045050
kent
  Wed Jun 15 14:25:04 2022 -0700
Some experimental work that has let me know where the alt-haplotypes land on the main chromosomes.

diff --git src/hg/oneShot/refGraphTest/refGraphTest.c src/hg/oneShot/refGraphTest/refGraphTest.c
new file mode 100644
index 0000000..b1d67da
--- /dev/null
+++ src/hg/oneShot/refGraphTest/refGraphTest.c
@@ -0,0 +1,268 @@
+/* refGraphTest - Test out some reference genome graph stuff.. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "psl.h"
+#include "diGraph.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "refGraphTest - Test out some reference genome graph stuff.\n"
+  "usage:\n"
+  "   refGraphTest nodes.tab edges.psl output\n"
+  "options:\n"
+  "   -xxx=XXX\n"
+  );
+}
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+   {NULL, 0},
+};
+
+struct annoSeq
+/* An annotated sequence */
+    {
+    struct annoSeq *next;
+    char *name;	    /* Chromosome or other name */
+    int size;	    /* Size of sequence in bases */
+    };
+
+struct annoSeq *readChromSizes(char *fileName)
+/* Read in chrom.sizes file and return a corresponding list of annoSeq's */
+{
+struct annoSeq *list = NULL;
+struct annoSeq *annoSeq;
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *row[2];
+while (lineFileRow(lf, row))
+   {
+   AllocVar(annoSeq);
+   annoSeq->name = cloneString(row[0]);
+   annoSeq->size = lineFileNeedNum(lf, row, 1);
+   slAddHead(&list, annoSeq);
+   }
+lineFileClose(&lf);
+slReverse(&list);
+return list;
+}
+
+struct dgNode *findOrMakeNode(struct diGraph *g, struct hash *seqHash, char *name)
+/* Return node associated with seq */
+{
+struct dgNode *node = dgFindNode(g, name);
+if (node == NULL)
+    {
+    struct annoSeq *seq = hashFindVal(seqHash, name);
+    if (seq == NULL)
+        errAbort("Can't find sequence %s", name);
+    node = dgAddNode(g, name, seq);
+    }
+return node;
+}
+
+boolean isAltName(char *name)
+/* Return true if it's an alt name */
+{
+return endsWith(name, "_alt");
+}
+
+boolean isFixName(char *name)
+/* Return true if it's a fix name */
+{
+return endsWith(name, "_fix");
+}
+
+void addAltZeroVal(struct hash *hash, char *name)
+/* Add name to hash with NULL val if it isn't in there already, and if
+ * it is an alt */
+{
+if (isAltName(name))
+    {
+    if (hashLookup(hash, name) == NULL)
+        hashAdd(hash, name, NULL);
+    }
+}
+        
+void saveBestAlt(struct hash *hash, char *name, int size, int start, int end, struct psl *psl)
+/* Save best psl to hash if it's an alt */
+{
+if (isAltName(name))
+    {
+    struct hashEl *hel = hashLookup(hash, name);
+    if (hel == NULL)
+	internalErr();
+    struct psl *oldPsl = hel->val;
+    if (oldPsl == NULL)
+        hel->val = psl;
+    else
+        {
+	verbose(2, "duplicate psl %s: old score %d, new score %d\n", name, pslScore(oldPsl), pslScore(psl));
+	if (pslScore(psl) > pslScore(oldPsl))
+	    hel->val = psl;
+	}
+    }
+}
+
+void checkAllHaveVal(struct hash *hash, char *hashName)
+/* Make sure all elements of hash have values */
+{
+struct hashEl *hel;
+struct hashCookie cookie = hashFirst(hash);
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    if (hel->val == NULL)
+        errAbort("%s has no full value in %s", hel->name, hashName);
+    }
+}
+
+void addPslsFromHash(struct hash *hash, struct psl **pList)
+/* Add psl values from hash into list */
+{
+struct hashEl *hel;
+struct hashCookie cookie = hashFirst(hash);
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    struct psl *psl = hel->val;
+    slAddHead(pList, psl);
+    }
+}
+
+boolean fixedPsl(struct psl *psl)
+/* Return TRUE if psl involves a fix */
+{
+return isFixName(psl->qName) || isFixName(psl->tName);
+}
+
+struct psl *pslFilterPartial(struct psl *list)
+/* Return a new list with the items that only partially cover the alts removed */
+{
+/* Make up a hash for all alts, initialized to zero counts */
+struct hash *qAltHash = hashNew(0);
+struct hash *tAltHash = hashNew(0);
+struct psl *psl;
+
+for (psl = list; psl != NULL; psl = psl->next)
+    {
+    if (!fixedPsl(psl))
+	{
+	addAltZeroVal(qAltHash, psl->qName);
+	addAltZeroVal(tAltHash, psl->tName);
+	}
+    }
+if (qAltHash->elCount != tAltHash->elCount)
+    errAbort("Got %d alts in query side, %d in target, they should be the same, aborting.", 
+	qAltHash->elCount, tAltHash->elCount);
+
+for (psl = list; psl != NULL; psl = psl->next)
+    {
+    if (!fixedPsl(psl))
+	{
+	saveBestAlt(qAltHash, psl->qName, psl->qSize, psl->qStart, psl->qEnd, psl);
+	saveBestAlt(tAltHash, psl->tName, psl->tSize, psl->tStart, psl->tEnd, psl);
+	}
+    }
+
+checkAllHaveVal(qAltHash, "query");
+checkAllHaveVal(tAltHash, "target");
+
+struct psl *newList = NULL;
+addPslsFromHash(qAltHash, &newList);
+addPslsFromHash(tAltHash, &newList);
+
+hashFree(&qAltHash);
+hashFree(&tAltHash);
+return newList;
+}
+
+struct hash *chromsAndPslsToGraphs(struct annoSeq *seqList, struct hash *seqHash, struct psl *pslList)
+/* Return hash of diGraphs, one for each chromosome, keyed by chromosome name */
+{
+/* Make chromosome hash out of sequences with no underbars in name */
+struct hash *chromHash = hashNew(0);
+struct annoSeq *seq;
+for (seq = seqList; seq != NULL; seq = seq->next)
+    {
+    if (strchr(seq->name, '_') == NULL)
+	{
+	struct diGraph *g = dgNew();
+        hashAdd(chromHash, seq->name, g);
+	dgAddNode(g, seq->name, seq);
+	}
+    }
+
+struct psl *psl;
+for (psl = pslList; psl != NULL; psl = psl->next)
+    {
+    /* Find appropriate graph */
+    struct diGraph *g = NULL;
+    if (isAltName(psl->qName))
+        {
+	g = hashFindVal(chromHash, psl->tName);
+	if (psl->qStart != 0 || psl->qEnd != psl->qSize)
+	    {
+	    verbose(2, "%s alignment %d-%d of %d is not full\n", psl->qName, psl->qStart, psl->qEnd, psl->qSize);
+	    }
+	}
+    else
+        {
+	g = hashFindVal(chromHash, psl->qName);
+	if (psl->tStart != 0 || psl->tEnd != psl->tSize)
+	    {
+	    verbose(2, "%s alignment %d-%d of %d is not full\n", psl->tName, psl->tStart, psl->tEnd, psl->tSize);
+	    }
+	}
+    if (g == NULL)
+        errAbort("Neither %s nor %s are chromosomes in psl\n", psl->qName, psl->tName); 
+
+    /* Find nodes associated with query and target */
+    struct dgNode *tn = findOrMakeNode(g, seqHash, psl->tName);
+    struct dgNode *qn = findOrMakeNode(g, seqHash, psl->qName);
+
+    /* Connect nodes with edge marked with psl. */
+    dgConnectWithVal(g, tn, qn, psl);
+    }
+return chromHash;
+}
+
+void refGraphTest(char *nodesFile, char *aliFile, char *outputFile)
+/* refGraphTest - Test out some reference genome graph stuff.. */
+{
+FILE *f = mustOpen(outputFile, "w");
+
+/* Read in annotated sequence list */
+struct annoSeq *seq, *seqList = readChromSizes(nodesFile);
+struct hash *seqHash = hashNew(0);
+for (seq = seqList; seq != NULL; seq = seq->next)
+    hashAdd(seqHash, seq->name, seq);
+
+/* Read in psl's */
+struct psl *pslList = pslLoadAll(aliFile);
+fprintf(f, "# %g raw alt seq alignments in %s\n", slCount(pslList)/2.0, aliFile);
+
+pslList = pslFilterPartial(pslList);
+fprintf(f, "# %g filtered seq alignments in %s\n", slCount(pslList)/2.0, aliFile);
+
+struct hash *chromHash = chromsAndPslsToGraphs(seqList, seqHash, pslList);
+fprintf(f, "# %s from %s and %s has %d chromosomes\n", outputFile, nodesFile, aliFile, chromHash->elCount);
+struct psl *psl;
+for (psl = pslList; psl != NULL; psl = psl->next)
+     {
+     pslTabOut(psl, f);
+     }
+
+carefulClose(&f);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 4)
+    usage();
+refGraphTest(argv[1], argv[2], argv[3]);
+return 0;
+}