f6ef041a6fa43e86f7b0c17f79371cb0fdeeaf8f
kent
  Fri Dec 18 16:28:29 2020 -0800
Adding utility to help pull out gene positions from a bunch of transcripts and a gene/transcript table.

diff --git src/hg/txGraph/txgToGeneBed/txgToGeneBed.c src/hg/txGraph/txgToGeneBed/txgToGeneBed.c
new file mode 100644
index 0000000..0a0aea7
--- /dev/null
+++ src/hg/txGraph/txgToGeneBed/txgToGeneBed.c
@@ -0,0 +1,75 @@
+/* txgToGeneBed - Convert txg to a simple bed with one block and the most common gene name. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "txGraph.h"
+#include "obscure.h"
+
+void usage()
+/* Explain usage and exit; the message is handed to errAbort. */
+{
+errAbort(
+  "txgToGeneBed - Convert txg to a simple bed with one block and the most common gene name.\n"
+  "usage:\n"
+  "   txgToGeneBed in.txg txToGene.tsv out.bed\n"
+  "where txToGene.tsv has two columns: transcript accession, then gene name.\n"
+  "There are no options.\n"
+  );
+}
+
+/* Command line validation table; it holds only the {NULL, 0} entry, so no options are defined. */
+static struct optionSpec options[] = {
+   {NULL, 0},
+};
+
+char *pickGene(struct txGraph *txg, struct hash *txToGeneHash)
+/* Return the gene name shared by the most source transcripts of txg; on a tie the
+ * gene that got to the top count first, in source order, wins.  The returned string
+ * is the value held in txToGeneHash for that transcript. */
+{
+if (txg->sourceCount <= 0)  // Nothing in sources[] to read, so stop with a message
+    errAbort("txGraph at %s:%d-%d has no source transcripts", txg->tName, txg->tStart, txg->tEnd);
+if (txg->sourceCount <= 2)  // If just one or two take first one
+    return hashMustFindVal(txToGeneHash, txg->sources[0].accession);
+int bestCount = 0, i;
+char *bestGene = NULL;
+struct hash *geneHash = hashNew(0);  // gene name -> number of sources seen with it
+for (i=0; i<txg->sourceCount; ++i)
+    {
+    char *gene = hashMustFindVal(txToGeneHash, txg->sources[i].accession);
+    int count = hashIncInt(geneHash, gene);
+    if (count > bestCount)  // strict > keeps the earlier gene when counts tie
+        {
+        bestCount = count;
+        bestGene = gene;
+        }
+    }
+hashFree(&geneHash);
+return bestGene;
+}
+
+void txgToGeneBed(char *inTxg, char *txToGeneFile, char *outBed)
+/* For each graph loaded from inTxg write one bed line to outBed: tName, tStart, tEnd,
+ * the gene name chosen by pickGene, a fixed score of 0, and the strand. */
+{
+struct hash *geneOfTx = hashTwoColumnFile(txToGeneFile);
+struct txGraph *graph = txGraphLoadAll(inTxg);
+FILE *bedOut = mustOpen(outBed, "w");
+for ( ; graph != NULL; graph = graph->next)
+    {
+    char *geneName = pickGene(graph, geneOfTx);
+    fprintf(bedOut, "%s\t%d\t%d\t%s\t0\t%s\n", graph->tName, graph->tStart, graph->tEnd, geneName, graph->strand);
+    }
+carefulClose(&bedOut);
+}
+
+int main(int argc, char *argv[])
+/* Hand the command line to optionInit, insist on three arguments, then convert. */
+{
+optionInit(&argc, argv, options);
+if (argc - 1 != 3)  // argv[0] plus in.txg, txToGene.tsv and out.bed
+    usage();
+txgToGeneBed(argv[1], argv[2], argv[3]);
+return 0;
+}