src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c bfd7d15166c13c243f9d9a4bf1d6d50f168c4920

bfd7d15166c13c243f9d9a4bf1d6d50f168c4920
kent
  Wed Jan 20 09:14:39 2021 -0800
Adding trackDb and stats options.  Also adding scale factor per dataset.

diff --git src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c
index 96dbd61..b02a8f6 100644
--- src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c
+++ src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c
@@ -22,77 +22,87 @@
 {
 errAbort(
   "hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce a unified matrix on one gene set\n"
   "usage:\n"
   "   hcaUnifyMatrix graph.txg inList.tsv outMatrix.tsv\n"
   "where\n"
   "   graph.txg is made with txBedToGraph on the mapping.beds in the input\n"
   "   inList.tsv has one line per input we are unifying.  It is tab separated with columns:\n"
   "       1 - name of input to add to column labels\n"
   "       2 - relative priority, lower numbers get attended to first in gene mapping\n"
   "       3 - input mapping bed file, often result of gencodeVersionForGenes -bed output\n"
   "       4 - input expression matrix file in tsv format\n"
   "options:\n"
   "   -bed=mapping.bed\n"
   "   -empty=val - store val (rather than default -1) to show missing data\n"
+  "   -trackDb=stanza.ra - save trackDb stanza here\n"
+  "   -stats=stats.tsv - save merged stats file here\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"bed", OPTION_STRING},
-   {"val", OPTION_DOUBLE},
+   {"empty", OPTION_DOUBLE},
+   {"trackDb", OPTION_STRING},
+   {"stats", OPTION_STRING},
    {NULL, 0},
 };
 
 struct uniInput
 /* A single input to our unification process. */
     {
     struct uniInput *next;  /* Next in singly linked list. */
     char *name;	/* Short, will be reused as label prefix */
     int priority;	/* 1 is attended to first, then 2, etc.  */
+    double scaleOut;	/* Everything gets multiplied by this in the output */
     char *mappingBed;	/* Mapping bed file name */
     char *matrixFile;	/* Matrix file name */
+    char *colorFile;	/* File it first column sample, last color */
+    char *statsFile;	/* File stats are in */
     struct hash *rowHash;	/* keyed by 4th field, values char**  */
     struct hash *bedHash;	/* Also keyed by 4th field */
     struct slRef *rowList;	/* List of all parsed out rows as char** */
     struct memMatrix *matrix;   /* Loaded matrixFile */
     struct hash *matrixHash;	/* Matrix Y value keyed by gene name */
     };
 
 
 struct uniInput *uniInputLoad(char **row)
 /* Load a uniInput from row fetched with select * from uniInput
  * from database.  Dispose of this with uniInputFree(). */
 {
 struct uniInput *ret;
 
 AllocVar(ret);
 ret->name = cloneString(row[0]);
 ret->priority = sqlSigned(row[1]);
-ret->mappingBed = cloneString(row[2]);
-ret->matrixFile = cloneString(row[3]);
+ret->scaleOut = sqlDouble(row[2]);
+ret->mappingBed = cloneString(row[3]);
+ret->matrixFile = cloneString(row[4]);
+ret->colorFile = cloneString(row[5]);
+ret->statsFile = cloneString(row[6]);
 return ret;
 }
 
 struct uniInput *uniInputLoadAll(char *fileName) 
 /* Load all uniInput from a whitespace-separated file.
  * Dispose of this with uniInputFreeList(). */
 {
 struct uniInput *list = NULL, *el;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
-char *row[4];
+char *row[7];
 
 while (lineFileRow(lf, row))
     {
     el = uniInputLoad(row);
     slAddHead(&list, el);
     }
 lineFileClose(&lf);
 slReverse(&list);
 return list;
 }
 
 void loadMappingBed(struct uniInput *uni)
 /* Load up a mapping bed file and make up row and bed hashes from it */
 {
 struct hash *rowHash = uni->rowHash = hashNew(0);
@@ -244,30 +254,88 @@
 	char *ourAcc = txSource->accession;
 	struct bed *ourBed = hashMustFindVal(uni->bedHash, ourAcc);
 	int overlap = bedBedOverlap(bestBed, ourBed);
 	if (overlap > bestOverlap)
 	    {
 	    bestOverlap = overlap;
 	    bestSource = txSource;
 	    }
 	}
     txSource += 1;
     }
 return bestSource;
 }
 
 
+void outputTrackDb(char *fileName, struct uniInput *uniList)
+/* Write out a trackDb stanza for us */
+{
+FILE *f = mustOpen(fileName, "w");
+fprintf(f, "track uniBarChart\n");
+fprintf(f, "type bigBarChart\n");
+struct dyString *bars = dyStringNew(0), *colors = dyStringNew(0);
+dyStringAppend(bars, "barChartBars");
+dyStringAppend(colors, "barChartColors");
+struct uniInput *uni;
+for (uni = uniList; uni != NULL; uni = uni->next)
+    {
+    struct lineFile *lf = lineFileOpen(uni->colorFile, TRUE);
+    char *line;
+    while (lineFileNextReal(lf, &line))
+        {
+	char *firstWord = nextTabWord(&line);
+	char *lastWord = NULL;
+	char *word;
+	while ((word = nextTabWord(&line)) != NULL)
+	     lastWord = word;
+	subChar(firstWord, ' ', '_');	// Oh no, underbars!
+	dyStringPrintf(bars, " %s_%s", uni->name, firstWord);
+	dyStringPrintf(colors, " %s", lastWord);
+	}
+    lineFileClose(&lf);
+    }
+fprintf(f, "%s\n", bars->string);
+fprintf(f, "%s\n", colors->string);
+carefulClose(&f);
+}
+
+void outputStats(char *fileName, struct uniInput *uniList)
+/* Write out a trackDb stanza for us */
+{
+FILE *f = mustOpen(fileName, "w");
+struct uniInput *uni;
+for (uni = uniList; uni != NULL; uni = uni->next)
+    {
+    struct lineFile *lf = lineFileOpen(uni->statsFile, TRUE);
+    char *line;
+    while (lineFileNext(lf, &line, NULL))
+        {
+	char *start = skipLeadingSpaces(line);
+	if (start[0] == '#' || start[0] == 0)
+	    fprintf(f, "%s\n", line);
+	else
+	    {
+	    fprintf(f, "%s %s\n", uni->name, line);
+	    }
+	}
+    lineFileClose(&lf);
+    }
+carefulClose(&f);
+}
+
+
+
 void writeOneGraph(struct txGraph *g,  struct uniInput *uniList, struct hash *uniHash, 
     FILE *f, FILE *bedF)
 /* Here we write out one graph.  Normally just write in one piece, but
  * we will at least snoop for those that need breaking up */
 {
 /* Find best txSource and it's associated type. */
 int i;
 struct txSource *bestTxSource = NULL;
 int bestPriority = BIGNUM;  // priority is better at 1 than 2
 for (i=0; i<g->sourceCount; ++i)
     {
     struct txSource *txSource = &g->sources[i];
     struct uniInput *uni = hashMustFindVal(uniHash, txSource->type);
     if (bestPriority > uni->priority)
         {
@@ -298,31 +366,31 @@
 	struct bed *bestBed = hashMustFindVal(bestUni->bedHash, txSource->accession);
 	for (uni = uniList; uni != NULL; uni = uni->next)
 	    {
 	    if (bestBed != bestBed) uglyAbort("Absurd!");
 	    struct txSource *ourSource = findBestSource(g, bestBed, uni);
 	    if (ourSource != NULL)
 	        {
 		char *geneName = ourSource->accession;
 		struct hashEl *hel = hashLookup(uni->matrixHash, geneName);
 		if (hel == NULL)
 		    errAbort("Can't find %s in %s but it is in %s\n", geneName, uni->matrixFile, ourSource->type);
 		int y = ptToInt(hel->val);
 		int x;
 		for (x=0; x<uni->matrix->xSize; ++x)
 		    {
-		    fprintf(f, "\t%g", uni->matrix->rows[y][x]);
+		    fprintf(f, "\t%g", uni->scaleOut*uni->matrix->rows[y][x]);
 		    }
 		}
 	    else
 	        {
 		int x;
 		for (x=0; x<uni->matrix->xSize; ++x)
 		    {
 		    fprintf(f, "\t%g", emptyVal);
 		    }
 		}
 	    }
 	fprintf(f, "\n");
 	}
     }
 }
@@ -376,27 +444,39 @@
 /* Go through outputting matrix and bed rows */
 for (g = gList; g != NULL; g = g->next)
     {
     /* We only want ones with at least a certain minimum of evidence */
     if (g->sourceCount >= minSources)
 	{
 	int typeCount = countDistinctSourceTypes(g);
 	if (typeCount >= minSources)
 	   {
 	   writeOneGraph(g, uniList, uniHash, f, bedF);
 	   }
 	}
     }
 carefulClose(&f);
 carefulClose(&bedF);
+
+char *trackDb = optionVal("trackDb", NULL);
+if (trackDb != NULL)
+    {
+    outputTrackDb(trackDb, uniList);
+    }
+
+char *stats = optionVal("stats", NULL);
+if (stats != NULL)
+    {
+    outputStats(stats, uniList);
+    }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 emptyVal = optionDouble("empty", -1);
 hcaUnifyMatrix(argv[1], argv[2], argv[3], optionVal("bed", NULL));
 return 0;
 }