src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c 0e9604766086b49eecef30226e15cea726cc14b6

0e9604766086b49eecef30226e15cea726cc14b6
kent
  Wed Nov 24 14:03:14 2021 -0800
Updated usage message, which was missing a new column.

diff --git src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c
index 2346a8b..422917a 100644
--- src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c
+++ src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c
@@ -1,482 +1,483 @@
 /* hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce 
  * a unified matrix on one gene set. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "sqlNum.h"
 #include "rangeTree.h"
 #include "vMatrix.h"
 #include "obscure.h"
 #include "../../hg/inc/txGraph.h"
 #include "../../hg/inc/bed12Source.h"
 #include "../../hg/inc/bed.h"
 
 #define BED12_NAME2_SIZE 13
 #define BED_NAME_FIELD 3
 
 double emptyVal = -1;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce a unified matrix on one gene set\n"
   "usage:\n"
   "   hcaUnifyMatrix graph.txg inList.tsv outMatrix.tsv\n"
   "where\n"
   "   graph.txg is made with txBedToGraph on the mapping.beds in the input\n"
   "   inList.tsv has one line per input we are unifying.  It is tab separated with columns:\n"
   "       1 - name of input to add to column labels\n"
   "       2 - relative priority, lower numbers get attended to first in gene mapping\n"
-  "       3 - input mapping bed file, often result of gencodeVersionForGenes -bed output\n"
-  "       4 - input expression matrix file in tsv format\n"
+  "       3 - scale - number used to scale this input relative to others\n"
+  "       4 - input mapping bed file, often result of gencodeVersionForGenes -bed output\n"
+  "       5 - input expression matrix file in tsv format\n"
   "options:\n"
   "   -bed=mapping.bed\n"
   "   -empty=val - store val (rather than default -1) to show missing data\n"
   "   -trackDb=stanza.ra - save trackDb stanza here\n"
   "   -stats=stats.tsv - save merged stats file here\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"bed", OPTION_STRING},
    {"empty", OPTION_DOUBLE},
    {"trackDb", OPTION_STRING},
    {"stats", OPTION_STRING},
    {NULL, 0},
 };
 
 struct uniInput
 /* A single input to our unification process. */
     {
     struct uniInput *next;  /* Next in singly linked list. */
     char *name;	/* Short, will be reused as label prefix */
     int priority;	/* 1 is attended to first, then 2, etc.  */
     double scaleOut;	/* Everything gets multiplied by this in the output */
     char *mappingBed;	/* Mapping bed file name */
     char *matrixFile;	/* Matrix file name */
     char *colorFile;	/* File it first column sample, last color */
     char *statsFile;	/* File stats are in */
     struct hash *rowHash;	/* keyed by 4th field, values char**  */
     struct hash *bedHash;	/* Also keyed by 4th field */
     struct slRef *rowList;	/* List of all parsed out rows as char** */
     struct memMatrix *matrix;   /* Loaded matrixFile */
     struct hash *matrixHash;	/* Matrix Y value keyed by gene name */
     };
 
 
 struct uniInput *uniInputLoad(char **row)
 /* Load a uniInput from row fetched with select * from uniInput
  * from database.  Dispose of this with uniInputFree(). */
 {
 struct uniInput *ret;
 
 AllocVar(ret);
 ret->name = cloneString(row[0]);
 ret->priority = sqlSigned(row[1]);
 ret->scaleOut = sqlDouble(row[2]);
 ret->mappingBed = cloneString(row[3]);
 ret->matrixFile = cloneString(row[4]);
 ret->colorFile = cloneString(row[5]);
 ret->statsFile = cloneString(row[6]);
 return ret;
 }
 
 struct uniInput *uniInputLoadAll(char *fileName) 
 /* Load all uniInput from a whitespace-separated file.
  * Dispose of this with uniInputFreeList(). */
 {
 struct uniInput *list = NULL, *el;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[7];
 
 while (lineFileRow(lf, row))
     {
     el = uniInputLoad(row);
     slAddHead(&list, el);
     }
 lineFileClose(&lf);
 slReverse(&list);
 return list;
 }
 
 void loadMappingBed(struct uniInput *uni)
 /* Load up a mapping bed file and make up row and bed hashes from it */
 {
 struct hash *rowHash = uni->rowHash = hashNew(0);
 struct hash *bedHash = uni->bedHash = hashNew(0);
 struct lm *lm = rowHash->lm;
 
 struct lineFile *lf = lineFileOpen(uni->mappingBed, TRUE);
 char *row[BED12_NAME2_SIZE];
 while (lineFileRow(lf, row))
     {
     char *name = row[BED_NAME_FIELD];
     struct bed12Source *bed = bed12SourceLoad(row);
     hashAdd(bedHash, name, bed);
     char **rowCopy = lmCloneRow(lm, row, ArraySize(row));
     hashAdd(rowHash, name, rowCopy);
     refAdd(&uni->rowList, rowCopy);
     }
 slReverse(&uni->rowList);
 lineFileClose(&lf);
 }
 
 void loadMatrix(struct uniInput *uni)
 /* Load up a mapping bed file and make up row and bed hashes from it */
 {
 struct memMatrix *matrix = uni->matrix = memMatrixFromTsv(uni->matrixFile);
 struct hash *hash = uni->matrixHash = hashNew(0);
 int y;
 for (y=0; y<matrix->ySize; ++y)
     hashAddInt(hash, matrix->yLabels[y], y);
 }
 
 int countDistinctTypes(struct txGraph *gList)
 /* Just count up how many distinct source types we got to track in all graphs*/
 {
 /* Figure up how many distinct sources we have */
 struct hash *uniq = hashNew(0);
 struct txGraph *g;
 for (g = gList; g != NULL; g = g->next)
     {
     int i;
     for (i=0; i<g->sourceCount; ++i)
         {
 	char *typeName = g->sources[i].type;
 	hashStore(uniq, typeName);
 	}
     }
 int typeCount = uniq->elCount;
 hashFree(&uniq);
 return typeCount;
 }
 
 int countDistinctSourceTypes(struct txGraph *g)
 /* Count up number of sources in one graph */
 {
 struct hash *typeHash = hashNew(0);
 struct txSource *sources = g->sources;
 int count = g->sourceCount;
 int i;
 for (i=0; i<count; ++i)
     {
     char *typeName = sources[i].type;
     hashStore(typeHash, typeName);
     }
 int retVal = typeHash->elCount;
 hashFree(&typeHash);
 return retVal;
 }
 
 
 struct rbTree *rangeTreeOnBed(struct bed *bed)
 /* Return an rbTree holding all the blocks of the bed */
 {
 struct rbTree *t = rangeTreeNew();
 int i;
 for (i=0; i<bed->blockCount; ++i)
     {
     int start = bed->chromStart + bed->chromStarts[i];
     int end = start + bed->blockSizes[i];
     rangeTreeAdd(t, start, end);
     }
 return t;
 }
 
 int bedBedOverlap(struct bed *a, struct bed *b)
 /* Return amount of overlap at exon level between a and b */
 {
 if (!sameString(a->chrom, b->chrom))
     return 0;
 if (a->strand[0] != b->strand[0])
     return 0;
 int overlap = rangeIntersection(a->chromStart, a->chromEnd, b->chromStart, b->chromEnd);
 if (overlap <= 0)
     return 0;
 
 /* Ok, go do the slow things of building up a range tree.  We could write some code
  * that is a little complex like a merge sort to avoid this, maybe some day if this 
  * is too slow. */
 struct rbTree *t = rangeTreeOnBed(a);
 int totalOverlap = 0;
 int i;
 for (i=0; i<b->blockCount; ++i)
     {
     int start = b->chromStart + b->chromStarts[i];
     int end = start + b->blockSizes[i];
     totalOverlap += rangeTreeOverlapSize(t, start, end);
     }
 rangeTreeFree(&t);
 return totalOverlap;
 }
 
 char *findClosestOfType(struct txGraph *g, struct uniInput *type, struct bed *bed)
 /* Find the accession of the source of given type that overlaps most with bed */
 {
 struct txSource *txSource = g->sources;
 int i;
 int bestOverlap = 0;
 char *bestAcc = NULL;
 for (i=0; i<g->sourceCount; ++i)
     {
     if (sameString(type->mappingBed, txSource->type))
         {
 	struct bed *other = hashMustFindVal(type->bedHash, txSource->accession);
 	int overlap = bedBedOverlap(bed, other);
 	if (overlap > bestOverlap)
 	    {
 	    bestOverlap = overlap;
 	    bestAcc = txSource->accession;
 	    }
 	}
     txSource += 1;
     }
 return bestAcc;
 }
 
 struct txSource *findBestSource(struct txGraph *g, struct bed *bestBed, struct uniInput *uni)
 /* Go through all transcripts belonging to txSource in g, and try to find for them
  * the best mate among the chosen type in the very same g.  This will output no
  * more than one mate for each element of bestType and of ourType */
 {
 char *ourType = uni->mappingBed;
 int i;
 struct txSource *bestSource = NULL;
 int bestOverlap = 0;
 struct txSource *txSource = g->sources;
 for (i=0; i<g->sourceCount; ++i)
     {
     if (sameString(txSource->type, ourType))
         {
 	char *ourAcc = txSource->accession;
 	struct bed *ourBed = hashMustFindVal(uni->bedHash, ourAcc);
 	int overlap = bedBedOverlap(bestBed, ourBed);
 	if (overlap > bestOverlap)
 	    {
 	    bestOverlap = overlap;
 	    bestSource = txSource;
 	    }
 	}
     txSource += 1;
     }
 return bestSource;
 }
 
 
 void outputTrackDb(char *fileName, struct uniInput *uniList)
 /* Write out a trackDb stanza for us */
 {
 FILE *f = mustOpen(fileName, "w");
 fprintf(f, "track uniBarChart\n");
 fprintf(f, "type bigBarChart\n");
 struct dyString *bars = dyStringNew(0), *colors = dyStringNew(0);
 dyStringAppend(bars, "barChartBars");
 dyStringAppend(colors, "barChartColors");
 struct uniInput *uni;
 for (uni = uniList; uni != NULL; uni = uni->next)
     {
     struct lineFile *lf = lineFileOpen(uni->colorFile, TRUE);
     char *line;
     while (lineFileNextReal(lf, &line))
         {
 	char *firstWord = nextTabWord(&line);
 	char *lastWord = NULL;
 	char *word;
 	while ((word = nextTabWord(&line)) != NULL)
 	     lastWord = word;
 	subChar(firstWord, ' ', '_');	// Oh no, underbars!
 	dyStringPrintf(bars, " %s_%s", uni->name, firstWord);
 	dyStringPrintf(colors, " %s", lastWord);
 	}
     lineFileClose(&lf);
     }
 fprintf(f, "%s\n", bars->string);
 fprintf(f, "%s\n", colors->string);
 carefulClose(&f);
 }
 
 void outputStats(char *fileName, struct uniInput *uniList)
 /* Write out a trackDb stanza for us */
 {
 FILE *f = mustOpen(fileName, "w");
 struct uniInput *uni;
 for (uni = uniList; uni != NULL; uni = uni->next)
     {
     struct lineFile *lf = lineFileOpen(uni->statsFile, TRUE);
     char *line;
     while (lineFileNext(lf, &line, NULL))
         {
 	char *start = skipLeadingSpaces(line);
 	if (start[0] == '#' || start[0] == 0)
 	    fprintf(f, "%s\n", line);
 	else
 	    {
 	    fprintf(f, "%s %s\n", uni->name, line);
 	    }
 	}
     lineFileClose(&lf);
     }
 carefulClose(&f);
 }
 
 
 
 void writeOneGraph(struct txGraph *g,  struct uniInput *uniList, struct hash *uniHash, 
     FILE *f, FILE *bedF)
 /* Here we write out one graph.  Normally just write in one piece, but
  * we will at least snoop for those that need breaking up */
 {
 verbose(3, "writeOneGraph %s with %d sources\n", g->name, g->sourceCount);
 /* Find best txSource and it's associated type. */
 int i;
 struct txSource *bestTxSource = NULL;
 int bestPriority = BIGNUM;  // priority is better at 1 than 2
 for (i=0; i<g->sourceCount; ++i)
     {
     struct txSource *txSource = &g->sources[i];
     struct uniInput *uni = hashMustFindVal(uniHash, txSource->type);
     if (bestPriority > uni->priority)
         {
 	bestPriority = uni->priority;
 	bestTxSource = txSource;
 	}
     txSource += 1;
     }
 char *bestType = bestTxSource->type;
 struct uniInput *bestUni = hashMustFindVal(uniHash, bestType);
 struct hash *rowHash = bestUni->rowHash;
 fprintf(f, "%s", bestTxSource->accession);
 
 /* Now go through and output all from that type */
 for (i=0; i<g->sourceCount; ++i)
     {
     struct txSource *txSource = &g->sources[i];
     if (sameString(txSource->type, bestType))
         {
 	if (bedF != NULL)
 	    {
 	    char **row = hashMustFindVal(rowHash, txSource->accession);
 	    writeTsvRow(bedF, BED12_NAME2_SIZE, row);
 	    }
 
 	/* Output matrix */
 	struct uniInput *uni;
 	struct bed *bestBed = hashMustFindVal(bestUni->bedHash, txSource->accession);
 	for (uni = uniList; uni != NULL; uni = uni->next)
 	    {
 	    struct txSource *ourSource = findBestSource(g, bestBed, uni);
 	    if (ourSource != NULL)
 	        {
 		char *geneName = ourSource->accession;
 		struct hashEl *hel = hashLookup(uni->matrixHash, geneName);
 		if (hel == NULL)
 		    errAbort("Can't find %s in %s but it is in %s\n", geneName, uni->matrixFile, ourSource->type);
 		int y = ptToInt(hel->val);
 		int x;
 		for (x=0; x<uni->matrix->xSize; ++x)
 		    {
 		    fprintf(f, "\t%g", uni->scaleOut*uni->matrix->rows[y][x]);
 		    }
 		}
 	    else
 	        {
 		int x;
 		for (x=0; x<uni->matrix->xSize; ++x)
 		    {
 		    fprintf(f, "\t%g", emptyVal);
 		    }
 		}
 	    }
 	fprintf(f, "\n");
 	}
     }
 }
 
 void hcaUnifyMatrix(char *graphInput, char *inList, char *outMatrix, char *outBed)
 /* hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce a 
  * unified matrix on one gene set. */
 {
 struct txGraph *g, *gList = txGraphLoadAll(graphInput);
 verbose(1, "Read %d graphs from %s\n", slCount(gList), graphInput);
 
 int typeCount = countDistinctTypes(gList);
 int minSources = typeCount/2;
 verbose(1, "%d distinct sources of evidence in %s\n", typeCount, graphInput);
 
 
 struct uniInput *uni, *uniList = uniInputLoadAll(inList);
 struct hash *uniHash = hashNew(0);
 verbose(1, "Read %d inputs from %s\n", slCount(uniList), inList);
 int totalColumns = 0;
 for (uni = uniList; uni != NULL; uni = uni->next)
     {
     hashAdd(uniHash, uni->mappingBed, uni);
     loadMappingBed(uni);
     verbose(2, "%d genes in %s\n", uni->bedHash->elCount, uni->mappingBed);
     loadMatrix(uni);
     verbose(2, "%d genes and %d samples in %s\n", 
 	uni->matrix->ySize, uni->matrix->xSize, uni->matrixFile);
     totalColumns += uni->matrix->xSize;
     }
 verbose(1, "Output will be %d columns\n", totalColumns);
 
 /* Open output and write out header row */
 FILE *f = mustOpen(outMatrix, "w");
 fprintf(f, "#hcaUnifyMatrix");
 for (uni = uniList; uni != NULL; uni = uni->next)
     {
     struct memMatrix *mm = uni->matrix;
     int x;
     for (x=0; x<mm->xSize; ++x)
         {
 	fprintf(f, "\t%s %s", uni->name, mm->xLabels[x]);
 	}
     }
 fprintf(f, "\n");
 
 FILE *bedF = NULL;
 if (outBed != NULL)
     bedF = mustOpen(outBed, "w");
 
 /* Go through outputting matrix and bed rows */
 for (g = gList; g != NULL; g = g->next)
     {
     /* We only want ones with at least a certain minimum of evidence */
     if (g->sourceCount >= minSources)
 	{
 	int typeCount = countDistinctSourceTypes(g);
 	if (typeCount >= minSources)
 	   {
 	   writeOneGraph(g, uniList, uniHash, f, bedF);
 	   }
 	}
     }
 carefulClose(&f);
 carefulClose(&bedF);
 
 char *trackDb = optionVal("trackDb", NULL);
 if (trackDb != NULL)
     {
     outputTrackDb(trackDb, uniList);
     }
 
 char *stats = optionVal("stats", NULL);
 if (stats != NULL)
     {
     outputStats(stats, uniList);
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 emptyVal = optionDouble("empty", -1);
 hcaUnifyMatrix(argv[1], argv[2], argv[3], optionVal("bed", NULL));
 return 0;
 }