bfd7d15166c13c243f9d9a4bf1d6d50f168c4920 kent Wed Jan 20 09:14:39 2021 -0800 Adding trackDb and stats options. Also adding scale factor per dataset. diff --git src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c index 96dbd61..b02a8f6 100644 --- src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c +++ src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c @@ -22,77 +22,87 @@ { errAbort( "hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce a unified matrix on one gene set\n" "usage:\n" " hcaUnifyMatrix graph.txg inList.tsv outMatrix.tsv\n" "where\n" " graph.txg is made with txBedToGraph on the mapping.beds in the input\n" " inList.tsv has one line per input we are unifying. It is tab separated with columns:\n" " 1 - name of input to add to column labels\n" " 2 - relative priority, lower numbers get attended to first in gene mapping\n" " 3 - input mapping bed file, often result of gencodeVersionForGenes -bed output\n" " 4 - input expression matrix file in tsv format\n" "options:\n" " -bed=mapping.bed\n" " -empty=val - store val (rather than default -1) to show missing data\n" + " -trackDb=stanza.ra - save trackDb stanza here\n" + " -stats=stats.tsv - save merged stats file here\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, - {"val", OPTION_DOUBLE}, + {"empty", OPTION_DOUBLE}, + {"trackDb", OPTION_STRING}, + {"stats", OPTION_STRING}, {NULL, 0}, }; struct uniInput /* A single input to our unification process. */ { struct uniInput *next; /* Next in singly linked list. */ char *name; /* Short, will be reused as label prefix */ int priority; /* 1 is attended to first, then 2, etc. */ + double scaleOut; /* Everything gets multiplied by this in the output */ char *mappingBed; /* Mapping bed file name */ char *matrixFile; /* Matrix file name */ + char *colorFile; /* File it first column sample, last color */ + char *statsFile; /* File stats are in */ struct hash *rowHash; /* keyed by 4th field, values char** */ struct hash *bedHash; /* Also keyed by 4th field */ struct slRef *rowList; /* List of all parsed out rows as char** */ struct memMatrix *matrix; /* Loaded matrixFile */ struct hash *matrixHash; /* Matrix Y value keyed by gene name */ }; struct uniInput *uniInputLoad(char **row) /* Load a uniInput from row fetched with select * from uniInput * from database. Dispose of this with uniInputFree(). */ { struct uniInput *ret; AllocVar(ret); ret->name = cloneString(row[0]); ret->priority = sqlSigned(row[1]); -ret->mappingBed = cloneString(row[2]); -ret->matrixFile = cloneString(row[3]); +ret->scaleOut = sqlDouble(row[2]); +ret->mappingBed = cloneString(row[3]); +ret->matrixFile = cloneString(row[4]); +ret->colorFile = cloneString(row[5]); +ret->statsFile = cloneString(row[6]); return ret; } struct uniInput *uniInputLoadAll(char *fileName) /* Load all uniInput from a whitespace-separated file. * Dispose of this with uniInputFreeList(). */ { struct uniInput *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); -char *row[4]; +char *row[7]; while (lineFileRow(lf, row)) { el = uniInputLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } void loadMappingBed(struct uniInput *uni) /* Load up a mapping bed file and make up row and bed hashes from it */ { struct hash *rowHash = uni->rowHash = hashNew(0); @@ -244,30 +254,88 @@ char *ourAcc = txSource->accession; struct bed *ourBed = hashMustFindVal(uni->bedHash, ourAcc); int overlap = bedBedOverlap(bestBed, ourBed); if (overlap > bestOverlap) { bestOverlap = overlap; bestSource = txSource; } } txSource += 1; } return bestSource; } +void outputTrackDb(char *fileName, struct uniInput *uniList) +/* Write out a trackDb stanza for us */ +{ +FILE *f = mustOpen(fileName, "w"); +fprintf(f, "track uniBarChart\n"); +fprintf(f, "type bigBarChart\n"); +struct dyString *bars = dyStringNew(0), *colors = dyStringNew(0); +dyStringAppend(bars, "barChartBars"); +dyStringAppend(colors, "barChartColors"); +struct uniInput *uni; +for (uni = uniList; uni != NULL; uni = uni->next) + { + struct lineFile *lf = lineFileOpen(uni->colorFile, TRUE); + char *line; + while (lineFileNextReal(lf, &line)) + { + char *firstWord = nextTabWord(&line); + char *lastWord = NULL; + char *word; + while ((word = nextTabWord(&line)) != NULL) + lastWord = word; + subChar(firstWord, ' ', '_'); // Oh no, underbars! + dyStringPrintf(bars, " %s_%s", uni->name, firstWord); + dyStringPrintf(colors, " %s", lastWord); + } + lineFileClose(&lf); + } +fprintf(f, "%s\n", bars->string); +fprintf(f, "%s\n", colors->string); +carefulClose(&f); +} + +void outputStats(char *fileName, struct uniInput *uniList) +/* Write out a trackDb stanza for us */ +{ +FILE *f = mustOpen(fileName, "w"); +struct uniInput *uni; +for (uni = uniList; uni != NULL; uni = uni->next) + { + struct lineFile *lf = lineFileOpen(uni->statsFile, TRUE); + char *line; + while (lineFileNext(lf, &line, NULL)) + { + char *start = skipLeadingSpaces(line); + if (start[0] == '#' || start[0] == 0) + fprintf(f, "%s\n", line); + else + { + fprintf(f, "%s %s\n", uni->name, line); + } + } + lineFileClose(&lf); + } +carefulClose(&f); +} + + + void writeOneGraph(struct txGraph *g, struct uniInput *uniList, struct hash *uniHash, FILE *f, FILE *bedF) /* Here we write out one graph. Normally just write in one piece, but * we will at least snoop for those that need breaking up */ { /* Find best txSource and it's associated type. */ int i; struct txSource *bestTxSource = NULL; int bestPriority = BIGNUM; // priority is better at 1 than 2 for (i=0; i<g->sourceCount; ++i) { struct txSource *txSource = &g->sources[i]; struct uniInput *uni = hashMustFindVal(uniHash, txSource->type); if (bestPriority > uni->priority) { @@ -298,31 +366,31 @@ struct bed *bestBed = hashMustFindVal(bestUni->bedHash, txSource->accession); for (uni = uniList; uni != NULL; uni = uni->next) { if (bestBed != bestBed) uglyAbort("Absurd!"); struct txSource *ourSource = findBestSource(g, bestBed, uni); if (ourSource != NULL) { char *geneName = ourSource->accession; struct hashEl *hel = hashLookup(uni->matrixHash, geneName); if (hel == NULL) errAbort("Can't find %s in %s but it is in %s\n", geneName, uni->matrixFile, ourSource->type); int y = ptToInt(hel->val); int x; for (x=0; x<uni->matrix->xSize; ++x) { - fprintf(f, "\t%g", uni->matrix->rows[y][x]); + fprintf(f, "\t%g", uni->scaleOut*uni->matrix->rows[y][x]); } } else { int x; for (x=0; x<uni->matrix->xSize; ++x) { fprintf(f, "\t%g", emptyVal); } } } fprintf(f, "\n"); } } } @@ -376,27 +444,39 @@ /* Go through outputting matrix and bed rows */ for (g = gList; g != NULL; g = g->next) { /* We only want ones with at least a certain minimum of evidence */ if (g->sourceCount >= minSources) { int typeCount = countDistinctSourceTypes(g); if (typeCount >= minSources) { writeOneGraph(g, uniList, uniHash, f, bedF); } } } carefulClose(&f); carefulClose(&bedF); + +char *trackDb = optionVal("trackDb", NULL); +if (trackDb != NULL) + { + outputTrackDb(trackDb, uniList); + } + +char *stats = optionVal("stats", NULL); +if (stats != NULL) + { + outputStats(stats, uniList); + } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); emptyVal = optionDouble("empty", -1); hcaUnifyMatrix(argv[1], argv[2], argv[3], optionVal("bed", NULL)); return 0; }