0e9604766086b49eecef30226e15cea726cc14b6 kent Wed Nov 24 14:03:14 2021 -0800 Updated usage message, which was missing a new column. diff --git src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c index 2346a8b..422917a 100644 --- src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c +++ src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c @@ -1,482 +1,483 @@ /* hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce * a unified matrix on one gene set. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "sqlNum.h" #include "rangeTree.h" #include "vMatrix.h" #include "obscure.h" #include "../../hg/inc/txGraph.h" #include "../../hg/inc/bed12Source.h" #include "../../hg/inc/bed.h" #define BED12_NAME2_SIZE 13 #define BED_NAME_FIELD 3 double emptyVal = -1; void usage() /* Explain usage and exit. */ { errAbort( "hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce a unified matrix on one gene set\n" "usage:\n" " hcaUnifyMatrix graph.txg inList.tsv outMatrix.tsv\n" "where\n" " graph.txg is made with txBedToGraph on the mapping.beds in the input\n" " inList.tsv has one line per input we are unifying. It is tab separated with columns:\n" " 1 - name of input to add to column labels\n" " 2 - relative priority, lower numbers get attended to first in gene mapping\n" - " 3 - input mapping bed file, often result of gencodeVersionForGenes -bed output\n" - " 4 - input expression matrix file in tsv format\n" + " 3 - scale - number used to scale this input relative to others\n" + " 4 - input mapping bed file, often result of gencodeVersionForGenes -bed output\n" + " 5 - input expression matrix file in tsv format\n" "options:\n" " -bed=mapping.bed\n" " -empty=val - store val (rather than default -1) to show missing data\n" " -trackDb=stanza.ra - save trackDb stanza here\n" " -stats=stats.tsv - save merged stats file here\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, {"empty", OPTION_DOUBLE}, {"trackDb", OPTION_STRING}, {"stats", OPTION_STRING}, {NULL, 0}, }; struct uniInput /* A single input to our unification process. */ { struct uniInput *next; /* Next in singly linked list. */ char *name; /* Short, will be reused as label prefix */ int priority; /* 1 is attended to first, then 2, etc. */ double scaleOut; /* Everything gets multiplied by this in the output */ char *mappingBed; /* Mapping bed file name */ char *matrixFile; /* Matrix file name */ char *colorFile; /* File it first column sample, last color */ char *statsFile; /* File stats are in */ struct hash *rowHash; /* keyed by 4th field, values char** */ struct hash *bedHash; /* Also keyed by 4th field */ struct slRef *rowList; /* List of all parsed out rows as char** */ struct memMatrix *matrix; /* Loaded matrixFile */ struct hash *matrixHash; /* Matrix Y value keyed by gene name */ }; struct uniInput *uniInputLoad(char **row) /* Load a uniInput from row fetched with select * from uniInput * from database. Dispose of this with uniInputFree(). */ { struct uniInput *ret; AllocVar(ret); ret->name = cloneString(row[0]); ret->priority = sqlSigned(row[1]); ret->scaleOut = sqlDouble(row[2]); ret->mappingBed = cloneString(row[3]); ret->matrixFile = cloneString(row[4]); ret->colorFile = cloneString(row[5]); ret->statsFile = cloneString(row[6]); return ret; } struct uniInput *uniInputLoadAll(char *fileName) /* Load all uniInput from a whitespace-separated file. * Dispose of this with uniInputFreeList(). */ { struct uniInput *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[7]; while (lineFileRow(lf, row)) { el = uniInputLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } void loadMappingBed(struct uniInput *uni) /* Load up a mapping bed file and make up row and bed hashes from it */ { struct hash *rowHash = uni->rowHash = hashNew(0); struct hash *bedHash = uni->bedHash = hashNew(0); struct lm *lm = rowHash->lm; struct lineFile *lf = lineFileOpen(uni->mappingBed, TRUE); char *row[BED12_NAME2_SIZE]; while (lineFileRow(lf, row)) { char *name = row[BED_NAME_FIELD]; struct bed12Source *bed = bed12SourceLoad(row); hashAdd(bedHash, name, bed); char **rowCopy = lmCloneRow(lm, row, ArraySize(row)); hashAdd(rowHash, name, rowCopy); refAdd(&uni->rowList, rowCopy); } slReverse(&uni->rowList); lineFileClose(&lf); } void loadMatrix(struct uniInput *uni) /* Load up a mapping bed file and make up row and bed hashes from it */ { struct memMatrix *matrix = uni->matrix = memMatrixFromTsv(uni->matrixFile); struct hash *hash = uni->matrixHash = hashNew(0); int y; for (y=0; yySize; ++y) hashAddInt(hash, matrix->yLabels[y], y); } int countDistinctTypes(struct txGraph *gList) /* Just count up how many distinct source types we got to track in all graphs*/ { /* Figure up how many distinct sources we have */ struct hash *uniq = hashNew(0); struct txGraph *g; for (g = gList; g != NULL; g = g->next) { int i; for (i=0; isourceCount; ++i) { char *typeName = g->sources[i].type; hashStore(uniq, typeName); } } int typeCount = uniq->elCount; hashFree(&uniq); return typeCount; } int countDistinctSourceTypes(struct txGraph *g) /* Count up number of sources in one graph */ { struct hash *typeHash = hashNew(0); struct txSource *sources = g->sources; int count = g->sourceCount; int i; for (i=0; ielCount; hashFree(&typeHash); return retVal; } struct rbTree *rangeTreeOnBed(struct bed *bed) /* Return an rbTree holding all the blocks of the bed */ { struct rbTree *t = rangeTreeNew(); int i; for (i=0; iblockCount; ++i) { int start = bed->chromStart + bed->chromStarts[i]; int end = start + bed->blockSizes[i]; rangeTreeAdd(t, start, end); } return t; } int bedBedOverlap(struct bed *a, struct bed *b) /* Return amount of overlap at exon level between a and b */ { if (!sameString(a->chrom, b->chrom)) return 0; if (a->strand[0] != b->strand[0]) return 0; int overlap = rangeIntersection(a->chromStart, a->chromEnd, b->chromStart, b->chromEnd); if (overlap <= 0) return 0; /* Ok, go do the slow things of building up a range tree. We could write some code * that is a little complex like a merge sort to avoid this, maybe some day if this * is too slow. */ struct rbTree *t = rangeTreeOnBed(a); int totalOverlap = 0; int i; for (i=0; iblockCount; ++i) { int start = b->chromStart + b->chromStarts[i]; int end = start + b->blockSizes[i]; totalOverlap += rangeTreeOverlapSize(t, start, end); } rangeTreeFree(&t); return totalOverlap; } char *findClosestOfType(struct txGraph *g, struct uniInput *type, struct bed *bed) /* Find the accession of the source of given type that overlaps most with bed */ { struct txSource *txSource = g->sources; int i; int bestOverlap = 0; char *bestAcc = NULL; for (i=0; isourceCount; ++i) { if (sameString(type->mappingBed, txSource->type)) { struct bed *other = hashMustFindVal(type->bedHash, txSource->accession); int overlap = bedBedOverlap(bed, other); if (overlap > bestOverlap) { bestOverlap = overlap; bestAcc = txSource->accession; } } txSource += 1; } return bestAcc; } struct txSource *findBestSource(struct txGraph *g, struct bed *bestBed, struct uniInput *uni) /* Go through all transcripts belonging to txSource in g, and try to find for them * the best mate among the chosen type in the very same g. This will output no * more than one mate for each element of bestType and of ourType */ { char *ourType = uni->mappingBed; int i; struct txSource *bestSource = NULL; int bestOverlap = 0; struct txSource *txSource = g->sources; for (i=0; isourceCount; ++i) { if (sameString(txSource->type, ourType)) { char *ourAcc = txSource->accession; struct bed *ourBed = hashMustFindVal(uni->bedHash, ourAcc); int overlap = bedBedOverlap(bestBed, ourBed); if (overlap > bestOverlap) { bestOverlap = overlap; bestSource = txSource; } } txSource += 1; } return bestSource; } void outputTrackDb(char *fileName, struct uniInput *uniList) /* Write out a trackDb stanza for us */ { FILE *f = mustOpen(fileName, "w"); fprintf(f, "track uniBarChart\n"); fprintf(f, "type bigBarChart\n"); struct dyString *bars = dyStringNew(0), *colors = dyStringNew(0); dyStringAppend(bars, "barChartBars"); dyStringAppend(colors, "barChartColors"); struct uniInput *uni; for (uni = uniList; uni != NULL; uni = uni->next) { struct lineFile *lf = lineFileOpen(uni->colorFile, TRUE); char *line; while (lineFileNextReal(lf, &line)) { char *firstWord = nextTabWord(&line); char *lastWord = NULL; char *word; while ((word = nextTabWord(&line)) != NULL) lastWord = word; subChar(firstWord, ' ', '_'); // Oh no, underbars! dyStringPrintf(bars, " %s_%s", uni->name, firstWord); dyStringPrintf(colors, " %s", lastWord); } lineFileClose(&lf); } fprintf(f, "%s\n", bars->string); fprintf(f, "%s\n", colors->string); carefulClose(&f); } void outputStats(char *fileName, struct uniInput *uniList) /* Write out a trackDb stanza for us */ { FILE *f = mustOpen(fileName, "w"); struct uniInput *uni; for (uni = uniList; uni != NULL; uni = uni->next) { struct lineFile *lf = lineFileOpen(uni->statsFile, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { char *start = skipLeadingSpaces(line); if (start[0] == '#' || start[0] == 0) fprintf(f, "%s\n", line); else { fprintf(f, "%s %s\n", uni->name, line); } } lineFileClose(&lf); } carefulClose(&f); } void writeOneGraph(struct txGraph *g, struct uniInput *uniList, struct hash *uniHash, FILE *f, FILE *bedF) /* Here we write out one graph. Normally just write in one piece, but * we will at least snoop for those that need breaking up */ { verbose(3, "writeOneGraph %s with %d sources\n", g->name, g->sourceCount); /* Find best txSource and it's associated type. */ int i; struct txSource *bestTxSource = NULL; int bestPriority = BIGNUM; // priority is better at 1 than 2 for (i=0; isourceCount; ++i) { struct txSource *txSource = &g->sources[i]; struct uniInput *uni = hashMustFindVal(uniHash, txSource->type); if (bestPriority > uni->priority) { bestPriority = uni->priority; bestTxSource = txSource; } txSource += 1; } char *bestType = bestTxSource->type; struct uniInput *bestUni = hashMustFindVal(uniHash, bestType); struct hash *rowHash = bestUni->rowHash; fprintf(f, "%s", bestTxSource->accession); /* Now go through and output all from that type */ for (i=0; isourceCount; ++i) { struct txSource *txSource = &g->sources[i]; if (sameString(txSource->type, bestType)) { if (bedF != NULL) { char **row = hashMustFindVal(rowHash, txSource->accession); writeTsvRow(bedF, BED12_NAME2_SIZE, row); } /* Output matrix */ struct uniInput *uni; struct bed *bestBed = hashMustFindVal(bestUni->bedHash, txSource->accession); for (uni = uniList; uni != NULL; uni = uni->next) { struct txSource *ourSource = findBestSource(g, bestBed, uni); if (ourSource != NULL) { char *geneName = ourSource->accession; struct hashEl *hel = hashLookup(uni->matrixHash, geneName); if (hel == NULL) errAbort("Can't find %s in %s but it is in %s\n", geneName, uni->matrixFile, ourSource->type); int y = ptToInt(hel->val); int x; for (x=0; xmatrix->xSize; ++x) { fprintf(f, "\t%g", uni->scaleOut*uni->matrix->rows[y][x]); } } else { int x; for (x=0; xmatrix->xSize; ++x) { fprintf(f, "\t%g", emptyVal); } } } fprintf(f, "\n"); } } } void hcaUnifyMatrix(char *graphInput, char *inList, char *outMatrix, char *outBed) /* hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce a * unified matrix on one gene set. */ { struct txGraph *g, *gList = txGraphLoadAll(graphInput); verbose(1, "Read %d graphs from %s\n", slCount(gList), graphInput); int typeCount = countDistinctTypes(gList); int minSources = typeCount/2; verbose(1, "%d distinct sources of evidence in %s\n", typeCount, graphInput); struct uniInput *uni, *uniList = uniInputLoadAll(inList); struct hash *uniHash = hashNew(0); verbose(1, "Read %d inputs from %s\n", slCount(uniList), inList); int totalColumns = 0; for (uni = uniList; uni != NULL; uni = uni->next) { hashAdd(uniHash, uni->mappingBed, uni); loadMappingBed(uni); verbose(2, "%d genes in %s\n", uni->bedHash->elCount, uni->mappingBed); loadMatrix(uni); verbose(2, "%d genes and %d samples in %s\n", uni->matrix->ySize, uni->matrix->xSize, uni->matrixFile); totalColumns += uni->matrix->xSize; } verbose(1, "Output will be %d columns\n", totalColumns); /* Open output and write out header row */ FILE *f = mustOpen(outMatrix, "w"); fprintf(f, "#hcaUnifyMatrix"); for (uni = uniList; uni != NULL; uni = uni->next) { struct memMatrix *mm = uni->matrix; int x; for (x=0; xxSize; ++x) { fprintf(f, "\t%s %s", uni->name, mm->xLabels[x]); } } fprintf(f, "\n"); FILE *bedF = NULL; if (outBed != NULL) bedF = mustOpen(outBed, "w"); /* Go through outputting matrix and bed rows */ for (g = gList; g != NULL; g = g->next) { /* We only want ones with at least a certain minimum of evidence */ if (g->sourceCount >= minSources) { int typeCount = countDistinctSourceTypes(g); if (typeCount >= minSources) { writeOneGraph(g, uniList, uniHash, f, bedF); } } } carefulClose(&f); carefulClose(&bedF); char *trackDb = optionVal("trackDb", NULL); if (trackDb != NULL) { outputTrackDb(trackDb, uniList); } char *stats = optionVal("stats", NULL); if (stats != NULL) { outputStats(stats, uniList); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); emptyVal = optionDouble("empty", -1); hcaUnifyMatrix(argv[1], argv[2], argv[3], optionVal("bed", NULL)); return 0; }