3d815b5ddda4f027ed21ddc71995eebec2a8574b kent Wed Jan 20 01:46:40 2021 -0800 This utility to help merge expression matrices across data sets built on different gene models is starting to work. diff --git src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c new file mode 100644 index 0000000..96dbd61 --- /dev/null +++ src/hca/hcaUnifyMatrix/hcaUnifyMatrix.c @@ -0,0 +1,402 @@ +/* hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce + * a unified matrix on one gene set. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "sqlNum.h" +#include "rangeTree.h" +#include "vMatrix.h" +#include "obscure.h" +#include "../../hg/inc/txGraph.h" +#include "../../hg/inc/bed12Source.h" +#include "../../hg/inc/bed.h" + +#define BED12_NAME2_SIZE 13 +#define BED_NAME_FIELD 3 + +double emptyVal = -1; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce a unified matrix on one gene set\n" + "usage:\n" + " hcaUnifyMatrix graph.txg inList.tsv outMatrix.tsv\n" + "where\n" + " graph.txg is made with txBedToGraph on the mapping.beds in the input\n" + " inList.tsv has one line per input we are unifying. It is tab separated with columns:\n" + " 1 - name of input to add to column labels\n" + " 2 - relative priority, lower numbers get attended to first in gene mapping\n" + " 3 - input mapping bed file, often result of gencodeVersionForGenes -bed output\n" + " 4 - input expression matrix file in tsv format\n" + "options:\n" + " -bed=mapping.bed\n" + " -empty=val - store val (rather than default -1) to show missing data\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"bed", OPTION_STRING}, + {"val", OPTION_DOUBLE}, + {NULL, 0}, +}; + +struct uniInput +/* A single input to our unification process. */ + { + struct uniInput *next; /* Next in singly linked list. */ + char *name; /* Short, will be reused as label prefix */ + int priority; /* 1 is attended to first, then 2, etc. */ + char *mappingBed; /* Mapping bed file name */ + char *matrixFile; /* Matrix file name */ + struct hash *rowHash; /* keyed by 4th field, values char** */ + struct hash *bedHash; /* Also keyed by 4th field */ + struct slRef *rowList; /* List of all parsed out rows as char** */ + struct memMatrix *matrix; /* Loaded matrixFile */ + struct hash *matrixHash; /* Matrix Y value keyed by gene name */ + }; + + +struct uniInput *uniInputLoad(char **row) +/* Load a uniInput from row fetched with select * from uniInput + * from database. Dispose of this with uniInputFree(). */ +{ +struct uniInput *ret; + +AllocVar(ret); +ret->name = cloneString(row[0]); +ret->priority = sqlSigned(row[1]); +ret->mappingBed = cloneString(row[2]); +ret->matrixFile = cloneString(row[3]); +return ret; +} + +struct uniInput *uniInputLoadAll(char *fileName) +/* Load all uniInput from a whitespace-separated file. + * Dispose of this with uniInputFreeList(). */ +{ +struct uniInput *list = NULL, *el; +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *row[4]; + +while (lineFileRow(lf, row)) + { + el = uniInputLoad(row); + slAddHead(&list, el); + } +lineFileClose(&lf); +slReverse(&list); +return list; +} + +void loadMappingBed(struct uniInput *uni) +/* Load up a mapping bed file and make up row and bed hashes from it */ +{ +struct hash *rowHash = uni->rowHash = hashNew(0); +struct hash *bedHash = uni->bedHash = hashNew(0); +struct lm *lm = rowHash->lm; + +struct lineFile *lf = lineFileOpen(uni->mappingBed, TRUE); +char *row[BED12_NAME2_SIZE]; +while (lineFileRow(lf, row)) + { + char *name = row[BED_NAME_FIELD]; + struct bed12Source *bed = bed12SourceLoad(row); + hashAdd(bedHash, name, bed); + char **rowCopy = lmCloneRow(lm, row, ArraySize(row)); + hashAdd(rowHash, name, rowCopy); + refAdd(&uni->rowList, rowCopy); + } +slReverse(&uni->rowList); +lineFileClose(&lf); +} + +void loadMatrix(struct uniInput *uni) +/* Load up a mapping bed file and make up row and bed hashes from it */ +{ +struct memMatrix *matrix = uni->matrix = memMatrixFromTsv(uni->matrixFile); +struct hash *hash = uni->matrixHash = hashNew(0); +int y; +for (y=0; y<matrix->ySize; ++y) + hashAddInt(hash, matrix->yLabels[y], y); +} + +int countDistinctTypes(struct txGraph *gList) +/* Just count up how many distinct source types we got to track in all graphs*/ +{ +/* Figure up how many distinct sources we have */ +struct hash *uniq = hashNew(0); +struct txGraph *g; +for (g = gList; g != NULL; g = g->next) + { + int i; + for (i=0; i<g->sourceCount; ++i) + { + char *typeName = g->sources[i].type; + hashStore(uniq, typeName); + } + } +int typeCount = uniq->elCount; +hashFree(&uniq); +return typeCount; +} + +int countDistinctSourceTypes(struct txGraph *g) +/* Count up number of sources in one graph */ +{ +struct hash *typeHash = hashNew(0); +struct txSource *sources = g->sources; +int count = g->sourceCount; +int i; +for (i=0; i<count; ++i) + { + char *typeName = sources[i].type; + hashStore(typeHash, typeName); + } +int retVal = typeHash->elCount; +hashFree(&typeHash); +return retVal; +} + + +struct rbTree *rangeTreeOnBed(struct bed *bed) +/* Return an rbTree holding all the blocks of the bed */ +{ +struct rbTree *t = rangeTreeNew(); +int i; +for (i=0; i<bed->blockCount; ++i) + { + int start = bed->chromStart + bed->chromStarts[i]; + int end = start + bed->blockSizes[i]; + rangeTreeAdd(t, start, end); + } +return t; +} + +int bedBedOverlap(struct bed *a, struct bed *b) +/* Return amount of overlap at exon level between a and b */ +{ +if (!sameString(a->chrom, b->chrom)) + return 0; +if (a->strand[0] != b->strand[0]) + return 0; +int overlap = rangeIntersection(a->chromStart, a->chromEnd, b->chromStart, b->chromEnd); +if (overlap <= 0) + return 0; + +/* Ok, go do the slow things of building up a range tree. We could write some code + * that is a little complex like a merge sort to avoid this, maybe some day if this + * is too slow. */ +struct rbTree *t = rangeTreeOnBed(a); +int totalOverlap = 0; +int i; +for (i=0; i<b->blockCount; ++i) + { + int start = b->chromStart + b->chromStarts[i]; + int end = start + b->blockSizes[i]; + totalOverlap += rangeTreeOverlapSize(t, start, end); + } +rangeTreeFree(&t); +return totalOverlap; +} + +char *findClosestOfType(struct txGraph *g, struct uniInput *type, struct bed *bed) +/* Find the accession of the source of given type that overlaps most with bed */ +{ +struct txSource *txSource = g->sources; +int i; +int bestOverlap = 0; +char *bestAcc = NULL; +for (i=0; i<g->sourceCount; ++i) + { + if (sameString(type->mappingBed, txSource->type)) + { + struct bed *other = hashMustFindVal(type->bedHash, txSource->accession); + int overlap = bedBedOverlap(bed, other); + if (overlap > bestOverlap) + { + bestOverlap = overlap; + bestAcc = txSource->accession; + } + } + txSource += 1; + } +return bestAcc; +} + +struct txSource *findBestSource(struct txGraph *g, struct bed *bestBed, struct uniInput *uni) +/* Go through all transcripts belonging to txSource in g, and try to find for them + * the best mate among the chosen type in the very same g. This will output no + * more than one mate for each element of bestType and of ourType */ +{ +char *ourType = uni->mappingBed; +int i; +struct txSource *bestSource = NULL; +int bestOverlap = 0; +struct txSource *txSource = g->sources; +for (i=0; i<g->sourceCount; ++i) + { + if (sameString(txSource->type, ourType)) + { + char *ourAcc = txSource->accession; + struct bed *ourBed = hashMustFindVal(uni->bedHash, ourAcc); + int overlap = bedBedOverlap(bestBed, ourBed); + if (overlap > bestOverlap) + { + bestOverlap = overlap; + bestSource = txSource; + } + } + txSource += 1; + } +return bestSource; +} + + +void writeOneGraph(struct txGraph *g, struct uniInput *uniList, struct hash *uniHash, + FILE *f, FILE *bedF) +/* Here we write out one graph. Normally just write in one piece, but + * we will at least snoop for those that need breaking up */ +{ +/* Find best txSource and it's associated type. */ +int i; +struct txSource *bestTxSource = NULL; +int bestPriority = BIGNUM; // priority is better at 1 than 2 +for (i=0; i<g->sourceCount; ++i) + { + struct txSource *txSource = &g->sources[i]; + struct uniInput *uni = hashMustFindVal(uniHash, txSource->type); + if (bestPriority > uni->priority) + { + bestPriority = uni->priority; + bestTxSource = txSource; + } + txSource += 1; + } +char *bestType = bestTxSource->type; +struct uniInput *bestUni = hashMustFindVal(uniHash, bestType); +struct hash *rowHash = bestUni->rowHash; +fprintf(f, "%s", bestTxSource->accession); + +/* Now go through and output all from that type */ +for (i=0; i<g->sourceCount; ++i) + { + struct txSource *txSource = &g->sources[i]; + if (sameString(txSource->type, bestType)) + { + if (bedF != NULL) + { + char **row = hashMustFindVal(rowHash, txSource->accession); + writeTsvRow(bedF, BED12_NAME2_SIZE, row); + } + + /* Output matrix */ + struct uniInput *uni; + struct bed *bestBed = hashMustFindVal(bestUni->bedHash, txSource->accession); + for (uni = uniList; uni != NULL; uni = uni->next) + { + if (bestBed != bestBed) uglyAbort("Absurd!"); + struct txSource *ourSource = findBestSource(g, bestBed, uni); + if (ourSource != NULL) + { + char *geneName = ourSource->accession; + struct hashEl *hel = hashLookup(uni->matrixHash, geneName); + if (hel == NULL) + errAbort("Can't find %s in %s but it is in %s\n", geneName, uni->matrixFile, ourSource->type); + int y = ptToInt(hel->val); + int x; + for (x=0; x<uni->matrix->xSize; ++x) + { + fprintf(f, "\t%g", uni->matrix->rows[y][x]); + } + } + else + { + int x; + for (x=0; x<uni->matrix->xSize; ++x) + { + fprintf(f, "\t%g", emptyVal); + } + } + } + fprintf(f, "\n"); + } + } +} + +void hcaUnifyMatrix(char *graphInput, char *inList, char *outMatrix, char *outBed) +/* hcaUnifyMatrix - Given a list of matrices on maybe different gene sets produce a + * unified matrix on one gene set. */ +{ +struct txGraph *g, *gList = txGraphLoadAll(graphInput); +verbose(1, "Read %d graphs from %s\n", slCount(gList), graphInput); + +int typeCount = countDistinctTypes(gList); +int minSources = typeCount/2; +verbose(1, "%d distinct sources of evidence in %s\n", typeCount, graphInput); + + +struct uniInput *uni, *uniList = uniInputLoadAll(inList); +struct hash *uniHash = hashNew(0); +verbose(1, "Read %d inputs from %s\n", slCount(uniList), inList); +int totalColumns = 0; +for (uni = uniList; uni != NULL; uni = uni->next) + { + hashAdd(uniHash, uni->mappingBed, uni); + loadMappingBed(uni); + verbose(1, "%d genes in %s\n", uni->bedHash->elCount, uni->mappingBed); + loadMatrix(uni); + verbose(1, "%d genes and %d samples in %s\n", + uni->matrix->ySize, uni->matrix->xSize, uni->matrixFile); + totalColumns += uni->matrix->xSize; + } +verbose(1, "Output will be %d columns\n", totalColumns); + +/* Open output and write out header row */ +FILE *f = mustOpen(outMatrix, "w"); +fprintf(f, "#hcaUnifyMatrix"); +for (uni = uniList; uni != NULL; uni = uni->next) + { + struct memMatrix *mm = uni->matrix; + int x; + for (x=0; x<mm->xSize; ++x) + { + fprintf(f, "\t%s %s", uni->name, mm->xLabels[x]); + } + } +fprintf(f, "\n"); + +FILE *bedF = NULL; +if (outBed != NULL) + bedF = mustOpen(outBed, "w"); + +/* Go through outputting matrix and bed rows */ +for (g = gList; g != NULL; g = g->next) + { + /* We only want ones with at least a certain minimum of evidence */ + if (g->sourceCount >= minSources) + { + int typeCount = countDistinctSourceTypes(g); + if (typeCount >= minSources) + { + writeOneGraph(g, uniList, uniHash, f, bedF); + } + } + } +carefulClose(&f); +carefulClose(&bedF); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 4) + usage(); +emptyVal = optionDouble("empty", -1); +hcaUnifyMatrix(argv[1], argv[2], argv[3], optionVal("bed", NULL)); +return 0; +}