be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c new file mode 100644 index 0000000..c82d45a --- /dev/null +++ src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c @@ -0,0 +1,325 @@ +/* clusterMatrixToBarChartBed - Compute a barchart bed file from a gene matrix + * and a gene bed file and a way to cluster samples. */ + +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "localmem.h" +#include "obscure.h" +#include "sqlNum.h" + +boolean clSimple = FALSE; +boolean clMedian = FALSE; +char *clName2 = NULL; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "clusterMatrixToBarChartBed - Compute a barchart bed file from a gene matrix\n" + "and a gene bed file and a way to cluster samples.\n" + "NOTE: consider using matrixClusterColumns and matrixToBarChartBed instead\n" + "usage:\n" + " clusterMatrixToBarChartBed sampleClusters.tsv geneMatrix.tsv geneset.bed output.bed\n" + "where:\n" + " sampleClusters.tsv is a two column tab separated file with sampleId and clusterId\n" + " geneMatrix.tsv has a row for each gene. The first row uses the same sampleId as above\n" + " geneset.bed has the maps the genes in the matrix (from it's first column) to the genome\n" + " geneset.bed needs 6 standard bed fields. Unless name2 is set it also needs a name2\n" + " field as the last field\n" + " output.bed is the resulting bar chart, with one column per cluster\n" + "options:\n" + " -simple - don't store the position of gene in geneMatrix.tsv file in output\n" + " -median - use median (instead of mean)\n" + " -name2=twoColFile.tsv - get name2 from file where first col is same ase geneset.bed's name\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"simple", OPTION_BOOLEAN}, + {"median", OPTION_BOOLEAN}, + {"name2", OPTION_STRING}, + {NULL, 0}, +}; + +void hashSamplesAndClusters(char *tsvFile, + struct hash **retSampleHash, struct hash **retClusterHash) +/* Read two column tsv file into a hash keyed by first column */ +{ +struct hash *sampleHash = hashNew(0); +struct hash *clusterHash = hashNew(0); +char *row[2]; +struct lineFile *lf = lineFileOpen(tsvFile, TRUE); +while (lineFileNextRowTab(lf, row, ArraySize(row)) ) + { + /* Find cluster in cluster hash, if it doesn't exist make it. */ + char *clusterName = row[1]; + struct hashEl *hel = hashLookup(clusterHash, clusterName); + if (hel == NULL) + hel = hashAddInt(clusterHash, clusterName, 1); + else + hel->val = ((char *)hel->val)+1; // Increment hash pointer as per hashIncInt + char *clusterStableName = hel->name; // This is allocated in clusterHash + hashAdd(sampleHash, row[0], clusterStableName); + } +lineFileClose(&lf); +*retSampleHash = sampleHash; +*retClusterHash = clusterHash; +} + +void clusterMatrixToBarChartBed(char *sampleClusters, char *matrixTsv, char *geneBed, char *output) +/* clusterMatrixToBarChartBed - Compute a barchart bed file from a gene matrix + * and a gene bed file and a way to cluster samples. */ +{ +/* Figure out if we need to do medians etc */ +boolean doMedian = clMedian; + +/* Load up the gene set */ +verbose(2, "clusterMatrixToBarChartBed(%s,%s,%s,%s)\n", sampleClusters, matrixTsv, geneBed, output); +int bedRowSize = 0; +struct hash *geneHash = hashTsvBy(geneBed, 3, &bedRowSize); +verbose(2, "%d columns about %d genes in %s\n", bedRowSize, geneHash->elCount, geneBed); + +/* Deal with external gene hash */ +struct hash *nameToName2 = NULL; +if (clName2 != NULL) + { + int colCount = 0; + nameToName2 = hashTsvBy(clName2, 0, &colCount); + if (colCount != 2) + errAbort("Expecting %s to be a two column tab separated file", clName2); + } + +/* Keep track of how many fields gene bed has to have and locate name2 */ +int geneBedMinSize = 7; +int name2Ix = bedRowSize - 1; // Last field if it is in bed +if (clName2 != NULL) + geneBedMinSize -= 1; +if (bedRowSize < geneBedMinSize) + { + if (clName2 == NULL) + errAbort("%s needs to have at least 6 standard BED fields and a name2 field\n", geneBed); + else + errAbort("%s needs to have at least 6 standard BED fields\n", geneBed); + } + + +/* Load up the sample clustering */ +struct hash *sampleHash = NULL, *clusterHash = NULL; +hashSamplesAndClusters(sampleClusters, &sampleHash, &clusterHash); +int clusterCount = clusterHash->elCount; +verbose(1, "%d samples and %d clusters in %s\n", sampleHash->elCount, clusterCount, + sampleClusters); +if (clusterCount <= 1 || clusterCount >= 10000) + errAbort("%d is not a good number of clusters", clusterCount); +double clusterTotal[clusterCount]; +int clusterElements[clusterCount]; + +/* Alphabetize cluster names */ +char *clusterNames[clusterCount]; +struct hashEl *hel; +struct hashCookie cookie = hashFirst(clusterHash); +int clusterIx = 0; +while ((hel = hashNext(&cookie)) != NULL) + { + clusterNames[clusterIx] = hel->name; + ++clusterIx; + } +sortStrings(clusterNames, clusterCount); +verbose(2, "%s to %s\n", clusterNames[0], clusterNames[clusterCount-1]); + +/* Figure out size of each alphabetized cluster in terms of number of samples in cluster + * if we are doing median */ +int clusterSize[clusterCount]; +double *clusterSamples[clusterCount]; +if (doMedian) + { + for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx) + { + clusterSize[clusterIx] = hashIntVal(clusterHash, clusterNames[clusterIx]); + verbose(2, "clusterSize[%d] = %d\n", clusterIx, clusterSize[clusterIx]); + } + + /* Make up array of doubles for each cluster to hold all samples in that clusters */ + for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx) + { + double *samples; + AllocArray(samples, clusterSize[clusterIx]); + clusterSamples[clusterIx] = samples; + } + } + +/* Make hash that goes from cluster name to cluster index */ +struct hash *clusterToClusterIdHash = hashNew(0); +for (clusterIx = 0; clusterIxval; // Get our bed as string array out of hash + + /* Zero out cluster histogram */ + int i; + for (i=0; i= clusterSize[clusterIx]) + internalErr(); + clusterSamples[clusterIx][valCount] = val; + } + else + clusterTotal[clusterIx] += val; + } + + /* Output info - first six from the bed, then name2, then our barchart */ + for (i=0; i<6; ++i) + fprintf(f, "%s\t", geneBedVal[i]); + + char *name = geneBedVal[3]; // By bed definition it's fourth field + char *name2 = NULL; + if (nameToName2 != NULL) + { + char **namedRow = hashFindVal(nameToName2, name); + if (namedRow != NULL) + name2 = namedRow[1]; // [0] is name + else + warn("Can't find %s in %s", name, clName2); + } + else + name2 = geneBedVal[name2Ix]; + if (name2 == NULL) + name2 = name; + fprintf(f, "%s\t", name2); + + fprintf(f, "%d\t", clusterCount); + for (i=0; i