ab7847c47ac5507a7e126691bb78826ed4ba345d chmalee Mon Sep 13 16:34:47 2021 -0700 Staging GTEx single nuclei gene expression track, refs #29954 diff --git src/utils/matrixClusterColumns/matrixClusterColumns.c src/utils/matrixClusterColumns/matrixClusterColumns.c index d0f47bd..ff70afa 100644 --- src/utils/matrixClusterColumns/matrixClusterColumns.c +++ src/utils/matrixClusterColumns/matrixClusterColumns.c @@ -18,37 +18,39 @@ { errAbort( "matrixClusterColumns - Group the columns of a matrix into clusters, and output a matrix with\n" "the same number of rows and generally much fewer columns. Combines columns by taking mean.\n" "usage:\n" " matrixClusterColumns inMatrix.tsv meta.tsv cluster outMatrix.tsv outStats.tsv [cluster2 outMatrix2.tsv outStats2.tsv ... ]\n" "where:\n" " inMatrix.tsv is a file in tsv format with cell labels in first row and gene labels in first column\n" " meta.tsv is a table where the first row is field labels and the first column is sample ids\n" " cluster is the name of the field with the cluster names\n" "You can produce multiple clusterings in the same pass through the input matrix by specifying\n" "additional cluster/outMatrix/outStats triples in the command line.\n" "options:\n" " -makeIndex=index.tsv - output index tsv file with <matrix-col1><input-file-pos><line-len>\n" " -median if set ouput median rather than mean cluster value\n" + " -excludeZeros if set exclude zeros when calculating mean/median\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"makeIndex", OPTION_STRING}, {"median", OPTION_BOOLEAN}, + {"excludeZeros", OPTION_BOOLEAN}, {NULL, 0}, }; void readLineArray(char *fileName, int *retCount, char ***retLines) /* Return an array of strings, one for each line of file. Return # of lines in file too */ { /* This is sloppy with memory but it doesn't matter since we won't free it. */ struct slName *el, *list = readAllLines(fileName); if (list == NULL) errAbort("%s is empty", fileName); int count = slCount(list); char **lines; AllocArray(lines, count); int i; @@ -124,43 +126,44 @@ { struct clustering *next; char *clusterField; /* Field to cluster on */ char *outMatrixFile; /* Where to put matrix result */ char *outStatsFile; /* Where to put stats result */ int clusterMetaIx; /* Index of cluster field in meta table */ int *colToCluster; /* Maps input matrix column to clustered column */ int clusterCount; /* Number of different values in column we are clustering */ double *clusterTotal; /* A place to hold totals for each cluster */ double *clusterGrandTotal; /* Total over all rows */ int *clusterElements; /* A place to hold counts for each cluster */ double *clusterResults; /* Results after clustering */ struct hash *clusterSizeHash; /* Keyed by cluster, int value is elements in cluster */ char **clusterNames; /* Holds name of each cluster */ int *clusterSizes; /* An array that holds size of each cluster */ + boolean excludeZeros; /* If true exclude zero vals from calc */ /* Things needed by median handling */ boolean doMedian; /* If true we calculate median */ double **clusterSamples; /* An array that holds an array big enough for all vals in cluster. */ struct dyString **chunkLinesOut; /* parallel output */ double **chunkSubtotals; /* Totals */ FILE *matrixFile; /* serial output */ }; struct clustering *clusteringNew(char *clusterField, char *outMatrixFile, char *outStatsFile, - struct fieldedTable *metaTable, struct ccMatrix *v, boolean doMedian) + struct fieldedTable *metaTable, struct ccMatrix *v, boolean doMedian, boolean excludeZeros) /* Make up a new clustering structure */ { /* Check that all column names in matrix are unique */ int colCount = v->colCount; char **colLabels = v->colLabels; struct hash *uniqColHash = hashNew(0); int colIx; for (colIx=0; colIx < colCount; colIx = colIx+1) { char *label = colLabels[colIx]; if (hashLookup(uniqColHash, label) == NULL) hashAdd(uniqColHash, label, NULL); else errAbort("Duplicated column label %s in input matrix", label); } @@ -226,30 +229,32 @@ if (doMedian) { /* Allocate arrays to hold number of samples and all sample vals for each cluster */ clustering->doMedian = doMedian; AllocArray(clustering->clusterSamples, clusterCount); int clusterIx; for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx) { double *samples; AllocArray(samples, clustering->clusterSizes[clusterIx]); clustering->clusterSamples[clusterIx] = samples; } } +clustering->excludeZeros = excludeZeros; + /* Make up array that has -1 where no cluster available, otherwise output index, also * hash up all column labels. */ int *colToCluster = clustering->colToCluster = needHugeMem(colCount * sizeof(colToCluster[0])); int unclusteredColumns = 0, missCount = 0; for (colIx=0; colIx < colCount; colIx = colIx+1) { char *colName = colLabels[colIx]; char *clusterName = hashFindVal(sampleHash, colName); colToCluster[colIx] = -1; if (clusterName != NULL) { int clusterId = hashIntValDefault(clusterIxHash, clusterName, -1); colToCluster[colIx] = clusterId; if (clusterId == -1) @@ -332,30 +337,32 @@ { clusterTotal[i] = 0.0; clusterElements[i] = 0; } /* Loop through rest of row filling in histogram */ int *colToCluster = clustering->colToCluster; boolean doMedian = clustering->doMedian; for (i=0; i<colCount; ++i) { int clusterIx = colToCluster[i]; if (clusterIx >= 0) { double val = a[i]; int valCount = clusterElements[clusterIx]; + if (clustering->excludeZeros && val == 0.0) + continue; clusterElements[clusterIx] = valCount+1; clusterTotal[clusterIx] += val; if (doMedian) { if (valCount >= clustering->clusterSizes[clusterIx]) internalErr(); clustering->clusterSamples[clusterIx][valCount] = val; } } } /* Do output to outstrng and do grand totalling */ dyStringClear(out); dyStringPrintf(out, "%s", rowLabel); for (i=0; i<clusterCount; ++i) @@ -443,55 +450,55 @@ } /* Then do the clustering */ struct clustering *clustering; for (clustering = lii->clusteringList; clustering != NULL; clustering = clustering->next) { int chunkIx = lii->chunkIx; clusterRow(clustering, xSize, rowLabel, lii->vals, lii->totalingTemp, lii->elementsTemp, clustering->chunkLinesOut[chunkIx], clustering->chunkSubtotals[chunkIx]); } } void matrixClusterColumns(char *matrixFile, char *metaFile, char *sampleField, int outputCount, char **clusterFields, char **outMatrixFiles, char **outStatsFiles, - char *outputIndex, boolean doMedian) + char *outputIndex, boolean doMedian, boolean excludeZeros) /* matrixClusterColumns - Group the columns of a matrix into clusters, and output a matrix * the with same number of rows and generally much fewer columns.. */ { FILE *fIndex = NULL; if (outputIndex) fIndex = mustOpen(outputIndex, "w"); /* Load up metadata and make sure we have all of the cluster fields we need * and fill out array of clusterIx corresponding to clusterFields in metaFile. */ struct fieldedTable *metaTable = fieldedTableFromTabFile(metaFile, metaFile, NULL, 0); struct hash *metaHash = fieldedTableIndex(metaTable, sampleField); verbose(1, "Read %d rows from %s\n", metaHash->elCount, metaFile); /* Load up input matrix first line at least */ struct ccMatrix *v = ccMatrixOpen(matrixFile); verbose(1, "matrix %s has %d fields\n", matrixFile, v->colCount); /* Create a clustering for each output and find index in metaTable for each. */ struct clustering *clusteringList = NULL, *clustering; int i; for (i=0; i<outputCount; ++i) { clustering = clusteringNew(clusterFields[i], outMatrixFiles[i], outStatsFiles[i], - metaTable, v, doMedian); + metaTable, v, doMedian, excludeZeros); slAddTail(&clusteringList, clustering); } /* Set up buffers for pthread workers */ struct lineIoItem chunks[chunkMaxSize]; for (i=0; i<chunkMaxSize; ++i) { struct lineIoItem *chunk = &chunks[i]; chunk->fileName = matrixFile; chunk->clusteringList = clusteringList; chunk->chunkIx = i; chunk->lineIn = dyStringNew(0); chunk->rowLabel = dyStringNew(0); AllocArray(chunk->vals, v->colCount); AllocArray(chunk->totalingTemp, v->colCount); @@ -575,18 +582,18 @@ int minArgc = 6; if (argc < minArgc || ((argc-minArgc)%3)!=0) // Force minimum, even number usage(); int outputCount = 1 + (argc-minArgc)/3; // Add one since at minimum have 1 char *clusterFields[outputCount], *outMatrixFiles[outputCount], *outStatsFiles[outputCount]; int i; char **triples = argv + minArgc - 3; for (i=0; i<outputCount; ++i) { clusterFields[i] = triples[0]; outMatrixFiles[i] = triples[1]; outStatsFiles[i] = triples[2]; triples += 3; } matrixClusterColumns(argv[1], argv[2], argv[3], - outputCount, clusterFields, outMatrixFiles, outStatsFiles, makeIndex, optionExists("median")); + outputCount, clusterFields, outMatrixFiles, outStatsFiles, makeIndex, optionExists("median"), optionExists("excludeZeros")); return 0; }