f6a7885f1bd5a869ced47dc1dfed6d03daec7c3a kent Thu Dec 31 16:15:46 2020 -0800 Adding median flag and the start of a test set. diff --git src/utils/matrixClusterColumns/matrixClusterColumns.c src/utils/matrixClusterColumns/matrixClusterColumns.c index ed29a17..6bc3dba 100644 --- src/utils/matrixClusterColumns/matrixClusterColumns.c +++ src/utils/matrixClusterColumns/matrixClusterColumns.c @@ -1,53 +1,54 @@ /* matrixClusterColumns - Group the columns of a matrix into clusters, and output a matrix * the with same number of rows and generally much fewer columns.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "obscure.h" #include "fieldedTable.h" #include "sqlNum.h" - void usage() /* Explain usage and exit. */ { errAbort( "matrixClusterColumns - Group the columns of a matrix into clusters, and output a matrix with\n" "the same number of rows and generally much fewer columns. Combines columns by taking mean.\n" "usage:\n" " matrixClusterColumns input meta.tsv sample cluster output.tsv [clusterCol output2 ... ]\n" "where:\n" " input is a file either in tsv format or in mtx (matrix mart sorted) format\n" " meta.tsv is a table where the first row is field labels and the first column is sample ids\n" " sample is the name of the field with the sample (often single cell) id's\n" " cluster is the name of the field with the cluster names\n" "You can produce multiple clusterings in the same pass through the input matrix by specifying\n" "additional cluster/output pairs in the command line.\n" "options:\n" " -columnLabels=file.txt - a file with a line for each column, required for mtx inputs\n" " -rowLabels=file.txt - a file with a label for each row, also required for mtx inputs\n" " -makeIndex=index.tsv - output index tsv file with <matrix-col1><input-file-pos><line-len>\n" + " -median if set output median rather than mean cluster value\n" ); } /* Command line validation table. 
*/ static struct optionSpec options[] = { {"columnLabels", OPTION_STRING}, {"rowLabels", OPTION_STRING}, {"makeIndex", OPTION_STRING}, + {"median", OPTION_BOOLEAN}, {NULL, 0}, }; void readLineArray(char *fileName, int *retCount, char ***retLines) /* Return an array of strings, one for each line of file. Return # of lines in file too */ { /* This is sloppy with memory but it doesn't matter since we won't free it. */ struct slName *el, *list = readAllLines(fileName); if (list == NULL) errAbort("%s is empty", fileName); int count = slCount(list); char **lines; AllocArray(lines, count); int i; @@ -166,91 +167,123 @@ } struct clustering /* Stuff we need to cluster something. This is something we might do * repeatedly to same input matrix */ { struct clustering *next; char *clusterField; /* Field to cluster on */ char *outputFile; /* Where to put result */ int clusterMetaIx; /* Index of cluster field in meta table */ int *colToCluster; /* Maps input matrix column to clustered column */ int clusterCount; /* Number of different values in column we are clustering */ double *clusterTotal; /* A place to hold totals for each cluster */ int *clusterElements; /* A place to hold counts for each cluster */ double *clusterResults; /* Results after clustering */ + struct hash *clusterSizeHash; /* Keyed by cluster, int value is elements in cluster */ + int *clusterSizes; /* An array that holds size of each cluster */ + + /* Things needed by median handling */ + boolean doMedian; /* If true we calculate median */ + double **clusterSamples; /* An array that holds an array big enough for all vals in cluster. 
*/ + FILE *file; /* output */ }; struct clustering *clusteringNew(char *clusterField, char *outputFile, - struct fieldedTable *metaTable, int sampleFieldIx, struct vMatrix *v) + struct fieldedTable *metaTable, int sampleFieldIx, struct vMatrix *v, boolean doMedian) /* Make up a new clustering job structure */ { struct clustering *job; AllocVar(job); job->clusterField = clusterField; job->outputFile = outputFile; int clusterFieldIx = job->clusterMetaIx = fieldedTableMustFindFieldIx(metaTable, clusterField); -/* Make up hash of sample names with cluster values */ +/* Make up hash of sample names with cluster name values + * and also hash of cluster names with size values */ struct hash *sampleHash = hashNew(0); /* Keyed by sample value is cluster */ +struct hash *clusterSizeHash = job->clusterSizeHash = hashNew(0); struct fieldedRow *fr; for (fr = metaTable->rowList; fr != NULL; fr = fr->next) { char **row = fr->row; hashAdd(sampleHash, row[sampleFieldIx], row[clusterFieldIx]); + hashIncInt(clusterSizeHash, row[clusterFieldIx]); } /* Find all uniq cluster names */ struct slName *nameList = NULL; struct hash *uniqHash = hashNew(0); for (fr = metaTable->rowList; fr != NULL; fr = fr->next) { char *cluster = fr->row[clusterFieldIx]; if (hashLookup(uniqHash, cluster) == NULL) { slNameAddHead(&nameList, cluster); hashAdd(uniqHash, cluster, NULL); } } hashFree(&uniqHash); /* Just alphabetize names for now */ slNameSort(&nameList); /* Make up hash that maps cluster names to cluster ids */ -struct hash *clusterHash = hashNew(0); /* Keyed by cluster, no value */ +struct hash *clusterIxHash = hashNew(0); /* Keyed by cluster, no value */ struct slName *name; int i; for (name = nameList, i=0; name != NULL; name = name->next, ++i) - hashAddInt(clusterHash, name->name, i); -int clusterCount = job->clusterCount = clusterHash->elCount; + hashAddInt(clusterIxHash, name->name, i); +int clusterCount = job->clusterCount = clusterIxHash->elCount; + +/* Make up array that holds size of 
each cluster */ +AllocArray(job->clusterSizes, clusterCount); +for (i = 0, name = nameList; i < clusterCount; ++i, name = name->next) + { + job->clusterSizes[i] = hashIntVal(job->clusterSizeHash, name->name); + verbose(2, "clusterSizes[%d] = %d\n", i, job->clusterSizes[i]); + } + +if (doMedian) + { + /* Allocate arrays to hold number of samples and all sample vals for each cluster */ + job->doMedian = doMedian; + AllocArray(job->clusterSamples, clusterCount); + int clusterIx; + for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx) + { + double *samples; + AllocArray(samples, job->clusterSizes[clusterIx]); + job->clusterSamples[clusterIx] = samples; + } + } /* Make up array that has -1 where no cluster available, otherwise output index */ int colCount = v->colCount; int *colToCluster = job->colToCluster = needHugeMem(colCount * sizeof(colToCluster[0])); int colIx; int unclusteredColumns = 0, missCount = 0; for (colIx=0; colIx < colCount; colIx = colIx+1) { char *colName = v->colLabels[colIx]; char *clusterName = hashFindVal(sampleHash, colName); colToCluster[colIx] = -1; if (clusterName != NULL) { - int clusterId = hashIntValDefault(clusterHash, clusterName, -1); + int clusterId = hashIntValDefault(clusterIxHash, clusterName, -1); colToCluster[colIx] = clusterId; if (clusterId == -1) { verbose(3, "%s is in expression matrix but not in sample cluster file", clusterName); ++missCount; } } else unclusteredColumns += 1; } verbose(1, "%d total columns, %d unclustered, %d misses\n", colCount, unclusteredColumns, missCount); /* Allocate space for results for clustering one line */ job->clusterResults = needHugeMem(clusterCount * sizeof(job->clusterResults[0])); @@ -265,124 +298,121 @@ fputc('#', f); /* First field name agrees with first column of matrix */ fputs( v->ft->fields[0],f); /* Use our clusters for the rest of the names */ for (name = nameList; name != NULL; name = name->next) { fputc('\t', f); fputs(name->name, f); } fputc('\n', f); /* Clean up and return 
result */ hashFree(&sampleHash); -hashFree(&clusterHash); +hashFree(&clusterIxHash); return job; } void clusterRow(struct clustering *job, struct vMatrix *v, double *a) /* Process a row in a, outputting in job->file */ { /* Zero out cluster histogram */ double *clusterTotal = job->clusterTotal; int *clusterElements = job->clusterElements; int clusterCount = job->clusterCount; int i; for (i=0; i<clusterCount; ++i) { clusterTotal[i] = 0.0; clusterElements[i] = 0; } /* Loop through rest of row filling in histogram */ int colCount = v->colCount; int *colToCluster = job->colToCluster; +boolean doMedian = job->doMedian; for (i=0; i<colCount; ++i) { int clusterIx = colToCluster[i]; if (clusterIx >= 0) { double val = a[i]; int valCount = clusterElements[clusterIx]; clusterElements[clusterIx] = valCount+1; -#ifdef SOON if (doMedian) { - if (valCount >= clusterSize[clusterIx]) + if (valCount >= job->clusterSizes[clusterIx]) internalErr(); - clusterSamples[clusterIx][valCount] = val; + job->clusterSamples[clusterIx][valCount] = val; } else -#endif /* SOON */ clusterTotal[clusterIx] += val; } } /* Do output to file */ FILE *f = job->file; fprintf(f, "%s", v->rowLabel->string); for (i=0; i<clusterCount; ++i) { fprintf(f, "\t"); double val; -#ifdef SOON if (doMedian) - val = doubleMedian(clusterElements[i], clusterSamples[i]); + val = doubleMedian(clusterElements[i], job->clusterSamples[i]); else -#endif /* SOON */ val = clusterTotal[i]/clusterElements[i]; fprintf(f, "%g", val); } /* Data file offset info */ fprintf(f, "\n"); } void matrixClusterColumns(char *matrixFile, char *metaFile, char *sampleField, - int outputCount, char **clusterFields, char **outputFiles, char *outputIndex) + int outputCount, char **clusterFields, char **outputFiles, char *outputIndex, boolean doMedian) /* matrixClusterColumns - Group the columns of a matrix into clusters, and output a matrix * the with same number of rows and generally much fewer columns.. 
*/ { FILE *fIndex = NULL; if (outputIndex) fIndex = mustOpen(outputIndex, "w"); /* Load up metadata and make sure we have all of the cluster fields we need * and fill out array of clusterIx corresponding to clusterFields in metaFile. */ struct fieldedTable *metaTable = fieldedTableFromTabFile(metaFile, metaFile, NULL, 0); int sampleFieldIx = fieldedTableMustFindFieldIx(metaTable, sampleField); struct hash *metaHash = fieldedTableIndex(metaTable, sampleField); verbose(1, "Read %d rows from %s\n", metaHash->elCount, metaFile); /* Load up input matrix and labels */ char *columnFile = optionVal("columnLabels", NULL); char *rowFile = optionVal("rowLabels", NULL); struct vMatrix *v = vMatrixOpen(matrixFile, columnFile, rowFile); verbose(1, "matrix %s has %d fields\n", matrixFile, v->colCount); /* Create a clustering for each output and find index in metaTable for each. */ struct clustering *jobList = NULL, *job; int i; for (i=0; i<outputCount; ++i) { - job = clusteringNew(clusterFields[i], outputFiles[i], metaTable, sampleFieldIx, v); + job = clusteringNew(clusterFields[i], outputFiles[i], metaTable, sampleFieldIx, v, doMedian); slAddTail(&jobList, job); } /* Chug through big matrix a row at a time clustering */ dotForUserInit(100); for (;;) { double *a = vMatrixNextRow(v); if (a == NULL) break; if (fIndex) { fprintf(fIndex, "%s", v->rowLabel->string); struct lineFile *lf = v->lf; @@ -408,18 +438,19 @@ optionInit(&argc, argv, options); char *makeIndex = optionVal("makeIndex", NULL); int minArgc = 6; if (argc < minArgc || ((argc-minArgc)%2)!=0) // Force minimum, even number usage(); int outputCount = 1 + (argc-minArgc)/2; // Add one since at minimum have 1 char *clusterFields[outputCount], *outputFiles[outputCount]; int i; char **pair = argv + minArgc - 2; for (i=0; i<outputCount; ++i) { clusterFields[i] = pair[0]; outputFiles[i] = pair[1]; pair += 2; } -matrixClusterColumns(argv[1], argv[2], argv[3], outputCount, clusterFields, outputFiles, makeIndex); 
+matrixClusterColumns(argv[1], argv[2], argv[3], + outputCount, clusterFields, outputFiles, makeIndex, optionExists("median")); return 0; }