3cacf62418cf270f09b8b10df1daae429f191522 kent Sun Jan 17 11:39:41 2021 -0800 Catching error condition of meta row labels not matching up to matrix column labels.

diff --git src/utils/matrixClusterColumns/matrixClusterColumns.c src/utils/matrixClusterColumns/matrixClusterColumns.c
index aa8c3ba..0e2d049 100644
--- src/utils/matrixClusterColumns/matrixClusterColumns.c
+++ src/utils/matrixClusterColumns/matrixClusterColumns.c
@@ -184,106 +184,124 @@
     char **clusterNames;        /* Holds name of each cluster */
     int *clusterSizes;          /* An array that holds size of each cluster */

     /* Things needed by median handling */
     boolean doMedian;           /* If true we calculate median */
     double **clusterSamples;    /* An array that holds an array big enough for all vals in cluster. */

     FILE *matrixFile;           /* output */
     };

 struct clustering *clusteringNew(char *clusterField, char *outMatrixFile, char *outStatsFile,
     struct fieldedTable *metaTable, struct vMatrix *v, boolean doMedian)
 /* Make up a new clustering job structure */
 {
+/* Check that all column names in matrix are unique */
+int colCount = v->colCount;
+char **colLabels = v->colLabels;
+struct hash *uniqColHash = hashNew(0);
+int colIx;
+for (colIx=0; colIx < colCount; colIx = colIx+1)
+    {
+    char *label = colLabels[colIx];
+    if (hashLookup(uniqColHash, label) == NULL)
+        hashAdd(uniqColHash, label, NULL);
+    else
+        errAbort("Duplicated column label %s in input matrix", label);
+    }
+
 struct clustering *job;
 AllocVar(job);
 job->clusterField = clusterField;
 job->outMatrixFile = outMatrixFile;
 job->outStatsFile = outStatsFile;
 int clusterFieldIx = job->clusterMetaIx = fieldedTableMustFindFieldIx(metaTable, clusterField);

 /* Make up hash of sample names with cluster name values
  * and also hash of cluster names with size values */
 struct hash *sampleHash = hashNew(0);   /* Keyed by sample value is cluster */
 struct hash *clusterSizeHash = job->clusterSizeHash = hashNew(0);
 struct fieldedRow *fr;
 for (fr = metaTable->rowList; fr != NULL; fr = fr->next)
     {
     char **row = fr->row;
-    hashAdd(sampleHash, row[0], row[clusterFieldIx]);
+    char *sample = row[0];
+    if (!hashLookup(uniqColHash, sample))
+        errAbort("%s is in %s but not input matrix", sample, metaTable->name);
+
+    hashAdd(sampleHash, sample, row[clusterFieldIx]);
     hashIncInt(clusterSizeHash, row[clusterFieldIx]);
     }

 /* Find all uniq cluster names */
 struct slName *nameList = NULL;
 struct hash *uniqHash = hashNew(0);
 for (fr = metaTable->rowList; fr != NULL; fr = fr->next)
     {
     char *cluster = fr->row[clusterFieldIx];
     if (hashLookup(uniqHash, cluster) == NULL)
         {
         slNameAddHead(&nameList, cluster);
         hashAdd(uniqHash, cluster, NULL);
         }
     }
 hashFree(&uniqHash);

 /* Just alphabetize names for now */
 slNameSort(&nameList);

 /* Make up hash that maps cluster names to cluster ids */
 struct hash *clusterIxHash = hashNew(0);    /* Keyed by cluster, no value */
-struct slName *name;
 int i;
+struct slName *name;
 for (name = nameList, i=0; name != NULL; name = name->next, ++i)
     hashAddInt(clusterIxHash, name->name, i);
 int clusterCount = job->clusterCount = clusterIxHash->elCount;

 /* Make up array that holds size of each cluster */
 AllocArray(job->clusterSizes, clusterCount);
 AllocArray(job->clusterNames, clusterCount);
 for (i = 0, name = nameList; i < clusterCount; ++i, name = name->next)
     {
     job->clusterSizes[i] = hashIntVal(job->clusterSizeHash, name->name);
     job->clusterNames[i] = name->name;
     verbose(2, "clusterSizes[%d] = %d\n", i, job->clusterSizes[i]);
     }

 if (doMedian)
     {
     /* Allocate arrays to hold number of samples and all sample vals for each cluster */
     job->doMedian = doMedian;
     AllocArray(job->clusterSamples, clusterCount);
     int clusterIx;
     for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx)
         {
         double *samples;
         AllocArray(samples, job->clusterSizes[clusterIx]);
         job->clusterSamples[clusterIx] = samples;
         }
     }

-/* Make up array that has -1 where no cluster available, otherwise output index */
-int colCount = v->colCount;
+
+/* Make up array that has -1 where no cluster available, otherwise output index, also
+ * hash up all column labels. */
 int *colToCluster = job->colToCluster = needHugeMem(colCount * sizeof(colToCluster[0]));
-int colIx;
 int unclusteredColumns = 0, missCount = 0;
 for (colIx=0; colIx < colCount; colIx = colIx+1)
     {
-    char *colName = v->colLabels[colIx];
+    char *colName = colLabels[colIx];
     char *clusterName = hashFindVal(sampleHash, colName);
     colToCluster[colIx] = -1;
     if (clusterName != NULL)
         {
         int clusterId = hashIntValDefault(clusterIxHash, clusterName, -1);
         colToCluster[colIx] = clusterId;
         if (clusterId == -1)
             {
             verbose(3, "%s is in expression matrix but not in sample cluster file", clusterName);
             ++missCount;
             }
         }
     else
         unclusteredColumns += 1;
     }
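
Note: as a rough standalone sketch of the checks this commit adds (hypothetical code, not part of matrixClusterColumns.c; it uses plain C linear scans in place of the kent hash calls hashNew/hashLookup/hashAdd and a local die() in place of errAbort), the validation amounts to two passes over the labels: reject any duplicated matrix column label, then reject any meta-table sample label that is absent from the matrix columns.

/* checkLabels.c - hypothetical standalone sketch of the label validation above.
 * Not part of the kent tree; linear scans stand in for the kent hash library. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void die(const char *format, const char *a, const char *b)
/* Print an error built from format and up to two strings, then exit - stands in for errAbort(). */
{
fprintf(stderr, format, a, b);
fputc('\n', stderr);
exit(1);
}

static void checkLabels(char **colLabels, int colCount, char **metaSamples, int metaCount,
    const char *metaName)
/* Abort if a matrix column label is duplicated or a meta sample label has no matrix column. */
{
int i, j;
for (i = 0; i < colCount; ++i)
    for (j = i+1; j < colCount; ++j)
        if (strcmp(colLabels[i], colLabels[j]) == 0)
            die("Duplicated column label %s in input matrix", colLabels[i], "");
for (i = 0; i < metaCount; ++i)
    {
    int found = 0;
    for (j = 0; j < colCount; ++j)
        if (strcmp(metaSamples[i], colLabels[j]) == 0)
            {
            found = 1;
            break;
            }
    if (!found)
        die("%s is in %s but not input matrix", metaSamples[i], metaName);
    }
}

int main(void)
/* Tiny usage example: the made-up sample "cellD" is in the meta table but not the matrix,
 * so checkLabels() aborts with an error message. */
{
char *cols[] = {"cellA", "cellB", "cellC"};
char *samples[] = {"cellA", "cellD"};
checkLabels(cols, 3, samples, 2, "meta.tsv");
return 0;
}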