3cacf62418cf270f09b8b10df1daae429f191522
kent
  Sun Jan 17 11:39:41 2021 -0800
Catching error condition where meta row labels don't match up with matrix column labels.
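
The new checks rely on kent library hash routines that matrixClusterColumns.c already uses (hashNew, hashLookup, hashAdd, hashFree, errAbort). The validation pattern added here is: hash the matrix column labels, abort on a duplicate, then require every meta-table sample to appear in that hash. A rough standalone sketch of that pattern follows; the helper name and argument arrays are hypothetical illustrations rather than code from this commit, and it assumes the kent common.h and hash.h headers:

    #include "common.h"
    #include "hash.h"

    static void checkLabelsMatch(char **colLabels, int colCount,
                                 char **sampleNames, int sampleCount)
    /* Abort if colLabels contains a duplicate or if any sampleName is
     * missing from colLabels.  Hypothetical helper, sketched only to
     * mirror the checks added in the diff below. */
    {
    struct hash *uniqColHash = hashNew(0);
    int i;
    for (i = 0; i < colCount; ++i)
        {
        if (hashLookup(uniqColHash, colLabels[i]) != NULL)
            errAbort("Duplicated column label %s in input matrix", colLabels[i]);
        hashAdd(uniqColHash, colLabels[i], NULL);
        }
    for (i = 0; i < sampleCount; ++i)
        {
        if (hashLookup(uniqColHash, sampleNames[i]) == NULL)
            errAbort("%s is in the meta file but not in the input matrix", sampleNames[i]);
        }
    hashFree(&uniqColHash);
    }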

diff --git src/utils/matrixClusterColumns/matrixClusterColumns.c src/utils/matrixClusterColumns/matrixClusterColumns.c
index aa8c3ba..0e2d049 100644
--- src/utils/matrixClusterColumns/matrixClusterColumns.c
+++ src/utils/matrixClusterColumns/matrixClusterColumns.c
@@ -184,106 +184,124 @@
     char **clusterNames;	    /* Holds name of each cluster */
     int *clusterSizes;	    /* An array that holds size of each cluster */
 
     /* Things needed by median handling */
     boolean doMedian;	/* If true we calculate median */
     double **clusterSamples; /* An array that holds an array big enough for all vals in cluster. */
 
     FILE *matrixFile;		    /* output */
     };
 
 
 struct clustering *clusteringNew(char *clusterField, char *outMatrixFile, char *outStatsFile,
     struct fieldedTable *metaTable, struct vMatrix *v, boolean doMedian)
 /* Make up a new clustering job structure */
 {
+/* Check that all column names in matrix are unique */
+int colCount = v->colCount;
+char **colLabels = v->colLabels;
+struct hash *uniqColHash = hashNew(0);
+int colIx;
+for (colIx=0; colIx < colCount; colIx = colIx+1)
+    {
+    char *label = colLabels[colIx];
+    if (hashLookup(uniqColHash, label) == NULL)
+	hashAdd(uniqColHash, label, NULL);
+    else
+        errAbort("Duplicated column label %s in input matrix", label);
+    }
+
 struct clustering *job;
 AllocVar(job);
 job->clusterField = clusterField;
 job->outMatrixFile = outMatrixFile;
 job->outStatsFile = outStatsFile;
 int clusterFieldIx = job->clusterMetaIx = fieldedTableMustFindFieldIx(metaTable, clusterField);
 
 /* Make up hash of sample names with cluster name values 
  * and also hash of cluster names with size values */
 struct hash *sampleHash = hashNew(0);	/* Keyed by sample value is cluster */
 struct hash *clusterSizeHash = job->clusterSizeHash = hashNew(0);
 struct fieldedRow *fr;
 for (fr = metaTable->rowList; fr != NULL; fr = fr->next)
     {
     char **row = fr->row;
-    hashAdd(sampleHash, row[0], row[clusterFieldIx]);
+    char *sample = row[0];
+    if (!hashLookup(uniqColHash, sample))
+        errAbort("%s is in %s but not in input matrix", sample, metaTable->name);
+
+    hashAdd(sampleHash, sample, row[clusterFieldIx]);
     hashIncInt(clusterSizeHash, row[clusterFieldIx]);
     }
 
 /* Find all uniq cluster names */
 struct slName *nameList = NULL;
 struct hash *uniqHash = hashNew(0);
 for (fr = metaTable->rowList; fr != NULL; fr = fr->next)
     {
     char *cluster = fr->row[clusterFieldIx];
     if (hashLookup(uniqHash, cluster) == NULL)
         {
 	slNameAddHead(&nameList, cluster);
 	hashAdd(uniqHash, cluster, NULL);
 	}
     }
 hashFree(&uniqHash);
 
 /* Just alphabetize names for now */
 slNameSort(&nameList);
 
 /* Make up hash that maps cluster names to cluster ids */
 struct hash *clusterIxHash = hashNew(0);	/* Keyed by cluster, no value */
-struct slName *name;
 int i;
+struct slName *name;
 for (name = nameList, i=0; name != NULL; name = name->next, ++i)
     hashAddInt(clusterIxHash, name->name, i);
 int clusterCount = job->clusterCount = clusterIxHash->elCount;
 
 /* Make up array that holds size of each cluster */
 AllocArray(job->clusterSizes, clusterCount);
 AllocArray(job->clusterNames, clusterCount);
 for (i = 0, name = nameList; i < clusterCount; ++i, name = name->next)
     {
     job->clusterSizes[i] = hashIntVal(job->clusterSizeHash, name->name);
     job->clusterNames[i] = name->name;
     verbose(2, "clusterSizes[%d] = %d\n", i, job->clusterSizes[i]);
     }
 
 if (doMedian)
     {	
     /* Allocate arrays to hold number of samples and all sample vals for each cluster */
     job->doMedian = doMedian;
     AllocArray(job->clusterSamples, clusterCount);
     int clusterIx;
     for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx)
 	{
 	double *samples;
 	AllocArray(samples, job->clusterSizes[clusterIx]);
 	job->clusterSamples[clusterIx] = samples;
 	}
     }
 
-/* Make up array that has -1 where no cluster available, otherwise output index */
-int colCount = v->colCount;
+
+/* Make up array that has -1 where no cluster available, otherwise output index.
+ * (Column labels were already hashed into uniqColHash at the top of this function.) */
 int *colToCluster = job->colToCluster = needHugeMem(colCount * sizeof(colToCluster[0]));
-int colIx;
 int unclusteredColumns = 0, missCount = 0;
 for (colIx=0; colIx < colCount; colIx = colIx+1)
     {
-    char *colName = v->colLabels[colIx];
+    char *colName = colLabels[colIx];
     char *clusterName = hashFindVal(sampleHash, colName);
     colToCluster[colIx] = -1;
     if (clusterName != NULL)
         {
 	int clusterId = hashIntValDefault(clusterIxHash, clusterName, -1);
 	colToCluster[colIx] = clusterId;
 	if (clusterId == -1)
 	    {
 	    verbose(3, "%s is in expression matrix but not in sample cluster file", clusterName);
 	    ++missCount;
 	    }
 	}
     else
 	unclusteredColumns += 1;
     }
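
The colToCluster array built at the end of this hunk is what later per-row processing consumes: each matrix value is folded into the slot for its column's cluster, and columns mapped to -1 are skipped. A minimal hypothetical sketch of that use, for orientation only; the real per-row code in matrixClusterColumns.c differs in detail and also handles the optional median buffers:

    static void addRowToClusterTotals(double *rowVals, int colCount,
                                      int *colToCluster, double *clusterTotals)
    /* Fold one matrix row into per-cluster totals, skipping columns with
     * no cluster assignment (colToCluster[colIx] == -1).
     * Hypothetical helper, not part of this commit. */
    {
    int colIx;
    for (colIx = 0; colIx < colCount; ++colIx)
        {
        int clusterIx = colToCluster[colIx];
        if (clusterIx >= 0)
            clusterTotals[clusterIx] += rowVals[colIx];
        }
    }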