src/hg/instinct/extractData/extractData.c 1.5

1.5 2010/01/14 23:15:04 jsanborn
updated
Index: src/hg/instinct/extractData/extractData.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/extractData/extractData.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 4 -r1.4 -r1.5
--- src/hg/instinct/extractData/extractData.c	14 Jan 2010 07:12:28 -0000	1.4
+++ src/hg/instinct/extractData/extractData.c	14 Jan 2010 23:15:04 -0000	1.5
@@ -26,17 +26,22 @@
   "   extractData [OPTIONS] db table sample_id genes(s)\n"
   "options:\n"
   "   -median   Output median value if multiple probes\n"
   "   -tcga     Handle TCGA ids, keeps only first 16 chars in sample_id\n"
+  "   -samples=STR  Sample comma-separated list, otherwise all sample data are returned.\n"
   );
 }
 
+#define TCGA_PATIENT_PREFIX 12   /* NOTE(review): not referenced in the visible hunks; presumably the patient-level id length -- confirm */
+#define TCGA_SAMPLE_PREFIX 16    /* Number of leading sample-name characters kept for comparison when -tcga is set. */
+
 boolean median = FALSE;
 boolean isTCGA = FALSE;
 
 static struct optionSpec options[] = {
     {"median", OPTION_BOOLEAN},
     {"tcga", OPTION_BOOLEAN},
+    {"samples", OPTION_STRING},   /* comma-separated sample names; fetched in main with optionVal("samples", NULL) */
     {NULL, 0},
 };
 
 struct maGrouping *getMaGrouping(struct sqlConnection *hgConn, char *tableName)
@@ -130,9 +135,18 @@
 
 return gaHash;                   
 }
 
-struct slDouble *getProbeData(struct sqlConnection *hgConn, int expId, 
+struct sampleVals {   /* One selected sample plus the values gathered for it for the gene currently being processed. */
+    struct sampleVals *next;   /* Next sample in the singly-linked list. */
+
+    int expId;                 /* Index used to read this sample's score from a bed row's expScores[]. */
+    char *name;                /* Sample name, cloned from the maGrouping names array. */
+    struct slDouble *vals;     /* Scores collected for this sample, one per bed row that has this expId. */
+    struct hash *valHash;      /* Bed row name -> the slDouble stored in vals (same pointer, not a copy). */
+}; 
+
+void setProbeData(struct sqlConnection *hgConn, struct sampleVals *svList, 
 			      char *dataTable, struct slName *probes)
 {
 struct slName *sl;
 struct dyString *dy = newDyString(100);
@@ -151,34 +165,139 @@
 /* Get bed15 data from hg18 database */
 struct sqlResult *sr = sqlGetResult(hgConn, query);
 
 char **row = NULL;
-struct slDouble *sdList = NULL;
+struct sampleVals *sv;
 
 while ((row = sqlNextRow(sr)) != NULL)
     {
     struct bed *nb = bedLoadN(row+1, 15);
   
-    if (expId >= nb->expCount)
+    for (sv = svList; sv; sv = sv->next)
+	{
+	if (sv->expId >= nb->expCount)
 	continue;
-
-    struct slDouble *sd = slDoubleNew(nb->expScores[expId]);
-    slAddHead(&sdList, sd);
+	struct slDouble *sd = slDoubleNew(nb->expScores[sv->expId]);
+	slAddHead(&sv->vals, sd);
+	hashAdd(sv->valHash, nb->name, sd);
+	}
 
     bedFree(&nb);
     }
 sqlFreeResult(&sr);
+}
+
+
+struct sampleVals *prepareSampleVals(char *samples, struct maGrouping *allA)
+/* Build the list of samples to report, in maGrouping order.
+ *   samples - comma-separated sample names, or NULL to select every sample.
+ *   allA    - maGrouping supplying the sample names and their expIds.
+ * With the global isTCGA set, requested and maGrouping names are compared on
+ * their first TCGA_SAMPLE_PREFIX characters only; the stored name stays full.
+ * Returns NULL when no sample is selected.  Every temporary made here (name
+ * list, lookup hash, truncated copies) is released before returning. */
+{
+boolean allSamples = (samples == NULL);
+struct hash *sampleHash = hashNew(0);
+struct sampleVals *sv, *svList = NULL;
+int i;
+if (!allSamples)
+    {
+    struct slName *sl, *slList = slNameListFromComma(samples);
+    for (sl = slList; sl; sl = sl->next)
+        {
+        char *key = isTCGA ? cloneStringZ(sl->name, TCGA_SAMPLE_PREFIX) : sl->name;
+        hashAddInt(sampleHash, key, 1);    /* the hash keeps its own copy of key */
+        if (key != sl->name)
+            freeMem(key);
+        }
+    slFreeList(&slList);
+    }
+for (i = 0; i < allA->size; i++)
+    {
+    char *fullName = allA->names[i];
+    char *key = isTCGA ? cloneStringZ(fullName, TCGA_SAMPLE_PREFIX) : fullName;
+    if (allSamples || hashLookup(sampleHash, key) != NULL)
+        {
+        sv = AllocA(struct sampleVals);
+        sv->name = cloneString(fullName);
+        sv->expId = allA->expIds[i];
+        sv->vals = NULL;
+        sv->valHash = hashNew(0);
+        slAddHead(&svList, sv);
+        }
+    if (key != fullName)
+        freeMem(key);    /* only the truncated TCGA copy is ours to free */
+    }
+hashFree(&sampleHash);
+slReverse(&svList);
+return svList;
+}
+
+void printProbeData(struct geneAlias *ga, struct sampleVals *svList)
+/* Print the values held in svList for gene ga to stdout, tab-separated.
+ * With the global median set: one "<gene> median ..." row holding each
+ * sample's median.  Otherwise: one "<gene> <probe> ..." row per probe in
+ * ga->probes.  A sample with no value for a row prints "NA", so the median
+ * is never requested for an empty list. */
+{
+struct sampleVals *sv;
+struct dyString *dy = newDyString(100);
+if (median)
+    {
+    dyStringPrintf(dy, "%s\tmedian\t", ga->gene);
+    for (sv = svList; sv; sv = sv->next)
+        {
+        if (sv->vals != NULL)
+            dyStringPrintf(dy, "%f", slDoubleMedian(sv->vals));
+        else
+            dyStringPrintf(dy, "NA");
+        if (sv->next)
+            dyStringPrintf(dy, "\t");
+        }
+    dyStringPrintf(dy, "\n");
+    }
+else
+    {
+    struct hashEl *el;
+    struct slName *sl;
+    for (sl = ga->probes; sl; sl = sl->next)
+        {
+        dyStringPrintf(dy, "%s\t%s\t", ga->gene, sl->name);
+        for (sv = svList; sv; sv = sv->next)
+            {
+            if ((el = hashLookup(sv->valHash, sl->name)) != NULL)
+                dyStringPrintf(dy, "%f", ((struct slDouble *)el->val)->val);
+            else
+                dyStringPrintf(dy, "NA");
+            if (sv->next)
+                dyStringPrintf(dy, "\t");
+            }
+        dyStringPrintf(dy, "\n");
+        }
+    }
 
-return sdList;
+printf("%s", dy->string);
+freeDyString(&dy);    /* release the buffer instead of cannibalizing and leaking it */
 }
 
-void extractData(char *tableName, char *sampleName, char *geneList)
+void clearProbeData(struct sampleVals *svList)
+/* Empty each sample's value list and probe hash so svList can be reused. */
+{
+struct sampleVals *cur;
+for (cur = svList; cur != NULL; cur = cur->next)
+    {
+    slFreeList(&cur->vals);
+    cur->vals = NULL;
+    hashFree(&cur->valHash);
+    cur->valHash = hashNew(0);
+    }
+}
+
+
+void extractData(char *tableName, char *geneList, char *samples)
 {
 if (!geneList)
     return;
-char *sample = sampleName;
-if (isTCGA)
-    sample = cloneStringZ(sampleName, 16);
 
 struct slName *genes = slNameListFromComma(geneList);
 
 struct hashEl *el;
@@ -188,25 +307,9 @@
 struct maGrouping *allA = getMaGrouping(hgConn, tableName);
 if (!allA)
     errAbort("Could not find maGrouping for %s!", tableName);
 
-int i = 0, expId = -1;
-for (; i < allA->size; i++)
-    {
-    char *maName = allA->names[i];
-    if (isTCGA)
-	maName = cloneStringZ(allA->names[i], 16);
-
-    if (!sameString(maName, sample))
-	continue;
-    expId = allA->expIds[i];
-    break;
-    }
-if (expId < 0)
-    errAbort("Couldn't find sample %s in %s.\n", sample, tableName);
-
 struct hash *settings = getSettings(tableName);
-
 el = hashLookup(settings, "aliasTable");
 if (!el)
     errAbort("No aliasTable.\n");
 char *aliasTable = cloneString(el->val);
@@ -212,37 +315,35 @@
 char *aliasTable = cloneString(el->val);
 
 struct hash *gaHash = getAliases(hgConn, aliasTable, genes);
 
+struct sampleVals *sv, *svList = prepareSampleVals(samples, allA);
+if (!svList)
+    errAbort("No samples allocated.\n");
+
+printf("gene\tprobe");
+for (sv = svList; sv; sv = sv->next)
+    printf("\t%s", sv->name);
+printf("\n");
+
 struct hashCookie cookie = hashFirst(gaHash);
 while ((el = hashNext(&cookie)) != NULL)
     {
-    char *name = el->name;
     struct geneAlias *ga = el->val;    
-    struct slDouble *sd, *sdList = getProbeData(hgConn, expId, tableName, ga->probes);
+    setProbeData(hgConn, svList, tableName, ga->probes);
+
+    printProbeData(ga, svList);
+
+    clearProbeData(svList);
+    }
 
+#if 0
     if (!sdList)
 	continue;
 
-    printf("%s\t", name);
-    if (median)
-	{
-	double val = slDoubleMedian(sdList);
-	printf("%f\n", val);
-	}
-    else
-	{
-	struct dyString *dy = newDyString(100);
-	for (sd = sdList; sd; sd = sd->next)
-	    {
-	    dyStringPrintf(dy, "%f", sd->val);
-	    if (sd->next)
-		dyStringPrintf(dy, ",");
-	    }
-	char *val = dyStringCannibalize(&dy);
-	printf("%s\n", val);
-	}
+
     }
+#endif
 
 hFreeConn(&hgConn);
 }
 
@@ -250,15 +351,16 @@
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
-if (argc != 4)
+if (argc != 3)   /* exactly two positional arguments are accepted: argv[1] and argv[2], used below */
     usage();
 
 if (optionExists("median"))
     median = TRUE;
 if (optionExists("tcga"))
     isTCGA = TRUE;
 
-extractData(argv[1], argv[2], argv[3]);
+char *samples = optionVal("samples", NULL);   /* NULL fallback -- presumably what is returned when -samples is absent; a NULL list selects all samples */
+extractData(argv[1], argv[2], samples);   /* argv[1] is the table name, argv[2] the comma-separated gene list */
 return 0;
 }