src/hg/instinct/extractData/extractData.c 1.1

1.1 2010/01/14 05:18:36 jsanborn
initial commit
Index: src/hg/instinct/extractData/extractData.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/extractData/extractData.c,v
retrieving revision 1.7
retrieving revision 1.1
diff -b -B -U 4 -r1.7 -r1.1
--- src/hg/instinct/extractData/extractData.c	15 Jan 2010 01:45:21 -0000	1.7
+++ src/hg/instinct/extractData/extractData.c	14 Jan 2010 05:18:36 -0000	1.1
@@ -22,38 +22,18 @@
 /* Explain usage and exit. */
 {
 errAbort(
   "extractData \n"
-  "   extractData [OPTIONS] db table sample_id genes(s)\n"
+  "   extractData [OPTIONS] db table samplesgenes(s)\n"
   "options:\n"
   "   -median       Output median value if multiple probes\n"
-  "   -tcga         Handle TCGA ids, keeps only first 16 chars in sample_id\n"
-  "   -samples=STR  Sample comma-separated list, otherwise all sample data are returned.\n"
-  "examples:\n"
-  "* Single sample / Single gene / All probes:\n"
-  "  ./extractData -tcga -samples=TCGA-06-0145-01A harvardCGH egfr\n"
-  "\n"
-  "* Multiple samples / Single gene /Median-value of probes:\n"
-  "  ./extractData -tcga -median -samples=TCGA-06-0145-01A,TCGA-06-0145-10A harvardCGH egfr\n"
-  "\n"
-  "* All samples / multiple genes / All probes of all genes:\n"
-  "  ./extractData harvardCGH egfr,erbb2,esr1\n"
-  "\n"
-  "* All samples / multiple genes / Median-value of probes:\n"
-  "  ./extractData harvardCGH egfr,erbb2,esr1\n"
   );
 }
 
-#define TCGA_PATIENT_PREFIX 12
-#define TCGA_SAMPLE_PREFIX 16
-
 boolean median = FALSE;
-boolean isTCGA = FALSE;
 
 static struct optionSpec options[] = {
     {"median", OPTION_BOOLEAN},
-    {"tcga", OPTION_BOOLEAN},
-    {"samples", OPTION_STRING},
     {NULL, 0},
 };
 
 struct maGrouping *getMaGrouping(struct sqlConnection *hgConn, char *tableName)
@@ -147,18 +127,9 @@
 
 return gaHash;                   
 }
 
-struct sampleVals {
-    struct sampleVals *next;
-
-    int expId;
-    char *name;
-    struct slDouble *vals;
-    struct hash *valHash;
-}; 
-
-void setProbeData(struct sqlConnection *hgConn, struct sampleVals *svList, 
+void getProbeData(struct sqlConnection *hgConn, struct maGrouping *allA, 
 		  char *dataTable, struct slName *probes)
 {
 struct slName *sl;
 struct dyString *dy = newDyString(100);
@@ -173,140 +144,30 @@
     }
 dyStringPrintf(dy, ");");
 char *query = dyStringCannibalize(&dy);
 
+fprintf(stderr, "query: %s\n", query);
 /* Get bed15 data from hg18 database */
 struct sqlResult *sr = sqlGetResult(hgConn, query);
 
+int count = 0;
 char **row = NULL;
-struct sampleVals *sv;
-
 while ((row = sqlNextRow(sr)) != NULL)
     {
     struct bed *nb = bedLoadN(row+1, 15);
   
-    for (sv = svList; sv; sv = sv->next)
-	{
-	if (sv->expId >= nb->expCount)
-	    continue;
-	struct slDouble *sd = slDoubleNew(nb->expScores[sv->expId]);
-	slAddHead(&sv->vals, sd);
-	hashAdd(sv->valHash, nb->name, sd);
-	}
+// nb->expCount
+// nb->expScores
+    count += 1; 
 
     bedFree(&nb);
     }
 sqlFreeResult(&sr);
-}
-
-
-struct sampleVals *prepareSampleVals(char *samples, struct maGrouping *allA)
-{
-boolean allSamples = FALSE;
-struct hash *sampleHash = hashNew(0);
-
-if (!samples)
-    allSamples = TRUE;
-else
-    {
-    struct slName *sl, *slList = slNameListFromComma(samples);
-    for (sl = slList; sl; sl = sl->next)
-	{
-	char *sample = sl->name;
-	if (isTCGA)
-	    sample = cloneStringZ(sl->name, TCGA_SAMPLE_PREFIX);	
-	hashAddInt(sampleHash, sample, 1);
-	}
-    }
-
-struct hashEl *el;
-struct sampleVals *sv, *svList = NULL;
-int i;
-for (i = 0; i < allA->size; i++)
-    {
-    int expId = allA->expIds[i];
-    char *maName = allA->names[i];
-    if (isTCGA)
-	maName = cloneStringZ(allA->names[i], TCGA_SAMPLE_PREFIX);
-
-    if (allSamples || (el = hashLookup(sampleHash, maName)) != NULL)
-	{
-	sv = AllocA(struct sampleVals);
-	sv->name = cloneString(allA->names[i]);
-	sv->expId = expId;
-	sv->vals = NULL;
-	sv->valHash = hashNew(0);
-
-	slAddHead(&svList, sv);
-	}
-    
-    }
-slReverse(&svList);
-
-return svList;
-}
-
-void printProbeData(struct geneAlias *ga, struct sampleVals *svList)
-{
-struct sampleVals *sv;
-struct dyString *dy = newDyString(100);
-
-if (median)
-    {
-    dyStringPrintf(dy, "%s\tmedian\t", ga->gene);
-    for (sv = svList; sv; sv = sv->next)
-	{
-	double val = slDoubleMedian(sv->vals);
-	dyStringPrintf(dy, "%f", val);
 	
-	if (sv->next)
-	    dyStringPrintf(dy, "\t");
-	}
-    dyStringPrintf(dy, "\n");
-    }
-else
-    {
-    struct hashEl *el;
-    struct slName *sl;
-    for (sl = ga->probes; sl; sl = sl->next)
-	{
-	dyStringPrintf(dy, "%s\t%s\t", ga->gene, sl->name);
-	for (sv = svList; sv; sv = sv->next)
-	    {
-	    if ((el = hashLookup(sv->valHash, sl->name)) != NULL)
-		{
-		struct slDouble *sd = el->val;
-		dyStringPrintf(dy, "%f", sd->val);
-		}
-	    else
-		dyStringPrintf(dy, "NA");
-		    
-	    if (sv->next)
-		dyStringPrintf(dy, "\t");
-	    }
-	dyStringPrintf(dy, "\n");
-	}
-    }
-
-char *valStr = dyStringCannibalize(&dy);
-printf("%s", valStr);
-}
-
-void clearProbeData(struct sampleVals *svList)
-{
-struct sampleVals *sv;
-for (sv = svList; sv; sv = sv->next)
-    {
-    slFreeList(&sv->vals);
-    hashFree(&sv->valHash);
-
-    sv->vals = NULL;
-    sv->valHash = hashNew(0);
-    }
+fprintf(stderr, "found %d probes\n", count);
 }
 
-
-void extractData(char *tableName, char *geneList, char *samples)
+void extractData(char *tableName, char *sampleName, char *geneList)
 {
 if (!geneList)
     return;
 
@@ -327,26 +189,13 @@
 char *aliasTable = cloneString(el->val);
 
 struct hash *gaHash = getAliases(hgConn, aliasTable, genes);
 
-struct sampleVals *sv, *svList = prepareSampleVals(samples, allA);
-if (!svList)
-    errAbort("No samples allocated.\n");
-
-printf("gene\tprobe");
-for (sv = svList; sv; sv = sv->next)
-    printf("\t%s", sv->name);
-printf("\n");
-
 struct hashCookie cookie = hashFirst(gaHash);
 while ((el = hashNext(&cookie)) != NULL)
     {
     struct geneAlias *ga = el->val;    
-    setProbeData(hgConn, svList, tableName, ga->probes);
-
-    printProbeData(ga, svList);
-
-    clearProbeData(svList);
+    getProbeData(hgConn, allA, tableName, ga->probes);
     }
 
 hFreeConn(&hgConn);
 }
@@ -355,16 +204,13 @@
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
-if (argc != 3)
+if (argc != 4)
     usage();
 
 if (optionExists("median"))
     median = TRUE;
-if (optionExists("tcga"))
-    isTCGA = TRUE;
 
-char *samples = optionVal("samples", NULL);
-extractData(argv[1], argv[2], samples);
+extractData(argv[1], argv[2], argv[3]);
 return 0;
 }