src/hg/instinct/bioInt2/populateDb.c 1.7

1.7 2009/04/27 06:15:49 jsanborn
updated lots of stuff, will break older implementation of database
Index: src/hg/instinct/bioInt2/populateDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/populateDb.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 4 -r1.6 -r1.7
--- src/hg/instinct/bioInt2/populateDb.c	4 Apr 2009 00:39:22 -0000	1.6
+++ src/hg/instinct/bioInt2/populateDb.c	27 Apr 2009 06:15:49 -0000	1.7
@@ -11,8 +11,10 @@
 #include "microarray.h"
 #include "ra.h"
 #include "featuresLib.h"
 #include "hgHeatmapLib.h"
+#include "cprob.h"
+#include "hgStatsLib.h" 
 #include "bioIntDriver.h"
 #include "bioIntDb.h"
 
 char *hgDb = "hg18";
@@ -109,10 +111,9 @@
     char *probe;
     struct slName *genes;
 }; 
 
- 
-struct geneAlias *getAliases(struct sqlConnection *hgConn, char *tableName)
+struct hash *getAliases(struct sqlConnection *hgConn, char *tableName)
 {
 if (!hgConn || !tableName)
     return NULL;
 
@@ -146,20 +147,19 @@
 
     slNameAddHead(&ga->genes, gene);	
     }
 
-hashFree(&gaHash);
 sqlFreeResult(&sr);              
 
-return gaList;                   
+return gaHash;                   
 }
 
 struct dataTypes *findDataType(struct sqlConnection *biConn, char *type, char *platform)
 {
 if (!sameString(type, "bed 15"))
     errAbort("populateDb only runs on bed 15 files.");
 
-char *data_format = "probeVals";
+char *data_format = "analysisVals";
 
 char query[256];
 safef(query, sizeof(query), 
       "select * from %s where format = \"%s\" and name = \"%s\"",
@@ -173,29 +173,16 @@
 int nextId = sqlTableSize(biConn, DT_TABLE);
 struct dataTypes *dt;
 AllocVar(dt);
 dt->id = nextId;
-dt->format = cloneString("probeVals");
+dt->format = cloneString("analysisVals");
 dt->name   = cloneString(platform);
 
 /* Save to db */
 dataTypesSaveToDb(biConn, dt, DT_TABLE, 100);
 return dt;
 } 
 
-void createDataTypesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "format varchar(255) not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
-
 struct dataTypes *setupDataType(struct sqlConnection *biConn, 
 				char *type, char *platform)
 {
 if (!sqlTableExists(biConn, DT_TABLE))
@@ -233,20 +220,8 @@
 tissuesSaveToDb(biConn, ti, TI_TABLE, 100);
 return ti;
 } 
 
-void createTissuesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
-
 struct tissues *setupTissue(struct sqlConnection *biConn, char *tissue)
 {
 if (!sqlTableExists(biConn, TI_TABLE))
     {
@@ -286,14 +261,8 @@
 if (!el)
     errAbort("No name");
 char *dataTable = cloneString(el->val);
 
-char probeTable[256];
-safef(probeTable, sizeof(probeTable), "%s_probeInfo", dataTable);
-
-char p2gTable[256];
-safef(p2gTable, sizeof(p2gTable), "%s_probeToGene", dataTable);
-  
 char *platform;
 el = hashLookup(settings, "platform");
 if (!el)
     platform = cloneString("Expression");
@@ -318,10 +287,8 @@
 da->type_id = dt->id;
 da->num_samples = numSamples;
 da->name = shortLabel;
 da->data_table = dataTable;
-da->probe_table = cloneString(probeTable);
-da->probe_to_gene_table = cloneString(p2gTable);
 
 dataTypesFree(&dt);
 tissuesFree(&ti);
 
@@ -330,26 +297,8 @@
 
 return da;
 }
 
-void createDatasetsTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
-dyStringPrintf(dy, "type_id int unsigned not null,\n");
-dyStringPrintf(dy, "num_samples int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "data_table varchar(255) not null,\n");
-dyStringPrintf(dy, "probe_table varchar(255) not null,\n");
-dyStringPrintf(dy, "probe_to_gene_table varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
-
 struct datasets *setupDataset(struct sqlConnection *biConn, 
 			      char *tableName, char *tissue, int numSamples)
 {
 if (!sqlTableExists(biConn, DA_TABLE))
@@ -405,12 +354,11 @@
       "and name = \"%s\" "
       "and patient_id = %d "
       "and patient_name = \"%s\" "
       "and dataset_id = %d "
-      "and exp_id = %d "
       "and tissue_id = %d ",
       SA_TABLE, sa->id, sa->name, sa->patient_id, sa->patient_name, sa->dataset_id,
-      sa->exp_id, sa->tissue_id);
+      sa->tissue_id);
 
 return sqlExists(biConn, query);
 }
 
@@ -458,9 +406,8 @@
 	sampleName = cloneString(allA->names[i]);
 	patientName = findPatientName(pdConn, patTable, patField, sampleField, sampleName); 
 	}
 
-    int expId = allA->expIds[i];
     int sampleId = findId(biConn, "id", "name", sampleName);
     int patientId = findId(biConn, "patient_id", "patient_name", patientName);
 
     AllocVar(sa);
@@ -468,9 +415,8 @@
     sa->name = sampleName;
     sa->patient_id = patientId;
     sa->patient_name = patientName;
     sa->dataset_id = datasetId;
-    sa->exp_id = expId;
     sa->tissue_id = tissueId;
 
     if (!sampleExists(biConn, sa))
 	samplesSaveToDb(biConn, sa, SA_TABLE, 100);
@@ -480,33 +426,13 @@
 
 hFreeConn(&pdConn);
 }
 
-void createSamplesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "patient_id int unsigned not null,\n");
-dyStringPrintf(dy, "patient_name varchar(255) not null,\n");
-dyStringPrintf(dy, "dataset_id int unsigned not null,\n");
-dyStringPrintf(dy, "exp_id int unsigned not null,\n");
-dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(dataset_id),\n");
-dyStringPrintf(dy, "KEY(id, dataset_id),\n");
-dyStringPrintf(dy, "KEY(dataset_id,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
-
 struct samples *getSamples(struct sqlConnection *biConn, struct datasets *da)
 {
 char query[256];
 safef(query, sizeof(query),
-      "select * from %s where dataset_id = %d order by exp_id;",
+      "select * from %s where dataset_id = %d order by id;",
       SA_TABLE, da->id);
 
 return samplesLoadByQuery(biConn, query);
 }
@@ -522,10 +447,11 @@
     }
 
 createSamples(biConn, da, allA);
 struct samples *saList = getSamples(biConn, da);
-if (slCount(saList) != allA->size)
-    errAbort("Sample count from microarrayGroups and database don't match!");
+
+//if (slCount(saList) != allA->size)
+//    errAbort("Sample count from microarrayGroups and database don't match!");
 
 return saList;
 }
 
@@ -597,41 +523,8 @@
 
 return TRUE;
 }
 
-void createFeaturesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "shortLabel varchar(255) not null,\n");
-dyStringPrintf(dy, "longLabel varchar(255) not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(name),\n");
-dyStringPrintf(dy, "KEY(id,name)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
-
-void createClinicalDataTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "sample_id int unsigned not null,\n");
-dyStringPrintf(dy, "feature_id int unsigned not null,\n");
-dyStringPrintf(dy, "val double not null,\n");
-dyStringPrintf(dy, "code varchar(255),\n");
-dyStringPrintf(dy, "KEY(sample_id),\n");
-dyStringPrintf(dy, "KEY(feature_id),\n");
-dyStringPrintf(dy, "KEY(sample_id,feature_id),\n");
-dyStringPrintf(dy, "KEY(feature_id,sample_id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
-
 void setupClinicalInfo(struct sqlConnection *biConn, struct datasets *da, struct samples *saList)
 {
 if (!saList)
     return;
@@ -741,335 +634,262 @@
 
 hFreeConn(&pdConn);
 }
 
-void createProbeValsTable(struct sqlConnection *biConn, char *tableName)
+
+struct analysisFeatures *getAnalysisFeatures(struct sqlConnection *biConn, 
+					     char *names, char *type)
 {
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "probe_id int unsigned not null,\n");
-dyStringPrintf(dy, "sample_count int unsigned not null,\n");
-dyStringPrintf(dy, "sample_data longblob not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(probe_id)\n"); 
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
-
-void createProbeInfoTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "chrom varchar(255) not null,\n");
-dyStringPrintf(dy, "start int unsigned not null,\n");
-dyStringPrintf(dy, "stop int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id),\n");
-dyStringPrintf(dy, "KEY(name),\n");
-dyStringPrintf(dy, "KEY(id,name)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
-
-void addProbeValsToDb(struct sqlConnection *biConn, char *tableName, 
-		      int probe_id, int sample_count, char *dataString, int updateSize)
-{ /* This bypasses the need to convert to float array to enter in database, may not
-   * be faster, but by converting first to float array any "blank" datapoints are 
-   * converted to 0.0, instead of leaving blank */
-struct dyString *update = newDyString(updateSize);
-dyStringPrintf(update, "insert into %s values ( %u,%u,'%s')",
-	       tableName, probe_id,  sample_count,  dataString );
-sqlUpdate(biConn, update->string);
-freeDyString(&update);
-}
+if (!names)
+    return NULL;
+struct slName *sl, *slList = slNameListFromComma(names);
 
+struct dyString *dy = newDyString(100);
+dyStringPrintf(dy, 
+	       "select * from %s where type = \"%s\" "
+	       "and feature_name in (", AF_TABLE, type);
+for (sl = slList; sl; sl = sl->next)
+    {
+    dyStringPrintf(dy, "\"%s\"", sl->name);
+    if (sl->next)
+	dyStringPrintf(dy, ",");
+    }
+dyStringPrintf(dy, ");");
+char *query = dyStringCannibalize(&dy);
 
-void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn, 
-		    struct datasets *da)
-{
-char *dataTable = da->data_table;
-char *probeTable = da->probe_table;
+return analysisFeaturesLoadByQuery(biConn, query);
+}
 
-if (!dataTable || !probeTable)
-    errAbort("datasets entry not complete, data_table or probe_table not set.");
 
-boolean inputProbeVals = FALSE;
-boolean inputProbeInfo = FALSE;
 
-if (sqlTableExists(biConn, dataTable) && dropTable)
+void addDataToHash(struct hash *dataHash, char *gene, 
+		   unsigned int expCount, float *expScores, struct maGrouping *allA)
+{
+struct hash *hash;
+struct hashEl *el = hashLookup(dataHash, gene);
+if (!el)
     {
-    fprintf(stderr, "probeVals table %s already exists in db, dropping...\n", dataTable);
-    sqlDropTable(biConn, dataTable);
+    hash = hashNew(0);
+    hashAdd(dataHash, gene, hash);
     }
+else
+    hash = el->val;
 
-if (!sqlTableExists(biConn, dataTable))
+int i;
+for (i = 0; i < expCount; i++)
     {
-    fprintf(stderr, "Creating probeVals table %s...\n", dataTable);
-    createProbeValsTable(biConn, dataTable);
-    inputProbeVals = TRUE;  // empty table, input
-    }
+    float val = expScores[i];
+    char *name;
+    if (isTCGA)
+	name = cloneStringZ(allA->names[i], 16);
+    else
+	name = cloneString(allA->names[i]);
 
-if (sqlTableExists(biConn, probeTable) && dropTable)
+    struct slDouble *sd = slDoubleNew(val);
+    
+    el = hashLookup(hash, name);
+    if (!el)
+	hashAdd(hash, name, sd);
+    else
     {
-    fprintf(stderr, "probeInfo table %s already exists in db, dropping...\n", probeTable);
-    sqlDropTable(biConn, probeTable);
+	struct slDouble *sdList = el->val;
+	slAddTail(&sdList, sd);
     }
+    }
+}
+
+boolean reduceDataHash(struct hash *dataHash, double *retMed, double *retStd)
+{
+struct slDouble *sd, *allSd, *allSdList = NULL;
+struct hashEl *outEl;
+struct hashCookie cookie = hashFirst(dataHash);
+while ((outEl = hashNext(&cookie)) != NULL)
+    {
+    struct hash *hash = outEl->val;
 
-if (!sqlTableExists(biConn, probeTable))
+    struct hashEl *inEl, *elList = hashElListHash(hash);
+    for (inEl = elList; inEl != NULL; inEl = inEl->next)
     {
-    fprintf(stderr, "Creating probeInfo table %s...\n", probeTable);
-    createProbeInfoTable(biConn, probeTable);
-    inputProbeInfo = TRUE;   // empty table, input
-    }
+	char *sample = inEl->name;
+	struct slDouble *sdList = inEl->val;
+	double med = slDoubleMedian(sdList);
+	sd = slDoubleNew(med);
+	allSd = slDoubleNew(med);
 
-char query[256];
-safef(query, sizeof(query), "select * from %s;", dataTable);
+	slAddHead(&allSdList, allSd);
 
-/* Get bed15 data from hg18 database */
-int id = 0;
-struct sqlResult *sr = sqlGetResult(hgConn, query);
+	hashRemove(hash, sample);
+	slFreeList(&sdList);
 
-char **row = NULL;
-while ((row = sqlNextRow(sr)) != NULL)
-    {
-    char *chrom = row[1];
-    unsigned chromStart = sqlUnsigned(row[2]);
-    unsigned chromEnd = sqlUnsigned(row[3]);
-    char *name = row[4];
-    unsigned expCount = sqlUnsigned(row[13]);
-    char *expScores = row[15];
+	hashAdd(hash, sample, sd);
+	}
+    hashElFreeList(&elList);
+    }
 
-    /* Make probeInfo entry and load into db*/
-    struct probeInfo *pi;
-    AllocVar(pi);
-    pi->id = id;
-    pi->chrom = cloneString(chrom);
-    pi->start = chromStart;
-    pi->stop  = chromEnd;
-    pi->name  = cloneString(name);
+double count = (double) slCount(allSdList);
+double allMedian = slDoubleMedian(allSdList);
     
-    if (inputProbeInfo)
-	probeInfoSaveToDb(biConn, pi, probeTable, 100);
+slSort(allSdList, slDoubleCmp);
+int low = round(count * (0.0015));
+int high = round(count * (1.0 - 0.0015));
 
-    /* Make probeVals entry and load into db, straight-up copying longblob*/
-    if (inputProbeVals)
-	addProbeValsToDb(biConn, dataTable, id, expCount, expScores, 500);
+sd = slElementFromIx(allSdList, low);
+double lowVal = sd->val;
 
-    id++;
-    probeInfoFree(&pi);
-    }
-}
+sd = slElementFromIx(allSdList, high);
+double highVal = sd->val;
 
-void createGeneLookupTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "kgId varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id),\n");
-dyStringPrintf(dy, "KEY(kgId),\n");
-dyStringPrintf(dy, "KEY(id,kgId),\n");
-dyStringPrintf(dy, "KEY(kgId,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
+double mad = max(fabs(lowVal - allMedian), fabs(highVal - allMedian))/3.0;
+double std = mad * 1.43;
+
+*retStd = std;
+*retMed = allMedian;
+
+return TRUE;
 }    
 
-void createGeneLookup(struct sqlConnection *biConn)
+struct analysisVals *getAnalysisVals(struct sqlConnection *biConn, struct hash *dataHash, 
+				     double med, double std)
 {
-if (!sqlTableExists(biConn, GL_TABLE))
-    {
-    fprintf(stderr, "geneLookup table doesn't exist in bioInt database, recreating it.\n");
-    createGeneLookupTable(biConn, GL_TABLE);
-    }
+struct hash *sampleHash = createIdHash(biConn, SA_TABLE, "name");
 
-if (sqlTableSize(biConn, GL_TABLE) > 0)
+double z, p, val;
+double maxLogP = 88.0;
+struct hashEl *inEl, *outEl;
+struct analysisVals *av, *avList = NULL;
+struct hashCookie cookie = hashFirst(dataHash);
+while ((outEl = hashNext(&cookie)) != NULL)
     {
-    fprintf(stderr, "geneLookup table already has data in it, doing nothing.\n");
-    return;
-    }
-
-if (!sqlTableExists(biConn, "knownGene"))
-    errAbort("Need knownGene table in bioInt database.");
+    char *gene = outEl->name;
+    struct hash *hash = outEl->val;
 
-char query[256];
-safef(query, sizeof(query), "select name from knownGene;");
+    struct analysisFeatures *af = getAnalysisFeatures(biConn, gene, "gene");
 
-struct slName *sl, *slList = sqlQuickList(biConn, query);
+    if (!af)
+	continue;
 
-int id = 0;
-struct geneLookup *gl;
-for (sl = slList; sl; sl = sl->next)
+    struct hashEl *elList = hashElListHash(hash);
+    for (inEl = elList; inEl != NULL; inEl = inEl->next)
     {
-    AllocVar(gl);
-    gl->id = id;
-    gl->kgId = cloneString(sl->name);
-    
-    id++;
-    geneLookupSaveToDb(biConn, gl, GL_TABLE, 100);
-    geneLookupFree(&gl);
-    }
-
-slNameFreeList(&slList);
-}
+	char *sample = inEl->name;
+	struct slDouble *sd = inEl->val;
 
-void createProbeToGeneTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "probe_id int unsigned not null,\n");
-dyStringPrintf(dy, "gene_id int unsigned not null,\n");
-dyStringPrintf(dy, "KEY(probe_id),\n");
-dyStringPrintf(dy, "KEY(gene_id),\n");
-dyStringPrintf(dy, "KEY(probe_id,gene_id),\n");
-dyStringPrintf(dy, "KEY(gene_id,probe_id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
+	int sample_id = hashIntValDefault(sampleHash, sample, -1);
+	if (sample_id == -1)
+	    errAbort("No sample by name of %s\n", sample);
+
+	AllocVar(av);
+	av->sample_id = sample_id;
+	av->feature_id = af->id;
+	av->val = sd->val;
+
+	z = (av->val - med)/std;
+	p = ndtr(-1.0*fabs(z));
+	if (p > 0)
+	    val = min(-log(p)/log(10.0), maxLogP);
+	else
+	    val = maxLogP;
 
-int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
-{
-char query[128];
-safef(query, sizeof(query),
-      "select id from %s where name = \"%s\"",
-      tableName, name);
+	if (z < 0.0)
+	    val = -1.0*val;  // signed log(p-value)
+        av->conf = val; 
 
-if (!sqlExists(biConn, query))
-    return -1;
+	slAddHead(&avList, av);
+	}
+    hashElFreeList(&elList);
+    }
 
-return sqlQuickNum(biConn, query);
+return avList;
 }
 
-struct slInt *getGeneIdsBySymbol(struct sqlConnection *biConn, 
-				 struct slName *slList)
+void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn, 
+		    struct datasets *da, struct maGrouping *allA)
 {
-if (!slList)
-    return NULL;
+char *dataTable = da->data_table;
 
-struct slName *sl;
-struct dyString *dy = newDyString(100);
-dyStringPrintf(dy, "select id from %s "
-	       "join kgXref on %s.kgId = kgXref.kgId "
-	       "where kgXref.geneSymbol in (", GL_TABLE, GL_TABLE);
-for (sl = slList; sl; sl = sl->next)
+if (!dataTable)
+    errAbort("datasets entry not complete, data_table not set.");
+
+boolean inputProbeVals = FALSE;
+if (sqlTableExists(biConn, dataTable) && dropTable)
     {
-    dyStringPrintf(dy, "\"%s\"", sl->name);
-    if (sl->next)
-	dyStringPrintf(dy, ",");
+    fprintf(stderr, "analysisVals table %s already exists in db, dropping...\n", dataTable);
+    sqlDropTable(biConn, dataTable);
     }
-dyStringPrintf(dy, ");");
-char *query = dyStringCannibalize(&dy);
-
-return sqlQuickNumList(biConn, query);
-}
 
-
-void setupProbeToGene(struct sqlConnection *hgConn, 
-		      struct sqlConnection *biConn, struct datasets *da)
-{
-char *p2gTable = da->probe_to_gene_table;
-if (!p2gTable)
+if (!sqlTableExists(biConn, dataTable))
     {
-    fprintf(stderr, "probeToGene table not set, doing nothing.\n");
-    return;
+    fprintf(stderr, "Creating analysisVals table %s...\n", dataTable);
+    inputProbeVals = TRUE;  // empty table, input
     }
 
-struct hash *settings = getSettings(da->data_table);
+if (!inputProbeVals)
+    return;
+
+struct hash *settings = getSettings(dataTable);
 struct hashEl *el = hashLookup(settings, "aliasTable");
 if (!el)
     errAbort("No aliasTable.\n");
 char *aliasTable = cloneString(el->val);
+struct hash *gaHash = getAliases(hgConn, aliasTable);
 
-if (!sqlTableExists(hgConn, aliasTable))
-    errAbort("Table %s not found in hg18 database.\n", aliasTable);
+char query[256];
+safef(query, sizeof(query), "select * from %s;", dataTable);
 
-if (!sqlTableExists(biConn, "kgXref"))
-    errAbort("kgXref table not found in database. Cannot create probeToGene table.\n");
+/* Get bed15 data from hg18 database */
+struct sqlResult *sr = sqlGetResult(hgConn, query);
 
-if (sqlTableExists(biConn, p2gTable) && dropTable)
-    {
-    fprintf(stderr, "Table %s already exists, dropping and recreating.\n", p2gTable);
-    sqlDropTable(biConn, p2gTable);
-    }
+struct hash *dataHash = hashNew(0);
 
-boolean inputProbeToGene = FALSE;
-if (!sqlTableExists(biConn, p2gTable))
+char **row = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
     {
-    fprintf(stderr, "Creating probeToGene table...\n");
-    createProbeToGeneTable(biConn, p2gTable);
-    inputProbeToGene = TRUE;
-    }
+    struct bed *nb = bedLoadN(row+1, 15);
 
-if (!inputProbeToGene)
-    return;
+    char *name        = nb->name; // row[4];
+    unsigned expCount = nb->expCount; //sqlUnsigned(row[13]);
+    float *expScores  = nb->expScores; // row[15];
 
-struct geneAlias *ga, *gaList = getAliases(hgConn, aliasTable);
-for (ga = gaList; ga; ga = ga->next)
-    {
-    int probeId = getProbeId(biConn, da->probe_table, ga->probe);
-    if (probeId < 0)  // probe in alias table doesn't exist in dataset
+    struct hashEl *el = hashLookup(gaHash, name);
+    if (!el)
 	continue;
+    struct geneAlias *ga = el->val;
+    struct slName *sl;
+    for (sl = ga->genes; sl; sl = sl->next)
+	addDataToHash(dataHash, sl->name, expCount, expScores, allA);
+    }
+
+if (hashNumEntries(dataHash) == 0)
+    errAbort("no entries in hash\n");
+
+fprintf(stderr, "\treducing hash...\n");
+double med, std;
+if (!reduceDataHash(dataHash, &med, &std))
+    errAbort("problem reducing hash\n");
+
+fprintf(stderr, "\tconverting hash to analysisVals...\n");
+struct analysisVals *avList = getAnalysisVals(biConn, dataHash, med, std);
+
+fprintf(stderr, "\tstoring analysisVals...\n");
+storeAnalysisValsInDb(biConn, dataTable, avList);
+analysisValsFreeList(&avList);
+}
 
-    struct slInt *si, *geneIds = getGeneIdsBySymbol(biConn, ga->genes);
 
-    struct probeToGene *pg;
-    AllocVar(pg);
-    pg->probe_id = probeId;
-    for (si = geneIds; si; si = si->next)
-	{
-	pg->gene_id = si->val;
-	probeToGeneSaveToDb(biConn, pg, p2gTable, 10);
-	}
-    probeToGeneFree(&pg);
-    }
-}
+int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
+{
+char query[128];
+safef(query, sizeof(query),
+      "select id from %s where name = \"%s\"",
+      tableName, name);
 
-void createPathwaysTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "source varchar(255) not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(name),\n");
-dyStringPrintf(dy, "KEY(id,name),\n");
-dyStringPrintf(dy, "KEY(name,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
-
-void createPathwayGenesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "gene_id int unsigned not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(gene_id),\n");
-dyStringPrintf(dy, "KEY(id,gene_id),\n");
-dyStringPrintf(dy, "KEY(gene_id,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}    
-
-void createPathwayInfoTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "description longblob not null,\n");
-dyStringPrintf(dy, "KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
+if (!sqlExists(biConn, query))
+    return -1;
+
+return sqlQuickNum(biConn, query);
 }    
 
+
 char *getPathwayDescription(struct sqlConnection *pdConn, char *name)
 {
 if (!sqlTableExists(pdConn, name))
     return NULL;
@@ -1081,59 +901,59 @@
 
 return sqlQuickString(pdConn, query);
 }
 
-void setupPathways(struct sqlConnection *biConn)
+void setupGenesets(struct sqlConnection *biConn)
 {
-boolean inputPathways = FALSE;
-boolean inputPathwayInfo = FALSE;
-boolean inputPathwayGenes = FALSE;
+boolean inputGenesets = FALSE;
+boolean inputGenesetInfo = FALSE;
+boolean inputGenesetGenes = FALSE;
 
 struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
 if (!pdConn)
     errAbort("Could not connect to pathways database.\n");
 
-if (sqlTableExists(biConn, PA_TABLE) && dropTable)
+if (sqlTableExists(biConn, GE_TABLE) && dropTable)
     {
-    fprintf(stderr, "pathways table already exists, dropping and recreating.\n");
-    sqlDropTable(biConn, PA_TABLE);
+    fprintf(stderr, "%s table already exists, dropping and recreating.\n", GE_TABLE);
+    sqlDropTable(biConn, GE_TABLE);
     }
-if (!sqlTableExists(biConn, PA_TABLE))
+if (!sqlTableExists(biConn, GE_TABLE))
     {
-    fprintf(stderr, "Creating pathways table.\n");
-    createPathwaysTable(biConn, PA_TABLE);
-    inputPathways = TRUE;
+    fprintf(stderr, "Creating %s table.\n", GE_TABLE);
+    createGenesetsTable(biConn, GE_TABLE);
+    inputGenesets = TRUE;
     }
 
-if (sqlTableExists(biConn, PG_TABLE) && dropTable)
+if (sqlTableExists(biConn, GG_TABLE) && dropTable)
     {
-    fprintf(stderr, "pathwayGenes table already exists, dropping and recreating.\n");
-    sqlDropTable(biConn, PG_TABLE);
+    fprintf(stderr, "%s table already exists, dropping and recreating.\n", GG_TABLE);
+    sqlDropTable(biConn, GG_TABLE);
     }
 
-if (!sqlTableExists(biConn, PG_TABLE))
+if (!sqlTableExists(biConn, GG_TABLE))
     {
-    fprintf(stderr, "Creating pathwayGenes table.\n");
-    createPathwayGenesTable(biConn, PG_TABLE);
-    inputPathwayGenes = TRUE;
+    fprintf(stderr, "Creating %s table.\n", GG_TABLE);
+    createGenesetGenesTable(biConn, GG_TABLE);
+    inputGenesetGenes = TRUE;
     }
 
-if (sqlTableExists(biConn, PI_TABLE) && dropTable)
+if (sqlTableExists(biConn, GI_TABLE) && dropTable)
     {
-    fprintf(stderr, "pathwayInfo table already exists, dropping and recreating.\n");
-    sqlDropTable(biConn, PI_TABLE);
+    fprintf(stderr, "%s table already exists, dropping and recreating.\n", GI_TABLE);
+    sqlDropTable(biConn, GI_TABLE);
     }
 
-if (!sqlTableExists(biConn, PI_TABLE))
+if (!sqlTableExists(biConn, GI_TABLE))
     {
-    fprintf(stderr, "Creeting pathwayInfo table.\n");
-    createPathwayInfoTable(biConn, PI_TABLE);
-    inputPathwayInfo = TRUE;
+    fprintf(stderr, "Creeting %s table.\n", GI_TABLE);
+    createGenesetInfoTable(biConn, GI_TABLE);
+    inputGenesetInfo = TRUE;
     }
 
-if (!inputPathways && !inputPathwayInfo && !inputPathwayGenes)
+if (!inputGenesets && !inputGenesetInfo && !inputGenesetGenes)
     {
-    fprintf(stderr, "Nothing to do for pathway tables.\n");
+    fprintf(stderr, "Nothing to do for geneset tables.\n");
     return;
     }
 
 /* Setting up pathways table */
@@ -1155,59 +975,164 @@
 slReverse(&names);
 slReverse(&genes);
 sqlFreeResult(&sr);
 
-int id = 0;
 for (na = names, ge = genes; na && ge; na = na->next, ge = ge->next)
     {
     char *name = na->name;
-    char *genes = ge->name;
+    char *members = ge->name;
 
-    struct slName *slList = slNameListFromComma(genes);
-    struct slInt *si, *siList = getGeneIdsBySymbol(biConn, slList); 
+    struct analysisFeatures *gsAf = getAnalysisFeatures(biConn, name, "geneset");
+    struct analysisFeatures *af, *afList = getAnalysisFeatures(biConn, members, "gene"); 
     
-    struct pathways *ps;
+    if (!gsAf)
+	continue;
+
+    if (!afList)
+	continue;
 
-    if (inputPathways)
+    if (inputGenesets)
 	{
-	AllocVar(ps);
-	ps->id = id;
-	ps->name = cloneString(name);
-	ps->source = cloneString("N/A");
-	pathwaysSaveToDb(biConn, ps, PA_TABLE, 100);
-	pathwaysFree(&ps);
+	struct genesets *gs;
+	AllocVar(gs);
+	gs->id = gsAf->id;
+	gs->name = cloneString(name);
+	gs->source = cloneString("N/A");
+	genesetsSaveToDb(biConn, gs, GE_TABLE, 100);
+	genesetsFree(&gs);
 	}
 
-    if (inputPathwayGenes)
+    if (inputGenesetGenes)
 	{
-	struct pathwayGenes *pg;
-	AllocVar(pg);
-	pg->id = id;
-	for (si = siList; si; si = si->next)
+	struct genesetGenes *gg;
+	AllocVar(gg);
+	gg->id = gsAf->id;
+	for (af = afList; af; af = af->next)
 	    {
-	    pg->gene_id = si->val;
-	    pathwayGenesSaveToDb(biConn, pg, PG_TABLE, 100);
+	    gg->gene_id = af->id;
+	    genesetGenesSaveToDb(biConn, gg, GG_TABLE, 100);
 	    }
-	pathwayGenesFree(&pg);
+	genesetGenesFree(&gg);
 	}
     
-    if (inputPathwayInfo)
+    if (inputGenesetInfo)
 	{
 	char *desc = getPathwayDescription(pdConn, name);
 	if (desc)
 	    {
-	    struct pathwayInfo *pi;
-	    AllocVar(pi);
-	    pi->id = id;
-	    pi->description = desc;
-	    pathwayInfoSaveToDbEscaped(biConn, pi, PI_TABLE, 200);
-	    pathwayInfoFree(&pi);
+	    struct genesetInfo *gi;
+	    AllocVar(gi);
+	    gi->id = gsAf->id;
+	    gi->description = desc;
+	    genesetInfoSaveToDbEscaped(biConn, gi, GI_TABLE, 200);
+	    genesetInfoFree(&gi);
+	    }
 	    }
+
+    analysisFeaturesFree(&gsAf);
+    analysisFeaturesFreeList(&afList);
+    }
+
+hFreeConn(&pdConn);
+}
+
+boolean analysisFeatureExists(struct sqlConnection *biConn, struct analysisFeatures *af)
+{
+char query[256];
+safef(query, sizeof(query),
+      "select * from %s where id = %d "
+      "and feature_name = \"%s\" "
+      "and type = \"%s\"",
+      AF_TABLE, af->id, af->feature_name, af->type);
+
+return sqlExists(biConn, query);
+}   
+
+int findIdForAnalysisFeature(struct sqlConnection *biConn, char *tableName,
+			     struct analysisFeatures *af)
+{
+if (sqlTableSize(biConn, tableName) == 0)  /* brand new table, return 0 */
+    return 0;
+
+char query[256];
+safef(query, sizeof(query),
+      "select DISTINCT id from %s where feature_name = \"%s\" "
+      "and type = \"%s\";",
+      tableName, af->feature_name, af->type);
+if (sqlExists(biConn, query))  /* sample name found, use same id */
+    return sqlQuickNum(biConn, query);
+
+/* Else, find maximum sample id and add one to it */
+safef(query, sizeof(query),
+      "select max(id) from %s;",
+      tableName);
+int maxId = sqlQuickNum(biConn, query);
+return maxId + 1;
+}   
+
+void setupAnalysisFeatures(struct sqlConnection *biConn)
+{
+if (!sqlTableExists(biConn, AF_TABLE))
+    createAnalysisFeaturesTable(biConn, AF_TABLE);
+
+if (sqlTableSize(biConn, AF_TABLE) != 0)
+    {
+    fprintf(stderr, "%s table is not empty, doing nothing\n", AF_TABLE);
+    return;
 	}
-    id++;
+
+/* set up gene features */
+char query[256];
+safef(query, sizeof(query), 
+      "select DISTINCT geneSymbol from %s;", KX_TABLE);
+struct slName *sl, *slList = sqlQuickList(biConn, query);
+
+struct analysisFeatures *af, *afList = NULL;
+
+for (sl = slList; sl; sl = sl->next)
+    {
+    AllocVar(af);
+    af->id = 0;
+    af->feature_name = cloneString(sl->name);
+    af->type = cloneString("gene");
+   
+    slAddHead(&afList, af);
     }
+slNameFreeList(&slList);
+
+/* set up geneset features */
+struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
+safef(query, sizeof(query), 
+      "select name from genesets;");
+slList = sqlQuickList(pdConn, query);
 
+for (sl = slList; sl; sl = sl->next)
+    {
+    AllocVar(af);
+    af->id = 0;
+    af->feature_name = cloneString(sl->name);
+    af->type = cloneString("geneset");
+    
+    slAddHead(&afList, af);
+    }
 hFreeConn(&pdConn);
+slNameFreeList(&slList);
+
+slReverse(&afList);
+
+/* set up pathway features (TODO) */
+
+
+/* save features to db */
+for (af = afList; af; af = af->next)
+    {
+    int feature_id = findIdForAnalysisFeature(biConn, AF_TABLE, af);
+    af->id = feature_id;
+    if (!analysisFeatureExists(biConn, af))
+	analysisFeaturesSaveToDb(biConn, af, AF_TABLE, 10);
+    }
+
+analysisFeaturesFree(&afList);
 }
 
 
 void populateDb(char *db, char *tableName, char *tissue)
@@ -1215,18 +1140,24 @@
 tissue = strLower(tissue);
 struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
 struct sqlConnection *hgConn = hAllocConnProfile("localDb", hgDb);
 
-/* Create geneLookup table (if necessary) */
+/* Create analysis features (if necessary) */
 uglyTime(NULL);
-fprintf(stderr, "Setting up geneLookup table...\n");
-createGeneLookup(biConn);
+fprintf(stderr, "Setting up analysis features...\n");
+setupAnalysisFeatures(biConn);
 uglyTime("Time");
 
+/* Create geneLookup table (if necessary) */
+//uglyTime(NULL);
+//fprintf(stderr, "Setting up geneLookup table...\n");
+//createGeneLookup(biConn);
+//uglyTime("Time");
+
 /* Set up pathways */
 uglyTime(NULL);
 fprintf(stderr, "Setting up pathways tables...\n");
-setupPathways(biConn);
+setupGenesets(biConn);
 uglyTime("Time");
 
 /* Set up datasets entry */
 struct maGrouping *allA = getMaGrouping(hgConn, tableName);
@@ -1253,15 +1184,9 @@
 
 /* Set up probeInfo table (if necessary) and probeVals table */
 uglyTime(NULL);
 fprintf(stderr, "Setting up probe data tables (be patient!)...\n");
-setupProbeData(hgConn, biConn, da);
-uglyTime("Time");
-
-/* Set up probeToGene table (if necessary) */
-uglyTime(NULL);
-fprintf(stderr, "Setting up probeToGene table...\n");
-setupProbeToGene(hgConn, biConn, da);
+setupProbeData(hgConn, biConn, da, allA);
 uglyTime("Time");
 
 fprintf(stderr, "Done!");
 fprintf(stderr, "Please run 'setCohort' to find datasets that have overlapping samples.\n");