src/hg/instinct/bioInt2/populateDb.c 1.1

1.1 2009/03/20 06:06:32 jsanborn
initial commit
Index: src/hg/instinct/bioInt2/populateDb.c
===================================================================
RCS file: src/hg/instinct/bioInt2/populateDb.c
diff -N src/hg/instinct/bioInt2/populateDb.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/instinct/bioInt2/populateDb.c	20 Mar 2009 06:06:32 -0000	1.1
@@ -0,0 +1,1274 @@
+/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "bed.h"
+#include "genePred.h"
+#include "hPrint.h"
+#include "hdb.h"  
+#include "microarray.h"
+#include "ra.h"
+#include "featuresLib.h"
+#include "hgHeatmapLib.h"
+#include "bioIntDb.h"
+
+#define DEBUG 1
+
+char *hgDb = "hg18";
+char *genome = "Human";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "populateDb \n"
+  "   populateDb [OPTIONS] db table tissue\n"
+  "options:\n"
+  "   -dropAll   Drop/recreate any table\n"
+  "\n"
+  );
+}
+
+boolean dropTable = FALSE;   // If true, any table that should be dropped/recreated will be
+
+static struct optionSpec options[] = {
+    {"dropAll", OPTION_BOOLEAN},
+    {NULL, 0},
+};
+
+char *getId(struct sqlConnection *conn, char *table, char *key, char *sample, char *value)
+/* get ISPY ID from sample (or experiment) Id */
+{
+char query[512];
+safef(query, sizeof(query), "select %s from %s where %s = \"%s\" ", key, table, value, sample);
+return sqlQuickString(conn, query);
+}
+
+
+struct slName *getProbesFromTable(struct sqlConnection *hgConn, char *tableName)
+{
+char query[512];
+char *key = "name";
+safef(query, sizeof(query), "select DISTINCT %s from %s ", key, tableName);
+struct sqlResult *sr = sqlGetResult(hgConn, query);
+char **row = NULL;
+
+struct slName *sl, *slList = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    sl = slNameNew(row[0]); 
+    slAddHead(&slList, sl);                                                                       
+    }
+
+slReverse(&slList);
+sqlFreeResult(&sr);
+return slList;
+}
+
+
+struct maGrouping *getMaGrouping(struct sqlConnection *hgConn, char *tableName)
+{
+/*microarray specific settings*/
+struct trackDb *tdb = hMaybeTrackInfo(hgConn, tableName);  
+struct microarrayGroups *maGs = maGroupings("hg18", tableName);
+trackDbFreeList(&tdb);
+if (!maGs)
+    return NULL;
+return maGs->allArrays;
+}
+
+struct hash *getSettings(char *tableName)
+{
+struct column *raList = getColumns(NULL, "datasets.ra", NULL);  
+
+struct column *col;
+struct hash *settings = NULL;
+for (col = raList; col; col = col->next)
+    {
+    if (!sameString(col->name, tableName))
+	continue;
+
+    settings = col->settings;
+    break;
+    }
+
+if (!settings)
+    errAbort("Couldn't find datasets.ra listing for %s", tableName);
+
+return settings;
+}
+
+struct geneAlias {
+    struct geneAlias *next;
+
+    char *probe;
+    struct slName *genes;
+}; 
+
+ 
+struct geneAlias *getAliases(struct sqlConnection *hgConn, char *tableName)
+{
+if (!hgConn || !tableName)
+    return NULL;
+
+char query[512];
+char **row;
+  
+safef(query, sizeof(query), "select * from %s", tableName);
+
+struct sqlResult *sr = sqlGetResult(hgConn, query);
+
+struct geneAlias *ga, *gaList = NULL;
+struct hash *gaHash = hashNew(0);
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *probe = cloneString(row[0]);
+    char *gene = cloneString(row[1]);
+
+    struct hashEl *el = hashLookup(gaHash, probe);
+
+    if (!el)
+	{
+	ga = AllocA(struct geneAlias);
+	ga->probe = cloneString(probe);
+	ga->genes = NULL;
+
+	slAddHead(&gaList, ga);
+	hashAdd(gaHash, probe, ga);
+	}
+    else
+	ga = el->val;
+
+    slNameAddHead(&ga->genes, gene);	
+    }
+
+hashFree(&gaHash);
+sqlFreeResult(&sr);              
+
+return gaList;                   
+}
+
+struct dataTypes *findDataType(struct sqlConnection *biConn, char *type, char *platform)
+{
+if (!sameString(type, "bed 15"))
+    errAbort("populateDb only runs on bed 15 files.");
+
+char *data_format = "probeVals";
+
+char query[256];
+safef(query, sizeof(query), 
+      "select * from dataTypes where format = \"%s\" and name = \"%s\"",
+      data_format, platform);
+
+return dataTypesLoadByQuery(biConn, query);
+}
+
+struct dataTypes *createDataType(struct sqlConnection *biConn, char *type, char *platform)
+{
+int nextId = sqlTableSize(biConn, "dataTypes");
+struct dataTypes *dt;
+AllocVar(dt);
+dt->id = nextId;
+dt->format = cloneString("probeVals");
+dt->name   = cloneString(platform);
+
+/* Save to db */
+dataTypesSaveToDb(biConn, dt, "dataTypes", 100);
+return dt;
+} 
+
+void createDataTypesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "format varchar(255) not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+struct dataTypes *setupDataType(struct sqlConnection *biConn, 
+				char *type, char *platform)
+{
+if (!sqlTableExists(biConn, "dataTypes"))
+    {
+    fprintf(stderr, "Tables dataTypes doesn't exist, creating...\n");
+    createDataTypesTable(biConn, "dataTypes");
+    }
+
+struct dataTypes *dt = findDataType(biConn, type, platform);
+if (!dt)
+    dt = createDataType(biConn, type, platform);
+
+return dt;
+}
+
+
+struct tissues *findTissue(struct sqlConnection *biConn, char *tissue)
+{
+char query[256];
+safef(query, sizeof(query), 
+      "select * from tissues where name = \"%s\";",
+      tissue);
+return tissuesLoadByQuery(biConn, query);
+}
+
+struct tissues *createTissue(struct sqlConnection *biConn, char *tissue)
+{
+int nextId = sqlTableSize(biConn, "tissues");
+struct tissues *ti;
+AllocVar(ti);
+ti->id = nextId;
+ti->name = cloneString(tissue);
+
+/* Save to db */
+tissuesSaveToDb(biConn, ti, "tissues", 100);
+return ti;
+} 
+
+void createTissuesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+struct tissues *setupTissue(struct sqlConnection *biConn, char *tissue)
+{
+if (!sqlTableExists(biConn, "tissues"))
+    {
+    fprintf(stderr, "Tables tissues doesn't exist, creating...\n");
+    createTissuesTable(biConn, "tissues");
+    }
+
+struct tissues *ti = findTissue(biConn, tissue);
+if (!ti)
+    ti = createTissue(biConn, tissue);
+
+return ti;
+}
+
+
+struct datasets *findDataset(struct sqlConnection *biConn, char *name)
+{
+char query[256];
+safef(query, sizeof(query), 
+      "select * from datasets where data_table = \"%s\";",
+      name);
+return datasetsLoadByQuery(biConn, query);
+}
+
+
+struct datasets *createDataset(struct sqlConnection *biConn, 
+			       char *tableName, char *tissue, int numSamples)
+{
+struct hash *settings = getSettings(tableName);
+
+struct hashEl *el = hashLookup(settings, "shortLabel");
+if (!el)
+    errAbort("No shortLabel");
+char *shortLabel = cloneString(el->val);
+
+el = hashLookup(settings, "name");
+if (!el)
+    errAbort("No name");
+char *dataTable = cloneString(el->val);
+
+char probeTable[256];
+safef(probeTable, sizeof(probeTable), "%s_probeInfo", dataTable);
+
+char p2gTable[256];
+safef(p2gTable, sizeof(p2gTable), "%s_probeToGene", dataTable);
+  
+char *platform;
+el = hashLookup(settings, "platform");
+if (!el)
+    platform = cloneString("Expression");
+else
+    platform = cloneString(el->val);
+
+el = hashLookup(settings, "dataType");
+if (!el)
+    errAbort("No dataType");
+char *dataType = cloneString(el->val);
+
+struct dataTypes *dt = setupDataType(biConn, dataType, platform);
+
+struct tissues *ti = setupTissue(biConn, tissue);
+
+int nextId = sqlTableSize(biConn, "datasets");
+
+struct datasets *da;
+AllocVar(da);
+da->id = nextId;
+da->tissue_id = ti->id;
+da->type_id = dt->id;
+da->num_samples = numSamples;
+da->name = shortLabel;
+da->data_table = dataTable;
+da->probe_table = cloneString(probeTable);
+da->probe_to_gene_table = cloneString(p2gTable);
+
+dataTypesFree(&dt);
+tissuesFree(&ti);
+
+/* Write datasets */
+datasetsSaveToDbEscaped(biConn, da, "datasets", 100); 
+
+return da;
+}
+
+void createDatasetsTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
+dyStringPrintf(dy, "type_id int unsigned not null,\n");
+dyStringPrintf(dy, "num_samples int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "data_table varchar(255) not null,\n");
+dyStringPrintf(dy, "probe_table varchar(255) not null,\n");
+dyStringPrintf(dy, "probe_to_gene_table varchar(255) not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+struct datasets *setupDataset(struct sqlConnection *biConn, 
+			      char *tableName, char *tissue, int numSamples)
+{
+if (!sqlTableExists(biConn, "datasets"))
+    {
+    fprintf(stderr, "Tables datasets doesn't exist, creating...");
+    createDatasetsTable(biConn, "datasets");
+    }
+
+struct datasets *da = findDataset(biConn, tableName);
+if (!da)
+    da = createDataset(biConn, tableName, tissue, numSamples);
+
+return da;
+}
+
+
+char *findPatientName(struct sqlConnection *pdConn, char *pTable, 
+		      char *pField, char *sField, char *sName)
+{
+char query[256];
+safef(query, sizeof(query),
+      "select %s from %s where %s = \"%s\"",
+      pField, pTable, sField, sName);
+
+return sqlQuickString(pdConn, query);
+}
+
+int findId(struct sqlConnection *biConn, char *idField, char *sField, char *name)
+{
+if (sqlTableSize(biConn, "samples") == 0)  /* brand new table, return 0 */
+    return 0;
+
+char query[256];
+safef(query, sizeof(query), 
+      "select DISTINCT %s from samples where %s = \"%s\";",
+      idField, sField, name);
+if (sqlExists(biConn, query))  /* sample name found, use same id */
+    return sqlQuickNum(biConn, query);
+
+/* Else, find maximum sample id and add one to it */
+safef(query, sizeof(query),
+      "select max(%s) from samples;", 
+      idField);
+int maxId = sqlQuickNum(biConn, query);
+return maxId + 1;
+}
+
+boolean sampleExists(struct sqlConnection *biConn, struct samples *sa)
+{
+char query[256];
+safef(query, sizeof(query),
+      "select * from samples where id = %d "
+      "and name = \"%s\" "
+      "and patient_id = %d "
+      "and patient_name = \"%s\" "
+      "and dataset_id = %d "
+      "and exp_id = %d "
+      "and tissue_id = %d ",
+      sa->id, sa->name, sa->patient_id, sa->patient_name, sa->dataset_id,
+      sa->exp_id, sa->tissue_id);
+
+return sqlExists(biConn, query);
+}
+
+void createSamples(struct sqlConnection *biConn, struct datasets *da, struct maGrouping *allA)
+{
+int datasetId = da->id;
+int tissueId = da->tissue_id;
+
+struct hash *settings = getSettings(da->data_table);
+
+struct hashEl *el = hashLookup(settings, "patDb");
+if (!el)
+    errAbort("No patDb!");
+char *patDb = cloneString(el->val);
+
+el = hashLookup(settings, "patTable");
+if (!el)
+    errAbort("No patTable");
+char *patTable = cloneString(el->val);
+
+el = hashLookup(settings, "patField");
+if (!el)
+    errAbort("No patField");
+char *patField = cloneString(el->val);
+
+el = hashLookup(settings, "sampleField");
+if (!el)
+    errAbort("No sampleField");
+char *sampleField = cloneString(el->val);
+
+struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb);
+
+int i;
+struct samples *sa;
+for (i = 0; i < allA->size; i++)
+    {
+    char *sampleName = cloneString(allA->names[i]);
+    int expId = allA->expIds[i];
+
+    int sampleId = findId(biConn, "id", "name", sampleName);
+    char *patientName = findPatientName(pdConn, patTable, patField, sampleField, sampleName); 
+    int patientId = findId(biConn, "patient_id", "patient_name", patientName);
+
+    AllocVar(sa);
+    sa->id = sampleId;
+    sa->name = sampleName;
+    sa->patient_id = patientId;
+    sa->patient_name = patientName;
+    sa->dataset_id = datasetId;
+    sa->exp_id = expId;
+    sa->tissue_id = tissueId;
+
+    if (!sampleExists(biConn, sa))
+	samplesSaveToDb(biConn, sa, "samples", 100);
+
+    samplesFree(&sa);
+    }
+
+hFreeConn(&pdConn);
+}
+
+void createSamplesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "patient_id int unsigned not null,\n");
+dyStringPrintf(dy, "patient_name varchar(255) not null,\n");
+dyStringPrintf(dy, "dataset_id int unsigned not null,\n");
+dyStringPrintf(dy, "exp_id int unsigned not null,\n");
+dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
+dyStringPrintf(dy, "KEY(id),\n");
+dyStringPrintf(dy, "KEY(dataset_id),\n");
+dyStringPrintf(dy, "KEY(id, dataset_id),\n");
+dyStringPrintf(dy, "KEY(dataset_id,id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+struct samples *getSamples(struct sqlConnection *biConn, struct datasets *da)
+{
+char query[256];
+safef(query, sizeof(query),
+      "select * from samples where dataset_id = %d order by exp_id;",
+      da->id);
+
+return samplesLoadByQuery(biConn, query);
+}
+
+
+struct samples *setupSamples(struct sqlConnection *biConn, struct datasets *da, 
+			     struct maGrouping *allA)
+{
+if (!sqlTableExists(biConn, "samples"))
+    {
+    fprintf(stderr, "Table samples doesn't exist, creating...\n");
+    createSamplesTable(biConn, "samples");
+    }
+
+createSamples(biConn, da, allA);
+struct samples *saList = getSamples(biConn, da);
+if (slCount(saList) != allA->size)
+    errAbort("Sample count from microarrayGroups and database don't match!");
+
+return saList;
+}
+
+
+int getFeatureId(struct sqlConnection *biConn, char *name)
+{
+if (sqlTableSize(biConn, "features") == 0)  /* brand new table, return 0 */
+    return 0;
+
+char query[256];
+safef(query, sizeof(query), 
+      "select id from features where name = \"%s\";",
+      name);
+
+if (sqlExists(biConn, query))  /* sample name found, use same id */
+    return sqlQuickNum(biConn, query);
+else
+    return sqlTableSize(biConn, "features");
+}
+
+struct features *getFeature(struct sqlConnection *biConn, char *name)
+{
+char query[256];
+safef(query, sizeof(query),
+      "select * from features where name = \"%s\";",
+      name);
+
+return featuresLoadByQuery(biConn, query);
+}
+
+boolean featureExists(struct sqlConnection *biConn, struct features *fs)
+{
+char query[256];
+safef(query, sizeof(query),
+      "select * from features where name = \"%s\";",
+      fs->name);
+
+return sqlExists(biConn, query);
+}
+
+boolean clinicalDataExists(struct sqlConnection *biConn, struct clinicalData *cd)
+{
+char query[256];
+safef(query, sizeof(query),
+      "select * from clinicalData where sample_id = %d "
+      "and feature_id = %d; ",
+      cd->sample_id, cd->feature_id);
+
+if (!sqlExists(biConn, query))  /* entry doesn't exist, report */
+    return FALSE;
+
+/* Make sure entry has same values, if not there is a problem 
+ * (sample_id, feature_id) should be unique */
+
+struct clinicalData *cd2 = clinicalDataLoadByQuery(biConn, query);
+if (slCount(cd2) != 1)
+    errAbort("clinicalData entries not unique, sample_id = %d, feature_id = %d",
+	     cd->sample_id, cd->feature_id);
+
+if (cd->val != cd2->val)
+    errAbort("clinicalData values don't match, sample_id = %d, feature_id = %d, "
+	     "%f != %f",
+	     cd->sample_id, cd->feature_id, cd->val, cd2->val);
+
+if (cd->code && cd2->code)
+    if (!sameString(cd->code, cd2->code))	
+	errAbort("clinicalData codes don't match, sample_id = %d, feature_id = %d",
+		 cd->sample_id, cd->feature_id);
+
+return TRUE;
+}
+
+void createFeaturesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "shortLabel varchar(255) not null,\n");
+dyStringPrintf(dy, "longLabel varchar(255) not null,\n");
+dyStringPrintf(dy, "KEY(id),\n");
+dyStringPrintf(dy, "KEY(name),\n");
+dyStringPrintf(dy, "KEY(id,name)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+void createClinicalDataTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "sample_id int unsigned not null,\n");
+dyStringPrintf(dy, "feature_id int unsigned not null,\n");
+dyStringPrintf(dy, "val double not null,\n");
+dyStringPrintf(dy, "code varchar(255),\n");
+dyStringPrintf(dy, "KEY(sample_id),\n");
+dyStringPrintf(dy, "KEY(feature_id),\n");
+dyStringPrintf(dy, "KEY(sample_id,feature_id),\n");
+dyStringPrintf(dy, "KEY(feature_id,sample_id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+void setupClinicalInfo(struct sqlConnection *biConn, struct datasets *da, struct samples *saList)
+{
+if (!saList)
+    return;
+
+if (!sqlTableExists(biConn, "features"))
+    {
+    fprintf(stderr, "Table features doesn't exist, creating...\n");
+    createFeaturesTable(biConn, "features");
+    }
+
+if (!sqlTableExists(biConn, "clinicalData"))
+    {
+    fprintf(stderr, "Table clinicalData doesn't exist, creating...\n");
+    createClinicalDataTable(biConn, "clinicalData");
+    }
+
+struct hash *settings = getSettings(da->data_table);
+
+struct hashEl *el = hashLookup(settings, "raFile");
+if (!el)
+    errAbort("No raFile");
+char *raFile = cloneString(el->val);
+
+el = hashLookup(settings, "patDb");
+if (!el)
+    errAbort("No patDb");
+char *patDb = cloneString(el->val);
+
+el = hashLookup(settings, "patTable");
+if (!el)
+    errAbort("No patTable");
+char *patTable = cloneString(el->val);
+
+el = hashLookup(settings, "patField");
+if (!el)
+    errAbort("No patField");
+char *patField = cloneString(el->val);
+
+el = hashLookup(settings, "sampleField");
+if (!el)
+    errAbort("No sampleField");
+char *sampleField = cloneString(el->val);
+
+if (!raFile || !patDb || !patTable || !patField || !sampleField)
+    errAbort("Incomplete ra entry for %s.", da->data_table);
+
+struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb); //connection to patient data
+
+if (DEBUG)
+    fprintf(stderr, "Getting columns of clinical data...\n");
+struct column *col, *colList = getColumns(pdConn, raFile, patDb);
+
+/* Set up features */
+struct features *fs;
+for (col = colList; col; col = col->next)
+    {
+    char *name = col->name;
+    char *shortLabel = col->shortLabel;
+    char *longLabel = col->longLabel;
+
+    int id = getFeatureId(biConn, name);
+    AllocVar(fs);
+    fs->id = id;
+    fs->name = cloneString(name);
+    fs->shortLabel = cloneString(shortLabel);
+    fs->longLabel= cloneString(longLabel);
+
+    if (!featureExists(biConn, fs))
+	featuresSaveToDbEscaped(biConn, fs, "features", 100);
+    featuresFree(&fs);
+
+    fs = getFeature(biConn, name);
+    if (!fs)
+	errAbort("Could not find feature %s.", name);
+    
+    if (slCount(fs) != 1)
+	errAbort("Could not find unique feature by name = %s.", name);
+
+    /* Loop through all samples, putting data in database */
+    struct samples *sa; 
+    struct clinicalData *cd;
+    for (sa = saList; sa; sa = sa->next)
+	{
+	struct slName *id = slNameNew(getId(pdConn, patTable, patField, sa->name, sampleField));
+
+	char *cellVal = col->cellVal(col, id, pdConn);
+	if (!cellVal)
+	    continue; 
+
+	AllocVar(cd);
+	cd->sample_id = sa->id;
+	cd->feature_id = fs->id;
+
+	cd->val = atof(cellVal);
+	cd->code = NULL;
+	if (col->cellCoded(col, pdConn))
+	    cd->code = cloneString(col->cellCodedVal(col, id, pdConn));
+	
+	if (!clinicalDataExists(biConn, cd))
+	    clinicalDataSaveToDb(biConn, cd, "clinicalData", 100);
+
+	clinicalDataFree(&cd);
+	slNameFree(&id);
+	}
+    featuresFree(&fs);
+    }
+
+hFreeConn(&pdConn);
+}
+
+void createProbeValsTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "probe_id int unsigned not null,\n");
+dyStringPrintf(dy, "sample_count int unsigned not null,\n");
+dyStringPrintf(dy, "sample_data longblob not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(probe_id)\n"); 
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+void createProbeInfoTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "chrom varchar(255) not null,\n");
+dyStringPrintf(dy, "start int unsigned not null,\n");
+dyStringPrintf(dy, "stop int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id),\n");
+dyStringPrintf(dy, "KEY(name),\n");
+dyStringPrintf(dy, "KEY(id,name)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+void addProbeValsToDb(struct sqlConnection *biConn, char *tableName, 
+		      int probe_id, int sample_count, char *dataString, int updateSize)
+{ /* This bypasses the need to convert to float array to enter in database, may not
+   * be faster, but by converting first to float array any "blank" datapoints are 
+   * converted to 0.0, instead of leaving blank */
+struct dyString *update = newDyString(updateSize);
+dyStringPrintf(update, "insert into %s values ( %u,%u,'%s')",
+	       tableName, probe_id,  sample_count,  dataString );
+sqlUpdate(biConn, update->string);
+freeDyString(&update);
+}
+
+
+void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn, 
+		    struct datasets *da)
+{
+char *dataTable = da->data_table;
+char *probeTable = da->probe_table;
+
+if (!dataTable || !probeTable)
+    errAbort("datasets entry not complete, data_table or probe_table not set.");
+
+boolean inputProbeVals = FALSE;
+boolean inputProbeInfo = FALSE;
+
+if (sqlTableExists(biConn, dataTable) && dropTable)
+    {
+    fprintf(stderr, "probeVals table %s already exists in db, dropping...\n", dataTable);
+    sqlDropTable(biConn, dataTable);
+    }
+
+if (!sqlTableExists(biConn, dataTable))
+    {
+    fprintf(stderr, "Creating probeVals table %s...\n", dataTable);
+    createProbeValsTable(biConn, dataTable);
+    inputProbeVals = TRUE;  // empty table, input
+    }
+
+if (sqlTableExists(biConn, probeTable) && dropTable)
+    {
+    fprintf(stderr, "probeInfo table %s already exists in db, dropping...\n", probeTable);
+    sqlDropTable(biConn, probeTable);
+    }
+
+if (!sqlTableExists(biConn, probeTable))
+    {
+    fprintf(stderr, "Creating probeInfo table %s...\n", probeTable);
+    createProbeInfoTable(biConn, probeTable);
+    inputProbeInfo = TRUE;   // empty table, input
+    }
+
+char query[256];
+safef(query, sizeof(query), "select * from %s;", dataTable);
+
+/* Get bed15 data from hg18 database */
+int id = 0;
+struct sqlResult *sr = sqlGetResult(hgConn, query);
+
+char **row = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *chrom = row[1];
+    unsigned chromStart = sqlUnsigned(row[2]);
+    unsigned chromEnd = sqlUnsigned(row[3]);
+    char *name = row[4];
+    unsigned expCount = sqlUnsigned(row[13]);
+    char *expScores = row[15];
+
+    /* Make probeInfo entry and load into db*/
+    struct probeInfo *pi;
+    AllocVar(pi);
+    pi->id = id;
+    pi->chrom = cloneString(chrom);
+    pi->start = chromStart;
+    pi->stop  = chromEnd;
+    pi->name  = cloneString(name);
+    
+    if (inputProbeInfo)
+	probeInfoSaveToDb(biConn, pi, probeTable, 100);
+
+    /* Make probeVals entry and load into db, straight-up copying longblob*/
+    if (inputProbeVals)
+	addProbeValsToDb(biConn, dataTable, id, expCount, expScores, 500);
+
+    id++;
+    probeInfoFree(&pi);
+    }
+}
+
+void createGeneLookupTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "kgId varchar(255) not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id),\n");
+dyStringPrintf(dy, "KEY(kgId),\n");
+dyStringPrintf(dy, "KEY(id,kgId),\n");
+dyStringPrintf(dy, "KEY(kgId,id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+void createGeneLookup(struct sqlConnection *biConn)
+{
+if (!sqlTableExists(biConn, "geneLookup"))
+    {
+    fprintf(stderr, "geneLookup table doesn't exist in bioInt database, recreating it.\n");
+    createGeneLookupTable(biConn, "geneLookup");
+    }
+
+if (sqlTableSize(biConn, "geneLookup") > 0)
+    {
+    fprintf(stderr, "geneLookup table already has data in it, doing nothing.\n");
+    return;
+    }
+
+if (!sqlTableExists(biConn, "knownGene"))
+    errAbort("Need knownGene table in bioInt database.");
+
+char query[256];
+safef(query, sizeof(query), "select name from knownGene;");
+
+struct slName *sl, *slList = sqlQuickList(biConn, query);
+
+int id = 0;
+struct geneLookup *gl;
+for (sl = slList; sl; sl = sl->next)
+    {
+    AllocVar(gl);
+    gl->id = id;
+    gl->kgId = cloneString(sl->name);
+    
+    id++;
+    geneLookupSaveToDb(biConn, gl, "geneLookup", 100);
+    geneLookupFree(&gl);
+    }
+
+slNameFreeList(&slList);
+}
+
+void createProbeToGeneTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "probe_id int unsigned not null,\n");
+dyStringPrintf(dy, "gene_id int unsigned not null,\n");
+dyStringPrintf(dy, "KEY(probe_id),\n");
+dyStringPrintf(dy, "KEY(gene_id),\n");
+dyStringPrintf(dy, "KEY(probe_id,gene_id),\n");
+dyStringPrintf(dy, "KEY(gene_id,probe_id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
+{
+char query[128];
+safef(query, sizeof(query),
+      "select id from %s where name = \"%s\"",
+      tableName, name);
+
+if (!sqlExists(biConn, query))
+    return -1;
+
+return sqlQuickNum(biConn, query);
+}
+
+struct slInt *getGeneIdsBySymbol(struct sqlConnection *biConn, 
+				 struct slName *slList)
+{
+if (!slList)
+    return NULL;
+
+struct slName *sl;
+struct dyString *dy = newDyString(100);
+dyStringPrintf(dy, "select id from geneLookup "
+	       "join kgXref on geneLookup.kgId = kgXref.kgId "
+	       "where kgXref.geneSymbol in (");
+for (sl = slList; sl; sl = sl->next)
+    {
+    dyStringPrintf(dy, "\"%s\"", sl->name);
+    if (sl->next)
+	dyStringPrintf(dy, ",");
+    }
+dyStringPrintf(dy, ");");
+char *query = dyStringCannibalize(&dy);
+
+return sqlQuickNumList(biConn, query);
+}
+
+
+void setupProbeToGene(struct sqlConnection *hgConn, 
+		      struct sqlConnection *biConn, struct datasets *da)
+{
+char *p2gTable = da->probe_to_gene_table;
+if (!p2gTable)
+    {
+    fprintf(stderr, "probeToGene table not set, doing nothing.\n");
+    return;
+    }
+
+struct hash *settings = getSettings(da->data_table);
+struct hashEl *el = hashLookup(settings, "aliasTable");
+if (!el)
+    errAbort("No aliasTable.\n");
+char *aliasTable = cloneString(el->val);
+
+if (!sqlTableExists(hgConn, aliasTable))
+    errAbort("Table %s not found in hg18 database.\n", aliasTable);
+
+if (!sqlTableExists(biConn, "kgXref"))
+    errAbort("kgXref table not found in database. Cannot create probeToGene table.\n");
+
+if (sqlTableExists(biConn, p2gTable) && dropTable)
+    {
+    fprintf(stderr, "Table %s already exists, dropping and recreating.\n", p2gTable);
+    sqlDropTable(biConn, p2gTable);
+    }
+
+boolean inputProbeToGene = FALSE;
+if (!sqlTableExists(biConn, p2gTable))
+    {
+    fprintf(stderr, "Creating probeToGene table...\n");
+    createProbeToGeneTable(biConn, p2gTable);
+    inputProbeToGene = TRUE;
+    }
+
+if (!inputProbeToGene)
+    return;
+
+struct geneAlias *ga, *gaList = getAliases(hgConn, aliasTable);
+for (ga = gaList; ga; ga = ga->next)
+    {
+    int probeId = getProbeId(biConn, da->probe_table, ga->probe);
+    if (probeId < 0)  // probe in alias table doesn't exist in dataset
+	continue;
+
+    struct slInt *si, *geneIds = getGeneIdsBySymbol(biConn, ga->genes);
+
+    struct probeToGene *pg;
+    AllocVar(pg);
+    pg->probe_id = probeId;
+    for (si = geneIds; si; si = si->next)
+	{
+	pg->gene_id = si->val;
+	probeToGeneSaveToDb(biConn, pg, p2gTable, 10);
+	}
+    probeToGeneFree(&pg);
+    }
+}
+
+void createPathwaysTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "source varchar(255) not null,\n");
+dyStringPrintf(dy, "KEY(id),\n");
+dyStringPrintf(dy, "KEY(name),\n");
+dyStringPrintf(dy, "KEY(id,name),\n");
+dyStringPrintf(dy, "KEY(name,id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+void createPathwayGenesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "gene_id int unsigned not null,\n");
+dyStringPrintf(dy, "KEY(id),\n");
+dyStringPrintf(dy, "KEY(gene_id),\n");
+dyStringPrintf(dy, "KEY(id,gene_id),\n");
+dyStringPrintf(dy, "KEY(gene_id,id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+void createPathwayInfoTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "description longblob not null,\n");
+dyStringPrintf(dy, "KEY(id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}    
+
+char *getPathwayDescription(struct sqlConnection *pdConn, char *name)
+{
+if (!sqlTableExists(pdConn, name))
+    return NULL;
+
+char query[128];
+safef(query, sizeof(query), 
+      "select description from descriptions where name = \"%s\";",
+      name);
+
+return sqlQuickString(pdConn, query);
+}
+
+void setupPathways(struct sqlConnection *biConn)
+{
+boolean inputPathways = FALSE;
+boolean inputPathwayInfo = FALSE;
+boolean inputPathwayGenes = FALSE;
+
+struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
+if (!pdConn)
+    errAbort("Could not connect to pathways database.\n");
+
+if (sqlTableExists(biConn, "pathways") && dropTable)
+    {
+    fprintf(stderr, "pathways table already exists, dropping and recreating.\n");
+    sqlDropTable(biConn, "pathways");
+    }
+if (!sqlTableExists(biConn, "pathways"))
+    {
+    fprintf(stderr, "Creating pathways table.\n");
+    createPathwaysTable(biConn, "pathways");
+    inputPathways = TRUE;
+    }
+
+if (sqlTableExists(biConn, "pathwayGenes") && dropTable)
+    {
+    fprintf(stderr, "pathwayGenes table already exists, dropping and recreating.\n");
+    sqlDropTable(biConn, "pathwayGenes");
+    }
+
+if (!sqlTableExists(biConn, "pathwayGenes"))
+    {
+    fprintf(stderr, "Creating pathwayGenes table.\n");
+    createPathwayGenesTable(biConn, "pathwayGenes");
+    inputPathwayGenes = TRUE;
+    }
+
+if (sqlTableExists(biConn, "pathwayInfo") && dropTable)
+    {
+    fprintf(stderr, "pathwayInfo table already exists, dropping and recreating.\n");
+    sqlDropTable(biConn, "pathwayInfo");
+    }
+
+if (!sqlTableExists(biConn, "pathwayInfo"))
+    {
+    fprintf(stderr, "Creeting pathwayInfo table.\n");
+    createPathwayInfoTable(biConn, "pathwayInfo");
+    inputPathwayInfo = TRUE;
+    }
+
+if (!inputPathways && !inputPathwayInfo && !inputPathwayGenes)
+    {
+    fprintf(stderr, "Nothing to do for pathway tables.\n");
+    return;
+    }
+
+/* Setting up pathways table */
+char query[128];
+safef(query, sizeof(query), "select * from genesets;");
+
+struct sqlResult *sr = sqlGetResult(pdConn, query);
+char **row = NULL;
+
+/* Save all data in lists to avoid "out of sync" error when attempting
+ * query inside of a running query on same db */
+struct slName *na, *names = NULL;
+struct slName *ge, *genes = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    slNameAddHead(&names, row[0]);
+    slNameAddHead(&genes, row[1]);
+    }
+slReverse(&names);
+slReverse(&genes);
+sqlFreeResult(&sr);
+
+int id = 0;
+for (na = names, ge = genes; na && ge; na = na->next, ge = ge->next)
+    {
+    char *name = na->name;
+    char *genes = ge->name;
+
+    struct slName *slList = slNameListFromComma(genes);
+    struct slInt *si, *siList = getGeneIdsBySymbol(biConn, slList); 
+    
+    struct pathways *ps;
+
+    if (inputPathways)
+	{
+	AllocVar(ps);
+	ps->id = id;
+	ps->name = cloneString(name);
+	ps->source = cloneString("N/A");
+	pathwaysSaveToDb(biConn, ps, "pathways", 100);
+	pathwaysFree(&ps);
+	}
+
+    if (inputPathwayGenes)
+	{
+	struct pathwayGenes *pg;
+	AllocVar(pg);
+	pg->id = id;
+	for (si = siList; si; si = si->next)
+	    {
+	    pg->gene_id = si->val;
+	    pathwayGenesSaveToDb(biConn, pg, "pathwayGenes", 100);
+	    }
+	pathwayGenesFree(&pg);
+	}
+    
+    if (inputPathwayInfo)
+	{
+	char *desc = getPathwayDescription(pdConn, name);
+	if (desc)
+	    {
+	    struct pathwayInfo *pi;
+	    AllocVar(pi);
+	    pi->id = id;
+	    pi->description = desc;
+	    pathwayInfoSaveToDbEscaped(biConn, pi, "pathwayInfo", 200);
+	    pathwayInfoFree(&pi);
+	    }
+	}
+    id++;
+    }
+
+hFreeConn(&pdConn);
+}
+
+
+void populateDb(char *db, char *tableName, char *tissue)
+{
+tissue = strLower(tissue);
+struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
+struct sqlConnection *hgConn = hAllocConnProfile("localDb", hgDb);
+
+/* Create geneLookup table (if necessary) */
+uglyTime(NULL);
+fprintf(stderr, "Setting up geneLookup table...\n");
+createGeneLookup(biConn);
+uglyTime("Time");
+
+/* Set up pathways */
+uglyTime(NULL);
+fprintf(stderr, "Setting up pathways tables...\n");
+setupPathways(biConn);
+uglyTime("Time");
+
+/* Set up datasets entry */
+struct maGrouping *allA = getMaGrouping(hgConn, tableName);
+if (!allA)
+    errAbort("Could not find maGrouping for %s!", tableName);
+
+uglyTime(NULL);
+int numSamples = allA->size;
+fprintf(stderr, "Adding datasets entry...\n");
+struct datasets *da = setupDataset(biConn, tableName, tissue, numSamples);
+uglyTime("Time");
+
+/* Set up samples entries */
+uglyTime(NULL);
+fprintf(stderr, "Adding to samples table...\n");
+struct samples *saList = setupSamples(biConn, da, allA);
+uglyTime("Time");
+
+/* Set up features and clinicalData */
+uglyTime(NULL);
+fprintf(stderr, "Setting up clinical data tables...\n");
+setupClinicalInfo(biConn, da, saList);
+uglyTime("Time");
+
+/* Set up probeInfo table (if necessary) and probeVals table */
+uglyTime(NULL);
+fprintf(stderr, "Setting up probe data tables (be patient!)...\n");
+setupProbeData(hgConn, biConn, da);
+uglyTime("Time");
+
+/* Set up probeToGene table (if necessary) */
+uglyTime(NULL);
+fprintf(stderr, "Setting up probeToGene table...\n");
+setupProbeToGene(hgConn, biConn, da);
+uglyTime("Time");
+
+fprintf(stderr, "Done!");
+
+hFreeConn(&biConn);
+hFreeConn(&hgConn);
+}
+
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 4)
+    usage();
+
+dropTable = FALSE;
+if (optionExists("dropAll"))
+    dropTable = TRUE;
+
+populateDb(argv[1], argv[2], argv[3]);
+return 0;
+}