src/hg/instinct/bioInt2/populateDb.c 1.1
1.1 2009/03/20 06:06:32 jsanborn
initial commit
Index: src/hg/instinct/bioInt2/populateDb.c
===================================================================
RCS file: src/hg/instinct/bioInt2/populateDb.c
diff -N src/hg/instinct/bioInt2/populateDb.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/instinct/bioInt2/populateDb.c 20 Mar 2009 06:06:32 -0000 1.1
@@ -0,0 +1,1274 @@
+/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "bed.h"
+#include "genePred.h"
+#include "hPrint.h"
+#include "hdb.h"
+#include "microarray.h"
+#include "ra.h"
+#include "featuresLib.h"
+#include "hgHeatmapLib.h"
+#include "bioIntDb.h"
+
+#define DEBUG 1
+
+char *hgDb = "hg18";
+char *genome = "Human";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+ "populateDb \n"
+ " populateDb [OPTIONS] db table tissue\n"
+ "options:\n"
+ " -dropAll Drop/recreate any table\n"
+ "\n"
+ );
+}
+
+boolean dropTable = FALSE; // If true, any table that should be dropped/recreated will be
+
+static struct optionSpec options[] = {
+ {"dropAll", OPTION_BOOLEAN},
+ {NULL, 0},
+};
+
+char *getId(struct sqlConnection *conn, char *table, char *key, char *sample, char *value)
+/* get ISPY ID from sample (or experiment) Id */
+{
+char query[512];
+safef(query, sizeof(query), "select %s from %s where %s = \"%s\" ", key, table, value, sample);
+return sqlQuickString(conn, query);
+}
+
+
+struct slName *getProbesFromTable(struct sqlConnection *hgConn, char *tableName)
+{
+char query[512];
+char *key = "name";
+safef(query, sizeof(query), "select DISTINCT %s from %s ", key, tableName);
+struct sqlResult *sr = sqlGetResult(hgConn, query);
+char **row = NULL;
+
+struct slName *sl, *slList = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+ {
+ sl = slNameNew(row[0]);
+ slAddHead(&slList, sl);
+ }
+
+slReverse(&slList);
+sqlFreeResult(&sr);
+return slList;
+}
+
+
+struct maGrouping *getMaGrouping(struct sqlConnection *hgConn, char *tableName)
+{
+/*microarray specific settings*/
+struct trackDb *tdb = hMaybeTrackInfo(hgConn, tableName);
+struct microarrayGroups *maGs = maGroupings("hg18", tableName);
+trackDbFreeList(&tdb);
+if (!maGs)
+ return NULL;
+return maGs->allArrays;
+}
+
+struct hash *getSettings(char *tableName)
+{
+struct column *raList = getColumns(NULL, "datasets.ra", NULL);
+
+struct column *col;
+struct hash *settings = NULL;
+for (col = raList; col; col = col->next)
+ {
+ if (!sameString(col->name, tableName))
+ continue;
+
+ settings = col->settings;
+ break;
+ }
+
+if (!settings)
+ errAbort("Couldn't find datasets.ra listing for %s", tableName);
+
+return settings;
+}
+
+struct geneAlias {
+ struct geneAlias *next;
+
+ char *probe;
+ struct slName *genes;
+};
+
+
+struct geneAlias *getAliases(struct sqlConnection *hgConn, char *tableName)
+{
+if (!hgConn || !tableName)
+ return NULL;
+
+char query[512];
+char **row;
+
+safef(query, sizeof(query), "select * from %s", tableName);
+
+struct sqlResult *sr = sqlGetResult(hgConn, query);
+
+struct geneAlias *ga, *gaList = NULL;
+struct hash *gaHash = hashNew(0);
+while ((row = sqlNextRow(sr)) != NULL)
+ {
+ char *probe = cloneString(row[0]);
+ char *gene = cloneString(row[1]);
+
+ struct hashEl *el = hashLookup(gaHash, probe);
+
+ if (!el)
+ {
+ ga = AllocA(struct geneAlias);
+ ga->probe = cloneString(probe);
+ ga->genes = NULL;
+
+ slAddHead(&gaList, ga);
+ hashAdd(gaHash, probe, ga);
+ }
+ else
+ ga = el->val;
+
+ slNameAddHead(&ga->genes, gene);
+ }
+
+hashFree(&gaHash);
+sqlFreeResult(&sr);
+
+return gaList;
+}
+
+struct dataTypes *findDataType(struct sqlConnection *biConn, char *type, char *platform)
+{
+if (!sameString(type, "bed 15"))
+ errAbort("populateDb only runs on bed 15 files.");
+
+char *data_format = "probeVals";
+
+char query[256];
+safef(query, sizeof(query),
+ "select * from dataTypes where format = \"%s\" and name = \"%s\"",
+ data_format, platform);
+
+return dataTypesLoadByQuery(biConn, query);
+}
+
+struct dataTypes *createDataType(struct sqlConnection *biConn, char *type, char *platform)
+{
+int nextId = sqlTableSize(biConn, "dataTypes");
+struct dataTypes *dt;
+AllocVar(dt);
+dt->id = nextId;
+dt->format = cloneString("probeVals");
+dt->name = cloneString(platform);
+
+/* Save to db */
+dataTypesSaveToDb(biConn, dt, "dataTypes", 100);
+return dt;
+}
+
+void createDataTypesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "format varchar(255) not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+struct dataTypes *setupDataType(struct sqlConnection *biConn,
+ char *type, char *platform)
+{
+if (!sqlTableExists(biConn, "dataTypes"))
+ {
+ fprintf(stderr, "Tables dataTypes doesn't exist, creating...\n");
+ createDataTypesTable(biConn, "dataTypes");
+ }
+
+struct dataTypes *dt = findDataType(biConn, type, platform);
+if (!dt)
+ dt = createDataType(biConn, type, platform);
+
+return dt;
+}
+
+
+struct tissues *findTissue(struct sqlConnection *biConn, char *tissue)
+{
+char query[256];
+safef(query, sizeof(query),
+ "select * from tissues where name = \"%s\";",
+ tissue);
+return tissuesLoadByQuery(biConn, query);
+}
+
+struct tissues *createTissue(struct sqlConnection *biConn, char *tissue)
+{
+int nextId = sqlTableSize(biConn, "tissues");
+struct tissues *ti;
+AllocVar(ti);
+ti->id = nextId;
+ti->name = cloneString(tissue);
+
+/* Save to db */
+tissuesSaveToDb(biConn, ti, "tissues", 100);
+return ti;
+}
+
+void createTissuesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+struct tissues *setupTissue(struct sqlConnection *biConn, char *tissue)
+{
+if (!sqlTableExists(biConn, "tissues"))
+ {
+ fprintf(stderr, "Tables tissues doesn't exist, creating...\n");
+ createTissuesTable(biConn, "tissues");
+ }
+
+struct tissues *ti = findTissue(biConn, tissue);
+if (!ti)
+ ti = createTissue(biConn, tissue);
+
+return ti;
+}
+
+
+struct datasets *findDataset(struct sqlConnection *biConn, char *name)
+{
+char query[256];
+safef(query, sizeof(query),
+ "select * from datasets where data_table = \"%s\";",
+ name);
+return datasetsLoadByQuery(biConn, query);
+}
+
+
+struct datasets *createDataset(struct sqlConnection *biConn,
+ char *tableName, char *tissue, int numSamples)
+{
+struct hash *settings = getSettings(tableName);
+
+struct hashEl *el = hashLookup(settings, "shortLabel");
+if (!el)
+ errAbort("No shortLabel");
+char *shortLabel = cloneString(el->val);
+
+el = hashLookup(settings, "name");
+if (!el)
+ errAbort("No name");
+char *dataTable = cloneString(el->val);
+
+char probeTable[256];
+safef(probeTable, sizeof(probeTable), "%s_probeInfo", dataTable);
+
+char p2gTable[256];
+safef(p2gTable, sizeof(p2gTable), "%s_probeToGene", dataTable);
+
+char *platform;
+el = hashLookup(settings, "platform");
+if (!el)
+ platform = cloneString("Expression");
+else
+ platform = cloneString(el->val);
+
+el = hashLookup(settings, "dataType");
+if (!el)
+ errAbort("No dataType");
+char *dataType = cloneString(el->val);
+
+struct dataTypes *dt = setupDataType(biConn, dataType, platform);
+
+struct tissues *ti = setupTissue(biConn, tissue);
+
+int nextId = sqlTableSize(biConn, "datasets");
+
+struct datasets *da;
+AllocVar(da);
+da->id = nextId;
+da->tissue_id = ti->id;
+da->type_id = dt->id;
+da->num_samples = numSamples;
+da->name = shortLabel;
+da->data_table = dataTable;
+da->probe_table = cloneString(probeTable);
+da->probe_to_gene_table = cloneString(p2gTable);
+
+dataTypesFree(&dt);
+tissuesFree(&ti);
+
+/* Write datasets */
+datasetsSaveToDbEscaped(biConn, da, "datasets", 100);
+
+return da;
+}
+
+void createDatasetsTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
+dyStringPrintf(dy, "type_id int unsigned not null,\n");
+dyStringPrintf(dy, "num_samples int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "data_table varchar(255) not null,\n");
+dyStringPrintf(dy, "probe_table varchar(255) not null,\n");
+dyStringPrintf(dy, "probe_to_gene_table varchar(255) not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+struct datasets *setupDataset(struct sqlConnection *biConn,
+ char *tableName, char *tissue, int numSamples)
+{
+if (!sqlTableExists(biConn, "datasets"))
+ {
+ fprintf(stderr, "Tables datasets doesn't exist, creating...");
+ createDatasetsTable(biConn, "datasets");
+ }
+
+struct datasets *da = findDataset(biConn, tableName);
+if (!da)
+ da = createDataset(biConn, tableName, tissue, numSamples);
+
+return da;
+}
+
+
+char *findPatientName(struct sqlConnection *pdConn, char *pTable,
+ char *pField, char *sField, char *sName)
+{
+char query[256];
+safef(query, sizeof(query),
+ "select %s from %s where %s = \"%s\"",
+ pField, pTable, sField, sName);
+
+return sqlQuickString(pdConn, query);
+}
+
+int findId(struct sqlConnection *biConn, char *idField, char *sField, char *name)
+{
+if (sqlTableSize(biConn, "samples") == 0) /* brand new table, return 0 */
+ return 0;
+
+char query[256];
+safef(query, sizeof(query),
+ "select DISTINCT %s from samples where %s = \"%s\";",
+ idField, sField, name);
+if (sqlExists(biConn, query)) /* sample name found, use same id */
+ return sqlQuickNum(biConn, query);
+
+/* Else, find maximum sample id and add one to it */
+safef(query, sizeof(query),
+ "select max(%s) from samples;",
+ idField);
+int maxId = sqlQuickNum(biConn, query);
+return maxId + 1;
+}
+
+boolean sampleExists(struct sqlConnection *biConn, struct samples *sa)
+{
+char query[256];
+safef(query, sizeof(query),
+ "select * from samples where id = %d "
+ "and name = \"%s\" "
+ "and patient_id = %d "
+ "and patient_name = \"%s\" "
+ "and dataset_id = %d "
+ "and exp_id = %d "
+ "and tissue_id = %d ",
+ sa->id, sa->name, sa->patient_id, sa->patient_name, sa->dataset_id,
+ sa->exp_id, sa->tissue_id);
+
+return sqlExists(biConn, query);
+}
+
+void createSamples(struct sqlConnection *biConn, struct datasets *da, struct maGrouping *allA)
+{
+int datasetId = da->id;
+int tissueId = da->tissue_id;
+
+struct hash *settings = getSettings(da->data_table);
+
+struct hashEl *el = hashLookup(settings, "patDb");
+if (!el)
+ errAbort("No patDb!");
+char *patDb = cloneString(el->val);
+
+el = hashLookup(settings, "patTable");
+if (!el)
+ errAbort("No patTable");
+char *patTable = cloneString(el->val);
+
+el = hashLookup(settings, "patField");
+if (!el)
+ errAbort("No patField");
+char *patField = cloneString(el->val);
+
+el = hashLookup(settings, "sampleField");
+if (!el)
+ errAbort("No sampleField");
+char *sampleField = cloneString(el->val);
+
+struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb);
+
+int i;
+struct samples *sa;
+for (i = 0; i < allA->size; i++)
+ {
+ char *sampleName = cloneString(allA->names[i]);
+ int expId = allA->expIds[i];
+
+ int sampleId = findId(biConn, "id", "name", sampleName);
+ char *patientName = findPatientName(pdConn, patTable, patField, sampleField, sampleName);
+ int patientId = findId(biConn, "patient_id", "patient_name", patientName);
+
+ AllocVar(sa);
+ sa->id = sampleId;
+ sa->name = sampleName;
+ sa->patient_id = patientId;
+ sa->patient_name = patientName;
+ sa->dataset_id = datasetId;
+ sa->exp_id = expId;
+ sa->tissue_id = tissueId;
+
+ if (!sampleExists(biConn, sa))
+ samplesSaveToDb(biConn, sa, "samples", 100);
+
+ samplesFree(&sa);
+ }
+
+hFreeConn(&pdConn);
+}
+
+void createSamplesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "patient_id int unsigned not null,\n");
+dyStringPrintf(dy, "patient_name varchar(255) not null,\n");
+dyStringPrintf(dy, "dataset_id int unsigned not null,\n");
+dyStringPrintf(dy, "exp_id int unsigned not null,\n");
+dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
+dyStringPrintf(dy, "KEY(id),\n");
+dyStringPrintf(dy, "KEY(dataset_id),\n");
+dyStringPrintf(dy, "KEY(id, dataset_id),\n");
+dyStringPrintf(dy, "KEY(dataset_id,id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+struct samples *getSamples(struct sqlConnection *biConn, struct datasets *da)
+{
+char query[256];
+safef(query, sizeof(query),
+ "select * from samples where dataset_id = %d order by exp_id;",
+ da->id);
+
+return samplesLoadByQuery(biConn, query);
+}
+
+
+struct samples *setupSamples(struct sqlConnection *biConn, struct datasets *da,
+ struct maGrouping *allA)
+{
+if (!sqlTableExists(biConn, "samples"))
+ {
+ fprintf(stderr, "Table samples doesn't exist, creating...\n");
+ createSamplesTable(biConn, "samples");
+ }
+
+createSamples(biConn, da, allA);
+struct samples *saList = getSamples(biConn, da);
+if (slCount(saList) != allA->size)
+ errAbort("Sample count from microarrayGroups and database don't match!");
+
+return saList;
+}
+
+
+int getFeatureId(struct sqlConnection *biConn, char *name)
+{
+if (sqlTableSize(biConn, "features") == 0) /* brand new table, return 0 */
+ return 0;
+
+char query[256];
+safef(query, sizeof(query),
+ "select id from features where name = \"%s\";",
+ name);
+
+if (sqlExists(biConn, query)) /* sample name found, use same id */
+ return sqlQuickNum(biConn, query);
+else
+ return sqlTableSize(biConn, "features");
+}
+
+struct features *getFeature(struct sqlConnection *biConn, char *name)
+{
+char query[256];
+safef(query, sizeof(query),
+ "select * from features where name = \"%s\";",
+ name);
+
+return featuresLoadByQuery(biConn, query);
+}
+
+boolean featureExists(struct sqlConnection *biConn, struct features *fs)
+{
+char query[256];
+safef(query, sizeof(query),
+ "select * from features where name = \"%s\";",
+ fs->name);
+
+return sqlExists(biConn, query);
+}
+
+boolean clinicalDataExists(struct sqlConnection *biConn, struct clinicalData *cd)
+{
+char query[256];
+safef(query, sizeof(query),
+ "select * from clinicalData where sample_id = %d "
+ "and feature_id = %d; ",
+ cd->sample_id, cd->feature_id);
+
+if (!sqlExists(biConn, query)) /* entry doesn't exist, report */
+ return FALSE;
+
+/* Make sure entry has same values, if not there is a problem
+ * (sample_id, feature_id) should be unique */
+
+struct clinicalData *cd2 = clinicalDataLoadByQuery(biConn, query);
+if (slCount(cd2) != 1)
+ errAbort("clinicalData entries not unique, sample_id = %d, feature_id = %d",
+ cd->sample_id, cd->feature_id);
+
+if (cd->val != cd2->val)
+ errAbort("clinicalData values don't match, sample_id = %d, feature_id = %d, "
+ "%f != %f",
+ cd->sample_id, cd->feature_id, cd->val, cd2->val);
+
+if (cd->code && cd2->code)
+ if (!sameString(cd->code, cd2->code))
+ errAbort("clinicalData codes don't match, sample_id = %d, feature_id = %d",
+ cd->sample_id, cd->feature_id);
+
+return TRUE;
+}
+
+void createFeaturesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "shortLabel varchar(255) not null,\n");
+dyStringPrintf(dy, "longLabel varchar(255) not null,\n");
+dyStringPrintf(dy, "KEY(id),\n");
+dyStringPrintf(dy, "KEY(name),\n");
+dyStringPrintf(dy, "KEY(id,name)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+void createClinicalDataTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "sample_id int unsigned not null,\n");
+dyStringPrintf(dy, "feature_id int unsigned not null,\n");
+dyStringPrintf(dy, "val double not null,\n");
+dyStringPrintf(dy, "code varchar(255),\n");
+dyStringPrintf(dy, "KEY(sample_id),\n");
+dyStringPrintf(dy, "KEY(feature_id),\n");
+dyStringPrintf(dy, "KEY(sample_id,feature_id),\n");
+dyStringPrintf(dy, "KEY(feature_id,sample_id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+void setupClinicalInfo(struct sqlConnection *biConn, struct datasets *da, struct samples *saList)
+{
+if (!saList)
+ return;
+
+if (!sqlTableExists(biConn, "features"))
+ {
+ fprintf(stderr, "Table features doesn't exist, creating...\n");
+ createFeaturesTable(biConn, "features");
+ }
+
+if (!sqlTableExists(biConn, "clinicalData"))
+ {
+ fprintf(stderr, "Table clinicalData doesn't exist, creating...\n");
+ createClinicalDataTable(biConn, "clinicalData");
+ }
+
+struct hash *settings = getSettings(da->data_table);
+
+struct hashEl *el = hashLookup(settings, "raFile");
+if (!el)
+ errAbort("No raFile");
+char *raFile = cloneString(el->val);
+
+el = hashLookup(settings, "patDb");
+if (!el)
+ errAbort("No patDb");
+char *patDb = cloneString(el->val);
+
+el = hashLookup(settings, "patTable");
+if (!el)
+ errAbort("No patTable");
+char *patTable = cloneString(el->val);
+
+el = hashLookup(settings, "patField");
+if (!el)
+ errAbort("No patField");
+char *patField = cloneString(el->val);
+
+el = hashLookup(settings, "sampleField");
+if (!el)
+ errAbort("No sampleField");
+char *sampleField = cloneString(el->val);
+
+if (!raFile || !patDb || !patTable || !patField || !sampleField)
+ errAbort("Incomplete ra entry for %s.", da->data_table);
+
+struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb); //connection to patient data
+
+if (DEBUG)
+ fprintf(stderr, "Getting columns of clinical data...\n");
+struct column *col, *colList = getColumns(pdConn, raFile, patDb);
+
+/* Set up features */
+struct features *fs;
+for (col = colList; col; col = col->next)
+ {
+ char *name = col->name;
+ char *shortLabel = col->shortLabel;
+ char *longLabel = col->longLabel;
+
+ int id = getFeatureId(biConn, name);
+ AllocVar(fs);
+ fs->id = id;
+ fs->name = cloneString(name);
+ fs->shortLabel = cloneString(shortLabel);
+ fs->longLabel= cloneString(longLabel);
+
+ if (!featureExists(biConn, fs))
+ featuresSaveToDbEscaped(biConn, fs, "features", 100);
+ featuresFree(&fs);
+
+ fs = getFeature(biConn, name);
+ if (!fs)
+ errAbort("Could not find feature %s.", name);
+
+ if (slCount(fs) != 1)
+ errAbort("Could not find unique feature by name = %s.", name);
+
+ /* Loop through all samples, putting data in database */
+ struct samples *sa;
+ struct clinicalData *cd;
+ for (sa = saList; sa; sa = sa->next)
+ {
+ struct slName *id = slNameNew(getId(pdConn, patTable, patField, sa->name, sampleField));
+
+ char *cellVal = col->cellVal(col, id, pdConn);
+ if (!cellVal)
+ continue;
+
+ AllocVar(cd);
+ cd->sample_id = sa->id;
+ cd->feature_id = fs->id;
+
+ cd->val = atof(cellVal);
+ cd->code = NULL;
+ if (col->cellCoded(col, pdConn))
+ cd->code = cloneString(col->cellCodedVal(col, id, pdConn));
+
+ if (!clinicalDataExists(biConn, cd))
+ clinicalDataSaveToDb(biConn, cd, "clinicalData", 100);
+
+ clinicalDataFree(&cd);
+ slNameFree(&id);
+ }
+ featuresFree(&fs);
+ }
+
+hFreeConn(&pdConn);
+}
+
+void createProbeValsTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "probe_id int unsigned not null,\n");
+dyStringPrintf(dy, "sample_count int unsigned not null,\n");
+dyStringPrintf(dy, "sample_data longblob not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(probe_id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+void createProbeInfoTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "chrom varchar(255) not null,\n");
+dyStringPrintf(dy, "start int unsigned not null,\n");
+dyStringPrintf(dy, "stop int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id),\n");
+dyStringPrintf(dy, "KEY(name),\n");
+dyStringPrintf(dy, "KEY(id,name)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+void addProbeValsToDb(struct sqlConnection *biConn, char *tableName,
+ int probe_id, int sample_count, char *dataString, int updateSize)
+{ /* This bypasses the need to convert to float array to enter in database, may not
+ * be faster, but by converting first to float array any "blank" datapoints are
+ * converted to 0.0, instead of leaving blank */
+struct dyString *update = newDyString(updateSize);
+dyStringPrintf(update, "insert into %s values ( %u,%u,'%s')",
+ tableName, probe_id, sample_count, dataString );
+sqlUpdate(biConn, update->string);
+freeDyString(&update);
+}
+
+
+void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn,
+ struct datasets *da)
+{
+char *dataTable = da->data_table;
+char *probeTable = da->probe_table;
+
+if (!dataTable || !probeTable)
+ errAbort("datasets entry not complete, data_table or probe_table not set.");
+
+boolean inputProbeVals = FALSE;
+boolean inputProbeInfo = FALSE;
+
+if (sqlTableExists(biConn, dataTable) && dropTable)
+ {
+ fprintf(stderr, "probeVals table %s already exists in db, dropping...\n", dataTable);
+ sqlDropTable(biConn, dataTable);
+ }
+
+if (!sqlTableExists(biConn, dataTable))
+ {
+ fprintf(stderr, "Creating probeVals table %s...\n", dataTable);
+ createProbeValsTable(biConn, dataTable);
+ inputProbeVals = TRUE; // empty table, input
+ }
+
+if (sqlTableExists(biConn, probeTable) && dropTable)
+ {
+ fprintf(stderr, "probeInfo table %s already exists in db, dropping...\n", probeTable);
+ sqlDropTable(biConn, probeTable);
+ }
+
+if (!sqlTableExists(biConn, probeTable))
+ {
+ fprintf(stderr, "Creating probeInfo table %s...\n", probeTable);
+ createProbeInfoTable(biConn, probeTable);
+ inputProbeInfo = TRUE; // empty table, input
+ }
+
+char query[256];
+safef(query, sizeof(query), "select * from %s;", dataTable);
+
+/* Get bed15 data from hg18 database */
+int id = 0;
+struct sqlResult *sr = sqlGetResult(hgConn, query);
+
+char **row = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+ {
+ char *chrom = row[1];
+ unsigned chromStart = sqlUnsigned(row[2]);
+ unsigned chromEnd = sqlUnsigned(row[3]);
+ char *name = row[4];
+ unsigned expCount = sqlUnsigned(row[13]);
+ char *expScores = row[15];
+
+ /* Make probeInfo entry and load into db*/
+ struct probeInfo *pi;
+ AllocVar(pi);
+ pi->id = id;
+ pi->chrom = cloneString(chrom);
+ pi->start = chromStart;
+ pi->stop = chromEnd;
+ pi->name = cloneString(name);
+
+ if (inputProbeInfo)
+ probeInfoSaveToDb(biConn, pi, probeTable, 100);
+
+ /* Make probeVals entry and load into db, straight-up copying longblob*/
+ if (inputProbeVals)
+ addProbeValsToDb(biConn, dataTable, id, expCount, expScores, 500);
+
+ id++;
+ probeInfoFree(&pi);
+ }
+}
+
+void createGeneLookupTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "kgId varchar(255) not null,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id),\n");
+dyStringPrintf(dy, "KEY(kgId),\n");
+dyStringPrintf(dy, "KEY(id,kgId),\n");
+dyStringPrintf(dy, "KEY(kgId,id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+void createGeneLookup(struct sqlConnection *biConn)
+{
+if (!sqlTableExists(biConn, "geneLookup"))
+ {
+ fprintf(stderr, "geneLookup table doesn't exist in bioInt database, recreating it.\n");
+ createGeneLookupTable(biConn, "geneLookup");
+ }
+
+if (sqlTableSize(biConn, "geneLookup") > 0)
+ {
+ fprintf(stderr, "geneLookup table already has data in it, doing nothing.\n");
+ return;
+ }
+
+if (!sqlTableExists(biConn, "knownGene"))
+ errAbort("Need knownGene table in bioInt database.");
+
+char query[256];
+safef(query, sizeof(query), "select name from knownGene;");
+
+struct slName *sl, *slList = sqlQuickList(biConn, query);
+
+int id = 0;
+struct geneLookup *gl;
+for (sl = slList; sl; sl = sl->next)
+ {
+ AllocVar(gl);
+ gl->id = id;
+ gl->kgId = cloneString(sl->name);
+
+ id++;
+ geneLookupSaveToDb(biConn, gl, "geneLookup", 100);
+ geneLookupFree(&gl);
+ }
+
+slNameFreeList(&slList);
+}
+
+void createProbeToGeneTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "probe_id int unsigned not null,\n");
+dyStringPrintf(dy, "gene_id int unsigned not null,\n");
+dyStringPrintf(dy, "KEY(probe_id),\n");
+dyStringPrintf(dy, "KEY(gene_id),\n");
+dyStringPrintf(dy, "KEY(probe_id,gene_id),\n");
+dyStringPrintf(dy, "KEY(gene_id,probe_id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
+{
+char query[128];
+safef(query, sizeof(query),
+ "select id from %s where name = \"%s\"",
+ tableName, name);
+
+if (!sqlExists(biConn, query))
+ return -1;
+
+return sqlQuickNum(biConn, query);
+}
+
+struct slInt *getGeneIdsBySymbol(struct sqlConnection *biConn,
+ struct slName *slList)
+{
+if (!slList)
+ return NULL;
+
+struct slName *sl;
+struct dyString *dy = newDyString(100);
+dyStringPrintf(dy, "select id from geneLookup "
+ "join kgXref on geneLookup.kgId = kgXref.kgId "
+ "where kgXref.geneSymbol in (");
+for (sl = slList; sl; sl = sl->next)
+ {
+ dyStringPrintf(dy, "\"%s\"", sl->name);
+ if (sl->next)
+ dyStringPrintf(dy, ",");
+ }
+dyStringPrintf(dy, ");");
+char *query = dyStringCannibalize(&dy);
+
+return sqlQuickNumList(biConn, query);
+}
+
+
+void setupProbeToGene(struct sqlConnection *hgConn,
+ struct sqlConnection *biConn, struct datasets *da)
+{
+char *p2gTable = da->probe_to_gene_table;
+if (!p2gTable)
+ {
+ fprintf(stderr, "probeToGene table not set, doing nothing.\n");
+ return;
+ }
+
+struct hash *settings = getSettings(da->data_table);
+struct hashEl *el = hashLookup(settings, "aliasTable");
+if (!el)
+ errAbort("No aliasTable.\n");
+char *aliasTable = cloneString(el->val);
+
+if (!sqlTableExists(hgConn, aliasTable))
+ errAbort("Table %s not found in hg18 database.\n", aliasTable);
+
+if (!sqlTableExists(biConn, "kgXref"))
+ errAbort("kgXref table not found in database. Cannot create probeToGene table.\n");
+
+if (sqlTableExists(biConn, p2gTable) && dropTable)
+ {
+ fprintf(stderr, "Table %s already exists, dropping and recreating.\n", p2gTable);
+ sqlDropTable(biConn, p2gTable);
+ }
+
+boolean inputProbeToGene = FALSE;
+if (!sqlTableExists(biConn, p2gTable))
+ {
+ fprintf(stderr, "Creating probeToGene table...\n");
+ createProbeToGeneTable(biConn, p2gTable);
+ inputProbeToGene = TRUE;
+ }
+
+if (!inputProbeToGene)
+ return;
+
+struct geneAlias *ga, *gaList = getAliases(hgConn, aliasTable);
+for (ga = gaList; ga; ga = ga->next)
+ {
+ int probeId = getProbeId(biConn, da->probe_table, ga->probe);
+ if (probeId < 0) // probe in alias table doesn't exist in dataset
+ continue;
+
+ struct slInt *si, *geneIds = getGeneIdsBySymbol(biConn, ga->genes);
+
+ struct probeToGene *pg;
+ AllocVar(pg);
+ pg->probe_id = probeId;
+ for (si = geneIds; si; si = si->next)
+ {
+ pg->gene_id = si->val;
+ probeToGeneSaveToDb(biConn, pg, p2gTable, 10);
+ }
+ probeToGeneFree(&pg);
+ }
+}
+
+void createPathwaysTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "name varchar(255) not null,\n");
+dyStringPrintf(dy, "source varchar(255) not null,\n");
+dyStringPrintf(dy, "KEY(id),\n");
+dyStringPrintf(dy, "KEY(name),\n");
+dyStringPrintf(dy, "KEY(id,name),\n");
+dyStringPrintf(dy, "KEY(name,id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+void createPathwayGenesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "gene_id int unsigned not null,\n");
+dyStringPrintf(dy, "KEY(id),\n");
+dyStringPrintf(dy, "KEY(gene_id),\n");
+dyStringPrintf(dy, "KEY(id,gene_id),\n");
+dyStringPrintf(dy, "KEY(gene_id,id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+void createPathwayInfoTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "description longblob not null,\n");
+dyStringPrintf(dy, "KEY(id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+char *getPathwayDescription(struct sqlConnection *pdConn, char *name)
+{
+if (!sqlTableExists(pdConn, name))
+ return NULL;
+
+char query[128];
+safef(query, sizeof(query),
+ "select description from descriptions where name = \"%s\";",
+ name);
+
+return sqlQuickString(pdConn, query);
+}
+
+void setupPathways(struct sqlConnection *biConn)
+{
+boolean inputPathways = FALSE;
+boolean inputPathwayInfo = FALSE;
+boolean inputPathwayGenes = FALSE;
+
+struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
+if (!pdConn)
+ errAbort("Could not connect to pathways database.\n");
+
+if (sqlTableExists(biConn, "pathways") && dropTable)
+ {
+ fprintf(stderr, "pathways table already exists, dropping and recreating.\n");
+ sqlDropTable(biConn, "pathways");
+ }
+if (!sqlTableExists(biConn, "pathways"))
+ {
+ fprintf(stderr, "Creating pathways table.\n");
+ createPathwaysTable(biConn, "pathways");
+ inputPathways = TRUE;
+ }
+
+if (sqlTableExists(biConn, "pathwayGenes") && dropTable)
+ {
+ fprintf(stderr, "pathwayGenes table already exists, dropping and recreating.\n");
+ sqlDropTable(biConn, "pathwayGenes");
+ }
+
+if (!sqlTableExists(biConn, "pathwayGenes"))
+ {
+ fprintf(stderr, "Creating pathwayGenes table.\n");
+ createPathwayGenesTable(biConn, "pathwayGenes");
+ inputPathwayGenes = TRUE;
+ }
+
+if (sqlTableExists(biConn, "pathwayInfo") && dropTable)
+ {
+ fprintf(stderr, "pathwayInfo table already exists, dropping and recreating.\n");
+ sqlDropTable(biConn, "pathwayInfo");
+ }
+
+if (!sqlTableExists(biConn, "pathwayInfo"))
+ {
+ fprintf(stderr, "Creeting pathwayInfo table.\n");
+ createPathwayInfoTable(biConn, "pathwayInfo");
+ inputPathwayInfo = TRUE;
+ }
+
+if (!inputPathways && !inputPathwayInfo && !inputPathwayGenes)
+ {
+ fprintf(stderr, "Nothing to do for pathway tables.\n");
+ return;
+ }
+
+/* Setting up pathways table */
+char query[128];
+safef(query, sizeof(query), "select * from genesets;");
+
+struct sqlResult *sr = sqlGetResult(pdConn, query);
+char **row = NULL;
+
+/* Save all data in lists to avoid "out of sync" error when attempting
+ * query inside of a running query on same db */
+struct slName *na, *names = NULL;
+struct slName *ge, *genes = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+ {
+ slNameAddHead(&names, row[0]);
+ slNameAddHead(&genes, row[1]);
+ }
+slReverse(&names);
+slReverse(&genes);
+sqlFreeResult(&sr);
+
+int id = 0;
+for (na = names, ge = genes; na && ge; na = na->next, ge = ge->next)
+ {
+ char *name = na->name;
+ char *genes = ge->name;
+
+ struct slName *slList = slNameListFromComma(genes);
+ struct slInt *si, *siList = getGeneIdsBySymbol(biConn, slList);
+
+ struct pathways *ps;
+
+ if (inputPathways)
+ {
+ AllocVar(ps);
+ ps->id = id;
+ ps->name = cloneString(name);
+ ps->source = cloneString("N/A");
+ pathwaysSaveToDb(biConn, ps, "pathways", 100);
+ pathwaysFree(&ps);
+ }
+
+ if (inputPathwayGenes)
+ {
+ struct pathwayGenes *pg;
+ AllocVar(pg);
+ pg->id = id;
+ for (si = siList; si; si = si->next)
+ {
+ pg->gene_id = si->val;
+ pathwayGenesSaveToDb(biConn, pg, "pathwayGenes", 100);
+ }
+ pathwayGenesFree(&pg);
+ }
+
+ if (inputPathwayInfo)
+ {
+ char *desc = getPathwayDescription(pdConn, name);
+ if (desc)
+ {
+ struct pathwayInfo *pi;
+ AllocVar(pi);
+ pi->id = id;
+ pi->description = desc;
+ pathwayInfoSaveToDbEscaped(biConn, pi, "pathwayInfo", 200);
+ pathwayInfoFree(&pi);
+ }
+ }
+ id++;
+ }
+
+hFreeConn(&pdConn);
+}
+
+
+void populateDb(char *db, char *tableName, char *tissue)
+{
+tissue = strLower(tissue);
+struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
+struct sqlConnection *hgConn = hAllocConnProfile("localDb", hgDb);
+
+/* Create geneLookup table (if necessary) */
+uglyTime(NULL);
+fprintf(stderr, "Setting up geneLookup table...\n");
+createGeneLookup(biConn);
+uglyTime("Time");
+
+/* Set up pathways */
+uglyTime(NULL);
+fprintf(stderr, "Setting up pathways tables...\n");
+setupPathways(biConn);
+uglyTime("Time");
+
+/* Set up datasets entry */
+struct maGrouping *allA = getMaGrouping(hgConn, tableName);
+if (!allA)
+ errAbort("Could not find maGrouping for %s!", tableName);
+
+uglyTime(NULL);
+int numSamples = allA->size;
+fprintf(stderr, "Adding datasets entry...\n");
+struct datasets *da = setupDataset(biConn, tableName, tissue, numSamples);
+uglyTime("Time");
+
+/* Set up samples entries */
+uglyTime(NULL);
+fprintf(stderr, "Adding to samples table...\n");
+struct samples *saList = setupSamples(biConn, da, allA);
+uglyTime("Time");
+
+/* Set up features and clinicalData */
+uglyTime(NULL);
+fprintf(stderr, "Setting up clinical data tables...\n");
+setupClinicalInfo(biConn, da, saList);
+uglyTime("Time");
+
+/* Set up probeInfo table (if necessary) and probeVals table */
+uglyTime(NULL);
+fprintf(stderr, "Setting up probe data tables (be patient!)...\n");
+setupProbeData(hgConn, biConn, da);
+uglyTime("Time");
+
+/* Set up probeToGene table (if necessary) */
+uglyTime(NULL);
+fprintf(stderr, "Setting up probeToGene table...\n");
+setupProbeToGene(hgConn, biConn, da);
+uglyTime("Time");
+
+fprintf(stderr, "Done!");
+
+hFreeConn(&biConn);
+hFreeConn(&hgConn);
+}
+
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 4)
+ usage();
+
+dropTable = FALSE;
+if (optionExists("dropAll"))
+ dropTable = TRUE;
+
+populateDb(argv[1], argv[2], argv[3]);
+return 0;
+}