src/hg/instinct/bioInt2/populateDb.c 1.6
1.6 2009/04/04 00:39:22 jsanborn
added cohorts api
Index: src/hg/instinct/bioInt2/populateDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/populateDb.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 1000000 -r1.5 -r1.6
--- src/hg/instinct/bioInt2/populateDb.c 27 Mar 2009 21:56:27 -0000 1.5
+++ src/hg/instinct/bioInt2/populateDb.c 4 Apr 2009 00:39:22 -0000 1.6
@@ -1,1275 +1,1289 @@
/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "bed.h"
#include "genePred.h"
#include "hPrint.h"
#include "hdb.h"
#include "microarray.h"
#include "ra.h"
#include "featuresLib.h"
#include "hgHeatmapLib.h"
#include "bioIntDriver.h"
#include "bioIntDb.h"
char *hgDb = "hg18";
char *genome = "Human";
void usage()
/* Explain usage and exit. */
{
errAbort(
"populateDb \n"
" populateDb [OPTIONS] db table tissue\n"
"options:\n"
" -dropAll Drop/recreate any table\n"
+ " -tcga handles TCGA ids\n"
"\n"
);
}
boolean dropTable = FALSE; // If true, any table that should be dropped/recreated will be
+boolean isTCGA = FALSE; // If true, specially handle TCGA ids
static struct optionSpec options[] = {
{"dropAll", OPTION_BOOLEAN},
+ {"tcga", OPTION_BOOLEAN},
{NULL, 0},
};
char *getId(struct sqlConnection *conn, char *table, char *key, char *sample, char *value)
/* get ISPY ID from sample (or experiment) Id */
{
char query[512];
safef(query, sizeof(query), "select %s from %s where %s = \"%s\" ", key, table, value, sample);
return sqlQuickString(conn, query);
}
struct slName *getProbesFromTable(struct sqlConnection *hgConn, char *tableName)
{
char query[512];
char *key = "name";
safef(query, sizeof(query), "select DISTINCT %s from %s ", key, tableName);
struct sqlResult *sr = sqlGetResult(hgConn, query);
char **row = NULL;
struct slName *sl, *slList = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
sl = slNameNew(row[0]);
slAddHead(&slList, sl);
}
slReverse(&slList);
sqlFreeResult(&sr);
return slList;
}
struct maGrouping *getMaGrouping(struct sqlConnection *hgConn, char *tableName)
{
/*microarray specific settings*/
struct trackDb *tdb = hMaybeTrackInfo(hgConn, tableName);
struct microarrayGroups *maGs = maGroupings("hg18", tableName);
trackDbFreeList(&tdb);
if (!maGs)
return NULL;
return maGs->allArrays;
}
struct hash *getSettings(char *tableName)
{
struct column *raList = getColumns(NULL, "datasets.ra", NULL);
struct column *col;
struct hash *settings = NULL;
for (col = raList; col; col = col->next)
{
if (!sameString(col->name, tableName))
continue;
settings = col->settings;
break;
}
if (!settings)
errAbort("Couldn't find datasets.ra listing for %s", tableName);
return settings;
}
struct geneAlias {
struct geneAlias *next;
char *probe;
struct slName *genes;
};
struct geneAlias *getAliases(struct sqlConnection *hgConn, char *tableName)
{
if (!hgConn || !tableName)
return NULL;
char query[512];
char **row;
safef(query, sizeof(query), "select * from %s", tableName);
struct sqlResult *sr = sqlGetResult(hgConn, query);
struct geneAlias *ga, *gaList = NULL;
struct hash *gaHash = hashNew(0);
while ((row = sqlNextRow(sr)) != NULL)
{
char *probe = cloneString(row[0]);
char *gene = cloneString(row[1]);
struct hashEl *el = hashLookup(gaHash, probe);
if (!el)
{
ga = AllocA(struct geneAlias);
ga->probe = cloneString(probe);
ga->genes = NULL;
slAddHead(&gaList, ga);
hashAdd(gaHash, probe, ga);
}
else
ga = el->val;
slNameAddHead(&ga->genes, gene);
}
hashFree(&gaHash);
sqlFreeResult(&sr);
return gaList;
}
struct dataTypes *findDataType(struct sqlConnection *biConn, char *type, char *platform)
{
if (!sameString(type, "bed 15"))
errAbort("populateDb only runs on bed 15 files.");
char *data_format = "probeVals";
char query[256];
safef(query, sizeof(query),
"select * from %s where format = \"%s\" and name = \"%s\"",
DT_TABLE, data_format, platform);
return dataTypesLoadByQuery(biConn, query);
}
struct dataTypes *createDataType(struct sqlConnection *biConn, char *type, char *platform)
{
int nextId = sqlTableSize(biConn, DT_TABLE);
struct dataTypes *dt;
AllocVar(dt);
dt->id = nextId;
dt->format = cloneString("probeVals");
dt->name = cloneString(platform);
/* Save to db */
dataTypesSaveToDb(biConn, dt, DT_TABLE, 100);
return dt;
}
void createDataTypesTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "format varchar(255) not null,\n");
dyStringPrintf(dy, "name varchar(255) not null,\n");
dyStringPrintf(dy, "PRIMARY KEY(id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
struct dataTypes *setupDataType(struct sqlConnection *biConn,
char *type, char *platform)
{
if (!sqlTableExists(biConn, DT_TABLE))
{
fprintf(stderr, "Tables dataTypes doesn't exist, creating...\n");
createDataTypesTable(biConn, DT_TABLE);
}
struct dataTypes *dt = findDataType(biConn, type, platform);
if (!dt)
dt = createDataType(biConn, type, platform);
return dt;
}
struct tissues *findTissue(struct sqlConnection *biConn, char *tissue)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where name = \"%s\";",
TI_TABLE, tissue);
return tissuesLoadByQuery(biConn, query);
}
struct tissues *createTissue(struct sqlConnection *biConn, char *tissue)
{
int nextId = sqlTableSize(biConn, TI_TABLE);
struct tissues *ti;
AllocVar(ti);
ti->id = nextId;
ti->name = cloneString(tissue);
/* Save to db */
tissuesSaveToDb(biConn, ti, TI_TABLE, 100);
return ti;
}
void createTissuesTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "name varchar(255) not null,\n");
dyStringPrintf(dy, "PRIMARY KEY(id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
struct tissues *setupTissue(struct sqlConnection *biConn, char *tissue)
{
if (!sqlTableExists(biConn, TI_TABLE))
{
fprintf(stderr, "Tables tissues doesn't exist, creating...\n");
createTissuesTable(biConn, TI_TABLE);
}
struct tissues *ti = findTissue(biConn, tissue);
if (!ti)
ti = createTissue(biConn, tissue);
return ti;
}
struct datasets *findDataset(struct sqlConnection *biConn, char *name)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where data_table = \"%s\";",
DA_TABLE, name);
return datasetsLoadByQuery(biConn, query);
}
struct datasets *createDataset(struct sqlConnection *biConn,
char *tableName, char *tissue, int numSamples)
{
struct hash *settings = getSettings(tableName);
struct hashEl *el = hashLookup(settings, "shortLabel");
if (!el)
errAbort("No shortLabel");
char *shortLabel = cloneString(el->val);
el = hashLookup(settings, "name");
if (!el)
errAbort("No name");
char *dataTable = cloneString(el->val);
char probeTable[256];
safef(probeTable, sizeof(probeTable), "%s_probeInfo", dataTable);
char p2gTable[256];
safef(p2gTable, sizeof(p2gTable), "%s_probeToGene", dataTable);
char *platform;
el = hashLookup(settings, "platform");
if (!el)
platform = cloneString("Expression");
else
platform = cloneString(el->val);
el = hashLookup(settings, "dataType");
if (!el)
errAbort("No dataType");
char *dataType = cloneString(el->val);
struct dataTypes *dt = setupDataType(biConn, dataType, platform);
struct tissues *ti = setupTissue(biConn, tissue);
int nextId = sqlTableSize(biConn, DA_TABLE);
struct datasets *da;
AllocVar(da);
da->id = nextId;
da->tissue_id = ti->id;
da->type_id = dt->id;
da->num_samples = numSamples;
da->name = shortLabel;
da->data_table = dataTable;
da->probe_table = cloneString(probeTable);
da->probe_to_gene_table = cloneString(p2gTable);
dataTypesFree(&dt);
tissuesFree(&ti);
/* Write datasets */
datasetsSaveToDbEscaped(biConn, da, DA_TABLE, 100);
return da;
}
void createDatasetsTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
dyStringPrintf(dy, "type_id int unsigned not null,\n");
dyStringPrintf(dy, "num_samples int unsigned not null,\n");
dyStringPrintf(dy, "name varchar(255) not null,\n");
dyStringPrintf(dy, "data_table varchar(255) not null,\n");
dyStringPrintf(dy, "probe_table varchar(255) not null,\n");
dyStringPrintf(dy, "probe_to_gene_table varchar(255) not null,\n");
dyStringPrintf(dy, "PRIMARY KEY(id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
struct datasets *setupDataset(struct sqlConnection *biConn,
char *tableName, char *tissue, int numSamples)
{
if (!sqlTableExists(biConn, DA_TABLE))
{
fprintf(stderr, "Tables datasets doesn't exist, creating...");
createDatasetsTable(biConn, DA_TABLE);
}
struct datasets *da = findDataset(biConn, tableName);
if (!da)
da = createDataset(biConn, tableName, tissue, numSamples);
return da;
}
char *findPatientName(struct sqlConnection *pdConn, char *pTable,
char *pField, char *sField, char *sName)
{
char query[256];
safef(query, sizeof(query),
"select %s from %s where %s = \"%s\"",
pField, pTable, sField, sName);
return sqlQuickString(pdConn, query);
}
int findId(struct sqlConnection *biConn, char *idField, char *sField, char *name)
{
if (sqlTableSize(biConn, SA_TABLE) == 0) /* brand new table, return 0 */
return 0;
char query[256];
safef(query, sizeof(query),
"select DISTINCT %s from %s where %s = \"%s\";",
idField, SA_TABLE, sField, name);
if (sqlExists(biConn, query)) /* sample name found, use same id */
return sqlQuickNum(biConn, query);
/* Else, find maximum sample id and add one to it */
safef(query, sizeof(query),
"select max(%s) from %s;",
idField, SA_TABLE);
int maxId = sqlQuickNum(biConn, query);
return maxId + 1;
}
boolean sampleExists(struct sqlConnection *biConn, struct samples *sa)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where id = %d "
"and name = \"%s\" "
"and patient_id = %d "
"and patient_name = \"%s\" "
"and dataset_id = %d "
"and exp_id = %d "
"and tissue_id = %d ",
SA_TABLE, sa->id, sa->name, sa->patient_id, sa->patient_name, sa->dataset_id,
sa->exp_id, sa->tissue_id);
return sqlExists(biConn, query);
}
void createSamples(struct sqlConnection *biConn, struct datasets *da, struct maGrouping *allA)
{
int datasetId = da->id;
int tissueId = da->tissue_id;
struct hash *settings = getSettings(da->data_table);
struct hashEl *el = hashLookup(settings, "patDb");
if (!el)
errAbort("No patDb!");
char *patDb = cloneString(el->val);
el = hashLookup(settings, "patTable");
if (!el)
errAbort("No patTable");
char *patTable = cloneString(el->val);
el = hashLookup(settings, "patField");
if (!el)
errAbort("No patField");
char *patField = cloneString(el->val);
el = hashLookup(settings, "sampleField");
if (!el)
errAbort("No sampleField");
char *sampleField = cloneString(el->val);
struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb);
int i;
struct samples *sa;
for (i = 0; i < allA->size; i++)
{
- char *sampleName = cloneString(allA->names[i]);
- int expId = allA->expIds[i];
+ char *sampleName, *patientName;
+ if (isTCGA)
+ {
+ sampleName = cloneStringZ(allA->names[i], 16);
+ patientName = cloneStringZ(allA->names[i], 12);
+ }
+ else
+ {
+ sampleName = cloneString(allA->names[i]);
+ patientName = findPatientName(pdConn, patTable, patField, sampleField, sampleName);
+ }
+ int expId = allA->expIds[i];
int sampleId = findId(biConn, "id", "name", sampleName);
- char *patientName = findPatientName(pdConn, patTable, patField, sampleField, sampleName);
int patientId = findId(biConn, "patient_id", "patient_name", patientName);
AllocVar(sa);
sa->id = sampleId;
sa->name = sampleName;
sa->patient_id = patientId;
sa->patient_name = patientName;
sa->dataset_id = datasetId;
sa->exp_id = expId;
sa->tissue_id = tissueId;
if (!sampleExists(biConn, sa))
samplesSaveToDb(biConn, sa, SA_TABLE, 100);
samplesFree(&sa);
}
hFreeConn(&pdConn);
}
void createSamplesTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "name varchar(255) not null,\n");
dyStringPrintf(dy, "patient_id int unsigned not null,\n");
dyStringPrintf(dy, "patient_name varchar(255) not null,\n");
dyStringPrintf(dy, "dataset_id int unsigned not null,\n");
dyStringPrintf(dy, "exp_id int unsigned not null,\n");
dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
dyStringPrintf(dy, "KEY(id),\n");
dyStringPrintf(dy, "KEY(dataset_id),\n");
dyStringPrintf(dy, "KEY(id, dataset_id),\n");
dyStringPrintf(dy, "KEY(dataset_id,id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
struct samples *getSamples(struct sqlConnection *biConn, struct datasets *da)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where dataset_id = %d order by exp_id;",
SA_TABLE, da->id);
return samplesLoadByQuery(biConn, query);
}
struct samples *setupSamples(struct sqlConnection *biConn, struct datasets *da,
struct maGrouping *allA)
{
if (!sqlTableExists(biConn, SA_TABLE))
{
fprintf(stderr, "Table samples doesn't exist, creating...\n");
createSamplesTable(biConn, SA_TABLE);
}
createSamples(biConn, da, allA);
struct samples *saList = getSamples(biConn, da);
if (slCount(saList) != allA->size)
errAbort("Sample count from microarrayGroups and database don't match!");
return saList;
}
int getFeatureId(struct sqlConnection *biConn, char *name)
{
if (sqlTableSize(biConn, FE_TABLE) == 0) /* brand new table, return 0 */
return 0;
char query[256];
safef(query, sizeof(query),
"select id from %s where name = \"%s\";",
FE_TABLE, name);
if (sqlExists(biConn, query)) /* sample name found, use same id */
return sqlQuickNum(biConn, query);
else
return sqlTableSize(biConn, FE_TABLE);
}
struct features *getFeature(struct sqlConnection *biConn, char *name)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where name = \"%s\";",
FE_TABLE, name);
return featuresLoadByQuery(biConn, query);
}
boolean featureExists(struct sqlConnection *biConn, struct features *fs)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where name = \"%s\";",
FE_TABLE, fs->name);
return sqlExists(biConn, query);
}
boolean clinicalDataExists(struct sqlConnection *biConn, struct clinicalData *cd)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where sample_id = %d "
"and feature_id = %d; ",
CD_TABLE, cd->sample_id, cd->feature_id);
if (!sqlExists(biConn, query)) /* entry doesn't exist, report */
return FALSE;
/* Make sure entry has same values, if not there is a problem
* (sample_id, feature_id) should be unique */
struct clinicalData *cd2 = clinicalDataLoadByQuery(biConn, query);
if (slCount(cd2) != 1)
errAbort("clinicalData entries not unique, sample_id = %d, feature_id = %d",
cd->sample_id, cd->feature_id);
if (cd->val != cd2->val)
errAbort("clinicalData values don't match, sample_id = %d, feature_id = %d, "
"%f != %f",
cd->sample_id, cd->feature_id, cd->val, cd2->val);
if (cd->code && cd2->code)
if (!sameString(cd->code, cd2->code))
errAbort("clinicalData codes don't match, sample_id = %d, feature_id = %d",
cd->sample_id, cd->feature_id);
return TRUE;
}
void createFeaturesTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "name varchar(255) not null,\n");
dyStringPrintf(dy, "shortLabel varchar(255) not null,\n");
dyStringPrintf(dy, "longLabel varchar(255) not null,\n");
dyStringPrintf(dy, "KEY(id),\n");
dyStringPrintf(dy, "KEY(name),\n");
dyStringPrintf(dy, "KEY(id,name)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
void createClinicalDataTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "sample_id int unsigned not null,\n");
dyStringPrintf(dy, "feature_id int unsigned not null,\n");
dyStringPrintf(dy, "val double not null,\n");
dyStringPrintf(dy, "code varchar(255),\n");
dyStringPrintf(dy, "KEY(sample_id),\n");
dyStringPrintf(dy, "KEY(feature_id),\n");
dyStringPrintf(dy, "KEY(sample_id,feature_id),\n");
dyStringPrintf(dy, "KEY(feature_id,sample_id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
void setupClinicalInfo(struct sqlConnection *biConn, struct datasets *da, struct samples *saList)
{
if (!saList)
return;
if (!sqlTableExists(biConn, FE_TABLE))
{
fprintf(stderr, "Table features doesn't exist, creating...\n");
createFeaturesTable(biConn, FE_TABLE);
}
if (!sqlTableExists(biConn, CD_TABLE))
{
fprintf(stderr, "Table clinicalData doesn't exist, creating...\n");
createClinicalDataTable(biConn, CD_TABLE);
}
struct hash *settings = getSettings(da->data_table);
struct hashEl *el = hashLookup(settings, "raFile");
if (!el)
errAbort("No raFile");
char *raFile = cloneString(el->val);
el = hashLookup(settings, "patDb");
if (!el)
errAbort("No patDb");
char *patDb = cloneString(el->val);
el = hashLookup(settings, "patTable");
if (!el)
errAbort("No patTable");
char *patTable = cloneString(el->val);
el = hashLookup(settings, "patField");
if (!el)
errAbort("No patField");
char *patField = cloneString(el->val);
el = hashLookup(settings, "sampleField");
if (!el)
errAbort("No sampleField");
char *sampleField = cloneString(el->val);
if (!raFile || !patDb || !patTable || !patField || !sampleField)
errAbort("Incomplete ra entry for %s.", da->data_table);
struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb); //connection to patient data
if (DEBUG)
fprintf(stderr, "Getting columns of clinical data...\n");
struct column *col, *colList = getColumns(pdConn, raFile, patDb);
/* Set up features */
struct features *fs;
for (col = colList; col; col = col->next)
{
char *name = col->name;
char *shortLabel = col->shortLabel;
char *longLabel = col->longLabel;
int id = getFeatureId(biConn, name);
AllocVar(fs);
fs->id = id;
fs->name = cloneString(name);
fs->shortLabel = cloneString(shortLabel);
fs->longLabel= cloneString(longLabel);
if (!featureExists(biConn, fs))
featuresSaveToDbEscaped(biConn, fs, FE_TABLE, 100);
featuresFree(&fs);
fs = getFeature(biConn, name);
if (!fs)
errAbort("Could not find feature %s.", name);
if (slCount(fs) != 1)
errAbort("Could not find unique feature by name = %s.", name);
/* Loop through all samples, putting data in database */
struct samples *sa;
struct clinicalData *cd;
for (sa = saList; sa; sa = sa->next)
{
struct slName *id = slNameNew(getId(pdConn, patTable, patField, sa->name, sampleField));
char *cellVal = col->cellVal(col, id, pdConn);
if (!cellVal)
continue;
AllocVar(cd);
cd->sample_id = sa->id;
cd->feature_id = fs->id;
cd->val = atof(cellVal);
cd->code = NULL;
if (col->cellCoded(col, pdConn))
cd->code = cloneString(col->cellCodedVal(col, id, pdConn));
if (!clinicalDataExists(biConn, cd))
clinicalDataSaveToDb(biConn, cd, CD_TABLE, 100);
clinicalDataFree(&cd);
slNameFree(&id);
}
featuresFree(&fs);
}
hFreeConn(&pdConn);
}
void createProbeValsTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "probe_id int unsigned not null,\n");
dyStringPrintf(dy, "sample_count int unsigned not null,\n");
dyStringPrintf(dy, "sample_data longblob not null,\n");
dyStringPrintf(dy, "PRIMARY KEY(probe_id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
void createProbeInfoTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "chrom varchar(255) not null,\n");
dyStringPrintf(dy, "start int unsigned not null,\n");
dyStringPrintf(dy, "stop int unsigned not null,\n");
dyStringPrintf(dy, "name varchar(255) not null,\n");
dyStringPrintf(dy, "PRIMARY KEY(id),\n");
dyStringPrintf(dy, "KEY(name),\n");
dyStringPrintf(dy, "KEY(id,name)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
void addProbeValsToDb(struct sqlConnection *biConn, char *tableName,
int probe_id, int sample_count, char *dataString, int updateSize)
{ /* This bypasses the need to convert to float array to enter in database, may not
* be faster, but by converting first to float array any "blank" datapoints are
* converted to 0.0, instead of leaving blank */
struct dyString *update = newDyString(updateSize);
dyStringPrintf(update, "insert into %s values ( %u,%u,'%s')",
tableName, probe_id, sample_count, dataString );
sqlUpdate(biConn, update->string);
freeDyString(&update);
}
void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn,
struct datasets *da)
{
char *dataTable = da->data_table;
char *probeTable = da->probe_table;
if (!dataTable || !probeTable)
errAbort("datasets entry not complete, data_table or probe_table not set.");
boolean inputProbeVals = FALSE;
boolean inputProbeInfo = FALSE;
if (sqlTableExists(biConn, dataTable) && dropTable)
{
fprintf(stderr, "probeVals table %s already exists in db, dropping...\n", dataTable);
sqlDropTable(biConn, dataTable);
}
if (!sqlTableExists(biConn, dataTable))
{
fprintf(stderr, "Creating probeVals table %s...\n", dataTable);
createProbeValsTable(biConn, dataTable);
inputProbeVals = TRUE; // empty table, input
}
if (sqlTableExists(biConn, probeTable) && dropTable)
{
fprintf(stderr, "probeInfo table %s already exists in db, dropping...\n", probeTable);
sqlDropTable(biConn, probeTable);
}
if (!sqlTableExists(biConn, probeTable))
{
fprintf(stderr, "Creating probeInfo table %s...\n", probeTable);
createProbeInfoTable(biConn, probeTable);
inputProbeInfo = TRUE; // empty table, input
}
char query[256];
safef(query, sizeof(query), "select * from %s;", dataTable);
/* Get bed15 data from hg18 database */
int id = 0;
struct sqlResult *sr = sqlGetResult(hgConn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
char *chrom = row[1];
unsigned chromStart = sqlUnsigned(row[2]);
unsigned chromEnd = sqlUnsigned(row[3]);
char *name = row[4];
unsigned expCount = sqlUnsigned(row[13]);
char *expScores = row[15];
/* Make probeInfo entry and load into db*/
struct probeInfo *pi;
AllocVar(pi);
pi->id = id;
pi->chrom = cloneString(chrom);
pi->start = chromStart;
pi->stop = chromEnd;
pi->name = cloneString(name);
if (inputProbeInfo)
probeInfoSaveToDb(biConn, pi, probeTable, 100);
/* Make probeVals entry and load into db, straight-up copying longblob*/
if (inputProbeVals)
addProbeValsToDb(biConn, dataTable, id, expCount, expScores, 500);
id++;
probeInfoFree(&pi);
}
}
void createGeneLookupTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "kgId varchar(255) not null,\n");
dyStringPrintf(dy, "PRIMARY KEY(id),\n");
dyStringPrintf(dy, "KEY(kgId),\n");
dyStringPrintf(dy, "KEY(id,kgId),\n");
dyStringPrintf(dy, "KEY(kgId,id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
void createGeneLookup(struct sqlConnection *biConn)
{
if (!sqlTableExists(biConn, GL_TABLE))
{
fprintf(stderr, "geneLookup table doesn't exist in bioInt database, recreating it.\n");
createGeneLookupTable(biConn, GL_TABLE);
}
if (sqlTableSize(biConn, GL_TABLE) > 0)
{
fprintf(stderr, "geneLookup table already has data in it, doing nothing.\n");
return;
}
if (!sqlTableExists(biConn, "knownGene"))
errAbort("Need knownGene table in bioInt database.");
char query[256];
safef(query, sizeof(query), "select name from knownGene;");
struct slName *sl, *slList = sqlQuickList(biConn, query);
int id = 0;
struct geneLookup *gl;
for (sl = slList; sl; sl = sl->next)
{
AllocVar(gl);
gl->id = id;
gl->kgId = cloneString(sl->name);
id++;
geneLookupSaveToDb(biConn, gl, GL_TABLE, 100);
geneLookupFree(&gl);
}
slNameFreeList(&slList);
}
void createProbeToGeneTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "probe_id int unsigned not null,\n");
dyStringPrintf(dy, "gene_id int unsigned not null,\n");
dyStringPrintf(dy, "KEY(probe_id),\n");
dyStringPrintf(dy, "KEY(gene_id),\n");
dyStringPrintf(dy, "KEY(probe_id,gene_id),\n");
dyStringPrintf(dy, "KEY(gene_id,probe_id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
{
char query[128];
safef(query, sizeof(query),
"select id from %s where name = \"%s\"",
tableName, name);
if (!sqlExists(biConn, query))
return -1;
return sqlQuickNum(biConn, query);
}
struct slInt *getGeneIdsBySymbol(struct sqlConnection *biConn,
struct slName *slList)
{
if (!slList)
return NULL;
struct slName *sl;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy, "select id from %s "
"join kgXref on %s.kgId = kgXref.kgId "
"where kgXref.geneSymbol in (", GL_TABLE, GL_TABLE);
for (sl = slList; sl; sl = sl->next)
{
dyStringPrintf(dy, "\"%s\"", sl->name);
if (sl->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ");");
char *query = dyStringCannibalize(&dy);
return sqlQuickNumList(biConn, query);
}
void setupProbeToGene(struct sqlConnection *hgConn,
struct sqlConnection *biConn, struct datasets *da)
{
char *p2gTable = da->probe_to_gene_table;
if (!p2gTable)
{
fprintf(stderr, "probeToGene table not set, doing nothing.\n");
return;
}
struct hash *settings = getSettings(da->data_table);
struct hashEl *el = hashLookup(settings, "aliasTable");
if (!el)
errAbort("No aliasTable.\n");
char *aliasTable = cloneString(el->val);
if (!sqlTableExists(hgConn, aliasTable))
errAbort("Table %s not found in hg18 database.\n", aliasTable);
if (!sqlTableExists(biConn, "kgXref"))
errAbort("kgXref table not found in database. Cannot create probeToGene table.\n");
if (sqlTableExists(biConn, p2gTable) && dropTable)
{
fprintf(stderr, "Table %s already exists, dropping and recreating.\n", p2gTable);
sqlDropTable(biConn, p2gTable);
}
boolean inputProbeToGene = FALSE;
if (!sqlTableExists(biConn, p2gTable))
{
fprintf(stderr, "Creating probeToGene table...\n");
createProbeToGeneTable(biConn, p2gTable);
inputProbeToGene = TRUE;
}
if (!inputProbeToGene)
return;
struct geneAlias *ga, *gaList = getAliases(hgConn, aliasTable);
for (ga = gaList; ga; ga = ga->next)
{
int probeId = getProbeId(biConn, da->probe_table, ga->probe);
if (probeId < 0) // probe in alias table doesn't exist in dataset
continue;
struct slInt *si, *geneIds = getGeneIdsBySymbol(biConn, ga->genes);
struct probeToGene *pg;
AllocVar(pg);
pg->probe_id = probeId;
for (si = geneIds; si; si = si->next)
{
pg->gene_id = si->val;
probeToGeneSaveToDb(biConn, pg, p2gTable, 10);
}
probeToGeneFree(&pg);
}
}
void createPathwaysTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "name varchar(255) not null,\n");
dyStringPrintf(dy, "source varchar(255) not null,\n");
dyStringPrintf(dy, "KEY(id),\n");
dyStringPrintf(dy, "KEY(name),\n");
dyStringPrintf(dy, "KEY(id,name),\n");
dyStringPrintf(dy, "KEY(name,id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
void createPathwayGenesTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "gene_id int unsigned not null,\n");
dyStringPrintf(dy, "KEY(id),\n");
dyStringPrintf(dy, "KEY(gene_id),\n");
dyStringPrintf(dy, "KEY(id,gene_id),\n");
dyStringPrintf(dy, "KEY(gene_id,id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
void createPathwayInfoTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "description longblob not null,\n");
dyStringPrintf(dy, "KEY(id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
char *getPathwayDescription(struct sqlConnection *pdConn, char *name)
{
if (!sqlTableExists(pdConn, name))
return NULL;
char query[128];
safef(query, sizeof(query),
"select description from descriptions where name = \"%s\";",
name);
return sqlQuickString(pdConn, query);
}
void setupPathways(struct sqlConnection *biConn)
{
boolean inputPathways = FALSE;
boolean inputPathwayInfo = FALSE;
boolean inputPathwayGenes = FALSE;
struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
if (!pdConn)
errAbort("Could not connect to pathways database.\n");
if (sqlTableExists(biConn, PA_TABLE) && dropTable)
{
fprintf(stderr, "pathways table already exists, dropping and recreating.\n");
sqlDropTable(biConn, PA_TABLE);
}
if (!sqlTableExists(biConn, PA_TABLE))
{
fprintf(stderr, "Creating pathways table.\n");
createPathwaysTable(biConn, PA_TABLE);
inputPathways = TRUE;
}
if (sqlTableExists(biConn, PG_TABLE) && dropTable)
{
fprintf(stderr, "pathwayGenes table already exists, dropping and recreating.\n");
sqlDropTable(biConn, PG_TABLE);
}
if (!sqlTableExists(biConn, PG_TABLE))
{
fprintf(stderr, "Creating pathwayGenes table.\n");
createPathwayGenesTable(biConn, PG_TABLE);
inputPathwayGenes = TRUE;
}
if (sqlTableExists(biConn, PI_TABLE) && dropTable)
{
fprintf(stderr, "pathwayInfo table already exists, dropping and recreating.\n");
sqlDropTable(biConn, PI_TABLE);
}
if (!sqlTableExists(biConn, PI_TABLE))
{
fprintf(stderr, "Creeting pathwayInfo table.\n");
createPathwayInfoTable(biConn, PI_TABLE);
inputPathwayInfo = TRUE;
}
if (!inputPathways && !inputPathwayInfo && !inputPathwayGenes)
{
fprintf(stderr, "Nothing to do for pathway tables.\n");
return;
}
/* Setting up pathways table */
char query[128];
safef(query, sizeof(query), "select * from genesets;");
struct sqlResult *sr = sqlGetResult(pdConn, query);
char **row = NULL;
/* Save all data in lists to avoid "out of sync" error when attempting
* query inside of a running query on same db */
struct slName *na, *names = NULL;
struct slName *ge, *genes = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
slNameAddHead(&names, row[0]);
slNameAddHead(&genes, row[1]);
}
slReverse(&names);
slReverse(&genes);
sqlFreeResult(&sr);
int id = 0;
for (na = names, ge = genes; na && ge; na = na->next, ge = ge->next)
{
char *name = na->name;
char *genes = ge->name;
struct slName *slList = slNameListFromComma(genes);
struct slInt *si, *siList = getGeneIdsBySymbol(biConn, slList);
struct pathways *ps;
if (inputPathways)
{
AllocVar(ps);
ps->id = id;
ps->name = cloneString(name);
ps->source = cloneString("N/A");
pathwaysSaveToDb(biConn, ps, PA_TABLE, 100);
pathwaysFree(&ps);
}
if (inputPathwayGenes)
{
struct pathwayGenes *pg;
AllocVar(pg);
pg->id = id;
for (si = siList; si; si = si->next)
{
pg->gene_id = si->val;
pathwayGenesSaveToDb(biConn, pg, PG_TABLE, 100);
}
pathwayGenesFree(&pg);
}
if (inputPathwayInfo)
{
char *desc = getPathwayDescription(pdConn, name);
if (desc)
{
struct pathwayInfo *pi;
AllocVar(pi);
pi->id = id;
pi->description = desc;
pathwayInfoSaveToDbEscaped(biConn, pi, PI_TABLE, 200);
pathwayInfoFree(&pi);
}
}
id++;
}
hFreeConn(&pdConn);
}
void populateDb(char *db, char *tableName, char *tissue)
{
tissue = strLower(tissue);
struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
struct sqlConnection *hgConn = hAllocConnProfile("localDb", hgDb);
/* Create geneLookup table (if necessary) */
uglyTime(NULL);
fprintf(stderr, "Setting up geneLookup table...\n");
createGeneLookup(biConn);
uglyTime("Time");
/* Set up pathways */
uglyTime(NULL);
fprintf(stderr, "Setting up pathways tables...\n");
setupPathways(biConn);
uglyTime("Time");
/* Set up datasets entry */
struct maGrouping *allA = getMaGrouping(hgConn, tableName);
if (!allA)
errAbort("Could not find maGrouping for %s!", tableName);
uglyTime(NULL);
int numSamples = allA->size;
fprintf(stderr, "Adding datasets entry...\n");
struct datasets *da = setupDataset(biConn, tableName, tissue, numSamples);
uglyTime("Time");
/* Set up samples entries */
uglyTime(NULL);
fprintf(stderr, "Adding to samples table...\n");
struct samples *saList = setupSamples(biConn, da, allA);
uglyTime("Time");
/* Set up features and clinicalData */
uglyTime(NULL);
fprintf(stderr, "Setting up clinical data tables...\n");
setupClinicalInfo(biConn, da, saList);
uglyTime("Time");
/* Set up probeInfo table (if necessary) and probeVals table */
uglyTime(NULL);
fprintf(stderr, "Setting up probe data tables (be patient!)...\n");
setupProbeData(hgConn, biConn, da);
uglyTime("Time");
/* Set up probeToGene table (if necessary) */
uglyTime(NULL);
fprintf(stderr, "Setting up probeToGene table...\n");
setupProbeToGene(hgConn, biConn, da);
uglyTime("Time");
fprintf(stderr, "Done!");
fprintf(stderr, "Please run 'setCohort' to find datasets that have overlapping samples.\n");
hFreeConn(&biConn);
hFreeConn(&hgConn);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 4)
usage();
dropTable = FALSE;
if (optionExists("dropAll"))
dropTable = TRUE;
+if (optionExists("tcga"))
+ isTCGA = TRUE;
populateDb(argv[1], argv[2], argv[3]);
return 0;
}