src/hg/instinct/bioInt2/populateDb.c 1.7
1.7 2009/04/27 06:15:49 jsanborn
updated lots of stuff, will break older implementation of database
Index: src/hg/instinct/bioInt2/populateDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/populateDb.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 1000000 -r1.6 -r1.7
--- src/hg/instinct/bioInt2/populateDb.c 4 Apr 2009 00:39:22 -0000 1.6
+++ src/hg/instinct/bioInt2/populateDb.c 27 Apr 2009 06:15:49 -0000 1.7
@@ -1,1289 +1,1214 @@
/* mapProbesToGenes - Maps probes in BED format to overlapping gene(s). */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "bed.h"
#include "genePred.h"
#include "hPrint.h"
#include "hdb.h"
#include "microarray.h"
#include "ra.h"
#include "featuresLib.h"
#include "hgHeatmapLib.h"
+#include "cprob.h"
+#include "hgStatsLib.h"
#include "bioIntDriver.h"
#include "bioIntDb.h"
/* Default genome database name and organism name.  Neither is referenced in
 * the part of the file visible here -- presumably used further down; TODO
 * confirm. */
char *hgDb = "hg18";
char *genome = "Human";
void usage()
/* Report the command-line synopsis through errAbort and exit. */
{
static char *synopsis =
    "populateDb \n"
    " populateDb [OPTIONS] db table tissue\n"
    "options:\n"
    " -dropAll Drop/recreate any table\n"
    " -tcga handles TCGA ids\n"
    "\n";
/* The text holds no conversion specifiers, so printing it through "%s"
 * produces exactly the same message. */
errAbort("%s", synopsis);
}
/* Global switches read by the setup routines below; presumably assigned from
 * the -dropAll and -tcga command-line options (the assignment is not in the
 * part of the file visible here -- TODO confirm). */
boolean dropTable = FALSE; // If true, any table that should be dropped/recreated will be
boolean isTCGA = FALSE; // If true, specially handle TCGA ids
/* Recognized command-line options: both are boolean flags.  The {NULL, 0}
 * entry ends the table. */
static struct optionSpec options[] = {
{"dropAll", OPTION_BOOLEAN},
{"tcga", OPTION_BOOLEAN},
{NULL, 0},
};
char *getId(struct sqlConnection *conn, char *table, char *key, char *sample, char *value)
/* Generic single-field lookup: select the `key` column from `table` for the
 * row whose `value` column equals `sample`, and hand back whatever
 * sqlQuickString returns for that query. */
{
char sql[512];
safef(sql, sizeof(sql), "select %s from %s where %s = \"%s\" ", key, table, value, sample);
char *found = sqlQuickString(conn, sql);
return found;
}
struct slName *getProbesFromTable(struct sqlConnection *hgConn, char *tableName)
/* Return the distinct values of the "name" column of tableName as an slName
 * list, kept in the order the rows came back from the database. */
{
char sql[512];
safef(sql, sizeof(sql), "select DISTINCT %s from %s ", "name", tableName);

struct sqlResult *result = sqlGetResult(hgConn, sql);
struct slName *probes = NULL;
char **fields;
for (fields = sqlNextRow(result); fields != NULL; fields = sqlNextRow(result))
    {
    /* Built head-first, then reversed once after the loop. */
    struct slName *probe = slNameNew(fields[0]);
    slAddHead(&probes, probe);
    }
slReverse(&probes);
sqlFreeResult(&result);
return probes;
}
struct maGrouping *getMaGrouping(struct sqlConnection *hgConn, char *tableName)
/* Return the allArrays grouping from the "hg18" microarray groups of
 * tableName, or NULL when maGroupings yields nothing. */
{
/* The trackDb entry is fetched and released again without being consulted. */
struct trackDb *trackInfo = hMaybeTrackInfo(hgConn, tableName);
struct microarrayGroups *groups = maGroupings("hg18", tableName);
trackDbFreeList(&trackInfo);
return (groups != NULL) ? groups->allArrays : NULL;
}
struct hash *getSettings(char *tableName)
/* Find the datasets.ra entry whose name equals tableName and return its
 * settings hash.  Aborts when there is no such entry or when the first
 * matching entry carries no settings. */
{
struct column *entry = getColumns(NULL, "datasets.ra", NULL);

/* Stop at the first entry with a matching name. */
while (entry != NULL && !sameString(entry->name, tableName))
    entry = entry->next;

if (entry == NULL || entry->settings == NULL)
    errAbort("Couldn't find datasets.ra listing for %s", tableName);
return entry->settings;
}
struct geneAlias
/* One probe together with every gene name listed for it in an alias table. */
    {
    struct geneAlias *next;	/* Next entry in the singly-linked list. */
    char *probe;		/* Probe name. */
    struct slName *genes;	/* Gene names recorded for this probe. */
    };
-
-struct geneAlias *getAliases(struct sqlConnection *hgConn, char *tableName)
+struct hash *getAliases(struct sqlConnection *hgConn, char *tableName)
/* Load a probe-to-gene alias table.  Every row supplies a probe (column 0)
 * and a gene (column 1); rows that share a probe are merged into a single
 * struct geneAlias whose genes list receives each gene seen for that probe.
 * As of revision 1.7 the probe-keyed hash (values are struct geneAlias *) is
 * returned instead of the list.  Returns NULL if hgConn or tableName is NULL. */
{
if (!hgConn || !tableName)
return NULL;
char query[512];
char **row;
safef(query, sizeof(query), "select * from %s", tableName);
struct sqlResult *sr = sqlGetResult(hgConn, query);
struct geneAlias *ga, *gaList = NULL;
struct hash *gaHash = hashNew(0);
while ((row = sqlNextRow(sr)) != NULL)
{
/* NOTE(review): probe and gene are cloned on every row and never freed in
 * this function -- confirm whether hashAdd / slNameAddHead keep the pointers. */
char *probe = cloneString(row[0]);
char *gene = cloneString(row[1]);
struct hashEl *el = hashLookup(gaHash, probe);
if (!el)
{
/* First time this probe is seen: start a new entry. */
ga = AllocA(struct geneAlias);
ga->probe = cloneString(probe);
ga->genes = NULL;
/* gaList is still built here although the new revision no longer returns it. */
slAddHead(&gaList, ga);
hashAdd(gaHash, probe, ga);
}
else
ga = el->val;
slNameAddHead(&ga->genes, gene);
}
-hashFree(&gaHash);
sqlFreeResult(&sr);
-return gaList;
+return gaHash;
}
struct dataTypes *findDataType(struct sqlConnection *biConn, char *type, char *platform)
/* Load the rows of DT_TABLE whose format equals the fixed data_format below
 * and whose name equals platform; returns whatever dataTypesLoadByQuery
 * yields.  Aborts unless type is exactly "bed 15". */
{
if (!sameString(type, "bed 15"))
errAbort("populateDb only runs on bed 15 files.");
/* Revision 1.7 switches the stored format label from probeVals to analysisVals. */
-char *data_format = "probeVals";
+char *data_format = "analysisVals";
char query[256];
safef(query, sizeof(query),
"select * from %s where format = \"%s\" and name = \"%s\"",
DT_TABLE, data_format, platform);
return dataTypesLoadByQuery(biConn, query);
}
struct dataTypes *createDataType(struct sqlConnection *biConn, char *type, char *platform)
/* Allocate a dataTypes record named after platform, save it to DT_TABLE and
 * return it.  The type argument is not used in this function. */
{
/* The new id is the current row count of the table. */
int nextId = sqlTableSize(biConn, DT_TABLE);
struct dataTypes *dt;
AllocVar(dt);
dt->id = nextId;
/* Must match the data_format string that findDataType queries for. */
-dt->format = cloneString("probeVals");
+dt->format = cloneString("analysisVals");
dt->name = cloneString(platform);
/* Save to db */
dataTypesSaveToDb(biConn, dt, DT_TABLE, 100);
return dt;
}
-void createDataTypesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "format varchar(255) not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
struct dataTypes *setupDataType(struct sqlConnection *biConn,
char *type, char *platform)
{
if (!sqlTableExists(biConn, DT_TABLE))
{
fprintf(stderr, "Tables dataTypes doesn't exist, creating...\n");
createDataTypesTable(biConn, DT_TABLE);
}
struct dataTypes *dt = findDataType(biConn, type, platform);
if (!dt)
dt = createDataType(biConn, type, platform);
return dt;
}
struct tissues *findTissue(struct sqlConnection *biConn, char *tissue)
/* Load the rows of TI_TABLE whose name equals tissue; the result is whatever
 * tissuesLoadByQuery returns for that query. */
{
char sql[256];
safef(sql, sizeof(sql), "select * from %s where name = \"%s\";", TI_TABLE, tissue);
struct tissues *matches = tissuesLoadByQuery(biConn, sql);
return matches;
}
struct tissues *createTissue(struct sqlConnection *biConn, char *tissue)
/* Build a tissues record named `tissue`, using the current row count of
 * TI_TABLE as its id, write it to the database and return it. */
{
int rowCount = sqlTableSize(biConn, TI_TABLE);

struct tissues *record;
AllocVar(record);
record->name = cloneString(tissue);
record->id = rowCount;

tissuesSaveToDb(biConn, record, TI_TABLE, 100);
return record;
}
-void createTissuesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
struct tissues *setupTissue(struct sqlConnection *biConn, char *tissue)
{
if (!sqlTableExists(biConn, TI_TABLE))
{
fprintf(stderr, "Tables tissues doesn't exist, creating...\n");
createTissuesTable(biConn, TI_TABLE);
}
struct tissues *ti = findTissue(biConn, tissue);
if (!ti)
ti = createTissue(biConn, tissue);
return ti;
}
struct datasets *findDataset(struct sqlConnection *biConn, char *name)
/* Load the rows of DA_TABLE whose data_table column equals name; the result
 * is whatever datasetsLoadByQuery returns for that query. */
{
char sql[256];
safef(sql, sizeof(sql), "select * from %s where data_table = \"%s\";", DA_TABLE, name);
struct datasets *matches = datasetsLoadByQuery(biConn, sql);
return matches;
}
struct datasets *createDataset(struct sqlConnection *biConn,
char *tableName, char *tissue, int numSamples)
/* Build a datasets record for tableName from its datasets.ra settings, save
 * it to DA_TABLE and return it.  The settings shortLabel, name and dataType
 * are required (errAbort otherwise); platform falls back to "Expression".
 * The data type and tissue rows are looked up or created along the way. */
{
struct hash *settings = getSettings(tableName);
struct hashEl *el = hashLookup(settings, "shortLabel");
if (!el)
errAbort("No shortLabel");
char *shortLabel = cloneString(el->val);
el = hashLookup(settings, "name");
if (!el)
errAbort("No name");
char *dataTable = cloneString(el->val);
/* Revision 1.7 drops the per-dataset probeInfo / probeToGene table names. */
-char probeTable[256];
-safef(probeTable, sizeof(probeTable), "%s_probeInfo", dataTable);
-
-char p2gTable[256];
-safef(p2gTable, sizeof(p2gTable), "%s_probeToGene", dataTable);
-
char *platform;
el = hashLookup(settings, "platform");
if (!el)
platform = cloneString("Expression");
else
platform = cloneString(el->val);
el = hashLookup(settings, "dataType");
if (!el)
errAbort("No dataType");
char *dataType = cloneString(el->val);
struct dataTypes *dt = setupDataType(biConn, dataType, platform);
struct tissues *ti = setupTissue(biConn, tissue);
/* The new id is the current row count of the datasets table. */
int nextId = sqlTableSize(biConn, DA_TABLE);
struct datasets *da;
AllocVar(da);
da->id = nextId;
da->tissue_id = ti->id;
da->type_id = dt->id;
da->num_samples = numSamples;
/* The record takes over the cloned shortLabel and dataTable strings. */
da->name = shortLabel;
da->data_table = dataTable;
-da->probe_table = cloneString(probeTable);
-da->probe_to_gene_table = cloneString(p2gTable);
/* Only the ids were needed from dt and ti; platform and dataType are not
 * freed in this function. */
dataTypesFree(&dt);
tissuesFree(&ti);
/* Write datasets */
datasetsSaveToDbEscaped(biConn, da, DA_TABLE, 100);
return da;
}
-void createDatasetsTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
-dyStringPrintf(dy, "type_id int unsigned not null,\n");
-dyStringPrintf(dy, "num_samples int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "data_table varchar(255) not null,\n");
-dyStringPrintf(dy, "probe_table varchar(255) not null,\n");
-dyStringPrintf(dy, "probe_to_gene_table varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
struct datasets *setupDataset(struct sqlConnection *biConn,
			      char *tableName, char *tissue, int numSamples)
/* Make sure DA_TABLE exists (creating it when missing), then return the
 * datasets entry whose data_table is tableName, creating one from the
 * datasets.ra settings when findDataset comes back empty.
 *   biConn     - connection to the database holding DA_TABLE
 *   tableName  - data table name identifying the dataset
 *   tissue     - tissue name handed to createDataset for a new entry
 *   numSamples - sample count handed to createDataset for a new entry */
{
if (!sqlTableExists(biConn, DA_TABLE))
    {
    /* Newline-terminated like the other "creating..." messages in this
     * file, so that later stderr output starts on its own line. */
    fprintf(stderr, "Tables datasets doesn't exist, creating...\n");
    createDatasetsTable(biConn, DA_TABLE);
    }

struct datasets *da = findDataset(biConn, tableName);
if (!da)
    da = createDataset(biConn, tableName, tissue, numSamples);
return da;
}
char *findPatientName(struct sqlConnection *pdConn, char *pTable,
		      char *pField, char *sField, char *sName)
/* Select the pField column from pTable for the row whose sField column equals
 * sName, and return whatever sqlQuickString yields for that query. */
{
char sql[256];
safef(sql, sizeof(sql), "select %s from %s where %s = \"%s\"", pField, pTable, sField, sName);
char *patient = sqlQuickString(pdConn, sql);
return patient;
}
int findId(struct sqlConnection *biConn, char *idField, char *sField, char *name)
/* Choose an id for `name` within SA_TABLE:
 *   - 0 when the table is empty;
 *   - the idField value already stored for a row whose sField equals name;
 *   - otherwise one more than the largest idField value in the table. */
{
char sql[256];

if (sqlTableSize(biConn, SA_TABLE) == 0)
    return 0;

safef(sql, sizeof(sql),
      "select DISTINCT %s from %s where %s = \"%s\";",
      idField, SA_TABLE, sField, name);
if (!sqlExists(biConn, sql))
    {
    /* Name not seen before: hand out the next id after the current maximum. */
    safef(sql, sizeof(sql),
	  "select max(%s) from %s;",
	  idField, SA_TABLE);
    return sqlQuickNum(biConn, sql) + 1;
    }

/* Name already present: reuse its id. */
return sqlQuickNum(biConn, sql);
}
boolean sampleExists(struct sqlConnection *biConn, struct samples *sa)
/* Return TRUE when SA_TABLE already holds a row matching sa on every compared
 * column: id, name, patient_id, patient_name, dataset_id and tissue_id.
 * Revision 1.7 removes exp_id from the comparison. */
{
char query[256];
safef(query, sizeof(query),
"select * from %s where id = %d "
"and name = \"%s\" "
"and patient_id = %d "
"and patient_name = \"%s\" "
"and dataset_id = %d "
- "and exp_id = %d "
"and tissue_id = %d ",
SA_TABLE, sa->id, sa->name, sa->patient_id, sa->patient_name, sa->dataset_id,
- sa->exp_id, sa->tissue_id);
+ sa->tissue_id);
return sqlExists(biConn, query);
}
void createSamples(struct sqlConnection *biConn, struct datasets *da, struct maGrouping *allA)
/* Add one row to SA_TABLE for every array name in allA that is not already
 * stored there.  The patient database, table and field names come from the
 * dataset's datasets.ra settings (patDb, patTable, patField, sampleField),
 * all of which are required. */
{
int datasetId = da->id;
int tissueId = da->tissue_id;
struct hash *settings = getSettings(da->data_table);
struct hashEl *el = hashLookup(settings, "patDb");
if (!el)
errAbort("No patDb!");
char *patDb = cloneString(el->val);
el = hashLookup(settings, "patTable");
if (!el)
errAbort("No patTable");
char *patTable = cloneString(el->val);
el = hashLookup(settings, "patField");
if (!el)
errAbort("No patField");
char *patField = cloneString(el->val);
el = hashLookup(settings, "sampleField");
if (!el)
errAbort("No sampleField");
char *sampleField = cloneString(el->val);
/* Separate connection to the database holding the patient table. */
struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb);
int i;
struct samples *sa;
for (i = 0; i < allA->size; i++)
{
char *sampleName, *patientName;
if (isTCGA)
{
/* With -tcga both names are cut from the array name itself (size arguments
 * 16 and 12) rather than looked up in the patient table. */
sampleName = cloneStringZ(allA->names[i], 16);
patientName = cloneStringZ(allA->names[i], 12);
}
else
{
sampleName = cloneString(allA->names[i]);
/* NOTE(review): findPatientName returns the sqlQuickString result, which
 * may be NULL when no row matches -- confirm that is handled downstream. */
patientName = findPatientName(pdConn, patTable, patField, sampleField, sampleName);
}
- int expId = allA->expIds[i];
/* Known names keep their existing ids; new names get fresh ones. */
int sampleId = findId(biConn, "id", "name", sampleName);
int patientId = findId(biConn, "patient_id", "patient_name", patientName);
AllocVar(sa);
sa->id = sampleId;
sa->name = sampleName;
sa->patient_id = patientId;
sa->patient_name = patientName;
sa->dataset_id = datasetId;
- sa->exp_id = expId;
sa->tissue_id = tissueId;
if (!sampleExists(biConn, sa))
samplesSaveToDb(biConn, sa, SA_TABLE, 100);
samplesFree(&sa);
}
hFreeConn(&pdConn);
}
-void createSamplesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "patient_id int unsigned not null,\n");
-dyStringPrintf(dy, "patient_name varchar(255) not null,\n");
-dyStringPrintf(dy, "dataset_id int unsigned not null,\n");
-dyStringPrintf(dy, "exp_id int unsigned not null,\n");
-dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(dataset_id),\n");
-dyStringPrintf(dy, "KEY(id, dataset_id),\n");
-dyStringPrintf(dy, "KEY(dataset_id,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
struct samples *getSamples(struct sqlConnection *biConn, struct datasets *da)
/* Load the SA_TABLE rows belonging to dataset da.  Revision 1.7 orders them
 * by id instead of exp_id. */
{
char query[256];
safef(query, sizeof(query),
- "select * from %s where dataset_id = %d order by exp_id;",
+ "select * from %s where dataset_id = %d order by id;",
SA_TABLE, da->id);
return samplesLoadByQuery(biConn, query);
}
-
struct samples *setupSamples(struct sqlConnection *biConn, struct datasets *da,
struct maGrouping *allA)
/* Make sure SA_TABLE exists (creating it when missing), add the samples of
 * allA to it, and return the sample rows stored for dataset da. */
{
if (!sqlTableExists(biConn, SA_TABLE))
{
fprintf(stderr, "Table samples doesn't exist, creating...\n");
createSamplesTable(biConn, SA_TABLE);
}
createSamples(biConn, da, allA);
struct samples *saList = getSamples(biConn, da);
/* Revision 1.7 disables the check that the stored sample count equals
 * allA->size; the old check is kept below as commented-out code. */
-if (slCount(saList) != allA->size)
- errAbort("Sample count from microarrayGroups and database don't match!");
+
+//if (slCount(saList) != allA->size)
+// errAbort("Sample count from microarrayGroups and database don't match!");
return saList;
}
int getFeatureId(struct sqlConnection *biConn, char *name)
/* Choose an id for the feature called `name`: 0 when FE_TABLE is empty, the
 * stored id when a feature of that name exists, and otherwise the current
 * row count of FE_TABLE. */
{
char sql[256];

if (sqlTableSize(biConn, FE_TABLE) == 0)
    return 0;

safef(sql, sizeof(sql),
      "select id from %s where name = \"%s\";",
      FE_TABLE, name);
return sqlExists(biConn, sql)
    ? sqlQuickNum(biConn, sql)		/* feature already present, reuse its id */
    : sqlTableSize(biConn, FE_TABLE);
}
struct features *getFeature(struct sqlConnection *biConn, char *name)
/* Load the rows of FE_TABLE whose name equals `name`; the result is whatever
 * featuresLoadByQuery returns for that query. */
{
char sql[256];
safef(sql, sizeof(sql), "select * from %s where name = \"%s\";", FE_TABLE, name);
struct features *matches = featuresLoadByQuery(biConn, sql);
return matches;
}
boolean featureExists(struct sqlConnection *biConn, struct features *fs)
/* Return the sqlExists result for a lookup of fs->name in FE_TABLE; only the
 * name is compared. */
{
char sql[256];
safef(sql, sizeof(sql), "select * from %s where name = \"%s\";", FE_TABLE, fs->name);
boolean present = sqlExists(biConn, sql);
return present;
}
boolean clinicalDataExists(struct sqlConnection *biConn, struct clinicalData *cd)
/* Report whether CD_TABLE already holds a row for cd's (sample_id,
 * feature_id) pair.
 *   Returns FALSE when no such row exists.
 *   Returns TRUE when exactly one row exists and it agrees with cd.
 *   Aborts (errAbort) when more than one row exists, when the stored val
 *   differs from cd->val, or when both codes are set and differ. */
{
char query[256];
safef(query, sizeof(query),
      "select * from %s where sample_id = %d "
      "and feature_id = %d; ",
      CD_TABLE, cd->sample_id, cd->feature_id);
if (!sqlExists(biConn, query))  /* entry doesn't exist, report */
    return FALSE;

/* Make sure entry has same values, if not there is a problem
 * (sample_id, feature_id) should be unique */
struct clinicalData *stored = clinicalDataLoadByQuery(biConn, query);
if (slCount(stored) != 1)
    errAbort("clinicalData entries not unique, sample_id = %d, feature_id = %d",
	     cd->sample_id, cd->feature_id);
if (cd->val != stored->val)
    errAbort("clinicalData values don't match, sample_id = %d, feature_id = %d, "
	     "%f != %f",
	     cd->sample_id, cd->feature_id, cd->val, stored->val);
/* Codes are only compared when both sides have one. */
if (cd->code && stored->code && !sameString(cd->code, stored->code))
    errAbort("clinicalData codes don't match, sample_id = %d, feature_id = %d",
	     cd->sample_id, cd->feature_id);

/* Exactly one row was loaded (checked above), so freeing that single
 * element releases everything the query allocated; previously it leaked
 * once per call. */
clinicalDataFree(&stored);
return TRUE;
}
-void createFeaturesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "shortLabel varchar(255) not null,\n");
-dyStringPrintf(dy, "longLabel varchar(255) not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(name),\n");
-dyStringPrintf(dy, "KEY(id,name)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
-void createClinicalDataTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "sample_id int unsigned not null,\n");
-dyStringPrintf(dy, "feature_id int unsigned not null,\n");
-dyStringPrintf(dy, "val double not null,\n");
-dyStringPrintf(dy, "code varchar(255),\n");
-dyStringPrintf(dy, "KEY(sample_id),\n");
-dyStringPrintf(dy, "KEY(feature_id),\n");
-dyStringPrintf(dy, "KEY(sample_id,feature_id),\n");
-dyStringPrintf(dy, "KEY(feature_id,sample_id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
static char *requiredClinicalSetting(struct hash *settings, char *name)
/* Return a copy of the named datasets.ra setting, aborting with
 * "No <name>" when the setting is absent. */
{
struct hashEl *el = hashLookup(settings, name);
if (!el)
    errAbort("No %s", name);
return cloneString(el->val);
}

void setupClinicalInfo(struct sqlConnection *biConn, struct datasets *da, struct samples *saList)
/* Copy the clinical features of a dataset's samples into the database.
 *   biConn - connection to the database holding FE_TABLE and CD_TABLE
 *   da     - dataset whose datasets.ra settings (raFile, patDb, patTable,
 *            patField, sampleField; all required) locate the patient data
 *   saList - samples to process; nothing is done when it is NULL
 * FE_TABLE and CD_TABLE are created when missing.  Each column described by
 * the ra file becomes a feature (added to FE_TABLE unless one of that name
 * exists), and each sample with a value for that column gets a CD_TABLE row
 * unless an identical one is already stored. */
{
if (!saList)
    return;

if (!sqlTableExists(biConn, FE_TABLE))
    {
    fprintf(stderr, "Table features doesn't exist, creating...\n");
    createFeaturesTable(biConn, FE_TABLE);
    }
if (!sqlTableExists(biConn, CD_TABLE))
    {
    fprintf(stderr, "Table clinicalData doesn't exist, creating...\n");
    createClinicalDataTable(biConn, CD_TABLE);
    }

struct hash *settings = getSettings(da->data_table);
char *raFile = requiredClinicalSetting(settings, "raFile");
char *patDb = requiredClinicalSetting(settings, "patDb");
char *patTable = requiredClinicalSetting(settings, "patTable");
char *patField = requiredClinicalSetting(settings, "patField");
char *sampleField = requiredClinicalSetting(settings, "sampleField");

/* A setting may be present but carry no value. */
if (!raFile || !patDb || !patTable || !patField || !sampleField)
    errAbort("Incomplete ra entry for %s.", da->data_table);

struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb); //connection to patient data
if (DEBUG)
    fprintf(stderr, "Getting columns of clinical data...\n");
struct column *col, *colList = getColumns(pdConn, raFile, patDb);

/* Set up features */
struct features *fs;
for (col = colList; col; col = col->next)
    {
    char *name = col->name;
    int featureId = getFeatureId(biConn, name);

    AllocVar(fs);
    fs->id = featureId;
    fs->name = cloneString(name);
    fs->shortLabel = cloneString(col->shortLabel);
    fs->longLabel = cloneString(col->longLabel);
    if (!featureExists(biConn, fs))
	featuresSaveToDbEscaped(biConn, fs, FE_TABLE, 100);
    featuresFree(&fs);

    /* Read the feature back so the id used below is the one actually stored. */
    fs = getFeature(biConn, name);
    if (!fs)
	errAbort("Could not find feature %s.", name);
    if (slCount(fs) != 1)
	errAbort("Could not find unique feature by name = %s.", name);

    /* Loop through all samples, putting data in database */
    struct samples *sa;
    struct clinicalData *cd;
    for (sa = saList; sa; sa = sa->next)
	{
	/* getId hands back the sqlQuickString result, which is NULL when
	 * the sample has no row in the patient table.  Skip such samples
	 * rather than passing NULL on to slNameNew. */
	char *patientKey = getId(pdConn, patTable, patField, sa->name, sampleField);
	if (!patientKey)
	    continue;
	struct slName *key = slNameNew(patientKey);

	char *cellVal = col->cellVal(col, key, pdConn);
	if (cellVal)
	    {
	    AllocVar(cd);
	    cd->sample_id = sa->id;
	    cd->feature_id = fs->id;
	    cd->val = atof(cellVal);
	    cd->code = NULL;
	    if (col->cellCoded(col, pdConn))
		cd->code = cloneString(col->cellCodedVal(col, key, pdConn));
	    if (!clinicalDataExists(biConn, cd))
		clinicalDataSaveToDb(biConn, cd, CD_TABLE, 100);
	    clinicalDataFree(&cd);
	    }

	/* Released on every path, including when the column has no value
	 * for this sample (that path used to leak the key). */
	slNameFree(&key);
	}
    featuresFree(&fs);
    }
hFreeConn(&pdConn);
}
-void createProbeValsTable(struct sqlConnection *biConn, char *tableName)
+
+struct analysisFeatures *getAnalysisFeatures(struct sqlConnection *biConn,
+ char *names, char *type)
/* Added in revision 1.7; the '-' lines interleaved below are the removed
 * probe-table helpers.  Splits names with slNameListFromComma (presumably a
 * comma-separated list), then loads the rows of AF_TABLE whose type equals
 * `type` and whose feature_name is one of those names.  Returns NULL when
 * names is NULL.  Neither slList nor the cannibalized query string is freed
 * in this function. */
{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "probe_id int unsigned not null,\n");
-dyStringPrintf(dy, "sample_count int unsigned not null,\n");
-dyStringPrintf(dy, "sample_data longblob not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(probe_id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
-void createProbeInfoTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "chrom varchar(255) not null,\n");
-dyStringPrintf(dy, "start int unsigned not null,\n");
-dyStringPrintf(dy, "stop int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id),\n");
-dyStringPrintf(dy, "KEY(name),\n");
-dyStringPrintf(dy, "KEY(id,name)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
-void addProbeValsToDb(struct sqlConnection *biConn, char *tableName,
- int probe_id, int sample_count, char *dataString, int updateSize)
-{ /* This bypasses the need to convert to float array to enter in database, may not
- * be faster, but by converting first to float array any "blank" datapoints are
- * converted to 0.0, instead of leaving blank */
-struct dyString *update = newDyString(updateSize);
-dyStringPrintf(update, "insert into %s values ( %u,%u,'%s')",
- tableName, probe_id, sample_count, dataString );
-sqlUpdate(biConn, update->string);
-freeDyString(&update);
-}
+if (!names)
+ return NULL;
+struct slName *sl, *slList = slNameListFromComma(names);
+struct dyString *dy = newDyString(100);
+dyStringPrintf(dy,
+ "select * from %s where type = \"%s\" "
+ "and feature_name in (", AF_TABLE, type);
/* Emit each name as a quoted literal, separated by commas. */
+for (sl = slList; sl; sl = sl->next)
+ {
+ dyStringPrintf(dy, "\"%s\"", sl->name);
+ if (sl->next)
+ dyStringPrintf(dy, ",");
+ }
+dyStringPrintf(dy, ");");
+char *query = dyStringCannibalize(&dy);
-void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn,
- struct datasets *da)
-{
-char *dataTable = da->data_table;
-char *probeTable = da->probe_table;
+return analysisFeaturesLoadByQuery(biConn, query);
+}
-if (!dataTable || !probeTable)
- errAbort("datasets entry not complete, data_table or probe_table not set.");
-boolean inputProbeVals = FALSE;
-boolean inputProbeInfo = FALSE;
-if (sqlTableExists(biConn, dataTable) && dropTable)
+void addDataToHash(struct hash *dataHash, char *gene,
+ unsigned int expCount, float *expScores, struct maGrouping *allA)
/* Added in revision 1.7; the '-' lines interleaved below belong to the old
 * setupProbeData.  Files one probe's scores under a gene: dataHash maps a
 * gene name to a per-gene hash, which maps a sample name to a list of
 * slDouble values.  Score i is added to the list of the sample named by
 * allA->names[i] (cut with cloneStringZ(..., 16) when isTCGA is set). */
+{
+struct hash *hash;
+struct hashEl *el = hashLookup(dataHash, gene);
+if (!el)
 {
- fprintf(stderr, "probeVals table %s already exists in db, dropping...\n", dataTable);
- sqlDropTable(biConn, dataTable);
/* First data for this gene: start its per-sample hash. */
+ hash = hashNew(0);
+ hashAdd(dataHash, gene, hash);
 }
+else
+ hash = el->val;
-if (!sqlTableExists(biConn, dataTable))
+int i;
+for (i = 0; i < expCount; i++)
 {
- fprintf(stderr, "Creating probeVals table %s...\n", dataTable);
- createProbeValsTable(biConn, dataTable);
- inputProbeVals = TRUE; // empty table, input
- }
+ float val = expScores[i];
/* NOTE(review): name is cloned for every score and not freed in this
 * function -- confirm whether hashAdd keeps the pointer or copies it. */
+ char *name;
+ if (isTCGA)
+ name = cloneStringZ(allA->names[i], 16);
+ else
+ name = cloneString(allA->names[i]);
-if (sqlTableExists(biConn, probeTable) && dropTable)
+ struct slDouble *sd = slDoubleNew(val);
+
+ el = hashLookup(hash, name);
+ if (!el)
+ hashAdd(hash, name, sd);
+ else
 {
- fprintf(stderr, "probeInfo table %s already exists in db, dropping...\n", probeTable);
- sqlDropTable(biConn, probeTable);
/* Sample already has values for this gene: append to the end of its list. */
+ struct slDouble *sdList = el->val;
+ slAddTail(&sdList, sd);
 }
+ }
+}
+
+boolean reduceDataHash(struct hash *dataHash, double *retMed, double *retStd)
/* Added in revision 1.7; the '-' lines interleaved below belong to the old
 * setupProbeData and createGeneLookupTable.  Replaces every (gene, sample)
 * value list in dataHash by a single slDouble holding the list's median, and
 * gathers a copy of each median into one overall list.  From that list it
 * stores the overall median in *retMed and, in *retStd, a spread estimate
 * built from the elements at fractions 0.0015 and 1 - 0.0015 of the list
 * (largest distance from the median, divided by 3.0, times 1.43).
 * Always returns TRUE.  The overall list is not freed in this function. */
+{
+struct slDouble *sd, *allSd, *allSdList = NULL;
+struct hashEl *outEl;
+struct hashCookie cookie = hashFirst(dataHash);
+while ((outEl = hashNext(&cookie)) != NULL)
+ {
+ struct hash *hash = outEl->val;
-if (!sqlTableExists(biConn, probeTable))
/* Walk a separate element list from hashElListHash, since entries of the
 * per-gene hash are removed and re-added inside the loop. */
+ struct hashEl *inEl, *elList = hashElListHash(hash);
+ for (inEl = elList; inEl != NULL; inEl = inEl->next)
 {
- fprintf(stderr, "Creating probeInfo table %s...\n", probeTable);
- createProbeInfoTable(biConn, probeTable);
- inputProbeInfo = TRUE; // empty table, input
- }
+ char *sample = inEl->name;
+ struct slDouble *sdList = inEl->val;
+ double med = slDoubleMedian(sdList);
/* Two copies of the median: sd goes back into the hash, allSd into the
 * overall list. */
+ sd = slDoubleNew(med);
+ allSd = slDoubleNew(med);
-char query[256];
-safef(query, sizeof(query), "select * from %s;", dataTable);
+ slAddHead(&allSdList, allSd);
-/* Get bed15 data from hg18 database */
-int id = 0;
-struct sqlResult *sr = sqlGetResult(hgConn, query);
+ hashRemove(hash, sample);
+ slFreeList(&sdList);
-char **row = NULL;
-while ((row = sqlNextRow(sr)) != NULL)
- {
- char *chrom = row[1];
- unsigned chromStart = sqlUnsigned(row[2]);
- unsigned chromEnd = sqlUnsigned(row[3]);
- char *name = row[4];
- unsigned expCount = sqlUnsigned(row[13]);
- char *expScores = row[15];
+ hashAdd(hash, sample, sd);
+ }
+ hashElFreeList(&elList);
+ }
- /* Make probeInfo entry and load into db*/
- struct probeInfo *pi;
- AllocVar(pi);
- pi->id = id;
- pi->chrom = cloneString(chrom);
- pi->start = chromStart;
- pi->stop = chromEnd;
- pi->name = cloneString(name);
+double count = (double) slCount(allSdList);
+double allMedian = slDoubleMedian(allSdList);
- if (inputProbeInfo)
- probeInfoSaveToDb(biConn, pi, probeTable, 100);
/* NOTE(review): slSort is handed the list pointer itself here, whereas the
 * other list calls in this file pass the pointer's address (e.g.
 * slAddHead(&allSdList, ...)) -- confirm against slSort's signature. */
+slSort(allSdList, slDoubleCmp);
+int low = round(count * (0.0015));
+int high = round(count * (1.0 - 0.0015));
- /* Make probeVals entry and load into db, straight-up copying longblob*/
- if (inputProbeVals)
- addProbeValsToDb(biConn, dataTable, id, expCount, expScores, 500);
+sd = slElementFromIx(allSdList, low);
+double lowVal = sd->val;
- id++;
- probeInfoFree(&pi);
- }
-}
/* NOTE(review): for short lists round(count * 0.9985) equals count, one past
 * the last index -- confirm what slElementFromIx returns then, since sd is
 * dereferenced unconditionally. */
+sd = slElementFromIx(allSdList, high);
+double highVal = sd->val;
-void createGeneLookupTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "kgId varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id),\n");
-dyStringPrintf(dy, "KEY(kgId),\n");
-dyStringPrintf(dy, "KEY(id,kgId),\n");
-dyStringPrintf(dy, "KEY(kgId,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
+double mad = max(fabs(lowVal - allMedian), fabs(highVal - allMedian))/3.0;
+double std = mad * 1.43;
+
+*retStd = std;
+*retMed = allMedian;
+
+return TRUE;
}
-void createGeneLookup(struct sqlConnection *biConn)
+struct analysisVals *getAnalysisVals(struct sqlConnection *biConn, struct hash *dataHash,
+ double med, double std)
/* Added in revision 1.7; the '-' lines interleaved below belong to the
 * removed geneLookup / probeToGene helpers.  Turns the reduced dataHash
 * (gene -> sample -> one slDouble) into a list of analysisVals records.
 * Genes without a "gene" row in the analysis features table are skipped; a
 * sample name missing from the sample id hash aborts.  Each record's val is
 * the stored value; conf is -log10 of ndtr(-|z|) with z = (val - med)/std,
 * capped at maxLogP and given the sign of z.  The list is built head-first,
 * and af is not freed in this function.
 * NOTE(review): std is used as a divisor without a zero check -- confirm the
 * caller cannot pass 0. */
{
-if (!sqlTableExists(biConn, GL_TABLE))
- {
- fprintf(stderr, "geneLookup table doesn't exist in bioInt database, recreating it.\n");
- createGeneLookupTable(biConn, GL_TABLE);
- }
/* Presumably maps sample name to sample id (values are read back with
 * hashIntValDefault below) -- TODO confirm against createIdHash. */
+struct hash *sampleHash = createIdHash(biConn, SA_TABLE, "name");
-if (sqlTableSize(biConn, GL_TABLE) > 0)
+double z, p, val;
+double maxLogP = 88.0;
+struct hashEl *inEl, *outEl;
+struct analysisVals *av, *avList = NULL;
+struct hashCookie cookie = hashFirst(dataHash);
+while ((outEl = hashNext(&cookie)) != NULL)
 {
- fprintf(stderr, "geneLookup table already has data in it, doing nothing.\n");
- return;
- }
-
-if (!sqlTableExists(biConn, "knownGene"))
- errAbort("Need knownGene table in bioInt database.");
+ char *gene = outEl->name;
+ struct hash *hash = outEl->val;
-char query[256];
-safef(query, sizeof(query), "select name from knownGene;");
+ struct analysisFeatures *af = getAnalysisFeatures(biConn, gene, "gene");
-struct slName *sl, *slList = sqlQuickList(biConn, query);
/* No analysis feature for this gene: nothing to record. */
+ if (!af)
+ continue;
-int id = 0;
-struct geneLookup *gl;
-for (sl = slList; sl; sl = sl->next)
+ struct hashEl *elList = hashElListHash(hash);
+ for (inEl = elList; inEl != NULL; inEl = inEl->next)
 {
- AllocVar(gl);
- gl->id = id;
- gl->kgId = cloneString(sl->name);
-
- id++;
- geneLookupSaveToDb(biConn, gl, GL_TABLE, 100);
- geneLookupFree(&gl);
- }
-
-slNameFreeList(&slList);
-}
+ char *sample = inEl->name;
+ struct slDouble *sd = inEl->val;
-void createProbeToGeneTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "probe_id int unsigned not null,\n");
-dyStringPrintf(dy, "gene_id int unsigned not null,\n");
-dyStringPrintf(dy, "KEY(probe_id),\n");
-dyStringPrintf(dy, "KEY(gene_id),\n");
-dyStringPrintf(dy, "KEY(probe_id,gene_id),\n");
-dyStringPrintf(dy, "KEY(gene_id,probe_id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
+ int sample_id = hashIntValDefault(sampleHash, sample, -1);
+ if (sample_id == -1)
+ errAbort("No sample by name of %s\n", sample);
+
+ AllocVar(av);
+ av->sample_id = sample_id;
/* Only the first feature returned for the gene is used. */
+ av->feature_id = af->id;
+ av->val = sd->val;
+
+ z = (av->val - med)/std;
+ p = ndtr(-1.0*fabs(z));
/* p of exactly 0 would make the logarithm unbounded, so it maps straight to
 * the cap. */
+ if (p > 0)
+ val = min(-log(p)/log(10.0), maxLogP);
+ else
+ val = maxLogP;
-int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
-{
-char query[128];
-safef(query, sizeof(query),
- "select id from %s where name = \"%s\"",
- tableName, name);
+ if (z < 0.0)
+ val = -1.0*val; // signed log(p-value)
+ av->conf = val;
-if (!sqlExists(biConn, query))
- return -1;
+ slAddHead(&avList, av);
+ }
+ hashElFreeList(&elList);
+ }
-return sqlQuickNum(biConn, query);
+return avList;
}
-struct slInt *getGeneIdsBySymbol(struct sqlConnection *biConn,
- struct slName *slList)
+void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn,
+ struct datasets *da, struct maGrouping *allA)
+/* Fill da->data_table in biConn with analysisVals built from the rows of the
+ * same-named table read through hgConn.
+ * - aborts when da->data_table is not set
+ * - drops the table first when it exists and dropTable is set
+ * - returns without doing anything when the table still exists afterwards
+ * Each row is loaded as a 15-field bed (skipping the first column); probes
+ * found in the alias hash contribute their scores to every aliased gene.
+ * allA is handed unchanged to addDataToHash().
+ * NOTE(review): nothing here creates the table after the "Creating" message;
+ * presumably storeAnalysisValsInDb() does -- confirm. */
{
-if (!slList)
- return NULL;
+char *dataTable = da->data_table;
-struct slName *sl;
-struct dyString *dy = newDyString(100);
-dyStringPrintf(dy, "select id from %s "
- "join kgXref on %s.kgId = kgXref.kgId "
- "where kgXref.geneSymbol in (", GL_TABLE, GL_TABLE);
-for (sl = slList; sl; sl = sl->next)
+if (!dataTable)
+ errAbort("datasets entry not complete, data_table not set.");
+
+boolean inputProbeVals = FALSE;
+if (sqlTableExists(biConn, dataTable) && dropTable)
{
- dyStringPrintf(dy, "\"%s\"", sl->name);
- if (sl->next)
- dyStringPrintf(dy, ",");
+ fprintf(stderr, "analysisVals table %s already exists in db, dropping...\n", dataTable);
+ sqlDropTable(biConn, dataTable);
}
-dyStringPrintf(dy, ");");
-char *query = dyStringCannibalize(&dy);
-
-return sqlQuickNumList(biConn, query);
-}
-
-void setupProbeToGene(struct sqlConnection *hgConn,
- struct sqlConnection *biConn, struct datasets *da)
-{
-char *p2gTable = da->probe_to_gene_table;
-if (!p2gTable)
+if (!sqlTableExists(biConn, dataTable))
{
- fprintf(stderr, "probeToGene table not set, doing nothing.\n");
- return;
+ fprintf(stderr, "Creating analysisVals table %s...\n", dataTable);
+ inputProbeVals = TRUE; // empty table, input
}
-struct hash *settings = getSettings(da->data_table);
+if (!inputProbeVals)
+ return;
+
+struct hash *settings = getSettings(dataTable);
struct hashEl *el = hashLookup(settings, "aliasTable");
if (!el)
errAbort("No aliasTable.\n");
char *aliasTable = cloneString(el->val);
+struct hash *gaHash = getAliases(hgConn, aliasTable);
-if (!sqlTableExists(hgConn, aliasTable))
- errAbort("Table %s not found in hg18 database.\n", aliasTable);
+char query[256];
+safef(query, sizeof(query), "select * from %s;", dataTable);
-if (!sqlTableExists(biConn, "kgXref"))
- errAbort("kgXref table not found in database. Cannot create probeToGene table.\n");
+/* Get bed15 data from hg18 database */
+struct sqlResult *sr = sqlGetResult(hgConn, query);
-if (sqlTableExists(biConn, p2gTable) && dropTable)
- {
- fprintf(stderr, "Table %s already exists, dropping and recreating.\n", p2gTable);
- sqlDropTable(biConn, p2gTable);
- }
+struct hash *dataHash = hashNew(0);
-boolean inputProbeToGene = FALSE;
-if (!sqlTableExists(biConn, p2gTable))
+char **row = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
{
- fprintf(stderr, "Creating probeToGene table...\n");
- createProbeToGeneTable(biConn, p2gTable);
- inputProbeToGene = TRUE;
- }
+ struct bed *nb = bedLoadN(row+1, 15);
-if (!inputProbeToGene)
- return;
+ char *name = nb->name; // row[4];
+ unsigned expCount = nb->expCount; //sqlUnsigned(row[13]);
+ float *expScores = nb->expScores; // row[15];
-struct geneAlias *ga, *gaList = getAliases(hgConn, aliasTable);
-for (ga = gaList; ga; ga = ga->next)
- {
- int probeId = getProbeId(biConn, da->probe_table, ga->probe);
- if (probeId < 0) // probe in alias table doesn't exist in dataset
+ struct hashEl *el = hashLookup(gaHash, name);
+ if (!el)
continue;
+ struct geneAlias *ga = el->val;
+ struct slName *sl;
+ for (sl = ga->genes; sl; sl = sl->next)
+ addDataToHash(dataHash, sl->name, expCount, expScores, allA);
+ }
+/* all rows consumed; release the result set before the long reduction step */
+sqlFreeResult(&sr);
+
+if (hashNumEntries(dataHash) == 0)
+ errAbort("no entries in hash\n");
+
+fprintf(stderr, "\treducing hash...\n");
+double med, std;
+if (!reduceDataHash(dataHash, &med, &std))
+ errAbort("problem reducing hash\n");
+
+fprintf(stderr, "\tconverting hash to analysisVals...\n");
+struct analysisVals *avList = getAnalysisVals(biConn, dataHash, med, std);
+
+fprintf(stderr, "\tstoring analysisVals...\n");
+storeAnalysisValsInDb(biConn, dataTable, avList);
+analysisValsFreeList(&avList);
+}
- struct slInt *si, *geneIds = getGeneIdsBySymbol(biConn, ga->genes);
- struct probeToGene *pg;
- AllocVar(pg);
- pg->probe_id = probeId;
- for (si = geneIds; si; si = si->next)
- {
- pg->gene_id = si->val;
- probeToGeneSaveToDb(biConn, pg, p2gTable, 10);
- }
- probeToGeneFree(&pg);
- }
-}
+int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
+/* Look up the id of the row in tableName whose name column equals 'name'.
+ * Returns -1 when sqlExists() finds no matching row, otherwise the value
+ * sqlQuickNum() gives for the same query. */
+{
+char query[128];
+safef(query, sizeof(query),
+ "select id from %s where name = \"%s\"",
+ tableName, name);
-void createPathwaysTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "source varchar(255) not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(name),\n");
-dyStringPrintf(dy, "KEY(id,name),\n");
-dyStringPrintf(dy, "KEY(name,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
-void createPathwayGenesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "gene_id int unsigned not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(gene_id),\n");
-dyStringPrintf(dy, "KEY(id,gene_id),\n");
-dyStringPrintf(dy, "KEY(gene_id,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
-void createPathwayInfoTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "description longblob not null,\n");
-dyStringPrintf(dy, "KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
+/* sqlQuickNum alone is not used to signal "no row", hence the explicit check */
+if (!sqlExists(biConn, query))
+ return -1;
+
+return sqlQuickNum(biConn, query);
}
+
char *getPathwayDescription(struct sqlConnection *pdConn, char *name)
+/* Fetch the description stored for 'name' in the descriptions table of the
+ * database behind pdConn. Returns NULL when sqlTableExists() reports no
+ * table called 'name'; otherwise returns the sqlQuickString() result.
+ * NOTE(review): the existence test uses 'name' as a table name while the
+ * query reads from 'descriptions' -- confirm this is intended. */
{
if (!sqlTableExists(pdConn, name))
return NULL;
char query[128];
safef(query, sizeof(query),
"select description from descriptions where name = \"%s\";",
name);
return sqlQuickString(pdConn, query);
}
-void setupPathways(struct sqlConnection *biConn)
+void setupGenesets(struct sqlConnection *biConn)
+/* Create (dropping first when dropTable is set) the GE_TABLE, GG_TABLE and
+ * GI_TABLE tables in biConn, then fill each newly created one from the
+ * genesets table of the "pathway" database. Returns early when none of the
+ * three tables had to be created. Ids come from the analysis feature looked
+ * up for each geneset name; rows whose geneset or member features cannot be
+ * found are skipped. */
{
-boolean inputPathways = FALSE;
-boolean inputPathwayInfo = FALSE;
-boolean inputPathwayGenes = FALSE;
+boolean inputGenesets = FALSE;
+boolean inputGenesetInfo = FALSE;
+boolean inputGenesetGenes = FALSE;
struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
if (!pdConn)
errAbort("Could not connect to pathways database.\n");
-if (sqlTableExists(biConn, PA_TABLE) && dropTable)
+if (sqlTableExists(biConn, GE_TABLE) && dropTable)
{
- fprintf(stderr, "pathways table already exists, dropping and recreating.\n");
- sqlDropTable(biConn, PA_TABLE);
+ fprintf(stderr, "%s table already exists, dropping and recreating.\n", GE_TABLE);
+ sqlDropTable(biConn, GE_TABLE);
}
-if (!sqlTableExists(biConn, PA_TABLE))
+if (!sqlTableExists(biConn, GE_TABLE))
{
- fprintf(stderr, "Creating pathways table.\n");
- createPathwaysTable(biConn, PA_TABLE);
- inputPathways = TRUE;
+ fprintf(stderr, "Creating %s table.\n", GE_TABLE);
+ createGenesetsTable(biConn, GE_TABLE);
+ inputGenesets = TRUE;
}
-if (sqlTableExists(biConn, PG_TABLE) && dropTable)
+if (sqlTableExists(biConn, GG_TABLE) && dropTable)
{
- fprintf(stderr, "pathwayGenes table already exists, dropping and recreating.\n");
- sqlDropTable(biConn, PG_TABLE);
+ fprintf(stderr, "%s table already exists, dropping and recreating.\n", GG_TABLE);
+ sqlDropTable(biConn, GG_TABLE);
}
-if (!sqlTableExists(biConn, PG_TABLE))
+if (!sqlTableExists(biConn, GG_TABLE))
{
- fprintf(stderr, "Creating pathwayGenes table.\n");
- createPathwayGenesTable(biConn, PG_TABLE);
- inputPathwayGenes = TRUE;
+ fprintf(stderr, "Creating %s table.\n", GG_TABLE);
+ createGenesetGenesTable(biConn, GG_TABLE);
+ inputGenesetGenes = TRUE;
}
-if (sqlTableExists(biConn, PI_TABLE) && dropTable)
+if (sqlTableExists(biConn, GI_TABLE) && dropTable)
{
- fprintf(stderr, "pathwayInfo table already exists, dropping and recreating.\n");
- sqlDropTable(biConn, PI_TABLE);
+ fprintf(stderr, "%s table already exists, dropping and recreating.\n", GI_TABLE);
+ sqlDropTable(biConn, GI_TABLE);
}
-if (!sqlTableExists(biConn, PI_TABLE))
+if (!sqlTableExists(biConn, GI_TABLE))
{
- fprintf(stderr, "Creeting pathwayInfo table.\n");
- createPathwayInfoTable(biConn, PI_TABLE);
- inputPathwayInfo = TRUE;
+ fprintf(stderr, "Creating %s table.\n", GI_TABLE);
+ createGenesetInfoTable(biConn, GI_TABLE);
+ inputGenesetInfo = TRUE;
}
-if (!inputPathways && !inputPathwayInfo && !inputPathwayGenes)
+if (!inputGenesets && !inputGenesetInfo && !inputGenesetGenes)
{
- fprintf(stderr, "Nothing to do for pathway tables.\n");
+ fprintf(stderr, "Nothing to do for geneset tables.\n");
+ hFreeConn(&pdConn);
return;
}
/* Setting up pathways table */
char query[128];
safef(query, sizeof(query), "select * from genesets;");
struct sqlResult *sr = sqlGetResult(pdConn, query);
char **row = NULL;
/* Save all data in lists to avoid "out of sync" error when attempting
* query inside of a running query on same db */
struct slName *na, *names = NULL;
struct slName *ge, *genes = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
slNameAddHead(&names, row[0]);
slNameAddHead(&genes, row[1]);
}
slReverse(&names);
slReverse(&genes);
sqlFreeResult(&sr);
-int id = 0;
for (na = names, ge = genes; na && ge; na = na->next, ge = ge->next)
{
char *name = na->name;
- char *genes = ge->name;
+ char *members = ge->name;
- struct slName *slList = slNameListFromComma(genes);
- struct slInt *si, *siList = getGeneIdsBySymbol(biConn, slList);
+ struct analysisFeatures *gsAf = getAnalysisFeatures(biConn, name, "geneset");
+ struct analysisFeatures *af, *afList = getAnalysisFeatures(biConn, members, "gene");
- struct pathways *ps;
+ if (!gsAf || !afList)
+ {
+ /* one lookup came back empty: release the other before skipping */
+ analysisFeaturesFree(&gsAf);
+ analysisFeaturesFreeList(&afList);
+ continue;
+ }
- if (inputPathways)
+ if (inputGenesets)
{
- AllocVar(ps);
- ps->id = id;
- ps->name = cloneString(name);
- ps->source = cloneString("N/A");
- pathwaysSaveToDb(biConn, ps, PA_TABLE, 100);
- pathwaysFree(&ps);
+ struct genesets *gs;
+ AllocVar(gs);
+ gs->id = gsAf->id;
+ gs->name = cloneString(name);
+ gs->source = cloneString("N/A");
+ genesetsSaveToDb(biConn, gs, GE_TABLE, 100);
+ genesetsFree(&gs);
}
- if (inputPathwayGenes)
+ if (inputGenesetGenes)
{
- struct pathwayGenes *pg;
- AllocVar(pg);
- pg->id = id;
- for (si = siList; si; si = si->next)
+ struct genesetGenes *gg;
+ AllocVar(gg);
+ gg->id = gsAf->id;
+ for (af = afList; af; af = af->next)
{
- pg->gene_id = si->val;
- pathwayGenesSaveToDb(biConn, pg, PG_TABLE, 100);
+ gg->gene_id = af->id;
+ genesetGenesSaveToDb(biConn, gg, GG_TABLE, 100);
}
- pathwayGenesFree(&pg);
+ genesetGenesFree(&gg);
}
- if (inputPathwayInfo)
+ if (inputGenesetInfo)
{
char *desc = getPathwayDescription(pdConn, name);
if (desc)
{
- struct pathwayInfo *pi;
- AllocVar(pi);
- pi->id = id;
- pi->description = desc;
- pathwayInfoSaveToDbEscaped(biConn, pi, PI_TABLE, 200);
- pathwayInfoFree(&pi);
+ struct genesetInfo *gi;
+ AllocVar(gi);
+ gi->id = gsAf->id;
+ gi->description = desc;
+ genesetInfoSaveToDbEscaped(biConn, gi, GI_TABLE, 200);
+ genesetInfoFree(&gi);
+ }
}
+
+ analysisFeaturesFree(&gsAf);
+ analysisFeaturesFreeList(&afList);
+ }
+
+/* the buffered name/member lists are no longer needed */
+slNameFreeList(&names);
+slNameFreeList(&genes);
+hFreeConn(&pdConn);
+}
+
+boolean analysisFeatureExists(struct sqlConnection *biConn, struct analysisFeatures *af)
+/* Report whether AF_TABLE already holds a row whose id, feature_name and
+ * type all match those of 'af' (the result of sqlExists on that query). */
+{
+char sql[256];
+boolean found;
+
+safef(sql, sizeof(sql),
+      "select * from %s where id = %d and feature_name = \"%s\" and type = \"%s\"",
+      AF_TABLE, af->id, af->feature_name, af->type);
+found = sqlExists(biConn, sql);
+return found;
+}
+
+int findIdForAnalysisFeature(struct sqlConnection *biConn, char *tableName,
+                             struct analysisFeatures *af)
+/* Pick the id to store for 'af' in tableName:
+ *   - 0 when the table has no rows,
+ *   - the id of an existing row with the same feature_name and type,
+ *   - otherwise one more than the largest id currently in the table. */
+{
+char sql[256];
+
+/* An empty table starts numbering at 0. */
+if (sqlTableSize(biConn, tableName) == 0)
+    return 0;
+
+safef(sql, sizeof(sql),
+      "select DISTINCT id from %s where feature_name = \"%s\" and type = \"%s\";",
+      tableName, af->feature_name, af->type);
+if (!sqlExists(biConn, sql))
+    {
+    /* Feature not stored yet: hand out the next unused id. */
+    safef(sql, sizeof(sql), "select max(id) from %s;", tableName);
+    return sqlQuickNum(biConn, sql) + 1;
+    }
+
+/* Feature already stored: reuse its id. */
+return sqlQuickNum(biConn, sql);
+}
+
+void setupAnalysisFeatures(struct sqlConnection *biConn)
+/* Create AF_TABLE when missing and, only if it is empty, populate it with
+ * one "gene" feature per distinct geneSymbol in KX_TABLE followed by one
+ * "geneset" feature per name in the genesets table of the "pathway"
+ * database. Does nothing (beyond a message) when AF_TABLE already has rows. */
+{
+if (!sqlTableExists(biConn, AF_TABLE))
+ createAnalysisFeaturesTable(biConn, AF_TABLE);
+
+if (sqlTableSize(biConn, AF_TABLE) != 0)
+ {
+ fprintf(stderr, "%s table is not empty, doing nothing\n", AF_TABLE);
+ return;
}
- id++;
+
+/* set up gene features */
+char query[256];
+safef(query, sizeof(query),
+ "select DISTINCT geneSymbol from %s;", KX_TABLE);
+struct slName *sl, *slList = sqlQuickList(biConn, query);
+
+struct analysisFeatures *af, *afList = NULL;
+
+for (sl = slList; sl; sl = sl->next)
+ {
+ AllocVar(af);
+ af->id = 0;
+ af->feature_name = cloneString(sl->name);
+ af->type = cloneString("gene");
+
+ slAddHead(&afList, af);
}
+slNameFreeList(&slList);
+
+/* set up geneset features */
+struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
+if (!pdConn)
+ errAbort("Could not connect to pathways database.\n");
+safef(query, sizeof(query),
+ "select name from genesets;");
+slList = sqlQuickList(pdConn, query);
+for (sl = slList; sl; sl = sl->next)
+ {
+ AllocVar(af);
+ af->id = 0;
+ af->feature_name = cloneString(sl->name);
+ af->type = cloneString("geneset");
+
+ slAddHead(&afList, af);
+ }
hFreeConn(&pdConn);
+slNameFreeList(&slList);
+
+/* features were pushed on the head; restore insertion order */
+slReverse(&afList);
+
+/* set up pathway features (TODO) */
+
+
+/* save features to db */
+for (af = afList; af; af = af->next)
+ {
+ int feature_id = findIdForAnalysisFeature(biConn, AF_TABLE, af);
+ af->id = feature_id;
+ if (!analysisFeatureExists(biConn, af))
+ analysisFeaturesSaveToDb(biConn, af, AF_TABLE, 10);
+ }
+
+/* afList is a list: release every element, not just the head */
+analysisFeaturesFreeList(&afList);
}
void populateDb(char *db, char *tableName, char *tissue)
+/* Drive the whole import of dataset 'tableName' into database 'db':
+ * analysis features, geneset tables, the datasets entry, samples, clinical
+ * data and finally the probe/analysis values. 'tissue' is passed through
+ * strLower() before use. Opens connections to 'db' and to hgDb under the
+ * "localDb" profile and releases both before returning. */
{
tissue = strLower(tissue);
struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
struct sqlConnection *hgConn = hAllocConnProfile("localDb", hgDb);
-/* Create geneLookup table (if necessary) */
+/* Create analysis features (if necessary) */
uglyTime(NULL);
-fprintf(stderr, "Setting up geneLookup table...\n");
-createGeneLookup(biConn);
+fprintf(stderr, "Setting up analysis features...\n");
+setupAnalysisFeatures(biConn);
uglyTime("Time");
+/* Create geneLookup table (if necessary) */
+//uglyTime(NULL);
+//fprintf(stderr, "Setting up geneLookup table...\n");
+//createGeneLookup(biConn);
+//uglyTime("Time");
+
/* Set up pathways */
uglyTime(NULL);
fprintf(stderr, "Setting up pathways tables...\n");
-setupPathways(biConn);
+setupGenesets(biConn);
uglyTime("Time");
/* Set up datasets entry */
struct maGrouping *allA = getMaGrouping(hgConn, tableName);
if (!allA)
errAbort("Could not find maGrouping for %s!", tableName);
uglyTime(NULL);
int numSamples = allA->size;
fprintf(stderr, "Adding datasets entry...\n");
struct datasets *da = setupDataset(biConn, tableName, tissue, numSamples);
uglyTime("Time");
/* Set up samples entries */
uglyTime(NULL);
fprintf(stderr, "Adding to samples table...\n");
struct samples *saList = setupSamples(biConn, da, allA);
uglyTime("Time");
/* Set up features and clinicalData */
uglyTime(NULL);
fprintf(stderr, "Setting up clinical data tables...\n");
setupClinicalInfo(biConn, da, saList);
uglyTime("Time");
/* Set up probeInfo table (if necessary) and probeVals table */
uglyTime(NULL);
fprintf(stderr, "Setting up probe data tables (be patient!)...\n");
-setupProbeData(hgConn, biConn, da);
-uglyTime("Time");
-
-/* Set up probeToGene table (if necessary) */
-uglyTime(NULL);
-fprintf(stderr, "Setting up probeToGene table...\n");
-setupProbeToGene(hgConn, biConn, da);
+setupProbeData(hgConn, biConn, da, allA);
uglyTime("Time");
-fprintf(stderr, "Done!");
+/* end the line so the follow-up hint does not run into "Done!" */
+fprintf(stderr, "Done!\n");
fprintf(stderr, "Please run 'setCohort' to find datasets that have overlapping samples.\n");
hFreeConn(&biConn);
hFreeConn(&hgConn);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
+/* usage() is shown unless argc is exactly 4 after option parsing
+ * (program name plus db, table and tissue) */
if (argc != 4)
usage();
+/* -dropAll: tables that already exist are dropped and rebuilt */
dropTable = FALSE;
if (optionExists("dropAll"))
dropTable = TRUE;
+/* -tcga: sets the isTCGA flag for TCGA-specific id handling */
if (optionExists("tcga"))
isTCGA = TRUE;
populateDb(argv[1], argv[2], argv[3]);
return 0;
}