src/hg/instinct/bioInt2/populateDb.c 1.9
1.9 2009/04/27 22:05:48 jsanborn
added drill down into meta-genes
Index: src/hg/instinct/bioInt2/populateDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/populateDb.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -b -B -U 1000000 -r1.8 -r1.9
--- src/hg/instinct/bioInt2/populateDb.c 27 Apr 2009 18:13:34 -0000 1.8
+++ src/hg/instinct/bioInt2/populateDb.c 27 Apr 2009 22:05:48 -0000 1.9
@@ -1,1216 +1,1216 @@
/* populateDb - Loads one dataset (samples, clinical data and gene-level analysis values), together with the gene and geneset analysis features, into a bioInt database. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "bed.h"
#include "genePred.h"
#include "hPrint.h"
#include "hdb.h"
#include "microarray.h"
#include "ra.h"
#include "featuresLib.h"
#include "hgHeatmapLib.h"
#include "cprob.h"
#include "hgStatsLib.h"
#include "bioIntDriver.h"
#include "bioIntDb.h"
/* Name of the genome database that populateDb() connects to for the source
 * data tables. */
char *hgDb = "hg18";
/* Genome label; not referenced anywhere in the visible part of this file. */
char *genome = "Human";
void usage()
/* Explain usage and exit (the message is reported through errAbort). */
{
static char *helpText =
    "populateDb \n"
    " populateDb [OPTIONS] db table tissue\n"
    "options:\n"
    " -dropAll Drop/recreate any table\n"
    " -tcga handles TCGA ids\n"
    "\n";
/* helpText contains no '%' so printing it through "%s" yields the same text. */
errAbort("%s", helpText);
}
boolean dropTable = FALSE; // If true, tables guarded by this flag are dropped and recreated when they already exist (set by -dropAll in main)
boolean isTCGA = FALSE; // If true, specially handle TCGA ids (set by -tcga in main)
/* Command-line switches handed to optionInit() in main(); both are boolean. */
static struct optionSpec options[] = {
{"dropAll", OPTION_BOOLEAN}, /* checked in main() to set dropTable */
{"tcga", OPTION_BOOLEAN}, /* checked in main() to set isTCGA */
{NULL, 0}, /* all-NULL final entry (presumably the list terminator) */
};
char *getId(struct sqlConnection *conn, char *table, char *key, char *sample, char *value)
/* Select column `key` from `table` for the row whose `value` column equals
 * `sample`, and hand back whatever sqlQuickString returns for that query. */
{
char sql[512];
safef(sql, sizeof(sql), "select %s from %s where %s = \"%s\" ", key, table, value, sample);
char *found = sqlQuickString(conn, sql);
return found;
}
struct slName *getProbesFromTable(struct sqlConnection *hgConn, char *tableName)
/* Return the distinct values of the "name" column of tableName as an slName
 * list, kept in the order the query produced them. */
{
char *field = "name";
char sql[512];
safef(sql, sizeof(sql), "select DISTINCT %s from %s ", field, tableName);

struct slName *probes = NULL;
struct sqlResult *res = sqlGetResult(hgConn, sql);
char **fields;
for (fields = sqlNextRow(res); fields != NULL; fields = sqlNextRow(res))
    {
    struct slName *probe = slNameNew(fields[0]);
    slAddHead(&probes, probe);
    }
/* Entries were pushed on the head, so flip back to query order. */
slReverse(&probes);
sqlFreeResult(&res);
return probes;
}
struct maGrouping *getMaGrouping(struct sqlConnection *hgConn, char *tableName)
/* Return the allArrays grouping of the microarray groups for tableName, or
 * NULL when maGroupings() finds nothing.
 * NOTE(review): the database is the literal "hg18" here rather than the
 * file-level hgDb variable -- confirm that is intended. */
{
/* The trackDb entry is looked up and released again without being read;
 * only the microarray groupings are used. */
struct trackDb *trackInfo = hMaybeTrackInfo(hgConn, tableName);
struct microarrayGroups *groups = maGroupings("hg18", tableName);
trackDbFreeList(&trackInfo);
return (groups != NULL) ? groups->allArrays : NULL;
}
struct hash *getSettings(char *tableName)
/* Find the datasets.ra entry whose name equals tableName and return its
 * settings hash.  Aborts when there is no such entry or it has no settings. */
{
struct column *entry = getColumns(NULL, "datasets.ra", NULL);
/* Advance to the first entry with a matching name (or off the end). */
while (entry != NULL && !sameString(entry->name, tableName))
    entry = entry->next;

struct hash *found = (entry != NULL) ? entry->settings : NULL;
if (found == NULL)
    errAbort("Couldn't find datasets.ra listing for %s", tableName);
return found;
}
/* One probe and the genes it maps to; built by getAliases(). */
struct geneAlias {
struct geneAlias *next; /* link to the next alias (set by slAddHead in getAliases) */
char *probe; /* probe name, copied from the first column of the alias table */
struct slName *genes; /* gene names collected from the second column for this probe */
};
struct hash *getAliases(struct sqlConnection *hgConn, char *tableName)
/* Read every row of tableName (column 0 = probe, column 1 = gene) and return
 * a hash keyed by probe name whose values are struct geneAlias, each holding
 * the list of genes seen for that probe.  Returns NULL when either argument
 * is NULL. */
{
if (!hgConn || !tableName)
    return NULL;
char query[512];
char **row;
safef(query, sizeof(query), "select * from %s", tableName);
struct sqlResult *sr = sqlGetResult(hgConn, query);
struct geneAlias *ga, *gaList = NULL;
struct hash *gaHash = hashNew(0);
while ((row = sqlNextRow(sr)) != NULL)
    {
    /* Use the row strings directly.  The original cloned both and never
     * freed them, leaking two strings per row: ga->probe gets its own clone
     * below, and hashAdd / slNameAddHead copy the text they are given. */
    char *probe = row[0];
    char *gene = row[1];
    struct hashEl *el = hashLookup(gaHash, probe);
    if (!el)
        {
        ga = AllocA(struct geneAlias);
        ga->probe = cloneString(probe);
        ga->genes = NULL;
        /* Kept so that the aliases stay chained through ->next as before. */
        slAddHead(&gaList, ga);
        hashAdd(gaHash, probe, ga);
        }
    else
        ga = el->val;
    slNameAddHead(&ga->genes, gene);
    }
sqlFreeResult(&sr);
return gaHash;
}
struct dataTypes *findDataType(struct sqlConnection *biConn, char *type, char *platform)
/* Load the DT_TABLE rows with format "analysisVals" and name == platform.
 * Aborts unless type is exactly "bed 15". */
{
if (!sameString(type, "bed 15"))
    errAbort("populateDb only runs on bed 15 files.");

char sql[256];
safef(sql, sizeof(sql),
      "select * from %s where format = \"%s\" and name = \"%s\"",
      DT_TABLE, "analysisVals", platform);
struct dataTypes *matches = dataTypesLoadByQuery(biConn, sql);
return matches;
}
struct dataTypes *createDataType(struct sqlConnection *biConn, char *type, char *platform)
/* Build a new dataTypes record for platform (format "analysisVals"), using
 * the current row count of DT_TABLE as its id, save it and return it. */
{
struct dataTypes *entry;
AllocVar(entry);
entry->id = sqlTableSize(biConn, DT_TABLE);
entry->format = cloneString("analysisVals");
entry->name = cloneString(platform);

/* Persist the new record. */
dataTypesSaveToDb(biConn, entry, DT_TABLE, 100);
return entry;
}
struct dataTypes *setupDataType(struct sqlConnection *biConn,
                                char *type, char *platform)
/* Make sure DT_TABLE exists, then return the existing record for platform
 * or, when none is found, a freshly created one. */
{
if (!sqlTableExists(biConn, DT_TABLE))
    {
    fprintf(stderr, "Tables dataTypes doesn't exist, creating...\n");
    createDataTypesTable(biConn, DT_TABLE);
    }

struct dataTypes *existing = findDataType(biConn, type, platform);
if (existing != NULL)
    return existing;
return createDataType(biConn, type, platform);
}
struct tissues *findTissue(struct sqlConnection *biConn, char *tissue)
/* Load the TI_TABLE rows whose name equals tissue. */
{
char sql[256];
safef(sql, sizeof(sql),
      "select * from %s where name = \"%s\";",
      TI_TABLE, tissue);
struct tissues *matches = tissuesLoadByQuery(biConn, sql);
return matches;
}
struct tissues *createTissue(struct sqlConnection *biConn, char *tissue)
/* Build a tissues record named tissue, with the current row count of
 * TI_TABLE as its id, save it and return it. */
{
struct tissues *entry;
AllocVar(entry);
entry->id = sqlTableSize(biConn, TI_TABLE);
entry->name = cloneString(tissue);

/* Persist the new record. */
tissuesSaveToDb(biConn, entry, TI_TABLE, 100);
return entry;
}
struct tissues *setupTissue(struct sqlConnection *biConn, char *tissue)
/* Make sure TI_TABLE exists, then return the existing record for tissue or,
 * when none is found, a freshly created one. */
{
if (!sqlTableExists(biConn, TI_TABLE))
    {
    fprintf(stderr, "Tables tissues doesn't exist, creating...\n");
    createTissuesTable(biConn, TI_TABLE);
    }

struct tissues *existing = findTissue(biConn, tissue);
if (existing != NULL)
    return existing;
return createTissue(biConn, tissue);
}
struct datasets *findDataset(struct sqlConnection *biConn, char *name)
/* Load the DA_TABLE rows whose data_table column equals name. */
{
char sql[256];
safef(sql, sizeof(sql),
      "select * from %s where data_table = \"%s\";",
      DA_TABLE, name);
struct datasets *matches = datasetsLoadByQuery(biConn, sql);
return matches;
}
struct datasets *createDataset(struct sqlConnection *biConn,
                               char *tableName, char *tissue, int numSamples)
/* Create, save and return a datasets record for tableName.
 * The datasets.ra settings must provide shortLabel, name and dataType
 * (aborts otherwise); platform defaults to "Expression" when absent.
 * The record's id is the current row count of DA_TABLE, and its type and
 * tissue ids come from setupDataType() / setupTissue(). */
{
struct hash *settings = getSettings(tableName);
struct hashEl *el = hashLookup(settings, "shortLabel");
if (!el)
    errAbort("No shortLabel");
char *shortLabel = cloneString(el->val);   /* stored in the record */
el = hashLookup(settings, "name");
if (!el)
    errAbort("No name");
char *dataTable = cloneString(el->val);    /* stored in the record */

/* platform and dataType are only read below (createDataType() makes its own
 * copy of platform), so they are used in place.  The original cloned both
 * and never freed the copies. */
el = hashLookup(settings, "platform");
char *platform = el ? (char *) el->val : "Expression";
el = hashLookup(settings, "dataType");
if (!el)
    errAbort("No dataType");
char *dataType = el->val;

struct dataTypes *dt = setupDataType(biConn, dataType, platform);
struct tissues *ti = setupTissue(biConn, tissue);
int nextId = sqlTableSize(biConn, DA_TABLE);
struct datasets *da;
AllocVar(da);
da->id = nextId;
da->tissue_id = ti->id;
da->type_id = dt->id;
da->num_samples = numSamples;
da->name = shortLabel;
da->data_table = dataTable;
dataTypesFree(&dt);
tissuesFree(&ti);
/* Write datasets */
datasetsSaveToDbEscaped(biConn, da, DA_TABLE, 100);
return da;
}
struct datasets *setupDataset(struct sqlConnection *biConn,
                              char *tableName, char *tissue, int numSamples)
/* Make sure DA_TABLE exists, then return the existing datasets record whose
 * data_table is tableName or, when none is found, a freshly created one. */
{
if (!sqlTableExists(biConn, DA_TABLE))
    {
    /* Newline added: without it the next stderr message ran onto this line,
     * unlike the equivalent messages for the other tables. */
    fprintf(stderr, "Tables datasets doesn't exist, creating...\n");
    createDatasetsTable(biConn, DA_TABLE);
    }
struct datasets *da = findDataset(biConn, tableName);
if (!da)
    da = createDataset(biConn, tableName, tissue, numSamples);
return da;
}
char *findPatientName(struct sqlConnection *pdConn, char *pTable,
                      char *pField, char *sField, char *sName)
/* Select pField from pTable for the row whose sField equals sName and
 * return the sqlQuickString result. */
{
char sql[256];
safef(sql, sizeof(sql),
      "select %s from %s where %s = \"%s\"",
      pField, pTable, sField, sName);
char *patient = sqlQuickString(pdConn, sql);
return patient;
}
int findId(struct sqlConnection *biConn, char *idField, char *sField, char *name)
/* Pick the idField value to use for `name` in SA_TABLE:
 *   - 0 when the table is empty,
 *   - the existing value when a row with sField == name is present,
 *   - otherwise one more than the current maximum of idField. */
{
if (sqlTableSize(biConn, SA_TABLE) == 0)
    return 0;

char sql[256];
safef(sql, sizeof(sql),
      "select DISTINCT %s from %s where %s = \"%s\";",
      idField, SA_TABLE, sField, name);
if (!sqlExists(biConn, sql))
    {
    /* Name not seen yet: allocate the next id after the largest one. */
    safef(sql, sizeof(sql),
          "select max(%s) from %s;",
          idField, SA_TABLE);
    return sqlQuickNum(biConn, sql) + 1;
    }
/* Name already present: reuse its id. */
return sqlQuickNum(biConn, sql);
}
boolean sampleExists(struct sqlConnection *biConn, struct samples *sa)
/* Report whether SA_TABLE already has a row matching every field of sa
 * (id, name, patient_id, patient_name, dataset_id, tissue_id). */
{
char sql[256];
safef(sql, sizeof(sql),
      "select * from %s where id = %d "
      "and name = \"%s\" "
      "and patient_id = %d "
      "and patient_name = \"%s\" "
      "and dataset_id = %d "
      "and tissue_id = %d ",
      SA_TABLE, sa->id, sa->name, sa->patient_id, sa->patient_name, sa->dataset_id,
      sa->tissue_id);
boolean found = sqlExists(biConn, sql);
return found;
}
void createSamples(struct sqlConnection *biConn, struct datasets *da, struct maGrouping *allA)
{
int datasetId = da->id;
int tissueId = da->tissue_id;
struct hash *settings = getSettings(da->data_table);
struct hashEl *el = hashLookup(settings, "patDb");
if (!el)
errAbort("No patDb!");
char *patDb = cloneString(el->val);
el = hashLookup(settings, "patTable");
if (!el)
errAbort("No patTable");
char *patTable = cloneString(el->val);
el = hashLookup(settings, "patField");
if (!el)
errAbort("No patField");
char *patField = cloneString(el->val);
el = hashLookup(settings, "sampleField");
if (!el)
errAbort("No sampleField");
char *sampleField = cloneString(el->val);
struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb);
int i;
struct samples *sa;
for (i = 0; i < allA->size; i++)
{
char *sampleName, *patientName;
if (isTCGA)
{
sampleName = cloneStringZ(allA->names[i], 16);
patientName = cloneStringZ(allA->names[i], 12);
}
else
{
sampleName = cloneString(allA->names[i]);
patientName = findPatientName(pdConn, patTable, patField, sampleField, sampleName);
}
int sampleId = findId(biConn, "id", "name", sampleName);
int patientId = findId(biConn, "patient_id", "patient_name", patientName);
AllocVar(sa);
sa->id = sampleId;
sa->name = sampleName;
sa->patient_id = patientId;
sa->patient_name = patientName;
sa->dataset_id = datasetId;
sa->tissue_id = tissueId;
if (!sampleExists(biConn, sa))
samplesSaveToDb(biConn, sa, SA_TABLE, 100);
samplesFree(&sa);
}
hFreeConn(&pdConn);
}
struct samples *getSamples(struct sqlConnection *biConn, struct datasets *da)
/* Load the SA_TABLE rows belonging to dataset da, ordered by id. */
{
char sql[256];
safef(sql, sizeof(sql),
      "select * from %s where dataset_id = %d order by id;",
      SA_TABLE, da->id);
struct samples *rows = samplesLoadByQuery(biConn, sql);
return rows;
}
struct samples *setupSamples(struct sqlConnection *biConn, struct datasets *da,
struct maGrouping *allA)
{
if (!sqlTableExists(biConn, SA_TABLE))
{
fprintf(stderr, "Table samples doesn't exist, creating...\n");
createSamplesTable(biConn, SA_TABLE);
}
createSamples(biConn, da, allA);
struct samples *saList = getSamples(biConn, da);
//if (slCount(saList) != allA->size)
// errAbort("Sample count from microarrayGroups and database don't match!");
return saList;
}
int getFeatureId(struct sqlConnection *biConn, char *name)
/* Pick the id to use for feature `name` in FE_TABLE: 0 for an empty table,
 * the stored id when the name is already present, otherwise the current
 * row count of the table. */
{
if (sqlTableSize(biConn, FE_TABLE) == 0)
    return 0;

char sql[256];
safef(sql, sizeof(sql),
      "select id from %s where name = \"%s\";",
      FE_TABLE, name);
return sqlExists(biConn, sql)
    ? sqlQuickNum(biConn, sql)          /* known feature: reuse its id */
    : sqlTableSize(biConn, FE_TABLE);   /* new feature: next id = row count */
}
struct features *getFeature(struct sqlConnection *biConn, char *name)
/* Load the FE_TABLE rows whose name equals `name`. */
{
char sql[256];
safef(sql, sizeof(sql),
      "select * from %s where name = \"%s\";",
      FE_TABLE, name);
struct features *rows = featuresLoadByQuery(biConn, sql);
return rows;
}
boolean featureExists(struct sqlConnection *biConn, struct features *fs)
/* Report whether FE_TABLE already holds a row named fs->name. */
{
char sql[256];
safef(sql, sizeof(sql),
      "select * from %s where name = \"%s\";",
      FE_TABLE, fs->name);
boolean found = sqlExists(biConn, sql);
return found;
}
boolean clinicalDataExists(struct sqlConnection *biConn, struct clinicalData *cd)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where sample_id = %d "
"and feature_id = %d; ",
CD_TABLE, cd->sample_id, cd->feature_id);
if (!sqlExists(biConn, query)) /* entry doesn't exist, report */
return FALSE;
/* Make sure entry has same values, if not there is a problem
* (sample_id, feature_id) should be unique */
struct clinicalData *cd2 = clinicalDataLoadByQuery(biConn, query);
if (slCount(cd2) != 1)
errAbort("clinicalData entries not unique, sample_id = %d, feature_id = %d",
cd->sample_id, cd->feature_id);
if (cd->val != cd2->val)
errAbort("clinicalData values don't match, sample_id = %d, feature_id = %d, "
"%f != %f",
cd->sample_id, cd->feature_id, cd->val, cd2->val);
if (cd->code && cd2->code)
if (!sameString(cd->code, cd2->code))
errAbort("clinicalData codes don't match, sample_id = %d, feature_id = %d",
cd->sample_id, cd->feature_id);
return TRUE;
}
void setupClinicalInfo(struct sqlConnection *biConn, struct datasets *da, struct samples *saList)
/* Fill FE_TABLE and CD_TABLE for dataset da.
 * For every clinical column described by the dataset's raFile, a features
 * row is saved if missing; then, for every sample in saList, the column's
 * value (and coded value, when the column is coded) is saved as a
 * clinicalData row unless an identical one already exists.
 * Samples with no patient id or no value for a column are skipped.
 * Does nothing when saList is NULL.  Aborts when a required datasets.ra
 * setting (raFile, patDb, patTable, patField, sampleField) is missing. */
{
if (!saList)
    return;
if (!sqlTableExists(biConn, FE_TABLE))
    {
    fprintf(stderr, "Table features doesn't exist, creating...\n");
    createFeaturesTable(biConn, FE_TABLE);
    }
if (!sqlTableExists(biConn, CD_TABLE))
    {
    fprintf(stderr, "Table clinicalData doesn't exist, creating...\n");
    createClinicalDataTable(biConn, CD_TABLE);
    }
struct hash *settings = getSettings(da->data_table);
struct hashEl *el = hashLookup(settings, "raFile");
if (!el)
    errAbort("No raFile");
char *raFile = cloneString(el->val);
el = hashLookup(settings, "patDb");
if (!el)
    errAbort("No patDb");
char *patDb = cloneString(el->val);
el = hashLookup(settings, "patTable");
if (!el)
    errAbort("No patTable");
char *patTable = cloneString(el->val);
el = hashLookup(settings, "patField");
if (!el)
    errAbort("No patField");
char *patField = cloneString(el->val);
el = hashLookup(settings, "sampleField");
if (!el)
    errAbort("No sampleField");
char *sampleField = cloneString(el->val);
if (!raFile || !patDb || !patTable || !patField || !sampleField)
    errAbort("Incomplete ra entry for %s.", da->data_table);
struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb); //connection to patient data
if (DEBUG)
    fprintf(stderr, "Getting columns of clinical data...\n");
struct column *col, *colList = getColumns(pdConn, raFile, patDb);
/* Set up features */
struct features *fs;
for (col = colList; col; col = col->next)
    {
    char *name = col->name;
    char *shortLabel = col->shortLabel;
    char *longLabel = col->longLabel;
    int featureId = getFeatureId(biConn, name);
    AllocVar(fs);
    fs->id = featureId;
    fs->name = cloneString(name);
    fs->shortLabel = cloneString(shortLabel);
    fs->longLabel = cloneString(longLabel);
    if (!featureExists(biConn, fs))
        featuresSaveToDbEscaped(biConn, fs, FE_TABLE, 100);
    featuresFree(&fs);
    /* Re-read the feature so the id used below is the stored one. */
    fs = getFeature(biConn, name);
    if (!fs)
        errAbort("Could not find feature %s.", name);
    if (slCount(fs) != 1)
        errAbort("Could not find unique feature by name = %s.", name);
    /* Loop through all samples, putting data in database */
    struct samples *sa;
    struct clinicalData *cd;
    for (sa = saList; sa; sa = sa->next)
        {
        char *patId = getId(pdConn, patTable, patField, sa->name, sampleField);
        /* No patient row for this sample: skip it.  The original handed the
         * NULL straight to slNameNew(). */
        if (!patId)
            continue;
        struct slName *id = slNameNew(patId);
        /* slNameNew made its own copy; release the query result, which the
         * original leaked. */
        freeMem(patId);
        char *cellVal = col->cellVal(col, id, pdConn);
        if (cellVal)
            {
            AllocVar(cd);
            cd->sample_id = sa->id;
            cd->feature_id = fs->id;
            cd->val = atof(cellVal);
            cd->code = NULL;
            if (col->cellCoded(col, pdConn))
                cd->code = cloneString(col->cellCodedVal(col, id, pdConn));
            if (!clinicalDataExists(biConn, cd))
                clinicalDataSaveToDb(biConn, cd, CD_TABLE, 100);
            clinicalDataFree(&cd);
            }
        /* Freed on every path; the original skipped this when there was no
         * cell value. */
        slNameFree(&id);
        }
    featuresFree(&fs);
    }
hFreeConn(&pdConn);
}
struct analysisFeatures *getAnalysisFeatures(struct sqlConnection *biConn,
                                             char *names, char *type)
/* Load the AF_TABLE rows of the given type whose feature_name is one of the
 * comma-separated names.  Returns NULL when names is NULL or holds no
 * names. */
{
if (!names)
    return NULL;
struct slName *sl, *slList = slNameListFromComma(names);
/* With no names the query would end in "in ()", which is not valid SQL. */
if (!slList)
    return NULL;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy,
               "select * from %s where type = \"%s\" "
               "and feature_name in (", AF_TABLE, type);
for (sl = slList; sl; sl = sl->next)
    {
    dyStringPrintf(dy, "\"%s\"", sl->name);
    if (sl->next)
        dyStringPrintf(dy, ",");
    }
dyStringPrintf(dy, ");");
char *query = dyStringCannibalize(&dy);
struct analysisFeatures *afList = analysisFeaturesLoadByQuery(biConn, query);
/* Release the query text and the parsed name list; the original leaked
 * both on every call. */
freeMem(query);
slNameFreeList(&slList);
return afList;
}
void addDataToHash(struct hash *dataHash, char *gene,
                   unsigned int expCount, float *expScores, struct maGrouping *allA)
/* Record one probe's scores under `gene`.
 * dataHash maps gene name -> hash of sample name -> slDouble list; each of
 * the expCount scores is appended to the list of the sample named by
 * allA->names[i] (truncated to 16 characters with -tcga).
 * Aborts when expCount exceeds the number of names in allA. */
{
/* Every score index is also used to index allA->names. */
if ((long long) expCount > (long long) allA->size)
    errAbort("Probe for gene %s has %u scores but only %d sample names are known",
             gene, expCount, (int) allA->size);

struct hash *hash;
struct hashEl *el = hashLookup(dataHash, gene);
if (!el)
    {
    hash = hashNew(0);
    hashAdd(dataHash, gene, hash);
    }
else
    hash = el->val;
unsigned int i;
for (i = 0; i < expCount; i++)
    {
    float val = expScores[i];
    char *name;
    if (isTCGA)
        name = cloneStringZ(allA->names[i], 16);
    else
        name = cloneString(allA->names[i]);
    struct slDouble *sd = slDoubleNew(val);
    el = hashLookup(hash, name);
    if (!el)
        hashAdd(hash, name, sd);
    else
        {
        struct slDouble *sdList = el->val;
        slAddTail(&sdList, sd);
        }
    /* Release the cloned string itself.  The superseded revision passed
     * &name, i.e. the address of the local pointer rather than the
     * allocation. */
    freeMem(name);
    }
}
boolean reduceDataHash(struct hash *dataHash, double *retMed, double *retStd)
{
struct slDouble *sd, *allSd, *allSdList = NULL;
struct hashEl *outEl;
struct hashCookie cookie = hashFirst(dataHash);
while ((outEl = hashNext(&cookie)) != NULL)
{
struct hash *hash = outEl->val;
struct hashEl *inEl, *elList = hashElListHash(hash);
for (inEl = elList; inEl != NULL; inEl = inEl->next)
{
char *sample = inEl->name;
struct slDouble *sdList = inEl->val;
double med = slDoubleMedian(sdList);
sd = slDoubleNew(med);
allSd = slDoubleNew(med);
slAddHead(&allSdList, allSd);
hashRemove(hash, sample);
slFreeList(&sdList);
hashAdd(hash, sample, sd);
}
hashElFreeList(&elList);
}
double count = (double) slCount(allSdList);
double allMedian = slDoubleMedian(allSdList);
slSort(allSdList, slDoubleCmp);
int low = round(count * (0.0015));
int high = round(count * (1.0 - 0.0015));
sd = slElementFromIx(allSdList, low);
double lowVal = sd->val;
sd = slElementFromIx(allSdList, high);
double highVal = sd->val;
double mad = max(fabs(lowVal - allMedian), fabs(highVal - allMedian))/3.0;
double std = mad * 1.43;
*retStd = std;
*retMed = allMedian;
slFreeList(&allSdList);
return TRUE;
}
struct analysisVals *getAnalysisVals(struct sqlConnection *biConn, struct hash *dataHash,
double med, double std)
{
struct hash *sampleHash = createIdHash(biConn, SA_TABLE, "name");
double z, p, val;
double maxLogP = 88.0;
struct hashEl *inEl, *outEl;
struct analysisVals *av, *avList = NULL;
struct hashCookie cookie = hashFirst(dataHash);
while ((outEl = hashNext(&cookie)) != NULL)
{
char *gene = outEl->name;
struct hash *hash = outEl->val;
struct analysisFeatures *af = getAnalysisFeatures(biConn, gene, "gene");
if (!af)
continue;
struct hashEl *elList = hashElListHash(hash);
for (inEl = elList; inEl != NULL; inEl = inEl->next)
{
char *sample = inEl->name;
struct slDouble *sd = inEl->val;
int sample_id = hashIntValDefault(sampleHash, sample, -1);
if (sample_id == -1)
errAbort("No sample by name of %s\n", sample);
AllocVar(av);
av->sample_id = sample_id;
av->feature_id = af->id;
av->val = sd->val;
z = (av->val - med)/std;
p = ndtr(-1.0*fabs(z));
if (p > 0)
val = min(-log(p)/log(10.0), maxLogP);
else
val = maxLogP;
if (z < 0.0)
val = -1.0*val; // signed log(p-value)
av->conf = val;
slAddHead(&avList, av);
}
analysisFeaturesFree(&af);
hashElFreeList(&elList);
}
return avList;
}
void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn,
                    struct datasets *da, struct maGrouping *allA)
/* Compute and store the gene-level analysis values for dataset da.
 * If the dataset's table already exists in biConn it is dropped first when
 * dropTable is set; if it still exists afterwards nothing more is done.
 * Otherwise the bed 15 rows are read through hgConn, mapped from probe to
 * gene via the dataset's aliasTable setting, reduced to per-sample medians
 * and stored with storeAnalysisValsInDb(). */
{
char *dataTable = da->data_table;
if (!dataTable)
    errAbort("datasets entry not complete, data_table not set.");

if (sqlTableExists(biConn, dataTable) && dropTable)
    {
    fprintf(stderr, "analysisVals table %s already exists in db, dropping...\n", dataTable);
    sqlDropTable(biConn, dataTable);
    }
/* Table present (and not dropped): values were loaded earlier. */
if (sqlTableExists(biConn, dataTable))
    return;
fprintf(stderr, "Creating analysisVals table %s...\n", dataTable);

struct hash *settings = getSettings(dataTable);
struct hashEl *aliasEl = hashLookup(settings, "aliasTable");
if (aliasEl == NULL)
    errAbort("No aliasTable.\n");
char *aliasTable = cloneString(aliasEl->val);
struct hash *aliases = getAliases(hgConn, aliasTable);

/* Get bed15 data from hg18 database */
char sql[256];
safef(sql, sizeof(sql), "select * from %s;", dataTable);
struct sqlResult *res = sqlGetResult(hgConn, sql);
struct hash *dataHash = hashNew(0);
char **fields;
for (fields = sqlNextRow(res); fields != NULL; fields = sqlNextRow(res))
    {
    /* The first column is skipped before loading the 15 bed fields. */
    struct bed *probeBed = bedLoadN(fields+1, 15);
    struct hashEl *probeEl = hashLookup(aliases, probeBed->name);
    if (probeEl != NULL)
        {
        struct geneAlias *alias = probeEl->val;
        struct slName *geneName;
        for (geneName = alias->genes; geneName != NULL; geneName = geneName->next)
            addDataToHash(dataHash, geneName->name, probeBed->expCount,
                          probeBed->expScores, allA);
        }
    bedFree(&probeBed);
    }
sqlFreeResult(&res);
if (hashNumEntries(dataHash) == 0)
    errAbort("no entries in hash\n");

fprintf(stderr, "\treducing hash...\n");
double med, std;
if (!reduceDataHash(dataHash, &med, &std))
    errAbort("problem reducing hash\n");

fprintf(stderr, "\tconverting hash to analysisVals...\n");
struct analysisVals *vals = getAnalysisVals(biConn, dataHash, med, std);

fprintf(stderr, "\tstoring analysisVals...\n");
storeAnalysisValsInDb(biConn, dataTable, vals);
analysisValsFreeList(&vals);
hashFree(&dataHash);
}
int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
/* Return the id of the row named `name` in tableName, or -1 when there is
 * no such row. */
{
char sql[128];
safef(sql, sizeof(sql),
      "select id from %s where name = \"%s\"",
      tableName, name);
return sqlExists(biConn, sql) ? sqlQuickNum(biConn, sql) : -1;
}
char *getPathwayDescription(struct sqlConnection *pdConn, char *name)
/* Return the description stored for `name` in the descriptions table, or
 * NULL when no table called `name` exists on pdConn.
 * NOTE(review): the existence test is on a table named after the pathway,
 * while the query reads the "descriptions" table -- confirm which table the
 * guard is meant to check. */
{
if (sqlTableExists(pdConn, name))
    {
    char sql[128];
    safef(sql, sizeof(sql),
          "select description from descriptions where name = \"%s\";",
          name);
    return sqlQuickString(pdConn, sql);
    }
return NULL;
}
void setupGenesets(struct sqlConnection *biConn)
/* Create and fill the geneset tables (GE_TABLE, GG_TABLE, GI_TABLE) from
 * the genesets table of the "pathway" database.
 * Each table is dropped first when it exists and dropTable is set, and is
 * (re)created and filled only when it does not exist afterwards.  A geneset
 * is stored only if it has a "geneset" analysis feature and at least one of
 * its member genes has a "gene" analysis feature. */
{
boolean inputGenesets = FALSE;
boolean inputGenesetInfo = FALSE;
boolean inputGenesetGenes = FALSE;
struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
if (!pdConn)
    errAbort("Could not connect to pathways database.\n");
if (sqlTableExists(biConn, GE_TABLE) && dropTable)
    {
    fprintf(stderr, "%s table already exists, dropping and recreating.\n", GE_TABLE);
    sqlDropTable(biConn, GE_TABLE);
    }
if (!sqlTableExists(biConn, GE_TABLE))
    {
    fprintf(stderr, "Creating %s table.\n", GE_TABLE);
    createGenesetsTable(biConn, GE_TABLE);
    inputGenesets = TRUE;
    }
if (sqlTableExists(biConn, GG_TABLE) && dropTable)
    {
    fprintf(stderr, "%s table already exists, dropping and recreating.\n", GG_TABLE);
    sqlDropTable(biConn, GG_TABLE);
    }
if (!sqlTableExists(biConn, GG_TABLE))
    {
    fprintf(stderr, "Creating %s table.\n", GG_TABLE);
    createGenesetGenesTable(biConn, GG_TABLE);
    inputGenesetGenes = TRUE;
    }
if (sqlTableExists(biConn, GI_TABLE) && dropTable)
    {
    fprintf(stderr, "%s table already exists, dropping and recreating.\n", GI_TABLE);
    sqlDropTable(biConn, GI_TABLE);
    }
if (!sqlTableExists(biConn, GI_TABLE))
    {
    fprintf(stderr, "Creeting %s table.\n", GI_TABLE);
    createGenesetInfoTable(biConn, GI_TABLE);
    inputGenesetInfo = TRUE;
    }
if (!inputGenesets && !inputGenesetInfo && !inputGenesetGenes)
    {
    fprintf(stderr, "Nothing to do for geneset tables.\n");
    /* Release the pathway connection on this path too; the original
     * returned with it still allocated. */
    hFreeConn(&pdConn);
    return;
    }
/* Setting up pathways table */
char query[128];
safef(query, sizeof(query), "select * from genesets;");
struct sqlResult *sr = sqlGetResult(pdConn, query);
char **row = NULL;
/* Save all data in lists to avoid "out of sync" error when attempting
 * query inside of a running query on same db */
struct slName *na, *names = NULL;
struct slName *ge, *genes = NULL;
while ((row = sqlNextRow(sr)) != NULL)
    {
    slNameAddHead(&names, row[0]);
    slNameAddHead(&genes, row[1]);
    }
slReverse(&names);
slReverse(&genes);
sqlFreeResult(&sr);
for (na = names, ge = genes; na && ge; na = na->next, ge = ge->next)
    {
    char *name = na->name;
    char *members = ge->name;
    struct analysisFeatures *gsAf = getAnalysisFeatures(biConn, name, "geneset");
    struct analysisFeatures *af, *afList = getAnalysisFeatures(biConn, members, "gene");
    /* Only store genesets for which both lookups succeeded.  Both results
     * are released below in every case; the original's `continue`
     * statements skipped the frees and leaked whichever list was
     * non-NULL. */
    if (gsAf && afList)
        {
        if (inputGenesets)
            {
            struct genesets *gs;
            AllocVar(gs);
            gs->id = gsAf->id;
            gs->name = cloneString(name);
            gs->source = cloneString("N/A");
            genesetsSaveToDb(biConn, gs, GE_TABLE, 100);
            genesetsFree(&gs);
            }
        if (inputGenesetGenes)
            {
            /* One record is reused for every member gene. */
            struct genesetGenes *gg;
            AllocVar(gg);
            gg->id = gsAf->id;
            for (af = afList; af; af = af->next)
                {
                gg->gene_id = af->id;
                genesetGenesSaveToDb(biConn, gg, GG_TABLE, 100);
                }
            genesetGenesFree(&gg);
            }
        if (inputGenesetInfo)
            {
            char *desc = getPathwayDescription(pdConn, name);
            if (desc)
                {
                struct genesetInfo *gi;
                AllocVar(gi);
                gi->id = gsAf->id;
                gi->description = desc;
                genesetInfoSaveToDbEscaped(biConn, gi, GI_TABLE, 200);
                genesetInfoFree(&gi);
                }
            }
        }
    analysisFeaturesFree(&gsAf);
    analysisFeaturesFreeList(&afList);
    }
hFreeConn(&pdConn);
}
boolean analysisFeatureExists(struct sqlConnection *biConn, struct analysisFeatures *af)
/* Report whether AF_TABLE already has a row with af's id, feature_name and
 * type. */
{
char sql[256];
safef(sql, sizeof(sql),
      "select * from %s where id = %d "
      "and feature_name = \"%s\" "
      "and type = \"%s\"",
      AF_TABLE, af->id, af->feature_name, af->type);
boolean found = sqlExists(biConn, sql);
return found;
}
int findIdForAnalysisFeature(struct sqlConnection *biConn, char *tableName,
                             struct analysisFeatures *af)
/* Pick the id to use for af in tableName:
 *   - 0 when the table is empty,
 *   - the stored id when a row with the same feature_name and type exists,
 *   - otherwise one more than the current maximum id. */
{
if (sqlTableSize(biConn, tableName) == 0)
    return 0;

char sql[256];
safef(sql, sizeof(sql),
      "select DISTINCT id from %s where feature_name = \"%s\" "
      "and type = \"%s\";",
      tableName, af->feature_name, af->type);
if (!sqlExists(biConn, sql))
    {
    /* Feature not stored yet: allocate the next id after the largest one. */
    safef(sql, sizeof(sql),
          "select max(id) from %s;",
          tableName);
    return sqlQuickNum(biConn, sql) + 1;
    }
/* Feature already stored: reuse its id. */
return sqlQuickNum(biConn, sql);
}
void setupAnalysisFeatures(struct sqlConnection *biConn)
/* Fill AF_TABLE, creating it if needed, with one "gene" feature per
 * distinct geneSymbol in KX_TABLE and one "geneset" feature per name in the
 * genesets table of the "pathway" database.  Does nothing when AF_TABLE
 * already holds rows. */
{
if (!sqlTableExists(biConn, AF_TABLE))
    createAnalysisFeaturesTable(biConn, AF_TABLE);
if (sqlTableSize(biConn, AF_TABLE) != 0)
    {
    fprintf(stderr, "%s table is not empty, doing nothing\n", AF_TABLE);
    return;
    }
/* set up gene features */
char query[256];
safef(query, sizeof(query),
      "select DISTINCT geneSymbol from %s;", KX_TABLE);
struct slName *sl, *slList = sqlQuickList(biConn, query);
struct analysisFeatures *af, *afList = NULL;
for (sl = slList; sl; sl = sl->next)
    {
    AllocVar(af);
    af->id = 0;
    af->feature_name = cloneString(sl->name);
    af->type = cloneString("gene");
    slAddHead(&afList, af);
    }
slNameFreeList(&slList);
/* set up geneset features */
struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
safef(query, sizeof(query),
      "select name from genesets;");
slList = sqlQuickList(pdConn, query);
for (sl = slList; sl; sl = sl->next)
    {
    AllocVar(af);
    af->id = 0;
    af->feature_name = cloneString(sl->name);
    af->type = cloneString("geneset");
    slAddHead(&afList, af);
    }
hFreeConn(&pdConn);
slNameFreeList(&slList);
/* Features were pushed on the head; restore insertion order. */
slReverse(&afList);
/* set up pathway features (TODO) */
/* save features to db */
for (af = afList; af; af = af->next)
    {
    int feature_id = findIdForAnalysisFeature(biConn, AF_TABLE, af);
    af->id = feature_id;
    if (!analysisFeatureExists(biConn, af))
        analysisFeaturesSaveToDb(biConn, af, AF_TABLE, 10);
    }
/* Free the whole list; the original called analysisFeaturesFree(), which
 * releases only the first element. */
analysisFeaturesFreeList(&afList);
}
void populateDb(char *db, char *tableName, char *tissue)
/* Load dataset tableName into bioInt database db, labelled with tissue
 * (lower-cased first).  Runs, in order: analysis features, geneset tables,
 * the datasets entry, samples, clinical data, and the gene-level probe
 * data, timing each step with uglyTime.  Aborts when no microarray grouping
 * is found for tableName. */
{
tissue = strLower(tissue);
struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
struct sqlConnection *hgConn = hAllocConnProfile("localDb", hgDb);
/* Create analysis features (if necessary) */
uglyTime(NULL);
fprintf(stderr, "Setting up analysis features...\n");
setupAnalysisFeatures(biConn);
uglyTime("Time");
/* Create geneLookup table (if necessary) */
//uglyTime(NULL);
//fprintf(stderr, "Setting up geneLookup table...\n");
//createGeneLookup(biConn);
//uglyTime("Time");
/* Set up pathways */
uglyTime(NULL);
fprintf(stderr, "Setting up pathways tables...\n");
setupGenesets(biConn);
uglyTime("Time");
/* Set up datasets entry */
struct maGrouping *allA = getMaGrouping(hgConn, tableName);
if (!allA)
    errAbort("Could not find maGrouping for %s!", tableName);
uglyTime(NULL);
int numSamples = allA->size;
fprintf(stderr, "Adding datasets entry...\n");
struct datasets *da = setupDataset(biConn, tableName, tissue, numSamples);
uglyTime("Time");
/* Set up samples entries */
uglyTime(NULL);
fprintf(stderr, "Adding to samples table...\n");
struct samples *saList = setupSamples(biConn, da, allA);
uglyTime("Time");
/* Set up features and clinicalData */
uglyTime(NULL);
fprintf(stderr, "Setting up clinical data tables...\n");
setupClinicalInfo(biConn, da, saList);
uglyTime("Time");
/* Set up probeInfo table (if necessary) and probeVals table */
uglyTime(NULL);
fprintf(stderr, "Setting up probe data tables (be patient!)...\n");
setupProbeData(hgConn, biConn, da, allA);
uglyTime("Time");
/* Newline added: "Done!" used to run straight into the next message. */
fprintf(stderr, "Done!\n");
fprintf(stderr, "Please run 'setCohort' to find datasets that have overlapping samples.\n");
hFreeConn(&biConn);
hFreeConn(&hgConn);
}
int main(int argc, char *argv[])
/* Process command line: parse the options, require exactly three positional
 * arguments (db, table, tissue) and run populateDb on them. */
{
optionInit(&argc, argv, options);
if (argc != 4)
    usage();

/* dropTable follows -dropAll; isTCGA is only ever switched on by -tcga. */
dropTable = optionExists("dropAll") ? TRUE : FALSE;
if (optionExists("tcga"))
    isTCGA = TRUE;

char *db = argv[1];
char *table = argv[2];
char *tissue = argv[3];
populateDb(db, table, tissue);
return 0;
}