src/hg/instinct/bioInt2/populateDb.c 1.7
1.7 2009/04/27 06:15:49 jsanborn
updated lots of stuff, will break older implementation of database
Index: src/hg/instinct/bioInt2/populateDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/populateDb.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 4 -r1.6 -r1.7
--- src/hg/instinct/bioInt2/populateDb.c 4 Apr 2009 00:39:22 -0000 1.6
+++ src/hg/instinct/bioInt2/populateDb.c 27 Apr 2009 06:15:49 -0000 1.7
@@ -11,8 +11,10 @@
#include "microarray.h"
#include "ra.h"
#include "featuresLib.h"
#include "hgHeatmapLib.h"
+#include "cprob.h"
+#include "hgStatsLib.h"
#include "bioIntDriver.h"
#include "bioIntDb.h"
char *hgDb = "hg18";
@@ -109,10 +111,9 @@
char *probe;
struct slName *genes;
};
-
-struct geneAlias *getAliases(struct sqlConnection *hgConn, char *tableName)
+struct hash *getAliases(struct sqlConnection *hgConn, char *tableName)
{
if (!hgConn || !tableName)
return NULL;
@@ -146,20 +147,19 @@
slNameAddHead(&ga->genes, gene);
}
-hashFree(&gaHash);
sqlFreeResult(&sr);
-return gaList;
+return gaHash;
}
struct dataTypes *findDataType(struct sqlConnection *biConn, char *type, char *platform)
{
if (!sameString(type, "bed 15"))
errAbort("populateDb only runs on bed 15 files.");
-char *data_format = "probeVals";
+char *data_format = "analysisVals";
char query[256];
safef(query, sizeof(query),
"select * from %s where format = \"%s\" and name = \"%s\"",
@@ -173,29 +173,16 @@
int nextId = sqlTableSize(biConn, DT_TABLE);
struct dataTypes *dt;
AllocVar(dt);
dt->id = nextId;
-dt->format = cloneString("probeVals");
+dt->format = cloneString("analysisVals");
dt->name = cloneString(platform);
/* Save to db */
dataTypesSaveToDb(biConn, dt, DT_TABLE, 100);
return dt;
}
-void createDataTypesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "format varchar(255) not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
struct dataTypes *setupDataType(struct sqlConnection *biConn,
char *type, char *platform)
{
if (!sqlTableExists(biConn, DT_TABLE))
@@ -233,20 +220,8 @@
tissuesSaveToDb(biConn, ti, TI_TABLE, 100);
return ti;
}
-void createTissuesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
struct tissues *setupTissue(struct sqlConnection *biConn, char *tissue)
{
if (!sqlTableExists(biConn, TI_TABLE))
{
@@ -286,14 +261,8 @@
if (!el)
errAbort("No name");
char *dataTable = cloneString(el->val);
-char probeTable[256];
-safef(probeTable, sizeof(probeTable), "%s_probeInfo", dataTable);
-
-char p2gTable[256];
-safef(p2gTable, sizeof(p2gTable), "%s_probeToGene", dataTable);
-
char *platform;
el = hashLookup(settings, "platform");
if (!el)
platform = cloneString("Expression");
@@ -318,10 +287,8 @@
da->type_id = dt->id;
da->num_samples = numSamples;
da->name = shortLabel;
da->data_table = dataTable;
-da->probe_table = cloneString(probeTable);
-da->probe_to_gene_table = cloneString(p2gTable);
dataTypesFree(&dt);
tissuesFree(&ti);
@@ -330,26 +297,8 @@
return da;
}
-void createDatasetsTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
-dyStringPrintf(dy, "type_id int unsigned not null,\n");
-dyStringPrintf(dy, "num_samples int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "data_table varchar(255) not null,\n");
-dyStringPrintf(dy, "probe_table varchar(255) not null,\n");
-dyStringPrintf(dy, "probe_to_gene_table varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
struct datasets *setupDataset(struct sqlConnection *biConn,
char *tableName, char *tissue, int numSamples)
{
if (!sqlTableExists(biConn, DA_TABLE))
@@ -405,12 +354,11 @@
"and name = \"%s\" "
"and patient_id = %d "
"and patient_name = \"%s\" "
"and dataset_id = %d "
- "and exp_id = %d "
"and tissue_id = %d ",
SA_TABLE, sa->id, sa->name, sa->patient_id, sa->patient_name, sa->dataset_id,
- sa->exp_id, sa->tissue_id);
+ sa->tissue_id);
return sqlExists(biConn, query);
}
@@ -458,9 +406,8 @@
sampleName = cloneString(allA->names[i]);
patientName = findPatientName(pdConn, patTable, patField, sampleField, sampleName);
}
- int expId = allA->expIds[i];
int sampleId = findId(biConn, "id", "name", sampleName);
int patientId = findId(biConn, "patient_id", "patient_name", patientName);
AllocVar(sa);
@@ -468,9 +415,8 @@
sa->name = sampleName;
sa->patient_id = patientId;
sa->patient_name = patientName;
sa->dataset_id = datasetId;
- sa->exp_id = expId;
sa->tissue_id = tissueId;
if (!sampleExists(biConn, sa))
samplesSaveToDb(biConn, sa, SA_TABLE, 100);
@@ -480,33 +426,13 @@
hFreeConn(&pdConn);
}
-void createSamplesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "patient_id int unsigned not null,\n");
-dyStringPrintf(dy, "patient_name varchar(255) not null,\n");
-dyStringPrintf(dy, "dataset_id int unsigned not null,\n");
-dyStringPrintf(dy, "exp_id int unsigned not null,\n");
-dyStringPrintf(dy, "tissue_id int unsigned not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(dataset_id),\n");
-dyStringPrintf(dy, "KEY(id, dataset_id),\n");
-dyStringPrintf(dy, "KEY(dataset_id,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
struct samples *getSamples(struct sqlConnection *biConn, struct datasets *da)
{
char query[256];
safef(query, sizeof(query),
- "select * from %s where dataset_id = %d order by exp_id;",
+ "select * from %s where dataset_id = %d order by id;",
SA_TABLE, da->id);
return samplesLoadByQuery(biConn, query);
}
@@ -522,10 +447,11 @@
}
createSamples(biConn, da, allA);
struct samples *saList = getSamples(biConn, da);
-if (slCount(saList) != allA->size)
- errAbort("Sample count from microarrayGroups and database don't match!");
+
+//if (slCount(saList) != allA->size)
+// errAbort("Sample count from microarrayGroups and database don't match!");
return saList;
}
@@ -597,41 +523,8 @@
return TRUE;
}
-void createFeaturesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "shortLabel varchar(255) not null,\n");
-dyStringPrintf(dy, "longLabel varchar(255) not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(name),\n");
-dyStringPrintf(dy, "KEY(id,name)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
-void createClinicalDataTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "sample_id int unsigned not null,\n");
-dyStringPrintf(dy, "feature_id int unsigned not null,\n");
-dyStringPrintf(dy, "val double not null,\n");
-dyStringPrintf(dy, "code varchar(255),\n");
-dyStringPrintf(dy, "KEY(sample_id),\n");
-dyStringPrintf(dy, "KEY(feature_id),\n");
-dyStringPrintf(dy, "KEY(sample_id,feature_id),\n");
-dyStringPrintf(dy, "KEY(feature_id,sample_id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
void setupClinicalInfo(struct sqlConnection *biConn, struct datasets *da, struct samples *saList)
{
if (!saList)
return;
@@ -741,335 +634,262 @@
hFreeConn(&pdConn);
}
-void createProbeValsTable(struct sqlConnection *biConn, char *tableName)
+
+struct analysisFeatures *getAnalysisFeatures(struct sqlConnection *biConn,
+                                             char *names, char *type)
+/* Load the AF_TABLE rows whose type column equals 'type' and whose
+ * feature_name is one of the comma-separated entries in 'names'.
+ * Returns NULL without querying when 'names' is NULL or yields no entries;
+ * otherwise returns the result of analysisFeaturesLoadByQuery(). */
{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "probe_id int unsigned not null,\n");
-dyStringPrintf(dy, "sample_count int unsigned not null,\n");
-dyStringPrintf(dy, "sample_data longblob not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(probe_id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
-void createProbeInfoTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "chrom varchar(255) not null,\n");
-dyStringPrintf(dy, "start int unsigned not null,\n");
-dyStringPrintf(dy, "stop int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id),\n");
-dyStringPrintf(dy, "KEY(name),\n");
-dyStringPrintf(dy, "KEY(id,name)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
-void addProbeValsToDb(struct sqlConnection *biConn, char *tableName,
- int probe_id, int sample_count, char *dataString, int updateSize)
-{ /* This bypasses the need to convert to float array to enter in database, may not
- * be faster, but by converting first to float array any "blank" datapoints are
- * converted to 0.0, instead of leaving blank */
-struct dyString *update = newDyString(updateSize);
-dyStringPrintf(update, "insert into %s values ( %u,%u,'%s')",
- tableName, probe_id, sample_count, dataString );
-sqlUpdate(biConn, update->string);
-freeDyString(&update);
-}
+if (!names)
+    return NULL;
+struct slName *sl, *slList = slNameListFromComma(names);
+if (!slList)    /* no names at all would leave the query ending in "in ()" */
+    return NULL;
+struct dyString *dy = newDyString(100);
+dyStringPrintf(dy,
+               "select * from %s where type = \"%s\" "
+               "and feature_name in (", AF_TABLE, type);
+for (sl = slList; sl; sl = sl->next)
+    {
+    dyStringPrintf(dy, "\"%s\"", sl->name);
+    if (sl->next)
+        dyStringPrintf(dy, ",");
+    }
+dyStringPrintf(dy, ");");
-void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn,
- struct datasets *da)
-{
-char *dataTable = da->data_table;
-char *probeTable = da->probe_table;
+struct analysisFeatures *afList = analysisFeaturesLoadByQuery(biConn, dy->string);
+
+/* The name list and the query text are only needed to run the query. */
+slNameFreeList(&slList);
+dyStringFree(&dy);
+return afList;
+}
-if (!dataTable || !probeTable)
- errAbort("datasets entry not complete, data_table or probe_table not set.");
-boolean inputProbeVals = FALSE;
-boolean inputProbeInfo = FALSE;
-if (sqlTableExists(biConn, dataTable) && dropTable)
+void addDataToHash(struct hash *dataHash, char *gene,
+                   unsigned int expCount, float *expScores, struct maGrouping *allA)
+/* Record one row of scores under 'gene'.  dataHash maps a gene name to a
+ * per-sample hash, created here on first use; each per-sample entry is an
+ * slDouble list that gets one value appended per call.  The sample name for
+ * score i is allA->names[i], limited to its first 16 characters when isTCGA
+ * is set. */
+{
+struct hash *hash;
+struct hashEl *el = hashLookup(dataHash, gene);
+if (!el)
{
- fprintf(stderr, "probeVals table %s already exists in db, dropping...\n", dataTable);
- sqlDropTable(biConn, dataTable);
+    hash = hashNew(0);
+    hashAdd(dataHash, gene, hash);
}
+else
+    hash = el->val;
-if (!sqlTableExists(biConn, dataTable))
+unsigned int i;    /* same signedness as expCount */
+for (i = 0; i < expCount; i++)
{
- fprintf(stderr, "Creating probeVals table %s...\n", dataTable);
- createProbeValsTable(biConn, dataTable);
- inputProbeVals = TRUE; // empty table, input
- }
+    float val = expScores[i];
+    char *name;
+    if (isTCGA)
+        name = cloneStringZ(allA->names[i], 16);
+    else
+        name = cloneString(allA->names[i]);
-if (sqlTableExists(biConn, probeTable) && dropTable)
+    struct slDouble *sd = slDoubleNew(val);
+
+    el = hashLookup(hash, name);
+    if (!el)
+        hashAdd(hash, name, sd);
+    else
{
- fprintf(stderr, "probeInfo table %s already exists in db, dropping...\n", probeTable);
- sqlDropTable(biConn, probeTable);
+        struct slDouble *sdList = el->val;
+        slAddTail(&sdList, sd);
}
+    freeMem(name);    /* the hash keeps its own copy of the key */
+    }
+}
+
+boolean reduceDataHash(struct hash *dataHash, double *retMed, double *retStd)
+/* Collapse every per-sample slDouble list in dataHash (gene -> sample -> list)
+ * to a single slDouble holding that list's median, then summarise all the
+ * collapsed values: *retMed gets their median and *retStd gets 1.43/3 times
+ * the larger distance from that median to the values found at the 0.15% and
+ * 99.85% positions of the sorted values.
+ * Returns FALSE, leaving both outputs untouched, when dataHash holds no values. */
+{
+struct slDouble *sd, *allSd, *allSdList = NULL;
+struct hashEl *outEl;
+struct hashCookie cookie = hashFirst(dataHash);
+while ((outEl = hashNext(&cookie)) != NULL)
+    {
+    struct hash *hash = outEl->val;
-if (!sqlTableExists(biConn, probeTable))
+    struct hashEl *inEl, *elList = hashElListHash(hash);
+    for (inEl = elList; inEl != NULL; inEl = inEl->next)
{
- fprintf(stderr, "Creating probeInfo table %s...\n", probeTable);
- createProbeInfoTable(biConn, probeTable);
- inputProbeInfo = TRUE; // empty table, input
- }
+        char *sample = inEl->name;
+        struct slDouble *sdList = inEl->val;
+        double med = slDoubleMedian(sdList);
+        sd = slDoubleNew(med);
+        allSd = slDoubleNew(med);
-char query[256];
-safef(query, sizeof(query), "select * from %s;", dataTable);
+        slAddHead(&allSdList, allSd);
-/* Get bed15 data from hg18 database */
-int id = 0;
-struct sqlResult *sr = sqlGetResult(hgConn, query);
+        hashRemove(hash, sample);
+        slFreeList(&sdList);
-char **row = NULL;
-while ((row = sqlNextRow(sr)) != NULL)
- {
- char *chrom = row[1];
- unsigned chromStart = sqlUnsigned(row[2]);
- unsigned chromEnd = sqlUnsigned(row[3]);
- char *name = row[4];
- unsigned expCount = sqlUnsigned(row[13]);
- char *expScores = row[15];
+        hashAdd(hash, sample, sd);
+        }
+    hashElFreeList(&elList);
+    }
- /* Make probeInfo entry and load into db*/
- struct probeInfo *pi;
- AllocVar(pi);
- pi->id = id;
- pi->chrom = cloneString(chrom);
- pi->start = chromStart;
- pi->stop = chromEnd;
- pi->name = cloneString(name);
+int count = slCount(allSdList);
+if (count == 0)    /* nothing to summarise; let the caller report it */
+    return FALSE;
+double allMedian = slDoubleMedian(allSdList);
- if (inputProbeInfo)
- probeInfoSaveToDb(biConn, pi, probeTable, 100);
+slSort(&allSdList, slDoubleCmp);    /* slSort needs the address of the list pointer */
+int low = round(count * (0.0015));
+int high = round(count * (1.0 - 0.0015));
+if (high > count - 1)    /* rounding reaches count itself on short lists */
+    high = count - 1;
- /* Make probeVals entry and load into db, straight-up copying longblob*/
- if (inputProbeVals)
- addProbeValsToDb(biConn, dataTable, id, expCount, expScores, 500);
+sd = slElementFromIx(allSdList, low);
+double lowVal = sd->val;
- id++;
- probeInfoFree(&pi);
- }
-}
+sd = slElementFromIx(allSdList, high);
+double highVal = sd->val;
-void createGeneLookupTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "kgId varchar(255) not null,\n");
-dyStringPrintf(dy, "PRIMARY KEY(id),\n");
-dyStringPrintf(dy, "KEY(kgId),\n");
-dyStringPrintf(dy, "KEY(id,kgId),\n");
-dyStringPrintf(dy, "KEY(kgId,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
+slFreeList(&allSdList);    /* both picked values have been copied out */
+
+double mad = max(fabs(lowVal - allMedian), fabs(highVal - allMedian))/3.0;
+double std = mad * 1.43;
+
+*retStd = std;
+*retMed = allMedian;
+
+return TRUE;
}
-void createGeneLookup(struct sqlConnection *biConn)
+struct analysisVals *getAnalysisVals(struct sqlConnection *biConn, struct hash *dataHash,
+                                     double med, double std)
+/* Turn dataHash (gene -> sample -> one slDouble) into a list of analysisVals.
+ * Genes for which getAnalysisFeatures() finds no "gene" feature are skipped;
+ * a sample name missing from the SA_TABLE id hash aborts.  Each entry's val
+ * is the stored value; its conf is -log10 of ndtr(-|z|), z = (val - med)/std,
+ * capped at 88 and negated when z < 0.
+ * NOTE(review): std == 0 makes z infinite or NaN -- confirm callers cannot
+ * pass that. */
{
-if (!sqlTableExists(biConn, GL_TABLE))
- {
- fprintf(stderr, "geneLookup table doesn't exist in bioInt database, recreating it.\n");
- createGeneLookupTable(biConn, GL_TABLE);
- }
+/* NOTE(review): sampleHash is never released here; free it if createIdHash()
+ * hands ownership to its caller. */
+struct hash *sampleHash = createIdHash(biConn, SA_TABLE, "name");
-if (sqlTableSize(biConn, GL_TABLE) > 0)
+double z, p, val;
+double maxLogP = 88.0;
+struct hashEl *inEl, *outEl;
+struct analysisVals *av, *avList = NULL;
+struct hashCookie cookie = hashFirst(dataHash);
+while ((outEl = hashNext(&cookie)) != NULL)
{
- fprintf(stderr, "geneLookup table already has data in it, doing nothing.\n");
- return;
- }
-
-if (!sqlTableExists(biConn, "knownGene"))
- errAbort("Need knownGene table in bioInt database.");
+    char *gene = outEl->name;
+    struct hash *hash = outEl->val;
-char query[256];
-safef(query, sizeof(query), "select name from knownGene;");
+    struct analysisFeatures *af = getAnalysisFeatures(biConn, gene, "gene");
-struct slName *sl, *slList = sqlQuickList(biConn, query);
+    if (!af)
+        continue;
-int id = 0;
-struct geneLookup *gl;
-for (sl = slList; sl; sl = sl->next)
+    struct hashEl *elList = hashElListHash(hash);
+    for (inEl = elList; inEl != NULL; inEl = inEl->next)
{
- AllocVar(gl);
- gl->id = id;
- gl->kgId = cloneString(sl->name);
-
- id++;
- geneLookupSaveToDb(biConn, gl, GL_TABLE, 100);
- geneLookupFree(&gl);
- }
-
-slNameFreeList(&slList);
-}
+        char *sample = inEl->name;
+        struct slDouble *sd = inEl->val;
-void createProbeToGeneTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "probe_id int unsigned not null,\n");
-dyStringPrintf(dy, "gene_id int unsigned not null,\n");
-dyStringPrintf(dy, "KEY(probe_id),\n");
-dyStringPrintf(dy, "KEY(gene_id),\n");
-dyStringPrintf(dy, "KEY(probe_id,gene_id),\n");
-dyStringPrintf(dy, "KEY(gene_id,probe_id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
+        int sample_id = hashIntValDefault(sampleHash, sample, -1);
+        if (sample_id == -1)
+            errAbort("No sample by name of %s\n", sample);
+
+        AllocVar(av);
+        av->sample_id = sample_id;
+        av->feature_id = af->id;
+        av->val = sd->val;
+
+        z = (av->val - med)/std;
+        p = ndtr(-1.0*fabs(z));
+        if (p > 0)
+            val = min(-log(p)/log(10.0), maxLogP);
+        else
+            val = maxLogP;
-int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
-{
-char query[128];
-safef(query, sizeof(query),
- "select id from %s where name = \"%s\"",
- tableName, name);
+        if (z < 0.0)
+            val = -1.0*val; // signed log(p-value)
+        av->conf = val;
-if (!sqlExists(biConn, query))
- return -1;
+        slAddHead(&avList, av);
+        }
+    hashElFreeList(&elList);
+    analysisFeaturesFreeList(&af);    /* only af->id was needed and it is copied */
+    }
-return sqlQuickNum(biConn, query);
+return avList;
}
-struct slInt *getGeneIdsBySymbol(struct sqlConnection *biConn,
- struct slName *slList)
+void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn,
+                    struct datasets *da, struct maGrouping *allA)
+/* Build the analysisVals table named da->data_table in biConn.  An existing
+ * table is dropped first when dropTable is set; if the table still exists
+ * afterwards nothing is done.  Otherwise every row of the same-named table
+ * read through hgConn is loaded as bed 15, its name is looked up in the alias
+ * hash built from the "aliasTable" setting, and its scores are accumulated
+ * per gene and sample before being reduced, converted and stored. */
{
-if (!slList)
- return NULL;
+char *dataTable = da->data_table;
-struct slName *sl;
-struct dyString *dy = newDyString(100);
-dyStringPrintf(dy, "select id from %s "
- "join kgXref on %s.kgId = kgXref.kgId "
- "where kgXref.geneSymbol in (", GL_TABLE, GL_TABLE);
-for (sl = slList; sl; sl = sl->next)
+if (!dataTable)
+    errAbort("datasets entry not complete, data_table not set.");
+
+boolean inputProbeVals = FALSE;
+if (sqlTableExists(biConn, dataTable) && dropTable)
{
- dyStringPrintf(dy, "\"%s\"", sl->name);
- if (sl->next)
- dyStringPrintf(dy, ",");
+    fprintf(stderr, "analysisVals table %s already exists in db, dropping...\n", dataTable);
+    sqlDropTable(biConn, dataTable);
}
-dyStringPrintf(dy, ");");
-char *query = dyStringCannibalize(&dy);
-
-return sqlQuickNumList(biConn, query);
-}
-
-void setupProbeToGene(struct sqlConnection *hgConn,
- struct sqlConnection *biConn, struct datasets *da)
-{
-char *p2gTable = da->probe_to_gene_table;
-if (!p2gTable)
+if (!sqlTableExists(biConn, dataTable))
{
- fprintf(stderr, "probeToGene table not set, doing nothing.\n");
- return;
+    fprintf(stderr, "Creating analysisVals table %s...\n", dataTable);
+    inputProbeVals = TRUE; // empty table, input
}
-struct hash *settings = getSettings(da->data_table);
+if (!inputProbeVals)
+    return;
+
+struct hash *settings = getSettings(dataTable);
struct hashEl *el = hashLookup(settings, "aliasTable");
if (!el)
errAbort("No aliasTable.\n");
char *aliasTable = cloneString(el->val);
+struct hash *gaHash = getAliases(hgConn, aliasTable);
+if (!gaHash)    /* getAliases() returns NULL when it is given nothing to read */
+    errAbort("Could not load aliases from %s.\n", aliasTable);
-if (!sqlTableExists(hgConn, aliasTable))
- errAbort("Table %s not found in hg18 database.\n", aliasTable);
+char query[256];
+safef(query, sizeof(query), "select * from %s;", dataTable);
-if (!sqlTableExists(biConn, "kgXref"))
- errAbort("kgXref table not found in database. Cannot create probeToGene table.\n");
+/* Get bed15 data from hg18 database */
+struct sqlResult *sr = sqlGetResult(hgConn, query);
-if (sqlTableExists(biConn, p2gTable) && dropTable)
- {
- fprintf(stderr, "Table %s already exists, dropping and recreating.\n", p2gTable);
- sqlDropTable(biConn, p2gTable);
- }
+struct hash *dataHash = hashNew(0);
-boolean inputProbeToGene = FALSE;
-if (!sqlTableExists(biConn, p2gTable))
+char **row = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
{
- fprintf(stderr, "Creating probeToGene table...\n");
- createProbeToGeneTable(biConn, p2gTable);
- inputProbeToGene = TRUE;
- }
+    struct bed *nb = bedLoadN(row+1, 15);    /* row+1 skips the first column */
-if (!inputProbeToGene)
- return;
+    char *name = nb->name; // row[4];
+    unsigned expCount = nb->expCount; //sqlUnsigned(row[13]);
+    float *expScores = nb->expScores; // row[15];
-struct geneAlias *ga, *gaList = getAliases(hgConn, aliasTable);
-for (ga = gaList; ga; ga = ga->next)
- {
- int probeId = getProbeId(biConn, da->probe_table, ga->probe);
- if (probeId < 0) // probe in alias table doesn't exist in dataset
+    struct hashEl *el = hashLookup(gaHash, name);
+    if (!el)
+        {
+        bedFree(&nb);    /* no alias for this row; nothing from it is kept */
continue;
+        }
+    struct geneAlias *ga = el->val;
+    struct slName *sl;
+    for (sl = ga->genes; sl; sl = sl->next)
+        addDataToHash(dataHash, sl->name, expCount, expScores, allA);
+    bedFree(&nb);    /* addDataToHash stored copies of the scores it needs */
+    }
+sqlFreeResult(&sr);
+
+if (hashNumEntries(dataHash) == 0)
+    errAbort("no entries in hash\n");
+
+fprintf(stderr, "\treducing hash...\n");
+double med, std;
+if (!reduceDataHash(dataHash, &med, &std))
+    errAbort("problem reducing hash\n");
+
+fprintf(stderr, "\tconverting hash to analysisVals...\n");
+struct analysisVals *avList = getAnalysisVals(biConn, dataHash, med, std);
+
+fprintf(stderr, "\tstoring analysisVals...\n");
+storeAnalysisValsInDb(biConn, dataTable, avList);
+analysisValsFreeList(&avList);
+}
- struct slInt *si, *geneIds = getGeneIdsBySymbol(biConn, ga->genes);
- struct probeToGene *pg;
- AllocVar(pg);
- pg->probe_id = probeId;
- for (si = geneIds; si; si = si->next)
- {
- pg->gene_id = si->val;
- probeToGeneSaveToDb(biConn, pg, p2gTable, 10);
- }
- probeToGeneFree(&pg);
- }
-}
+int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
+/* Return the id stored in tableName for the row whose name column equals
+ * 'name', or -1 when no such row exists. */
+{
+char sql[128];
+safef(sql, sizeof(sql), "select id from %s where name = \"%s\"", tableName, name);
-void createPathwaysTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "name varchar(255) not null,\n");
-dyStringPrintf(dy, "source varchar(255) not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(name),\n");
-dyStringPrintf(dy, "KEY(id,name),\n");
-dyStringPrintf(dy, "KEY(name,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
-void createPathwayGenesTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "gene_id int unsigned not null,\n");
-dyStringPrintf(dy, "KEY(id),\n");
-dyStringPrintf(dy, "KEY(gene_id),\n");
-dyStringPrintf(dy, "KEY(id,gene_id),\n");
-dyStringPrintf(dy, "KEY(gene_id,id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
-}
-
-void createPathwayInfoTable(struct sqlConnection *biConn, char *tableName)
-{
-struct dyString *dy = newDyString(1024);
-dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "id int unsigned not null,\n");
-dyStringPrintf(dy, "description longblob not null,\n");
-dyStringPrintf(dy, "KEY(id)\n");
-dyStringPrintf(dy, ")\n");
-sqlUpdate(biConn,dy->string);
-dyStringFree(&dy);
+
+/* Check for a match first; only read the number once one is known to exist. */
+return sqlExists(biConn, sql) ? sqlQuickNum(biConn, sql) : -1;
}
+
char *getPathwayDescription(struct sqlConnection *pdConn, char *name)
{
if (!sqlTableExists(pdConn, name))
return NULL;
@@ -1081,59 +901,59 @@
return sqlQuickString(pdConn, query);
}
-void setupPathways(struct sqlConnection *biConn)
+void setupGenesets(struct sqlConnection *biConn)
{
-boolean inputPathways = FALSE;
-boolean inputPathwayInfo = FALSE;
-boolean inputPathwayGenes = FALSE;
+boolean inputGenesets = FALSE;
+boolean inputGenesetInfo = FALSE;
+boolean inputGenesetGenes = FALSE;
struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
if (!pdConn)
errAbort("Could not connect to pathways database.\n");
-if (sqlTableExists(biConn, PA_TABLE) && dropTable)
+if (sqlTableExists(biConn, GE_TABLE) && dropTable)
{
- fprintf(stderr, "pathways table already exists, dropping and recreating.\n");
- sqlDropTable(biConn, PA_TABLE);
+ fprintf(stderr, "%s table already exists, dropping and recreating.\n", GE_TABLE);
+ sqlDropTable(biConn, GE_TABLE);
}
-if (!sqlTableExists(biConn, PA_TABLE))
+if (!sqlTableExists(biConn, GE_TABLE))
{
- fprintf(stderr, "Creating pathways table.\n");
- createPathwaysTable(biConn, PA_TABLE);
- inputPathways = TRUE;
+ fprintf(stderr, "Creating %s table.\n", GE_TABLE);
+ createGenesetsTable(biConn, GE_TABLE);
+ inputGenesets = TRUE;
}
-if (sqlTableExists(biConn, PG_TABLE) && dropTable)
+if (sqlTableExists(biConn, GG_TABLE) && dropTable)
{
- fprintf(stderr, "pathwayGenes table already exists, dropping and recreating.\n");
- sqlDropTable(biConn, PG_TABLE);
+ fprintf(stderr, "%s table already exists, dropping and recreating.\n", GG_TABLE);
+ sqlDropTable(biConn, GG_TABLE);
}
-if (!sqlTableExists(biConn, PG_TABLE))
+if (!sqlTableExists(biConn, GG_TABLE))
{
- fprintf(stderr, "Creating pathwayGenes table.\n");
- createPathwayGenesTable(biConn, PG_TABLE);
- inputPathwayGenes = TRUE;
+ fprintf(stderr, "Creating %s table.\n", GG_TABLE);
+ createGenesetGenesTable(biConn, GG_TABLE);
+ inputGenesetGenes = TRUE;
}
-if (sqlTableExists(biConn, PI_TABLE) && dropTable)
+if (sqlTableExists(biConn, GI_TABLE) && dropTable)
{
- fprintf(stderr, "pathwayInfo table already exists, dropping and recreating.\n");
- sqlDropTable(biConn, PI_TABLE);
+ fprintf(stderr, "%s table already exists, dropping and recreating.\n", GI_TABLE);
+ sqlDropTable(biConn, GI_TABLE);
}
-if (!sqlTableExists(biConn, PI_TABLE))
+if (!sqlTableExists(biConn, GI_TABLE))
{
- fprintf(stderr, "Creeting pathwayInfo table.\n");
- createPathwayInfoTable(biConn, PI_TABLE);
- inputPathwayInfo = TRUE;
+ fprintf(stderr, "Creeting %s table.\n", GI_TABLE);
+ createGenesetInfoTable(biConn, GI_TABLE);
+ inputGenesetInfo = TRUE;
}
-if (!inputPathways && !inputPathwayInfo && !inputPathwayGenes)
+if (!inputGenesets && !inputGenesetInfo && !inputGenesetGenes)
{
- fprintf(stderr, "Nothing to do for pathway tables.\n");
+ fprintf(stderr, "Nothing to do for geneset tables.\n");
return;
}
/* Setting up pathways table */
@@ -1155,59 +975,164 @@
slReverse(&names);
slReverse(&genes);
sqlFreeResult(&sr);
-int id = 0;
for (na = names, ge = genes; na && ge; na = na->next, ge = ge->next)
{
char *name = na->name;
- char *genes = ge->name;
+ char *members = ge->name;
- struct slName *slList = slNameListFromComma(genes);
- struct slInt *si, *siList = getGeneIdsBySymbol(biConn, slList);
+ struct analysisFeatures *gsAf = getAnalysisFeatures(biConn, name, "geneset");
+ struct analysisFeatures *af, *afList = getAnalysisFeatures(biConn, members, "gene");
- struct pathways *ps;
+ if (!gsAf)
+ continue;
+
+ if (!afList)
+ continue;
- if (inputPathways)
+ if (inputGenesets)
{
- AllocVar(ps);
- ps->id = id;
- ps->name = cloneString(name);
- ps->source = cloneString("N/A");
- pathwaysSaveToDb(biConn, ps, PA_TABLE, 100);
- pathwaysFree(&ps);
+ struct genesets *gs;
+ AllocVar(gs);
+ gs->id = gsAf->id;
+ gs->name = cloneString(name);
+ gs->source = cloneString("N/A");
+ genesetsSaveToDb(biConn, gs, GE_TABLE, 100);
+ genesetsFree(&gs);
}
- if (inputPathwayGenes)
+ if (inputGenesetGenes)
{
- struct pathwayGenes *pg;
- AllocVar(pg);
- pg->id = id;
- for (si = siList; si; si = si->next)
+ struct genesetGenes *gg;
+ AllocVar(gg);
+ gg->id = gsAf->id;
+ for (af = afList; af; af = af->next)
{
- pg->gene_id = si->val;
- pathwayGenesSaveToDb(biConn, pg, PG_TABLE, 100);
+ gg->gene_id = af->id;
+ genesetGenesSaveToDb(biConn, gg, GG_TABLE, 100);
}
- pathwayGenesFree(&pg);
+ genesetGenesFree(&gg);
}
- if (inputPathwayInfo)
+ if (inputGenesetInfo)
{
char *desc = getPathwayDescription(pdConn, name);
if (desc)
{
- struct pathwayInfo *pi;
- AllocVar(pi);
- pi->id = id;
- pi->description = desc;
- pathwayInfoSaveToDbEscaped(biConn, pi, PI_TABLE, 200);
- pathwayInfoFree(&pi);
+ struct genesetInfo *gi;
+ AllocVar(gi);
+ gi->id = gsAf->id;
+ gi->description = desc;
+ genesetInfoSaveToDbEscaped(biConn, gi, GI_TABLE, 200);
+ genesetInfoFree(&gi);
+ }
}
+
+ analysisFeaturesFree(&gsAf);
+ analysisFeaturesFreeList(&afList);
+ }
+
+hFreeConn(&pdConn);
+}
+
+boolean analysisFeatureExists(struct sqlConnection *biConn, struct analysisFeatures *af)
+/* Return whether AF_TABLE already holds a row matching af's id,
+ * feature_name and type. */
+{
+char sql[256];
+safef(sql, sizeof(sql),
+      "select * from %s where id = %d and feature_name = \"%s\" and type = \"%s\"",
+      AF_TABLE, af->id, af->feature_name, af->type);
+boolean found = sqlExists(biConn, sql);
+return found;
+}
+
+int findIdForAnalysisFeature(struct sqlConnection *biConn, char *tableName,
+                             struct analysisFeatures *af)
+/* Pick the id to use for af in tableName: 0 when the table is empty, the id
+ * already stored for the same feature_name and type when there is one, and
+ * otherwise one more than the largest id in the table. */
+{
+char sql[256];
+
+if (sqlTableSize(biConn, tableName) == 0)
+    return 0;    /* first row of a brand new table */
+
+safef(sql, sizeof(sql),
+      "select DISTINCT id from %s where feature_name = \"%s\" and type = \"%s\";",
+      tableName, af->feature_name, af->type);
+if (sqlExists(biConn, sql))
+    return sqlQuickNum(biConn, sql);    /* feature already stored, reuse its id */
+
+/* Feature not stored yet: take the id after the current maximum. */
+safef(sql, sizeof(sql), "select max(id) from %s;", tableName);
+return sqlQuickNum(biConn, sql) + 1;
+}
+
+void setupAnalysisFeatures(struct sqlConnection *biConn)
+/* Fill AF_TABLE, creating it first if it is missing, with one "gene" feature
+ * per distinct geneSymbol in KX_TABLE followed by one "geneset" feature per
+ * name in the genesets table of the "pathway" database.  Does nothing when
+ * AF_TABLE already has rows. */
+{
+if (!sqlTableExists(biConn, AF_TABLE))
+    createAnalysisFeaturesTable(biConn, AF_TABLE);
+
+if (sqlTableSize(biConn, AF_TABLE) != 0)
+    {
+    fprintf(stderr, "%s table is not empty, doing nothing\n", AF_TABLE);
+    return;
}
- id++;
+
+/* set up gene features */
+char query[256];
+safef(query, sizeof(query),
+      "select DISTINCT geneSymbol from %s;", KX_TABLE);
+struct slName *sl, *slList = sqlQuickList(biConn, query);
+
+struct analysisFeatures *af, *afList = NULL;
+
+for (sl = slList; sl; sl = sl->next)
+    {
+    AllocVar(af);
+    af->id = 0;    /* placeholder; the real id is assigned just before saving */
+    af->feature_name = cloneString(sl->name);
+    af->type = cloneString("gene");
+
+    slAddHead(&afList, af);
}
+slNameFreeList(&slList);
+
+/* set up geneset features */
+struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
+if (!pdConn)    /* same check setupGenesets() makes on this connection */
+    errAbort("Could not connect to pathways database.\n");
+safef(query, sizeof(query),
+      "select name from genesets;");
+slList = sqlQuickList(pdConn, query);
+for (sl = slList; sl; sl = sl->next)
+    {
+    AllocVar(af);
+    af->id = 0;
+    af->feature_name = cloneString(sl->name);
+    af->type = cloneString("geneset");
+
+    slAddHead(&afList, af);
+    }
hFreeConn(&pdConn);
+slNameFreeList(&slList);
+
+/* Entries were pushed on the head; restore genes-then-genesets order. */
+slReverse(&afList);
+
+/* set up pathway features (TODO) */
+
+
+/* save features to db */
+for (af = afList; af; af = af->next)
+    {
+    int feature_id = findIdForAnalysisFeature(biConn, AF_TABLE, af);
+    af->id = feature_id;
+    if (!analysisFeatureExists(biConn, af))
+        analysisFeaturesSaveToDb(biConn, af, AF_TABLE, 10);
+    }
+
+/* Release the whole list, not just its first element. */
+analysisFeaturesFreeList(&afList);
}
void populateDb(char *db, char *tableName, char *tissue)
@@ -1215,18 +1140,24 @@
tissue = strLower(tissue);
struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
struct sqlConnection *hgConn = hAllocConnProfile("localDb", hgDb);
-/* Create geneLookup table (if necessary) */
+/* Create analysis features (if necessary) */
uglyTime(NULL);
-fprintf(stderr, "Setting up geneLookup table...\n");
-createGeneLookup(biConn);
+fprintf(stderr, "Setting up analysis features...\n");
+setupAnalysisFeatures(biConn);
uglyTime("Time");
+/* Create geneLookup table (if necessary) */
+//uglyTime(NULL);
+//fprintf(stderr, "Setting up geneLookup table...\n");
+//createGeneLookup(biConn);
+//uglyTime("Time");
+
/* Set up pathways */
uglyTime(NULL);
fprintf(stderr, "Setting up pathways tables...\n");
-setupPathways(biConn);
+setupGenesets(biConn);
uglyTime("Time");
/* Set up datasets entry */
struct maGrouping *allA = getMaGrouping(hgConn, tableName);
@@ -1253,15 +1184,9 @@
/* Set up probeInfo table (if necessary) and probeVals table */
uglyTime(NULL);
fprintf(stderr, "Setting up probe data tables (be patient!)...\n");
-setupProbeData(hgConn, biConn, da);
-uglyTime("Time");
-
-/* Set up probeToGene table (if necessary) */
-uglyTime(NULL);
-fprintf(stderr, "Setting up probeToGene table...\n");
-setupProbeToGene(hgConn, biConn, da);
+setupProbeData(hgConn, biConn, da, allA);
uglyTime("Time");
fprintf(stderr, "Done!");
fprintf(stderr, "Please run 'setCohort' to find datasets that have overlapping samples.\n");