src/hg/instinct/bioInt2/bioIntDriver.c 1.2
1.2 2009/03/24 05:21:54 jsanborn
updated
Index: src/hg/instinct/bioInt2/bioIntDriver.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/bioIntDriver.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/instinct/bioInt2/bioIntDriver.c 20 Mar 2009 06:06:31 -0000 1.1
+++ src/hg/instinct/bioInt2/bioIntDriver.c 24 Mar 2009 05:21:54 -0000 1.2
@@ -1,1706 +1,1714 @@
/* bioIntDriver.c
* All rights reserved -- J. Zachary Sanborn
*/
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "hdb.h"
#include "dystring.h"
#include "cprob.h"
#include "hgStatsLib.h"
#include "bioIntDriver.h"
static char *heatMapDbProfile = "localDb";
/* Begin helper functions */
int addDelimStringToList(struct slName **list, char *str, char sep)
{
if (!str)
return 0;
struct slName *sl, *slList = slNameListFromString(str, sep);
for (sl = slList; sl; sl = sl->next)
slNameAddHead(list, sl->name);
slReverse(list);
slNameFreeList(&slList);
return 1;
}
void slNamePrint(struct slName *list)
{
struct slName *sl;
for (sl = list; sl; sl = sl->next)
fprintf(stdout, "%s ", sl->name);
fprintf(stdout, "\n");
}
struct hash *uniqueHashFromSlNameList(void *list)
/* Create a unique hash out of a list of slNames or any kind of list where the */
/* first field is the next pointer and the second is the name.
* -- Adapted from hashFromSlNameList in hash.h */
{
if (!list)
return NULL;
struct slName *namedList = list, *item;
struct hash *hash = hashNew(10);
for (item = namedList; item != NULL; item = item->next)
{
if (!hashLookup(hash, item->name))
hashAdd(hash, item->name, item);
}
return hash;
}
struct slName *slNameUniqueList(struct slName *list)
{
if (!list)
return NULL;
struct hash *hash = uniqueHashFromSlNameList(list);
/* Iterate through names in hash, saving to new list */
struct slName *slList = NULL;
struct hashCookie cookie = hashFirst(hash);
char *name;
while ((name = hashNextName(&cookie)) != NULL)
slNameAddHead(&slList, name);
slNameFreeList(&list);
freeHash(&hash);
return slList;
}
/* End helper functions */
/* Begin database functions */
void addSample(struct biOmics *bi, struct samples *sa)
{
struct hashEl *el = hashLookup(bi->samplesHash, sa->name);
if (el)
{ /* already in hash, free memory and move on */
samplesFree(&sa);
return;
}
slAddHead(&bi->samples, sa);
hashAdd(bi->samplesHash, sa->name, sa);
}
void loadAllSamples(struct sqlConnection *conn, struct biOmics *bi)
{
struct datasets *da = bi->dataset;
char query[128];
safef(query, sizeof(query),
- "select * from samples where dataset_id = %d order by id;",
- da->id);
+ "select * from %s where dataset_id = %d order by id;",
+ SA_TABLE, da->id);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
struct samples *sa = samplesLoad(row);
addSample(bi, sa);
}
sqlFreeResult(&sr);
}
void loadSamplesInList(struct sqlConnection *conn, struct biOmics *bi,
struct slName *sampleList)
{
if (!sampleList)
return;
struct slName *sl;
struct datasets *da = bi->dataset;
struct dyString *dy = newDyString(100);
-dyStringPrintf(dy, "select * from samples where name in (");
+dyStringPrintf(dy, "select * from %s where name in (", SA_TABLE);
for (sl = sampleList; sl; sl = sl->next)
{
dyStringPrintf(dy, "'%s'", sl->name);
if (sl->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ") and dataset_id = %d order by id;", da->id);
char *query = dyStringCannibalize(&dy);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
struct samples *sa = samplesLoad(row);
addSample(bi, sa);
}
sqlFreeResult(&sr);
}
void loadSamplesMatchingFeatureVals(struct sqlConnection *conn, struct biOmics *bi,
struct slName *featureValList)
{
if (!featureValList)
return;
//TODO: Inner JOIN!
//select * from clinicalData p1 join clinicalData p2 on p1.sample_id = p2.sample_id and p1.code = 'Positive' and p1.feature_id = 0 and p2.code = 'Positive' and p2.feature_id = 1;
struct datasets *da = bi->dataset;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy,
- "select * from samples "
- "join clinicalData on samples.id = clinicalData.sample_id "
- "join features on clinicalData.feature_id = features.id "
- "where samples.dataset_id = %d ",
- da->id);
+ "select * from %s "
+ "join %s on %s.id = %s.sample_id "
+ "join %s on %s.feature_id = %s.id "
+ "where %s.dataset_id = %d ",
+ SA_TABLE, CD_TABLE, SA_TABLE, CD_TABLE, FE_TABLE,
+ CD_TABLE, FE_TABLE, SA_TABLE, da->id);
struct slName *sl, *fv, *slList = NULL;
for (fv = featureValList; fv; fv = fv->next)
{
addDelimStringToList(&slList, fv->name, ' ');
if (slCount(slList) != 3)
{
fprintf(stderr, "Improperly formatted feature-value pair: %s\n", fv->name);
slNameFreeList(&slList);
slList = NULL;
continue;
}
sl = slList;
char *name = sl->name;
sl = sl->next;
char *operation = sl->name;
sl = sl->next;
char *val = sl->name;
- dyStringPrintf(dy, " and features.name = '%s'", name);
- dyStringPrintf(dy, " and clinicalData.val %s %s", operation, val);
+ dyStringPrintf(dy, " and %s.name = '%s'", FE_TABLE, name);
+ dyStringPrintf(dy, " and %s.val %s %s", CD_TABLE, operation, val);
slNameFreeList(&slList);
slList = NULL;
}
-dyStringPrintf(dy, " order by samples.id;");
+dyStringPrintf(dy, " order by %s.id;", SA_TABLE);
char *query = dyStringCannibalize(&dy);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
struct samples *sa = samplesLoad(row);
addSample(bi, sa);
}
sqlFreeResult(&sr);
}
void loadSamplesMatchingFeatureCodes(struct sqlConnection *conn, struct biOmics *bi,
struct slName *featureCodeList)
{
if (!featureCodeList)
return;
//TODO: Inner JOIN!
//select * from clinicalData p1 join clinicalData p2 on p1.sample_id = p2.sample_id and p1.code = 'Positive' and p1.feature_id = 0 and p2.code = 'Positive' and p2.feature_id = 1;
struct datasets *da = bi->dataset;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy,
- "select * from samples "
- "join clinicalData on samples.id = clinicalData.sample_id "
- "join features on clinicalData.feature_id = features.id "
- "where samples.dataset_id = %d and (",
- da->id);
+ "select * from %s "
+ "join %s on %s.id = %s.sample_id "
+ "join %s on %s.feature_id = %s.id "
+ "where %s.dataset_id = %d and (",
+ SA_TABLE, CD_TABLE, SA_TABLE, CD_TABLE, FE_TABLE,
+ CD_TABLE, FE_TABLE, SA_TABLE, da->id);
struct slName *sl, *fc, *slList = NULL;
for (fc = featureCodeList; fc; fc = fc->next)
{
addDelimStringToList(&slList, fc->name, ' ');
if (slCount(slList) != 3)
{
fprintf(stderr, "Improperly formatted feature-code pair: %s\n", fc->name);
slNameFreeList(&slList);
slList = NULL;
continue;
}
sl = slList;
char *name = sl->name;
sl = sl->next;
char *operation = sl->name;
sl = sl->next;
char *code = sl->name;
- dyStringPrintf(dy, " (features.name = '%s'", name);
+ dyStringPrintf(dy, " (%s.name = '%s'", FE_TABLE, name);
if (sameString(code, "NULL"))
- dyStringPrintf(dy, " and clinicalData.code is NULL)");
+ dyStringPrintf(dy, " and %s.code is NULL)", CD_TABLE);
else
- dyStringPrintf(dy, " and clinicalData.code %s '%s')", operation, code);
+ dyStringPrintf(dy, " and %s.code %s '%s')", CD_TABLE, operation, code);
slNameFreeList(&slList);
slList = NULL;
if (fc->next)
dyStringPrintf(dy, " or");
}
-dyStringPrintf(dy, ") order by samples.id;");
+dyStringPrintf(dy, ") order by %s.id;", SA_TABLE);
char *query = dyStringCannibalize(&dy);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
struct samples *sa = samplesLoad(row);
addSample(bi, sa);
}
sqlFreeResult(&sr);
}
void loadSamples(struct sqlConnection *conn, struct biOmics *bi, struct biQuery *bq)
{
if (!bq->sampleList && !bq->featureValList && !bq->featureCodeList)
{
fprintf(stderr, "Loading ALL samples from dataset.\n");
loadAllSamples(conn, bi);
return;
}
/* Load samples from list */
loadSamplesInList(conn, bi, bq->sampleList);
/* Load samples matching feature-value pairs, e.g. B <= 5,C > 6,... */
loadSamplesMatchingFeatureVals(conn, bi, bq->featureValList);
/* Load samples matching feature-code pairs, e.g. ER = Positive */
loadSamplesMatchingFeatureCodes(conn, bi, bq->featureCodeList);
}
void addProbeInfo(struct biOmics *bi, struct probeInfo *pi)
{
struct hashEl *el = hashLookup(bi->probeInfoHash, pi->name);
if (el)
{
probeInfoFree(&pi);
return; // already there;
}
slAddHead(&bi->probeInfo, pi);
hashAdd(bi->probeInfoHash, pi->name, pi);
/* Allocate space for slPair data */
char probe_id[128];
safef(probe_id, sizeof(probe_id), "%d", pi->id);
struct slPair *sp;
AllocVar(sp);
sp->name = cloneString(pi->name);
sp->val = NULL;
slAddHead(&bi->probes, sp);
hashAdd(bi->probesHash, probe_id, sp);
}
void loadPathways(struct sqlConnection *conn, struct biOmics *bi, struct slName *pathwayList)
{
if (!pathwayList)
return;
struct slName *sl;
struct datasets *da = bi->dataset;
char *pInfo = da->probe_table;
char *p2g = da->probe_to_gene_table;
if (!pInfo || !p2g)
return;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy,
"select %s.* from %s join %s on %s.probe_id = %s.id "
- "join pathwayGenes on pathwayGenes.gene_id = %s.gene_id "
- "join pathways on pathways.id = pathwayGenes.id "
- "where pathways.name in (",
- pInfo, pInfo, p2g, p2g, pInfo, p2g);
+ "join %s on %s.gene_id = %s.gene_id "
+ "join %s on %s.id = %s.id "
+ "where %s.name in (",
+ pInfo, pInfo, p2g, p2g, pInfo,
+ PG_TABLE, PG_TABLE, p2g,
+ PA_TABLE, PA_TABLE, PG_TABLE,
+ PA_TABLE);
for (sl = pathwayList; sl; sl = sl->next)
{
dyStringPrintf(dy, "'%s'", sl->name);
if (sl->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ");");
char *query = dyStringCannibalize(&dy);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
struct probeInfo *pi = probeInfoLoad(row);
addProbeInfo(bi, pi);
}
sqlFreeResult(&sr);
}
void loadGenes(struct sqlConnection *conn, struct biOmics *bi, struct slName *geneList)
{
if (!geneList)
return;
struct slName *sl;
struct datasets *da = bi->dataset;
char *pInfo = da->probe_table;
char *p2g = da->probe_to_gene_table;
if (!pInfo || !p2g)
return;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy,
"select * from %s join %s on %s.probe_id = %s.id "
- "join geneLookup on geneLookup.id = %s.gene_id where kgId in (",
- pInfo, p2g, p2g, pInfo, p2g);
+ "join %s on %s.id = %s.gene_id where kgId in (",
+ pInfo, p2g, p2g, pInfo,
+ GL_TABLE, GL_TABLE, p2g);
for (sl = geneList; sl; sl = sl->next)
{
dyStringPrintf(dy, "'%s'", sl->name);
if (sl->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ");");
char *query = dyStringCannibalize(&dy);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
struct probeInfo *pi = probeInfoLoad(row);
addProbeInfo(bi, pi);
}
sqlFreeResult(&sr);
}
void loadProbeInfo(struct sqlConnection *conn, struct biOmics *bi,
struct slName *probeList, boolean allData)
{
if (!probeList && !allData)
return;
struct datasets *da = bi->dataset;
struct slName *sl;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy, "select * from %s ", da->probe_table);
if (probeList)
{
dyStringPrintf(dy, "where name in (");
for (sl = probeList; sl; sl = sl->next)
{
dyStringPrintf(dy, "'%s'", sl->name);
if (sl->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ");");
}
char *query = dyStringCannibalize(&dy);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
struct probeInfo *pi = probeInfoLoad(row);
addProbeInfo(bi, pi);
}
sqlFreeResult(&sr);
}
void loadProbeInfoRandom(struct sqlConnection *conn, struct biOmics *bi, int numProbes)
{
if (numProbes <= 0)
return;
struct datasets *da = bi->dataset;
char q[256];
safef(q, sizeof(q), "select max(id) from %s;", da->probe_table);
int maxProbes = sqlQuickNum(conn, q);
if (numProbes >= maxProbes) // what's the point?!?
errAbort("Number of random probes greater than probes in table!.");
int i = 0, total = 0;
char rStr[128];
struct hash *hash = hashNew(0);
while (i < numProbes && total < maxProbes * 2)
{
total++;
int r = random() % maxProbes;
safef(rStr, sizeof(rStr), "%d", r);
if (hashLookup(hash, rStr))
continue;
hashAddInt(hash, rStr, 1);
i++;
}
struct slInt *si, *siList = NULL;
struct hashCookie cookie = hashFirst(hash);
char *name;
while ((name = hashNextName(&cookie)) != NULL)
{
si = slIntNew(atoi(name));
slAddHead(&siList, si);
}
slSort(&siList, slIntCmp);
hashFree(&hash);
if (slCount(siList) != numProbes)
errAbort("Random list not correct length %d != %d.", slCount(siList), numProbes);
struct dyString *dy = newDyString(100);
dyStringPrintf(dy, "select * from %s where id in (", da->probe_table);
for (si = siList; si; si = si->next)
{
dyStringPrintf(dy, "%d", si->val);
if (si->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ");");
char *query = dyStringCannibalize(&dy);
slFreeList(&siList);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
struct probeInfo *pi = probeInfoLoad(row);
addProbeInfo(bi, pi);
}
sqlFreeResult(&sr);
}
void loadProbeSampleValData(struct sqlConnection *conn, struct biOmics *bi, boolean allData)
{ /* boolean allData is unused in this function */
struct datasets *da = bi->dataset;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy, "select * from %s", da->data_table);
if (bi->probeInfo)
{
struct probeInfo *pi;
dyStringPrintf(dy, " where probe_id in (");
for (pi = bi->probeInfo; pi; pi = pi->next)
{
dyStringPrintf(dy, "%d", pi->id);
if (pi->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ")");
}
if (bi->samples)
{
if (bi->probeInfo)
dyStringPrintf(dy, " and");
else
dyStringPrintf(dy, " where");
struct samples *sa;
dyStringPrintf(dy, " sample_id in (");
for (sa = bi->samples; sa; sa = sa->next)
{
dyStringPrintf(dy, "%d", sa->id);
if (sa->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ")");
}
dyStringPrintf(dy, ";");
char *query = dyStringCannibalize(&dy);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
int count = 0;
struct slPair *sp;
struct probeSampleVal *pv;
while ((row = sqlNextRow(sr)) != NULL)
{
char *probe_id = row[0];
struct hashEl *el = hashLookup(bi->probesHash, probe_id);
if (!el)
continue;
sp = el->val;
pv = probeSampleValLoad(row);
slAddHead(&sp->val, pv);
count++;
}
if (DEBUG)
fprintf(stdout, "found %d probes\n", count);
sqlFreeResult(&sr);
}
void loadProbeValsData(struct sqlConnection *conn, struct biOmics *bi, boolean allData)
{
if (!bi->probeInfo && !allData)
return;
struct datasets *da = bi->dataset;
struct probeInfo *pi;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy, "select * from %s ", da->data_table);
if (bi->probeInfo && !allData)
{
dyStringPrintf(dy, "where probe_id in (");
for (pi = bi->probeInfo; pi; pi = pi->next)
{
dyStringPrintf(dy, "%d", pi->id);
if (pi->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ")");
}
char *query = dyStringCannibalize(&dy);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
int count = 0;
struct slPair *sp;
struct probeVals *pv;
while ((row = sqlNextRow(sr)) != NULL)
{
char *probe_id = row[0];
struct hashEl *el = hashLookup(bi->probesHash, probe_id);
if (!el)
continue;
sp = el->val;
pv = probeValsLoad(row);
count += pv->sample_count;
slAddHead(&sp->val, pv);
}
if (DEBUG)
fprintf(stdout, "found %d probes\n", count);
sqlFreeResult(&sr);
}
struct biData *probeValsForProbe(struct biOmics *bi, char *probe)
{
struct hashEl *el = hashLookup(bi->probeInfoHash, probe);
if (!el)
return NULL;
struct probeInfo *pi = el->val;
int probeId = pi->id;
char pStr[128];
safef(pStr, sizeof(pStr), "%d", probeId);
el = hashLookup(bi->probesHash, pStr);
if (!el)
return NULL;
struct slPair *sp = el->val;
struct biData *bd = biDataNew(probe);
struct samples *sa;
struct probeVals *pv;
struct slDouble *sd;
for (pv = sp->val; pv; pv = pv->next)
{
for (sa = bi->samples; sa; sa = sa->next)
{
sd = slDoubleNew(pv->sample_data[sa->exp_id]);
slAddHead(&bd->data, sd);
hashAdd(bd->hash, sa->name, sd);
}
}
slReverse(&bd->data);
return bd;
}
struct biData *probeValsForSample(struct biOmics *bi, char *sample)
{
struct hashEl *el = hashLookup(bi->samplesHash, sample);
if (!el)
return NULL;
struct samples *sa = el->val;
int expId = sa->exp_id;
struct biData *bd = biDataNew(NULL);
struct slDouble *sd;
struct slPair *sp;
for (sp = bi->probes; sp; sp = sp->next)
{
struct probeVals *pv;
for (pv = sp->val; pv; pv = pv->next)
{
if (expId >= pv->sample_count)
errAbort("expId is greater than bd->vals[] array length.");
sd = slDoubleNew(pv->sample_data[expId]);
slAddHead(&bd->data, sd);
hashAdd(bd->hash, sp->name, sd);
}
}
slReverse(&bd->data);
return bd;
}
double probeValsForProbeSample(struct biOmics *bi,
char *probe, char *sample)
{
struct hashEl *el = hashLookup(bi->probeInfoHash, probe);
if (!el)
return DBL_NULL;
struct probeInfo *pi = el->val;
int probeId = pi->id;
char pStr[128];
safef(pStr, sizeof(pStr), "%d", probeId);
el = hashLookup(bi->probesHash, pStr);
if (!el)
return DBL_NULL;
struct slPair *sp = el->val;
el = hashLookup(bi->samplesHash, sample);
if (!el)
return DBL_NULL;
struct samples *sa = el->val;
int expId = sa->exp_id;
struct probeVals *pv = sp->val;
return pv->sample_data[expId];
}
boolean calcMedianMAD(struct biOmics *bi, double *median, double *mad)
{
int count = 0;
struct samples *sa;
struct slPair *sp;
struct probeVals *pv;
struct slDouble *sd, *sdList = NULL;
for (sp = bi->probes; sp; sp = sp->next)
{
pv = sp->val;
for (sa = bi->samples; sa; sa = sa->next)
{
sd = slDoubleNew(pv->sample_data[sa->exp_id]);
slAddHead(&sdList, sd);
count++;
}
}
if (!sdList)
return FALSE;
if (count <= 1)
{
slFreeList(&sdList);
return FALSE;
}
double medianTmp = slDoubleMedian(sdList);
/* Manually calculate approx std dev according to 99.7% cut-off (3 sigma) */
slSort(sdList, slDoubleCmp);
int low = round(count * (0.0015));
int high = round(count * (1.0 - 0.0015));
sd = slElementFromIx(sdList, low);
double lowVal = sd->val;
sd = slElementFromIx(sdList, high);
double highVal = sd->val;
*median = medianTmp;
*mad = max(fabs(lowVal - medianTmp), fabs(highVal - medianTmp))/3.0;
slFreeList(&sdList);
return TRUE;
}
///* Calculate median absolute deviation for a more
// * robust measure than standard deviation */
//for (sd = sdList; sd; sd = sd->next)
// sd->val = fabs(sd->val - medianTmp);
//*median = medianTmp;
//*mad = slDoubleMedian(sdList);
//slFreeList(&sdList);
//return TRUE;
//}
void probeValsConvertToLogP(struct biOmics *bi)
{ /* tranforms all data to -log(p-value) based on normal z-score transform */
double median, mad;
if (!calcMedianMAD(bi, &median, &mad))
return;
/* To estimate std deviation from mad, use std = 1.43 * mad
* Brideau et al. J Biomol Screen, 2003 */
double std = mad * 1.43;
fprintf(stderr, "median = %f, mad = %f, std = %f\n", median, mad, std);
if (std == 0.0)
return;
double maxLogP = 88.0; /* max, in case p-value comes back zero */
struct samples *sa;
struct slPair *sp;
struct probeVals *pv;
double z, p;
double val;
for (sp = bi->probes; sp; sp = sp->next)
{
pv = sp->val;
for (sa = bi->samples; sa; sa = sa->next)
{
z = (pv->sample_data[sa->exp_id] - median)/std;
p = ndtr(-1.0*fabs(z));
if (p > 0)
val = min(-log(p)/log(10.0), maxLogP);
else
val = maxLogP;
if (z < 0.0)
val = -1.0*val; // signed log(p-value)
pv->sample_data[sa->exp_id] = val;
}
}
}
void probeValsConvertToZscores(struct biOmics *bi)
{ /* transforms all data to z-scores, median-centered */
double median, mad;
if (!calcMedianMAD(bi, &median, &mad))
return;
/* To estimate std deviation from mad, use std = 1.43 * mad
* Brideau et al. J Biomol Screen, 2003 */
double std = mad * 1.43;
fprintf(stderr, "median = %f, mad = %f, std = %f\n", median, mad, std);
if (std == 0.0)
return;
struct samples *sa;
struct slPair *sp;
struct probeVals *pv;
double z;
for (sp = bi->probes; sp; sp = sp->next)
{
pv = sp->val;
for (sa = bi->samples; sa; sa = sa->next)
{
z = (pv->sample_data[sa->exp_id] - median)/std;
pv->sample_data[sa->exp_id] = z;
}
}
}
void probeSampleValConvertToLogP(struct biOmics *bi)
{
return;
}
void probeSampleValConvertToZscores(struct biOmics *bi)
{
return;
}
struct biData *probeSampleValForProbe(struct biOmics *bi, char *probe)
{
struct hashEl *el = hashLookup(bi->probeInfoHash, probe);
if (!el)
return NULL;
struct probeInfo *pi = el->val;
int probeId = pi->id;
char pStr[128];
safef(pStr, sizeof(pStr), "%d", probeId);
el = hashLookup(bi->probesHash, pStr);
if (!el)
return NULL;
struct slPair *sp = el->val;
struct biData *bd = biDataNew(probe);
struct samples *sa;
struct probeSampleVal *pv;
struct slDouble *sd;
for (pv = sp->val; pv; pv = pv->next)
{
for (sa = bi->samples; sa; sa = sa->next)
{
if (!sa->exp_id == pv->sample_id)
continue;
sd = slDoubleNew(pv->val);
slAddHead(&bd->data, sd);
hashAdd(bd->hash, sa->name, sd);
break;
}
}
slReverse(&bd->data);
return bd;
}
struct biData *probeSampleValForSample(struct biOmics *bi, char *sample)
{
struct hashEl *el = hashLookup(bi->samplesHash, sample);
if (!el)
return NULL;
struct samples *sa = el->val;
int expId = sa->exp_id;
struct biData *bd = biDataNew(sample);
struct slDouble *sd;
struct slPair *sp;
for (sp = bi->probes; sp; sp = sp->next)
{
struct probeSampleVal *pv;
for (pv = sp->val; pv; pv = pv->next)
{
if (expId != pv->sample_id)
continue;
sd = slDoubleNew(pv->val);
slAddHead(&bd->data, sd);
hashAdd(bd->hash, sp->name, sd);
}
}
slReverse(&bd->data);
return bd;
}
double probeSampleValForProbeSample(struct biOmics *bi,
char *probe, char *sample)
{
struct hashEl *el = hashLookup(bi->probeInfoHash, probe);
if (!el)
return DBL_NULL;
struct probeInfo *pi = el->val;
int probeId = pi->id;
char pStr[128];
safef(pStr, sizeof(pStr), "%d", probeId);
el = hashLookup(bi->probesHash, pStr);
if (!el)
return DBL_NULL;
struct slPair *sp = el->val;
el = hashLookup(bi->samplesHash, sample);
if (!el)
return DBL_NULL;
struct samples *sa = el->val;
int expId = sa->exp_id;
struct probeSampleVal *pv = sp->val;
for (pv = sp->val; pv; pv = pv->next)
{
if (expId != pv->sample_id)
continue;
return pv->val;
}
return DBL_NULL;
}
void setDataType(struct sqlConnection *conn, struct biOmics *bi)
{
struct datasets *da = bi->dataset;
char query[128];
safef(query, sizeof(query),
- "select * from dataTypes where id = %d",
- da->type_id);
+ "select * from %s where id = %d",
+ DT_TABLE, da->type_id);
struct dataTypes *dt = dataTypesLoadByQuery(conn, query);
if (!dt)
errAbort("Datatype with id = %d not found in database", da->type_id);
bi->type = cloneString(dt->name);
if (sameString(dt->format, "probeVals"))
{
bi->loadData = loadProbeValsData;
bi->freeData = slPairValsFreeList;
bi->dataForProbe = probeValsForProbe;
bi->dataForSample = probeValsForSample;
bi->dataForProbeSample = probeValsForProbeSample;
bi->toZscores = probeValsConvertToZscores;
bi->toLogP = probeValsConvertToLogP;
}
else if (sameString(dt->format, "probeSampleVal"))
{
bi->loadData = loadProbeSampleValData;
bi->freeData = slPairSampleValFreeList;
bi->dataForProbe = probeSampleValForProbe;
bi->dataForSample = probeSampleValForSample;
bi->dataForProbeSample = probeSampleValForProbeSample;
bi->toZscores = probeSampleValConvertToZscores;
bi->toLogP = probeSampleValConvertToLogP;
}
else
errAbort("Unrecognized datatype");
dataTypesFreeList(&dt);
}
void loadDataset(struct sqlConnection *conn, struct biOmics *bi)
{
if (!bi->name)
return;
/* Make sure only one dataset is loaded (hence 'limit 1') */
char query[256];
safef(query, sizeof(query),
- "select * from datasets where data_table = '%s' limit 1;",
- bi->name);
+ "select * from %s where data_table = '%s' limit 1;",
+ DA_TABLE, bi->name);
bi->dataset = datasetsLoadByQuery(conn, query);
if (!bi->dataset)
errAbort("No datasets named %s found in database.", bi->name);
/* Set data type */
setDataType(conn, bi);
}
/* End database functions */
struct biOmics *biOmicsMatchDataset(struct biOmics *biList, char *name)
{
if (!name)
return NULL;
struct biOmics *bi;
for (bi = biList; bi; bi = bi->next)
if (sameString(bi->name, name))
return bi;
return NULL;
}
int biOmicsPopulateAll(struct biOmics *bi, struct biQuery *bq)
{
struct sqlConnection *biConn = hAllocConnProfile(heatMapDbProfile, bi->db);
loadDataset(biConn, bi);
loadSamples(biConn, bi, bq);
loadProbeInfo(biConn, bi, NULL, TRUE);
bi->loadData(biConn, bi, TRUE);
hFreeConn(&biConn);
return 0;
}
int biOmicsPopulate(struct biOmics *bi, struct biQuery *bq)
{
struct sqlConnection *biConn = hAllocConnProfile(heatMapDbProfile, bi->db);
if (DEBUG)
uglyTime(NULL);
loadDataset(biConn, bi);
if (DEBUG)
uglyTime("Load Dataset");
loadSamples(biConn, bi, bq);
if (DEBUG)
uglyTime("Load Samples");
loadPathways(biConn, bi, bq->pathwayList);
if (DEBUG)
uglyTime("Load Pathways");
loadGenes(biConn, bi, bq->geneList);
if (DEBUG)
uglyTime("Load Genes");
loadProbeInfo(biConn, bi, bq->probeList, FALSE);
if (DEBUG)
uglyTime("Load Probe Info");
bi->loadData(biConn, bi, FALSE);
if (DEBUG)
uglyTime("Load Probe");
hFreeConn(&biConn);
return 0;
}
int biOmicsPopulateRandom(struct biOmics *bi, struct biQuery *bq, int numProbes)
{
struct sqlConnection *biConn = hAllocConnProfile(heatMapDbProfile, bi->db);
if (DEBUG)
uglyTime(NULL);
loadDataset(biConn, bi);
if (DEBUG)
uglyTime("Load Dataset");
loadSamples(biConn, bi, bq);
if (DEBUG)
uglyTime("Load Samples");
loadProbeInfoRandom(biConn, bi, numProbes);
if (DEBUG)
uglyTime("Load Random Probe Info");
bi->loadData(biConn, bi, FALSE);
if (DEBUG)
uglyTime("Load Probe");
hFreeConn(&biConn);
return 0;
}
struct slName *biOmicsGetSamples(struct biOmics *bi)
{
struct slName *slList = NULL;
struct samples *sa;
for (sa = bi->samples; sa; sa = sa->next)
slNameAddHead(&slList, sa->name);
slReverse(&slList);
return slList;
}
struct slName *biOmicsGetProbes(struct biOmics *bi)
{
struct slName *slList = NULL;
struct probeInfo *pi;
for (pi = bi->probeInfo; pi; pi = pi->next)
slNameAddHead(&slList, pi->name);
slReverse(&slList);
return slList;
}
void populateAliases(struct biOmics *bi)
{
struct sqlConnection *biConn = hAllocConnProfile(heatMapDbProfile, bi->db);
struct datasets *da = bi->dataset;
fprintf(stderr, "loading gene aliases...\n");
if (DEBUG)
uglyTime(NULL);
char query[1024];
safef(query, sizeof(query),
"select DISTINCT kgXref.geneSymbol, %s.name from %s "
"join %s on %s.id = %s.probe_id "
- "join geneLookup on geneLookup.id = %s.gene_id "
- "join kgXref on kgXref.kgId = geneLookup.kgId;",
- da->probe_table, da->probe_table, da->probe_to_gene_table,
- da->probe_table, da->probe_to_gene_table, da->probe_to_gene_table);
+ "join %s on %s.id = %s.gene_id "
+ "join kgXref on kgXref.kgId = %s.kgId;",
+ da->probe_table, da->probe_table,
+ da->probe_to_gene_table, da->probe_table, da->probe_to_gene_table,
+ GL_TABLE, GL_TABLE, da->probe_to_gene_table,
+ GL_TABLE);
struct sqlResult *sr = sqlGetResult(biConn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
char *gene = row[0];
char *probe = cloneString(row[1]);
hashAdd(bi->geneAliases, gene, probe);
}
sqlFreeResult(&sr);
if (DEBUG)
uglyTime("loaded gene aliases");
hFreeConn(&biConn);
}
struct slName *biOmicsGetProbesForGene(struct biOmics *bi, char *gene)
{
if (hashNumEntries(bi->geneAliases) == 0)
populateAliases(bi);
struct hashEl *el;
struct slName *slList = NULL;
for(el = hashLookup(bi->geneAliases, gene); el != NULL; el = hashLookupNext(el))
slNameAddHead(&slList, (char *) el->val);
return slList;
}
struct biOmics *newBiOmics(char *db, char *dataset)
{
struct biOmics *bi;
AllocVar(bi);
bi->db = cloneString(db);
bi->name = cloneString(dataset);
bi->sampleIndices = hashNew(0);
bi->geneAliases = hashNew(0);
bi->dataset = NULL;
bi->samples = NULL;
bi->samplesHash = hashNew(0);
bi->probeInfo = NULL;
bi->probeInfoHash = hashNew(0);
bi->probes = NULL;
bi->probesHash = hashNew(0);
/* Methods */
bi->populate = biOmicsPopulate;
bi->populateAll = biOmicsPopulateAll;
bi->populateRandom = biOmicsPopulateRandom;
bi->allProbes = biOmicsGetProbes;
bi->allSamples = biOmicsGetSamples;
bi->probesForGene = biOmicsGetProbesForGene;
/* These are set according to dataType */
bi->loadData = NULL;
bi->freeData = NULL;
bi->dataForProbe = NULL;
bi->dataForSample = NULL;
bi->dataForProbeSample = NULL;
bi->toZscores = NULL;
return bi;
}
void slPairSampleValFree(struct slPair **pEl)
{
struct slPair *el;
if ((el = *pEl) == NULL) return;
freeMem(el->name);
struct probeSampleVal *pv = el->val;
probeSampleValFree(&pv);
freez(pEl);
}
void slPairSampleValFreeList(struct slPair **pList)
{
struct slPair *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
slPairSampleValFree(&el);
}
*pList = NULL;
}
void slPairValsFree(struct slPair **pEl)
{
struct slPair *el;
if ((el = *pEl) == NULL) return;
freeMem(el->name);
struct probeVals *pv = el->val;
probeValsFreeList(&pv);
freez(pEl);
}
void slPairValsFreeList(struct slPair **pList)
{
struct slPair *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
slPairValsFree(&el);
}
*pList = NULL;
}
void biOmicsFree(struct biOmics **pEl)
{
struct biOmics *el;
if ((el = *pEl) == NULL) return;
freeMem(el->db);
freeMem(el->name);
freeHash(&el->sampleIndices);
freeHash(&el->geneAliases);
datasetsFreeList(&el->dataset);
samplesFreeList(&el->samples);
freeHash(&el->samplesHash);
probeInfoFreeList(&el->probeInfo);
freeHash(&el->probeInfoHash);
el->freeData(&el->probes);
freeHash(&el->probesHash);
*pEl = NULL;
}
void biOmicsFreeList(struct biOmics **pList)
{
struct biOmics *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
biOmicsFree(&el);
}
*pList = NULL;
}
/*** biResults code ***/
struct slName *biResultsGetDatasets(struct biResults *br)
{
struct slName *slList = NULL;
struct biOmics *bi;
for (bi = br->datasets; bi; bi = bi->next)
slNameAddHead(&slList, bi->name);
return slList;
}
struct slName *biResultsGetProbesInDataset(struct biResults *br, char *dataset)
{
struct biOmics *bi = biOmicsMatchDataset(br->datasets, dataset);
if (!bi)
return NULL;
return bi->allProbes(bi);
}
struct slName *biResultsGetProbes(struct biResults *br)
{
struct slName *slList = NULL;
struct biOmics *bi;
for (bi = br->datasets; bi; bi = bi->next)
{
struct slName *probes = bi->allProbes(bi);
slList = slCat(slList, probes);
}
return slNameUniqueList(slList);
}
struct slName *biResultsGetSamplesInDataset(struct biResults *br, char *dataset)
{
struct biOmics *bi = biOmicsMatchDataset(br->datasets, dataset);
if (!bi)
return NULL;
return bi->allSamples(bi);
}
struct slName *biResultsGetSamples(struct biResults *br)
{
struct slName *slList = NULL;
struct biOmics *bi;
for (bi = br->datasets; bi; bi = bi->next)
{
struct slName *samples = bi->allSamples(bi);
slList = slCat(slList, samples);
}
return slNameUniqueList(slList);
}
struct slName *getMatching(struct slName *list1, struct slName *list2)
{
if (!list1 || !list2)
return NULL;
fprintf(stderr, "matching: count(list1) = %d, count(list2) = %d\n",
slCount(list1), slCount(list2));
struct slName *sl1, *sl2, *matched = NULL;
for (sl1 = list1; sl1; sl1 = sl1->next)
{
for (sl2 = list2; sl2; sl2 = sl2->next)
{
if (sameString(sl1->name, sl2->name))
{
if (!slNameInList(matched, sl1->name))
slNameAddHead(&matched, sl1->name);
}
}
}
return matched;
}
struct slName *biResultsGetSamplesInCommon(struct biResults *br)
{
struct biOmics *bi = br->datasets;
/* Start off the list */
struct slName *prevMatched, *matched = bi->allSamples(bi);
bi = bi->next;
for ( ; bi; bi = bi->next)
{
prevMatched = matched;
struct slName *samples = bi->allSamples(bi);
matched = getMatching(prevMatched, samples);
slNameFreeList(&samples);
slNameFreeList(&prevMatched);
}
return matched;
}
struct slName *biResultsProbesForGeneInDataset(struct biResults *br, char *gene, char *dataset)
{
struct biOmics *bi = biOmicsMatchDataset(br->datasets, dataset);
if (!bi)
return NULL;
return bi->probesForGene(bi, gene);
}
struct slName *biResultsProbesForGene(struct biResults *br, char *gene)
{
struct slName *slList = NULL;
struct biOmics *bi;
for (bi = br->datasets; bi; bi = bi->next)
{
struct slName *probes = bi->probesForGene(bi, gene);
slList = slCat(slList, probes);
}
return slNameUniqueList(slList);
}
struct biData *biResultsDataForProbeInDataset(struct biResults *br, char *probe, char *dataset)
{
struct biOmics *bi = biOmicsMatchDataset(br->datasets, dataset);
if (!bi)
return NULL;
return bi->dataForProbe(bi, probe);
}
struct biData *biResultsDataForProbe(struct biResults *br, char *probe)
{
struct biData *bd,*bdList = NULL;
struct biOmics *bi;
for (bi = br->datasets; bi; bi = bi->next)
{
bd = bi->dataForProbe(bi, probe);
biDataAppendName(bd, bi->name);
slAddHead(&bdList, bd);
}
return bdList;
}
struct biData *biResultsDataForSampleInDataset(struct biResults *br, char *sample,
char *dataset)
{
struct biOmics *bi = biOmicsMatchDataset(br->datasets, dataset);
if (!bi)
return NULL;
return bi->dataForSample(bi, sample);
}
struct biData *biResultsDataForSample(struct biResults *br, char *sample)
{
struct biData *bd, *bdList = NULL;
struct biOmics *bi;
for (bi = br->datasets; bi; bi = bi->next)
{
bd = bi->dataForSample(bi, sample);
bd->type = cloneString(bi->type);
biDataAppendName(bd, bi->name);
slAddHead(&bdList, bd);
}
return bdList;
}
double biResultsDataForProbeSampleInDataset(struct biResults *br,
char *probe, char *sample, char *dataset)
{
struct biOmics *bi = biOmicsMatchDataset(br->datasets, dataset);
if (!bi)
return DBL_NULL;
return bi->dataForProbeSample(bi, probe, sample);
}
struct biData *biResultsDataForProbeSample(struct biResults *br, char *probe, char *sample)
{
char name[128];
safef(name, sizeof(name), "%s-%s", probe, sample);
struct slDouble *sd;
struct biData *bd = biDataNew(name);
struct biOmics *bi;
for (bi = br->datasets; bi; bi = bi->next)
{
double val = bi->dataForProbeSample(bi, probe, sample);
sd = slDoubleNew(val);
slAddHead(&bd->data, sd);
hashAdd(bd->hash, bi->name, sd);
}
slReverse(&bd->data);
return bd;
}
void biResultsConvertToZscores(struct biResults *br)
{
struct biOmics *bi;
for (bi = br->datasets; bi; bi = bi->next)
bi->toZscores(bi);
}
void biResultsConvertToLogP(struct biResults *br)
{
struct biOmics *bi;
for (bi = br->datasets; bi; bi = bi->next)
bi->toLogP(bi);
}
struct biResults *biResultsNew(void)
{
struct biResults *br;
AllocVar(br);
br->datasets = NULL;
/* Methods */
br->allDatasets = biResultsGetDatasets;
br->allProbes = biResultsGetProbes;
br->allProbesInDataset = biResultsGetProbesInDataset;
br->allSamples = biResultsGetSamples;
br->allSamplesInCommon = biResultsGetSamplesInCommon;
br->allSamplesInDataset = biResultsGetSamplesInDataset;
br->probesForGene = biResultsProbesForGene;
br->probesForGeneInDataset = biResultsProbesForGeneInDataset;
br->dataForProbe = biResultsDataForProbe;
br->dataForProbeInDataset = biResultsDataForProbeInDataset;
br->dataForSample = biResultsDataForSample;
br->dataForSampleInDataset = biResultsDataForSampleInDataset;
br->dataForProbeSample = biResultsDataForProbeSample;
br->dataForProbeSampleInDataset = biResultsDataForProbeSampleInDataset;
br->toZscores = biResultsConvertToZscores;
br->toLogP = biResultsConvertToLogP;
return br;
}
void biResultsFree(struct biResults **pEl)
{
struct biResults *el;
if ((el = *pEl) == NULL) return;
biOmicsFreeList(&el->datasets);
}
void biResultsAddBiQuery(struct biResults *br, struct biQuery *bq)
{
struct biOmics *bi = newBiOmics(bq->db, bq->dataset);
if (bq->getAllProbes)
bi->populateAll(bi, bq);
else
bi->populate(bi, bq);
slAddHead(&br->datasets, bi);
}
struct biResults *biQueryResults(struct biQuery *bqList)
{
struct biResults *br = biResultsNew();
struct biQuery *bq;
for (bq = bqList; bq; bq = bq->next)
biResultsAddBiQuery(br, bq);
return br;
}
void biResultsAddBiQueryRandom(struct biResults *br, struct biQuery *bq, int numProbes)
{
struct biOmics *bi = newBiOmics(bq->db, bq->dataset);
bi->populateRandom(bi, bq, numProbes);
slAddHead(&br->datasets, bi);
}
struct biResults *biQueryResultsRandomize(struct biQuery *bqList, int numProbes)
{
struct biResults *br = biResultsNew();
struct biQuery *bq;
for (bq = bqList; bq; bq = bq->next)
biResultsAddBiQueryRandom(br, bq, numProbes);
return br;
}
/*** End biResults code ***/
/*** biQuery code ****/
void biQueryAppend(struct biQuery **bqList, struct biQuery *bq)
{
if (!bqList)
return;
slAddHead(bqList, bq);
}
int biQueryAddPathways(struct biQuery *bq, char *pathways, char sep)
{
return addDelimStringToList(&bq->pathwayList, pathways, sep);
}
int biQueryAddProbes(struct biQuery *bq, char *probes, char sep)
{
return addDelimStringToList(&bq->probeList, probes, sep);
}
int biQueryAddGenes(struct biQuery *bq, char *genes, char sep)
{
return addDelimStringToList(&bq->geneList, genes, sep);
}
int biQueryAddSamples(struct biQuery *bq, char *samples, char sep)
{
return addDelimStringToList(&bq->sampleList, samples, sep);
}
int biQueryAddFeatureVals(struct biQuery *bq, char *featureVals, char sep)
{
return addDelimStringToList(&bq->featureValList, featureVals, sep);
}
int biQueryAddFeatureCodes(struct biQuery *bq, char *featureCodes, char sep)
{
return addDelimStringToList(&bq->featureCodeList, featureCodes, sep);
}
struct biQuery *biQueryNew(char *db, char *dataset)
{
struct biQuery *bq;
AllocVar(bq);
bq->db = cloneString(db);
bq->dataset = cloneString(dataset);
bq->getAllProbes = FALSE;
bq->pathwayList = NULL;
bq->probeList = NULL;
bq->geneList = NULL;
bq->sampleList = NULL;
bq->featureValList = NULL;
bq->featureCodeList = NULL;
bq->addPathways = biQueryAddPathways;
bq->addProbes = biQueryAddProbes;
bq->addSamples = biQueryAddSamples;
bq->addGenes = biQueryAddGenes;
bq->addFeatureVals = biQueryAddFeatureVals;
bq->addFeatureCodes = biQueryAddFeatureCodes;
return bq;
}
void biQueryFree(struct biQuery **pEl)
{
struct biQuery *el;
if ((el = *pEl) == NULL) return;
freeMem(el->db);
freeMem(el->dataset);
slNameFreeList(&el->pathwayList);
slNameFreeList(&el->probeList);
slNameFreeList(&el->geneList);
slNameFreeList(&el->sampleList);
slNameFreeList(&el->featureValList);
slNameFreeList(&el->featureCodeList);
}
void biQueryFreeList(struct biQuery **pList)
{
struct biQuery *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
biQueryFree(&el);
}
*pList = NULL;
}
void biDataAppendName(struct biData *bd, char *name)
{
if (!bd->name)
{
bd->name = cloneString(name);
return;
}
char newName[128];
safef(newName, sizeof(newName), "%s,%s", bd->name, name);
freeMem(bd->name);
bd->name = cloneString(newName);
}
struct biData *biDataFind(struct biData *bdList, char *name)
{
struct biData *bd;
for (bd = bdList; bd; bd = bd->next)
{
if (!sameString(bd->name, name))
continue;
return bd;
}
return NULL;
}
struct biData *biDataNew(char *name)
{
struct biData *bd;
AllocVar(bd);
bd->name = NULL;
if (name)
bd->name = cloneString(name);
bd->type = NULL;
bd->data = NULL;
bd->hash = hashNew(0);
return bd;
}
void biDataFree(struct biData **pEl)
{
struct biData *el;
if ((el = *pEl) == NULL) return;
freeMem(el->name);
freeMem(el->type);
slFreeList(&el->data);
freeHash(&el->hash);
}
void biDataFreeList(struct biData **pList)
{
struct biData *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
biDataFree(&el);
}
*pList = NULL;
}
/**** End biQuery Code *****/