src/hg/instinct/bioInt2/grabData.c 1.3
1.3 2009/04/07 03:41:09 jsanborn
updated
Index: src/hg/instinct/bioInt2/grabData.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/grabData.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/instinct/bioInt2/grabData.c 6 Apr 2009 18:51:43 -0000 1.2
+++ src/hg/instinct/bioInt2/grabData.c 7 Apr 2009 03:41:09 -0000 1.3
@@ -1,482 +1,484 @@
/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "hPrint.h"
#include "hdb.h"
#include "dystring.h"
#include "bioIntDb.h"
#include "bioIntDriver.h"
void usage()
/* Explain usage and exit. */
{
errAbort(
"grabData \n"
"usage:\n"
" grabData [options] table\n"
" -clinical get clinical data corresponding to table\n"
" -transpose samples as rows, features as columns\n"
);
}
char db[128] = "bioInt";
char localDb[128] = "localDb";
boolean getClinical = FALSE;
boolean transpose = FALSE;
static struct optionSpec options[] = {
{"clinical", OPTION_BOOLEAN},
{"transpose", OPTION_BOOLEAN},
{NULL, 0}
};
void spHashListTranspose(struct slPair *spList, struct slName *slList, struct slPair **retSpList, struct slName **retSlList)
{
// Transpose data in spList. So, sp->name = sample name, sp->hash keyed by feature name
struct slName *sl, *outSlList = NULL;
struct slPair *sp, *outSp, *outSpList = NULL;
struct hash *featureHash = hashNew(0);
struct hash *sampleHash = hashNew(0);
for (sl = slList; sl; sl = sl->next)
{
struct hash *dataHash;
struct hashEl *el = hashLookup(sampleHash, sl->name);
if (!el)
{
AllocVar(outSp);
outSp->name = cloneString(sl->name);
dataHash = hashNew(0);
outSp->val = dataHash;
slAddHead(&outSpList, outSp);
hashAdd(sampleHash, sl->name, outSp);
}
else
outSp = el->val;
for (sp = spList; sp; sp = sp->next)
{
el = hashLookup(featureHash, sp->name);
if (!el)
{
slNameAddHead(&outSlList, sp->name);
hashAddInt(featureHash, sp->name, 1);
}
struct hash *hash = sp->val;
el = hashLookup(hash, sl->name);
if (el)
{
struct slDouble *sd = el->val;
hash = outSp->val;
hashAdd(hash, sp->name, sd);
}
}
}
*retSpList = outSpList;
*retSlList = outSlList;
}
void spHashListPrint(struct slPair *inSpList, struct slName *inSlList, char *tableName)
{
struct slPair *spList = NULL;
struct slName *slList = NULL;
+
+char filename[256];
if (transpose)
{
+ safef(filename, sizeof(filename), "%s_data_transpose.tab", tableName);
fprintf(stdout, "transposing hash\n");
spHashListTranspose(inSpList, inSlList, &spList, &slList);
fprintf(stdout, "finished transposing\n");
}
else
{
+ safef(filename, sizeof(filename), "%s_data.tab", tableName);
spList = inSpList;
slList = inSlList;
}
-char filename[256];
-safef(filename, sizeof(filename), "%s_data.tab", tableName);
FILE *f = mustOpen(filename, "w");
struct slName *sl;
struct slPair *sp;
// Print header
fprintf(f, "feature_name\t");
for (sl = slList; sl; sl = sl->next)
{
fprintf(f, "%s", sl->name);
if (sl->next)
fprintf(f, ",");
}
fprintf(f, "\n");
struct hashEl *el;
for (sp = spList; sp; sp = sp->next)
{
fprintf(f, "%s\t", sp->name);
struct hash *hash = sp->val;
for (sl = slList; sl; sl = sl->next)
{
el = hashLookup(hash, sl->name);
if (el)
{
struct slDouble *sd = el->val;
fprintf(f, "%f", sd->val);
}
if (sl->next)
fprintf(f, ",");
}
fprintf(f, "\n");
}
}
struct hash *createHash(struct sqlConnection *biConn,
char *table, char *key_field, char *val_field)
{
struct hash *hash = hashNew(0);
char query[128];
safef(query, sizeof(query), "select %s, %s from %s", key_field, val_field, table);
struct sqlResult *sr = sqlGetResult(biConn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
char *id = row[0];
char *name = cloneString(row[1]);
hashAdd(hash, id, name);
}
return hash;
}
struct datasets *findDataset(struct sqlConnection *conn, char *data_table)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where data_table = \"%s\" ",
DA_TABLE, data_table);
return datasetsLoadByQuery(conn, query);
}
struct samples *getSamples(struct sqlConnection *conn, struct datasets *da)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where dataset_id = %d order by exp_id",
SA_TABLE, da->id);
return samplesLoadByQuery(conn, query);
}
void grabClinicalData(struct sqlConnection *conn, char *tableName)
{
struct datasets *da = findDataset(conn, tableName);
if (!da)
errAbort("No dataset by name of %s in db\n", tableName);
struct samples *sa, *saList = getSamples(conn, da);
struct dyString *dy = newDyString(100);
dyStringPrintf(dy,
"select %s.name, %s.name, %s.val from %s join %s on %s.sample_id = %s.id "
"join %s on %s.feature_id = %s.id "
"where %s.id in (",
SA_TABLE, FE_TABLE, CD_TABLE,
CD_TABLE, SA_TABLE, CD_TABLE, SA_TABLE,
FE_TABLE, CD_TABLE, FE_TABLE, SA_TABLE);
for (sa = saList; sa; sa = sa->next)
{
dyStringPrintf(dy, "%d", sa->id);
if (sa->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ")");
char *query = dyStringCannibalize(&dy);
struct slPair *sp, *spList = NULL;
struct slName *slList = NULL;
struct hash *samplesHash = hashNew(0);
struct hash *featureData = hashNew(0);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
char *sample = row[0];
char *feature = row[1];
double val = atof(row[2]);
struct hash *sampleData;
struct hashEl *el = hashLookup(featureData, feature);
if (!el)
{
sampleData = hashNew(0);
hashAdd(featureData, feature, sampleData);
AllocVar(sp);
sp->name = cloneString(feature);
sp->val = sampleData;
slAddHead(&spList, sp);
}
else
sampleData = el->val;
if (!hashLookup(samplesHash, sample))
{
slNameAddHead(&slList, sample);
hashAddInt(samplesHash, sample, 1);
}
struct slDouble *sd = slDoubleNew(val);
hashAdd(sampleData, sample, sd);
}
sqlFreeResult(&sr);
hashFree(&featureData);
slReverse(&spList);
slReverse(&slList);
char clinTableName[512];
safef(clinTableName, sizeof(clinTableName),
"%s_clinical",
tableName);
spHashListPrint(spList, slList, clinTableName);
}
struct hash *probeIdToGene(struct sqlConnection *conn, struct datasets *da)
{
char query[512];
safef(query, sizeof(query),
"select DISTINCT %s.id,geneSymbol from %s join %s on id=probe_id "
"join %s on gene_id=%s.id join %s on %s.kgId=%s.kgId;",
da->probe_table, da->probe_table, da->probe_to_gene_table, GL_TABLE,
GL_TABLE, KX_TABLE, GL_TABLE, KX_TABLE);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
struct hash *hash = hashNew(0);
while ((row = sqlNextRow(sr)) != NULL)
{
char *id = row[0];
char *gene = cloneString(row[1]);
hashAdd(hash, id, gene);
}
sqlFreeResult(&sr);
return hash;
}
void medianSpHashList(struct slPair *spList, struct slName *slList)
{
struct hashEl *el;
struct slPair *sp;
struct slName *sl;
for (sp = spList; sp; sp = sp->next)
{
struct hash *hash = sp->val;
for (sl = slList; sl; sl = sl->next)
{
el = hashLookup(hash, sl->name);
struct slDouble *sd, *sdList = el->val;
double val = slDoubleMedian(sdList);
sd = slDoubleNew(val);
hashRemove(hash, sl->name);
slFreeList(&sdList);
hashAdd(hash, sl->name, sd);
}
}
}
void grabRawData(struct sqlConnection *conn, char *tableName)
{
struct datasets *da = findDataset(conn, tableName);
if (!da)
errAbort("No dataset by name of %s in db\n", tableName);
struct samples *sa, *saList = getSamples(conn, da);
struct hash *idGeneHash = probeIdToGene(conn, da);
char query[256];
safef(query, sizeof(query), "select * from %s", tableName);
struct hash *featureData = hashNew(0);
struct slPair *sp, *spList = NULL;
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
char *probe_id = row[0];
struct probeVals *pv = probeValsLoad(row);
struct hashEl *el = hashLookup(idGeneHash, probe_id);
if (!el)
continue;
char *gene = el->val;
struct hash *sampleData;
el = hashLookup(featureData, gene);
if (!el)
{
sampleData = hashNew(0);
hashAdd(featureData, gene, sampleData);
AllocVar(sp);
sp->name = cloneString(gene);
sp->val = sampleData;
slAddHead(&spList, sp);
}
else
sampleData = el->val;
for (sa = saList; sa; sa = sa->next)
{
double val = pv->sample_data[sa->exp_id];
struct slDouble *sd = slDoubleNew(val);
el = hashLookup(sampleData, sa->name);
if (!el)
hashAdd(sampleData, sa->name, sd);
else
{
struct slDouble *sdList = el->val;
slAddTail(&sdList, sd);
}
}
}
sqlFreeResult(&sr);
hashFree(&featureData);
struct slName *slList = NULL;
for (sa = saList; sa; sa = sa->next)
slNameAddHead(&slList, sa->name);
slReverse(&spList);
slReverse(&slList);
medianSpHashList(spList, slList);
spHashListPrint(spList, slList, tableName);
}
void grabAnalysisData(struct sqlConnection *conn, char *tableName)
{
struct hash *sampleIdHash = createHash(conn, SA_TABLE, "id", "name");
struct hash *featureIdHash = createHash(conn, AF_TABLE, "id", "feature_name");
char query[256];
safef(query, sizeof(query), "select * from %s", tableName);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
struct hash *samplesHash = hashNew(0);
struct slName *slList = NULL;
struct hash *featureData = hashNew(0);
struct slPair *sp, *spList = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
char *sample_id = row[0];
char *feature_id = row[1];
double val = atof(row[2]);
struct hash *sampleData;
struct hashEl *el = hashLookup(featureData, feature_id);
if (!el)
{
sampleData = hashNew(0);
hashAdd(featureData, feature_id, sampleData);
el = hashLookup(featureIdHash, feature_id);
if (el)
{
char *name = el->val;
AllocVar(sp);
sp->name = cloneString(name);
sp->val = sampleData;
slAddHead(&spList, sp);
}
}
else
sampleData = el->val;
el = hashLookup(sampleIdHash, sample_id);
char *name = el->val;
if (!hashLookup(samplesHash, name))
{
slNameAddHead(&slList, name);
hashAddInt(samplesHash, name, 1);
}
struct slDouble *sd = slDoubleNew(val);
hashAdd(sampleData, name, sd);
}
sqlFreeResult(&sr);
hashFree(&featureData);
hashFree(&samplesHash);
slReverse(&spList);
slReverse(&slList);
spHashListPrint(spList, slList, tableName);
}
boolean isAnalysisTable(struct sqlConnection *conn, char *tableName)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where result_table = \"%s\" ",
AN_TABLE, tableName);
return sqlExists(conn, query);
}
void grabData(char *tableName)
{
struct sqlConnection *conn = hAllocConnProfile(localDb, db);
if (!sqlTableExists(conn, tableName))
errAbort("%s doesn't exist in %s db.\n", tableName, db);
if (isAnalysisTable(conn, tableName))
grabAnalysisData(conn, tableName);
else
{
if (getClinical)
grabClinicalData(conn, tableName);
else
grabRawData(conn, tableName);
}
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 2)
usage();
if (optionExists("clinical"))
getClinical = TRUE;
if (optionExists("transpose"))
transpose = TRUE;
grabData(argv[1]);
return 0;
}