src/hg/instinct/bioInt2/grabData.c 1.1
1.1 2009/04/05 21:08:07 jsanborn
added files, fixed cohorts API
Index: src/hg/instinct/bioInt2/grabData.c
===================================================================
RCS file: src/hg/instinct/bioInt2/grabData.c
diff -N src/hg/instinct/bioInt2/grabData.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/instinct/bioInt2/grabData.c 5 Apr 2009 21:08:07 -0000 1.1
@@ -0,0 +1,412 @@
+/* grabData - Dump a data table (raw probe values, clinical values, or analysis results) to a *_data.tab text file. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "hPrint.h"
+#include "hdb.h"
+#include "dystring.h"
+#include "bioIntDb.h"
+#include "bioIntDriver.h"
+
+void usage()
+/* Explain usage, including the -clinical option, and exit via errAbort(). */
+{
+errAbort(
+  "grabData - Write the contents of a data table to <table>_data.tab\n"
+  "usage:\n"
+  "   grabData table\n"
+  "options:\n"
+  "   -clinical   For a dataset table, write its clinical data to\n"
+  "               <table>_clinical_data.tab instead of the raw data\n"
+  "               (has no effect on analysis result tables)\n"
+  );
+}
+
+/* Database name: passed as the second argument to hAllocConnProfile() in
+ * grabData() and reported in its "doesn't exist" error message. */
+char db[128] = "bioInt";
+/* Passed as the first argument to hAllocConnProfile() in grabData()
+ * (presumably the connection profile name -- TODO confirm against hdb.h). */
+char localDb[128] = "localDb";
+
+/* Set to TRUE by main() when the "clinical" option is present; grabData()
+ * then calls grabClinicalData() instead of grabRawData() for tables that
+ * are not analysis tables. */
+boolean getClinical = FALSE;
+
+/* Command line option table handed to optionInit() in main(). */
+static struct optionSpec options[] = {
+ {"clinical", OPTION_BOOLEAN},
+ {NULL, 0}
+};
+
+void spHashListPrint(struct slPair *spList, struct slName *slList, char *tableName)
+/* Write a feature-by-sample matrix to the file "<tableName>_data.tab".
+ *   spList    - one pair per feature: name is the feature name, val is a
+ *               struct hash keyed by sample name whose values are
+ *               struct slDouble.
+ *   slList    - sample names, in output column order.
+ *   tableName - prefix of the output file name.
+ * The first line is "feature_name", a tab, then the comma-separated sample
+ * names.  Every following line is a feature name, a tab, then one
+ * comma-separated value per sample; a sample with no entry in the feature's
+ * hash produces an empty field. */
+{
+char filename[256];
+safef(filename, sizeof(filename), "%s_data.tab", tableName);
+FILE *f = mustOpen(filename, "w");
+
+struct slName *sl;
+struct slPair *sp;
+
+/* Header row. */
+fprintf(f, "feature_name\t");
+for (sl = slList; sl; sl = sl->next)
+    {
+    fprintf(f, "%s", sl->name);
+    if (sl->next)
+        fprintf(f, ",");
+    }
+fprintf(f, "\n");
+
+/* One row per feature. */
+for (sp = spList; sp; sp = sp->next)
+    {
+    struct hash *hash = sp->val;
+    fprintf(f, "%s\t", sp->name);
+    for (sl = slList; sl; sl = sl->next)
+        {
+        struct hashEl *el = hashLookup(hash, sl->name);
+        if (el)
+            {
+            struct slDouble *sd = el->val;
+            fprintf(f, "%f", sd->val);
+            }
+        if (sl->next)
+            fprintf(f, ",");
+        }
+    fprintf(f, "\n");
+    }
+
+/* Close explicitly: the original left the stream open, leaking the handle
+ * and depending on process exit to flush the buffered output. */
+carefulClose(&f);
+}
+
+struct hash *createHash(struct sqlConnection *biConn,
+                        char *table, char *key_field, char *val_field)
+/* Build a hash from two columns of a table.
+ *   biConn    - open connection the query is run on.
+ *   table     - table to select from.
+ *   key_field - column whose (text) value becomes the hash key.
+ *   val_field - column whose value is cloned and stored as the hash value.
+ * Returns a newly allocated hash; the stored values are cloneString() copies. */
+{
+struct hash *hash = hashNew(0);
+char query[256];   /* same size as the other query buffers in this file */
+safef(query, sizeof(query), "select %s, %s from %s", key_field, val_field, table);
+
+struct sqlResult *sr = sqlGetResult(biConn, query);
+char **row = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *id = row[0];
+    char *name = cloneString(row[1]);
+    hashAdd(hash, id, name);
+    }
+
+/* Release the result set, as every other query loop in this file does;
+ * the original returned without freeing it. */
+sqlFreeResult(&sr);
+
+return hash;
+}
+
+struct datasets *findDataset(struct sqlConnection *conn, char *data_table)
+/* Select the rows of DA_TABLE whose data_table column equals data_table and
+ * return whatever datasetsLoadByQuery() loads for that query.  Callers in
+ * this file treat a NULL return as "no such dataset". */
+{
+char sql[256];
+struct datasets *found;
+
+safef(sql, sizeof(sql),
+      "select * from %s where data_table = \"%s\" ",
+      DA_TABLE, data_table);
+found = datasetsLoadByQuery(conn, sql);
+return found;
+}
+
+struct samples *getSamples(struct sqlConnection *conn, struct datasets *da)
+/* Select the rows of SA_TABLE whose dataset_id equals da->id, ordered by
+ * exp_id, and return whatever samplesLoadByQuery() loads for that query. */
+{
+char sql[256];
+struct samples *loaded;
+
+safef(sql, sizeof(sql),
+      "select * from %s where dataset_id = %d order by exp_id",
+      SA_TABLE, da->id);
+loaded = samplesLoadByQuery(conn, sql);
+return loaded;
+}
+
+
+
+void grabClinicalData(struct sqlConnection *conn, char *tableName)
+/* Write the clinical values of every sample of the dataset whose data table
+ * is tableName to "<tableName>_clinical_data.tab", with features as rows and
+ * samples as columns (see spHashListPrint()).
+ *   conn      - open database connection.
+ *   tableName - data table name identifying the dataset.
+ * Aborts if no such dataset exists or if the dataset has no samples. */
+{
+struct datasets *da = findDataset(conn, tableName);
+if (!da)
+    errAbort("No dataset by name of %s in db\n", tableName);
+
+struct samples *sa, *saList = getSamples(conn, da);
+/* With no samples the id list below would be empty and the query would end
+ * in the invalid SQL "in ()"; report the real problem instead. */
+if (!saList)
+    errAbort("No samples found for dataset %s\n", tableName);
+
+/* Join clinical data to samples and features, restricted to this dataset's
+ * sample ids; result columns are sample name, feature name, value. */
+struct dyString *dy = newDyString(100);
+dyStringPrintf(dy,
+	       "select %s.name, %s.name, %s.val from %s join %s on %s.sample_id = %s.id "
+	       "join %s on %s.feature_id = %s.id "
+	       "where %s.id in (",
+	       SA_TABLE, FE_TABLE, CD_TABLE,
+	       CD_TABLE, SA_TABLE, CD_TABLE, SA_TABLE,
+	       FE_TABLE, CD_TABLE, FE_TABLE, SA_TABLE);
+
+for (sa = saList; sa; sa = sa->next)
+    {
+    dyStringPrintf(dy, "%d", sa->id);
+    if (sa->next)
+        dyStringPrintf(dy, ",");
+    }
+dyStringPrintf(dy, ")");
+char *query = dyStringCannibalize(&dy);
+
+struct slPair *sp, *spList = NULL;      /* feature name -> per-sample hash, in first-seen order */
+struct slName *slList = NULL;           /* sample names, in first-seen order */
+struct hash *samplesHash = hashNew(0);  /* sample names already on slList */
+struct hash *featureData = hashNew(0);  /* feature name -> per-sample hash */
+
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = NULL;
+
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *sample = row[0];
+    char *feature = row[1];
+    double val = atof(row[2]);
+
+    struct hash *sampleData;
+    struct hashEl *el = hashLookup(featureData, feature);
+    if (!el)
+        {
+        /* First value for this feature: start its per-sample hash. */
+        sampleData = hashNew(0);
+        hashAdd(featureData, feature, sampleData);
+        AllocVar(sp);
+        sp->name = cloneString(feature);
+        sp->val = sampleData;
+        slAddHead(&spList, sp);
+        }
+    else
+        sampleData = el->val;
+
+    if (!hashLookup(samplesHash, sample))
+        {
+        slNameAddHead(&slList, sample);
+        hashAddInt(samplesHash, sample, 1);
+        }
+    struct slDouble *sd = slDoubleNew(val);
+    hashAdd(sampleData, sample, sd);
+    }
+
+sqlFreeResult(&sr);
+freeMem(query);             /* string taken over from the dyString above */
+hashFree(&featureData);     /* the per-sample hashes stay reachable through spList */
+hashFree(&samplesHash);     /* only needed for de-duplication while reading */
+
+/* Both lists were built with add-to-head; restore first-seen order. */
+slReverse(&spList);
+slReverse(&slList);
+
+char clinTableName[512];
+safef(clinTableName, sizeof(clinTableName),
+      "%s_clinical",
+      tableName);
+
+spHashListPrint(spList, slList, clinTableName);
+}
+
+
+struct hash *probeIdToGene(struct sqlConnection *conn, struct datasets *da)
+/* Return a hash whose keys are the first result column (probe id) and whose
+ * values are cloned copies of the second (geneSymbol).  The query joins the
+ * dataset's probe table with its probe-to-gene table, GL_TABLE and KX_TABLE. */
+{
+struct hash *idToGene = hashNew(0);
+char sql[512];
+struct sqlResult *result;
+char **fields;
+
+safef(sql, sizeof(sql),
+      "select DISTINCT %s.id,geneSymbol from %s join %s on id=probe_id "
+      "join %s on gene_id=%s.id join %s on %s.kgId=%s.kgId;",
+      da->probe_table, da->probe_table, da->probe_to_gene_table, GL_TABLE,
+      GL_TABLE, KX_TABLE, GL_TABLE, KX_TABLE);
+
+result = sqlGetResult(conn, sql);
+for (fields = sqlNextRow(result); fields != NULL; fields = sqlNextRow(result))
+    {
+    char *geneSymbol = cloneString(fields[1]);
+    hashAdd(idToGene, fields[0], geneSymbol);
+    }
+sqlFreeResult(&result);
+
+return idToGene;
+}
+
+void medianSpHashList(struct slPair *spList, struct slName *slList)
+/* Collapse every per-sample list of values to its median, in place.
+ *   spList - pairs whose val is a struct hash keyed by sample name; each hash
+ *            value is a list of struct slDouble.
+ *   slList - names of the samples to process in each hash.
+ * Afterwards each processed hash value is a single slDouble holding the
+ * median of the list it replaced.  A sample with no entry in a given hash is
+ * skipped. */
+{
+struct slPair *sp;
+struct slName *sl;
+for (sp = spList; sp; sp = sp->next)
+    {
+    struct hash *hash = sp->val;
+    for (sl = slList; sl; sl = sl->next)
+        {
+        struct hashEl *el = hashLookup(hash, sl->name);
+        /* The original dereferenced el unconditionally; guard against a
+         * sample that has no values for this feature. */
+        if (!el)
+            continue;
+
+        struct slDouble *sdList = el->val;
+        double val = slDoubleMedian(sdList);
+        slFreeList(&sdList);
+
+        /* Swap the value in the existing element rather than removing the
+         * key and adding it again. */
+        el->val = slDoubleNew(val);
+        }
+    }
+}
+
+void grabRawData(struct sqlConnection *conn, char *tableName)
+/* Write per-gene values of data table tableName to "<tableName>_data.tab".
+ *   conn      - open database connection.
+ *   tableName - data table name identifying the dataset.
+ * Every row of the table is loaded as a probeVals record.  The first column
+ * (probe id) is mapped to a gene through probeIdToGene(); rows whose probe has
+ * no gene are skipped.  For each gene and each sample of the dataset the
+ * values of all contributing probes are collected and then reduced to their
+ * median by medianSpHashList() before printing.
+ * Aborts if no dataset exists for tableName. */
+{
+struct datasets *da = findDataset(conn, tableName);
+if (!da)
+    errAbort("No dataset by name of %s in db\n", tableName);
+
+struct samples *sa, *saList = getSamples(conn, da);
+struct hash *idGeneHash = probeIdToGene(conn, da);
+
+char query[256];
+safef(query, sizeof(query), "select * from %s", tableName);
+
+struct hash *featureData = hashNew(0);   /* gene -> per-sample hash */
+struct slPair *sp, *spList = NULL;       /* same hashes, in first-seen gene order */
+
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *probe_id = row[0];
+
+    /* Resolve the gene before loading the record, so rows for unmapped
+     * probes are skipped without allocating anything. */
+    struct hashEl *el = hashLookup(idGeneHash, probe_id);
+    if (!el)
+        continue;
+    char *gene = el->val;
+
+    struct probeVals *pv = probeValsLoad(row);
+
+    struct hash *sampleData;
+    el = hashLookup(featureData, gene);
+    if (!el)
+        {
+        /* First probe seen for this gene: start its per-sample hash. */
+        sampleData = hashNew(0);
+        hashAdd(featureData, gene, sampleData);
+        AllocVar(sp);
+        sp->name = cloneString(gene);
+        sp->val = sampleData;
+        slAddHead(&spList, sp);
+        }
+    else
+        sampleData = el->val;
+
+    for (sa = saList; sa; sa = sa->next)
+        {
+        /* exp_id is used as the index of the sample's value in the record. */
+        double val = pv->sample_data[sa->exp_id];
+        struct slDouble *sd = slDoubleNew(val);
+
+        el = hashLookup(sampleData, sa->name);
+        if (!el)
+            hashAdd(sampleData, sa->name, sd);
+        else
+            {
+            /* The stored list is non-empty here, so appending to the tail
+             * leaves the head pointer held by the hash unchanged. */
+            struct slDouble *sdList = el->val;
+            slAddTail(&sdList, sd);
+            }
+        }
+
+    /* All values have been copied out; release the record.  The original
+     * leaked one probeVals per row. */
+    probeValsFree(&pv);
+    }
+sqlFreeResult(&sr);
+hashFree(&featureData);   /* the per-sample hashes stay reachable through spList */
+
+struct slName *slList = NULL;
+for (sa = saList; sa; sa = sa->next)
+    slNameAddHead(&slList, sa->name);
+
+/* Both lists were built with add-to-head; restore original order. */
+slReverse(&spList);
+slReverse(&slList);
+
+medianSpHashList(spList, slList);
+
+spHashListPrint(spList, slList, tableName);
+}
+
+void grabAnalysisData(struct sqlConnection *conn, char *tableName)
+/* Write the contents of analysis result table tableName to
+ * "<tableName>_data.tab", with features as rows and samples as columns.
+ *   conn      - open database connection.
+ *   tableName - table to dump; the first three columns of each row are read
+ *               as sample id, feature id and value.
+ * Sample ids are translated to names through SA_TABLE and feature ids through
+ * AF_TABLE.  A feature id with no name in AF_TABLE gets no output row.
+ * Aborts if a row refers to a sample id that is not in SA_TABLE. */
+{
+struct hash *sampleIdHash = createHash(conn, SA_TABLE, "id", "name");
+struct hash *featureIdHash = createHash(conn, AF_TABLE, "id", "feature_name");
+
+char query[256];
+safef(query, sizeof(query), "select * from %s", tableName);
+
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = NULL;
+
+struct hash *samplesHash = hashNew(0);   /* sample names already on slList */
+struct slName *slList = NULL;            /* sample names, in first-seen order */
+struct hash *featureData = hashNew(0);   /* feature id -> per-sample hash */
+struct slPair *sp, *spList = NULL;       /* feature name -> per-sample hash */
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *sample_id = row[0];
+    char *feature_id = row[1];
+    double val = atof(row[2]);
+
+    struct hash *sampleData;
+    struct hashEl *el = hashLookup(featureData, feature_id);
+    if (!el)
+        {
+        sampleData = hashNew(0);
+        hashAdd(featureData, feature_id, sampleData);
+        /* Only features with a known name get an output row. */
+        el = hashLookup(featureIdHash, feature_id);
+        if (el)
+            {
+            char *name = el->val;
+            AllocVar(sp);
+            sp->name = cloneString(name);
+            sp->val = sampleData;
+            slAddHead(&spList, sp);
+            }
+        }
+    else
+        sampleData = el->val;
+
+    /* The original dereferenced this lookup unchecked and crashed on an
+     * unknown sample id; fail with a diagnostic instead. */
+    el = hashLookup(sampleIdHash, sample_id);
+    if (!el)
+        errAbort("Sample id %s in %s not found in %s\n",
+                 sample_id, tableName, SA_TABLE);
+    char *name = el->val;
+    if (!hashLookup(samplesHash, name))
+        {
+        slNameAddHead(&slList, name);
+        hashAddInt(samplesHash, name, 1);
+        }
+    struct slDouble *sd = slDoubleNew(val);
+    hashAdd(sampleData, name, sd);
+    }
+
+sqlFreeResult(&sr);
+hashFree(&featureData);   /* the per-sample hashes stay reachable through spList */
+hashFree(&samplesHash);
+
+/* Both lists were built with add-to-head; restore first-seen order. */
+slReverse(&spList);
+slReverse(&slList);
+
+spHashListPrint(spList, slList, tableName);
+}
+
+boolean isAnalysisTable(struct sqlConnection *conn, char *tableName)
+/* Return the result of sqlExists() for a query that selects the rows of
+ * AN_TABLE whose result_table column equals tableName. */
+{
+char sql[256];
+boolean listed;
+
+safef(sql, sizeof(sql),
+      "select * from %s where result_table = \"%s\" ",
+      AN_TABLE, tableName);
+listed = sqlExists(conn, sql);
+return listed;
+}
+
+void grabData(char *tableName)
+/* Connect to the database and dump tableName with the matching routine:
+ * grabAnalysisData() if isAnalysisTable() reports it as an analysis table,
+ * otherwise grabClinicalData() when getClinical is set, otherwise
+ * grabRawData().  Aborts if the table does not exist. */
+{
+struct sqlConnection *conn = hAllocConnProfile(localDb, db);
+if (!sqlTableExists(conn, tableName))
+    errAbort("%s doesn't exist in %s db.\n", tableName, db);
+
+if (isAnalysisTable(conn, tableName))
+    grabAnalysisData(conn, tableName);
+else if (getClinical)
+    grabClinicalData(conn, tableName);
+else
+    grabRawData(conn, tableName);
+
+/* Release the connection; the original never gave it back. */
+hFreeConn(&conn);
+}
+
+
+int main(int argc, char *argv[])
+/* Parse the command line: exactly one positional argument (the table name)
+ * must remain after option processing, otherwise usage() is called.  The
+ * "clinical" option turns on getClinical before the table is dumped. */
+{
+char *tableName;
+
+optionInit(&argc, argv, options);
+if (argc != 2)
+    usage();
+tableName = argv[1];
+
+if (optionExists("clinical"))
+    {
+    getClinical = TRUE;
+    }
+
+grabData(tableName);
+return 0;
+}