src/hg/instinct/bioInt2/grabData.c 1.1

1.1 2009/04/05 21:08:07 jsanborn
added files, fixed cohorts API
Index: src/hg/instinct/bioInt2/grabData.c
===================================================================
RCS file: src/hg/instinct/bioInt2/grabData.c
diff -N src/hg/instinct/bioInt2/grabData.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/instinct/bioInt2/grabData.c	5 Apr 2009 21:08:07 -0000	1.1
@@ -0,0 +1,412 @@
+/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "hPrint.h"
+#include "hdb.h"  
+#include "dystring.h"
+#include "bioIntDb.h"
+#include "bioIntDriver.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+	 "grabData \n"
+	 "usage:\n"
+	 "   grabData table\n"
+	 );
+}
+
+char db[128] = "bioInt";
+char localDb[128] = "localDb";
+
+boolean getClinical = FALSE;
+
+static struct optionSpec options[] = {
+    {"clinical", OPTION_BOOLEAN},
+    {NULL, 0}
+}; 
+
+void spHashListPrint(struct slPair *spList, struct slName *slList, char *tableName)
+{
+char filename[256];
+safef(filename, sizeof(filename), "%s_data.tab", tableName);
+FILE *f = mustOpen(filename, "w");
+
+struct slName *sl;
+struct slPair *sp;
+
+// Print header
+fprintf(f, "feature_name\t");
+for (sl = slList; sl; sl = sl->next)
+    {
+    fprintf(f, "%s", sl->name);
+    if (sl->next)
+	fprintf(f, ",");
+    }
+fprintf(f, "\n");
+
+struct hashEl *el;
+for (sp = spList; sp; sp = sp->next)
+    {
+    fprintf(f, "%s\t", sp->name);
+    struct hash *hash = sp->val;
+    for (sl = slList; sl; sl = sl->next)
+	{
+	el = hashLookup(hash, sl->name);
+	if (el)
+	    {
+	    struct slDouble *sd = el->val;
+	    fprintf(f, "%f", sd->val);
+	    }
+	if (sl->next)
+	    fprintf(f, ",");
+	}
+    fprintf(f, "\n");
+    }
+}
+
+struct hash *createHash(struct sqlConnection *biConn,
+			char *table, char *key_field, char *val_field)
+{
+struct hash *hash = hashNew(0);
+char query[128];
+safef(query, sizeof(query), "select %s, %s from %s", key_field, val_field, table);
+
+struct sqlResult *sr = sqlGetResult(biConn, query);
+char **row = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *id = row[0];
+    char *name = cloneString(row[1]);
+    hashAdd(hash, id, name);
+    }
+
+return hash;
+}   
+
+struct datasets *findDataset(struct sqlConnection *conn, char *data_table)
+{
+char query[256];
+safef(query, sizeof(query), 
+      "select * from %s where data_table = \"%s\" ", 
+      DA_TABLE, data_table);
+
+return datasetsLoadByQuery(conn, query);
+}
+
+struct samples *getSamples(struct sqlConnection *conn, struct datasets *da)
+{
+char query[256];
+safef(query, sizeof(query), 
+      "select * from %s where dataset_id = %d order by exp_id", 
+      SA_TABLE, da->id);
+
+return samplesLoadByQuery(conn, query);
+}
+
+
+
+void grabClinicalData(struct sqlConnection *conn, char *tableName)
+{
+struct datasets *da = findDataset(conn, tableName);
+if (!da)
+    errAbort("No dataset by name of %s in db\n", tableName);
+
+struct samples *sa, *saList = getSamples(conn, da);
+
+struct dyString *dy = newDyString(100);
+dyStringPrintf(dy, 
+	       "select %s.name, %s.name, %s.val from %s join %s on %s.sample_id = %s.id "
+	       "join %s on %s.feature_id = %s.id "
+	       "where %s.id in (",
+	       SA_TABLE, FE_TABLE, CD_TABLE,
+	       CD_TABLE, SA_TABLE, CD_TABLE, SA_TABLE, 
+	       FE_TABLE, CD_TABLE, FE_TABLE, SA_TABLE);
+
+for (sa = saList; sa; sa = sa->next)
+    {
+    dyStringPrintf(dy, "%d", sa->id);
+    if (sa->next)
+	dyStringPrintf(dy, ",");
+    }
+dyStringPrintf(dy, ")");
+char *query = dyStringCannibalize(&dy);
+
+struct slPair *sp, *spList = NULL;
+struct slName *slList = NULL;
+struct hash *samplesHash = hashNew(0);
+struct hash *featureData = hashNew(0);
+
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = NULL;
+
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *sample = row[0];
+    char *feature = row[1];
+    double val = atof(row[2]);
+    
+    struct hash *sampleData;
+    struct hashEl *el = hashLookup(featureData, feature);
+    if (!el)
+	{
+	sampleData = hashNew(0);
+	hashAdd(featureData, feature, sampleData);
+	AllocVar(sp);
+	sp->name = cloneString(feature);
+	sp->val = sampleData;
+	slAddHead(&spList, sp);
+	}
+    else
+	sampleData = el->val;
+
+    if (!hashLookup(samplesHash, sample))
+	{
+	slNameAddHead(&slList, sample);
+	hashAddInt(samplesHash, sample, 1);
+	}
+    struct slDouble *sd = slDoubleNew(val);
+    hashAdd(sampleData, sample, sd);
+    } 
+
+sqlFreeResult(&sr);
+hashFree(&featureData);
+
+slReverse(&spList);
+slReverse(&slList);
+
+char clinTableName[512];
+safef(clinTableName, sizeof(clinTableName), 
+      "%s_clinical",
+      tableName);
+
+spHashListPrint(spList, slList, clinTableName);
+}
+
+
+struct hash *probeIdToGene(struct sqlConnection *conn, struct datasets *da)
+{
+char query[512];
+
+safef(query, sizeof(query), 
+      "select DISTINCT %s.id,geneSymbol from %s join %s on id=probe_id "
+      "join %s on gene_id=%s.id join %s on %s.kgId=%s.kgId;", 
+      da->probe_table, da->probe_table, da->probe_to_gene_table, GL_TABLE, 
+      GL_TABLE, KX_TABLE, GL_TABLE, KX_TABLE);
+
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = NULL;
+
+struct hash *hash = hashNew(0);
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *id = row[0];
+    char *gene = cloneString(row[1]);
+    hashAdd(hash, id, gene);
+    }
+sqlFreeResult(&sr);
+return hash;
+}
+
+void medianSpHashList(struct slPair *spList, struct slName *slList)
+{
+struct hashEl *el;
+struct slPair *sp;
+struct slName *sl;
+for (sp = spList; sp; sp = sp->next)
+    {
+    struct hash *hash = sp->val;
+    for (sl = slList; sl; sl = sl->next)
+	{
+	el = hashLookup(hash, sl->name);
+	struct slDouble *sd, *sdList = el->val;
+	
+	double val = slDoubleMedian(sdList);
+	sd = slDoubleNew(val);
+
+	hashRemove(hash, sl->name);
+	slFreeList(&sdList);
+
+	hashAdd(hash, sl->name, sd);
+	}
+    }
+}
+
+void grabRawData(struct sqlConnection *conn, char *tableName)
+{
+struct datasets *da = findDataset(conn, tableName);
+if (!da)
+    errAbort("No dataset by name of %s in db\n", tableName);
+
+struct samples *sa, *saList = getSamples(conn, da);
+struct hash *idGeneHash = probeIdToGene(conn, da);
+
+char query[256];
+safef(query, sizeof(query), "select * from %s", tableName);
+
+struct hash *featureData = hashNew(0);
+struct slPair *sp, *spList = NULL;
+
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *probe_id = row[0];
+    struct probeVals *pv = probeValsLoad(row);
+    
+    struct hashEl *el = hashLookup(idGeneHash, probe_id);
+    if (!el)
+	continue;
+    char *gene = el->val;
+
+    struct hash *sampleData;
+    el = hashLookup(featureData, gene);
+    if (!el)
+	{
+	sampleData = hashNew(0);
+	hashAdd(featureData, gene, sampleData);
+	AllocVar(sp);
+	sp->name = cloneString(gene);
+	sp->val = sampleData;
+	slAddHead(&spList, sp);
+	}
+    else
+	sampleData = el->val;
+
+    for (sa = saList; sa; sa = sa->next)
+	{
+	double val = pv->sample_data[sa->exp_id];
+	struct slDouble *sd = slDoubleNew(val);
+
+	el = hashLookup(sampleData, sa->name);
+	if (!el)
+	    hashAdd(sampleData, sa->name, sd);
+	else
+	    {
+	    struct slDouble *sdList = el->val;
+	    slAddTail(&sdList, sd);
+	    }
+	}
+    }
+sqlFreeResult(&sr);
+hashFree(&featureData);
+
+struct slName *slList = NULL;
+for (sa = saList; sa; sa = sa->next)
+    slNameAddHead(&slList, sa->name);
+
+slReverse(&spList);
+slReverse(&slList);
+
+medianSpHashList(spList, slList);
+
+spHashListPrint(spList, slList, tableName);
+}
+
+void grabAnalysisData(struct sqlConnection *conn, char *tableName)
+{
+struct hash *sampleIdHash = createHash(conn, SA_TABLE, "id", "name");
+struct hash *featureIdHash = createHash(conn, AF_TABLE, "id", "feature_name");
+
+char query[256];
+safef(query, sizeof(query), "select * from %s", tableName);
+
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = NULL;
+
+struct hash *samplesHash = hashNew(0);
+struct slName *slList = NULL;
+struct hash *featureData = hashNew(0);
+struct slPair *sp, *spList = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *sample_id = row[0];
+    char *feature_id = row[1];
+    double val = atof(row[2]);
+
+    struct hash *sampleData;
+    struct hashEl *el = hashLookup(featureData, feature_id);
+    if (!el)
+	{
+	sampleData = hashNew(0);
+	hashAdd(featureData, feature_id, sampleData);
+	el = hashLookup(featureIdHash, feature_id);
+	if (el)
+	    {
+	    char *name = el->val;
+	    AllocVar(sp);
+	    sp->name = cloneString(name);
+	    sp->val = sampleData;
+	    slAddHead(&spList, sp);
+	    }
+	}
+    else
+	sampleData = el->val;
+
+    el = hashLookup(sampleIdHash, sample_id);
+    char *name = el->val;
+    if (!hashLookup(samplesHash, name))
+	{
+	slNameAddHead(&slList, name);
+	hashAddInt(samplesHash, name, 1);
+	}
+    struct slDouble *sd = slDoubleNew(val);
+    hashAdd(sampleData, name, sd);
+    } 
+
+sqlFreeResult(&sr);
+hashFree(&featureData);
+hashFree(&samplesHash);
+
+slReverse(&spList);
+slReverse(&slList);
+
+spHashListPrint(spList, slList, tableName);
+}
+
+boolean isAnalysisTable(struct sqlConnection *conn, char *tableName)
+{
+char query[256];
+safef(query, sizeof(query), 
+      "select * from %s where result_table = \"%s\" ",
+      AN_TABLE, tableName);
+
+return sqlExists(conn, query);
+}
+
+void grabData(char *tableName)
+{
+struct sqlConnection *conn = hAllocConnProfile(localDb, db);
+if (!sqlTableExists(conn, tableName))
+    errAbort("%s doesn't exist in %s db.\n", tableName, db);
+
+if (isAnalysisTable(conn, tableName))
+    grabAnalysisData(conn, tableName);
+else
+    {
+    if (getClinical)
+	grabClinicalData(conn, tableName);
+    else
+	grabRawData(conn, tableName);
+    }
+}
+
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 2)
+    usage();
+
+if (optionExists("clinical"))
+    getClinical = TRUE;
+
+grabData(argv[1]);
+
+return 0;
+}