src/hg/instinct/bioInt2/grabData.c 1.3

1.3 2009/04/07 03:41:09 jsanborn
updated
Index: src/hg/instinct/bioInt2/grabData.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/grabData.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/instinct/bioInt2/grabData.c	6 Apr 2009 18:51:43 -0000	1.2
+++ src/hg/instinct/bioInt2/grabData.c	7 Apr 2009 03:41:09 -0000	1.3
@@ -1,482 +1,484 @@
 /* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "hPrint.h"
 #include "hdb.h"  
 #include "dystring.h"
 #include "bioIntDb.h"
 #include "bioIntDriver.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
 	 "grabData \n"
 	 "usage:\n"
 	 "   grabData [options] table\n"
 	 "      -clinical    get clinical data corresponding to table\n"
 	 "      -transpose   samples as rows, features as columns\n"
 	 );
 }
 
 char db[128] = "bioInt";
 char localDb[128] = "localDb";
 
 boolean getClinical = FALSE;
 boolean transpose = FALSE;
 
 static struct optionSpec options[] = {
     {"clinical", OPTION_BOOLEAN},
     {"transpose", OPTION_BOOLEAN},
     {NULL, 0}
 }; 
 
 void spHashListTranspose(struct slPair *spList, struct slName *slList, struct slPair **retSpList, struct slName **retSlList)
 {
 // Transpose data in spList. So, sp->name = sample name, sp->hash keyed by feature name 
 
 struct slName *sl, *outSlList = NULL;
 struct slPair *sp, *outSp, *outSpList = NULL;
 
 struct hash *featureHash = hashNew(0);
 struct hash *sampleHash = hashNew(0);
 for (sl = slList; sl; sl = sl->next)
     {
     struct hash *dataHash;
     struct hashEl *el = hashLookup(sampleHash, sl->name);
     if (!el)
 	{
 	AllocVar(outSp);
 	outSp->name = cloneString(sl->name);
 	dataHash = hashNew(0);
 	outSp->val = dataHash;
 	slAddHead(&outSpList, outSp);
 	hashAdd(sampleHash, sl->name, outSp);
 	}
     else
 	outSp = el->val;
 
     for (sp = spList; sp; sp = sp->next)
 	{
 	el = hashLookup(featureHash, sp->name);
 	if (!el)
 	    {
 	    slNameAddHead(&outSlList, sp->name);
 	    hashAddInt(featureHash, sp->name, 1);
 	    }
 
 	struct hash *hash = sp->val;
 	el = hashLookup(hash, sl->name);
 	if (el)
 	    {
 	    struct slDouble *sd = el->val;
 	    hash = outSp->val;
 	    hashAdd(hash, sp->name, sd);
 	    }
 	}
     }
 
 *retSpList = outSpList;
 *retSlList = outSlList;
 }
 
 void spHashListPrint(struct slPair *inSpList, struct slName *inSlList, char *tableName)
 {
 struct slPair *spList = NULL;
 struct slName *slList = NULL;
+
+char filename[256];
 if (transpose)
     {
+    safef(filename, sizeof(filename), "%s_data_transpose.tab", tableName);
     fprintf(stdout, "transposing hash\n");
     spHashListTranspose(inSpList, inSlList, &spList, &slList);
     fprintf(stdout, "finished transposing\n");
     }
 else
     {
+    safef(filename, sizeof(filename), "%s_data.tab", tableName);
     spList = inSpList;
     slList = inSlList;
     }
-char filename[256];
-safef(filename, sizeof(filename), "%s_data.tab", tableName);
 FILE *f = mustOpen(filename, "w");
 
 struct slName *sl;
 struct slPair *sp;
 
 // Print header
 fprintf(f, "feature_name\t");
 for (sl = slList; sl; sl = sl->next)
     {
     fprintf(f, "%s", sl->name);
     if (sl->next)
 	fprintf(f, ",");
     }
 fprintf(f, "\n");
 
 struct hashEl *el;
 for (sp = spList; sp; sp = sp->next)
     {
     fprintf(f, "%s\t", sp->name);
     struct hash *hash = sp->val;
     for (sl = slList; sl; sl = sl->next)
 	{
 	el = hashLookup(hash, sl->name);
 	if (el)
 	    {
 	    struct slDouble *sd = el->val;
 	    fprintf(f, "%f", sd->val);
 	    }
 	if (sl->next)
 	    fprintf(f, ",");
 	}
     fprintf(f, "\n");
     }
 }
 
 
 struct hash *createHash(struct sqlConnection *biConn,
 			char *table, char *key_field, char *val_field)
 {
 struct hash *hash = hashNew(0);
 char query[128];
 safef(query, sizeof(query), "select %s, %s from %s", key_field, val_field, table);
 
 struct sqlResult *sr = sqlGetResult(biConn, query);
 char **row = NULL;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     char *id = row[0];
     char *name = cloneString(row[1]);
     hashAdd(hash, id, name);
     }
 
 return hash;
 }   
 
 struct datasets *findDataset(struct sqlConnection *conn, char *data_table)
 {
 char query[256];
 safef(query, sizeof(query), 
       "select * from %s where data_table = \"%s\" ", 
       DA_TABLE, data_table);
 
 return datasetsLoadByQuery(conn, query);
 }
 
 struct samples *getSamples(struct sqlConnection *conn, struct datasets *da)
 {
 char query[256];
 safef(query, sizeof(query), 
       "select * from %s where dataset_id = %d order by exp_id", 
       SA_TABLE, da->id);
 
 return samplesLoadByQuery(conn, query);
 }
 
 
 
 void grabClinicalData(struct sqlConnection *conn, char *tableName)
 {
 struct datasets *da = findDataset(conn, tableName);
 if (!da)
     errAbort("No dataset by name of %s in db\n", tableName);
 
 struct samples *sa, *saList = getSamples(conn, da);
 
 struct dyString *dy = newDyString(100);
 dyStringPrintf(dy, 
 	       "select %s.name, %s.name, %s.val from %s join %s on %s.sample_id = %s.id "
 	       "join %s on %s.feature_id = %s.id "
 	       "where %s.id in (",
 	       SA_TABLE, FE_TABLE, CD_TABLE,
 	       CD_TABLE, SA_TABLE, CD_TABLE, SA_TABLE, 
 	       FE_TABLE, CD_TABLE, FE_TABLE, SA_TABLE);
 
 for (sa = saList; sa; sa = sa->next)
     {
     dyStringPrintf(dy, "%d", sa->id);
     if (sa->next)
 	dyStringPrintf(dy, ",");
     }
 dyStringPrintf(dy, ")");
 char *query = dyStringCannibalize(&dy);
 
 struct slPair *sp, *spList = NULL;
 struct slName *slList = NULL;
 struct hash *samplesHash = hashNew(0);
 struct hash *featureData = hashNew(0);
 
 struct sqlResult *sr = sqlGetResult(conn, query);
 char **row = NULL;
 
 while ((row = sqlNextRow(sr)) != NULL)
     {
     char *sample = row[0];
     char *feature = row[1];
     double val = atof(row[2]);
     
     struct hash *sampleData;
     struct hashEl *el = hashLookup(featureData, feature);
     if (!el)
 	{
 	sampleData = hashNew(0);
 	hashAdd(featureData, feature, sampleData);
 	AllocVar(sp);
 	sp->name = cloneString(feature);
 	sp->val = sampleData;
 	slAddHead(&spList, sp);
 	}
     else
 	sampleData = el->val;
 
     if (!hashLookup(samplesHash, sample))
 	{
 	slNameAddHead(&slList, sample);
 	hashAddInt(samplesHash, sample, 1);
 	}
     struct slDouble *sd = slDoubleNew(val);
     hashAdd(sampleData, sample, sd);
     } 
 
 sqlFreeResult(&sr);
 hashFree(&featureData);
 
 slReverse(&spList);
 slReverse(&slList);
 
 char clinTableName[512];
 safef(clinTableName, sizeof(clinTableName), 
       "%s_clinical",
       tableName);
 
 spHashListPrint(spList, slList, clinTableName);
 }
 
 
 struct hash *probeIdToGene(struct sqlConnection *conn, struct datasets *da)
 {
 char query[512];
 
 safef(query, sizeof(query), 
       "select DISTINCT %s.id,geneSymbol from %s join %s on id=probe_id "
       "join %s on gene_id=%s.id join %s on %s.kgId=%s.kgId;", 
       da->probe_table, da->probe_table, da->probe_to_gene_table, GL_TABLE, 
       GL_TABLE, KX_TABLE, GL_TABLE, KX_TABLE);
 
 struct sqlResult *sr = sqlGetResult(conn, query);
 char **row = NULL;
 
 struct hash *hash = hashNew(0);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     char *id = row[0];
     char *gene = cloneString(row[1]);
     hashAdd(hash, id, gene);
     }
 sqlFreeResult(&sr);
 return hash;
 }
 
 void medianSpHashList(struct slPair *spList, struct slName *slList)
 {
 struct hashEl *el;
 struct slPair *sp;
 struct slName *sl;
 for (sp = spList; sp; sp = sp->next)
     {
     struct hash *hash = sp->val;
     for (sl = slList; sl; sl = sl->next)
 	{
 	el = hashLookup(hash, sl->name);
 	struct slDouble *sd, *sdList = el->val;
 	
 	double val = slDoubleMedian(sdList);
 	sd = slDoubleNew(val);
 
 	hashRemove(hash, sl->name);
 	slFreeList(&sdList);
 
 	hashAdd(hash, sl->name, sd);
 	}
     }
 }
 
 void grabRawData(struct sqlConnection *conn, char *tableName)
 {
 struct datasets *da = findDataset(conn, tableName);
 if (!da)
     errAbort("No dataset by name of %s in db\n", tableName);
 
 struct samples *sa, *saList = getSamples(conn, da);
 struct hash *idGeneHash = probeIdToGene(conn, da);
 
 char query[256];
 safef(query, sizeof(query), "select * from %s", tableName);
 
 struct hash *featureData = hashNew(0);
 struct slPair *sp, *spList = NULL;
 
 struct sqlResult *sr = sqlGetResult(conn, query);
 char **row = NULL;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     char *probe_id = row[0];
     struct probeVals *pv = probeValsLoad(row);
     
     struct hashEl *el = hashLookup(idGeneHash, probe_id);
     if (!el)
 	continue;
     char *gene = el->val;
 
     struct hash *sampleData;
     el = hashLookup(featureData, gene);
     if (!el)
 	{
 	sampleData = hashNew(0);
 	hashAdd(featureData, gene, sampleData);
 	AllocVar(sp);
 	sp->name = cloneString(gene);
 	sp->val = sampleData;
 	slAddHead(&spList, sp);
 	}
     else
 	sampleData = el->val;
 
     for (sa = saList; sa; sa = sa->next)
 	{
 	double val = pv->sample_data[sa->exp_id];
 	struct slDouble *sd = slDoubleNew(val);
 
 	el = hashLookup(sampleData, sa->name);
 	if (!el)
 	    hashAdd(sampleData, sa->name, sd);
 	else
 	    {
 	    struct slDouble *sdList = el->val;
 	    slAddTail(&sdList, sd);
 	    }
 	}
     }
 sqlFreeResult(&sr);
 hashFree(&featureData);
 
 struct slName *slList = NULL;
 for (sa = saList; sa; sa = sa->next)
     slNameAddHead(&slList, sa->name);
 
 slReverse(&spList);
 slReverse(&slList);
 
 medianSpHashList(spList, slList);
 
 spHashListPrint(spList, slList, tableName);
 }
 
 void grabAnalysisData(struct sqlConnection *conn, char *tableName)
 {
 struct hash *sampleIdHash = createHash(conn, SA_TABLE, "id", "name");
 struct hash *featureIdHash = createHash(conn, AF_TABLE, "id", "feature_name");
 
 char query[256];
 safef(query, sizeof(query), "select * from %s", tableName);
 
 struct sqlResult *sr = sqlGetResult(conn, query);
 char **row = NULL;
 
 struct hash *samplesHash = hashNew(0);
 struct slName *slList = NULL;
 struct hash *featureData = hashNew(0);
 struct slPair *sp, *spList = NULL;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     char *sample_id = row[0];
     char *feature_id = row[1];
     double val = atof(row[2]);
 
     struct hash *sampleData;
     struct hashEl *el = hashLookup(featureData, feature_id);
     if (!el)
 	{
 	sampleData = hashNew(0);
 	hashAdd(featureData, feature_id, sampleData);
 	el = hashLookup(featureIdHash, feature_id);
 	if (el)
 	    {
 	    char *name = el->val;
 	    AllocVar(sp);
 	    sp->name = cloneString(name);
 	    sp->val = sampleData;
 	    slAddHead(&spList, sp);
 	    }
 	}
     else
 	sampleData = el->val;
 
     el = hashLookup(sampleIdHash, sample_id);
     char *name = el->val;
     if (!hashLookup(samplesHash, name))
 	{
 	slNameAddHead(&slList, name);
 	hashAddInt(samplesHash, name, 1);
 	}
     struct slDouble *sd = slDoubleNew(val);
     hashAdd(sampleData, name, sd);
     } 
 
 sqlFreeResult(&sr);
 hashFree(&featureData);
 hashFree(&samplesHash);
 
 slReverse(&spList);
 slReverse(&slList);
 
 spHashListPrint(spList, slList, tableName);
 }
 
 boolean isAnalysisTable(struct sqlConnection *conn, char *tableName)
 {
 char query[256];
 safef(query, sizeof(query), 
       "select * from %s where result_table = \"%s\" ",
       AN_TABLE, tableName);
 
 return sqlExists(conn, query);
 }
 
 void grabData(char *tableName)
 {
 struct sqlConnection *conn = hAllocConnProfile(localDb, db);
 if (!sqlTableExists(conn, tableName))
     errAbort("%s doesn't exist in %s db.\n", tableName, db);
 
 if (isAnalysisTable(conn, tableName))
     grabAnalysisData(conn, tableName);
 else
     {
     if (getClinical)
 	grabClinicalData(conn, tableName);
     else
 	grabRawData(conn, tableName);
     }
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
 
 if (optionExists("clinical"))
     getClinical = TRUE;
 
 if (optionExists("transpose"))
     transpose = TRUE;
 
 grabData(argv[1]);
 
 return 0;
 }