src/hg/instinct/extractData/extractData.c 1.6

1.6 2010/01/14 23:33:14 jsanborn
updated
Index: src/hg/instinct/extractData/extractData.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/extractData/extractData.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 1000000 -r1.5 -r1.6
--- src/hg/instinct/extractData/extractData.c	14 Jan 2010 23:15:04 -0000	1.5
+++ src/hg/instinct/extractData/extractData.c	14 Jan 2010 23:33:14 -0000	1.6
@@ -1,366 +1,378 @@
 /* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "bed.h"
 #include "genePred.h"
 #include "hPrint.h"
 #include "hdb.h"  
 #include "microarray.h"
 #include "ra.h"
 #include "featuresLib.h"
 #include "hgHeatmapLib.h"
 #include "cprob.h"
 #include "hgStatsLib.h" 
 
 char *hgDb = "hg18";
 char *genome = "Human";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "extractData \n"
   "   extractData [OPTIONS] db table sample_id genes(s)\n"
   "options:\n"
   "   -median       Output median value if multiple probes\n"
   "   -tcga         Handle TCGA ids, keeps only first 16 chars in sample_id\n"
   "   -samples=STR  Sample comma-separated list, otherwise all sample data are returned.\n"
+  "examples:\n"
+  "* Single sample / Single gene / All probes:\n"
+  "  ./extractData -tcga -samples=TCGA-06-0145-01A harvardCGH egfr\n"
+  "\n"
+  "* Multiple samples / Single gene /Median-value of probes:\n"
+  "  ./extractData -tcga -median -samples=TCGA-06-0145-01A,TCGA-06-0145-10A harvardCGH egfr\n"
+  "\n"
+  "* All samples / multiple genes / All probes of all genes:\n"
+  "  ./extractData harvardCGH egfr,erbb2,esr1\n"
+  "\n"
+  "* All samples / multiple genes / Median-value of probes:\n"
+  "  ./extractData harvardCGH egfr,erbb2,esr1\n"
   );
 }
 
 #define TCGA_PATIENT_PREFIX 12
 #define TCGA_SAMPLE_PREFIX 16
 
 boolean median = FALSE;
 boolean isTCGA = FALSE;
 
 static struct optionSpec options[] = {
     {"median", OPTION_BOOLEAN},
     {"tcga", OPTION_BOOLEAN},
     {"samples", OPTION_STRING},
     {NULL, 0},
 };
 
 struct maGrouping *getMaGrouping(struct sqlConnection *hgConn, char *tableName)
 {
 /*microarray specific settings*/
 struct trackDb *tdb = hMaybeTrackInfo(hgConn, tableName);  
 struct microarrayGroups *maGs = maGroupings("hg18", tableName);
 trackDbFreeList(&tdb);
 if (!maGs)
     return NULL;
 return maGs->allArrays;
 }
 
 struct hash *getSettings(char *tableName)
 {
 struct column *raList = getColumns(NULL, "datasets.ra", NULL);  
 
 struct column *col;
 struct hash *settings = NULL;
 for (col = raList; col; col = col->next)
     {
     if (!sameString(col->name, tableName))
 	continue;
 
     settings = col->settings;
     break;
     }
 
 if (!settings)
     errAbort("Couldn't find datasets.ra listing for %s", tableName);
 
 return settings;
 }
 
 struct geneAlias {
     struct geneAlias *next;
 
     char *gene;
     struct slName *probes;
 }; 
  
 struct hash *getAliases(struct sqlConnection *hgConn, char *tableName, 
 			struct slName *genes)
 {
 if (!hgConn || !tableName || !genes)
     return NULL;
 
 struct slName *sl;
 struct dyString *dy = newDyString(100);
 dyStringPrintf(dy,
 	       "select * from %s where alias in (", 
 	       tableName);
 for (sl = genes; sl; sl = sl->next)
     {
     dyStringPrintf(dy, "\"%s\"", sl->name);
     if (sl->next)
 	dyStringPrintf(dy, ",");
     }
 dyStringPrintf(dy, ");");
 char *query = dyStringCannibalize(&dy);
 
 struct sqlResult *sr = sqlGetResult(hgConn, query);
 
 struct geneAlias *ga, *gaList = NULL;
 struct hash *gaHash = hashNew(0);
 
 char **row;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     char *probe = cloneString(row[0]);
     char *gene = cloneString(row[1]);
 
     struct hashEl *el = hashLookup(gaHash, gene);
 
     if (!el)
 	{
 	ga = AllocA(struct geneAlias);
 	ga->gene = cloneString(gene);
 	ga->probes = NULL;
 
 	slAddHead(&gaList, ga);
 	hashAdd(gaHash, gene, ga);
 	}
     else
 	ga = el->val;
 
     slNameAddHead(&ga->probes, probe);	
     }
 
 sqlFreeResult(&sr);              
 
 return gaHash;                   
 }
 
 struct sampleVals {
     struct sampleVals *next;
 
     int expId;
     char *name;
     struct slDouble *vals;
     struct hash *valHash;
 }; 
 
 void setProbeData(struct sqlConnection *hgConn, struct sampleVals *svList, 
 		  char *dataTable, struct slName *probes)
 {
 struct slName *sl;
 struct dyString *dy = newDyString(100);
 dyStringPrintf(dy,
 	       "select * from %s where name in(", 
 	       dataTable);
 for (sl = probes; sl; sl = sl->next)
     {
     dyStringPrintf(dy, "\"%s\"", sl->name);
     if (sl->next)
 	dyStringPrintf(dy, ",");
     }
 dyStringPrintf(dy, ");");
 char *query = dyStringCannibalize(&dy);
 
 /* Get bed15 data from hg18 database */
 struct sqlResult *sr = sqlGetResult(hgConn, query);
 
 char **row = NULL;
 struct sampleVals *sv;
 
 while ((row = sqlNextRow(sr)) != NULL)
     {
     struct bed *nb = bedLoadN(row+1, 15);
   
     for (sv = svList; sv; sv = sv->next)
 	{
 	if (sv->expId >= nb->expCount)
 	    continue;
 	struct slDouble *sd = slDoubleNew(nb->expScores[sv->expId]);
 	slAddHead(&sv->vals, sd);
 	hashAdd(sv->valHash, nb->name, sd);
 	}
 
     bedFree(&nb);
     }
 sqlFreeResult(&sr);
 }
 
 
 struct sampleVals *prepareSampleVals(char *samples, struct maGrouping *allA)
 {
 boolean allSamples = FALSE;
 struct hash *sampleHash = hashNew(0);
 
 if (!samples)
     allSamples = TRUE;
 else
     {
     struct slName *sl, *slList = slNameListFromComma(samples);
     for (sl = slList; sl; sl = sl->next)
 	{
 	char *sample = sl->name;
 	if (isTCGA)
 	    sample = cloneStringZ(sl->name, TCGA_SAMPLE_PREFIX);	
 	hashAddInt(sampleHash, sample, 1);
 	}
     }
 
 struct hashEl *el;
 struct sampleVals *sv, *svList = NULL;
 int i;
 for (i = 0; i < allA->size; i++)
     {
     int expId = allA->expIds[i];
     char *maName = allA->names[i];
     if (isTCGA)
 	maName = cloneStringZ(allA->names[i], TCGA_SAMPLE_PREFIX);
 
     if (allSamples || (el = hashLookup(sampleHash, maName)) != NULL)
 	{
 	sv = AllocA(struct sampleVals);
 	sv->name = cloneString(allA->names[i]);
 	sv->expId = expId;
 	sv->vals = NULL;
 	sv->valHash = hashNew(0);
 
 	slAddHead(&svList, sv);
 	}
     
     }
 slReverse(&svList);
 
 return svList;
 }
 
 void printProbeData(struct geneAlias *ga, struct sampleVals *svList)
 {
 struct sampleVals *sv;
 struct dyString *dy = newDyString(100);
 
 if (median)
     {
     dyStringPrintf(dy, "%s\tmedian\t", ga->gene);
     for (sv = svList; sv; sv = sv->next)
 	{
 	double val = slDoubleMedian(sv->vals);
 	dyStringPrintf(dy, "%f", val);
 	
 	if (sv->next)
 	    dyStringPrintf(dy, "\t");
 	}
     dyStringPrintf(dy, "\n");
     }
 else
     {
     struct hashEl *el;
     struct slName *sl;
     for (sl = ga->probes; sl; sl = sl->next)
 	{
 	dyStringPrintf(dy, "%s\t%s\t", ga->gene, sl->name);
 	for (sv = svList; sv; sv = sv->next)
 	    {
 	    if ((el = hashLookup(sv->valHash, sl->name)) != NULL)
 		{
 		struct slDouble *sd = el->val;
 		dyStringPrintf(dy, "%f", sd->val);
 		}
 	    else
 		dyStringPrintf(dy, "NA");
 		    
 	    if (sv->next)
 		dyStringPrintf(dy, "\t");
 	    }
 	dyStringPrintf(dy, "\n");
 	}
     }
 
 char *valStr = dyStringCannibalize(&dy);
 printf("%s", valStr);
 }
 
 void clearProbeData(struct sampleVals *svList)
 {
 struct sampleVals *sv;
 for (sv = svList; sv; sv = sv->next)
     {
     slFreeList(&sv->vals);
     hashFree(&sv->valHash);
 
     sv->vals = NULL;
     sv->valHash = hashNew(0);
     }
 }
 
 
 void extractData(char *tableName, char *geneList, char *samples)
 {
 if (!geneList)
     return;
 
 struct slName *genes = slNameListFromComma(geneList);
 
 struct hashEl *el;
 struct sqlConnection *hgConn = hAllocConnProfile("localDb", hgDb);
 
 /* Set up datasets entry */
 struct maGrouping *allA = getMaGrouping(hgConn, tableName);
 if (!allA)
     errAbort("Could not find maGrouping for %s!", tableName);
 
 struct hash *settings = getSettings(tableName);
 el = hashLookup(settings, "aliasTable");
 if (!el)
     errAbort("No aliasTable.\n");
 char *aliasTable = cloneString(el->val);
 
 struct hash *gaHash = getAliases(hgConn, aliasTable, genes);
 
 struct sampleVals *sv, *svList = prepareSampleVals(samples, allA);
 if (!svList)
     errAbort("No samples allocated.\n");
 
 printf("gene\tprobe");
 for (sv = svList; sv; sv = sv->next)
     printf("\t%s", sv->name);
 printf("\n");
 
 struct hashCookie cookie = hashFirst(gaHash);
 while ((el = hashNext(&cookie)) != NULL)
     {
     struct geneAlias *ga = el->val;    
     setProbeData(hgConn, svList, tableName, ga->probes);
 
     printProbeData(ga, svList);
 
     clearProbeData(svList);
     }
 
 #if 0
     if (!sdList)
 	continue;
 
 
     }
 #endif
 
 hFreeConn(&hgConn);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 
 if (optionExists("median"))
     median = TRUE;
 if (optionExists("tcga"))
     isTCGA = TRUE;
 
 char *samples = optionVal("samples", NULL);
 extractData(argv[1], argv[2], samples);
 return 0;
 }