src/hg/instinct/extractData/extractData.c 1.1
1.1 2010/01/14 05:18:36 jsanborn
initial commit
Index: src/hg/instinct/extractData/extractData.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/extractData/extractData.c,v
retrieving revision 1.7
retrieving revision 1.1
diff -b -B -U 1000000 -r1.7 -r1.1
--- src/hg/instinct/extractData/extractData.c 15 Jan 2010 01:45:21 -0000 1.7
+++ src/hg/instinct/extractData/extractData.c 14 Jan 2010 05:18:36 -0000 1.1
@@ -1,370 +1,216 @@
/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "bed.h"
#include "genePred.h"
#include "hPrint.h"
#include "hdb.h"
#include "microarray.h"
#include "ra.h"
#include "featuresLib.h"
#include "hgHeatmapLib.h"
#include "cprob.h"
#include "hgStatsLib.h"
char *hgDb = "hg18";
char *genome = "Human";
void usage()
/* Explain usage and exit. */
{
errAbort(
"extractData \n"
- " extractData [OPTIONS] db table sample_id genes(s)\n"
+ " extractData [OPTIONS] db table samplesgenes(s)\n"
"options:\n"
" -median Output median value if multiple probes\n"
- " -tcga Handle TCGA ids, keeps only first 16 chars in sample_id\n"
- " -samples=STR Sample comma-separated list, otherwise all sample data are returned.\n"
- "examples:\n"
- "* Single sample / Single gene / All probes:\n"
- " ./extractData -tcga -samples=TCGA-06-0145-01A harvardCGH egfr\n"
- "\n"
- "* Multiple samples / Single gene /Median-value of probes:\n"
- " ./extractData -tcga -median -samples=TCGA-06-0145-01A,TCGA-06-0145-10A harvardCGH egfr\n"
- "\n"
- "* All samples / multiple genes / All probes of all genes:\n"
- " ./extractData harvardCGH egfr,erbb2,esr1\n"
- "\n"
- "* All samples / multiple genes / Median-value of probes:\n"
- " ./extractData harvardCGH egfr,erbb2,esr1\n"
);
}
-#define TCGA_PATIENT_PREFIX 12
-#define TCGA_SAMPLE_PREFIX 16
-
boolean median = FALSE;
-boolean isTCGA = FALSE;
static struct optionSpec options[] = {
{"median", OPTION_BOOLEAN},
- {"tcga", OPTION_BOOLEAN},
- {"samples", OPTION_STRING},
{NULL, 0},
};
struct maGrouping *getMaGrouping(struct sqlConnection *hgConn, char *tableName)
{
/*microarray specific settings*/
struct trackDb *tdb = hMaybeTrackInfo(hgConn, tableName);
struct microarrayGroups *maGs = maGroupings("hg18", tableName);
trackDbFreeList(&tdb);
if (!maGs)
return NULL;
return maGs->allArrays;
}
struct hash *getSettings(char *tableName)
{
struct column *raList = getColumns(NULL, "datasets.ra", NULL);
struct column *col;
struct hash *settings = NULL;
for (col = raList; col; col = col->next)
{
if (!sameString(col->name, tableName))
continue;
settings = col->settings;
break;
}
if (!settings)
errAbort("Couldn't find datasets.ra listing for %s", tableName);
return settings;
}
struct geneAlias {
struct geneAlias *next;
char *gene;
struct slName *probes;
};
struct hash *getAliases(struct sqlConnection *hgConn, char *tableName,
struct slName *genes)
{
if (!hgConn || !tableName || !genes)
return NULL;
struct slName *sl;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy,
"select * from %s where alias in (",
tableName);
for (sl = genes; sl; sl = sl->next)
{
dyStringPrintf(dy, "\"%s\"", sl->name);
if (sl->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ");");
char *query = dyStringCannibalize(&dy);
struct sqlResult *sr = sqlGetResult(hgConn, query);
struct geneAlias *ga, *gaList = NULL;
struct hash *gaHash = hashNew(0);
char **row;
while ((row = sqlNextRow(sr)) != NULL)
{
char *probe = cloneString(row[0]);
char *gene = cloneString(row[1]);
struct hashEl *el = hashLookup(gaHash, gene);
if (!el)
{
ga = AllocA(struct geneAlias);
ga->gene = cloneString(gene);
ga->probes = NULL;
slAddHead(&gaList, ga);
hashAdd(gaHash, gene, ga);
}
else
ga = el->val;
slNameAddHead(&ga->probes, probe);
}
sqlFreeResult(&sr);
return gaHash;
}
-struct sampleVals {
- struct sampleVals *next;
-
- int expId;
- char *name;
- struct slDouble *vals;
- struct hash *valHash;
-};
-
-void setProbeData(struct sqlConnection *hgConn, struct sampleVals *svList,
+void getProbeData(struct sqlConnection *hgConn, struct maGrouping *allA,
char *dataTable, struct slName *probes)
{
struct slName *sl;
struct dyString *dy = newDyString(100);
dyStringPrintf(dy,
"select * from %s where name in(",
dataTable);
for (sl = probes; sl; sl = sl->next)
{
dyStringPrintf(dy, "\"%s\"", sl->name);
if (sl->next)
dyStringPrintf(dy, ",");
}
dyStringPrintf(dy, ");");
char *query = dyStringCannibalize(&dy);
+fprintf(stderr, "query: %s\n", query);
/* Get bed15 data from hg18 database */
struct sqlResult *sr = sqlGetResult(hgConn, query);
+int count = 0;
char **row = NULL;
-struct sampleVals *sv;
-
while ((row = sqlNextRow(sr)) != NULL)
{
struct bed *nb = bedLoadN(row+1, 15);
- for (sv = svList; sv; sv = sv->next)
- {
- if (sv->expId >= nb->expCount)
- continue;
- struct slDouble *sd = slDoubleNew(nb->expScores[sv->expId]);
- slAddHead(&sv->vals, sd);
- hashAdd(sv->valHash, nb->name, sd);
- }
+// nb->expCount
+// nb->expScores
+ count += 1;
bedFree(&nb);
}
sqlFreeResult(&sr);
-}
-
-
-struct sampleVals *prepareSampleVals(char *samples, struct maGrouping *allA)
-{
-boolean allSamples = FALSE;
-struct hash *sampleHash = hashNew(0);
-
-if (!samples)
- allSamples = TRUE;
-else
- {
- struct slName *sl, *slList = slNameListFromComma(samples);
- for (sl = slList; sl; sl = sl->next)
- {
- char *sample = sl->name;
- if (isTCGA)
- sample = cloneStringZ(sl->name, TCGA_SAMPLE_PREFIX);
- hashAddInt(sampleHash, sample, 1);
- }
- }
-
-struct hashEl *el;
-struct sampleVals *sv, *svList = NULL;
-int i;
-for (i = 0; i < allA->size; i++)
- {
- int expId = allA->expIds[i];
- char *maName = allA->names[i];
- if (isTCGA)
- maName = cloneStringZ(allA->names[i], TCGA_SAMPLE_PREFIX);
-
- if (allSamples || (el = hashLookup(sampleHash, maName)) != NULL)
- {
- sv = AllocA(struct sampleVals);
- sv->name = cloneString(allA->names[i]);
- sv->expId = expId;
- sv->vals = NULL;
- sv->valHash = hashNew(0);
-
- slAddHead(&svList, sv);
- }
-
- }
-slReverse(&svList);
-
-return svList;
-}
-
-void printProbeData(struct geneAlias *ga, struct sampleVals *svList)
-{
-struct sampleVals *sv;
-struct dyString *dy = newDyString(100);
-
-if (median)
- {
- dyStringPrintf(dy, "%s\tmedian\t", ga->gene);
- for (sv = svList; sv; sv = sv->next)
- {
- double val = slDoubleMedian(sv->vals);
- dyStringPrintf(dy, "%f", val);
- if (sv->next)
- dyStringPrintf(dy, "\t");
- }
- dyStringPrintf(dy, "\n");
- }
-else
- {
- struct hashEl *el;
- struct slName *sl;
- for (sl = ga->probes; sl; sl = sl->next)
- {
- dyStringPrintf(dy, "%s\t%s\t", ga->gene, sl->name);
- for (sv = svList; sv; sv = sv->next)
- {
- if ((el = hashLookup(sv->valHash, sl->name)) != NULL)
- {
- struct slDouble *sd = el->val;
- dyStringPrintf(dy, "%f", sd->val);
- }
- else
- dyStringPrintf(dy, "NA");
-
- if (sv->next)
- dyStringPrintf(dy, "\t");
- }
- dyStringPrintf(dy, "\n");
- }
- }
-
-char *valStr = dyStringCannibalize(&dy);
-printf("%s", valStr);
-}
-
-void clearProbeData(struct sampleVals *svList)
-{
-struct sampleVals *sv;
-for (sv = svList; sv; sv = sv->next)
- {
- slFreeList(&sv->vals);
- hashFree(&sv->valHash);
-
- sv->vals = NULL;
- sv->valHash = hashNew(0);
- }
+fprintf(stderr, "found %d probes\n", count);
}
-
-void extractData(char *tableName, char *geneList, char *samples)
+void extractData(char *tableName, char *sampleName, char *geneList)
{
if (!geneList)
return;
struct slName *genes = slNameListFromComma(geneList);
struct hashEl *el;
struct sqlConnection *hgConn = hAllocConnProfile("localDb", hgDb);
/* Set up datasets entry */
struct maGrouping *allA = getMaGrouping(hgConn, tableName);
if (!allA)
errAbort("Could not find maGrouping for %s!", tableName);
struct hash *settings = getSettings(tableName);
+
el = hashLookup(settings, "aliasTable");
if (!el)
errAbort("No aliasTable.\n");
char *aliasTable = cloneString(el->val);
struct hash *gaHash = getAliases(hgConn, aliasTable, genes);
-struct sampleVals *sv, *svList = prepareSampleVals(samples, allA);
-if (!svList)
- errAbort("No samples allocated.\n");
-
-printf("gene\tprobe");
-for (sv = svList; sv; sv = sv->next)
- printf("\t%s", sv->name);
-printf("\n");
-
struct hashCookie cookie = hashFirst(gaHash);
while ((el = hashNext(&cookie)) != NULL)
{
struct geneAlias *ga = el->val;
- setProbeData(hgConn, svList, tableName, ga->probes);
-
- printProbeData(ga, svList);
-
- clearProbeData(svList);
+ getProbeData(hgConn, allA, tableName, ga->probes);
}
hFreeConn(&hgConn);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
-if (argc != 3)
+if (argc != 4)
usage();
if (optionExists("median"))
median = TRUE;
-if (optionExists("tcga"))
- isTCGA = TRUE;
-char *samples = optionVal("samples", NULL);
-extractData(argv[1], argv[2], samples);
+extractData(argv[1], argv[2], argv[3]);
return 0;
}