src/hg/instinct/bioInt2/bioMeta.c 1.1

1.1 2009/03/20 06:06:32 jsanborn
initial commit
Index: src/hg/instinct/bioInt2/bioMeta.c
===================================================================
RCS file: src/hg/instinct/bioInt2/bioMeta.c
diff -N src/hg/instinct/bioInt2/bioMeta.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/instinct/bioInt2/bioMeta.c	20 Mar 2009 06:06:32 -0000	1.1
@@ -0,0 +1,572 @@
+/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "hPrint.h"
+#include "hdb.h"  
+#include "dystring.h"
+#include "bioIntDb.h"
+#include "bioIntDriver.h"
+#include "cprob.h"
+#include "hgStatsLib.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+	 "bioIntegrator - our learner\n"
+	 "usage:\n"
+	 "   bioIntegrator datasets\n"
+	 );
+}
+
+char db[128] = "bioInt";
+
+static struct optionSpec options[] = {
+    {NULL, 0}
+}; 
+
+struct slName *getAllGenes()
+{
+struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
+
+char query[256];
+safef(query, sizeof(query), "select DISTINCT geneSymbol from kgXref join knownGene on kgXref.kgId = knownGene.name where not (knownGene.chrom like 'chrX%%' or knownGene.chrom like 'chrY%%')");
+
+struct slName *slList = sqlQuickList(biConn, query);
+hFreeConn(&biConn);
+
+return slList;
+}
+
+
+void printAverage(struct biResults *br1, struct biResults *br2, 
+		  struct slName *probes1, struct slName *probes2, 
+		  struct slName *samples1, struct slName *samples2, char *gene)
+{
+if (slCount(probes1) == 0 && slCount(probes2) == 0)
+    return;
+
+struct slName *sa, *pr;
+struct slName *da = br1->allDatasets(br1);
+
+fprintf(stdout, "%s\t", gene);
+for (sa = samples1; sa; sa = sa->next)
+    {
+    double count = 0.0;
+    double sum = 0.0;
+    for (pr = probes1; pr; pr = pr->next)
+	{
+	sum += br1->dataForProbeSampleInDataset(br1, pr->name, sa->name, da->name);
+	count += 1.0;
+	}
+    fprintf(stdout, "%f", sum/count);
+    
+    if (sa->next)
+	fprintf(stdout, ",");
+    }
+fprintf(stdout, "\t");
+
+da = br1->allDatasets(br1);
+for (sa = samples2; sa; sa = sa->next)
+    {
+    double count = 0.0;
+    double sum = 0.0;
+    for (pr = probes2; pr; pr = pr->next)
+	{
+	sum += br2->dataForProbeSampleInDataset(br2, pr->name, sa->name, da->name);
+	count += 1.0;
+	}
+    fprintf(stdout, "%f", sum/count);
+    
+    if (sa->next)
+	fprintf(stdout, ",");
+    }
+
+fprintf(stdout, "\n");
+}
+
+
+struct slDouble *getMetaProbes(struct biResults *br, struct slName *probes, 
+			       struct slName *samples, char *gene)
+{
+if (!probes)
+    return NULL;
+
+struct slName *da = br->allDatasets(br);
+
+struct slName *sa, *pr;
+
+struct slDouble *md, *mdList = NULL;
+for (sa = samples; sa; sa = sa->next)
+    {
+    double val;
+    struct slDouble *sd, *sdList = NULL;
+    for (pr = probes; pr; pr = pr->next)
+	{
+	val = br->dataForProbeSampleInDataset(br, pr->name, sa->name, da->name);
+	sd = slDoubleNew(val);
+	slAddHead(&sdList, sd);
+	}
+
+    if (slCount(sdList))
+	val = slDoubleMedian(sdList);
+    else
+	val = 0.0;
+
+    md = slDoubleNew(val);
+    slAddHead(&mdList, md);
+    slFreeList(&sdList);
+    }
+slReverse(&mdList);
+
+return mdList;
+}
+
+
+void getMatching(struct slName *list1, struct slName *list2, 
+		 struct slName **retList1, struct slName **retList2)
+{
+if (!list1 || !list2)
+    return;
+
+fprintf(stderr, "matching: count(list1) = %d, count(list2) = %d\n",
+	slCount(list1), slCount(list2));
+
+struct slName *matched = NULL;
+struct slName *sl1, *rlist1 = NULL;
+struct slName *sl2, *rlist2 = NULL;
+for (sl1 = list1; sl1; sl1 = sl1->next)
+    {
+    char *sh1 = cloneStringZ(sl1->name, 16);
+    for (sl2 = list2; sl2; sl2 = sl2->next)
+	{
+	char *sh2 = cloneStringZ(sl2->name, 16);
+	if (sameString(sh1, sh2))
+	    {
+	    if (!slNameInList(matched, sh1))
+		{
+		slNameAddHead(&rlist1, sl1->name);
+		slNameAddHead(&rlist2, sl2->name);
+		slNameAddHead(&matched, sh1);
+		}
+	    }
+	freeMem(sh2);
+	}
+    freeMem(sh1);
+    }
+slNameFreeList(&matched);
+
+*retList1 = rlist1;
+*retList2 = rlist2;
+}
+
+
+void getMatchingSamples(struct biResults *br1, struct biResults *br2, 
+			struct slName **retList1, struct slName **retList2)
+{
+if (!br1 || !br2)
+    return;
+
+struct slName *samples1 = NULL, *samples2 = NULL;
+
+fprintf(stderr, "getting first list of samples\n");
+struct slName *list1 = br1->allSamples(br1);
+fprintf(stderr, "getting second list of samples\n");
+struct slName *list2 = br2->allSamples(br2);
+getMatching(list1, list2, &samples1, &samples2);
+
+if (!samples1 || !samples2)
+    errAbort("no matching samples");
+if (slCount(samples1) != slCount(samples2))
+    errAbort("sample sets different lengths.");
+
+fprintf(stderr, "found matching samples, count = %d", slCount(samples1));
+*retList1 = samples1;
+*retList2 = samples2;
+}
+
+
+struct hash *getGeneLevelMeta(struct biResults *br, struct slName *samples)
+{
+if (!br)
+    return NULL;
+
+struct hash *hash = hashNew(0);
+struct slName *sl, *allGenes = getAllGenes();
+
+for (sl = allGenes; sl; sl = sl->next)
+    {
+    struct slName *pList = br->probesForGene(br, sl->name);
+    struct slDouble *metaScores = getMetaProbes(br, pList, samples, sl->name);
+
+    hashAdd(hash, sl->name, metaScores);
+    slNameFreeList(&pList);
+    }
+
+return hash;
+}
+
+struct slDouble *getMetaList(struct slDouble *list1, struct slDouble *list2)
+{
+struct slDouble *sd1, *sd2, *md, *mdList = NULL;
+
+if (list1 && list2)
+    if (slCount(list1) != slCount(list2))
+	errAbort("lists are not equal length!");
+
+sd1 = list1;
+sd2 = list2;
+while (sd1 || sd2)
+    {
+    struct slDouble *tmp1 = NULL;
+    if (sd1)
+	tmp1 = slDoubleNew(sd1->val);
+    struct slDouble *tmp2 = NULL;
+    if (sd2) 
+	tmp2 = slDoubleNew(sd2->val);
+
+    struct slDouble *tmpList = NULL;
+    if (tmp1)
+	slAddHead(&tmpList, tmp1);
+
+    if (tmp2)
+	slAddHead(&tmpList, tmp2);
+
+    float chi2, metaP;
+    if (fishersMetaSigned(tmpList, &chi2, &metaP))
+	{	
+//	metaP = fabs(metaP);
+
+	md = slDoubleNew(metaP);
+	if (fabs(metaP) > 10.0)
+	    {
+	    struct slDouble *sd;
+	    for (sd = tmpList; sd; sd = sd->next)
+		fprintf(stderr, "%f\n", sd->val);
+	    fprintf(stderr, "meta = %f, chi2 = %f", metaP, chi2);
+	    }
+	}
+    else
+	md = slDoubleNew(0.0);
+
+
+    fprintf(stdout, "%f", md->val);
+
+    if (sd1)
+	{
+	if (sd1->next)
+	    fprintf(stdout, ",");
+	}
+    else if (sd2)
+	{
+	if (sd2->next)
+	    fprintf(stdout, ",");
+	}                        
+
+    slAddHead(&mdList, md);
+    slFreeList(&tmpList);
+
+    if (sd1)
+	sd1 = sd1->next;
+
+    if (sd2)
+	sd2 = sd2->next;
+    }
+
+slReverse(&mdList);
+return mdList;
+}
+
+struct hash *combineMetaGenes(struct hash *hash1, struct hash *hash2)
+{
+if (!hash1 || !hash2)
+    return NULL;
+
+struct slDouble *list1, *list2, *metaList;
+struct hash *hash = hashNew(0);
+struct slName *sl, *allGenes = getAllGenes();
+
+for (sl = allGenes; sl; sl = sl->next)
+    {
+    struct hashEl *el = hashLookup(hash1, sl->name);
+    if (!el)
+	list1 = NULL;
+    else
+	list1 = el->val;
+
+    el = hashLookup(hash2, sl->name);
+    if (!el)
+	list2 = NULL;
+    else
+	list2 = el->val;
+
+    fprintf(stdout, "%s\t", sl->name);
+    metaList = getMetaList(list1, list2);
+    fprintf(stdout, "\n");
+    hashAdd(hash, sl->name, metaList);
+    }
+exit(0);
+
+return hash;
+}
+
+void outputMetaGeneset(struct hash *hash, char *name, struct slName *gList, 
+		       struct slName *samples)
+{
+if (!hash || !gList)
+    return;
+
+struct hashEl *el;
+struct hash *sampleData = hashNew(0);
+struct slDouble *sd, *sdList;
+struct slName *sl, *sa;
+for (sl = gList; sl; sl = sl->next)
+    {
+    el = hashLookup(hash, sl->name);
+    if (!el)
+	continue;
+
+    sdList = el->val;
+//    if (!sdList)
+//	fprintf(stderr, "%s sdList == NULL\n", sl->name);
+
+    for (sa = samples, sd = sdList; sa && sd; sa = sa->next, sd = sd->next)
+	{
+	el = hashLookup(sampleData, sa->name);
+	struct slDouble *newSd;
+	if (!el)
+	    {
+	    newSd = slDoubleNew(sd->val);
+	    hashAdd(sampleData, sa->name, newSd);
+	    }
+	else
+	    {
+	    struct slDouble *list = el->val;
+	    newSd = slDoubleNew(sd->val);
+	    slAddTail(&list, newSd);
+	    }
+	}
+    }
+
+float chi2, metaP;
+struct slDouble *md, *mdList = NULL;
+fprintf(stdout, "%s\t", name);
+for (sa = samples; sa; sa = sa->next)
+    {
+    el = hashLookup(sampleData, sa->name);
+    if (!el)  // no genes found for geneset 
+	{
+	fprintf(stdout, "none");
+	break;
+	}
+    sdList = el->val;
+
+    if (fishersMetaSigned(sdList, &chi2, &metaP))
+	{
+	metaP = fabs(metaP);
+
+	md = slDoubleNew(metaP);
+	slAddHead(&mdList, md);
+	fprintf(stdout, "%f", metaP);
+	}
+    else
+	{
+	md = slDoubleNew(0.0);
+	slAddHead(&mdList, md);
+	fprintf(stdout, "0");
+	}
+
+    if (sa->next)
+	fprintf(stdout, ",");
+    }
+
+if (fishersMetaSigned(mdList, &chi2, &metaP))
+    fprintf(stdout, "\t%f", metaP);
+else
+    fprintf(stdout, "\t0.0");
+fprintf(stdout, "\n");
+
+slFreeList(&mdList);
+
+hashFreeWithVals(&sampleData, slFreeList);
+}
+
+
+void calculatePathwayMeta(struct hash *hash, struct slName *samples)
+{
+if (!hash)
+    errAbort("can't calculate pathway meta, hash == NULL");
+
+char query[256];
+safef(query, sizeof(query), "select * from genesets");
+
+struct sqlConnection *conn = hAllocConnProfile("localDb", "pathway");
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = NULL;
+
+struct slName *na, *names = NULL, *me, *members = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *name = row[0];
+    char *genes = row[1];
+
+    slNameAddHead(&names, name);
+    slNameAddHead(&members, genes);
+    }
+sqlFreeResult(&sr);
+
+for (na = names, me = members; na && me; na = na->next, me = me->next)
+    {
+    struct slName *gList = slNameListFromComma(me->name);
+    outputMetaGeneset(hash, na->name, gList, samples);
+    slNameFreeList(&gList);
+    }
+
+slNameFreeList(&names);
+slNameFreeList(&members);
+}
+
+void testMeta(void)
+{
+float chi2, prob;
+
+struct slDouble *sd, *sdList = NULL;
+sd = slDoubleNew(-log(0.000000000000001)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.5)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.9)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.4)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.1)/log(10.0));
+slAddHead(&sdList, sd);
+fishersMeta(sdList, &chi2, &prob);
+fprintf(stderr, "prob = %f, chi2 = %f\n", prob, chi2); 
+
+slFreeList(&sdList);
+sdList = NULL;
+
+sd = slDoubleNew(-log(1.0)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.000001)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.00001)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.000001)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.0001)/log(10.0));
+slAddHead(&sdList, sd);
+
+fishersMeta(sdList, &chi2, &prob);
+fprintf(stderr, "prob = %f, chi2 = %f\n", prob, chi2); 
+
+
+fprintf(stderr, "ndtr(3.0) = %f, ndtr(-3.0) = %f\n", ndtr(3.0), ndtr(-3.0));
+}
+
+
+void getData(char *dataset)
+{
+if (DEBUG)
+    uglyTime(NULL);
+struct biQuery *bq = biQueryNew(db, "ucsfNeveCGH");
+bq->getAllProbes = TRUE;
+//bq->addFeatureVals(bq, "SAMPLETYPE = 1", ',');  // TODO: Separate by T1 timepoint
+
+fprintf(stderr, "getting results\n");
+struct biResults *br1 = biQueryResults(bq);
+fprintf(stderr, "got results\n");
+if (DEBUG)
+    uglyTime("Got original set #1");
+
+fprintf(stderr, "getting log p\n");
+br1->toLogP(br1);
+fprintf(stderr, "got log p\n");
+if (DEBUG)
+    uglyTime("Converted to -log(p)");
+
+fprintf(stderr, "freeing bq\n");
+biQueryFree(&bq);
+fprintf(stderr, "free'd bq\n");
+
+if (DEBUG)
+    uglyTime(NULL);
+fprintf(stderr, "getting new bq\n");
+bq = biQueryNew(db, "ucsfNeveExp");
+fprintf(stderr, "got bq\n");
+bq->getAllProbes = TRUE;
+//bq->addFeatureVals(bq, "SAMPLETYPE = 1", ','); // TODO: Separate by T1 timepoint 
+
+fprintf(stderr, "getting results\n");
+struct biResults *br2 = biQueryResults(bq);
+fprintf(stderr, "got results\n");
+if (DEBUG)
+    uglyTime("Got original set #2");
+
+br2->toLogP(br2);
+if (DEBUG)
+    uglyTime("Converted to -log(p)");
+
+biQueryFree(&bq);
+
+fprintf(stderr, "getting samples\n");
+struct slName *samples1 = NULL, *samples2 = NULL;
+getMatchingSamples(br1, br2, &samples1, &samples2);
+
+fprintf(stderr, "calculating gene level meta values for first dataset\n");
+struct hash *hash1 = getGeneLevelMeta(br1, samples1); 
+
+fprintf(stderr, "calculating gene level meta values for second dataset\n");
+struct hash *hash2 = getGeneLevelMeta(br2, samples2); 
+
+struct slName *sa;
+fprintf(stderr, "samples1:\n");
+for (sa = samples1; sa; sa = sa->next)
+    {
+    fprintf(stderr, "%s", sa->name);
+    if (sa->next)
+	fprintf(stderr, ",");
+    }
+fprintf(stderr, "\n");
+
+fprintf(stderr, "samples2:\n");
+for (sa = samples2; sa; sa = sa->next)
+    {
+    fprintf(stderr, "%s", sa->name);
+    if (sa->next)
+	fprintf(stderr, ",");
+    }
+fprintf(stderr, "\n");
+
+biResultsFree(&br1);
+biResultsFree(&br2);
+
+fprintf(stderr, "calculating meta across both datasets\n");
+struct hash *hash = combineMetaGenes(hash1, hash2);
+
+hashFreeWithVals(&hash1, slFreeList);
+hashFreeWithVals(&hash2, slFreeList);
+
+fprintf(stderr, "calculating pathway level meta\n");
+calculatePathwayMeta(hash, samples1);
+
+fprintf(stderr, "done\n");
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 2)
+    usage();
+
+srand ( time(NULL) );
+
+getData(argv[1]);
+
+return 0;
+}