src/hg/instinct/bioInt2/bioMeta.c 1.1
1.1 2009/03/20 06:06:32 jsanborn
initial commit
Index: src/hg/instinct/bioInt2/bioMeta.c
===================================================================
RCS file: src/hg/instinct/bioInt2/bioMeta.c
diff -N src/hg/instinct/bioInt2/bioMeta.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/instinct/bioInt2/bioMeta.c 20 Mar 2009 06:06:32 -0000 1.1
@@ -0,0 +1,572 @@
+/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "hPrint.h"
+#include "hdb.h"
+#include "dystring.h"
+#include "bioIntDb.h"
+#include "bioIntDriver.h"
+#include "cprob.h"
+#include "hgStatsLib.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+ "bioIntegrator - our learner\n"
+ "usage:\n"
+ " bioIntegrator datasets\n"
+ );
+}
+
+char db[128] = "bioInt";
+
+static struct optionSpec options[] = {
+ {NULL, 0}
+};
+
+struct slName *getAllGenes()
+{
+struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
+
+char query[256];
+safef(query, sizeof(query), "select DISTINCT geneSymbol from kgXref join knownGene on kgXref.kgId = knownGene.name where not (knownGene.chrom like 'chrX%%' or knownGene.chrom like 'chrY%%')");
+
+struct slName *slList = sqlQuickList(biConn, query);
+hFreeConn(&biConn);
+
+return slList;
+}
+
+
+void printAverage(struct biResults *br1, struct biResults *br2,
+ struct slName *probes1, struct slName *probes2,
+ struct slName *samples1, struct slName *samples2, char *gene)
+{
+if (slCount(probes1) == 0 && slCount(probes2) == 0)
+ return;
+
+struct slName *sa, *pr;
+struct slName *da = br1->allDatasets(br1);
+
+fprintf(stdout, "%s\t", gene);
+for (sa = samples1; sa; sa = sa->next)
+ {
+ double count = 0.0;
+ double sum = 0.0;
+ for (pr = probes1; pr; pr = pr->next)
+ {
+ sum += br1->dataForProbeSampleInDataset(br1, pr->name, sa->name, da->name);
+ count += 1.0;
+ }
+ fprintf(stdout, "%f", sum/count);
+
+ if (sa->next)
+ fprintf(stdout, ",");
+ }
+fprintf(stdout, "\t");
+
+da = br1->allDatasets(br1);
+for (sa = samples2; sa; sa = sa->next)
+ {
+ double count = 0.0;
+ double sum = 0.0;
+ for (pr = probes2; pr; pr = pr->next)
+ {
+ sum += br2->dataForProbeSampleInDataset(br2, pr->name, sa->name, da->name);
+ count += 1.0;
+ }
+ fprintf(stdout, "%f", sum/count);
+
+ if (sa->next)
+ fprintf(stdout, ",");
+ }
+
+fprintf(stdout, "\n");
+}
+
+
+struct slDouble *getMetaProbes(struct biResults *br, struct slName *probes,
+ struct slName *samples, char *gene)
+{
+if (!probes)
+ return NULL;
+
+struct slName *da = br->allDatasets(br);
+
+struct slName *sa, *pr;
+
+struct slDouble *md, *mdList = NULL;
+for (sa = samples; sa; sa = sa->next)
+ {
+ double val;
+ struct slDouble *sd, *sdList = NULL;
+ for (pr = probes; pr; pr = pr->next)
+ {
+ val = br->dataForProbeSampleInDataset(br, pr->name, sa->name, da->name);
+ sd = slDoubleNew(val);
+ slAddHead(&sdList, sd);
+ }
+
+ if (slCount(sdList))
+ val = slDoubleMedian(sdList);
+ else
+ val = 0.0;
+
+ md = slDoubleNew(val);
+ slAddHead(&mdList, md);
+ slFreeList(&sdList);
+ }
+slReverse(&mdList);
+
+return mdList;
+}
+
+
+void getMatching(struct slName *list1, struct slName *list2,
+ struct slName **retList1, struct slName **retList2)
+{
+if (!list1 || !list2)
+ return;
+
+fprintf(stderr, "matching: count(list1) = %d, count(list2) = %d\n",
+ slCount(list1), slCount(list2));
+
+struct slName *matched = NULL;
+struct slName *sl1, *rlist1 = NULL;
+struct slName *sl2, *rlist2 = NULL;
+for (sl1 = list1; sl1; sl1 = sl1->next)
+ {
+ char *sh1 = cloneStringZ(sl1->name, 16);
+ for (sl2 = list2; sl2; sl2 = sl2->next)
+ {
+ char *sh2 = cloneStringZ(sl2->name, 16);
+ if (sameString(sh1, sh2))
+ {
+ if (!slNameInList(matched, sh1))
+ {
+ slNameAddHead(&rlist1, sl1->name);
+ slNameAddHead(&rlist2, sl2->name);
+ slNameAddHead(&matched, sh1);
+ }
+ }
+ freeMem(sh2);
+ }
+ freeMem(sh1);
+ }
+slNameFreeList(&matched);
+
+*retList1 = rlist1;
+*retList2 = rlist2;
+}
+
+
+void getMatchingSamples(struct biResults *br1, struct biResults *br2,
+ struct slName **retList1, struct slName **retList2)
+{
+if (!br1 || !br2)
+ return;
+
+struct slName *samples1 = NULL, *samples2 = NULL;
+
+fprintf(stderr, "getting first list of samples\n");
+struct slName *list1 = br1->allSamples(br1);
+fprintf(stderr, "getting second list of samples\n");
+struct slName *list2 = br2->allSamples(br2);
+getMatching(list1, list2, &samples1, &samples2);
+
+if (!samples1 || !samples2)
+ errAbort("no matching samples");
+if (slCount(samples1) != slCount(samples2))
+ errAbort("sample sets different lengths.");
+
+fprintf(stderr, "found matching samples, count = %d", slCount(samples1));
+*retList1 = samples1;
+*retList2 = samples2;
+}
+
+
+struct hash *getGeneLevelMeta(struct biResults *br, struct slName *samples)
+{
+if (!br)
+ return NULL;
+
+struct hash *hash = hashNew(0);
+struct slName *sl, *allGenes = getAllGenes();
+
+for (sl = allGenes; sl; sl = sl->next)
+ {
+ struct slName *pList = br->probesForGene(br, sl->name);
+ struct slDouble *metaScores = getMetaProbes(br, pList, samples, sl->name);
+
+ hashAdd(hash, sl->name, metaScores);
+ slNameFreeList(&pList);
+ }
+
+return hash;
+}
+
+struct slDouble *getMetaList(struct slDouble *list1, struct slDouble *list2)
+{
+struct slDouble *sd1, *sd2, *md, *mdList = NULL;
+
+if (list1 && list2)
+ if (slCount(list1) != slCount(list2))
+ errAbort("lists are not equal length!");
+
+sd1 = list1;
+sd2 = list2;
+while (sd1 || sd2)
+ {
+ struct slDouble *tmp1 = NULL;
+ if (sd1)
+ tmp1 = slDoubleNew(sd1->val);
+ struct slDouble *tmp2 = NULL;
+ if (sd2)
+ tmp2 = slDoubleNew(sd2->val);
+
+ struct slDouble *tmpList = NULL;
+ if (tmp1)
+ slAddHead(&tmpList, tmp1);
+
+ if (tmp2)
+ slAddHead(&tmpList, tmp2);
+
+ float chi2, metaP;
+ if (fishersMetaSigned(tmpList, &chi2, &metaP))
+ {
+// metaP = fabs(metaP);
+
+ md = slDoubleNew(metaP);
+ if (fabs(metaP) > 10.0)
+ {
+ struct slDouble *sd;
+ for (sd = tmpList; sd; sd = sd->next)
+ fprintf(stderr, "%f\n", sd->val);
+ fprintf(stderr, "meta = %f, chi2 = %f", metaP, chi2);
+ }
+ }
+ else
+ md = slDoubleNew(0.0);
+
+
+ fprintf(stdout, "%f", md->val);
+
+ if (sd1)
+ {
+ if (sd1->next)
+ fprintf(stdout, ",");
+ }
+ else if (sd2)
+ {
+ if (sd2->next)
+ fprintf(stdout, ",");
+ }
+
+ slAddHead(&mdList, md);
+ slFreeList(&tmpList);
+
+ if (sd1)
+ sd1 = sd1->next;
+
+ if (sd2)
+ sd2 = sd2->next;
+ }
+
+slReverse(&mdList);
+return mdList;
+}
+
+struct hash *combineMetaGenes(struct hash *hash1, struct hash *hash2)
+{
+if (!hash1 || !hash2)
+ return NULL;
+
+struct slDouble *list1, *list2, *metaList;
+struct hash *hash = hashNew(0);
+struct slName *sl, *allGenes = getAllGenes();
+
+for (sl = allGenes; sl; sl = sl->next)
+ {
+ struct hashEl *el = hashLookup(hash1, sl->name);
+ if (!el)
+ list1 = NULL;
+ else
+ list1 = el->val;
+
+ el = hashLookup(hash2, sl->name);
+ if (!el)
+ list2 = NULL;
+ else
+ list2 = el->val;
+
+ fprintf(stdout, "%s\t", sl->name);
+ metaList = getMetaList(list1, list2);
+ fprintf(stdout, "\n");
+ hashAdd(hash, sl->name, metaList);
+ }
+exit(0);
+
+return hash;
+}
+
+void outputMetaGeneset(struct hash *hash, char *name, struct slName *gList,
+ struct slName *samples)
+{
+if (!hash || !gList)
+ return;
+
+struct hashEl *el;
+struct hash *sampleData = hashNew(0);
+struct slDouble *sd, *sdList;
+struct slName *sl, *sa;
+for (sl = gList; sl; sl = sl->next)
+ {
+ el = hashLookup(hash, sl->name);
+ if (!el)
+ continue;
+
+ sdList = el->val;
+// if (!sdList)
+// fprintf(stderr, "%s sdList == NULL\n", sl->name);
+
+ for (sa = samples, sd = sdList; sa && sd; sa = sa->next, sd = sd->next)
+ {
+ el = hashLookup(sampleData, sa->name);
+ struct slDouble *newSd;
+ if (!el)
+ {
+ newSd = slDoubleNew(sd->val);
+ hashAdd(sampleData, sa->name, newSd);
+ }
+ else
+ {
+ struct slDouble *list = el->val;
+ newSd = slDoubleNew(sd->val);
+ slAddTail(&list, newSd);
+ }
+ }
+ }
+
+float chi2, metaP;
+struct slDouble *md, *mdList = NULL;
+fprintf(stdout, "%s\t", name);
+for (sa = samples; sa; sa = sa->next)
+ {
+ el = hashLookup(sampleData, sa->name);
+ if (!el) // no genes found for geneset
+ {
+ fprintf(stdout, "none");
+ break;
+ }
+ sdList = el->val;
+
+ if (fishersMetaSigned(sdList, &chi2, &metaP))
+ {
+ metaP = fabs(metaP);
+
+ md = slDoubleNew(metaP);
+ slAddHead(&mdList, md);
+ fprintf(stdout, "%f", metaP);
+ }
+ else
+ {
+ md = slDoubleNew(0.0);
+ slAddHead(&mdList, md);
+ fprintf(stdout, "0");
+ }
+
+ if (sa->next)
+ fprintf(stdout, ",");
+ }
+
+if (fishersMetaSigned(mdList, &chi2, &metaP))
+ fprintf(stdout, "\t%f", metaP);
+else
+ fprintf(stdout, "\t0.0");
+fprintf(stdout, "\n");
+
+slFreeList(&mdList);
+
+hashFreeWithVals(&sampleData, slFreeList);
+}
+
+
+void calculatePathwayMeta(struct hash *hash, struct slName *samples)
+{
+if (!hash)
+ errAbort("can't calculate pathway meta, hash == NULL");
+
+char query[256];
+safef(query, sizeof(query), "select * from genesets");
+
+struct sqlConnection *conn = hAllocConnProfile("localDb", "pathway");
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = NULL;
+
+struct slName *na, *names = NULL, *me, *members = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+ {
+ char *name = row[0];
+ char *genes = row[1];
+
+ slNameAddHead(&names, name);
+ slNameAddHead(&members, genes);
+ }
+sqlFreeResult(&sr);
+
+for (na = names, me = members; na && me; na = na->next, me = me->next)
+ {
+ struct slName *gList = slNameListFromComma(me->name);
+ outputMetaGeneset(hash, na->name, gList, samples);
+ slNameFreeList(&gList);
+ }
+
+slNameFreeList(&names);
+slNameFreeList(&members);
+}
+
+void testMeta(void)
+{
+float chi2, prob;
+
+struct slDouble *sd, *sdList = NULL;
+sd = slDoubleNew(-log(0.000000000000001)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.5)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.9)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.4)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.1)/log(10.0));
+slAddHead(&sdList, sd);
+fishersMeta(sdList, &chi2, &prob);
+fprintf(stderr, "prob = %f, chi2 = %f\n", prob, chi2);
+
+slFreeList(&sdList);
+sdList = NULL;
+
+sd = slDoubleNew(-log(1.0)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.000001)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.00001)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.000001)/log(10.0));
+slAddHead(&sdList, sd);
+sd = slDoubleNew(-log(0.0001)/log(10.0));
+slAddHead(&sdList, sd);
+
+fishersMeta(sdList, &chi2, &prob);
+fprintf(stderr, "prob = %f, chi2 = %f\n", prob, chi2);
+
+
+fprintf(stderr, "ndtr(3.0) = %f, ndtr(-3.0) = %f\n", ndtr(3.0), ndtr(-3.0));
+}
+
+
+void getData(char *dataset)
+{
+if (DEBUG)
+ uglyTime(NULL);
+struct biQuery *bq = biQueryNew(db, "ucsfNeveCGH");
+bq->getAllProbes = TRUE;
+//bq->addFeatureVals(bq, "SAMPLETYPE = 1", ','); // TODO: Separate by T1 timepoint
+
+fprintf(stderr, "getting results\n");
+struct biResults *br1 = biQueryResults(bq);
+fprintf(stderr, "got results\n");
+if (DEBUG)
+ uglyTime("Got original set #1");
+
+fprintf(stderr, "getting log p\n");
+br1->toLogP(br1);
+fprintf(stderr, "got log p\n");
+if (DEBUG)
+ uglyTime("Converted to -log(p)");
+
+fprintf(stderr, "freeing bq\n");
+biQueryFree(&bq);
+fprintf(stderr, "free'd bq\n");
+
+if (DEBUG)
+ uglyTime(NULL);
+fprintf(stderr, "getting new bq\n");
+bq = biQueryNew(db, "ucsfNeveExp");
+fprintf(stderr, "got bq\n");
+bq->getAllProbes = TRUE;
+//bq->addFeatureVals(bq, "SAMPLETYPE = 1", ','); // TODO: Separate by T1 timepoint
+
+fprintf(stderr, "getting results\n");
+struct biResults *br2 = biQueryResults(bq);
+fprintf(stderr, "got results\n");
+if (DEBUG)
+ uglyTime("Got original set #2");
+
+br2->toLogP(br2);
+if (DEBUG)
+ uglyTime("Converted to -log(p)");
+
+biQueryFree(&bq);
+
+fprintf(stderr, "getting samples\n");
+struct slName *samples1 = NULL, *samples2 = NULL;
+getMatchingSamples(br1, br2, &samples1, &samples2);
+
+fprintf(stderr, "calculating gene level meta values for first dataset\n");
+struct hash *hash1 = getGeneLevelMeta(br1, samples1);
+
+fprintf(stderr, "calculating gene level meta values for second dataset\n");
+struct hash *hash2 = getGeneLevelMeta(br2, samples2);
+
+struct slName *sa;
+fprintf(stderr, "samples1:\n");
+for (sa = samples1; sa; sa = sa->next)
+ {
+ fprintf(stderr, "%s", sa->name);
+ if (sa->next)
+ fprintf(stderr, ",");
+ }
+fprintf(stderr, "\n");
+
+fprintf(stderr, "samples2:\n");
+for (sa = samples2; sa; sa = sa->next)
+ {
+ fprintf(stderr, "%s", sa->name);
+ if (sa->next)
+ fprintf(stderr, ",");
+ }
+fprintf(stderr, "\n");
+
+biResultsFree(&br1);
+biResultsFree(&br2);
+
+fprintf(stderr, "calculating meta across both datasets\n");
+struct hash *hash = combineMetaGenes(hash1, hash2);
+
+hashFreeWithVals(&hash1, slFreeList);
+hashFreeWithVals(&hash2, slFreeList);
+
+fprintf(stderr, "calculating pathway level meta\n");
+calculatePathwayMeta(hash, samples1);
+
+fprintf(stderr, "done\n");
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 2)
+ usage();
+
+srand ( time(NULL) );
+
+getData(argv[1]);
+
+return 0;
+}