src/hg/instinct/lib/hgHeatmapLib.c 1.64
1.64 2010/02/10 21:35:35 jsanborn
fixed json wrapper to be to spec
Index: src/hg/instinct/lib/hgHeatmapLib.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/lib/hgHeatmapLib.c,v
retrieving revision 1.63
retrieving revision 1.64
diff -b -B -U 1000000 -r1.63 -r1.64
--- src/hg/instinct/lib/hgHeatmapLib.c 22 Oct 2009 18:40:26 -0000 1.63
+++ src/hg/instinct/lib/hgHeatmapLib.c 10 Feb 2010 21:35:35 -0000 1.64
@@ -1,2507 +1,2499 @@
/********************************************************************************/
/* Copyright 2007-2009 -- The Regents of the University of California */
/********************************************************************************/
/* hgHeatmap is a CGI script that produces a web page containing
* a graphic with all chromosomes in genome, and a heatmap or two
* on top of them. This module just contains the main routine,
* the routine that dispatches to various page handlers, and shared
* utility functions. */
#include "common.h"
#include "float.h"
#include "bed.h"
#include "hash.h"
#include "hCommon.h"
#include "hdb.h"
#include "hui.h"
#include "hgHeatmapLib.h"
#include "microarray.h"
#include "ra.h"
#include "heatmapUtility.h"
#include "featuresLib.h"
#include "sortFeatures.h"
#include "filterFeatures.c"
#include "hgStatsLib.h"
#include "cprob.h"
#include "json.h"
static char const rcsid[] = "$Id$";
enum blockTestType { pca, fisher,weightedZ } blockTestType;
char *blockStatTest[3];
enum metaTestType { metaTestFisherMeta,metaTestStoufferMeta,metaTestMudholkarMeta,metaTestSymmUniMeta } metaTestType;
char *metaStatTest[4];
static char *heatMapDbProfile = "localDb"; // database profile to use
struct microarrayGroups *maGroupings(char *database, char *table)
/* Get the settings from the microarrayGrouop.ra files and put them in a convenient struct. */
{
struct microarrayGroups *ret;
char groupings[strlen(table)+strlen("Groups")+1];
safef(groupings, sizeof(groupings), "%s%s", table,"Groups");
char *s = NULL;
struct hash *allGroups;
struct hash *mainGroup;
struct hash *tmpGroup;
struct hash *hashList;
if (groupings == NULL)
return NULL;
hashList = hgReadRa(hGenome(database), database, "hgCgiData",
"microarrayGroups.ra", &allGroups);
if (allGroups == NULL)
return NULL;
AllocVar(ret);
mainGroup = hashMustFindVal(allGroups, groupings);
s = hashMustFindVal(mainGroup, "all");
tmpGroup = hashMustFindVal(allGroups, s);
ret->allArrays = maHashToMaGrouping(tmpGroup);
hashFreeList(&hashList);
return ret;
}
struct genoHeatmap *getCustomHeatmap(struct customTrack *ct)
{
if (!ct)
return NULL;
if (!ct->tdb)
return NULL;
/* Connect to database on local host */
struct genoHeatmap *gh = AllocVar(gh);
struct trackDb *tdb = ct->tdb;
gh->database= "customTrash";
gh->sampleList =NULL;
gh->sampleOrder = NULL;
gh->expIdOrder= NULL;
gh->tDb = tdb;
gh->custom = TRUE;
gh->ct = ct;
/* Setting on heatmap description from datasets.ra file*/
gh->name = cloneString(ct->dbTableName);
gh->shortLabel = cloneString(tdb->shortLabel);
gh->longLabel = cloneString(tdb->longLabel);
if (sameString(tdb->type, "array") && (ct->fieldCount == 15))
gh->dataType = cloneString("bed 15");
else
gh->dataType = cloneString(tdb->type);
gh->accessTable = NULL;
gh->group = "Custom Tracks";
gh->private = FALSE;
/* Settings on patient information from datasets.ra file */
gh->raFile = NULL;
gh->patDb = NULL;
gh->patTable = NULL;
gh->patField = NULL;
gh->sampleField = NULL;
gh->displayNameTable = NULL;
struct hash *tdbHash = trackDbHashSettings(tdb);
gh->platform = cloneString("expression");
if(hashFindVal(tdbHash, "platformType"))
gh->platform = (char *)(hashFindVal(tdbHash, "platformType"));
// probe table
if (hashFindVal(tdbHash, "platform"))
{
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, gh->database);
/* Settings on gene sets from datasets.ra file */
gh->probeTable =(char *)(hashFindVal(tdbHash, "platform"));
if (!sqlTableExists(conn, gh->probeTable))
gh->probeTable= NULL;
hFreeConn(&conn);
}
else
gh->probeTable = NULL;
gh->expScale = 0;
gh->gainFull = 1.0;
gh->gainSet = 1.0;
gh->height = 0;
if (tdbHash)
{ /* Settings on graphic information from datasets.ra file */
if (hashFindVal(tdbHash, "expScale"))
gh->expScale = atof(((char *) hashFindVal(tdbHash, "expScale")));
if (hashFindVal(tdbHash, "gainFull"))
gh->gainFull = atof(((char *) hashFindVal(tdbHash, "gainFull")));
if (hashFindVal(tdbHash, "gainSet"))
gh->gainSet = atof(((char *) hashFindVal(tdbHash, "gainSet")));
if (hashFindVal(tdbHash, "height"))
gh->height = atoi(((char *) hashFindVal(tdbHash, "height")));
}
/* microarray specific settings*/
if (sameWord(gh->dataType,"bed 15"))
{
// struct microarrayGroups *maGs = maGroupings(gh->database, tableName);
// if (!maGs)
// return NULL;
struct maGrouping * allA = maGetGroupingFromCt(ct);
gh->expCount = allA->size;
}
gh->anaResult = NULL;
gh->anaResultHash = NULL;
return gh;
}
/* Get heatmap using database, tableName */
struct genoHeatmap *getHeatmap(struct sqlConnection *conn, char *database, char *tableName,
struct hash *raHash)
{
if (!sqlTableExists(conn, tableName))
return NULL;
/* Connect to database on local host */
struct genoHeatmap *gh = AllocVar(gh);
gh->database= database;
gh->sampleList =NULL;
gh->sampleOrder = NULL;
gh->expIdOrder= NULL;
gh->tDb = hMaybeTrackInfo(conn, tableName);
gh->custom = FALSE;
/* Setting on heatmap description from datasets.ra file*/
gh->name = (char *)(hashMustFindVal(raHash, "name"));
gh->shortLabel = (char *)(hashMustFindVal(raHash, "shortLabel"));
gh->longLabel = (char *)(hashMustFindVal(raHash, "longLabel"));
gh->dataType = (char *)(hashMustFindVal(raHash, "dataType"));
gh->accessTable = (char *)(hashFindVal(raHash, "accessTable"));
if (!sqlTableExists(conn, gh->accessTable))
gh->accessTable= NULL;
gh->local_url = (char *)(hashFindVal(raHash, "local_url"));
gh->group = (char *)(hashFindVal(raHash, "group"));
/* Settings on patient information from datasets.ra file */
gh->raFile = (char *)(hashFindVal(raHash, "raFile"));
gh->patDb =(char *)(hashFindVal(raHash, "patDb"));
gh->patTable =(char *)(hashFindVal(raHash, "patTable"));
gh->patField =(char *)(hashFindVal(raHash, "patField"));
gh->sampleField =(char *)(hashFindVal(raHash, "sampleField"));
/* Settings on gene sets from datasets.ra file */
gh->probeTable =(char *)(hashFindVal(raHash, "aliasTable"));
if (!sqlTableExists(conn, gh->probeTable))
gh->probeTable= NULL;
gh->displayNameTable = (char *)(hashFindVal(raHash, "displayNameTable"));
if (!sqlTableExists(conn, gh->displayNameTable))
gh->displayNameTable = NULL;
/* Platform setting, currently defaults to expression */
if (hashFindVal(raHash, "platform"))
gh->platform = (char *)(hashFindVal(raHash, "platform"));
else
gh->platform = cloneString("expression");
/* Settings on graphic information from datasets.ra file */
if (hashFindVal(raHash, "expScale"))
gh->expScale = atof(((char *) hashFindVal(raHash, "expScale")));
else
gh->expScale = 0;
if (hashFindVal(raHash, "gainFull"))
gh->gainFull = atof(((char *) hashFindVal(raHash, "gainFull")));
else
gh->gainFull = 1.0;
if (hashFindVal(raHash, "gainSet"))
gh->gainSet = atof(((char *) hashFindVal(raHash, "gainSet")));
else
gh->gainSet = 1.0;
if (hashFindVal(raHash, "height"))
gh->height = atoi(((char *) hashFindVal(raHash, "height")));
else if (sameWord(gh->dataType, "bed 15"))
gh->height = 0;
else
errAbort("not correct heatmap height in datasets.ra file for %s\n", tableName);
char *security = (char *)hashFindVal(raHash, "security");
if (!security)
gh->private = TRUE; // default to private tracks.
else
gh->private = !sameString(security, "public");
/* microarray specific settings*/
if (sameWord(gh->dataType,"bed 15"))
{
struct microarrayGroups *maGs = maGroupings(gh->database, tableName);
if (!maGs)
return NULL;
struct maGrouping *allA= maGs->allArrays;
gh->expCount = allA->size;
}
gh->anaResult = NULL;
gh->anaResultHash = NULL;
return gh;
}
/* Deep copy of gh, clean up memory after return pointer */
struct genoHeatmap *cloneHeatmap(struct genoHeatmap *gh)
{
if (!gh)
return NULL;
struct genoHeatmap *newGh = AllocVar(newGh);
newGh->next = NULL;
newGh->name = gh->name;
newGh->shortLabel = gh->shortLabel;
newGh->longLabel = gh->longLabel;
newGh->local_url = gh->local_url;
newGh->accessTable= gh->accessTable;
newGh->database= gh->database;
newGh->dataType = gh->dataType;
newGh->patDb= gh->patDb;
newGh->patTable= gh->patTable;
newGh->sampleField = gh->sampleField;
newGh->height = gh->height;
newGh->gainFull = gh->gainFull;
newGh->gainSet = gh->gainSet;
newGh->expCount = gh->expCount;
newGh->sampleList = slNameCloneList(gh->sampleList);
newGh->sampleOrder= newHash(0);
struct hashCookie cookie = hashFirst(gh->sampleOrder);
struct hashEl *el = hashNext(&cookie);
while (el)
{
char *name =cloneString(el->name);
int val = hashIntVal(gh->sampleOrder,name);
hashAddInt(newGh->sampleOrder,name, val);
el = hashNext(&cookie);
}
newGh->expIdOrder = AllocArray(newGh->expIdOrder, gh->expCount);
int i;
for (i=0; i<gh->expCount; i++)
newGh->expIdOrder[i] = gh->expIdOrder[i];
newGh->tDb= gh->tDb;
return newGh;
}
void setPersonOrder (struct genoHeatmap* gh, struct slName *personList)
/* Set the sampleOrder and sampleList of a specific heatmap to patientList;
* patientList is a list of patient names
* if patientList is null, set to default */
{
if (!personList || !gh->patDb)
{
defaultOrder(gh);
return;
}
struct slName *slPerson = personList;
if (gh->sampleOrder)
freeHash(&gh->sampleOrder);
gh->sampleOrder = hashNew(0);
if (gh->sampleList)
slFreeList(&gh->sampleList);
gh->sampleList = NULL;
if (gh->expIdOrder)
freeMem(gh->expIdOrder);
/* set expIdOrder to default , i.e. an array of -1s, -1 indicates to the drawing code that the sample will not be drawn in heatmap */
AllocArray(gh->expIdOrder, gh->expCount);
int i;
for (i=0; i< gh->expCount; i++)
gh->expIdOrder[i]= -1;
/* get the sampleIds of each person from database */
char *labTable = gh->patTable;
char *key = gh->patField;
char *db = gh->patDb;
char *val = gh->sampleField;
if ((labTable == NULL) || (key == NULL) || (db==NULL))
{
defaultOrder(gh);
return;
}
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, db);
if (!conn) // FIXME: why can this fail??
{
defaultOrder(gh);
return;
}
struct sqlResult *sr;
char **row;
char *sample;
char *patStr = sqlStrFromSlNameList(slPerson);
struct dyString *query=newDyString(2000);
dyStringPrintf(query,
"select %s from %s where %s in (%s) order by FIELD(%s, %s)",
val, labTable, key, patStr, key, patStr);
struct slName *slSample=NULL;
sr = sqlGetResult(conn, query->string);
while ((row = sqlNextRow(sr)) != NULL)
{
sample = row[0];
slNameAddHead(&(slSample),sample);
}
slReverse(&slSample);
sqlFreeResult(&sr);
hFreeConn(&conn);
struct slName *sl, *allSamples = getAllSamples(gh);
if (slCount(allSamples) != slCount(slSample))
{
for (sl = allSamples; sl; sl = sl->next)
{
if (!slNameInList(slSample, sl->name))
slNameAddTail(&slSample, sl->name);
}
}
setSampleOrder (gh, slSample);
}
void setSampleOrder (struct genoHeatmap* gh, struct slName *sampleList)
/* Set the sampleOrder and sampleList of a specific heatmap to sampleList;
* sampleList is a list of sample ids
* if sampleList is null, set to default */
{
if (!sampleList)
{
defaultOrder(gh);
return;
}
if (gh->sampleOrder)
freeHash(&gh->sampleOrder);
gh->sampleOrder = hashNew(0);
if (gh->sampleList)
slFreeList(&gh->sampleList);
gh->sampleList = NULL;
if (gh->expIdOrder)
freeMem(gh->expIdOrder);
/* set expIdOrder to default , i.e. an array of -1s, -1 indicates to the drawing code that the sample will not be drawn in heatmap */
AllocArray(gh->expIdOrder, gh->expCount);
int i;
for (i=0; i< gh->expCount; i++)
gh->expIdOrder[i]= -1;
/*microarray specific settings*/
struct maGrouping *allA= NULL;
if(gh->custom)
allA = maGetGroupingFromCt(gh->ct);
else
{
struct microarrayGroups *maGs = maGroupings(gh->database, gh->name);
if (!maGs)
return;
//struct maGrouping *allA= maGs->allArrays;
allA= maGs->allArrays;
}
int counter = 0;
int expId; // bed15 format expId
char *sample;
struct slName *sl;
for(sl=sampleList; sl; sl=sl->next)
{
sample = sl->name;
expId = -1;
for (i=0; i< allA->size; i++)
{
if (sameString(allA->names[i],sample))
{
expId = allA->expIds[i];
gh->expIdOrder[expId]=counter;
break;
}
}
if (expId == -1)
continue;
if ( hashLookup(gh->sampleOrder, sample) == NULL )
{
hashAddInt(gh->sampleOrder, sample, counter);
slNameAddHead(&(gh->sampleList),sample);
counter++;
}
}
slReverse(&gh->sampleList);
}
/* reset the default order of samples to be displayed */
void defaultOrder(struct genoHeatmap* gh)
{
if (gh->sampleOrder)
freeHash(&gh->sampleOrder);
gh->sampleOrder = hashNew(0);
if (gh->sampleList)
slFreeList(&gh->sampleList);
gh->sampleList = NULL;
if (gh->expIdOrder)
freeMem(gh->expIdOrder);
AllocArray(gh->expIdOrder, gh->expCount);
/* set expIdOrder to default , i.e. an array of -1s, -1 indicates to the drawing code that the sample will not be drawn in heatmap */
int i;
for (i=0; i< gh->expCount; i++)
gh->expIdOrder[i]= -1;
int expId;
char *sample;
//struct microarrayGroups *maGs = NULL;
struct maGrouping *allA= NULL;
if(gh->custom)
allA = maGetGroupingFromCt(gh->ct);
else
{
struct microarrayGroups *maGs = maGroupings(gh->database, gh->name);
if (!maGs)
return;
allA= maGs->allArrays;
//struct maGrouping *allA= maGs->allArrays;
}
for (i=0; i< allA->size; i++)
{
sample = allA->names[i];
expId = allA->expIds[i];
slNameAddHead(&gh->sampleList, sample);
gh->expIdOrder[expId]=i;
hashAddInt(gh->sampleOrder, sample, i);
}
slReverse(&gh->sampleList);
}
/* Return all the samples belong to a heatmap.
* free return list after use
*/
struct slName *getAllSamples(struct genoHeatmap *gh)
{
char *sample;
struct maGrouping *allA= NULL;
if(gh->custom)
allA = maGetGroupingFromCt(gh->ct);
else
{
struct microarrayGroups *maGs = maGroupings(gh->database, gh->name);
if (!maGs)
return NULL;
//struct maGrouping *allA= maGs->allArrays;
allA= maGs->allArrays;
}
struct slName *list = newSlName(NULL);
int i;
for (i=0; i< allA->size; i++)
{
sample = allA->names[i];
slNameAddHead(&list, sample);
}
slReverse(&list);
return list;
}
/* Return all the samples in list belong to a heatmap.
* free return list after use
*/
struct slName *getAllSamplesInDb(struct genoHeatmap *gh, struct slName *list)
{
struct slName *sl;
struct slName *sampleInDb = getAllSamples(gh);
struct slName *retList =NULL;
for (sl = list; sl; sl=sl->next)
if (slNameInList(sampleInDb, sl->name))
slNameAddHead(&retList, sl->name);
slReverse(&retList);
slNameFreeList(&sampleInDb);
return retList;
}
/* Return all only the patients with data in gh
* Free returned list after use
*/
struct slName *getAllPatients (struct genoHeatmap *gh)
{
if (!gh->patDb)
return NULL;
struct slName *sampleList = getAllSamples(gh);
char *sampleStr= sqlStrFromSlNameList(sampleList);
char *db = gh->patDb;
char *labTable = gh->patTable;
char *key = gh->patField;
char *val =gh->sampleField;
struct sqlResult *sr;
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, db);
char **row;
char *person;
struct slName *patList=NULL;
char query[strlen(sampleStr)+512];
safef ( query, sizeof(query), "select %s from %s where %s in (%s)",
key, labTable, val, sampleStr );
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
{
person = row[0];
slNameAddHead(&(patList),person);
}
sqlFreeResult(&sr);
hFreeConn(&conn);
return patList;
}
/* Return all the patients, including those without data in gh*/
struct slName *getAllPatientsInDb (struct genoHeatmap *gh)
{
/* get all patient ids */
char *db = gh->patDb;
char *labTable = gh->patTable;
char *key = gh->patField;
if ((labTable == NULL) || (key == NULL) || (db==NULL))
return NULL;
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, db);
if (!conn)
return NULL;
char query[512];
struct sqlResult *sr;
char **row;
char *person;
struct slName *patientList=NULL;
safef(query, sizeof(query),"select %s from %s ", key, labTable );
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
{
person = row[0];
slNameAddHead(&(patientList),person);
}
sqlFreeResult(&sr);
hFreeConn(&conn);
return patientList;
}
/* Return samples belong to patients in patList, and also have data in the heatmap
*/
struct slName *getSamplesInPatList (struct genoHeatmap *gh, struct slName *patList)
{
if (!patList)
return NULL;
char *db = gh->patDb;
char *labTable = gh->patTable;
char *key = gh->patField;
char *valField = gh->sampleField;
if (!labTable || !key || !db)
return NULL;
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, db);
if (!conn)
return NULL;
char *patStr = sqlStrFromSlNameList(patList);
struct dyString *query=newDyString(2048);
struct sqlResult *sr;
char **row;
char *patient;
dyStringPrintf(query, "select %s from %s where %s in (%s) order by FIELD(%s,%s)",
valField, labTable, key, patStr, key, patStr);
struct slName *slSample =newSlName(NULL);
sr = sqlGetResult(conn, query->string);
while ((row = sqlNextRow(sr)) != NULL)
{
patient = row[0];
slNameAddHead(&(slSample),patient);
}
slReverse(&slSample);
sqlFreeResult(&sr);
freeDyString(&query);
hFreeConn(&conn);
return slSample;
}
struct slSample {
struct slSample *next;
char *name;
int index;
double val;
double count;
};
int slSampleCmpVal(const void *va, const void *vb)
{
const struct slSample *a = *((struct slSample **)va);
const struct slSample *b = *((struct slSample **)vb);
float dif = a->val/a->count - b->val/b->count;
if (dif < 0)
return -1;
else if (dif > 0)
return 1;
else
return 0;
}
struct slName *samplesSortedByChromPos(struct genoHeatmap *gh, char *chrom,
int start, int stop, int sortDir)
{
if (!gh)
return NULL;
struct bed *nb, *ghBed = getChromHeatmapRange(gh, chrom, start, stop, FALSE);
if (!ghBed)
return NULL;
defaultOrder(gh); // needed to populate gh->sampleList;
struct slName *sl;
struct slSample *ss, *ssList = NULL;
int i = 0;
for (sl = gh->sampleList; sl; sl = sl->next)
{
AllocVar(ss);
ss->name = cloneString(sl->name);
ss->index = i;
ss->val = 0.0;
ss->count = 0.0;
i++;
slAddHead(&ssList, ss);
}
slReverse(&ssList);
for (nb = ghBed; nb; nb = nb->next)
{
for(ss = ssList; ss; ss = ss->next)
{
double val = nb->expScores[ss->index];
ss->val += val;
ss->count += 1.0;
}
}
bedFreeList(&ghBed);
slSort(&ssList, slSampleCmpVal);
if (sortDir > 0)
slReverse(&ssList);
struct slName *sampleList = NULL;
for (ss = ssList; ss; ss = ss->next)
{
sl = slNameNew(ss->name);
slAddHead(&sampleList, sl);
}
return sampleList;
}
struct slName *samplesSortedByGene(struct genoHeatmap *gh, char *gene, int sortDir)
{
if (!gh || !gene)
return NULL;
defaultOrder(gh); // needed to populate gh->sampleList;
struct slName *sl;
struct slSample *ss, *ssList = NULL;
int i = 0;
for (sl = gh->sampleList; sl; sl = sl->next)
{
AllocVar(ss);
ss->name = cloneString(sl->name);
ss->index = i;
ss->val = 0.0;
ss->count = 0.0;
i++;
slAddHead(&ssList, ss);
}
slReverse(&ssList);
// To use getChromHeatmapHash, need to use geneSet structure.
struct geneSet *gs = AllocA(struct geneSet);
gs->name = NULL;
gs->genes = slNameNew(gene);
struct hash *geneHash = NULL;
getChromHeatmapHash(&geneHash, gh->database, gh->probeTable,
gh->name, NULL, gs);
struct hashEl *el = hashLookup(geneHash, gene);
if (!el)
return NULL;
while (el)
{
struct bed *nb = el->val;
for(ss = ssList; ss; ss = ss->next)
{
double val = nb->expScores[ss->index];
ss->val += val;
ss->count += 1.0;
}
el = el->next;
}
slSort(&ssList, slSampleCmpVal);
if (sortDir > 0)
slReverse(&ssList);
struct slName *sampleList = NULL;
for (ss = ssList; ss; ss = ss->next)
{
sl = slNameNew(ss->name);
slAddHead(&sampleList, sl);
}
return sampleList;
}
/* Return an array for reordering the experiments
If the order has not been set, then use function setBedOrder to set
*/
int *getBedOrder(struct genoHeatmap *gh)
{
if (gh->expIdOrder == NULL)
sortPersonOrder(gh);
return gh->expIdOrder;
}
/* Sort and set the sample orders according to feature configuration
*/
void sortPersonOrder(struct genoHeatmap *gh)
{
if (!sameWord(gh->dataType, "bed 15"))
return;
/* Get all patients */
struct slName *patientList = getAllPatients(gh);
if (!patientList)
{
defaultOrder(gh);
return;
}
/* get colList */
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, gh->patDb);
char *raName = gh->raFile;
struct column *colList = getColumns(conn, raName, gh->patDb);
/* sort patients */
struct slName *sortList = sortPatients(conn, colList, patientList);
/* set ordering in gh */
setPersonOrder(gh, sortList);
hFreeConn(&conn);
}
/* Get gene sets from database name is the gene set group name */
struct geneSet *getPathways(char *db, char *pathwayNames)
{
char **row = NULL;
struct dyString *query = newDyString(20000);
if (pathwayNames == NULL)
{
dyStringPrintf(query, "select genesets.name, members,displayName from genesets, pathwayname where genesets.name=pathwayname.name");
}
else
{
struct slName *slList = slNameListFromComma(pathwayNames);
char *setNamesStr = sqlStrFromSlNameList(slList);
dyStringPrintf(query, "select DISTINCT genesets.name, members, displayName from genesets, pathwayname where genesets.name=pathwayname.name and genesets.name in (%s) order by FIELD(genesets.name, %s) ", setNamesStr, setNamesStr);
}
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, db);
struct sqlResult *sr = sqlGetResult(conn, query->string);
dyStringFree(&query);
struct geneSet *geneSets = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
char *members = cloneString(row[1]);
struct slName *slList = slNameListFromComma(members);
struct geneSet *gs = AllocA(struct geneSet);
gs->genes = slList;
gs->name = cloneString(row[0]);
gs->displayName = cloneString(row[2]);
gs->numGenes = slCount(slList);
gs->numGenesActive = 0;
gs->x = 0;
gs->y = 0;
gs->width = 0;
gs->pixelsPerGene = 0;
slAddHead(&geneSets, gs);
}
slReverse(&geneSets);
sqlFreeResult(&sr);
hFreeConn(&conn);
return geneSets;
}
/* Get genes from database */
struct slName *getAllGenes (char *db, char *tableName)
{
if (tableName == NULL)
return NULL;
char **row = NULL;
struct dyString *query = newDyString(20000);
dyStringPrintf(query, "select distinct alias from %s", tableName);
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, db);
struct sqlResult *sr = sqlGetResult(conn, query->string);
dyStringFree(&query);
struct slName *genes = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
char *gene = row[0];
slNameAddHead(&genes, gene);
}
slReverse(&genes);
sqlFreeResult(&sr);
hFreeConn(&conn);
return genes;
}
struct bed *getBedGraph(struct genoHeatmap *gh, char* chrom, int nField)
/* get bedGraph data for each chromosome */
{
if (!gh)
return NULL;
/* get the data from the database */
char *database = gh->database;
char *tableName = gh->name;
/* get the data from the database */
char **row = NULL;
char query[512];
query[0] = '\0';
if (chrom)
{
safef(query, sizeof(query),
"select * from %s where chrom = \"%s\" \n",
tableName, chrom);
}
else
{ /* No chromosome specified, default to full genome */
safef(query, sizeof(query),
"select * from %s \n",
tableName);
}
struct sqlConnection *conn;
conn = hAllocConnProfile(heatMapDbProfile, database);
struct sqlResult *sr = sqlGetResult(conn, query);
struct bed *tuple = NULL;
struct bed *tupleList = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
tuple = bedLoadN(row+1,nField);
slAddHead(&tupleList, tuple);
}
sqlFreeResult(&sr);
hFreeConn(&conn);
return tupleList;
}
struct bed *getBedGraphRange(struct genoHeatmap *gh, char* chrom,
int start, int stop, int nField)
/* get bedGraph data for each chromosome */
{
if (!gh)
return NULL;
/* get the data from the database */
char *database = gh->database;
char *tableName = gh->name;
/* get the data from the database */
char **row = NULL;
char query[512];
query[0] = '\0';
safef(query, sizeof(query),
"select * from %s where chrom = '%s' and chromStart<%d and chromEnd>%d",
tableName, chrom, stop,start);
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, database);
struct sqlResult *sr = sqlGetResult(conn, query);
struct bed *tuple = NULL;
struct bed *tupleList = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
tuple = bedLoadN(row+1,nField);
slAddHead(&tupleList, tuple);
}
sqlFreeResult(&sr);
hFreeConn(&conn);
return tupleList;
}
/* Get the bed 15 data from database for heatmap gh
* if useAccessTable, only retrive data from the down sampled table*/
struct bed *getChromHeatmap(struct genoHeatmap *gh, char *chromName, boolean useAccessTable)
{
if (gh == NULL)
return NULL;
/* get the data from the database */
char *database =gh->database;
char *tableName;
if (gh->accessTable && useAccessTable)
tableName = gh->accessTable;
else
tableName = gh->name;
char **row = NULL;
char query[512];
query[0] = '\0';
if (chromName)
{
safef(query, sizeof(query),
"select * from %s where chrom = \"%s\" \n",
tableName, chromName);
}
else
{ /* No chromosome specified, default to full genome */
safef(query, sizeof(query),
"select * from %s \n",
tableName);
}
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, database);
struct sqlResult *sr = sqlGetResult(conn, query);
struct bed *tuple = NULL;
struct bed *tupleList = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
tuple = bedLoadN(row+1, 15);
slAddHead(&tupleList, tuple);
}
sqlFreeResult(&sr);
hFreeConn(&conn);
return tupleList;
}
/* Get the bed 15 data from database for heatmap gh
* if useAccessTable, only retrive data from the down sampled table*/
struct bed *getChromHeatmapRange(struct genoHeatmap *gh, char *chrom,
int start, int stop, boolean useAccessTable)
{
if (!gh)
return NULL;
/* get the data from the database */
char *database =gh->database;
char *tableName;
if (gh->accessTable && useAccessTable)
tableName = gh->accessTable;
else
tableName = gh->name;
char **row = NULL;
char query[256];
safef(query, sizeof(query),
"select * from %s where chrom = '%s' and chromStart<%d and chromEnd>%d",
tableName, chrom, stop,start);
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, database);
struct sqlResult *sr = sqlGetResult(conn, query);
struct bed *tuple = NULL;
struct bed *tupleList = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
tuple = bedLoadN(row+1, 15);
slAddHead(&tupleList, tuple);
}
sqlFreeResult(&sr);
hFreeConn(&conn);
return tupleList;
}
void getChromHeatmapHash(struct hash **geneHash, char *geneDb, char *probeTable,
char *tableName, char *chromName, struct geneSet *geneSets)
/* Get the bed15 data of gneneSets
You can add probe hash information to geneHash accumulatively */
{
if (geneDb == NULL || tableName == NULL || probeTable == NULL || geneSets == NULL)
return;
struct geneSet *gs;
struct slName *nameList=NULL, *name=NULL;
struct slName *sl;
for (gs = geneSets; gs ; gs = gs->next)
{
for (sl = gs->genes; sl; sl = sl->next)
{
name = newSlName(sl->name);
slAddHead(&nameList, name);
}
}
char *geneList = sqlStrFromSlNameList(nameList);
/* get the data from the database */
char **row = NULL;
char query[strlen(geneList)+1000];
query[0] = '\0';
if (chromName)
{
/* the following is an out of date query */
safef(query, sizeof(query),
"select DISTINCT * from %s, %s where %s.name=%s.name and %s.alias in (%s) and chrom = \"%s\" \n",
tableName, probeTable, tableName, probeTable,
probeTable, geneList, chromName);
}
else
{
/* No chromosome specified, default to full genome */
safef(query, sizeof(query),
"select DISTINCT * from %s, %s where %s.name=%s.name and %s.alias in (%s)\n",
tableName, probeTable, tableName, probeTable,
probeTable, geneList);
}
struct sqlConnection *conn = hAllocConnProfile(heatMapDbProfile, geneDb);
struct sqlResult *sr = sqlGetResult(conn, query);
struct bed *tuple = NULL;
if (!(*geneHash))
*geneHash = hashNew(0);
while ((row = sqlNextRow(sr)) != NULL)
{
tuple = bedLoadN(row+1, 15);
char *name = *(row+17);
hashAdd(*geneHash, name, tuple);
}
sqlFreeResult(&sr);
hFreeConn(&conn);
free(geneList);
slFreeList(&nameList);
}
/* Get the subset smaple lists as an array of list of samples
* Returned array is created on the heap, clean up memory after use
*/
struct slName **getSubsets(struct genoHeatmap *gh, int subsetNum, char *raName )
{
if (!gh || !subsetNum)
return NULL;
//get basic info of gh
char *labTable = gh->patTable;
char *value = gh->sampleField;
char *key = gh->patField;
char *patDb = gh->patDb;
if ((labTable == NULL) || (key == NULL) || (value == NULL) || (patDb==NULL) )
return NULL;
struct sqlConnection *featureDbConn = hAllocConnProfile(heatMapDbProfile, patDb);
if (!featureDbConn)
return NULL;
struct column *colList = getColumns(featureDbConn,raName,gh->patDb);
struct slName *patList = getAllPatients(gh);
//allocate return array of list of sample names
struct slName **ptArray;
AllocArray(ptArray, subsetNum);
int subset;
for (subset = 0; subset < subsetNum; subset++)
{
struct slName *subPatList =
advFilterResults (colList, featureDbConn, patList, gh->patDb, subset);
struct slName *subSampleList = getSamplesInPatList (gh, subPatList);
struct slName *sl;
struct slName *retList =NULL;
for (sl=subSampleList; sl; sl=sl->next)
{
if (slNameInList(gh->sampleList,sl->name))
slNameAddHead(&retList, sl->name);
}
slReverse(&retList);
ptArray[subset] = retList;
if (subSampleList)
slNameFreeList(&subSampleList);
if (subPatList)
slNameFreeList(&subPatList);
}
-
hFreeConn(&featureDbConn);
if (patList)
slNameFreeList(&patList);
return ptArray;
}
int getSubsetsIfAny(struct genoHeatmap *gh, int subsetNum, char *raName, struct slName ***subsets)
{
struct slName **ptSubsets = getSubsets(gh, subsetNum, raName);
if (!ptSubsets)
return 0;
int i;
int count=0;
for (i=0; i < subsetNum; i++)
if (ptSubsets[i])
count++;
*subsets = ptSubsets;
return count;
}
/* Get the subset of bed 15 data from database for heatmap gh
* Clean up return pointer after use */
struct bed **getSubgroupChromHeatmap(struct genoHeatmap *gh, int subsetNum,
char *raName, char *chromName, boolean useAccess)
{
// get all the data
struct bed *allBed = getChromHeatmap(gh, chromName, useAccess);
// get the subsets
struct slName **ptSubsets = getSubsets(gh, subsetNum, raName);
if (!ptSubsets)
return NULL;
int i;
for (i=0; i< subsetNum; i++)
if (!ptSubsets[i])
jsonWarnAbort("Cannot compute statistics: Subgroup %d not fully defined", i+1);
/* Deep copy of gh, clean up memory after return pointer */
struct genoHeatmap *newGh = cloneHeatmap(gh);
struct bed **ptArray;
AllocArray(ptArray, subsetNum);
int subset;
for (subset =0; subset < subsetNum; subset++)
{
struct slName *sampleSubset = ptSubsets[subset];
//filter the data for each subset
defaultOrder(newGh);
setSampleOrder(newGh, sampleSubset);
int N = slCount(newGh->sampleList);
struct bed *newTupleList = NULL;
struct bed *newTuple =NULL;
struct bed *tuple = allBed;
while (tuple)
{
/* set up the new data */
int counter =0;
int i, id, order;
float score;
int *newIds = AllocArray(newIds,N);
float *newScores = AllocArray(newScores,N);
for (i=0; i< tuple->expCount; i++)
{
id = tuple->expIds[i];
order = newGh->expIdOrder[id];
if (order == -1)
continue;
score = tuple->expScores[i];
newIds[counter]= counter;
newScores[counter]=score;
counter++;
}
newTuple = cloneBed(tuple);
assert(counter == N);
newTuple->expCount = N;
free(newTuple->expIds);
newTuple->expIds = newIds;
free(newTuple->expScores);
newTuple->expScores= newScores;
slAddHead(&newTupleList, newTuple);
tuple = tuple->next;
}
ptArray[subset] = newTupleList;
}
//clean memory
if (allBed)
bedFree(&allBed);
return ptArray;
}
/* Get the subset of bed 15 data from database for heatmap gh
* Clean up return pointer after use */
struct hash **getSubgroupChromHeatmapHash(struct genoHeatmap *gh, int subsetNum,
char *raName,
struct geneSet *geneSets)
{
// get all the data as allHash
char *database = gh->database;
char *probeAliases = gh->probeTable;
char *tableName = gh->name; // no access table at gene set level
struct hash *allHash = NULL;
getChromHeatmapHash(&allHash, database, probeAliases,tableName, NULL, geneSets);
if (!allHash)
return NULL;
// get the subsets
struct slName **ptSubsets = getSubsets(gh, subsetNum, raName);
if (!ptSubsets)
return NULL;
int i;
for (i=0; i< subsetNum; i++)
if (!ptSubsets[i])
jsonWarnAbort("Cannot compute statistics: Subgroup %d not fully defined", i+1);
/* Deep copy of gh, clean up memory after return pointer */
struct genoHeatmap *newGh = cloneHeatmap(gh);
struct hash **ptArray;
AllocArray(ptArray, subsetNum);
int subset;
struct hashEl *hashList = hashElListHash(allHash);
/* Return a list of all elements of hash. Free return with hashElFreeList. */
for (subset =0; subset < subsetNum; subset++)
{
struct slName *sampleSubset = ptSubsets[subset];
//filter the data for each subset
defaultOrder(newGh);
setSampleOrder(newGh, sampleSubset);
int N = slCount(newGh->sampleList);
struct hash *newH = newHash(0);
struct bed *newTuple =NULL;
struct hashEl *hl;
struct bed *tuple;
char *name;
for (hl = hashList; hl; hl=hl->next)
{
tuple = hl->val;
name = hl->name;
/* set up the new data */
int counter =0;
int i, id, order;
float score;
int *newIds = AllocArray(newIds,N);
float *newScores = AllocArray(newScores,N);
for (i=0; i< tuple->expCount; i++)
{
id = tuple->expIds[i];
order = newGh->expIdOrder[id];
if (order == -1)
continue;
score = tuple->expScores[i];
newIds[counter]= counter;
newScores[counter]=score;
counter++;
}
newTuple = cloneBed(tuple);
assert(counter == N);
newTuple->expCount = N;
free(newTuple->expIds);
newTuple->expIds = newIds;
free(newTuple->expScores);
newTuple->expScores= newScores;
hashAdd(newH, name, newTuple);
}
ptArray[subset] = newH;
}
//clean memory
if (allHash)
freeHash(&allHash);
if (hashList)
hashElFreeList(&hashList);
return ptArray;
}
void hgStatsAdjust(struct hgStats *hsList, double corr)
{
if (!hsList)
return;
struct hgStats *hs;
for (hs = hsList; hs; hs = hs->next)
{
hs->prob += corr;
if (hs->prob < 0)
hs->prob = 0;
if (hs->outputVal >= 0)
{
hs->outputVal += corr;
if (hs->outputVal < 0)
hs->outputVal = 0.0;
}
else
{
hs->outputVal -= corr;
if (hs->outputVal > 0)
hs->outputVal = 0.0;
}
}
}
void listAdjustBonferroni(struct analysisResult *arList)
{
if (!arList)
return;
struct analysisResult *ar = arList; // I don't think this is ever a list.
struct hgStats *hsList = ar->stats;
double count = (double) slCount(hsList);
/* correction in -log10, since all p-values are in -log10 */
double corr = -log(count)/log(10.0);
hgStatsAdjust(hsList, corr);
}
void hashAdjustBonferroni(struct analysisResultHash *arHash)
{
if (!arHash)
return;
struct hash *hash = arHash->hash;
if (!hash)
return;
double count = (double) hashNumEntries(hash);
/* correction in -log10, since all p-values are in -log10 */
double corr = -log(count)/log(10.0);
struct hashEl *el;
struct hashCookie hc = hashFirst(hash);
while ((el = hashNext(&hc)) != NULL)
{
struct hgStats *hs = el->val;
hgStatsAdjust(hs, corr);
}
}
void adjustBonferroni(struct genoHeatmap *gh)
{
if (gh->anaResult)
listAdjustBonferroni(gh->anaResult);
else if (gh->anaResultHash)
hashAdjustBonferroni(gh->anaResultHash);
}
/* Calculate the differnce of means between two subsets
* Clean memory of returned pointer after use */
struct analysisResult *diffAveSubgroup (struct genoHeatmap *gh,
int subsetNum, char *raName,
boolean (*func)(float data1[], unsigned long n1,
float data2[], unsigned long n2,
float *r, float *prob),
char *chromName,
boolean useAccess)
{
if (!gh || !func || !subsetNum || (subsetNum !=2) )
return NULL;
struct bed **subBed = getSubgroupChromHeatmap(gh, subsetNum, raName, chromName, useAccess);
if (!subBed)
return NULL;
int i;
for (i=0; i< subsetNum; i++)
if (!subBed[i])
jsonWarnAbort("Cannot compute statistics: Subgroup %d not fully defined", i+1);
struct bed **pt=NULL;
AllocArray(pt, subsetNum);
for (i=0; i< subsetNum; i++)
pt[i] = subBed[i];
struct hgStats *statsList =NULL;
struct hgStats *stats=NULL;
double min = DBL_MAX, max = -DBL_MAX;
char *name;
struct bed *bed = NULL;
struct bed *bed0 = NULL, *bed1 =NULL;
while (pt[0])
{
name = pt[0]->name;
bed = (struct bed*) pt[0];
bed0 = (struct bed*) pt[0];
bed1 = (struct bed*) pt[1];
float r,p;
if (func(bed0->expScores, bed0->expCount, bed1->expScores, bed1->expCount, &r, &p))
{
if (fabs(p) > max)
max = fabs(p);
if (fabs(p) < min)
min = fabs(p);
AllocVar(stats);
stats->chrom = cloneString(bed->chrom);
stats->chromStart = bed->chromStart;
stats->chromEnd = bed->chromEnd;
stats->name = bed->name;
stats->stats = r;
stats->prob = p ;
if (stats->stats < 0)
stats->outputVal = -p;
else
stats->outputVal = p;
slAddHead(&statsList,stats);
}
for (i=0; i< subsetNum; i++)
pt[i]= pt[i]->next;
}
//free memory
for (i=0; i<subsetNum; i++)
bedFree(&subBed[i]);
free(subBed);
free(pt);
if (min == DBL_MAX || max == -DBL_MAX) // no data
return NULL;
struct analysisResult *result = AllocVar(result);
result->stats = statsList;
result->min = min;
result->max = max;
return result;
}
/* Calculate the differnce of means between two subsets
* Clean memory of returned pointer after use */
struct analysisResultHash *diffAveSubgroupHash (struct genoHeatmap *gh,
int subsetNum, char *raName,
struct geneSet *geneSets,
boolean (*func)(float data1[], unsigned long n1,
float data2[], unsigned long n2,
float *r, float *prob) )
{
if (!gh || !func || !subsetNum || (subsetNum !=2) )
return NULL;
struct hash **subHash = getSubgroupChromHeatmapHash(gh, subsetNum,raName, geneSets);
if (!subHash)
jsonWarnAbort("Cannot compute statistics");
int i;
for (i=0; i< subsetNum; i++)
if (!subHash[i])
jsonWarnAbort("Cannot compute statistics: Subgroup %d not fully defined", i+1);
struct hashEl **ptHashStart=NULL;
AllocArray(ptHashStart, subsetNum);
struct hashEl **ptHashEl=NULL;
AllocArray(ptHashEl, subsetNum);
for (i=0; i< subsetNum; i++)
{
ptHashEl[i] = hashElListHash(subHash[i]);
ptHashStart[i] = ptHashEl[i];
}
struct hash *newBed5Hash =newHash(0);
struct hgStats *stats=NULL;
double min = DBL_MAX, max = -DBL_MAX;
char *name;
struct bed *bed = NULL;
struct bed *bed0 = NULL, *bed1 =NULL;
while (ptHashEl[0])
{
name = ptHashEl[0]->name;
bed = (struct bed*) ptHashEl[0]->val;
bed0 = (struct bed*) ptHashEl[0]->val;
bed1 = (struct bed*) ptHashEl[1]->val;
float r,p ;
if (func(bed0->expScores, bed0->expCount , bed1->expScores, bed1->expCount, &r, &p))
{
if (fabs(p) > max)
max =fabs(p);
if (fabs(p) < min)
min =fabs(p);
AllocVar(stats);
stats->chrom = cloneString(bed->chrom);
stats->chromStart = bed->chromStart;
stats->chromEnd = bed->chromEnd;
stats->name = bed->name;
stats->stats = r;
stats->prob = p ;
if (stats->stats<0)
stats->outputVal = -p;
else
stats->outputVal =p;
hashAdd(newBed5Hash, name, stats);
}
for (i=0; i<subsetNum; i++)
ptHashEl[i]= ptHashEl[i]->next;
}
//free memory
for (i=0; i<subsetNum; i++)
{
hashFree(&subHash[i]);
hashElFreeList(&ptHashStart[i]);
}
free(subHash);
if (min == DBL_MAX || max == -DBL_MAX) //no data
return NULL;
struct analysisResultHash *result = AllocVar(result);
result->hash = newBed5Hash;
result->min = min;
result->max = max;
return result;
}
struct analysisResult *runMetaAnalysisChrom (int binSize, struct slRef **resultList,
boolean (*func)(struct slDouble *data,
float *r, float *prob))
{
struct hgStats *statsList =NULL;
double min = DBL_MAX, max = -DBL_MAX;
// binSize used to be basesPerPixel; need to figure out a better way to do that
struct slRef *iter=NULL;
struct hash *statBins = hashNew(0);
for(iter = *resultList; iter; iter= iter->next)
{
//refAdd(statResultList,((struct analysisResult *)(iter->val))->stats);
struct hgStats *statspt = ((struct analysisResult *)(iter->val))->stats;
while(statspt)
{
char key[100];
safef(key,100,"%s_%d",statspt->chrom,(int)(statspt->chromEnd / binSize));
hashAdd(statBins,key,statspt);
statspt = statspt->next;
}
}
struct hashCookie statBinCookie = hashFirst(statBins);
struct hashEl *currEl = NULL;
struct hgStats *stats =NULL;
while((currEl = hashNext(&statBinCookie)) != NULL)
{
struct slDouble *pVals = NULL;
char *hashKey = currEl->name;
char *chrom = strtok(hashKey,"_");
int bin = atoi(strtok(NULL,"_"));
float statSum = 0;
int statCount=0;
do
{
struct slDouble *pVal = AllocVar(pVal);
struct hgStats *statStruct = ((struct hgStats *)(currEl->val));
pVal->val = statStruct->prob;
statSum += statStruct->stats;
statCount++;
slAddHead(&pVals,pVal);
} while((currEl = hashLookupNext(currEl)) != NULL);
slReverse(pVals);
float r,p;
- //fprintf(stderr,"statsum: %f ",statSum);
if (func(pVals, &r, &p))
{
AllocVar(stats);
stats->chrom = cloneString(chrom);
stats->chromStart = bin*binSize;
stats->chromEnd = (bin+1)*binSize;
stats->name = cloneString(hashKey);
stats->stats = statSum / statCount;
stats->prob = p;
if(stats->stats < 0)
stats->outputVal = -p;
else
stats->outputVal = p;
- //fprintf(stderr,"output val = %f ",stats->outputVal);
slAddHead(&statsList,stats);
if(stats->prob < min)
min = stats->prob;
if(stats->prob > max)
max = stats->prob;
- //fprintf(stderr,"Min = %f Max = %f ",min, max);
}
}
struct analysisResult *result= AllocVar(result);
result->stats = statsList;
result->min = min;
result->max = max;
return result;
}
struct analysisResultHash *runMetaAnalysisGS (struct geneSet *geneSets, struct slRef **resultList,
boolean (*func)(struct slDouble *data,
float *r, float *prob))
{
struct hash *newBed5Hash = newHash(0);
double min = DBL_MAX, max = -DBL_MAX;
// binSize used to be basesPerPixel; need to figure out a better way to do that
struct slRef *iter=NULL;
//struct hash *statBins = hashNew(0);
struct hgStats *stats =NULL;
struct geneSet *gs;
for (gs = geneSets; gs ; gs = gs->next)
{
struct slName *sl;
for (sl = gs->genes; sl ; sl = sl->next)
{
char *geneName = cloneString(sl->name);
struct slDouble *pVals = NULL;
float statSum=0;
int statCount=0;
for(iter = *resultList; iter; iter= iter->next)
{
struct hash *statspt = ((struct analysisResultHash *)(iter->val))->hash;
struct hashEl *el = hashLookup(statspt, geneName); //get the information from genesets
while(el)
{
struct hgStats *statStruct = el->val;
struct slDouble *pVal = AllocVar(pVal);
pVal->val = statStruct->prob;
statSum += statStruct->stats;
statCount++;
slAddHead(&pVals,pVal);
el = hashLookupNext(el);
}
}
float r,p;
if (func(pVals, &r, &p))
{
int chromStart=0,chromEnd=0;
char *chrom = "unk";
AllocVar(stats);
stats->chrom = cloneString(chrom);
stats->chromStart = chromStart;
stats->chromEnd = chromEnd;
stats->name = cloneString(geneName);
stats->stats = statSum / statCount;
stats->prob = p;
if(stats->stats < 0)
stats->outputVal = -p;
else
stats->outputVal = p;
- //fprintf(stderr,"output val = %f ",stats->outputVal);
- //slAddHead(&statsList,stats);
hashAdd(newBed5Hash, geneName, stats);
if(stats->prob < min)
min = stats->prob;
if(stats->prob > max)
max = stats->prob;
-
- //fprintf(stderr,"Min = %f Max = %f ",min, max);
}
}
}
struct analysisResultHash *result= AllocVar(result);
result->hash = newBed5Hash;
result->min = min;
result->max = max;
return result;
}
/* Calculate the differnce of means between two subsets
* Clean memory of returned pointer after use */
struct analysisResultHash *diffMetaSubgroupHash (struct genoHeatmap *gh,
int subsetNum, char *raName,
struct geneSet *geneSets, char *tableName,
boolean (*func)(float data1[], unsigned long n1,
float data2[], unsigned long n2,
float *r, float *prob),char *blockStatFunc )
{
if (!gh || !func || !subsetNum || (subsetNum !=2) )
return NULL;
int useFisher = sameWord(blockStatFunc,blockStatTest[fisher]);
//int useFisher = 1;
struct hash **subHash = getSubgroupChromHeatmapHash(gh, subsetNum, raName, geneSets);
if (!subHash)
return NULL;
int i;
for (i=0; i< subsetNum; i++)
if (!subHash[i])
return NULL;
struct hashEl **ptHashStart=NULL;
AllocArray(ptHashStart, subsetNum);
struct hashEl **ptHashEl=NULL;
AllocArray(ptHashEl, subsetNum);
for (i=0; i< subsetNum; i++)
{
ptHashEl[i] = hashElListHash(subHash[i]);
ptHashStart[i] = ptHashEl[i];
}
struct hash *newBed5Hash =newHash(0);
struct hgStats *stats=NULL;
double min = DBL_MAX, max = -DBL_MAX;
char *name;
struct bed *bed = NULL;
struct bed *bed0 = NULL, *bed1 =NULL;
while (ptHashEl[0])
{
name = ptHashEl[0]->name;
bed = (struct bed*) ptHashEl[0]->val;
bed0 = (struct bed*) ptHashEl[0]->val;
bed1 = (struct bed*) ptHashEl[1]->val;
float r,p ;
if (func(bed0->expScores, bed0->expCount , bed1->expScores, bed1->expCount, &r, &p))
{
if (fabs(p) > max)
max =fabs(p);
if (fabs(p) < min)
min =fabs(p);
AllocVar(stats);
stats->chrom = cloneString(bed->chrom);
stats->chromStart = bed->chromStart;
stats->chromEnd = bed->chromEnd;
stats->name = bed->name;
stats->stats = r;
stats->prob = p ;
if (stats->stats<0)
stats->outputVal = -p;
else
stats->outputVal =p;
hashAdd(newBed5Hash, name, stats);
}
for (i=0; i<subsetNum; i++)
ptHashEl[i]= ptHashEl[i]->next;
}
for (i=0; i<subsetNum; i++)
{
hashFree(&subHash[i]);
hashElFreeList(&ptHashStart[i]);
}
free(subHash);
if (min == DBL_MAX || max == -DBL_MAX) //no data
return NULL;
struct analysisResultHash *result = AllocVar(result);
result->hash = newBed5Hash;
result->min = min;
result->max = max;
min =DBL_MAX;
max =-DBL_MAX;
struct hgStats *combStats=NULL;
struct hash *combNewBed5Hash =newHash(0);
struct geneSet *gs;
for (gs = geneSets; gs ; gs = gs->next)
{
struct slName *sl;
double statsCount = 0;
double statsSum = 0;
double probLogSum = 0;
for (sl = gs->genes; sl ; sl = sl->next)
{
struct hashEl *el = hashLookup(result->hash, sl->name); //get the information from genesets
while (el)
{
struct hgStats *val = el->val;
statsCount++;
statsSum += val->stats;
double pVal = pow(10,-1*val->prob);
if(pVal < 0)
pVal = pow(10,-290);
else if(pVal > 1)
pVal = 1-pow(10,-290);
if(useFisher)
probLogSum += -2*log(pVal);
else
probLogSum += zScore(pVal);
el = hashLookupNext(el);
}
}
// to prevent underflow...
if(statsCount > 0 && useFisher)
{
while((statsCount * log(probLogSum/2) - probLogSum/2 - lgam(statsCount)) < -500)
{
probLogSum -= 1;
}
}
if(statsCount > 0)
{
AllocVar(combStats);
combStats->chrom = cloneString("chrXXX"); // put dummy chromosomes, never used
combStats->chromStart = 0;
combStats->chromEnd = 0;
combStats->name = gs->name;
combStats->stats = (statsSum) / statsCount;
if(useFisher)
combStats->prob = log(chdtrc(statsCount*2,probLogSum))/log(10)*-1;
else
combStats->prob = log(ndtr(probLogSum/sqrt(statsCount)))/log(10)*-1;
// truncate the p vals here so we prevent underflow and to prevent
// a single gene from pulling an entire geneset significant
if(combStats->prob > 3)
combStats->prob = 3;
combStats->compIndex = 0;
combStats->totalComps = 1;
if(combStats->prob < min)
min = combStats->prob;
if(combStats->prob > max)
max = combStats->prob;
if (combStats->stats<0)
combStats->outputVal = -combStats->prob;
else
combStats->outputVal =combStats->prob;
hashAdd(combNewBed5Hash, gs->name, combStats);
}
}
if (min == DBL_MAX || max == -DBL_MAX) //no data
return NULL;
struct analysisResultHash *fResult = AllocVar(fResult);
fResult->hash = combNewBed5Hash;
fResult->min = min;
fResult->max = max;
return fResult;
}
/* Calculate the differnce of means between two subsets
* Clean memory of returned pointer after use */
struct analysisResultHash *diffSubgroupHash (struct genoHeatmap *gh,
int subsetNum, char *raName,
struct geneSet *geneSets, char *blockStatFunc,
boolean (*func)(float data1[], unsigned long n1,
float data2[], unsigned long n2,
float *r, float *prob) )
{
if (blockStatFunc &&
(sameWord(blockStatFunc,blockStatTest[fisher])
|| sameWord(blockStatFunc,blockStatTest[weightedZ])))
return diffMetaSubgroupHash(gh, subsetNum, raName, geneSets, gh->name, func,blockStatFunc);
else
return diffAveSubgroupHash(gh, subsetNum, raName, geneSets, func);
}
struct geneSet *rankDiffAvePathways ( struct genoHeatmap *gh, int subsetNum, char *raName, char *chromHeatmap,
struct geneSet *geneSets, FILE *output,
boolean (*func)(float data1[], unsigned long n1, float data2[],
unsigned long n2, float *r, float *prob),char *funcName,
boolean useAccess)
{
if (!gh || !func || !subsetNum || (subsetNum !=2) )
return NULL;
struct analysisResult *ar =
diffAveSubgroup(gh, subsetNum, raName, func, NULL, useAccess);
if (!ar)
return NULL;
char *probeAliases =gh->probeTable;
struct hash *geneHash = NULL;
getChromHeatmapHash(&geneHash,database, probeAliases, chromHeatmap, NULL, geneSets);
if (!geneHash)
return NULL;
struct hgStats *bd, *bedList = ar->stats;
struct hash *bdHash = hashNew(0);
for (bd = bedList; bd; bd = bd->next)
{
hashAdd(bdHash, bd->name, bd);
}
struct slName *sl;
struct geneSet *gs;
int totalCount;
double totalScore;
struct hgStats *setList = NULL;
for (gs = geneSets; gs; gs = gs->next)
{
totalScore = 0.0;
totalCount = 0;
for (sl = gs->genes; sl; sl = sl->next)
{
struct hashEl *el = hashLookup(geneHash, sl->name);
while (el)
{
struct bed *nb = el->val;
char *name = nb->name;
struct hashEl *bEl = hashLookup(bdHash, name);
if (bEl)
{
bd = bEl->val;
// you can also use outputval, stats (t value), or use prob (-log10(probability))
/* perturbation , does not care about direction */
//totalScore += (double) fabs(bd->stats);
/* directional change */
totalScore += (double) (bd->stats);
totalCount++;
}
el = hashLookupNext(el);
}
}
double aveScore = totalScore / (double) totalCount;
if (totalCount == 0)
aveScore = 0.0;
struct hgStats *newHgStats = AllocA(struct hgStats);
newHgStats->name = cloneString(gs->name);
newHgStats->outputVal = aveScore;
slAddHead(&setList, newHgStats);
}
slSort(&setList, hgStatsCmpScore);
slReverse(&setList);
int rank=0;
struct geneSet *newGeneSets = NULL;
if (output)
fprintf(output, "table= %s\t%s\trank\tnumber of genes in pathway\n", gh->name, funcName);
for (bd = setList; bd; bd = bd->next)
{
rank++;
if(output)
fprintf(output,"%s\t%f\t%d\t", bd->name, bd->outputVal, rank);
for (gs = geneSets; gs; gs = gs->next)
{
if (sameString(gs->name, bd->name))
{
if (output)
fprintf(output,"%d\n", slCount(gs->genes));
struct geneSet *ng = AllocA(struct geneSet);
ng->name = cloneString(gs->name);
slAddHead(&newGeneSets, ng);
break;
}
}
}
slReverse(&setList);
return newGeneSets;
}
/*Rank genes based on the differnce of between two subsets */
void rankDiffAveGenes ( struct genoHeatmap *gh,int subsetNum, char *raName, char *chromHeatmap,
struct slName *genes, FILE *output,
boolean (*func)(float data1[], unsigned long n1, float data2[],
unsigned long n2, float *r, float *prob), char *funcName,
boolean useAccess)
{
if (!gh || !func || !subsetNum || (subsetNum !=2) )
return;
struct analysisResult *ar = diffAveSubgroup(gh, subsetNum, raName, func, NULL, useAccess);
if (!ar)
return;
char *probeAliases = gh->probeTable;
struct geneSet *geneSets =AllocA(struct geneSet);
geneSets->genes = genes;
struct hash *geneHash = NULL;
getChromHeatmapHash(&geneHash, database, probeAliases, chromHeatmap, NULL, geneSets);
if (!geneHash)
return;
struct hgStats *bd, *bedList = ar->stats;
struct hash *bdHash = hashNew(0);
for (bd = bedList; bd; bd = bd->next)
{
hashAdd(bdHash, bd->name, bd);
}
struct slName *sl;
struct hgStats *setList = NULL;
for (sl = genes; sl; sl = sl->next)
{
double totalScore =0.0;
int totalCount =0;
struct hashEl *el = hashLookup(geneHash, sl->name);
if (el)
{
struct bed *nb;
while (el)
{
nb = el->val;
char *name = nb->name;
struct hashEl *bEl = hashLookup(bdHash, name);
if (bEl)
{
bd = bEl->val;
// you can also use outputval, stats (t value), or use prob (-log10(probability))
//totalScore += (double) bd->outputVal;
totalScore += (double) bd->stats;
totalCount++;
}
el = hashLookupNext(el);
}
double aveScore = totalScore / (double) totalCount;
if (totalCount == 0)
aveScore = 0.0;
struct hgStats *newHgStats = AllocA(struct hgStats);
newHgStats->name = cloneString(sl->name);
newHgStats->chrom = nb->chrom;
newHgStats->chromStart = nb->chromStart;
newHgStats->chromEnd = nb->chromEnd;
newHgStats->outputVal = aveScore;
newHgStats->stats = totalCount; // hack to record the number of probes
slAddHead(&setList, newHgStats);
}
}
slSort(&setList, hgStatsCmpScore);
slReverse(&setList);
if (output)
fprintf(output, "table= %s\t%s\trank\tmultipleProbe\n", gh->name, funcName);
int rank=0;
for (bd = setList; bd; bd = bd->next)
{
rank++;
if(output)
{
fprintf(output,"%s\t%f\t%d\t%.0f\t%s\t%d\t%d\n",
bd->name, bd->outputVal, rank, bd->stats, bd->chrom, bd->chromStart,bd->chromEnd);
}
}
}
void hgStatsFree(struct hgStats *pEl)
/* Free a list of dynamically allocated hgStats */
{
struct hgStats *el;
if ((el = pEl) == NULL) return;
freeMem(el->chrom);
freeMem(el->name);
freez(pEl);
}
void hgStatsFreeList(struct hgStats *pList)
/* Free a list of dynamically allocated hgStats's */
{
struct hgStats *el, *next;
for (el = pList; el != NULL; el = next)
{
next = el->next;
hgStatsFree(el);
}
freez(pList);
}
int hgStatsCmpScore(const void *va, const void *vb)
/* Compare to sort based on score - lowest first. */
{
const struct hgStats *a = *((struct hgStats **)va);
const struct hgStats *b = *((struct hgStats **)vb);
float r = a->outputVal - b->outputVal;
if (r<0)
return -1;
else if (r==0)
return 0;
else
return 1;
}
void analysisResultFree(struct analysisResult *pt)
/* Free analysisResult */
{
hgStatsFreeList(pt->stats);
freez(pt);
}
void analysisResultHashFree(struct analysisResultHash *pt)
/* Free analysisResultHash */
{
freeHash(&pt->hash);
freez(pt);
}
/*
struct geneSet *getCompGeneSet(int subsetNum, int numberOfGenes,
boolean (*func)(float data1[], unsigned long n1, float data2[],
unsigned long n2, float *r, float *prob), char *metaMethod)
{
if(!ghList || subsetNum != 2 || !func)
return NULL;
struct slRef *ref= NULL;
struct genoHeatmap *gh= NULL;
struct slName *fullGenes = NULL;
struct analysisResultHash *fullResult = AllocA(struct analysisResultHash);
int useFisher = sameWord(metaMethod,metaStatTest[fisherMeta]);
fullResult->hash = hashNew(0);
for (ref = ghList; ref != NULL; ref = ref->next)
{
gh= ref->val;
if (sameWord(gh->dataType,"bed 15") &&
gotAdvFilter(gh->patDb) && func)
{
// we only care about bed15s for the comp gene list
char *raName= gh->raFile;
struct slName *allGenes = getAllGenes(gh->database,gh->probeTable);
struct geneSet *gs = AllocA(struct geneSet);
gs->genes = allGenes;
gs->name = "allGenes";
gs->displayName = "All Genes";
gs->numGenes = slCount(allGenes);
gs->numGenesActive = 0;
gs->x = 0;
gs->y = 0;
gs->width = 0;
gs->pixelsPerGene = 0;
gs->next = NULL;
struct analysisResultHash *result =
diffAveSubgroupHash(gh, subsetNum, raName,gs, func);
struct slName *gene;
for(gene=allGenes;gene;gene=gene->next)
{
struct hashEl *el = hashLookup(result->hash, gene->name);
while (el)
{
hashAdd(fullResult->hash,gene->name,el->val);
el = hashLookupNext(el);
}
if(!slNameInList(fullGenes,gene->name))
slNameAddHead(&fullGenes,gene->name);
}
}
}
struct hgStats *stats=NULL;
struct hgStats *allStats=NULL;
struct slName *singleGene;
for(singleGene = fullGenes;singleGene;singleGene = singleGene->next)
{
struct hashEl *el = hashLookup(fullResult->hash, singleGene->name); //get the information from genesets
double statsCount = 0;
double statsSum = 0;
double probLogSum = 0;
while (el)
{
struct hgStats *val = el->val;
statsCount++;
statsSum += val->stats;
double pVal = pow(10,-1*val->prob);
if(pVal < 0)
pVal = pow(10,-290);
else if(pVal > 1)
pVal = 1-pow(10,-290);
if(useFisher)
probLogSum += -2*log(pVal);
else
probLogSum += zScore(pVal);
el = hashLookupNext(el);
}
if(statsCount > 0)
{
AllocVar(stats);
if(useFisher)
stats->prob = log(chdtrc(statsCount*2,probLogSum))/log(10)*-1;
else
stats->prob = log(ndtr(probLogSum/sqrt(statsCount)))/log(10)*-1;
stats->outputVal = stats->prob; // used by the sorter
stats->name = singleGene->name;
slAddHead(&allStats, stats);
}
}
slSort(&allStats, hgStatsCmpScore);
slReverse(&allStats);
struct hgStats *singleStats = allStats;
struct slName *sl, *topGenes = NULL;
int i;
for(i=0;i<numberOfGenes;i++)
{
sl = slNameNew(cloneString(singleStats->name));
slAddHead(&topGenes, sl);
singleStats = singleStats->next;
}
slFreeList(allStats);
struct geneSet *gs = AllocA(struct geneSet);
gs->genes = topGenes;
gs->name = "topGenes";
gs->displayName = "Top Genes";
gs->numGenes = slCount(topGenes);
gs->numGenesActive = 0;
gs->x = 0;
gs->y = 0;
gs->width = 0;
gs->pixelsPerGene = 0;
gs->next = NULL;
return gs;
}
*/
/*
struct analysisResultHash *makePathwayStatsHash(struct geneSet *geneSets,struct hash *geneHash, struct analysisResultHash *stats, char *metaMethod)
{
int useFisher = sameWord(metaMethod,metaStatTest[fisherMeta]);
//int useFisher = 1;
double min =DBL_MAX;
double max =-DBL_MAX;
struct hgStats *combStats=NULL;
struct hash *combNewBed5Hash =newHash(0);
struct geneSet *gs;
for (gs = geneSets; gs ; gs = gs->next)
{
struct slName *sl;
double statsCount = 0;
double statsSum = 0;
double probLogSum = 0;
for (sl = gs->genes; sl ; sl = sl->next)
{
struct hashEl *el = hashLookup(stats->hash, sl->name); //get the information from genesets
while (el)
{
struct hgStats *val = el->val;
statsCount++;
statsSum += val->stats;
double pVal = pow(10,-1.0*val->prob);
if(pVal < 0)
pVal = pow(10,-290);
else if(pVal > 1)
pVal = 1-pow(10,-290);
if(useFisher)
probLogSum += -2.0*log(pVal);
else
probLogSum += zScore(pVal);
el = hashLookupNext(el);
}
}
// to prevent underflow...
//printf("debug %f \n",(statsCount * log(probLogSum/2) - probLogSum/2 - lgam(statsCount)));
if(statsCount > 0 && useFisher)
{
while((statsCount * log(probLogSum/2) - probLogSum/2 - lgam(statsCount)) < -500)
{
probLogSum -= 1;
}
}
if(statsCount > 0)
{
AllocVar(combStats);
combStats->chrom = cloneString("chrXXX"); // put dummy chromosomes, never used
combStats->chromStart = 0;
combStats->chromEnd = 0;
combStats->name = gs->name;
combStats->stats = (statsSum) / statsCount;
if(useFisher)
combStats->prob = log(chdtrc(statsCount*2,probLogSum))/log(10)*-1;
else
combStats->prob = log(ndtr(probLogSum/sqrt(statsCount)))/log(10)*-1;
combStats->compIndex = 0;
combStats->totalComps = 1;
if(combStats->prob < min)
min = combStats->prob;
if(combStats->prob > max)
max = combStats->prob;
if (combStats->stats<0)
combStats->outputVal = -combStats->prob;
else
combStats->outputVal =combStats->prob;
hashAdd(combNewBed5Hash, gs->name, combStats);
}
}
if (min == DBL_MAX || max == -DBL_MAX) //no data
return NULL;
struct analysisResultHash *fResult = AllocVar(fResult);
fResult->hash = combNewBed5Hash;
fResult->min = min;
fResult->max = max;
return fResult;
}
*/