src/hg/instinct/lib/hgStatsLib.c 1.14
1.14 2009/06/04 03:42:50 jsanborn
added copyright notices, removed cluster library
Index: src/hg/instinct/lib/hgStatsLib.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/lib/hgStatsLib.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -b -B -U 1000000 -r1.13 -r1.14
--- src/hg/instinct/lib/hgStatsLib.c 27 Apr 2009 16:58:35 -0000 1.13
+++ src/hg/instinct/lib/hgStatsLib.c 4 Jun 2009 03:42:50 -0000 1.14
@@ -1,670 +1,674 @@
+/********************************************************************************/
+/* Copyright 2007-2009 -- The Regents of the University of California */
+/********************************************************************************/
+
#include <math.h>
#include "float.h"
#include "common.h"
#include "cprob.h"
#include "hgStatsLib.h"
#define MAXIT 100
#define EPS 3.0e-7
#define FPMIN 1.0e-15 // Changed for 32-bit compatibility, used to be 1.0e-30
boolean aveDiff( float data1[], unsigned long n1, float data2[], unsigned long n2,
float *r, float *p)
/* Sets *p to the absolute difference of the two group means and *r to its
 * sign (+1 or -1). Returns FALSE for empty or NULL input. */
{
if (n1 == 0 || n2 == 0)
return FALSE;
if (!data1 || !data2)
return FALSE;
int i;
float total1 = 0.0;
for (i = 0; i < n1; i++)
total1 += data1[i];
float total2 = 0.0;
for (i = 0; i < n2; i++)
total2 += data2[i];
*p = total1/(float) n1 - total2 /(float) n2;
if (*p < 0.0)
*r = -1.0;
else
*r = 1.0;
*p = fabs(*p);
return TRUE;
}
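/* A minimal usage sketch, compiled only when the hypothetical
 * HGSTATSLIB_EXAMPLES macro is defined (the guard and the example function
 * names below are illustrative, not part of the library). */
#ifdef HGSTATSLIB_EXAMPLES
#include <stdio.h>
static void aveDiffExample()
{
float a[] = {1.0, 2.0, 3.0};
float b[] = {4.0, 5.0, 6.0};
float r, p;
if (aveDiff(a, 3, b, 3, &r, &p))
    printf("sign %g, |mean diff| %g\n", r, p); /* sign -1, magnitude 3 */
}
#endif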
float gammln(float xx)
/* Returns the value ln[Gamma(xx)] for xx > 0. */
{
//Internal arithmetic will be done in double precision, a nicety that you can omit if five-figure accuracy is good enough.
double x,y,tmp,ser;
static double cof[6]={76.18009172947146,-86.50532032941677,
24.01409824083091,-1.231739572450155,
0.1208650973866179e-2,-0.5395239384953e-5};
int j;
y=x=xx;
tmp=x+5.5;
tmp -= (x+0.5)*log(tmp);
ser=1.000000000190015;
for (j=0;j<=5;j++) ser += cof[j]/++y;
return -tmp+log(2.5066282746310005*ser/x);
}
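/* Sketch under the same illustrative HGSTATSLIB_EXAMPLES guard: since
 * Gamma(n+1) = n!, exp(gammln(11.0)) recovers 10! = 3628800. */
#ifdef HGSTATSLIB_EXAMPLES
#include <stdio.h>
static void gammlnExample()
{
printf("10! ~= %.0f\n", exp(gammln(11.0)));
}
#endif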
void nrerror(char error_text[])
{
errAbort(error_text);
}
float betacf(float a, float b, float x)
{
/* Used by betai: Evaluates continued fraction for incomplete beta function by modified Lentz's method (section 5.2). */
int m,m2;
float aa,c,d,del,h,qab,qam,qap;
qab=a+b; //These q's will be used in factors that occur in the coefficients (6.4.6).
qap=a+1.0;
qam=a-1.0;
c=1.0; //First step of Lentz's method.
d=1.0-qab*x/qap;
if (fabs(d) < FPMIN) d=FPMIN;
d=1.0/d;
h=d;
for (m=1;m<=MAXIT;m++)
{
m2=2*m;
aa=m*(b-m)*x/((qam+m2)*(a+m2));
d=1.0+aa*d; //One step (the even one) of the recurrence.
if (fabs(d) < FPMIN) d=FPMIN;
c=1.0+aa/c;
if (fabs(c) < FPMIN) c=FPMIN;
d=1.0/d;
h *= d*c;
aa = -(a+m)*(qab+m)*x/((a+m2)*(qap+m2));
d=1.0+aa*d; //Next step of the recurrence (the odd one).
if (fabs(d) < FPMIN) d=FPMIN;
c=1.0+aa/c;
if (fabs(c) < FPMIN) c=FPMIN;
d=1.0/d;
del=d*c;
h *= del;
if (fabs(del-1.0) < EPS) break; //Are we done?
}
if (m > MAXIT) nrerror("a or b too big, or MAXIT too small in betacf");
return h;
}
float betai(float a, float b, float x)
/* Returns the incomplete beta function I_x(a,b).
 * Copied from Numerical Recipes in C. */
{
float bt;
if (x < 0.0 || x > 1.0) nrerror("Bad x in routine betai");
if (x == 0.0 || x == 1.0) bt=0.0;
else //Factors in front of the continued fraction.
bt=exp(gammln(a+b)-gammln(a)-gammln(b)+a*log(x)+b*log(1.0-x));
if (x < (a+1.0)/(a+b+2.0)) //Use continued fraction directly.
return bt*betacf(a,b,x)/a;
else //Use continued fraction after making the symmetry transformation.
return 1.0-bt*betacf(b,a,1.0-x)/b;
}
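/* Sketch (illustrative HGSTATSLIB_EXAMPLES guard): the identity
 * I_x(a,b) = 1 - I_(1-x)(b,a), which betai itself uses to keep the continued
 * fraction convergent, doubles as a quick sanity check. */
#ifdef HGSTATSLIB_EXAMPLES
#include <stdio.h>
static void betaiExample()
{
float a = 2.0, b = 3.0, x = 0.4;
printf("%f should equal %f\n", betai(a, b, x), 1.0 - betai(b, a, 1.0 - x));
}
#endif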
void avevar(float data[], unsigned long n, float *ave, float *var)
/* Computes the mean and sample variance of data[0..n-1] using the corrected
 * two-pass algorithm (cf. Numerical Recipes avevar). */
{
unsigned long j;
float s,ep;
for (*ave=0.0, j=0; j<n; j++)
*ave+=data[j];
*ave/=n;
*var=ep=0.0;
for (j=0; j<n; j++)
{
s=data[j]-(*ave);
ep+=s;
*var += s*s;
}
*var = (*var - ep*ep/n)/(n - 1);
}
boolean ttest(float data1[], unsigned long n1, float data2[], unsigned long n2,
float *t, float *prob)
/* Student's two-sample t test (pooled variance), after Numerical Recipes in C.
 * Sets *prob to -log10(p).
 * return 0 if failed, return 1 if successfully executed */
{
if (n1<=1 || n2<=1)
return 0;
float var1, var2, svar, df, ave1,ave2;
avevar(data1, n1, &ave1, &var1);
avevar(data2, n2, &ave2, &var2);
df = n1 + n2 -2;
svar = ((n1-1)*var1 + (n2-1)* var2) /df;
if (svar==0)
return 0;
*t = (ave1-ave2)/sqrt(svar*(1.0/ n1+ 1.0/n2));
*prob = betai (0.5*df, 0.5, df/(df+(*t)*(*t)));
// Report -log10(p); clamp tiny p-values since -log10(0) is undefined.
if (*prob < 0.000001)
*prob = 0.000001; // lower limit = 10^-6
*prob = -log10(*prob);
return 1;
}
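/* Usage sketch (illustrative HGSTATSLIB_EXAMPLES guard): note that *prob comes
 * back as -log10(p), not as a raw p-value. */
#ifdef HGSTATSLIB_EXAMPLES
#include <stdio.h>
static void ttestExample()
{
float g1[] = {4.1, 3.9, 4.4, 4.0};
float g2[] = {5.2, 5.0, 5.5, 4.9};
float t, prob;
if (ttest(g1, 4, g2, 4, &t, &prob))
    printf("t = %f, -log10(p) = %f\n", t, prob);
}
#endif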
boolean fishersLinearDisc(float data1[], unsigned long n1, float data2[], unsigned long n2,
float *j, float *prob)
/* Fisher's linear discriminant score for the two groups, with an empirical
 * p-value: the observed separation j is compared against 100 random
 * regroupings of the pooled data.
 * return 0 if failed, return 1 if successfully executed */
{
if (n1<=1 || n2<=1)
return 0;
unsigned long i=0,x=0, n3=n1+n2;
float data3[n3], ave1, ave2, var1, var2;
for(i=0;i<n1;i++)
data3[i]=data1[i];
for(i=n1;i<n3;i++)
data3[i]=data2[i-n1];
//Calc the j-val (i.e. difference) for this subgrouping split
avevar(data1, n1, &ave1,&var1);
avevar(data2, n2, &ave2,&var2);
*j = ((ave1-ave2)*(ave1-ave2)) / (var1 + var2);
//Generate 100 random j-vals independent of grouping and see how our one compares
float randSet1[n1], randSet2[n2], rand_j=0.0, dist=0.0;
srand(1);
for(x=0; x < 100; x++)
{
for(i=0;i< n1; i++)
randSet1[i]=data3[rand()%n3];
for(i=0;i< n2; i++)
randSet2[i]=data3[rand()%n3];
avevar(randSet1, n1, &ave1,&var1);
avevar(randSet2, n2, &ave2,&var2);
rand_j = ((ave1-ave2)*(ave1-ave2)) / (var1 + var2);
if(rand_j > (*j))
dist++; //a random split separates better than the observed one; this raises the empirical p-value
}
*prob=(dist/100);
if(*prob < 0.01) *prob = 0.01;//lower bound on 100 iterations
*prob=log(*prob)/log(10)*-1;
return 1;
}
long double log_fac(int i)
/* Calculates the logged factorial ln(i!) - useful for doing factorials without the numbers blowing up.
 * Recursive, so callers keep i modest (fishersExact caps n1+n2 at 300). */
{
long double tmp;
if(i <= 1)
return 0.0; /* ln(0!) = ln(1!) = 0 */
tmp = log(i) + log_fac(i - 1);
return tmp;
}
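/* Sketch (illustrative HGSTATSLIB_EXAMPLES guard): logged factorials keep
 * n-choose-k computable when n! itself would overflow,
 * ln C(n,k) = ln n! - ln k! - ln (n-k)!. */
#ifdef HGSTATSLIB_EXAMPLES
#include <stdio.h>
static void logFacExample()
{
int n = 200, k = 10; /* 200! overflows a double, but its log does not */
double logChoose = log_fac(n) - log_fac(k) - log_fac(n - k);
printf("ln C(%d,%d) = %f\n", n, k, logChoose);
}
#endif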
boolean fishersExact(float data1[], unsigned long n1, float data2[], unsigned long n2,
float *t, float *prob)
/* Fisher's exact test on a 2x2 table of sign counts (>= 0 vs < 0) for the two
 * groups; sets *t to the conditional probability of the observed table and
 * *prob to -log10(p).
 * return 0 if failed, return 1 if successfully executed */
{
unsigned long i,j;
if (n1<=1 || n2<=1 || (n1+n2) > 300 )
return 0;
//Construct a 2x2 contingency table of values around 0
int table[2][2];
for(i=0;i<2;i++)
{
for(j=0;j<2;j++) table[i][j]=0;
}
for(i=0;i<n1;i++)
{
if(data1[i] >= 0)
table[0][0]++;
else
table[0][1]++;
}
for(i=0;i<n2;i++)
{
if(data2[i] >= 0)
table[1][0]++;
else
table[1][1]++;
}
//count limits of each dimension
int r1 = table[0][0]+table[0][1];
int r2 = table[1][0]+table[1][1];
int c1 = table[0][0]+table[1][0];
int c2 = table[0][1]+table[1][1];
int tot = r1+r2;
//calc the conditional probability of this set
*t = exp(( log_fac(r1) + log_fac(r2) + log_fac(c1) + log_fac(c2) )
- ( log_fac(tot) + log_fac(table[0][0]) + log_fac(table[0][1])
+ log_fac(table[1][0]) + log_fac(table[1][1]) ));
//Sum all sets with a conditional probability smaller than this set to get the cumulative p
unsigned long a, b, c, d;
float cond_pval, cumm_pval;
i = j = a = b = c = d = 0;
cond_pval = cumm_pval = 0;
for(i = 0; i <= c1; i++)
{
for(j = 0; j <= c2; j++)
{
a = i;
b = j;
c=c1-i;
d=c2-j;
if( ((a+b) == r1) && ((c+d) == r2) )
{
table[0][0] = a;
table[0][1] = b;
table[1][0] = c;
table[1][1] = d;
cond_pval = exp(( log_fac(r1) + log_fac(r2) + log_fac(c1) + log_fac(c2) )
- ( log_fac(tot) + log_fac(table[0][0]) + log_fac(table[0][1])
+ log_fac(table[1][0]) + log_fac(table[1][1]) ));
if(cond_pval <= (*t))
cumm_pval += cond_pval;
}
}
}
*prob=log(cumm_pval)/log(10)*-1;
return 1;
}
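/* Usage sketch (illustrative HGSTATSLIB_EXAMPLES guard): fishersExact only
 * looks at the signs of the inputs, so the arrays just need the right mix of
 * non-negative and negative values. */
#ifdef HGSTATSLIB_EXAMPLES
#include <stdio.h>
static void fishersExactExample()
{
float up[] = {0.5, 1.2, 0.3, 2.0};      /* all non-negative */
float down[] = {-0.7, -1.1, -0.2, 0.1}; /* mostly negative */
float t, prob;
if (fishersExact(up, 4, down, 4, &t, &prob))
    printf("table prob = %f, -log10(p) = %f\n", t, prob);
}
#endif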
void medvar(float data[], unsigned long n, float *med, float *var)
/* Computes the median and sample variance of data[0..n-1].
 * NB: sorts data in place with a bubble sort. */
{
unsigned long i,j;
float tmp, s,ep,ave;
for (i=0; i<n-1; i++)
{
for (j=0; j<n-1-i; j++)
{
if (data[j+1] < data[j])
{
tmp = data[j];
data[j] = data[j+1];
data[j+1] = tmp;
}
}
}
*med=data[n/2];
//TODO clean all this up to be more efficient.
for(ave=0.0,j=0; j<n; j++)
ave+=data[j];
ave/=n;
*var=ep=0.0;
for (j=0; j<n; j++)
{
s=data[j]-ave;
ep+=s;
*var += s*s;
}
*var = (*var - ep*ep/n)/(n - 1);
}
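/* Caveat sketch (illustrative HGSTATSLIB_EXAMPLES guard): medvar sorts its
 * input in place, so callers who need the original ordering should pass a copy. */
#ifdef HGSTATSLIB_EXAMPLES
#include <string.h>
static void medvarExample()
{
float data[] = {3.0, 1.0, 2.0, 5.0, 4.0};
float copy[5], med, var;
memcpy(copy, data, sizeof(data)); /* preserve the caller's ordering */
medvar(copy, 5, &med, &var);      /* med = 3, var = sample variance */
}
#endif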
float calcEC(float data[], unsigned long n, float cutoff)
/* Supplementary function to expressionCoherence: returns the fraction of pairwise distances in data that fall below cutoff. */
{
unsigned long i, j;
float dist=0.0, EC=0.0, count=0.0;
for(j = 0; j < n; j++)
{
for(i = (j+1); i < n; i++)
{
dist = fabs(data[i] - data[j]); // fabs, not abs: the operands are floats
if(dist < cutoff)
EC++;
count++;
}
}
return (EC/count);
}
/* Needs major work - Makes all random spots look significant because of cluster in the middle of
* normal curve - Without variance screening this looks odd. Don't use yet*/
boolean expressionCoherence(float data1[], unsigned long n1, float data2[], unsigned long n2,float *f, float *prob)
{
if (n1<=1 || n2<=1)
return 0;
unsigned long n3=n1+n2, i=0,j=0;
float data3[n3], tmp;
for(i=0;i<n1;i++)
data3[i]=data1[i];
for(i=n1;i<n3;i++)
data3[i]=data2[i-n1];
for (i=0; i<n3-1; i++)
{
for (j=0; j<n3-1-i; j++)
{
if (data3[j+1] < data3[j])
{
tmp = data3[j];
data3[j] = data3[j+1];
data3[j+1] = tmp;
}
}
}
float cutoff = data3[n3/2];
float EC1 = calcEC(data1, n1, cutoff);
float EC2 = calcEC(data2, n2, cutoff);
//Do 100 iterations of randomization and comparison
float randSet1[n1], dist1=0.0, randSet2[n2], dist2=0.0;
srand(1);
for(i=0; i < 100; i++)
{
//generate random sets the same sizes as n1 and n2, and compare
for(j=0;j< n1; j++)
randSet1[j]=data3[rand()%n3];
if(calcEC(randSet1, n1, cutoff) < EC1)
dist1++;
for(j=0;j< n2; j++)
randSet2[j]=data3[rand()%n3];
if(calcEC(randSet2, n2, cutoff) < EC2)
dist2++;
}
*prob=(dist1/100)*(dist2/100);
if(*prob < 0.005) *prob = 0.005;//lower bound on 200 iterations
*prob=log(*prob)/log(10)*-1;
*f=1;
return 1;
}
boolean levene(float data1[], unsigned long n1, float data2[], unsigned long n2,float *f, float *prob)
/* Homogeneity of variance test (Levene-style F, centered on group means).
 * Sets *prob to -log10(p); f is negated when group 1 has the smaller variance.
 * return 0 if failed, return 1 if successfully executed */
{
if (n1<=1 || n2<=1)
return 0;
unsigned long n3=n1+n2, i=0;
float data3[n3];
for(i=0;i<n1;i++)
data3[i]=data1[i];
for(i=n1;i<n3;i++)
data3[i]=data2[i-n1];
float var1=0.0,var2=0.0, varAll=0.0, aveAll=0.0, ave1=0.0, ave2=0.0, MSE=0.0, MSBG=0.0;
avevar(data1, n1,&ave1, &var1);
avevar(data2, n2,&ave2, &var2);
avevar(data3, n3,&aveAll, &varAll);
for(i=0;i<n3;i++)
{
MSE+=((data3[i]-ave1)*(data3[i]-ave1));
MSE+=((data3[i]-ave2)*(data3[i]-ave2));
}
MSBG+=(n1*(ave1-aveAll)*(ave1-aveAll));
MSBG+=(n2*(ave2-aveAll)*(ave2-aveAll));
MSBG*=(n3-2);
int df = n3 -2;
if(MSE==0)
return 0;
*f=MSBG/MSE;
*prob = log(betai(0.5*df, 0.5, df/(df+*f)))/log(10)*-1;
if (*prob < 0.000001)
*prob = 0.000001; // very low p-value, but we can't have zero, so low limit = 10^-6
if(var1<var2) //set f so the colors go the right way
*f=(*f*-1);
return 1;
}
boolean brownForsythe(float data1[], unsigned long n1, float data2[], unsigned long n2,float *f, float *prob)
/* Homogeneity of variance test (Brown-Forsythe: like levene, but centered on group medians).
 * Sets *prob to -log10(p); f is negated when group 1 has the smaller variance.
 * return 0 if failed, return 1 if successfully executed */
{
if (n1<=1 || n2<=1)
return 0;
unsigned long n3=n1+n2, i=0;
float data3[n3];
for(i=0;i<n1;i++)
data3[i]=data1[i];
for(i=n1;i<n3;i++)
data3[i]=data2[i-n1];
float medAll=0.0, med1=0.0, med2=0.0,var1=0.0,var2=0.0,varAll=0.0, MSE=0.0, MSBG=0.0;
medvar(data1, n1, &med1, &var1);
medvar(data2, n2, &med2, &var2);
medvar(data3, n3, &medAll, &varAll);
for(i=0;i<n3;i++)
{
MSE+=((data3[i]-med1)*(data3[i]-med1));
MSE+=((data3[i]-med2)*(data3[i]-med2));
}
MSBG+= (n1*(med1-medAll)*(med1-medAll));
MSBG+= (n2*(med2-medAll)*(med2-medAll));
MSBG*=(n3-2);
int df = n3 -2;
if(MSE==0)
return 0;
*f=MSBG/MSE;
*prob = log(betai(0.5*df, 0.5, df/(df+*f)))/log(10)*-1;
if (*prob < 0.000001)
*prob = 0.000001; // very low p-value, but we can't have zero, so low limit = 10^-6
if(var1<var2) //set f so the colors go the right way
*f=(*f*-1);
return 1;
}
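/* Usage sketch (illustrative HGSTATSLIB_EXAMPLES guard): levene and
 * brownForsythe share a signature, differing only in centering on means versus
 * medians; a negative f means group 1 has the smaller variance. */
#ifdef HGSTATSLIB_EXAMPLES
#include <stdio.h>
static void varianceTestExample()
{
float tight[] = {1.0, 1.1, 0.9, 1.0};
float wide[] = {0.1, 2.2, -1.3, 3.0};
float f, prob;
if (brownForsythe(tight, 4, wide, 4, &f, &prob))
    printf("f = %f, -log10(p) = %f\n", f, prob);
}
#endif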
boolean fishersMeta(struct slDouble *data, float *chi2, float *prob)
/* Fisher's meta-analysis: combines a list of -log10 p-values into the
 * chi-square statistic -2*sum(ln p) on 2k degrees of freedom.
 * return 0 if failed, return 1 if successfully executed */
{
if (data == NULL)
return 0;
float logSum = 0;
int logSumCount = 0;
struct slDouble *currVal = data;
while(currVal)
{
float pVal = pow(10,-1*currVal->val);
if(pVal > 0 && pVal <= 1)
{
logSum += log(pVal);
logSumCount++; // only count the ones we can actually use
}
currVal = currVal->next;
}
logSum *= -2;
*chi2 = logSum; // probably won't need this ever, but keep it around for consistency with t-test
*prob = log(chdtrc(logSumCount*2,logSum))/log(10)*-1;
return 1;
}
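/* Usage sketch (illustrative HGSTATSLIB_EXAMPLES guard): the meta routines
 * walk an slDouble list (common.h) of -log10 p-values; here the list is built
 * by hand on the stack for brevity. */
#ifdef HGSTATSLIB_EXAMPLES
#include <stdio.h>
static void fishersMetaExample()
{
struct slDouble a, b, c;
a.val = 2.0; a.next = &b;   /* p = 0.01 */
b.val = 1.3; b.next = &c;   /* p ~= 0.05 */
c.val = 0.7; c.next = NULL; /* p ~= 0.2 */
float chi2, prob;
if (fishersMeta(&a, &chi2, &prob))
    printf("chi2 = %f, combined -log10(p) = %f\n", chi2, prob);
}
#endif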
boolean fishersMetaSigned(struct slDouble *data, float *chi2, float *prob)
/* Fisher's meta-analysis (signed): each input carries a sign on its -log10
 * p-value, and *prob is a signed -log10(p) following the dominant direction.
 * return 0 if failed, return 1 if successfully executed */
{
if (data == NULL)
return 0;
float logSum = 0.0;
float sign = 0.0;
int logSumCount = 0;
struct slDouble *currVal = data;
while(currVal)
{
sign = 1.0;
if (currVal->val < 0.0)
sign = -1.0;
float pVal = pow(10,-1*fabs(currVal->val));
if(pVal > 0 && pVal <= 1)
{
logSum += sign*log(pVal);
logSumCount++; // only count the ones we can actually use
}
currVal = currVal->next;
}
if (logSumCount == 0 || logSum == 0)
return 0;
sign = 1.0;
if (logSum > 0.0)
sign = -1.0;
logSum = 2.0*fabs(logSum);
*chi2 = logSum; // probably won't need this ever, but keep it around for consistency with t-test
// signed fishers
*prob = -1.0*sign*log(chdtrc(logSumCount*2,logSum))/log(10.0);
return 1;
}
boolean stoufferMeta(struct slDouble *data, float *norm, float *prob)
/* Stouffer's meta-analysis of a list of -log10 p-values (summed z-scores).
 * return 0 if failed, return 1 if successfully executed */
{
if (data == NULL)
return 0;
float logSum = 0;
int logSumCount = 0;
struct slDouble *currVal = data;
while(currVal)
{
float pVal = pow(10,-1*currVal->val);
if(pVal > 0 && pVal < 1)
{
logSum += zScore(pVal);
logSumCount++; // only count the ones we can actually use
}
currVal = currVal->next;
}
logSum *= -2;
*norm = logSum; // probably won't need this ever, but keep it around for consistency with t-test
*prob = log(ndtr(logSum/sqrt(logSumCount)))/log(10)*-1;
return 1;
}
boolean mudholkarMeta(struct slDouble *data, float *tval, float *prob)
/* Mudholkar-George logit meta-analysis of a list of -log10 p-values.
 * return 0 if failed, return 1 if successfully executed */
{
if (data == NULL)
return 0;
float logSum = 0;
int logSumCount = 0;
struct slDouble *currVal = data;
while(currVal)
{
float pVal = pow(10,-1*currVal->val);
if(pVal > 0 && pVal < 1)
{
logSum += log(pVal) - log(1.0-pVal);
logSumCount++; // only count the ones we can actually use
}
currVal = currVal->next;
}
logSum *= -1*sqrt((15*logSumCount+12)/((5*logSumCount+2)*logSumCount*pow(PI,2)));
*tval = logSum; // probably won't need this ever, but keep it around for consistency with t-test
float df = (logSumCount*5)+4;
*prob = log(betai((0.5*df), 0.5, df/( df + (* tval) * (* tval) ) ))/log(10)*-1;
return 1;
}
boolean symmUniMeta(struct slDouble *data, float *chi2, float *prob)
/* SymmUni meta-analysis: combines a list of -log10 p-values.
 * return 0 if failed, return 1 if successfully executed */
{
if (data == NULL)
return 0;
int i,n=0;
double* subcomb = calloc(slCount(data)+1, sizeof(double));
double* pvalues = calloc(slCount(data), sizeof(double));
struct slDouble *currVal = data;
while(currVal)
{
float pVal = pow(10,-1*currVal->val);
if(pVal > 0 && pVal < 1)
{
pvalues[n] = pVal;
n++; // only count the ones we can actually use
}
currVal = currVal->next;
}
if(n < 1)
return 0;
subcomb[0] = 1.0;
for (i = 1; i <= n; i++)
{
subcomb[i] = pvcQuadraticHelper(i, pvalues, subcomb);
}
//fprintf(stderr,"SubComp %f %f\n",subcomb[n],fabs(subcomb[n]));
*prob = log(fabs(subcomb[n])); // not sure if abs() is legit here
//fprintf(stderr,"SubComp %f\n",*prob);
free(pvalues);
free(subcomb);
*prob *= -2;
*chi2 = *prob; // probably won't need this ever, but keep it around for consistency with t-test
//fprintf(stderr,"Prob = %f\n",*prob);
if(*prob < 0.0) // fixed chdtrc() domain errors
*prob = 0.0;
*prob = log(chdtrc(n*2,*prob))/log(10)*-1;
return 1;
}
double zScore(double p)
/* Inverts the standard normal CDF by bisection: returns z with ndtr(z) = p,
 * accurate to Z_EPSILON. Returns -1 for p outside [0,1]. */
{
float Z_EPSILON = 0.000001; /* Accuracy of z approximation */
float minz = -20;
float maxz = 20;
float zval = 0.0;
float pval;
if (p < 0.0 || p > 1.0) {
return -1;
}
while ((maxz - minz) > Z_EPSILON) {
pval = ndtr(zval);
if (pval > p) {
maxz = zval;
} else {
minz = zval;
}
zval = (maxz + minz) * 0.5;
}
return(zval);
}
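/* Sketch (illustrative HGSTATSLIB_EXAMPLES guard): zScore inverts the normal
 * CDF by bisection, so zScore(0.975) comes out near 1.96. */
#ifdef HGSTATSLIB_EXAMPLES
#include <stdio.h>
static void zScoreExample()
{
printf("z(0.975) ~= %f\n", zScore(0.975));
}
#endif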
/* Comparator for qsort, ascending doubles. This seems like a sloppy compare
 * implementation, as all I want is the sign of the difference, but I'm not
 * sure that floating-point operation is available in standard C. */
int cmp_double_asc(const void* pa, const void* pb)
{
double a = *(double*)pa, b = *(double*)pb;
if (a < b) return -1;
else if (a > b) return 1;
else return 0;
}
/*
 * Evaluates the next sub-combination term for symmUniMeta from the earlier
 * entries of subcomb[]; for n = 2 the result is 2*r1*r2 - r1^2.
 */
double pvcQuadraticHelper(int n, double* pvalues, double* subcomb)
{
double sum = 0.0;
double choose = 1.0;
double j1 = 0.0;
double sign = 1.0;
int j = 0;
for (j = 0; j < n; ++j)
{
j1 = j + 1.0;
choose *= ((n == j+1) ? 1.0 : (n-j)) / (j1);
sum += sign * choose * pow(pvalues[n-j-1], j1) * subcomb[n-j-1];
sign *= -1.0;
}
return sum;
}
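/* Sketch (illustrative HGSTATSLIB_EXAMPLES guard): checks the n = 2 identity
 * noted above, subcomb[2] = 2*r1*r2 - r1^2 with r1 = pvalues[0], r2 = pvalues[1]. */
#ifdef HGSTATSLIB_EXAMPLES
#include <stdio.h>
static void pvcQuadraticExample()
{
double pvalues[] = {0.2, 0.6};
double subcomb[3];
subcomb[0] = 1.0;
subcomb[1] = pvcQuadraticHelper(1, pvalues, subcomb); /* = r1 */
subcomb[2] = pvcQuadraticHelper(2, pvalues, subcomb); /* = 2*r1*r2 - r1^2 */
printf("%f vs %f\n", subcomb[2], 2*0.2*0.6 - 0.2*0.2);
}
#endif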