src/hg/instinct/lib/hgStatsLib.c 1.14

1.14 2009/06/04 03:42:50 jsanborn
added copyright notices, removed cluster library
Index: src/hg/instinct/lib/hgStatsLib.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/lib/hgStatsLib.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -b -B -U 1000000 -r1.13 -r1.14
--- src/hg/instinct/lib/hgStatsLib.c	27 Apr 2009 16:58:35 -0000	1.13
+++ src/hg/instinct/lib/hgStatsLib.c	4 Jun 2009 03:42:50 -0000	1.14
@@ -1,670 +1,674 @@
+/********************************************************************************/
+/* Copyright 2007-2009 -- The Regents of the University of California           */
+/********************************************************************************/
+
 #include <math.h>
 #include "float.h"
 #include "common.h"
 #include "cprob.h"
 #include "hgStatsLib.h"
 
 #define MAXIT 100
 #define EPS 3.0e-7
 #define FPMIN 1.0e-15   // Changed for 32-bit compatibility, used to be 1.0e-30
 
 boolean aveDiff( float data1[], unsigned long n1, float data2[], unsigned long n2, 
 		 float *r, float *p)
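 /* Difference of means: puts the absolute difference of the two sample means
  * in *p and its sign (+1.0 or -1.0) in *r.
  * return FALSE if failed, return TRUE if successfully executed */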
 {
 if (n1 == 0 || n2 == 0)
     return FALSE;
 if (!data1 || !data2)
     return FALSE;
 
 int i;
 float total1 = 0.0;
 for (i = 0; i < n1; i++)
     total1 += data1[i];
 
 float total2 = 0.0;    
 for (i = 0; i < n2; i++)
     total2 += data2[i];
     
 *p = total1/(float) n1 - total2 /(float) n2;
 if (*p < 0.0)
     *r = -1.0;
 else
     *r = 1.0;
 *p = fabs(*p);
 return TRUE;
 }
 
 float gammln(float xx)
 /*Returns the value ln[Gamma(xx)] for xx > 0.*/
 {
 //Internal arithmetic will be done in double precision, a nicety that you can omit if five-figure accuracy is good enough.
 double x,y,tmp,ser;
 static double cof[6]={76.18009172947146,-86.50532032941677,
 		      24.01409824083091,-1.231739572450155,
 		      0.1208650973866179e-2,-0.5395239384953e-5};
 int j;
 y=x=xx;
 tmp=x+5.5;
 tmp -= (x+0.5)*log(tmp);
 ser=1.000000000190015;
 for (j=0;j<=5;j++) ser += cof[j]/++y;
 return -tmp+log(2.5066282746310005*ser/x);
 }
 
 void nrerror(char error_text[])
 {
 errAbort("%s", error_text);    // pass as argument, not as format string
 }
 
 float betacf(float a, float b, float x)
 {
 /* Used by betai: Evaluates continued fraction for incomplete beta function by modified Lentz's method (§5.2). */
 int m,m2;
 float aa,c,d,del,h,qab,qam,qap;
 qab=a+b; //These q's will be used in factors that occur in the coefficients (6.4.6).
 qap=a+1.0; 
 qam=a-1.0;
 c=1.0; //First step of Lentz's method.
 d=1.0-qab*x/qap;
 if (fabs(d) < FPMIN) d=FPMIN;
 d=1.0/d;
 h=d;
 for (m=1;m<=MAXIT;m++) 
     {
     m2=2*m;
     aa=m*(b-m)*x/((qam+m2)*(a+m2));
     d=1.0+aa*d; //One step (the even one) of the recurrence.
     if (fabs(d) < FPMIN) d=FPMIN;
     c=1.0+aa/c;
     if (fabs(c) < FPMIN) c=FPMIN;
     d=1.0/d;
     h *= d*c;
     aa = -(a+m)*(qab+m)*x/((a+m2)*(qap+m2));
     d=1.0+aa*d; //Next step of the recurrence (the odd one).
     if (fabs(d) < FPMIN) d=FPMIN;
     c=1.0+aa/c;
     if (fabs(c) < FPMIN) c=FPMIN;
     d=1.0/d;
     del=d*c;
     h *= del;
     if (fabs(del-1.0) < EPS) break; //Are we done?
     }
 if (m > MAXIT) nrerror("a or b too big, or MAXIT too small in betacf");
 return h;
 }
 
 float betai(float a, float b, float x)
 /*Returns the incomplete beta function I_x(a,b).
  * copied from NC */
 {
 float bt;
 if (x < 0.0 || x > 1.0) nrerror("Bad x in routine betai");
 if (x == 0.0 || x == 1.0) bt=0.0;
 else  //Factors in front of the continued fraction.
     bt=exp(gammln(a+b)-gammln(a)-gammln(b)+a*log(x)+b*log(1.0-x));
 if (x < (a+1.0)/(a+b+2.0)) //Use continued fraction directly.
     return bt*betacf(a,b,x)/a;
 else //Use continued fraction after making the symmetry transformation.
     return 1.0-bt*betacf(b,a,1.0-x)/b; 
 }
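 
 /* Worked example (how ttest() below uses betai): for a t statistic with df
  * degrees of freedom, the two-sided p-value is
  *     p = I_x(df/2, 1/2),  x = df/(df + t*t),
  * which is exactly the betai() call made in ttest(). */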
 
 void avevar(float data[], unsigned long n, float *ave, float *var)
 {
 unsigned long j;
 float s,ep;
 for (*ave=0.0, j=0; j<n; j++)
     *ave+=data[j];
 *ave/=n;
 *var=ep=0.0;
 for (j=0; j<n; j++)
     {
     s=data[j]-(*ave);
     ep+=s;
     *var += s*s;
     }
 *var = (*var -ep *ep/n)/(n-1);
 }
 
 boolean ttest(float data1[], unsigned long n1, float data2[], unsigned long n2,
 	      float *t, float *prob)
 /* t test copied from NC 
  * return 0 if failed, return 1 if successfully executed */
 {
 if (n1<=1 || n2<=1)
     return 0;
 
 float var1, var2, svar, df, ave1,ave2;
 avevar(data1, n1, &ave1, &var1);
 avevar(data2, n2, &ave2, &var2);
 df = n1 + n2 -2;
 svar = ((n1-1)*var1 + (n2-1)* var2) /df;
 if (svar==0)
     return 0;
 
 *t = (ave1-ave2)/sqrt(svar*(1.0/ n1+ 1.0/n2));
 *prob = betai (0.5*df, 0.5, df/(df+(*t)*(*t)));
 
 // -log10(prob)
 if (*prob < 0.000001)
     *prob = 0.000001;    // very low p-value, but we can't have zero, so low limit = 10^-6
 
 *prob = -log10(*prob);
 return 1;
 }
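 
 /* Minimal usage sketch (illustrative only, not part of the library; the
  * sample arrays are hypothetical, and printf would need <stdio.h>): */
 #if 0
     float groupA[] = {1.2, 1.5, 1.1, 1.9};
     float groupB[] = {2.2, 2.6, 2.1, 2.8};
     float t, prob;
     if (ttest(groupA, 4, groupB, 4, &t, &prob))
         printf("t = %g, -log10(p) = %g\n", t, prob);
 #endif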
 
 boolean fishersLinearDisc(float data1[], unsigned long n1, float data2[], unsigned long n2,
               float *j, float *prob)
 /* Calculates the probability of getting the residuals you do from the subgrouping chosen.
  * return 0 if failed, return 1 if successfully executed */
 {
 if (n1<=1 || n2<=1)
     return 0;
 unsigned long i=0,x=0, n3=n1+n2;
 float data3[n3], ave1, ave2, var1, var2;
 for(i=0;i<n1;i++) 
     data3[i]=data1[i];
 for(i=n1;i<n3;i++) 
     data3[i]=data2[i-n1];
 
 //Calc the j-val (i.e. difference) for this subgrouping split
 avevar(data1, n1, &ave1,&var1);
 avevar(data2, n2, &ave2,&var2);
 *j = ((ave1-ave2)*(ave1-ave2)) / (var1 + var2);
 
 //Generate 100 random j-vals independent of grouping and see how our one compares
 float randSet1[n1], randSet2[n2], rand_j=0.0, dist=0.0;
 srand(1);
 for(x=0; x < 100; x++)
     {
     for(i=0;i< n1; i++)
         randSet1[i]=data3[rand()%n3];
     for(i=0;i< n2; i++)
         randSet2[i]=data3[rand()%n3];
     avevar(randSet1, n1, &ave1,&var1);
     avevar(randSet2, n2, &ave2,&var2);
     rand_j = ((ave1-ave2)*(ave1-ave2)) / (var1 + var2);
     if(rand_j > (*j)) 
         dist++; // a random split with a larger j beats ours, raising the p-value
     }
 *prob=(dist/100);
 if(*prob < 0.01) *prob = 0.01;//lower bound on 100 iterations
 *prob=log(*prob)/log(10)*-1;
     
 return 1;
 }
 
 long double log_fac(int i)
 /* Calculates the log-factorial of int i - useful for doing factorials without the numbers blowing up */
 {
 long double tmp;
 if(i <= 1) 
     return log(1);
 tmp = log(i) + log_fac(i - 1);
 return tmp;
 }
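 
 /* Example: log_fac(5) = ln(5!) = ln(120), about 4.7875, computed without
  * ever forming 120 itself. */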
 
 boolean fishersExact(float data1[], unsigned long n1, float data2[], unsigned long n2,
               float *t, float *prob)
 /* Coherence test
  * return 0 if failed, return 1 if successfully executed */
 {
 unsigned long i,j;
 if (n1<=1 || n2<=1 || (n1+n2) > 300 )
     return 0;
 //Construct a 2x2 contingency table of values around 0
 int table[2][2];
 for(i=0;i<2;i++)
     {
     for(j=0;j<2;j++) table[i][j]=0;
     }
 for(i=0;i<n1;i++)
     {
     if(data1[i] >= 0) 
         table[0][0]++;
     else 
         table[0][1]++;
     }
 for(i=0;i<n2;i++)
     {
     if(data2[i] >= 0) 
         table[1][0]++;
     else 
         table[1][1]++;
     }
 //count limits of each dimension
 int r1 = table[0][0]+table[0][1];
 int r2 = table[1][0]+table[1][1];
 int c1 = table[0][0]+table[1][0];
 int c2 = table[0][1]+table[1][1];
 int tot = r1+r2;
 
 //calc the conditional probability of this set
 *t = exp(( log_fac(r1) + log_fac(r2) + log_fac(c1) + log_fac(c2) )
         - ( log_fac(tot) + log_fac(table[0][0]) + log_fac(table[0][1]) 
         + log_fac(table[1][0]) + log_fac(table[1][1]) ));
 //Sum all sets with a conditional probability smaller than this set to get the cumulative p
 unsigned long a, b, c, d;
 float cond_pval, cumm_pval;
 i = j = a = b = c = d = 0;
 cond_pval = cumm_pval = 0;
 for(i = 0; i <= c1; i++)
     {
     for(j = 0; j <= c2; j++)
         {
         a = i;
         b = j;
         c=c1-i;
         d=c2-j;
         if( ((a+b) == r1) && ((c+d) == r2) )
             {
             table[0][0] = a;
             table[0][1] = b;
             table[1][0] = c;
             table[1][1] = d;
             cond_pval = exp(( log_fac(r1) + log_fac(r2) + log_fac(c1) + log_fac(c2) )
                         - ( log_fac(tot) + log_fac(table[0][0]) + log_fac(table[0][1]) 
                         + log_fac(table[1][0]) + log_fac(table[1][1]) ));
             if(cond_pval <= (*t)) 
                 cumm_pval += cond_pval;
             }
         }
     }
 *prob=log(cumm_pval)/log(10)*-1;
 return 1;
 }
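 
 /* The conditional probability computed above is the hypergeometric
  * likelihood of a 2x2 table with fixed margins,
  *     P = (r1! * r2! * c1! * c2!) / (tot! * a! * b! * c! * d!),
  * evaluated in log space via log_fac() to avoid overflow. */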
 
 void medvar(float data[], unsigned long n, float *med, float *var)
 {
 unsigned long i,j;
 float tmp, s,ep,ave;
 for (i=0; i<n-1; i++)
     {
     for (j=0; j<n-1-i; j++)
         {
         if (data[j+1] < data[j])
             {
             tmp = data[j];
             data[j] = data[j+1];
             data[j+1] = tmp;
             }
         }
     }
 *med=data[n/2];
 //TODO clean all this up to be more efficient.
 for(ave=0.0,j=0; j<n; j++)
     ave+=data[j];
 ave/=n;
 
 *var=ep=0.0;
 for (j=0; j<n; j++)
     {
     s=data[j]-ave;
     ep+=s;
     *var += s*s;
     }
 *var = (*var -ep *ep/n)/(n-1);
 }
 
 float calcEC(float data[], unsigned long n, float cutoff)
 /*Supplementary function to expressionCoherence. Calcs an EC instance for comparisons*/
 {
 unsigned long i, j;
 float dist=0.0, EC=0.0, count=0.0; 
 for(j = 0; j < n; j++)
     {
     for(i = (j+1); i < n; i++)
         {
         dist = fabs(data[i] - data[j]);   // fabs: the data are floats, abs() would truncate to int
         if(dist < cutoff) 
             EC++;
         count++;
         }
     }
 return (EC/count);
 }
 
 /* Needs major work - the cluster in the middle of the normal curve makes all random
  * spots look significant, and without variance screening the results look odd. Don't use yet. */
 boolean expressionCoherence(float data1[], unsigned long n1, float data2[], unsigned long n2,float *f, float *prob)
 {
 if (n1<=1 || n2<=1)
     return 0;
 unsigned long n3=n1+n2, i=0,j=0;
 float data3[n3], tmp;
 for(i=0;i<n1;i++) 
     data3[i]=data1[i];
 for(i=n1;i<n3;i++) 
     data3[i]=data2[i-n1];
 for (i=0; i<n3-1; i++)
     {
     for (j=0; j<n3-1-i; j++)
         {
         if (data3[j+1] < data3[j])
             {
             tmp = data3[j];
             data3[j] = data3[j+1];
             data3[j+1] = tmp;
             }
         }
     }
 float cutoff = data3[n3/2];
 float EC1 = calcEC(data1, n1, cutoff);
 float EC2 = calcEC(data2, n2, cutoff);
 //Do 100 iterations of randomization and comparison
 float randSet1[n1], dist1=0.0, randSet2[n2], dist2=0.0;
 srand(1);
 for(i=0; i < 100; i++)
     {
     //generate random sets the same sizes as n1 and n2, and compare
     for(j=0;j< n1; j++) 
         randSet1[j]=data3[rand()%n3];
     if(calcEC(randSet1, n1, cutoff) < EC1) 
         dist1++;
     for(j=0;j< n2; j++)
         randSet2[j]=data3[rand()%n3];
     if(calcEC(randSet2, n2, cutoff) < EC2) 
         dist2++;
     }
 *prob=(dist1/100)*(dist2/100);
 if(*prob < 0.005) *prob = 0.005;//lower bound on 200 iterations
 *prob=log(*prob)/log(10)*-1;
 *f=1;
 return 1; 
 }
 
 boolean levene(float data1[], unsigned long n1, float data2[], unsigned long n2,float *f, float *prob)
 /* Homogeneity of variance test
  * return 0 if failed, return 1 if successfully executed */
 {
 if (n1<=1 || n2<=1)
     return 0;
 unsigned long n3=n1+n2, i=0;
 float data3[n3];
 for(i=0;i<n1;i++) 
     data3[i]=data1[i];
 for(i=n1;i<n3;i++) 
     data3[i]=data2[i-n1];
 float var1=0.0,var2=0.0, varAll=0.0, aveAll=0.0, ave1=0.0, ave2=0.0, MSE=0.0, MSBG=0.0;
 avevar(data1, n1,&ave1, &var1);
 avevar(data2, n2,&ave2, &var2);
 avevar(data3, n3,&aveAll, &varAll);
 
 for(i=0;i<n3;i++)
     {
     MSE+=((data3[i]-ave1)*(data3[i]-ave1));
     MSE+=((data3[i]-ave2)*(data3[i]-ave2));
     }
 MSBG+=(n1*(ave1-aveAll)*(ave1-aveAll));
 MSBG+=(n2*(ave2-aveAll)*(ave2-aveAll));
 MSBG*=(n3-2);
 int df = n3 -2;
 if(MSE==0)
     return 0;
 *f=MSBG/MSE;
 *prob = log(betai(0.5*df, 0.5, df/(df+*f)))/log(10)*-1;
 
 if (*prob < 0.000001)
     *prob = 0.000001;    // very low p-value, but we can't have zero, so low limit = 10^-6
 if(var1<var2) //set f so the colors go the right way
    *f=(*f*-1);
 return 1;
 }
 
 boolean brownForsythe(float data1[], unsigned long n1, float data2[], unsigned long n2,float *f, float *prob)
 /* Homogeneity of variance test
  * return 0 if failed, return 1 if successfully executed */
 {
 if (n1<=1 || n2<=1)
     return 0;
 unsigned long n3=n1+n2, i=0;
 float data3[n3];
 for(i=0;i<n1;i++)
     data3[i]=data1[i];
 for(i=n1;i<n3;i++) 
     data3[i]=data2[i-n1];
 float medAll=0.0, med1=0.0, med2=0.0,var1=0.0,var2=0.0,varAll=0.0, MSE=0.0, MSBG=0.0;
 medvar(data1, n1, &med1, &var1);
 medvar(data2, n2, &med2, &var2);
 medvar(data3, n3, &medAll, &varAll);    
 
 for(i=0;i<n3;i++)
     {
     MSE+=((data3[i]-med1)*(data3[i]-med1));
     MSE+=((data3[i]-med2)*(data3[i]-med2));
     }
 MSBG+= (n1*(med1-medAll)*(med1-medAll));
 MSBG+= (n2*(med2-medAll)*(med2-medAll));
 MSBG*=(n3-2);
 int df = n3 -2;
 if(MSE==0)
     return 0;
 *f=MSBG/MSE;
 *prob = log(betai(0.5*df, 0.5, df/(df+*f)))/log(10)*-1;
 if (*prob < 0.000001)
    *prob = 0.000001;    // very low p-value, but we can't have zero, so low limit = 10^-6
 if(var1<var2) //set f so the colors go the right way
    *f=(*f*-1);
 return 1;
 }
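 
 /* Note: brownForsythe() mirrors levene() above, differing only in that the
  * group centers come from medvar() (medians) rather than avevar() (means). */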
 
 boolean fishersMeta(struct slDouble *data, float *chi2, float *prob)
 /* Fisher's MetaAnalysis 
  * return 0 if failed, return 1 if successfully executed */
 {
 if (data == NULL)
     return 0;
 
 float logSum = 0;
 int logSumCount = 0;
 struct slDouble *currVal = data;
 while(currVal)
     {
     float pVal = pow(10,-1*currVal->val);
     if(pVal > 0 && pVal <= 1)
         {
         logSum += log(pVal);
         logSumCount++; // only count the ones we can actually use
         }
     currVal = currVal->next;
     }
 logSum *= -2;
 *chi2 = logSum;    // probably won't need this ever, but keep it around for consistency with t-test
 *prob = log(chdtrc(logSumCount*2,logSum))/log(10)*-1;
 return 1;
 }
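 
 /* Background for the above: under the null, -2*sum(ln p_i) over k p-values
  * follows a chi-square distribution with 2k degrees of freedom, so the
  * combined p-value is the upper tail chdtrc(2*k, logSum). */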
 
 boolean fishersMetaSigned(struct slDouble *data, float *chi2, float *prob)
 /* Fisher's MetaAnalysis (Signed)
  * return 0 if failed, return 1 if successfully executed */
 {
 if (data == NULL)
     return 0;
 
 float logSum = 0.0;
 float sign = 0.0;
 int logSumCount = 0;
 struct slDouble *currVal = data;
 while(currVal)
     {
     sign = 1.0;
     if (currVal->val < 0.0)
         sign = -1.0;
     float pVal = pow(10,-1*fabs(currVal->val));
     if(pVal > 0 && pVal <= 1)
         {
         logSum += sign*log(pVal);
         logSumCount++; // only count the ones we can actually use
         }
     currVal = currVal->next;
     }
 if (logSumCount == 0 || logSum == 0)
     return 0;
 
 sign = 1.0;
 if (logSum > 0.0)
     sign = -1.0;
 logSum = 2.0*fabs(logSum);
 *chi2 = logSum;    // probably won't need this ever, but keep it around for consistency with t-test
 
 // signed fishers
 *prob = -1.0*sign*log(chdtrc(logSumCount*2,logSum))/log(10.0);
 return 1;
 }
 
 boolean stoufferMeta(struct slDouble *data, float *norm, float *prob)
 /* stouffer's MetaAnalysis 
  * return 0 if failed, return 1 if successfully executed */
 {
 if (data == NULL)
     return 0;
 float logSum = 0;
 int logSumCount = 0;
 struct slDouble *currVal = data;
 while(currVal)
     {
     float pVal = pow(10,-1*currVal->val);
     if(pVal > 0 && pVal < 1)
         {
         logSum += zScore(pVal);
         logSumCount++; // only count the ones we can actually use
         }
     currVal = currVal->next;
     }
 logSum *= -2;
 *norm = logSum; // probably won't need this ever, but keep it around for consistency with t-test
 *prob = log(ndtr(logSum/sqrt(logSumCount)))/log(10)*-1;
 return 1;
 }
 
 boolean mudholkarMeta(struct slDouble *data, float *tval, float *prob)
 /* mudholkar's MetaAnalysis 
  * return 0 if failed, return 1 if successfully executed */
 {
 if (data == NULL)
     return 0;
 float logSum = 0;
 int logSumCount = 0;
 struct slDouble *currVal = data;
 while(currVal)
     {
     float pVal = pow(10,-1*currVal->val);
     if(pVal > 0 && pVal < 1)
         {
         logSum += log(pVal) - log(1.0-pVal);
         logSumCount++; // only count the ones we can actually use
         }
     currVal = currVal->next;
     }
 
 logSum *= -1*sqrt((15*logSumCount+12)/((5*logSumCount+2)*logSumCount*pow(PI,2)));
 *tval = logSum;    // probably won't need this ever, but keep it around for consistency with t-test
 float df = (logSumCount*5)+4;
 *prob = log(betai((0.5*df), 0.5, df/( df + (* tval) * (* tval) ) ))/log(10)*-1;
 return 1;
 }
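 
 /* Background for the above: the Mudholkar-George logit method. The sum of
  * logits sum(ln(p_i) - ln(1-p_i)) over k p-values, scaled by
  * sqrt((15k+12)/((5k+2)*k*pi^2)), is approximately t-distributed with
  * 5k+4 degrees of freedom, hence the betai() call. */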
 
 boolean symmUniMeta(struct slDouble *data, float *chi2, float *prob)
 /* SymmUni MetaAnalysis 
  * return 0 if failed, return 1 if successfully executed */
 {
 if (data == NULL)
     return 0;
 int i,n=0;
 double* subcomb = calloc(slCount(data)+1, sizeof(double));
 double* pvalues = calloc(slCount(data), sizeof(double));
 struct slDouble *currVal = data;
 while(currVal)
     {
     float pVal = pow(10,-1*currVal->val);
     if(pVal > 0 && pVal < 1)
         {
         pvalues[n] = pVal;
         n++; // only count the ones we can actually use
         }
     currVal = currVal->next;
     }
 if(n < 1)
     return 0;
 
 subcomb[0] = 1.0;
 for (i = 1; i <= n; i++) 
     {
     subcomb[i] = pvcQuadraticHelper(i, pvalues, subcomb);
     }
 
 //fprintf(stderr,"SubComp %f %f\n",subcomb[n],fabs(subcomb[n]));
 *prob = log(fabs(subcomb[n])); // not sure if taking the absolute value is legit here
 //fprintf(stderr,"SubComp %f\n",*prob);
 free(pvalues);
 free(subcomb);
 
 *prob *= -2;
 *chi2 = *prob;    // probably won't need this ever, but keep it around for consistency with t-test
 //fprintf(stderr,"Prob = %f\n",*prob);
 if(*prob < 0.0)	// fixed chdtrc() domain errors
     *prob = 0.0;
 
 *prob = log(chdtrc(n*2,*prob))/log(10)*-1;
 
 return 1;
 }
 
 double zScore(double p) 
 {
 float Z_EPSILON = 0.000001;     /* Accuracy of z approximation */
 float minz = -20;
 float maxz = 20;
 float zval = 0.0;
 float pval;
 
 if (p < 0.0 || p > 1.0)
     return -1;
 
 while ((maxz - minz) > Z_EPSILON)
     {
     pval = ndtr(zval);
     if (pval > p)
         maxz = zval;
     else
         minz = zval;
     zval = (maxz + minz) * 0.5;
     }
 return(zval);
 }
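 
 /* Example: zScore(0.975) converges to about 1.96, the z for which
  * ndtr(z) = 0.975. */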
 
 /* This seems like a sloppy compare implementation, as all I want to get
  * is the sign of the difference, but I'm not sure that floating point
  * operation is available from standard C */
 int cmp_double_asc(const void* pa, const void* pb)
 {
 double a = *(double*)pa, b = *(double*)pb;
 if (a < b) return -1;
 else if (a > b) return 1;
 else return 0;
 }
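 
 /* cmp_double_asc has the comparator shape qsort() expects, e.g. (illustrative):
  *     qsort(pvalues, n, sizeof(double), cmp_double_asc);
  */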
 
 /*
  * Subset-combination recurrence used by symmUniMeta();
  * for n = 2 it evaluates to 2 * r1 * r2 - r1^2
  */
 double pvcQuadraticHelper(int n, double* pvalues, double* subcomb)
 {
 double sum = 0.0;
 double choose = 1.0;
 double j1 = 0.0;
 double sign = 1.0;
 int j = 0;
 
 for (j = 0; j < n; ++j)
     {
     j1 = j + 1.0;
     choose *= ((n == j+1) ? 1.0 : (n-j)) / (j1);
     sum += sign * choose * pow(pvalues[n-j-1], j1) * subcomb[n-j-1];
     sign *= -1.0;
     }
 
 return sum;
 }