src/hg/instinct/bioInt2/populateDb.c 1.11

1.11 2009/05/20 20:34:37 jsanborn
initial commit
Index: src/hg/instinct/bioInt2/populateDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/populateDb.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -b -B -U 4 -r1.10 -r1.11
--- src/hg/instinct/bioInt2/populateDb.c	28 Apr 2009 21:21:24 -0000	1.10
+++ src/hg/instinct/bioInt2/populateDb.c	20 May 2009 20:34:37 -0000	1.11
@@ -511,15 +511,15 @@
     errAbort("clinicalData entries not unique, sample_id = %d, feature_id = %d",
 	     cd->sample_id, cd->feature_id);
 
 if (cd->val != cd2->val)
-    errAbort("clinicalData values don't match, sample_id = %d, feature_id = %d, "
+    fprintf(stderr, "clinicalData values don't match, sample_id = %d, feature_id = %d, "
 	     "%f != %f",
 	     cd->sample_id, cd->feature_id, cd->val, cd2->val);
 
 if (cd->code && cd2->code)
     if (!sameString(cd->code, cd2->code))	
-	errAbort("clinicalData codes don't match, sample_id = %d, feature_id = %d",
+	fprintf(stderr, "clinicalData codes don't match, sample_id = %d, feature_id = %d",
 		 cd->sample_id, cd->feature_id);
 
 return TRUE;
 }
@@ -754,8 +754,70 @@
 slFreeList(&allSdList);
 return TRUE;
 }
 
+
+boolean reduceDataHashRank(struct hash *dataHash, struct hash *hash, double *retCount)
+{
+struct slDouble *sd, *allSd, *allSdList = NULL;
+struct hashEl *outEl;
+struct hashCookie cookie = hashFirst(dataHash);
+while ((outEl = hashNext(&cookie)) != NULL)
+    {
+    struct hash *hash = outEl->val;
+
+    struct hashEl *inEl, *elList = hashElListHash(hash);
+    for (inEl = elList; inEl != NULL; inEl = inEl->next)
+	{
+	char *sample = inEl->name;
+	struct slDouble *sdList = inEl->val;
+	double med = slDoubleMedian(sdList);
+	sd = slDoubleNew(med);
+	allSd = slDoubleNew(med);
+
+	slAddHead(&allSdList, allSd);
+	
+	hashRemove(hash, sample);
+	slFreeList(&sdList);
+
+	hashAdd(hash, sample, sd);
+	}
+    hashElFreeList(&elList);
+    }
+
+double count = (double) slCount(allSdList);
+int halfCount = round(count/2.0);
+
+slSort(&allSdList, slDoubleCmp);
+
+char name[32];
+int index = 0, total = 0;
+for (sd = allSdList; sd && (index <= halfCount); sd = sd->next)
+    {
+    safef(name, sizeof(name), "%f", sd->val);
+    if (!hashLookup(hash, name))
+	hashAddInt(hash, name, index);
+    index++;
+    total++;
+    }
+
+index = (int) count;
+slReverse(&allSdList);
+for (sd = allSdList; sd && (index > halfCount); sd = sd->next)
+    {
+    safef(name, sizeof(name), "%f", sd->val);
+    if (!hashLookup(hash, name))
+	hashAddInt(hash, name, index);
+    index--;
+    total++;
+    }
+fprintf(stderr, "total = %d, out of %f\n", total, count);
+*retCount = count;
+
+slFreeList(&allSdList);
+return TRUE;
+}
+
 struct analysisVals *getAnalysisVals(struct sqlConnection *biConn, struct hash *dataHash, 
 				     double med, double std)
 {
 struct hash *sampleHash = createIdHash(biConn, SA_TABLE, "name");
@@ -790,9 +852,9 @@
 	av->feature_id = af->id;
 	av->val = sd->val;
 
 	z = (av->val - med)/std;
-	p = ndtr(-1.0*fabs(z));
+	p = 2.0*ndtr(-1.0*fabs(z));  // two-tailed
 	if (p > 0)
 	    val = min(-log(p)/log(10.0), maxLogP);
 	else
 	    val = maxLogP;
@@ -809,8 +871,73 @@
 
 return avList;
 }
 
+
+struct analysisVals *getAnalysisValsRank(struct sqlConnection *biConn, struct hash *dataHash, 
+					 struct hash *rankHash, int count)
+{
+struct hash *sampleHash = createIdHash(biConn, SA_TABLE, "name");
+
+double dCount = (double) count;
+int halfCount = round(dCount / 2.0);
+
+double sign, p;
+double maxLogP = 88.0;
+struct hashEl *inEl, *outEl;
+struct analysisVals *av, *avList = NULL;
+struct hashCookie cookie = hashFirst(dataHash);
+while ((outEl = hashNext(&cookie)) != NULL)
+    {
+    char *gene = outEl->name;
+    struct hash *hash = outEl->val;
+
+    struct analysisFeatures *af = getAnalysisFeatures(biConn, gene, "gene");
+
+    if (!af)
+	continue;
+
+    struct hashEl *elList = hashElListHash(hash);
+    for (inEl = elList; inEl != NULL; inEl = inEl->next)
+	{
+	char *sample = inEl->name;
+	struct slDouble *sd = inEl->val;
+
+	int sample_id = hashIntValDefault(sampleHash, sample, -1);
+	if (sample_id == -1)
+	    errAbort("No sample by name of %s\n", sample);
+
+	char valStr[32];
+	safef(valStr, sizeof(valStr), "%f", sd->val);
+	int index = hashIntValDefault(rankHash, valStr, -1);
+	if (index == -1)
+	    errAbort("couldn't find value %s in rankHash\n", valStr);
+	
+	if (index > halfCount)
+	    p = (dCount - (double) index + 1.0) / dCount;
+	else
+	    p = ((double) index + 1.0) / dCount;
+
+	sign = 1.0;
+	if (sd->val < 0)
+	    sign = -1.0;
+
+	p = sign * min(-log(2.0 * p)/log(10.0), maxLogP);
+	AllocVar(av);
+	av->sample_id = sample_id;
+	av->feature_id = af->id;
+	av->val = sd->val;
+	av->conf = p;
+
+	slAddHead(&avList, av);
+	}
+    analysisFeaturesFree(&af);
+    hashElFreeList(&elList);
+    }
+
+return avList;
+}
+
 void sexCorrect(struct bed *nb, float *correction)
 {
 int i;
 for (i = 0; i < nb->expCount; i++)
@@ -912,13 +1039,13 @@
 while ((row = sqlNextRow(sr)) != NULL)
     {
     struct bed *nb = bedLoadN(row+1, 15);
   
-//    if (sameString(nb->chrom, "chrX") || sameString(nb->chrom, "chrY"))
-//	{ // skip sex chroms
-//	bedFree(&nb);
-//	continue;
-//	}
+    if (sameString(nb->chrom, "chrX") || sameString(nb->chrom, "chrY"))
+	{ // skip sex chroms
+	bedFree(&nb);
+	continue;
+	}
 
     if (sameString(nb->chrom, "chrX") && doX)
 	sexCorrect(nb, correctionX);
     else if (sameString(nb->chrom, "chrY") && doY)
@@ -938,15 +1065,17 @@
 
 if (hashNumEntries(dataHash) == 0)
     errAbort("no entries in hash\n");
 
-fprintf(stderr, "\treducing hash...\n");
-double med, std;
-if (!reduceDataHash(dataHash, &med, &std))
+double count = 0;
+struct hash *rankHash = hashNew(0);
+fprintf(stderr, "\treducing hash, making rank hash...\n");
+
+if (!reduceDataHashRank(dataHash, rankHash, &count))
     errAbort("problem reducing hash\n");
 
 fprintf(stderr, "\tconverting hash to analysisVals...\n");
-struct analysisVals *avList = getAnalysisVals(biConn, dataHash, med, std);
+struct analysisVals *avList = getAnalysisValsRank(biConn, dataHash, rankHash, count);
 
 fprintf(stderr, "\tstoring analysisVals...\n");
 storeAnalysisValsInDb(biConn, dataTable, avList);
 analysisValsFreeList(&avList);