src/hg/instinct/bioInt2/populateDb.c 1.11
1.11 2009/05/20 20:34:37 jsanborn
initial commit
Index: src/hg/instinct/bioInt2/populateDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/populateDb.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -b -B -U 4 -r1.10 -r1.11
--- src/hg/instinct/bioInt2/populateDb.c 28 Apr 2009 21:21:24 -0000 1.10
+++ src/hg/instinct/bioInt2/populateDb.c 20 May 2009 20:34:37 -0000 1.11
@@ -511,15 +511,15 @@
errAbort("clinicalData entries not unique, sample_id = %d, feature_id = %d",
cd->sample_id, cd->feature_id);
if (cd->val != cd2->val)
- errAbort("clinicalData values don't match, sample_id = %d, feature_id = %d, "
+ fprintf(stderr, "clinicalData values don't match, sample_id = %d, feature_id = %d, "
"%f != %f",
cd->sample_id, cd->feature_id, cd->val, cd2->val);
if (cd->code && cd2->code)
if (!sameString(cd->code, cd2->code))
- errAbort("clinicalData codes don't match, sample_id = %d, feature_id = %d",
+ fprintf(stderr, "clinicalData codes don't match, sample_id = %d, feature_id = %d",
cd->sample_id, cd->feature_id);
return TRUE;
}
@@ -754,8 +754,70 @@
slFreeList(&allSdList);
return TRUE;
}
+
+boolean reduceDataHashRank(struct hash *dataHash, struct hash *hash, double *retCount)
+{
+struct slDouble *sd, *allSd, *allSdList = NULL;
+struct hashEl *outEl;
+struct hashCookie cookie = hashFirst(dataHash);
+while ((outEl = hashNext(&cookie)) != NULL)
+ {
+ struct hash *hash = outEl->val;
+
+ struct hashEl *inEl, *elList = hashElListHash(hash);
+ for (inEl = elList; inEl != NULL; inEl = inEl->next)
+ {
+ char *sample = inEl->name;
+ struct slDouble *sdList = inEl->val;
+ double med = slDoubleMedian(sdList);
+ sd = slDoubleNew(med);
+ allSd = slDoubleNew(med);
+
+ slAddHead(&allSdList, allSd);
+
+ hashRemove(hash, sample);
+ slFreeList(&sdList);
+
+ hashAdd(hash, sample, sd);
+ }
+ hashElFreeList(&elList);
+ }
+
+double count = (double) slCount(allSdList);
+int halfCount = round(count/2.0);
+
+slSort(&allSdList, slDoubleCmp);
+
+char name[32];
+int index = 0, total = 0;
+for (sd = allSdList; sd && (index <= halfCount); sd = sd->next)
+ {
+ safef(name, sizeof(name), "%f", sd->val);
+ if (!hashLookup(hash, name))
+ hashAddInt(hash, name, index);
+ index++;
+ total++;
+ }
+
+index = (int) count;
+slReverse(&allSdList);
+for (sd = allSdList; sd && (index > halfCount); sd = sd->next)
+ {
+ safef(name, sizeof(name), "%f", sd->val);
+ if (!hashLookup(hash, name))
+ hashAddInt(hash, name, index);
+ index--;
+ total++;
+ }
+fprintf(stderr, "total = %d, out of %f\n", total, count);
+*retCount = count;
+
+slFreeList(&allSdList);
+return TRUE;
+}
+
struct analysisVals *getAnalysisVals(struct sqlConnection *biConn, struct hash *dataHash,
double med, double std)
{
struct hash *sampleHash = createIdHash(biConn, SA_TABLE, "name");
@@ -790,9 +852,9 @@
av->feature_id = af->id;
av->val = sd->val;
z = (av->val - med)/std;
- p = ndtr(-1.0*fabs(z));
+ p = 2.0*ndtr(-1.0*fabs(z)); // two-tailed
if (p > 0)
val = min(-log(p)/log(10.0), maxLogP);
else
val = maxLogP;
@@ -809,8 +871,73 @@
return avList;
}
+
+struct analysisVals *getAnalysisValsRank(struct sqlConnection *biConn, struct hash *dataHash,
+ struct hash *rankHash, int count)
+{
+struct hash *sampleHash = createIdHash(biConn, SA_TABLE, "name");
+
+double dCount = (double) count;
+int halfCount = round(dCount / 2.0);
+
+double sign, p;
+double maxLogP = 88.0;
+struct hashEl *inEl, *outEl;
+struct analysisVals *av, *avList = NULL;
+struct hashCookie cookie = hashFirst(dataHash);
+while ((outEl = hashNext(&cookie)) != NULL)
+ {
+ char *gene = outEl->name;
+ struct hash *hash = outEl->val;
+
+ struct analysisFeatures *af = getAnalysisFeatures(biConn, gene, "gene");
+
+ if (!af)
+ continue;
+
+ struct hashEl *elList = hashElListHash(hash);
+ for (inEl = elList; inEl != NULL; inEl = inEl->next)
+ {
+ char *sample = inEl->name;
+ struct slDouble *sd = inEl->val;
+
+ int sample_id = hashIntValDefault(sampleHash, sample, -1);
+ if (sample_id == -1)
+ errAbort("No sample by name of %s\n", sample);
+
+ char valStr[32];
+ safef(valStr, sizeof(valStr), "%f", sd->val);
+ int index = hashIntValDefault(rankHash, valStr, -1);
+ if (index == -1)
+ errAbort("couldn't find value %s in rankHash\n", valStr);
+
+ if (index > halfCount)
+ p = (dCount - (double) index + 1.0) / dCount;
+ else
+ p = ((double) index + 1.0) / dCount;
+
+ sign = 1.0;
+ if (sd->val < 0)
+ sign = -1.0;
+
+ p = sign * min(-log(2.0 * p)/log(10.0), maxLogP);
+ AllocVar(av);
+ av->sample_id = sample_id;
+ av->feature_id = af->id;
+ av->val = sd->val;
+ av->conf = p;
+
+ slAddHead(&avList, av);
+ }
+ analysisFeaturesFree(&af);
+ hashElFreeList(&elList);
+ }
+
+return avList;
+}
+
void sexCorrect(struct bed *nb, float *correction)
{
int i;
for (i = 0; i < nb->expCount; i++)
@@ -912,13 +1039,13 @@
while ((row = sqlNextRow(sr)) != NULL)
{
struct bed *nb = bedLoadN(row+1, 15);
-// if (sameString(nb->chrom, "chrX") || sameString(nb->chrom, "chrY"))
-// { // skip sex chroms
-// bedFree(&nb);
-// continue;
-// }
+ if (sameString(nb->chrom, "chrX") || sameString(nb->chrom, "chrY"))
+ { // skip sex chroms
+ bedFree(&nb);
+ continue;
+ }
if (sameString(nb->chrom, "chrX") && doX)
sexCorrect(nb, correctionX);
else if (sameString(nb->chrom, "chrY") && doY)
@@ -938,15 +1065,17 @@
if (hashNumEntries(dataHash) == 0)
errAbort("no entries in hash\n");
-fprintf(stderr, "\treducing hash...\n");
-double med, std;
-if (!reduceDataHash(dataHash, &med, &std))
+double count = 0;
+struct hash *rankHash = hashNew(0);
+fprintf(stderr, "\treducing hash, making rank hash...\n");
+
+if (!reduceDataHashRank(dataHash, rankHash, &count))
errAbort("problem reducing hash\n");
fprintf(stderr, "\tconverting hash to analysisVals...\n");
-struct analysisVals *avList = getAnalysisVals(biConn, dataHash, med, std);
+struct analysisVals *avList = getAnalysisValsRank(biConn, dataHash, rankHash, count);
fprintf(stderr, "\tstoring analysisVals...\n");
storeAnalysisValsInDb(biConn, dataTable, avList);
analysisValsFreeList(&avList);