src/hg/instinct/mapProbesToGenes/mapProbesToGenes.c 1.9
1.9 2009/11/16 18:37:00 sbenz
Fix to prevent query from timing out if there's lots of probes
Index: src/hg/instinct/mapProbesToGenes/mapProbesToGenes.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/mapProbesToGenes/mapProbesToGenes.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -b -B -U 1000000 -r1.8 -r1.9
--- src/hg/instinct/mapProbesToGenes/mapProbesToGenes.c 19 Nov 2008 00:34:15 -0000 1.8
+++ src/hg/instinct/mapProbesToGenes/mapProbesToGenes.c 16 Nov 2009 18:37:00 -0000 1.9
@@ -1,388 +1,412 @@
/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
#include <limits.h>
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "bed.h"
#include "genePred.h"
#include "hPrint.h"
#include "hdb.h"
struct hash *mapHash = NULL;
char *output =NULL;
FILE *fout = NULL;
void usage()
/* Explain usage and exit. */
{
errAbort(
"mapProbesToGenes - Will maps probes in BED format to overlapping gene(s)\n"
"usage:\n"
" mapProbesToGenes db table mappingOutput\n"
"options:\n"
" -cgh - mapping probes to gene for CGH data\n"
" -snp - mapping probes to gene for SNP data\n"
" -methylation - mapping probes to genes for methylation data\n"
" -3PrimeDis=XXX - probes will not be mapping if their distance to 3' of the gene is > XXXbp, this rule does not apply is -cgh if specified\n"
" -5PrimeDis=XXX - probes will not be mapped if their distance to 5' of the gene is > XXXbp, this rule only applies for methylation data\n"
" -passExon=XXX - probe can pass the exon boundary by a small number of bps while still be adequate to measure gene expression change, this may especially be true for cDNA probes, or short probes that have been mapped to genes. This option specifies the base pair size a probe is allowed to pass an exon boundary. This rule does not apply if -cgh is specified\n"
);
}
static struct optionSpec options[] = {
{"cgh", OPTION_BOOLEAN},
{"snp", OPTION_BOOLEAN},
{"methylation", OPTION_BOOLEAN},
{"3PrimeDis", OPTION_INT},
{"5PrimeDis", OPTION_INT},
{"passExon", OPTION_INT},
{NULL, 0}
};
int rangeBothStrandOverlap(struct bed *gene, struct bed *probe)
/* Return amount of block-level overlap on same strand between a and b */
{
if (sameString(gene->chrom, probe->chrom))
{
if(optionExists("methylation"))
{
int fivePrimeDis = optionInt("5PrimeDis", INT_MAX);
if(gene->strand[0] == '+')
{
return rangeIntersection(gene->chromStart-fivePrimeDis,
gene->chromEnd,
probe->chromStart,
probe->chromEnd);
}
else if(gene->strand[0] == '-')
{
return rangeIntersection(gene->chromStart,
gene->chromEnd+fivePrimeDis,
probe->chromStart,
probe->chromEnd);
}
else
return 0;
}
else if (optionExists("snp"))
{
return rangeIntersection(gene->chromStart,
gene->chromEnd,
probe->chromStart,
probe->chromEnd + 1);
}
else
{
return rangeIntersection(gene->chromStart,
gene->chromEnd,
probe->chromStart,
probe->chromEnd);
}
}
return 0;
}
int ThreePrimeDistance (struct bed *gene, struct bed *probe)
/* calculate the distance between 3' of the gene to 3' of the probe
* error: return -1 ; otherwise, return a postive number
*/
{
if (gene->strand[0] != probe->strand[0])
return -1;
if (!sameString(gene->chrom, gene->chrom))
return -1;
int probeEnd;
int probeStart;
int N= gene->blockCount;
int distance =0;
boolean foundEnd = FALSE;
boolean foundStart = FALSE;
int passExon = optionInt("passExon", 0);
int i;
/* for positive strand */
if (gene->strand[0] == '+')
{
probeEnd = probe->chromEnd;
probeStart = probe->chromStart;
for (i=N-1; i>=0; i--)
{
//block i
int start = gene->chromStart + gene->chromStarts[i];
int end = gene->chromStart + gene->chromStarts[i] + gene->blockSizes[i];
if (start-passExon <= probeEnd && probeEnd <= end+passExon && foundEnd ==FALSE)
{
if ( end-probeEnd > 0)
distance = distance + (end - probeEnd);
foundEnd =TRUE;
}
else if (foundEnd ==FALSE)
distance = distance + gene->blockSizes[i];
if (start-passExon <= probeStart && probeStart <= end +passExon)
{
foundStart =TRUE;
}
if (foundEnd && foundStart)
break;
}
}
/* for negative strand */
else if (gene->strand[0] == '-')
{
probeEnd = probe->chromStart;
probeStart = probe->chromEnd;
//block i
for (i=0; i<N; i++)
{
int start = gene->chromStart + gene->chromStarts[i];
int end = gene->chromStart + gene->chromStarts[i] + gene->blockSizes[i];
if (start -passExon <= probeEnd && probeEnd <= end +passExon && foundEnd ==FALSE)
{
if (probeEnd - start >0 )
distance = distance + (probeEnd - start);
foundEnd =TRUE;
}
else if (foundEnd ==FALSE)
distance = distance + gene->blockSizes[i];
if (start-passExon <= probeStart && probeStart <= end +passExon)
{
foundStart =TRUE;
}
if (foundEnd && foundStart)
break;
}
}
if(!foundStart || !foundEnd)
{
printf("%s %s %c failed start = %d end= %d\n",gene->name, probe->name, gene->strand[0], foundStart, foundEnd);
return -1;
}
else
{
return distance;
}
}
int FivePrimeDistance (struct bed *gene, struct bed *probe)
/* calculate the distance between 5' of the gene to 3' of the probe
* error: return -1 ; otherwise, return a postive number
*/
{
if (gene->strand[0] != probe->strand[0])
return -1;
if (!sameString(gene->chrom, gene->chrom))
return -1;
int probeEnd = probe->chromEnd;
int probeStart = probe->chromStart;
int probeSite = (probeStart+probeEnd)/2; // gives a rough estimate of the probed CpG
int distance =0;
boolean found = FALSE;
/* for positive strand */
if (gene->strand[0] == '+')
{
int start = gene->chromStart;
distance = (start - probeSite);
found =TRUE;
}
else if (gene->strand[0] == '-')
{
int start = gene->chromEnd;
distance = (probeSite - start);
found =TRUE;
}
if(!found)
{
printf("%s %s %c failed, probeSite=%d, gene starts at %d\n",gene->name, probe->name, gene->strand[0], probeSite, gene->chromStart);
return -1;
}
else
{
return distance;
}
}
void reportOverlap(struct bed *gene, struct bed *probeList)
{
boolean isCgh = optionExists("cgh");
boolean isSNP = optionExists("snp");
int threePrimeCutoff = optionInt("3PrimeDis",INT_MAX);
boolean isMethylation = optionExists("methylation");
int fivePrimeCutoff = optionInt("5PrimeDis", INT_MAX);
struct bed *probe;
for (probe = probeList; probe; probe = probe->next)
{
char key[strlen(probe->name)+strlen(gene->name)+2];
safef(key,sizeof(key), "%s_%s", probe->name, gene->name);
if (hashLookup(mapHash, key))
continue;
if (isCgh)
{
if (rangeBothStrandOverlap(gene, probe) >0 )
{
- if ((probe->chromEnd - probe->chromStart) < 15)
+ if ((probe->chromEnd - probe->chromStart) < 1)
{
printf("%s %s %c too short dis=%d\n",
gene->name, probe->name, gene->strand[0],
probe->chromEnd- probe->chromStart);
continue;
}
fprintf(fout, "%s\t%s\n", probe->name, gene->name);
hashAdd(mapHash, key, NULL);
}
}
else if (isSNP)
{
if (rangeBothStrandOverlap(gene, probe) >0 )
{
fprintf(fout, "%s\t%s\n", probe->name, gene->name);
hashAdd(mapHash, key, NULL);
}
}
else if(isMethylation)
{
if (rangeBothStrandOverlap(gene, probe) > 0 )
{
if (abs(probe->chromEnd - probe->chromStart) < 15)
{
printf("%s %s %c too short dis=%d\n",gene->name, probe->name, gene->strand[0], probe->chromEnd- probe->chromStart);
continue;
}
//calculate distance between 5' end of the gene and the probe, if the option is specified
int fiveDistance = FivePrimeDistance(gene, probe);
if (fiveDistance > fivePrimeCutoff)
{
printf("%s %s %c too far from 5', 5'dis=%d\n",gene->name, probe->name, gene->strand[0], fiveDistance);
continue;
}
//could do a 3'end check here (like below) but want all probes to map
//irrespective of exon structure.
fprintf(fout, "%s\t%s\n", probe->name, gene->name);
hashAdd(mapHash, key, NULL);
}
}
else
{
if (bedSameStrandOverlap(gene, probe))
{
if (abs(probe->chromEnd - probe->chromStart) < 15)
{
printf("%s %s %c too short dis=%d\n",gene->name, probe->name, gene->strand[0], probe->chromEnd- probe->chromStart);
continue;
}
//calculate distance between 3' end of the gene and the probe, if the option is specified
int distance = ThreePrimeDistance (gene, probe);
if (distance <0)
continue;
if (distance > threePrimeCutoff)
{
printf("%s %s %c too far dis=%d\n",gene->name, probe->name, gene->strand[0], distance);
continue;
}
fprintf(fout, "%s\t%s\n", probe->name, gene->name);
hashAdd(mapHash, key, NULL);
}
}
}
}
void overlapWithTable(char *database, struct bed *probeList, char *table)
/* find overlapping bed-formatted probes with genes in table */
{
if (probeList == NULL)
return;
char query[512];
safef(query, sizeof(query),
- "select * from %s" , table);
+ "select COUNT(*) from %s" , table);
/* "db" is remote database connection, where "refGene" lives */
struct sqlConnection *conn = hAllocConnProfile("db", database);
struct sqlResult *sr = sqlGetResult(conn, query);
+int curr = 0;
char **row = NULL;
-struct genePred *gp;
-struct bed *gene;
-while ((row = sqlNextRow(sr)) != NULL)
+row = sqlNextRow(sr);
+unsigned i,count = sqlUnsigned(row[0]);
+sqlFreeResult(&sr);
+
+for(i = 0; i < count; i += 100)
+ {
+ safef(query, sizeof(query),
+ "select * from %s LIMIT %d,100" , table,i);
+
+ sr = sqlGetResult(conn, query);
+ struct genePred *gp;
+ struct bed *gene;
+ while ((row = sqlNextRow(sr)) != NULL)
{
+ //printf(">");
+ //fflush(stdout);
if (row[10])
{
char *tmp = row[12];
row[12] = row[1];
row[1] = tmp;
}
gp = genePredLoad(row+1);
gene = bedFromGenePred(gp);
genePredFree(&gp);
reportOverlap(gene, probeList);
+
+ if((++curr % 100) == 0)
+ {
+ printf(".");
+ fflush(stdout);
}
-hFreeConn(&conn);
+ }
+ sqlFreeResult(&sr);
+ }
+ printf("done.\n");
+ hFreeConn(&conn);
+
}
void mapProbesToGenes(char *db, char *table)
/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
{
char query[512];
safef(query, sizeof(query),
"select * from %s", table);
/* uses hg18.profile = localDb in .hg.conf */
struct sqlConnection *conn = hAllocConn(db);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = NULL;
struct bed *tuple, *tupleList = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
tuple = bedLoadN(row+1, 15);
slAddHead(&tupleList, tuple);
}
+sqlFreeResult(&sr);
slReverse(&tupleList);
overlapWithTable(db, tupleList, "refGene");
hFreeConn(&conn);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 4)
usage();
mapHash = newHash(0);
char *db=argv[1];
char *table=argv[2];
output = argv[3];
fout = fopen(output,"w");
mapProbesToGenes(db, table);
fclose(fout);
return 0;
}