src/hg/near/hgMapToGene/hgMapToGene.c 1.16
1.16 2009/11/23 23:39:44 kent
Adding ignoreStrand option.
Index: src/hg/near/hgMapToGene/hgMapToGene.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/near/hgMapToGene/hgMapToGene.c,v
retrieving revision 1.15
retrieving revision 1.16
diff -b -B -U 1000000 -r1.15 -r1.16
--- src/hg/near/hgMapToGene/hgMapToGene.c 3 Sep 2008 19:20:41 -0000 1.15
+++ src/hg/near/hgMapToGene/hgMapToGene.c 23 Nov 2009 23:39:44 -0000 1.16
@@ -1,439 +1,450 @@
/* hgMapToGene - Map a track to a genePred track.. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "dystring.h"
#include "jksql.h"
#include "bed.h"
#include "binRange.h"
#include "hdb.h"
#include "hgConfig.h"
static char const rcsid[] = "$Id$";
void usage()
/* Explain usage and exit. */
{
errAbort(
"hgMapToGene - Map a track to a genePred track.\n"
"usage:\n"
" hgMapToGene database track geneTrack mapTable\n"
"where:\n"
" database - is a browser database (ex. hg16)\n"
" track - the track to map to the genePred track\n"
" This must be bed12 psl or genePred itself\n"
" geneTrack - a genePred format track\n"
" newMapTable - a new table to create with two columns\n"
" <geneTrackId><trackId>\n"
"For each gene in geneTrack this will select the most\n"
"overlapping item in track to associate\n"
"Options:\n"
" -type=xxx - Set the type line here rather than looking it up in\n"
" the trackDb table.\n"
" -geneTableType=xxx - Set the gene table type (bed or genePred)\n"
" rather than look it up in trackDb table.\n"
" -all - Put all elements of track that intersect with geneTrack\n"
" into mapTable, not just the best for each gene.\n"
" -prefix=XXX - Only consider elements of track that begin with prefix\n"
" -cds - Only consider coding portions of gene.\n"
" -noLoad - Don't load database, just create mapTable.tab file\n"
" -verbose=N - Print intermediate status info if N > 0\n"
" -intronsToo - Include introns\n"
+ " -ignoreStrand - ignore strand when determining overlaps\n"
" -createOnly - Just create mapTable, don't populate it\n"
" -tempDb - Database to look for genes track and where to put result\n"
" -lookup=lookup.txt - Lookup.txt is a 2 column file\n"
" <trackId><lookupId>\n"
" The trackId from the geneTrack gets replaced with lookupId\n"
);
}
boolean cdsOnly = FALSE;
+boolean ignoreStrand = FALSE;
boolean intronsToo = FALSE;
boolean createOnly = FALSE;
char *prefix = NULL;
char *trackDb = NULL;
static struct optionSpec options[] = {
{"type", OPTION_STRING},
{"all", OPTION_BOOLEAN},
{"prefix", OPTION_STRING},
{"cds", OPTION_BOOLEAN},
{"intronsToo", OPTION_BOOLEAN},
+ {"ignoreStrand", OPTION_BOOLEAN},
{"noLoad", OPTION_BOOLEAN},
{"createOnly", OPTION_BOOLEAN},
{"lookup", OPTION_STRING},
{"geneTableType", OPTION_STRING},
{"tempDb", OPTION_STRING},
{NULL, 0},
};
int bedIntersectRange(struct bed *bed, int rStart, int rEnd)
/* Return intersection of bed with range rStart-rEnd */
{
int bedStart = bed->chromStart;
int overlap = 0;
if (bed->blockCount > 1)
{
int bedIx;
for (bedIx = 0; bedIx < bed->blockCount; ++bedIx)
{
int start = bedStart + bed->chromStarts[bedIx];
int end = start + bed->blockSizes[bedIx];
overlap += positiveRangeIntersection(rStart, rEnd, start, end);
}
}
else
{
overlap = positiveRangeIntersection(rStart, rEnd, bed->chromStart, bed->chromEnd);
}
return overlap;
}
int gpBedOverlap(struct genePred *gp, boolean cdsOnly, boolean intronsToo, struct bed *bed)
/* Return total amount of overlap between gp and bed12. */
{
/* This could be implemented more efficiently, but this
* way is really simple. */
int gpIx;
int overlap = 0;
int geneStart, geneEnd;
if (cdsOnly)
{
geneStart = gp->cdsStart;
geneEnd = gp->cdsEnd;
}
else
{
geneStart = gp->txStart;
geneEnd = gp->txEnd;
}
if (intronsToo)
{
overlap = bedIntersectRange(bed, geneStart, geneEnd);
}
else
{
for (gpIx = 0; gpIx < gp->exonCount; ++gpIx)
{
int gpStart = gp->exonStarts[gpIx];
int gpEnd = gp->exonEnds[gpIx];
if (cdsOnly)
{
if (gpStart < geneStart) gpStart = geneStart;
if (gpEnd > geneEnd) gpEnd = geneEnd;
if (gpStart >= gpEnd)
continue;
}
overlap += bedIntersectRange(bed, gpStart, gpEnd);
}
}
return overlap;
}
struct bed *mostOverlappingBed(struct binKeeper *bk, struct genePred *gp)
/* Find bed in bk that overlaps most with gp. Return NULL if no overlap. */
{
struct bed *bed, *bestBed = NULL;
int overlap, bestOverlap = 0;
struct binElement *el, *elList = binKeeperFind(bk, gp->txStart, gp->txEnd);
for (el = elList; el != NULL; el = el->next)
{
bed = el->val;
overlap = gpBedOverlap(gp, cdsOnly, intronsToo, bed);
if (overlap > bestOverlap)
{
bestOverlap = overlap;
bestBed = bed;
}
}
return bestBed;
}
void outTwo(FILE *f, struct hash *lookupHash, char *key, char *val)
/* Output key/val to tab separated file. If lookupHash is non-null
* then lookup val through it before outputting. */
{
if (lookupHash != NULL)
val = hashFindVal(lookupHash, val);
if (val != NULL)
fprintf(f, "%s\t%s\n", key, val);
}
void oneChromStrandTrackToGene(char *database, struct sqlConnection *conn, struct sqlConnection *tConn,
char *chrom, char strand,
char *geneTable, char *geneTableType,
char *otherTable, char *otherType,
struct hash *dupeHash, boolean doAll, struct hash *lookupHash,
FILE *f)
/* Find most overlapping bed for each genePred in one
* strand of a chromosome, and write it to file. */
{
int chromSize = hChromSize(database, chrom);
struct binKeeper *bk = binKeeperNew(0, chromSize);
struct sqlResult *sr;
char extraBuf[256], *extra = NULL, **row;
int rowOffset;
struct bed *bedList = NULL, *bed;
int bedNum; /* Number of bed fields*/
boolean geneTableBed = FALSE;
int numCols;
if(startsWith("genePred", geneTableType))
geneTableBed = FALSE;
else if(startsWith("bed", geneTableType))
geneTableBed = TRUE;
else
errAbort("%s table type doesn't appear to be bed or genePred (%s instead).", geneTable, geneTableType);
/* Read data into binKeeper. */
if (startsWith("bed", otherType))
{
char *numString = otherType + 3;
bedNum = atoi(numString);
- if (bedNum < 6) /* Just one strand in bed. */
+ if (bedNum < 6 || ignoreStrand) /* Just one strand in bed. */
{
if (strand == '-')
{
binKeeperFree(&bk);
return;
}
}
else
{
safef(extraBuf, sizeof(extraBuf), "strand = '%c'", strand);
extra = extraBuf;
}
sr = hChromQuery(conn, otherTable, chrom, extra, &rowOffset);
while ((row = sqlNextRow(sr)) != NULL)
{
bed = bedLoadN(row+rowOffset, bedNum);
if (prefix == NULL || startsWith(prefix, bed->name))
{
slAddHead(&bedList, bed);
binKeeperAdd(bk, bed->chromStart, bed->chromEnd, bed);
}
}
}
else if (startsWith("genePred", otherType))
{
struct genePred *gp;
bedNum = 12;
+ if (!ignoreStrand)
+ {
safef(extraBuf, sizeof(extraBuf), "strand = '%c'", strand);
extra = extraBuf;
+ }
sr = hChromQuery(conn, otherTable, chrom, extra, &rowOffset);
while ((row = sqlNextRow(sr)) != NULL)
{
gp = genePredLoad(row + rowOffset);
if (prefix == NULL || startsWith(prefix, gp->name))
{
bed = bedFromGenePred(gp);
genePredFree(&gp);
slAddHead(&bedList, bed);
binKeeperAdd(bk, bed->chromStart, bed->chromEnd, bed);
}
}
}
else if (startsWith("psl", otherType))
{
struct psl *psl;
bedNum = 12;
+ if (!ignoreStrand)
+ {
safef(extraBuf, sizeof(extraBuf), "strand = '%c'", strand);
extra = extraBuf;
+ }
sr = hChromQuery(conn, otherTable, chrom, extra, &rowOffset);
while ((row = sqlNextRow(sr)) != NULL)
{
psl = pslLoad(row + rowOffset);
if (prefix == NULL || startsWith(prefix, psl->qName))
{
bed = bedFromPsl(psl);
pslFree(&psl);
slAddHead(&bedList, bed);
binKeeperAdd(bk, bed->chromStart, bed->chromEnd, bed);
}
}
}
else
{
errAbort("%s: unrecognized track type %s", otherTable, otherType);
}
sqlFreeResult(&sr);
/* Scan through gene preds looking for best overlap if any. */
sr = hChromQuery(tConn, geneTable, chrom, extra, &rowOffset);
numCols = sqlCountColumns(sr);
while ((row = sqlNextRow(sr)) != NULL)
{
struct genePred *gp = NULL;
struct bed *bed = NULL;
char *name = NULL;
if(geneTableBed)
{
bed = bedLoadN(row+rowOffset, numCols-rowOffset);
gp = bedToGenePred(bed);
bedFree(&bed);
}
else
gp = genePredLoad(row+rowOffset);
name = gp->name;
+
if (!hashLookup(dupeHash, name)) /* Only take first occurrence. */
{
if (doAll)
{
struct binElement *el, *elList = binKeeperFind(bk, gp->txStart, gp->txEnd);
for (el = elList; el != NULL; el = el->next)
{
bed = el->val;
if (gpBedOverlap(gp, cdsOnly, intronsToo, bed) > 0)
{
outTwo(f, lookupHash, gp->name, bed->name);
hashAdd(dupeHash, name, NULL);
}
}
}
else
{
if ((bed = mostOverlappingBed(bk, gp)) != NULL)
{
outTwo(f, lookupHash, gp->name, bed->name);
hashAdd(dupeHash, name, NULL);
}
}
}
genePredFree(&gp);
}
bedFreeList(&bedList);
binKeeperFree(&bk);
}
void createTable(struct sqlConnection *conn, char *tableName, boolean unique)
/* Create our name/value table, dropping if it already exists. */
{
char *indexType = (unique ? "UNIQUE" : "INDEX");
struct dyString *dy = dyStringNew(512);
dyStringPrintf(dy,
"CREATE TABLE %s (\n"
" name varchar(255) not null,\n"
" value varchar(255) not null,\n"
" #Indices\n"
" %s(name(16)),\n"
" INDEX(value(16))\n"
")\n", tableName, indexType);
sqlRemakeTable(conn, tableName, dy->string);
dyStringFree(&dy);
}
void hgMapTableToGene(char *database, struct sqlConnection *conn, struct sqlConnection *tConn,
char *geneTable, char *geneTableType,
char *otherTable, char *otherType, char *outTable,
struct hash *lookupHash)
/* hgMapTableToGene - Create a table that maps geneTable to otherTable,
* choosing the best single item in otherTable for each genePred. */
{
/* Open tab file and database loop through each chromosome writing to it. */
struct slName *chromList, *chrom;
char *tempDir = ".";
FILE *f;
boolean doAll = optionExists("all");
/* Process each strand of each chromosome into tab-separated file. */
if (!createOnly)
{
struct hash *dupeHash = newHash(16);
f = hgCreateTabFile(tempDir, outTable);
chromList = hAllChromNames(database);
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
{
verbose(2, "%s\n", chrom->name);
oneChromStrandTrackToGene(database, conn, tConn, chrom->name, '+', geneTable, geneTableType,
otherTable, otherType, dupeHash, doAll, lookupHash, f);
oneChromStrandTrackToGene(database, conn, tConn, chrom->name, '-', geneTable, geneTableType,
otherTable, otherType, dupeHash, doAll, lookupHash, f);
}
hashFree(&dupeHash);
}
/* Create and load table from tab file. */
if (createOnly)
{
createTable(tConn, outTable, !doAll);
}
else if (!optionExists("noLoad"))
{
createTable(tConn, outTable, !doAll);
hgLoadTabFile(tConn, tempDir, outTable, &f);
hgRemoveTabFile(tempDir, outTable);
}
}
char *tdbType(struct sqlConnection *conn, char *track)
/* Return track field from TDB. */
{
char query[512];
char *type, typeBuf[128];
safef(query, sizeof(query), "select type from %s where tableName = '%s'", trackDb, track);
type = sqlQuickQuery(conn, query, typeBuf, sizeof(typeBuf));
if (type == NULL)
errAbort("Can't find track %s in trackDb", track);
return cloneString(type);
}
struct hash *hashTwoColumns(char *fileName)
/* Make up a hash out of a two column file. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[2];
struct hash *hash = hashNew(17);
while (lineFileRow(lf, row))
hashAdd(hash, row[0], cloneString(row[1]));
lineFileClose(&lf);
return hash;
}
void hgMapToGene(char *database, char *track, char *geneTrack, char *newTable)
/* hgMapToGene - Map a track to a genePred track.. */
{
char *tempDb = optionVal("tempDb", database);
struct sqlConnection *conn = sqlConnect(database);
struct sqlConnection *tConn = sqlConnect(tempDb);
char *type = optionVal("type", NULL);
char *lookupFile = optionVal("lookup", NULL);
struct hash *lookupHash = NULL;
char *geneTableType = optionVal("geneTableType", NULL);
if (lookupFile != NULL)
lookupHash = hashTwoColumns(lookupFile);
if (type == NULL)
type = tdbType(conn, track);
/* If geneTableType not specified query database. */
if(geneTableType == NULL)
geneTableType = tdbType(conn, geneTrack);
if (!startsWith("genePred", geneTableType) && !startsWith("bed", geneTableType))
errAbort("%s is neither a genePred or bed type track", geneTrack);
hgMapTableToGene(database, conn, tConn, geneTrack, geneTableType, track, type, newTable, lookupHash);
sqlDisconnect(&conn);
sqlDisconnect(&tConn);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
cdsOnly = optionExists("cds");
intronsToo = optionExists("intronsToo");
+ignoreStrand = optionExists("ignoreStrand");
createOnly = optionExists("createOnly");
prefix = optionVal("prefix", NULL);
trackDb = cfgOption("db.trackDb");
if(trackDb == NULL)
trackDb = "trackDb";
if (argc != 5)
usage();
hgMapToGene(argv[1], argv[2], argv[3], argv[4]);
return 0;
}