src/hg/txGraph/txgAddEvidence/txgAddEvidence.c 1.14
1.14 2009/08/13 03:08:30 markd
fixed misuse of AllocVar macro detected by gcc 4.5
Index: src/hg/txGraph/txgAddEvidence/txgAddEvidence.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/txGraph/txgAddEvidence/txgAddEvidence.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -b -B -U 1000000 -r1.13 -r1.14
--- src/hg/txGraph/txgAddEvidence/txgAddEvidence.c 24 Feb 2007 02:26:26 -0000 1.13
+++ src/hg/txGraph/txgAddEvidence/txgAddEvidence.c 13 Aug 2009 03:08:30 -0000 1.14
@@ -1,326 +1,326 @@
/* txgAddEvidence - Add evidence from a bed file to existing transcript graph.. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "geneGraph.h"
#include "txGraph.h"
#include "ggTypes.h"
#include "binRange.h"
#include "txEdgeBed.h"
int maxBleedOver = 6;
boolean bedIsIntron = FALSE;
boolean bedIsHard = FALSE;
char *specificChrom = NULL;
void usage()
/* Explain usage and exit. */
{
errAbort(
"txgAddEvidence - Add evidence from a bed file to existing transcript graph.\n"
"usage:\n"
" txgAddEvidence in.txg in.edges sourceType out.txg\n"
"where in.txg is a tab-separated file in txGraph format\n"
" in.edges is a tab-separated file in txEdgeBed or txEdgeOrtho format\n"
" sourceType is a decriptive word like 'refSeq' 'est' etc.\n"
" out.txt is the output graph with additional info\n"
"options:\n"
" -chrom=chrN - Set to only add edges for given chromosome. Useful\n"
" if in.txg is chromosome specific but in.edges is not.\n"
" -maxBleedOver=N - Maximum amount of exon that can be lost when snapping\n"
" soft to hard edges. Default %d\n"
, maxBleedOver
);
}
static struct optionSpec options[] = {
{"chrom", OPTION_STRING},
{"maxBleedOver", OPTION_STRING},
{NULL, 0},
};
struct minChromSize
/* Associate chromosome and size. */
{
char *chrom; /* Chromosome name, Not alloced here */
int minSize;
};
struct hash *txgChromMinSizeHash(struct txGraph *txgList)
/* Hash full of lower bounds on chromosome sizes, taken
* from tEnd's in txgList. */
{
struct txGraph *txg;
struct hash *sizeHash = hashNew(16);
for (txg = txgList; txg != NULL; txg = txg->next)
{
struct minChromSize *chrom = hashFindVal(sizeHash, txg->tName);
if (chrom == NULL)
{
lmAllocVar(sizeHash->lm, chrom);
chrom->chrom = txg->tName;
chrom->minSize = txg->tEnd;
hashAdd(sizeHash, txg->tName, chrom);
}
else
{
chrom->minSize = max(chrom->minSize, txg->tEnd);
}
}
return sizeHash;
}
struct hash *txgIntoKeeperHash(struct txGraph *txgList)
/* Create a hash full of bin keepers (one for each chromosome or contig.
* The binKeepers are full of txGraphs. */
{
struct hash *sizeHash = txgChromMinSizeHash(txgList);
struct hash *bkHash = hashNew(16);
struct txGraph *txg;
for (txg = txgList; txg != NULL; txg = txg->next)
{
struct binKeeper *bk = hashFindVal(bkHash, txg->tName);
if (bk == NULL)
{
struct minChromSize *chrom = hashMustFindVal(sizeHash, txg->tName);
verbose(2, "New binKeeper for %s\n", txg->tName);
bk = binKeeperNew(0, chrom->minSize);
hashAdd(bkHash, txg->tName, bk);
}
binKeeperAdd(bk, txg->tStart, txg->tEnd, txg);
}
hashFree(&sizeHash);
return bkHash;
}
int getSourceIx(struct txGraph *txg, char *type, char *accession)
/* Search support for type & accession */
{
/* Start search where we got last result for better performance
* when doing things again and again. */
static int lastIx = 0;
if (lastIx < txg->sourceCount)
{
struct txSource *source = &txg->sources[lastIx];
if (sameString(accession, source->accession) && sameString(type, source->type))
return lastIx;
}
/* Search through whole list of sources. */
int ix;
for (ix=0; ix<txg->sourceCount; ++ix)
{
if (ix != lastIx)
{
struct txSource *source = &txg->sources[ix];
if (sameString(accession, source->accession) && sameString(type, source->type))
{
lastIx = ix;
return ix;
}
}
}
return -1;
}
boolean evIsHard(char *type)
/* Return TRUE if txEdgeBed start/end type is hard. */
{
char c = type[0];
return c == '[' || c == ']';
}
boolean vertexIsHard(struct txVertex *vertex)
/* Return true if a hard start or hard end. */
{
unsigned char c = vertex->type;
return c == ggHardStart || c == ggHardEnd;
}
int evEdgeScore(struct txGraph *txg, struct txEdge *edge,
struct txEdgeBed *ev)
/* Return a score for edge/evidence match up. There's two main
* factors in the score: the extent of the overlap, and the
* agreement of the hardness/softness of the ends. */
{
/* Check that introns go with introns and exons with exons. */
if ((int)edge->type != (int)ev->type)
{
return 0;
}
/* Unpack some of the input data structures into local variables
* and see if we even overlap. */
struct txVertex *startVertex = &txg->vertices[edge->startIx];
struct txVertex *endVertex = &txg->vertices[edge->endIx];
int edgeStart = startVertex->position;
int edgeEnd = endVertex->position;
int evStart = ev->chromStart;
int evEnd = ev->chromEnd;
int overlapSize = rangeIntersection(edgeStart, edgeEnd, evStart, evEnd);
if (overlapSize <= 0)
{
return 0;
}
/* Figure out hardness/softness of starts and ends of both evidence
* and edge. */
boolean edgeHardStart = vertexIsHard(startVertex);
boolean edgeHardEnd = vertexIsHard(endVertex);
boolean evHardStart = evIsHard(ev->startType);
boolean evHardEnd = evIsHard(ev->endType);
/* Go through 4 possible cases at start */
int startBonus = 0;
if (evHardStart && edgeHardStart)
{
if (evStart != edgeStart)
return 0;
startBonus = 2000000;
}
else if (evHardStart && !edgeHardStart)
{
if (evStart - edgeStart > maxBleedOver)
return 0;
}
else if (!evHardStart && edgeHardStart)
{
if (edgeStart - evStart > maxBleedOver)
return 0;
}
else if (!evHardStart && !edgeHardStart)
{
/* Both soft, no real constraints */
}
/* Go through 4 possible cases at end */
int endBonus = 0;
if (evHardEnd && edgeHardEnd)
{
if (evEnd != edgeEnd)
return 0;
endBonus = 2000000;
}
else if (evHardEnd && !edgeHardEnd)
{
if (edgeEnd - evEnd > maxBleedOver)
return 0;
}
else if (!evHardEnd && edgeHardEnd)
{
if (evEnd - edgeEnd > maxBleedOver)
return 0;
}
else if (!evHardEnd && !edgeHardEnd)
{
/* Both soft, no real constraints */
}
return startBonus + endBonus + overlapSize;
}
void addEvidenceToGraph(struct txGraph *txg, struct txEdgeBed *ev,
char *sourceType)
/* Search through txg, and if can find an edge that agrees with
* ev, add ev to best agreeing edge. */
{
struct txEdge *edge, *bestEdge = NULL;
int score, bestScore = 0;
for (edge = txg->edgeList; edge != NULL; edge = edge->next)
{
score = evEdgeScore(txg, edge, ev);
if (score > bestScore)
{
bestScore = score;
bestEdge = edge;
}
}
if (bestEdge != NULL)
{
verbose(2, "Adding evidence at %s:%d-%d to edge of %s\n",
txg->tName, ev->chromStart, ev->chromEnd, txg->name);
char *accession = ev->name;
int sourceIx = getSourceIx(txg, sourceType, accession);
if (sourceIx < 0)
{
sourceIx = txg->sourceCount;
- txg->sources = ExpandArray(txg->sources, sourceIx, txg->sourceCount+1);
+ ExpandArray(txg->sources, sourceIx, txg->sourceCount+1);
struct txSource *source = &txg->sources[sourceIx];
source->type = cloneString(sourceType);
source->accession = cloneString(accession);
txg->sourceCount = sourceIx + 1;
}
struct txEvidence *tev;
AllocVar(tev);
tev->sourceId = sourceIx;
tev->start = ev->chromStart;
tev->end = ev->chromEnd;
slAddTail(&bestEdge->evList, tev);
bestEdge->evCount += 1;
}
}
void addEvidence(struct txEdgeBed *evList, char *sourceType, struct hash *bkHash)
/* Input is a list of additional evidence in evList, and a hash
* full of binKeepers full of txGraphs. */
{
struct txEdgeBed *ev;
for (ev = evList; ev != NULL; ev = ev->next)
{
if (specificChrom != NULL && !sameString(ev->chrom, specificChrom))
continue;
verbose(2, "Processing %s %s:%d-%d\n", ev->name, ev->chrom, ev->chromStart, ev->chromEnd);
struct binKeeper *bk = hashMustFindVal(bkHash, ev->chrom);
struct binElement *bel, *belList = binKeeperFind(bk, ev->chromStart, ev->chromEnd);
verbose(3, "%s hits %d txg\n", ev->name, slCount(belList));
for (bel = belList; bel != NULL; bel = bel->next)
{
struct txGraph *txg = bel->val;
verbose(3, " txg %s %s:%d-%d\n", txg->name, txg->tName, txg->tStart, txg->tEnd);
if (ev->strand[0] == 0 || ev->strand[0] == '.'
||sameString(txg->strand, ev->strand))
{
addEvidenceToGraph(txg, ev, sourceType);
}
}
}
}
void txgAddEvidence(char *txgIn, char *bedIn, char *sourceType, char *txgOut)
/* txgAddEvidence - Add evidence from a bed file to existing transcript graph.. */
{
struct txGraph *txgList = txGraphLoadAll(txgIn);
verbose(2, "%d elements in txgList\n", slCount(txgList));
struct txEdgeBed *bedList = txEdgeBedLoadAll(bedIn);
verbose(2, "%d elements in bedList\n", slCount(bedList));
FILE *f = mustOpen(txgOut, "w");
verbose(1, "Adding %s evidence from %s to %s. Result is in %s.\n",
sourceType, bedIn, txgIn, txgOut);
struct hash *bkHash = txgIntoKeeperHash(txgList);
verbose(2, "Loaded into keeper hash of %d elements\n", bkHash->elCount);
addEvidence(bedList, sourceType, bkHash);
verbose(2, "Added evidence.\n");
struct txGraph *txg;
for (txg = txgList; txg != NULL; txg = txg->next)
{
txGraphTabOut(txg, f);
}
carefulClose(&f);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
specificChrom = optionVal("chrom", NULL);
maxBleedOver = optionInt("maxBleedOver", maxBleedOver);
if (argc != 5)
usage();
txgAddEvidence(argv[1], argv[2], argv[3], argv[4]);
return 0;
}