src/lib/pslTransMap.c 1.8
1.8 2009/11/02 05:17:38 markd
set t/q insert stats in mapped psl
Index: src/lib/pslTransMap.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/pslTransMap.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 1000000 -r1.7 -r1.8
--- src/lib/pslTransMap.c 11 Mar 2008 21:19:31 -0000 1.7
+++ src/lib/pslTransMap.c 2 Nov 2009 05:17:38 -0000 1.8
@@ -1,311 +1,325 @@
/* pslTransMap - transitive mapping of an alignment another sequence, via a
* common alignment */
#include "common.h"
#include "pslTransMap.h"
#include "psl.h"
/*
* Notes:
* - This code is used with both large and small mapping psls. Large
* psls used for doing cross-species mappings and small psl are used for
* doing protein to mRNA mappings. This introduces some speed issues. For
* chain mapping, a large amount of time is spent in getBlockMapping()
* searching for the starting point of a mapping. However an optimization
* to find the starting point, such as a binKeeper, could be inefficient
* for a large number of small psls. Implementing such an optimzation
* would have to be dependent on the type of mapping. The code was made
* 16x faster for genome mappings by remembering the current location in
* the mapping psl between blocks (iMapBlkPtr arg). This will do for a
* while.
*/
struct block
/* coordinates of a block */
{
int qStart; /* Query start position. */
int qEnd; /* Query end position. */
int tStart; /* Target start position. */
int tEnd; /* Target end position. */
};
static void pslProtToNA(struct psl *psl)
/* convert a protein/NA alignment to a NA/NA alignment */
{
int iBlk;
psl->qStart *= 3;
psl->qEnd *= 3;
psl->qSize *= 3;
for (iBlk = 0; iBlk < psl->blockCount; iBlk++)
{
psl->blockSizes[iBlk] *= 3;
psl->qStarts[iBlk] *= 3;
}
}
static void pslNAToProt(struct psl *psl)
/* undo pslProtToNA */
{
int iBlk;
psl->qStart /= 3;
psl->qEnd /= 3;
psl->qSize /= 3;
for (iBlk = 0; iBlk < psl->blockCount; iBlk++)
{
psl->blockSizes[iBlk] /= 3;
psl->qStarts[iBlk] /= 3;
}
}
static struct psl* createMappedPsl(struct psl* inPsl, struct psl *mapPsl,
int mappedPslMax)
/* setup a PSL for the output alignment */
{
char strand[3];
assert(pslTStrand(inPsl) == pslQStrand(mapPsl));
/* strand can be taken from both alignments, since common sequence is in same
* orientation. */
strand[0] = pslQStrand(inPsl);
strand[1] = pslTStrand(mapPsl);
strand[2] = '\n';
return pslNew(inPsl->qName, inPsl->qSize, 0, 0,
mapPsl->tName, mapPsl->tSize, 0, 0,
strand, mappedPslMax, 0);
}
static struct block blockFromPslBlock(struct psl* psl, int iBlock)
/* fill in a block object from a psl block */
{
struct block block;
block.qStart = psl->qStarts[iBlock];
block.qEnd = psl->qStarts[iBlock] + psl->blockSizes[iBlock];
block.tStart = psl->tStarts[iBlock];
block.tEnd = psl->tStarts[iBlock] + psl->blockSizes[iBlock];
return block;
}
static void addPslBlock(struct psl* psl, struct block* blk, int* pslMax)
/* add a block to a psl */
{
unsigned newIBlk = psl->blockCount;
assert((blk->qEnd - blk->qStart) == (blk->tEnd - blk->tStart));
if (newIBlk >= *pslMax)
pslGrow(psl, pslMax);
psl->qStarts[newIBlk] = blk->qStart;
psl->tStarts[newIBlk] = blk->tStart;
psl->blockSizes[newIBlk] = blk->qEnd - blk->qStart;
/* lie about match counts. */
psl->match += psl->blockSizes[newIBlk];
+/* count gaps */
+if (newIBlk > 0)
+ {
+ if (psl->qStarts[newIBlk] > pslQEnd(psl, newIBlk-1))
+ {
+ psl->qNumInsert++;
+ psl->qBaseInsert += psl->qStarts[newIBlk] - pslQEnd(psl, newIBlk-1);
+ }
+ if (psl->tStarts[newIBlk] > pslTEnd(psl, newIBlk-1))
+ {
+ psl->tNumInsert++;
+ psl->tBaseInsert += psl->tStarts[newIBlk] - pslTEnd(psl, newIBlk-1);
+ }
+ }
psl->blockCount++;
}
static void setPslBounds(struct psl* mappedPsl)
/* set sequences bounds on mapped PSL */
{
int lastBlk = mappedPsl->blockCount-1;
/* set start/end of sequences */
mappedPsl->qStart = mappedPsl->qStarts[0];
mappedPsl->qEnd = mappedPsl->qStarts[lastBlk] + mappedPsl->blockSizes[lastBlk];
if (pslQStrand(mappedPsl) == '-')
reverseIntRange(&mappedPsl->qStart, &mappedPsl->qEnd, mappedPsl->qSize);
mappedPsl->tStart = mappedPsl->tStarts[0];
mappedPsl->tEnd = mappedPsl->tStarts[lastBlk] + mappedPsl->blockSizes[lastBlk];
if (pslTStrand(mappedPsl) == '-')
reverseIntRange(&mappedPsl->tStart, &mappedPsl->tEnd, mappedPsl->tSize);
}
static void adjustOrientation(unsigned opts, struct psl *inPsl, char *inPslOrigStrand,
struct psl* mappedPsl)
/* adjust strand, possibly reverse complementing, based on
* pslTransMapKeepTrans option and input psls. */
{
if (!(opts&pslTransMapKeepTrans) || ((inPslOrigStrand[1] == '\0') && (mappedPsl->strand[1] == '\0')))
{
/* make untranslated */
if (pslTStrand(mappedPsl) == '-')
pslRc(mappedPsl);
mappedPsl->strand[1] = '\0'; /* implied target strand */
}
}
static struct block getBeforeBlockMapping(unsigned mqStart, unsigned mqEnd,
struct block* align1Blk)
/* map part of an ungapped psl block that occurs before a mapPsl block */
{
struct block mappedBlk;
/* mRNA start in genomic gap before this block, this will
* be an inPsl insert */
unsigned size = (align1Blk->tEnd < mqStart)
? (align1Blk->tEnd - align1Blk->tStart)
: (mqStart - align1Blk->tStart);
ZeroVar(&mappedBlk);
mappedBlk.qStart = align1Blk->qStart;
mappedBlk.qEnd = align1Blk->qStart + size;
return mappedBlk;
}
static struct block getOverBlockMapping(unsigned mqStart, unsigned mqEnd,
unsigned mtStart, struct block* align1Blk)
/* map part of an ungapped psl block that overlapps a mapPsl block. */
{
struct block mappedBlk;
/* common sequence start contained in this block, this handles aligned
* and genomic inserts */
unsigned off = align1Blk->tStart - mqStart;
unsigned size = (align1Blk->tEnd > mqEnd)
? (mqEnd - align1Blk->tStart)
: (align1Blk->tEnd - align1Blk->tStart);
ZeroVar(&mappedBlk);
mappedBlk.qStart = align1Blk->qStart;
mappedBlk.qEnd = align1Blk->qStart + size;
mappedBlk.tStart = mtStart + off;
mappedBlk.tEnd = mtStart + off + size;
return mappedBlk;
}
static struct block getBlockMapping(struct psl* inPsl, struct psl *mapPsl,
int *iMapBlkPtr, struct block* align1Blk)
/* Map part or all of a ungapped psl block to the genome. This returns the
* coordinates of the sub-block starting at the beginning of the align1Blk.
* If this is a gap, either the target or query coords are zero. This works
* in genomic strand space. The search starts at the specified map block,
* which is updated to prevent rescanning large psls.
*/
{
int iBlk;
struct block mappedBlk;
/* search for block or gap containing start of mrna block */
for (iBlk = *iMapBlkPtr; iBlk < mapPsl->blockCount; iBlk++)
{
unsigned mqStart = mapPsl->qStarts[iBlk];
unsigned mqEnd = mapPsl->qStarts[iBlk]+mapPsl->blockSizes[iBlk];
if (align1Blk->tStart < mqStart)
{
*iMapBlkPtr = iBlk;
return getBeforeBlockMapping(mqStart, mqEnd, align1Blk);
}
if ((align1Blk->tStart >= mqStart) && (align1Blk->tStart < mqEnd))
{
*iMapBlkPtr = iBlk;
return getOverBlockMapping(mqStart, mqEnd, mapPsl->tStarts[iBlk], align1Blk);
}
}
/* reached the end of the mRNA->genome alignment, finish off the
* rest of the the protein as an insert */
ZeroVar(&mappedBlk);
mappedBlk.qStart = align1Blk->qStart;
mappedBlk.qEnd = align1Blk->qEnd;
*iMapBlkPtr = iBlk;
return mappedBlk;
}
static boolean mapBlock(struct psl *inPsl, struct psl *mapPsl, int *iMapBlkPtr,
struct block *align1Blk, struct psl* mappedPsl,
int* mappedPslMax)
/* Add a PSL block from a ungapped portion of an alignment, mapping it to the
* genome. If the started of the inPsl block maps to a part of the mapPsl
* that is aligned, it is added as a PSL block, otherwise the gap is skipped.
* Block starts are adjusted for next call. Return FALSE if the end of the
* alignment is reached. */
{
struct block mappedBlk;
unsigned size;
assert(align1Blk->qStart <= align1Blk->qEnd);
assert(align1Blk->tStart <= align1Blk->tEnd);
assert((align1Blk->qEnd - align1Blk->qStart) == (align1Blk->tEnd - align1Blk->tStart));
if ((align1Blk->qStart >= align1Blk->qEnd) || (align1Blk->tStart >= align1Blk->tEnd))
return FALSE; /* end of ungapped block. */
/* find block or gap with start coordinates of mrna */
mappedBlk = getBlockMapping(inPsl, mapPsl, iMapBlkPtr, align1Blk);
if ((mappedBlk.qEnd != 0) && (mappedBlk.tEnd != 0))
addPslBlock(mappedPsl, &mappedBlk, mappedPslMax);
/* advance past block or gap */
size = (mappedBlk.qEnd != 0)
? (mappedBlk.qEnd - mappedBlk.qStart)
: (mappedBlk.tEnd - mappedBlk.tStart);
align1Blk->qStart += size;
align1Blk->tStart += size;
return TRUE;
}
struct psl* pslTransMap(unsigned opts, struct psl *inPsl, struct psl *mapPsl)
/* map a psl via a mapping psl, a single psl is returned, or NULL if it
* couldn't be mapped. */
{
int mappedPslMax = 8; /* allocated space in output psl */
int iMapBlk = 0;
char inPslOrigStrand[3];
boolean rcInPsl = (pslTStrand(inPsl) != pslQStrand(mapPsl));
boolean cnv1 = (pslIsProtein(inPsl) && !pslIsProtein(mapPsl));
boolean cnv2 = (pslIsProtein(mapPsl) && !pslIsProtein(inPsl));
int iBlock;
struct psl* mappedPsl;
/* sanity check size, but allow names to vary to allow ids to have
* unique-ifying suffixes. */
if (inPsl->tSize != mapPsl->qSize)
errAbort("Error: inPsl %s tSize (%d) != mapPsl %s qSize (%d)",
inPsl->tName, inPsl->tSize, mapPsl->qName, mapPsl->qSize);
/* convert protein PSLs */
if (cnv1)
pslProtToNA(inPsl);
if (cnv2)
pslProtToNA(mapPsl);
/* need to ensure common sequence is in same orientation, save strand for later */
safef(inPslOrigStrand, sizeof(inPslOrigStrand), "%s", inPsl->strand);
if (rcInPsl)
pslRc(inPsl);
mappedPsl = createMappedPsl(inPsl, mapPsl, mappedPslMax);
/* Fill in ungapped blocks. */
for (iBlock = 0; iBlock < inPsl->blockCount; iBlock++)
{
struct block align1Blk = blockFromPslBlock(inPsl, iBlock);
while (mapBlock(inPsl, mapPsl, &iMapBlk, &align1Blk, mappedPsl,
&mappedPslMax))
continue;
}
/* finish up psl, or free if no blocks were added */
assert(mappedPsl->blockCount <= mappedPslMax);
if (mappedPsl->blockCount == 0)
pslFree(&mappedPsl); /* nothing made it */
else
{
setPslBounds(mappedPsl);
adjustOrientation(opts, inPsl, inPslOrigStrand, mappedPsl);
}
/* restore input */
if (rcInPsl)
{
pslRc(inPsl);
strcpy(inPsl->strand, inPslOrigStrand);
}
if (cnv1)
pslNAToProt(inPsl);
if (cnv2)
pslNAToProt(mapPsl);
return mappedPsl;
}