src/hg/blastToPsl/pslBuild.c 1.2
1.2 2010/04/16 17:32:12 markd
add options to convert TBLASTN to codon nucleotide/nucleotide coordinates, fixed bug with tStart/tEnd extending beyond actual range
Index: src/hg/blastToPsl/pslBuild.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/blastToPsl/pslBuild.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/blastToPsl/pslBuild.c 8 Feb 2010 03:04:23 -0000 1.1
+++ src/hg/blastToPsl/pslBuild.c 16 Apr 2010 17:32:12 -0000 1.2
@@ -1,284 +1,302 @@
/* object using in building a PSL from a blast record */
#include "common.h"
#include "pslBuild.h"
#include "psl.h"
/* score file header */
char *pslBuildScoreHdr = "#strand\tqName\tqStart\tqEnd\ttName\ttStart\ttEnd\tbitScore\teVal\n";
unsigned pslBuildGetBlastAlgo(char *program)
/* determine blast algorithm flags */
{
if (sameWord(program, "BLASTN"))
return blastn;
else if (sameWord(program, "BLASTP"))
return blastp;
else if (sameWord(program, "BLASTX"))
return blastx;
else if (sameWord(program, "TBLASTN"))
return tblastn;
else if (sameWord(program, "TBLASTX"))
return tblastx;
else if (sameWord(program, "PSIBLAST"))
return psiblast;
else
errAbort("unknown BLAST program: \"%s\"", program);
return 0;
}
struct block
/* coordinates of a block use during build */
{
char *qAln; /* alignment strings (not owned) */
char *tAln;
- int qStart; /* Query start/end positions. */
+ int qStart; /* Query start/end positions, in blast units */
int qEnd;
- int tStart; /* Target start/end position. */
+ int tStart; /* Target start/end position, in blast units */
int tEnd;
int qLetMult; /* each letter counts as this many units */
int tLetMult;
- int qSizeMult; /* user to convert coordinates */
- int tSizeMult;
+ int qCoordMult; /* used to convert coordinates */
+ int tCoordMult;
+ int q2tBlkSizeMult; /* convert query block size to target block size */
int countMult; /* column count for match/mismatch */
int alnStart; /* start/end in alignment */
int alnEnd;
};
static void blkInit(struct block *blk, int qStart, char *qAln, int tStart, char *tAln, unsigned flags)
/* initialize a block object */
{
ZeroVar(blk);
blk->qAln = qAln;
blk->tAln = tAln;
// initialize ends to starts for first call to nextUngappedBlk
blk->qEnd = qStart;
blk->tEnd = tStart;
-if (flags & tblastn)
+if (flags & cnvNucCoords)
{
+ // DNA/DNA PSL from protein/DNA align
+ assert(flags & tblastn);
blk->qLetMult = 1;
- blk->qSizeMult = 1;
+ blk->qCoordMult = 3;
blk->tLetMult = 3;
- blk->tSizeMult = 3;
+ blk->tCoordMult = 1;
+ blk->q2tBlkSizeMult = 3;
+ blk->countMult = 3;
+ }
+else if (flags & tblastn)
+ {
+ // protein/DNA PSL
+ blk->qLetMult = 1;
+ blk->qCoordMult = 1;
+ blk->tLetMult = 3;
+ blk->tCoordMult = 1;
+ blk->q2tBlkSizeMult = 3;
blk->countMult = 1;
}
else if (flags & tblastx)
{
blk->qLetMult = 3;
- blk->qSizeMult = 1;
+ blk->qCoordMult = 1;
blk->tLetMult = 3;
- blk->tSizeMult = 1;
+ blk->tCoordMult = 1;
+ blk->q2tBlkSizeMult = 1;
blk->countMult = 3;
}
else
{
blk->qLetMult = 1;
- blk->qSizeMult = 1;
+ blk->qCoordMult = 1;
blk->tLetMult = 1;
- blk->tSizeMult = 1;
+ blk->tCoordMult = 1;
+ blk->q2tBlkSizeMult = 1;
blk->countMult = 1;
}
}
static boolean isProteinSeqs(unsigned flags)
/* do the flags indicate a protein sequences in alignments */
{
return (flags & (blastp|blastx|tblastn|tblastx)) != 0;
}
static boolean nextUngappedBlk(struct block* blk)
/* Find the next ungapped block in a blast alignment, in [0..n) coords in mrna
* space. blk qStartNext and tStartNext should be initialize to
*/
{
// initBlk set this up for first call, subsequent start where previous left off
blk->qStart = blk->qEnd;
blk->tStart = blk->tEnd;
blk->alnStart = blk->alnEnd;
/* find start of next aligned block */
char *qPtr = blk->qAln + blk->alnStart;
char *tPtr = blk->tAln + blk->alnStart;
while ((*qPtr != '\0') && (*tPtr != '\0') && ((*qPtr == '-') || (*tPtr == '-')))
{
if (*qPtr != '-')
blk->qStart += blk->qLetMult;
if (*tPtr != '-')
blk->tStart += blk->tLetMult;
qPtr++;
tPtr++;
blk->alnStart++;
}
blk->qEnd = blk->qStart;
blk->tEnd = blk->tStart;
blk->alnEnd = blk->alnStart;
if ((*qPtr == '\0') || (*tPtr == '\0'))
{
assert((*qPtr == '\0') && (*tPtr == '\0'));
return FALSE; /* no more */
}
/* find end of aligned block */
while ((*qPtr != '\0') && (*tPtr != '\0')
&& (*qPtr != '-') && (*tPtr != '-'))
{
blk->qEnd += blk->qLetMult;
blk->tEnd += blk->tLetMult;
qPtr++;
tPtr++;
blk->alnEnd++;
}
-assert((blk->tSizeMult * (blk->qEnd - blk->qStart)) == (blk->qSizeMult * (blk->tEnd - blk->tStart)));
+// sanity test for blast blocks being the same length after conversion to the same units
+assert((blk->tLetMult * (blk->qEnd - blk->qStart)) == (blk->qLetMult * (blk->tEnd - blk->tStart)));
return TRUE;
}
static void addUngappedBlock(struct psl* psl, int* pslSpace, struct block* blk, unsigned flags)
/* add the next ungapped block to a psl */
{
unsigned newIBlk = psl->blockCount;
-unsigned blkSize = blk->qSizeMult * (blk->qEnd - blk->qStart);
+unsigned blkSize = blk->qEnd - blk->qStart; // uses query size so protein psl is right
if (newIBlk >= *pslSpace)
pslGrow(psl, pslSpace);
-psl->qStarts[newIBlk] = blk->qStart;
-psl->tStarts[newIBlk] = blk->tStart;
-/* uses query size so protein psl is right */
-psl->blockSizes[newIBlk] = blkSize;
+psl->qStarts[newIBlk] = blk->qCoordMult * blk->qStart;
+psl->tStarts[newIBlk] = blk->tCoordMult * blk->tStart;
+psl->blockSizes[newIBlk] = blk->qCoordMult * blkSize;
/* keep bounds current */
psl->qStart = psl->qStarts[0];
-psl->qEnd = psl->qStarts[newIBlk] + (blk->qSizeMult * psl->blockSizes[newIBlk]);
+psl->qEnd = psl->qStarts[newIBlk] + (blk->qCoordMult * blkSize);
if (psl->strand[0] == '-')
reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize);
psl->tStart = psl->tStarts[0];
-psl->tEnd = psl->tStarts[newIBlk] + (blk->tSizeMult * psl->blockSizes[newIBlk]);
+psl->tEnd = psl->tStarts[newIBlk] + (blk->q2tBlkSizeMult * blkSize);
if (psl->strand[1] == '-')
reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize);
if (flags & bldPslx)
{
psl->qSequence[newIBlk] = cloneStringZ(blk->qAln + blk->alnStart, blkSize);
psl->tSequence[newIBlk] = cloneStringZ(blk->tAln + blk->alnStart, blkSize);
}
psl->blockCount++;
}
static void countIndels(struct psl *psl)
/* update indel counts in psl after adding a block */
{
if (psl->blockCount > 1)
{
int iBlk = psl->blockCount - 1;
if (pslQEnd(psl, iBlk-1) != psl->qStarts[iBlk])
{
/* insert in query */
psl->qNumInsert++;
psl->qBaseInsert += (psl->qStarts[iBlk] - pslQEnd(psl, iBlk-1));
}
if (pslTEnd(psl, iBlk-1) != psl->tStarts[iBlk])
{
/* insert in target */
psl->tNumInsert++;
psl->tBaseInsert += (psl->tStarts[iBlk] - pslTEnd(psl, iBlk-1));
}
}
}
static void countMatches(struct psl* psl, struct block* blk, unsigned flags)
/* update the PSL match/mismatch after adding a block. */
{
int i, alnLen = (blk->alnEnd - blk->alnStart);
char *qPtr = blk->qAln + blk->alnStart;
char *tPtr = blk->tAln + blk->alnStart;
boolean isProt = isProteinSeqs(flags);
for (i = 0; i < alnLen; i++, qPtr++, tPtr++)
{
if ((!isProt && ((*qPtr == 'N') || (*tPtr == 'N')))
|| (isProt && ((*qPtr == 'X') || (*tPtr == 'X'))))
psl->repMatch += blk->countMult;
else if (*qPtr == *tPtr)
psl->match += blk->countMult;
else
psl->misMatch += blk->countMult;
}
}
static void hspToBlocks(struct psl *psl, int *pslSpace, struct block *blk, unsigned flags)
/* build PSl blocks from an HSP */
{
/* fill in ungapped blocks */
while (nextUngappedBlk(blk))
{
addUngappedBlock(psl, pslSpace, blk, flags);
countIndels(psl);
countMatches(psl, blk, flags);
}
assert(blk->qStart == blk->qEnd);
assert(blk->tStart == blk->tEnd);
// FIXME
//assert(blk->qStart == pslQEnd(psl, psl->blockCount-1));
//assert(blk->tStart == pslTEnd(psl, psl->blockCount-1));
}
static void makeUntranslated(struct psl* psl)
/* convert a PSL so it is in the untranslated form produced by blat */
{
if (psl->strand[1] == '-')
{
/* swap around blocks so it's query that is reversed */
int i;
for (i = 0; i < psl->blockCount; i++)
{
psl->tStarts[i] = psl->tSize - (psl->tStarts[i] + psl->blockSizes[i]);
psl->qStarts[i] = psl->qSize - (psl->qStarts[i] + psl->blockSizes[i]);
}
reverseUnsigned(psl->tStarts, psl->blockCount);
reverseUnsigned(psl->qStarts, psl->blockCount);
reverseUnsigned(psl->blockSizes, psl->blockCount);
/* fix strand, +- now -, -- now + */
psl->strand[0] = (psl->strand[0] == '+') ? '-' : '+';
}
psl->strand[1] = '\0';
}
static void finishPsl(struct psl* psl, unsigned flags)
/* put finishing touches on a psl */
{
if ((flags & tblastn) == 0)
makeUntranslated(psl);
}
struct psl *pslBuildFromHsp(char *qName, int qSize, int qStart, int qEnd, char qStrand, char *qAln,
char *tName, int tSize, int tStart, int tEnd, char tStrand, char *tAln,
unsigned flags)
/* construct a new psl from an HSP. Chaining is left to other programs. */
{
if ((flags & tblastx) && (flags & bldPslx))
errAbort("can't convert TBLASTX to PSLX");
+struct block blk;
+blkInit(&blk, qStart, qAln, tStart, tAln, flags);
+
// construct psl
int pslSpace = 8;
char strand[3];
safef(strand, sizeof(strand), "%c%c", qStrand, tStrand);
-struct psl *psl = pslNew(qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd,
+struct psl *psl = pslNew(qName, blk.qCoordMult * qSize, blk.qCoordMult * qStart, blk.qCoordMult * qEnd,
+ tName, blk.tCoordMult * tSize, blk.tCoordMult * tStart, blk.tCoordMult * tEnd,
strand, pslSpace, ((flags & bldPslx) ? PSL_XA_FORMAT : 0));
// fill in psl
-struct block blk;
-blkInit(&blk, qStart, qAln, tStart, tAln, flags);
hspToBlocks(psl, &pslSpace, &blk, flags);
finishPsl(psl, flags);
return psl;
}
void pslBuildWriteScores(FILE* scoreFh, struct psl *psl, double bitScore, double eValue)
/* write scores for a PSL */
{
fprintf(scoreFh, "%s\t%s\t%d\t%d\t%s\t%d\t%d\t%g\t%g\n", psl->strand,
psl->qName, psl->qStart, psl->qEnd, psl->tName, psl->tStart, psl->tEnd,
bitScore, eValue);
}