src/lib/psl.c 1.81
1.81 2009/05/31 07:28:33 markd
don't modify input sequences when converting alignments
Index: src/lib/psl.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/psl.c,v
retrieving revision 1.80
retrieving revision 1.81
diff -b -B -U 1000000 -r1.80 -r1.81
--- src/lib/psl.c 19 May 2008 18:14:36 -0000 1.80
+++ src/lib/psl.c 31 May 2009 07:28:33 -0000 1.81
@@ -1,1992 +1,1947 @@
/* psl.c was originally generated by the autoSql program, which also
* generated as_psl.h and as_psl.sql. This module links the database and the RAM
* representation of objects.
*
* This file is copyright 2002 Jim Kent, but license is hereby
* granted for all use - public, private or commercial. */
#include "common.h"
#include "sqlNum.h"
#include "sqlList.h"
#include "localmem.h"
#include "psl.h"
#include "hash.h"
#include "linefile.h"
#include "dnaseq.h"
#include "dystring.h"
#include "fuzzyFind.h"
#include "aliType.h"
#include "binRange.h"
#include "rangeTree.h"
static char const rcsid[] = "$Id$";
static char *createString =
"CREATE TABLE %s (\n"
"%s" /* Optional bin */
"matches int unsigned not null, # Number of bases that match that aren't repeats\n"
"misMatches int unsigned not null, # Number of bases that don't match\n"
"repMatches int unsigned not null, # Number of bases that match but are part of repeats\n"
"nCount int unsigned not null, # Number of 'N' bases\n"
"qNumInsert int unsigned not null, # Number of inserts in query\n"
"qBaseInsert int unsigned not null, # Number of bases inserted in query\n"
"tNumInsert int unsigned not null, # Number of inserts in target\n"
"tBaseInsert int unsigned not null, # Number of bases inserted in target\n"
"strand char(2) not null, # + or - for strand. First character is query, second is target.\n"
"qName varchar(255) not null, # Query sequence name\n"
"qSize int unsigned not null, # Query sequence size\n"
"qStart int unsigned not null, # Alignment start position in query\n"
"qEnd int unsigned not null, # Alignment end position in query\n"
"tName varchar(255) not null, # Target sequence name\n"
"tSize int unsigned not null, # Target sequence size\n"
"tStart int unsigned not null, # Alignment start position in target\n"
"tEnd int unsigned not null, # Alignment end position in target\n"
"blockCount int unsigned not null, # Number of blocks in alignment\n"
"blockSizes longblob not null, # Size of each block\n"
"qStarts longblob not null, # Start of each block in query.\n"
"tStarts longblob not null, # Start of each block in target.\n";
static char *indexString =
"#Indices\n"
"%s" /* Optional bin. */
"INDEX(qName(12))\n"
")\n";
struct psl *pslxLoad(char **row)
/* Load a psl from row fetched with select * from psl
* from database. Dispose of this with pslFree(). */
{
struct psl *ret = pslLoad(row);
int retSize;
sqlStringDynamicArray(row[21],&ret->qSequence, &retSize);
sqlStringDynamicArray(row[22],&ret->tSequence, &retSize);
return ret;
}
struct psl *pslLoad(char **row)
/* Load a psl from row fetched with select * from psl
* from database. Dispose of this with pslFree(). */
{
struct psl *ret;
int sizeOne;
AllocVar(ret);
ret->blockCount = sqlUnsigned(row[17]);
ret->match = sqlUnsigned(row[0]);
ret->misMatch = sqlUnsigned(row[1]);
ret->repMatch = sqlUnsigned(row[2]);
ret->nCount = sqlUnsigned(row[3]);
ret->qNumInsert = sqlUnsigned(row[4]);
ret->qBaseInsert = sqlSigned(row[5]);
ret->tNumInsert = sqlUnsigned(row[6]);
ret->tBaseInsert = sqlSigned(row[7]);
strcpy(ret->strand, row[8]);
ret->qName = cloneString(row[9]);
ret->qSize = sqlUnsigned(row[10]);
ret->qStart = sqlUnsigned(row[11]);
ret->qEnd = sqlUnsigned(row[12]);
ret->tName = cloneString(row[13]);
ret->tSize = sqlUnsigned(row[14]);
ret->tStart = sqlUnsigned(row[15]);
ret->tEnd = sqlUnsigned(row[16]);
sqlUnsignedDynamicArray(row[18], &ret->blockSizes, &sizeOne);
if (sizeOne != ret->blockCount)
{
printf("sizeOne bloxksizes %d bs %d block=%s\n",sizeOne, ret->blockCount,row[18]);
}
assert(sizeOne == ret->blockCount);
sqlUnsignedDynamicArray(row[19], &ret->qStarts, &sizeOne);
if (sizeOne != ret->blockCount)
{
printf("sizeOne qStarts %d bs %d\n",sizeOne, ret->blockCount);
}
assert(sizeOne == ret->blockCount);
sqlUnsignedDynamicArray(row[20], &ret->tStarts, &sizeOne);
if (sizeOne != ret->blockCount)
{
printf("sizeOne tStarts %d bs %d\n",sizeOne, ret->blockCount);
}
assert(sizeOne == ret->blockCount);
return ret;
}
struct psl *pslCommaIn(char **pS, struct psl *ret)
/* Create a psl out of a comma separated string.
* This will fill in ret if non-null, otherwise will
* return a new psl */
{
char *s = *pS;
int i;
if (ret == NULL)
AllocVar(ret);
ret->match = sqlUnsignedComma(&s);
ret->misMatch = sqlUnsignedComma(&s);
ret->repMatch = sqlUnsignedComma(&s);
ret->nCount = sqlUnsignedComma(&s);
ret->qNumInsert = sqlUnsignedComma(&s);
ret->qBaseInsert = sqlSignedComma(&s);
ret->tNumInsert = sqlUnsignedComma(&s);
ret->tBaseInsert = sqlSignedComma(&s);
sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
ret->qName = sqlStringComma(&s);
ret->qSize = sqlUnsignedComma(&s);
ret->qStart = sqlUnsignedComma(&s);
ret->qEnd = sqlUnsignedComma(&s);
ret->tName = sqlStringComma(&s);
ret->tSize = sqlUnsignedComma(&s);
ret->tStart = sqlUnsignedComma(&s);
ret->tEnd = sqlUnsignedComma(&s);
ret->blockCount = sqlUnsignedComma(&s);
s = sqlEatChar(s, '{');
AllocArray(ret->blockSizes, ret->blockCount);
for (i=0; i<ret->blockCount; ++i)
{
ret->blockSizes[i] = sqlUnsignedComma(&s);
}
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
s = sqlEatChar(s, '{');
AllocArray(ret->qStarts, ret->blockCount);
for (i=0; i<ret->blockCount; ++i)
{
ret->qStarts[i] = sqlUnsignedComma(&s);
}
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
s = sqlEatChar(s, '{');
AllocArray(ret->tStarts, ret->blockCount);
for (i=0; i<ret->blockCount; ++i)
{
ret->tStarts[i] = sqlUnsignedComma(&s);
}
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
*pS = s;
return ret;
}
void pslFree(struct psl **pEl)
/* Free a single dynamically allocated psl such as created
* with pslLoad(). */
{
struct psl *el;
if ((el = *pEl) == NULL) return;
freeMem(el->qName);
freeMem(el->tName);
freeMem(el->blockSizes);
freeMem(el->qStarts);
freeMem(el->tStarts);
if (el->qSequence)
{
freeMem(el->qSequence[0]);
freeMem(el->qSequence);
}
if (el->tSequence)
{
freeMem(el->tSequence[0]);
freeMem(el->tSequence);
}
freez(pEl);
}
void pslFreeList(struct psl **pList)
/* Free a list of dynamically allocated psl's */
{
struct psl *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
pslFree(&el);
}
*pList = NULL;
}
void pslOutput(struct psl *el, FILE *f, char sep, char lastSep)
/* Print out psl. Separate fields with sep. Follow last field with lastSep. */
{
int i;
fprintf(f, "%u", el->match);
fputc(sep,f);
fprintf(f, "%u", el->misMatch);
fputc(sep,f);
fprintf(f, "%u", el->repMatch);
fputc(sep,f);
fprintf(f, "%u", el->nCount);
fputc(sep,f);
fprintf(f, "%u", el->qNumInsert);
fputc(sep,f);
fprintf(f, "%d", el->qBaseInsert);
fputc(sep,f);
fprintf(f, "%u", el->tNumInsert);
fputc(sep,f);
fprintf(f, "%d", el->tBaseInsert);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->strand);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->qName);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->qSize);
fputc(sep,f);
fprintf(f, "%u", el->qStart);
fputc(sep,f);
fprintf(f, "%u", el->qEnd);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->tName);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->tSize);
fputc(sep,f);
fprintf(f, "%u", el->tStart);
fputc(sep,f);
fprintf(f, "%u", el->tEnd);
fputc(sep,f);
fprintf(f, "%u", el->blockCount);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->blockCount; ++i)
{
fprintf(f, "%u", el->blockSizes[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->blockCount; ++i)
{
fprintf(f, "%u", el->qStarts[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->blockCount; ++i)
{
fprintf(f, "%u", el->tStarts[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
if (el->qSequence)
{
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->blockCount; ++i)
{
fprintf(f, "%s", el->qSequence[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->blockCount; ++i)
{
fprintf(f, "%s", el->tSequence[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
}
fputc(lastSep,f);
if (ferror(f))
{
perror("Error writing psl file\n");
errAbort("\n");
}
}
/* ----- end autoSql generated part --------------- */
void pslOutputShort(struct psl *el, FILE *f, char sep, char lastSep)
/* Print out psl. Separate fields with sep. Follow last field with lastSep. */
{
fprintf(f, "%u", el->match);
fputc(sep,f);
fprintf(f, "%u", el->misMatch);
fputc(sep,f);
fprintf(f, "%u", el->repMatch);
fputc(sep,f);
fprintf(f, "%u", el->qNumInsert);
fputc(sep,f);
fprintf(f, "%d", el->qBaseInsert);
fputc(sep,f);
fprintf(f, "%u", el->tNumInsert);
fputc(sep,f);
fprintf(f, "%d", el->tBaseInsert);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->strand);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->qName);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->qStart);
fputc(sep,f);
fprintf(f, "%u", abs(el->qEnd - el->qStart));
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->tName);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->tStart);
fputc(sep,f);
fprintf(f, "%u", abs(el->tEnd - el->tStart));
fputc(sep,f);
fprintf(f, "%u", el->blockCount);
fputc(sep,f);
if (sep == ',') fputc('{',f);
fputc(lastSep,f);
if (ferror(f))
{
perror("Error writing psl file\n");
errAbort("\n");
}
}
void pslOutFormat(struct psl *el, FILE *f, char sep, char lastSep)
/* Print out selected psl values. Separate fields with sep. Follow last field with lastSep. */
/* Prints out a better format with bold field headings followed by value */
/* Requires further upstream work to ensure that only the field headers */
/* declared here are printed if replacing an existing psl print function*/
{
const char *headers[] = {"Matches", "Mismatches", "Matches in repeats", "Number of N bases", "Query name", "Size", "Start", "End", "Chromosome", "Strand", "Start", "End"};
char *hformat = "<B>%s:</B> "; /* string for formatted print for headers */
char *uformat = "<B>%s:</B> %u%c"; /* string for formatted print for unsigned variable */
char *targName;
fprintf(f, uformat, headers[0], el->match, sep);
fprintf(f, uformat, headers[1], el->misMatch, sep);
fprintf(f, uformat, headers[2], el->repMatch, sep);
fprintf(f, uformat, headers[3], el->nCount, sep);
fprintf(f, hformat, headers[4]);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->qName);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, uformat, headers[5], el->qSize, sep);
fprintf(f, uformat, headers[6], el->qStart, sep);
fprintf(f, uformat, headers[7], el->qEnd, sep);
fprintf(f, hformat, headers[8]);
if (sep == ',') fputc('"',f);
/* skip leading 'chr' in string to get only chromosome part */
targName = el->tName;
if (startsWith("chr", el->tName))
targName += 3;
fprintf(f, "%s", targName);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, hformat, headers[9]);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->strand);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, uformat, headers[10], el->tStart, sep);
fprintf(f, uformat, headers[11], el->tEnd, sep);
fputc(lastSep,f);
if (ferror(f))
{
perror("Error writing psl file\n");
errAbort("\n");
}
}
struct psl *pslLoadAll(char *fileName)
/* Load all psl's in file. */
{
struct lineFile *lf = pslFileOpen(fileName);
struct psl *pslList = NULL, *psl;
while ((psl = pslNext(lf)) != NULL)
{
slAddHead(&pslList, psl);
}
slReverse(&pslList);
lineFileClose(&lf);
return pslList;
}
int pslCmpQuery(const void *va, const void *vb)
/* Compare to sort based on query start. */
{
const struct psl *a = *((struct psl **)va);
const struct psl *b = *((struct psl **)vb);
int dif;
dif = strcmp(a->qName, b->qName);
if (dif == 0)
dif = a->qStart - b->qStart;
return dif;
}
int pslCmpTarget(const void *va, const void *vb)
/* Compare to sort based on target start. */
{
const struct psl *a = *((struct psl **)va);
const struct psl *b = *((struct psl **)vb);
int dif;
dif = strcmp(a->tName, b->tName);
if (dif == 0)
dif = a->tStart - b->tStart;
return dif;
}
int pslCmpTargetAndStrand(const void *va, const void *vb)
/* Compare to sort based on target, strand, tStart. */
{
const struct psl *a = *((struct psl **)va);
const struct psl *b = *((struct psl **)vb);
int dif;
dif = strcmp(a->tName, b->tName);
if (dif == 0)
dif = strcmp(a->strand, b->strand);
if (dif == 0)
dif = a->tStart - b->tStart;
return dif;
}
int pslCmpScore(const void *va, const void *vb)
/* Compare to sort based on score (descending). */
{
const struct psl *a = *((struct psl **)va);
const struct psl *b = *((struct psl **)vb);
return pslScore(b) - pslScore(a);
}
int pslCmpQueryScore(const void *va, const void *vb)
/* Compare to sort based on query then score (descending). */
{
const struct psl *a = *((struct psl **)va);
const struct psl *b = *((struct psl **)vb);
int diff = strcmp(a->qName, b->qName);
if (diff == 0)
diff = pslScore(b) - pslScore(a);
return diff;
}
int pslCmpMatch(const void *va, const void *vb)
/* Compare to sort based on match */
{
const struct psl *a = *((struct psl **)va);
const struct psl *b = *((struct psl **)vb);
return b->match - a->match;
}
static void pslLabelColumns(FILE *f)
/* Write column info. */
{
fputs("\n"
"match\tmis- \trep. \tN's\tQ gap\tQ gap\tT gap\tT gap\tstrand\tQ \tQ \tQ \tQ \tT \tT \tT \tT \tblock\tblockSizes \tqStarts\t tStarts\n"
" \tmatch\tmatch\t \tcount\tbases\tcount\tbases\t \tname \tsize\tstart\tend\tname \tsize\tstart\tend\tcount\n"
"---------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
f);
}
void pslxWriteHead(FILE *f, enum gfType qType, enum gfType tType)
/* Write header for extended (possibly protein) psl file. */
{
fprintf(f, "psLayout version 4 %s %s\n", gfTypeName(qType), gfTypeName(tType));
pslLabelColumns(f);
}
void pslWriteHead(FILE *f)
/* Write head of psl. */
{
fputs("psLayout version 3\n", f);
pslLabelColumns(f);
}
void pslWriteAll(struct psl *pslList, char *fileName, boolean writeHeader)
/* Write a psl file from list. */
{
FILE *f;
struct psl *psl;
f = mustOpen(fileName, "w");
if (writeHeader)
pslWriteHead(f);
for (psl = pslList; psl != NULL; psl = psl->next)
pslTabOut(psl, f);
fclose(f);
}
void pslxFileOpen(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf)
/* Read header part of psl and make sure it's right. Return
* sequence types and file handle. */
{
char *line;
int lineSize;
char *words[30];
char *version;
int wordCount;
int i;
enum gfType qt = gftRna, tt = gftDna;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
if (!lineFileNext(lf, &line, &lineSize))
warn("%s is empty", fileName);
else
{
if (startsWith("psLayout version", line))
{
wordCount = chopLine(line, words);
if (wordCount < 3)
errAbort("%s is not a psLayout file", fileName);
version = words[2];
if (sameString(version, "3"))
{
}
else if (sameString(version, "4"))
{
qt = gfTypeFromName(words[3]);
tt = gfTypeFromName(words[4]);
}
else
{
errAbort("%s is version %s of psLayout, this program can only handle through version 4",
fileName, version);
}
for (i=0; i<4; ++i)
{
if (!lineFileNext(lf, &line, &lineSize))
errAbort("%s severely truncated", fileName);
}
}
else
{
char *s = cloneString(line);
while (line != NULL && line[0] == '#')
{
freeMem(s);
lineFileNext(lf, &line, &lineSize);
s = cloneString(line);
}
wordCount = chopLine(s, words);
if (wordCount < 21 || wordCount > 23 || (words[8][0] != '+' && words[8][0] != '-'))
errAbort("%s is not a psLayout file", fileName);
else
lineFileReuse(lf);
freeMem(s);
}
}
*retQueryType = qt;
*retTargetType = tt;
*retLf = lf;
}
static void pslxFileOpenWithMetaConfig(char *fileName, bool isMetaUnique, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f)
/* Read header part of psl and make sure it's right. Return
* sequence types and file handle and send meta data to output file f */
{
char *line;
int lineSize;
char *words[30];
char *version;
int wordCount;
int i;
enum gfType qt = gftRna, tt = gftDna;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
lineFileSetMetaDataOutput(lf, f);
if (isMetaUnique)
lineFileSetUniqueMetaData(lf);
if (!lineFileNext(lf, &line, &lineSize))
warn("%s is empty", fileName);
else
{
if (startsWith("psLayout version", line))
{
wordCount = chopLine(line, words);
if (wordCount < 3)
errAbort("%s is not a psLayout file", fileName);
version = words[2];
if (sameString(version, "3"))
{
}
else if (sameString(version, "4"))
{
qt = gfTypeFromName(words[3]);
tt = gfTypeFromName(words[4]);
}
else
{
errAbort("%s is version %s of psLayout, this program can only handle through version 4",
fileName, version);
}
for (i=0; i<4; ++i)
{
if (!lineFileNext(lf, &line, &lineSize))
errAbort("%s severely truncated", fileName);
}
}
else
{
char *s = cloneString(line);
boolean eof = FALSE;
while ((line[0] == '#') && (!eof))
{
freeMem(s);
if (!lineFileNext(lf, &line, &lineSize))
eof = TRUE;
s = cloneString(line);
}
wordCount = chopLine(s, words);
if ((wordCount < 21 || wordCount > 23 || (words[8][0] != '+' && words[8][0] != '-')) && (!eof))
errAbort("%s is not a psLayout file", fileName);
else
lineFileReuse(lf);
freeMem(s);
}
}
*retQueryType = qt;
*retTargetType = tt;
*retLf = lf;
}
void pslxFileOpenWithMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f)
/* Read header part of psl and make sure it's right. Return
* sequence types and file handle and send meta data to output file f */
{
pslxFileOpenWithMetaConfig(fileName, FALSE, retQueryType, retTargetType, retLf, f);
}
void pslxFileOpenWithUniqueMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f)
/* Read header part of psl and make sure it's right. Return
* sequence types and file handle and send only unique meta data to output f */
{
pslxFileOpenWithMetaConfig(fileName, TRUE, retQueryType, retTargetType, retLf, f);
}
struct lineFile *pslFileOpen(char *fileName)
/* Read header part of psl and make sure it's right.
* Return line file handle to it. */
{
enum gfType qt, tt;
struct lineFile *lf;
pslxFileOpen(fileName, &qt, &tt, &lf);
return lf;
}
struct lineFile *pslFileOpenWithMeta(char *fileName, FILE *f)
/* Read header part of psl and make sure it's right.
* Return line file handle to it. */
{
enum gfType qt, tt;
struct lineFile *lf;
pslxFileOpenWithMeta(fileName, &qt, &tt, &lf, f);
return lf;
}
struct lineFile *pslFileOpenWithUniqueMeta(char *fileName, FILE *f)
/* Read header part of psl and make sure it's right.
* Set flag to suppress duplicate header comments.
* Return line file handle to it. */
{
enum gfType qt, tt;
struct lineFile *lf;
pslxFileOpenWithUniqueMeta(fileName, &qt, &tt, &lf, f);
return lf;
}
struct psl *pslNext(struct lineFile *lf)
/* Read next line from file and convert it to psl. Return
* NULL at eof. */
{
char *line;
int lineSize;
char *words[32];
int wordCount;
static int lineAlloc = 0;
static char *chopBuf = NULL;
if (!lineFileNextReal(lf, &line))
{
return NULL;
}
lineSize = strlen(line);
if (lineSize >= lineAlloc)
{
lineAlloc = lineSize+256;
chopBuf = needMoreMem(chopBuf, 0, lineAlloc);
}
memcpy(chopBuf, line, lineSize+1);
wordCount = chopLine(chopBuf, words);
if (wordCount == 21)
{
return pslLoad(words);
}
if (wordCount == 23)
{
return pslxLoad(words);
}
else
{
errAbort("Bad line %d of %s wordCount is %d instead of 21 or 23\n", lf->lineIx, lf->fileName, wordCount);
return NULL;
}
}
struct psl *pslxLoadLm(char **row, struct lm *lm)
/* Load row into local memory pslx. */
{
struct psl *ret = pslLoadLm(row, lm);
ret->qSequence = lmAlloc(lm, sizeof(ret->qSequence[0]) * ret->blockCount);
sqlStringArray(lmCloneString(lm,row[21]),ret->qSequence, ret->blockCount);
ret->tSequence = lmAlloc(lm, sizeof(ret->tSequence[0]) * ret->blockCount);
sqlStringArray(lmCloneString(lm,row[22]),ret->tSequence, ret->blockCount);
return ret;
}
struct psl *pslLoadLm(char **row, struct lm *lm)
/* Load row into local memory psl. */
{
struct psl *ret;
ret = lmAlloc(lm, sizeof(*ret));
ret->blockCount = sqlUnsigned(row[17]);
ret->match = sqlUnsigned(row[0]);
ret->misMatch = sqlUnsigned(row[1]);
ret->repMatch = sqlUnsigned(row[2]);
ret->nCount = sqlUnsigned(row[3]);
ret->qNumInsert = sqlUnsigned(row[4]);
ret->qBaseInsert = sqlSigned(row[5]);
ret->tNumInsert = sqlUnsigned(row[6]);
ret->tBaseInsert = sqlSigned(row[7]);
strcpy(ret->strand, row[8]);
ret->qName = lmCloneString(lm,row[9]);
ret->qSize = sqlUnsigned(row[10]);
ret->qStart = sqlUnsigned(row[11]);
ret->qEnd = sqlUnsigned(row[12]);
ret->tName = lmCloneString(lm, row[13]);
ret->tSize = sqlUnsigned(row[14]);
ret->tStart = sqlUnsigned(row[15]);
ret->tEnd = sqlUnsigned(row[16]);
ret->blockSizes = lmAlloc(lm, sizeof(ret->blockSizes[0]) * ret->blockCount);
sqlUnsignedArray(row[18], ret->blockSizes, ret->blockCount);
ret->qStarts = lmAlloc(lm, sizeof(ret->qStarts[0]) * ret->blockCount);
sqlUnsignedArray(row[19], ret->qStarts, ret->blockCount);
ret->tStarts = lmAlloc(lm, sizeof(ret->tStarts[0]) * ret->blockCount);
sqlUnsignedArray(row[20], ret->tStarts, ret->blockCount);
return ret;
}
boolean pslIsProtein(const struct psl *psl)
/* is psl a protein psl (are it's blockSizes and scores in protein space) */
{
int lastBlock = psl->blockCount - 1;
return (((psl->strand[1] == '+' ) &&
(psl->tEnd == psl->tStarts[lastBlock] + 3*psl->blockSizes[lastBlock])) ||
((psl->strand[1] == '-') &&
(psl->tStart == (psl->tSize-(psl->tStarts[lastBlock] + 3*psl->blockSizes[lastBlock])))));
}
int pslCalcMilliBad(struct psl *psl, boolean isMrna)
/* Calculate badness in parts per thousand. */
{
int sizeMul = pslIsProtein(psl) ? 3 : 1;
int qAliSize, tAliSize, aliSize;
int milliBad = 0;
int sizeDif;
int insertFactor;
int total;
qAliSize = sizeMul * (psl->qEnd - psl->qStart);
tAliSize = psl->tEnd - psl->tStart;
aliSize = min(qAliSize, tAliSize);
if (aliSize <= 0)
return 0;
sizeDif = qAliSize - tAliSize;
if (sizeDif < 0)
{
if (isMrna)
sizeDif = 0;
else
sizeDif = -sizeDif;
}
insertFactor = psl->qNumInsert;
if (!isMrna)
insertFactor += psl->tNumInsert;
total = (sizeMul * (psl->match + psl->repMatch + psl->misMatch));
if (total != 0)
milliBad = (1000 * (psl->misMatch*sizeMul + insertFactor + round(3*log(1+sizeDif)))) / total;
return milliBad;
}
int pslScore(const struct psl *psl)
/* Return score for psl. */
{
int sizeMul = pslIsProtein(psl) ? 3 : 1;
return sizeMul * (psl->match + ( psl->repMatch>>1)) -
sizeMul * psl->misMatch - psl->qNumInsert - psl->tNumInsert;
}
int pslCmpScoreDesc(const void *va, const void *vb)
/* Compare to sort based on score. */
{
const struct psl *a = *((struct psl **)va);
const struct psl *b = *((struct psl **)vb);
return pslScore(b) - pslScore(a);
}
struct ffAli *pslToFakeFfAli(struct psl *psl, DNA *needle, DNA *haystack)
/* Convert from psl to ffAli format. In some cases you can pass NULL
* for needle and haystack - depending what the post-processing is going
* to be. */
{
struct ffAli *ffList = NULL, *ff;
int blockCount = psl->blockCount;
unsigned *blockSizes = psl->blockSizes;
unsigned *qStarts = psl->qStarts;
unsigned *tStarts = psl->tStarts;
int size;
int i;
for (i=0; i<blockCount; ++i)
{
size = blockSizes[i];
AllocVar(ff);
ff->left = ffList;
ffList = ff;
ff->nStart = ff->nEnd = needle + qStarts[i];
ff->nEnd += size;
ff->hStart = ff->hEnd = haystack + tStarts[i];
ff->hEnd += size;
}
ffList = ffMakeRightLinks(ffList);
return ffList;
}
struct psl *pslFromFakeFfAli(struct ffAli *ff,
DNA *needle, DNA *haystack, char strand,
char *qName, int qSize, char *tName, int tSize)
/* This will create a basic psl structure from a sorted series of ffAli
* blocks. The fields that would need actual sequence to be filled in
* are left zero however - fields including match, repMatch, mismatch. */
{
struct psl *psl;
unsigned *blockSizes;
unsigned *qStarts;
unsigned *tStarts;
int blockCount;
int i;
int nStart, hStart;
int nEnd, hEnd;
AllocVar(psl);
psl->blockCount = blockCount = ffAliCount(ff);
psl->blockSizes = AllocArray(blockSizes, blockCount);
psl->qStarts = AllocArray(qStarts, blockCount);
psl->tStarts = AllocArray(tStarts, blockCount);
psl->qName = cloneString(qName);
psl->qSize = qSize;
psl->tName = cloneString(tName);
psl->tSize = tSize;
psl->strand[0] = strand;
for (i=0; i<blockCount; ++i)
{
nStart = ff->nStart - needle;
nEnd = ff->nEnd - needle;
hStart = ff->hStart - haystack;
hEnd = ff->hEnd - haystack;
blockSizes[i] = nEnd - nStart;
qStarts[i] = nStart;
tStarts[i] = hStart;
if (i == 0)
{
psl->qStart = nStart;
psl->tStart = hStart;
}
if (i == blockCount-1)
{
psl->qEnd = nEnd;
psl->tEnd = hEnd;
}
ff = ff->right;
}
if (strand == '-')
{
reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize);
}
return psl;
}
struct ffAli *pslToFfAli(struct psl *psl, struct dnaSeq *query, struct dnaSeq *target,
int targetOffset)
/* Convert from psl to ffAli format. Clip to parts that we actually
* have sequence for. */
{
struct ffAli *ffList = NULL, *ff;
DNA *needle = query->dna;
DNA *haystack = target->dna;
int blockCount = psl->blockCount;
unsigned *blockSizes = psl->blockSizes;
unsigned *qStarts = psl->qStarts;
unsigned *tStarts = psl->tStarts;
int size;
int i;
int tMin = targetOffset;
int tMax = targetOffset + target->size;
int tStart, tEnd;
int clipStart, clipEnd, clipOffset, clipSize;
for (i=0; i<blockCount; ++i)
{
clipStart = tStart = tStarts[i];
size = blockSizes[i];
clipEnd = tEnd = tStart + size;
if (tStart < tMax && tEnd > tMin)
{
if (clipStart < tMin) clipStart = tMin;
if (clipEnd > tMax) clipEnd = tMax;
clipOffset = clipStart - tStart;
clipSize = clipEnd - clipStart;
AllocVar(ff);
ff->left = ffList;
ffList = ff;
ff->nStart = ff->nEnd = needle + qStarts[i] + clipOffset;
ff->nEnd += clipSize;
ff->hStart = ff->hEnd = haystack + clipStart - targetOffset;
ff->hEnd += clipSize;
}
}
ffList = ffMakeRightLinks(ffList);
ffCountGoodEnds(ffList);
return ffList;
}
int pslOrientation(struct psl *psl)
/* Translate psl strand + or - to orientation +1 or -1 */
{
if (psl->strand[1] != '\0')
{
/* translated blat */
if (psl->strand[0] != psl->strand[1])
return -1;
else
return 1;
}
else
{
if (psl->strand[0] == '-')
return -1;
else
return 1;
}
}
int pslWeightedIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset)
/* Return >0 if introns make it look like alignment is on + strand,
* <0 if introns make it look like alignment is on - strand,
* 0 if can't tell. The absolute value of the return indicates
* how many splice sites we've seen supporting the orientation.
* Sequence should NOT be reverse complemented. */
{
int intronDir = 0;
int oneDir;
int i;
DNA *dna = genoSeq->dna;
/* code below doesn't support negative target strand (translated blat) */
if (psl->strand[1] == '-')
errAbort("pslWeightedIntronOrientation doesn't support a negative target strand");
for (i=1; i<psl->blockCount; ++i)
{
int iStart, iEnd, blockSize = psl->blockSizes[i-1];
if (psl->qStarts[i-1] + blockSize == psl->qStarts[i])
{
iStart = psl->tStarts[i-1] + psl->blockSizes[i-1] - offset;
iEnd = psl->tStarts[i] - offset;
oneDir = intronOrientation(dna+iStart, dna+iEnd);
intronDir += oneDir;
}
}
return intronDir;
}
int pslIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset)
/* Return 1 if introns make it look like alignment is on + strand,
* -1 if introns make it look like alignment is on - strand,
* 0 if can't tell.
* Sequence should NOT be reverse complemented. */
{
int intronDir = pslWeightedIntronOrientation(psl, genoSeq, offset);
if (intronDir < 0)
intronDir = -1;
else if (intronDir > 0)
intronDir = 1;
return intronDir;
}
boolean pslHasIntron(struct psl *psl, struct dnaSeq *seq, int seqOffset)
/* Return TRUE if there's a probable intron. Sequence should NOT be
* reverse complemented.*/
{
int blockCount = psl->blockCount, i;
unsigned *tStarts = psl->tStarts;
unsigned *blockSizes = psl->blockSizes;
unsigned *qStarts = psl->qStarts;
int blockSize, start, end;
DNA *dna = seq->dna;
for (i=1; i<blockCount; ++i)
{
blockSize = blockSizes[i-1];
start = qStarts[i-1]+blockSize;
end = qStarts[i];
if (start == end)
{
start = tStarts[i-1] + blockSize;
end = tStarts[i];
if (psl->strand[1] == '-')
reverseIntRange(&start, &end, psl->tSize);
start -= seqOffset;
end -= seqOffset;
if (intronOrientation(dna+start, dna+end) != 0)
return TRUE;
}
}
return FALSE;
}
void pslTailSizes(struct psl *psl, int *retStartTail, int *retEndTail)
/* Find the length of "tails" (rather than extensions) implied by psl. */
{
int orientation = pslOrientation(psl);
int qFloppyStart, qFloppyEnd;
int tFloppyStart, tFloppyEnd;
if (orientation > 0)
{
qFloppyStart = psl->qStart;
qFloppyEnd = psl->qSize - psl->qEnd;
}
else
{
qFloppyStart = psl->qSize - psl->qEnd;
qFloppyEnd = psl->qStart;
}
tFloppyStart = psl->tStart;
tFloppyEnd = psl->tSize - psl->tEnd;
*retStartTail = min(qFloppyStart, tFloppyStart);
*retEndTail = min(qFloppyEnd, tFloppyEnd);
}
static void rcSeqs(char **seqs, unsigned blockCount, unsigned *blockSizes)
/* reverses complement sequences in list, maintain property that all strings
* are in one malloc block. blockSizes should already be reversed. */
{
char *buf, *next;
int i, memSz = 0;
/* get a new memory block for strings */
for (i = 0; i < blockCount; i++)
memSz += blockSizes[i]+1;
next = buf = needLargeMem(memSz);
/* reverse compliment and copy to new memory block */
for (i = blockCount-1; i >= 0; i--)
{
int len = strlen(seqs[i]);
reverseComplement(seqs[i], len);
memcpy(next, seqs[i], len+1);
next += len+1;
}
/* swap memory and update pointers */
freeMem(seqs[0]);
seqs[0] = buf;
next = buf;
for (i = 0; i < blockCount; i++)
{
seqs[i] = next;
next += blockSizes[i]+1;
}
}
void pslRc(struct psl *psl)
/* Reverse-complement a PSL alignment. This makes the target strand explicit. */
{
unsigned tSize = psl->tSize, qSize = psl->qSize;
unsigned blockCount = psl->blockCount, i;
unsigned *tStarts = psl->tStarts, *qStarts = psl->qStarts, *blockSizes = psl->blockSizes;
/* swap strand, forcing target to have an explict strand */
psl->strand[0] = (psl->strand[0] != '-') ? '-' : '+';
psl->strand[1] = (psl->strand[1] != '-') ? '-' : '+';
psl->strand[2] = 0;
for (i=0; i<blockCount; ++i)
{
tStarts[i] = tSize - (tStarts[i] + blockSizes[i]);
qStarts[i] = qSize - (qStarts[i] + blockSizes[i]);
}
reverseUnsigned(tStarts, blockCount);
reverseUnsigned(qStarts, blockCount);
reverseUnsigned(blockSizes, blockCount);
if (psl->qSequence != NULL)
{
rcSeqs(psl->qSequence, blockCount, blockSizes);
rcSeqs(psl->tSequence, blockCount, blockSizes);
}
}
/* macro to swap to variables */
#define swapVars(a, b, tmp) ((tmp) = (a), (a) = (b), (b) = (tmp))
static void swapBlocks(struct psl *psl)
/* Swap the blocks in a psl without reverse complementing them. */
{
int i;
unsigned utmp;
char *stmp;
for (i = 0; i < psl->blockCount; i++)
{
swapVars(psl->qStarts[i], psl->tStarts[i], utmp);
if (psl->qSequence != NULL)
swapVars(psl->qSequence[i], psl->tSequence[i], stmp);
}
}
static void swapRCBlocks(struct psl *psl)
/* Swap and reverse complement blocks in a psl. Other psl fields must
* be modified first */
{
int i;
unsigned *uatmp;
char **satmp;
reverseUnsigned(psl->tStarts, psl->blockCount);
reverseUnsigned(psl->qStarts, psl->blockCount);
reverseUnsigned(psl->blockSizes, psl->blockCount);
swapVars(psl->tStarts, psl->qStarts, uatmp);
/* qSize and tSize have already been swapped */
for (i = 0; i < psl->blockCount; i++)
{
psl->qStarts[i] = psl->qSize - (psl->qStarts[i] + psl->blockSizes[i]);
psl->tStarts[i] = psl->tSize - (psl->tStarts[i] + psl->blockSizes[i]);
}
if (psl->qSequence != NULL)
{
/* note: all block sequences are stored in one malloc block, which is
* entry zero */
rcSeqs(psl->qSequence, psl->blockCount, psl->blockSizes);
rcSeqs(psl->tSequence, psl->blockCount, psl->blockSizes);
swapVars(psl->qSequence, psl->tSequence, satmp);
}
}
void pslSwap(struct psl *psl, boolean noRc)
/* swap query and target in psl. If noRc is TRUE, don't reverse-complement
* PSL if needed, instead make target strand explict. */
{
int itmp;
unsigned utmp;
char ctmp, *stmp;
swapVars(psl->qBaseInsert, psl->tBaseInsert, utmp);
swapVars(psl->tNumInsert, psl->qNumInsert, utmp);
swapVars(psl->qName, psl->tName, stmp);
swapVars(psl->qSize, psl->tSize, utmp);
swapVars(psl->qStart, psl->tStart, itmp);
swapVars(psl->qEnd, psl->tEnd, itmp);
/* handle strand and block copy */
if (psl->strand[1] != '\0')
{
/* translated */
swapVars(psl->strand[0], psl->strand[1], ctmp);
swapBlocks(psl);
}
else if (noRc)
{
/* untranslated with no reverse complement */
psl->strand[1] = psl->strand[0];
psl->strand[0] = '+';
swapBlocks(psl);
}
else
{
/* untranslated */
if (psl->strand[0] == '+')
swapBlocks(psl);
else
swapRCBlocks(psl);
}
}
void pslTargetOffset(struct psl *psl, int offset)
/* Add offset to target positions in psl. */
{
int i, blockCount = psl->blockCount;
unsigned *tStarts = psl->tStarts;
psl->tStart += offset;
psl->tEnd += offset;
for (i=0; i<blockCount; ++i)
tStarts[i] += offset;
}
void pslDump(struct psl *psl, FILE *f)
/* Dump most of PSL to file - for debugging. */
{
int i;
fprintf(f, "<PRE>\n");
fprintf(f, "psl %s:%d-%d %s %s:%d-%d %d\n",
psl->qName, psl->qStart, psl->qEnd, psl->strand,
psl->tName, psl->tStart, psl->tEnd, psl->blockCount);
for (i=0; i<psl->blockCount; ++i)
fprintf(f, " size %d, qStart %d, tStart %d\n",
psl->blockSizes[i], psl->qStarts[i], psl->tStarts[i]);
fprintf(f, "</PRE>");
}
static void pslRecalcBounds(struct psl *psl)
/* Calculate qStart/qEnd tStart/tEnd at top level to be consistent
* with blocks. */
{
int qStart, qEnd, tStart, tEnd, size;
int last = psl->blockCount - 1;
qStart = psl->qStarts[0];
tStart = psl->tStarts[0];
size = psl->blockSizes[last];
qEnd = psl->qStarts[last] + size;
tEnd = psl->tStarts[last] + size;
if (psl->strand[0] == '-')
reverseIntRange(&qStart, &qEnd, psl->qSize);
if (psl->strand[1] == '-')
reverseIntRange(&tStart, &tEnd, psl->tSize);
psl->qStart = qStart;
psl->qEnd = qEnd;
psl->tStart = tStart;
psl->tEnd = tEnd;
}
struct psl *pslTrimToTargetRange(struct psl *oldPsl, int tMin, int tMax)
/* Return psl trimmed to fit inside tMin/tMax. Note this does not
* update the match/misMatch and related fields. */
{
int newSize;
int oldBlockCount = oldPsl->blockCount;
boolean tIsRc = (oldPsl->strand[1] == '-');
int newBlockCount = 0, completeBlockCount = 0;
int i;
struct psl *newPsl = NULL;
int tMn = tMin, tMx = tMax; /* tMin/tMax adjusted for strand. */
/* Deal with case where we're completely trimmed out quickly. */
newSize = rangeIntersection(oldPsl->tStart, oldPsl->tEnd, tMin, tMax);
if (newSize <= 0)
return NULL;
if (tIsRc)
reverseIntRange(&tMn, &tMx, oldPsl->tSize);
/* Count how many blocks will survive trimming. */
oldBlockCount = oldPsl->blockCount;
for (i=0; i<oldBlockCount; ++i)
{
int s = oldPsl->tStarts[i];
int e = s + oldPsl->blockSizes[i];
int sz = e - s;
int overlap;
if ((overlap = rangeIntersection(s, e, tMn, tMx)) > 0)
++newBlockCount;
if (overlap == sz)
++completeBlockCount;
}
if (newBlockCount == 0)
return NULL;
/* Allocate new psl and fill in what we already know. */
AllocVar(newPsl);
strcpy(newPsl->strand, oldPsl->strand);
newPsl->qName = cloneString(oldPsl->qName);
newPsl->qSize = oldPsl->qSize;
newPsl->tName = cloneString(oldPsl->tName);
newPsl->tSize = oldPsl->tSize;
newPsl->blockCount = newBlockCount;
AllocArray(newPsl->blockSizes, newBlockCount);
AllocArray(newPsl->qStarts, newBlockCount);
AllocArray(newPsl->tStarts, newBlockCount);
/* Fill in blockSizes, qStarts, tStarts with real data. */
newBlockCount = completeBlockCount = 0;
for (i=0; i<oldBlockCount; ++i)
{
int oldSz = oldPsl->blockSizes[i];
int sz = oldSz;
int tS = oldPsl->tStarts[i];
int tE = tS + sz;
int qS = oldPsl->qStarts[i];
int qE = qS + sz;
if (rangeIntersection(tS, tE, tMn, tMx) > 0)
{
int diff;
if ((diff = (tMn - tS)) > 0)
{
tS += diff;
qS += diff;
sz -= diff;
}
if ((diff = (tE - tMx)) > 0)
{
tE -= diff;
qE -= diff;
sz -= diff;
}
newPsl->qStarts[newBlockCount] = qS;
newPsl->tStarts[newBlockCount] = tS;
newPsl->blockSizes[newBlockCount] = sz;
++newBlockCount;
if (sz == oldSz)
++completeBlockCount;
}
}
pslRecalcBounds(newPsl);
return newPsl;
}
struct psl *pslTrimToQueryRange(struct psl *oldPsl, int qMin, int qMax)
/* Return psl trimmed to fit inside qMin/qMax. Note this does not
* update the match/misMatch and related fields. */
{
int newSize;
int oldBlockCount = oldPsl->blockCount;
boolean qIsRc = (oldPsl->strand[0] == '-');
int newBlockCount = 0, completeBlockCount = 0;
int i;
struct psl *newPsl = NULL;
int qMn = qMin, qMx = qMax; /* qMin/qMax adjusted for strand. */
/* Deal with case where we're completely trimmed out quickly. */
newSize = rangeIntersection(oldPsl->qStart, oldPsl->qEnd, qMin, qMax);
if (newSize <= 0)
return NULL;
if (qIsRc)
reverseIntRange(&qMn, &qMx, oldPsl->qSize);
/* Count how many blocks will survive trimming. */
oldBlockCount = oldPsl->blockCount;
for (i=0; i<oldBlockCount; ++i)
{
int s = oldPsl->qStarts[i];
int e = s + oldPsl->blockSizes[i];
int sz = e - s;
int overlap;
if ((overlap = rangeIntersection(s, e, qMn, qMx)) > 0)
++newBlockCount;
if (overlap == sz)
++completeBlockCount;
}
if (newBlockCount == 0)
return NULL;
/* Allocate new psl and fill in what we already know. */
AllocVar(newPsl);
strcpy(newPsl->strand, oldPsl->strand);
newPsl->qName = cloneString(oldPsl->qName);
newPsl->qSize = oldPsl->qSize;
newPsl->tName = cloneString(oldPsl->tName);
newPsl->tSize = oldPsl->tSize;
newPsl->blockCount = newBlockCount;
AllocArray(newPsl->blockSizes, newBlockCount);
AllocArray(newPsl->qStarts, newBlockCount);
AllocArray(newPsl->tStarts, newBlockCount);
/* Fill in blockSizes, qStarts, tStarts with real data. */
newBlockCount = completeBlockCount = 0;
for (i=0; i<oldBlockCount; ++i)
{
int oldSz = oldPsl->blockSizes[i];
int sz = oldSz;
int qS = oldPsl->qStarts[i];
int qE = qS + sz;
int tS = oldPsl->tStarts[i];
int tE = tS + sz;
if (rangeIntersection(qS, qE, qMn, qMx) > 0)
{
int diff;
if ((diff = (qMn - qS)) > 0)
{
tS += diff;
qS += diff;
sz -= diff;
}
if ((diff = (qE - qMx)) > 0)
{
tE -= diff;
qE -= diff;
sz -= diff;
}
newPsl->qStarts[newBlockCount] = qS;
newPsl->tStarts[newBlockCount] = tS;
newPsl->blockSizes[newBlockCount] = sz;
++newBlockCount;
if (sz == oldSz)
++completeBlockCount;
}
}
pslRecalcBounds(newPsl);
return newPsl;
}
char* pslGetCreateSql(char* table, unsigned options, int tNameIdxLen)
/* Get SQL required to create PSL table. Options is a bit set consisting
* of PSL_TNAMEIX, PSL_WITH_BIN, and PSL_XA_FORMAT. tNameIdxLen is
* the number of characters in target name to index. If greater than
* zero, must specify PSL_TNAMEIX. If zero and PSL_TNAMEIX is specified,
* to will default to 8. */
{
struct dyString *sqlCmd = newDyString(2048);
char *sqlCmdStr;
char binIx[32];
binIx[0] = '\0';
/* check and default tNameIdxLen */
if ((tNameIdxLen > 0) && !(options & PSL_TNAMEIX))
errAbort("pslGetCreateSql: must specify PSL_TNAMEIX with tNameIdxLen > 0");
if ((options & PSL_TNAMEIX) && (tNameIdxLen == 0))
tNameIdxLen = 8;
/* setup tName and bin index fields */
if (options & PSL_WITH_BIN)
{
if (options & PSL_TNAMEIX)
safef(binIx, sizeof(binIx), "INDEX(tName(%d),bin),\n", tNameIdxLen);
else
safef(binIx, sizeof(binIx), "INDEX(bin),\n");
}
dyStringPrintf(sqlCmd, createString, table,
((options & PSL_WITH_BIN) ? "bin smallint unsigned not null,\n" : ""));
if (options & PSL_XA_FORMAT)
{
dyStringPrintf(sqlCmd, "qSeq longblob not null,\n");
dyStringPrintf(sqlCmd, "tSeq longblob not null,\n");
}
dyStringPrintf(sqlCmd, indexString, binIx);
sqlCmdStr = cloneString(sqlCmd->string);
dyStringFree(&sqlCmd);
return sqlCmdStr;
}
static void printPslDesc(char* pslDesc, FILE* out, struct psl* psl)
/* print description of a PSL on first error */
{
fprintf(out, "Error: invalid PSL: %s:%u-%u %s:%u-%u %s %s\n",
psl->qName, psl->qStart, psl->qEnd,
psl->tName, psl->tStart, psl->tEnd,
psl->strand, pslDesc);
}
static void chkRanges(char* pslDesc, FILE* out, struct psl* psl,
char* pName, char* pLabel, char pCLabel, char pStrand,
unsigned pSize, unsigned pStart, unsigned pEnd,
unsigned blockCount, unsigned* blockSizes,
unsigned* pBlockStarts, int* errCountPtr)
/* check the target or query ranges in a PSL, increment errorCnt */
{
int errCount = *errCountPtr;
unsigned iBlk, prevBlkEnd = 0;
if (pStart >= pEnd)
{
if (errCount == 0)
printPslDesc(pslDesc, out, psl);
fprintf(out, "\t%s %cStart %u >= %cEnd %u\n",
pName, pCLabel, pStart, pCLabel, pEnd);
errCount++;
}
if (pEnd > pSize)
{
if (errCount == 0)
printPslDesc(pslDesc, out, psl);
fprintf(out, "\t%s %cEnd %u >= %cSize %u\n",
pName, pCLabel, pEnd, pCLabel, pSize);
errCount++;
}
for (iBlk = 0; iBlk < blockCount; iBlk++)
{
unsigned blkStart = pBlockStarts[iBlk];
unsigned blkEnd = blkStart+blockSizes[iBlk];
/* translate stand to genomic coords */
unsigned gBlkStart = (pStrand == '+') ? blkStart : (pSize - blkEnd);
unsigned gBlkEnd = (pStrand == '+') ? blkEnd : (pSize - blkStart);
if ((pSize > 0) && (blkEnd > pSize))
{
if (errCount == 0)
printPslDesc(pslDesc, out, psl);
fprintf(out, "\t%s %s block %u end %u > %cSize %u\n",
pName, pLabel, iBlk, blkEnd, pCLabel, pSize);
errCount++;
}
if (gBlkStart < pStart)
{
if (errCount == 0)
printPslDesc(pslDesc, out, psl);
fprintf(out, "\t%s %s block %u start %u < %cStart %u\n",
pName, pLabel, iBlk, gBlkStart, pCLabel, pStart);
errCount++;
}
if (gBlkStart >= pEnd)
{
if (errCount == 0)
printPslDesc(pslDesc, out, psl);
fprintf(out, "\t%s %s block %u start %u >= %cEnd %u\n",
pName, pLabel, iBlk, gBlkStart, pCLabel, pEnd);
errCount++;
}
if (gBlkEnd < pStart)
{
if (errCount == 0)
printPslDesc(pslDesc, out, psl);
fprintf(out, "\t%s %s block %u end %u < %cStart %u\n",
pName, pLabel, iBlk, gBlkEnd, pCLabel, pStart);
errCount++;
}
if (gBlkEnd > pEnd)
{
if (errCount == 0)
printPslDesc(pslDesc, out, psl);
fprintf(out, "\t%s %s block %u end %u > %cEnd %u\n",
pName, pLabel, iBlk, gBlkEnd, pCLabel, pEnd);
errCount++;
}
if ((iBlk > 0) && (blkStart < prevBlkEnd))
{
if (errCount == 0)
printPslDesc(pslDesc, out, psl);
fprintf(out, "\t%s %s block %u start %u < previous block end %u\n",
pName, pLabel, iBlk, blkStart, prevBlkEnd);
errCount++;
}
prevBlkEnd = blkEnd;
}
*errCountPtr = errCount;
}
int pslCheck(char *pslDesc, FILE* out, struct psl* psl)
/* Validate a PSL for consistency. pslDesc is printed the error messages
* to file out (open /dev/null to discard). Return count of errors. */
{
static char* VALID_STRANDS[] = {
"+", "-", "++", "+-", "-+", "--", NULL
};
int i, errCount = 0;
char strand;
boolean isProt = FALSE;
/* check strand value */
for (i = 0; VALID_STRANDS[i] != NULL; i++)
{
if (strcmp(psl->strand, VALID_STRANDS[i]) == 0)
break;
}
if (VALID_STRANDS[i] == NULL)
{
if (errCount == 0)
printPslDesc(pslDesc, out, psl);
fprintf(out, "\tinvalid PSL strand: \"%s\"\n", psl->strand);
errCount++;
}
/* check target */
if (pslIsProtein(psl))
{
isProt = TRUE;
for (i = 0; i < psl->blockCount ; i++)
psl->blockSizes[i] *= 3;
}
strand = ((psl->strand[1] == '\0') ? '+' : psl->strand[1]);
chkRanges(pslDesc, out, psl, psl->tName, "target", 't',
strand, psl->tSize, psl->tStart, psl->tEnd,
psl->blockCount, psl->blockSizes, psl->tStarts,
&errCount);
if (isProt)
{
for (i = 0; i < psl->blockCount ; i++)
psl->blockSizes[i] /= 3;
}
/* check query */
strand = psl->strand[0];
chkRanges(pslDesc, out, psl, psl->qName, "query", 'q',
strand, psl->qSize, psl->qStart, psl->qEnd,
psl->blockCount, psl->blockSizes, psl->qStarts,
&errCount);
return errCount;
}
struct hash *readPslToBinKeeper(char *sizeFileName, char *pslFileName)
/* read a list of psls and return results in hash of binKeeper structure for fast query*/
{
struct binKeeper *bk;
struct psl *psl;
struct lineFile *sf = lineFileOpen(sizeFileName, TRUE);
struct lineFile *pf = lineFileOpen(pslFileName , TRUE);
struct hash *hash = newHash(0);
char *chromRow[2];
char *row[21] ;
while (lineFileRow(sf, chromRow))
{
char *name = chromRow[0];
int size = lineFileNeedNum(sf, chromRow, 1);
if (hashLookup(hash, name) != NULL)
warn("Duplicate %s, ignoring all but first\n", name);
else
{
bk = binKeeperNew(0, size);
assert(size > 1);
hashAdd(hash, name, bk);
}
}
while (lineFileNextRow(pf, row, ArraySize(row)))
{
psl = pslLoad(row);
bk = hashMustFindVal(hash, psl->tName);
binKeeperAdd(bk, psl->tStart, psl->tEnd, psl);
}
lineFileClose(&pf);
lineFileClose(&sf);
return hash;
}
-static int countInitialChars(char *s, char c)
-/* Count number of initial chars in s that match c. */
+static boolean isDelChar(char c)
+/* is this a indel character? */
{
-int count = 0;
-char d;
-while ((d = *s++) != 0)
- {
- if (c == d)
- ++count;
- else
- break;
- }
-return count;
+return (c == '-') || (c == '.') || (c == '=') || (c == '_');
}
-static int countTerminalChars(char *s, char c)
-/* Count number of terminal chars in s that match c. */
-{
-int len = strlen(s), i;
-int count = 0;
-for (i=len-1; i>=0; --i)
- {
- if (c == s[i])
- ++count;
- else
- break;
- }
-return count;
-}
-
-static int countNonInsert(char *s, int size)
-/* Count number of characters in initial part of s that
- * are not '-'. */
-{
-int count = 0;
-int i;
-for (i=0; i<size; ++i)
- if (*s++ != '-')
- ++count;
-return count;
-}
-
-static void trimAlignment(struct psl* psl, char** qStringPtr, char** tStringPtr,
+static void trimAlignment(struct psl* psl, char** qStrPtr, char** tStrPtr,
int* aliSizePtr)
/* remove leading or trailing indels from alignment */
{
-char* qString = *qStringPtr;
-char* tString = *tStringPtr;
+char* qStr = *qStrPtr;
+char* tStr = *tStrPtr;
int aliSize = *aliSizePtr;
-int qStartInsert = countInitialChars(qString, '-');
-int tStartInsert = countInitialChars(tString, '-');
-int qEndInsert = countTerminalChars(qString, '-');
-int tEndInsert = countTerminalChars(tString, '-');
-int startInsert = max(qStartInsert, tStartInsert);
-int endInsert = max(qEndInsert, tEndInsert);
-int qNonCount, tNonCount;
-
-if (startInsert > 0)
- {
- qNonCount = countNonInsert(qString, startInsert);
- tNonCount = countNonInsert(tString, startInsert);
- qString += startInsert;
- tString += startInsert;
- aliSize -= startInsert;
- psl->qStart += qNonCount;
- psl->tStart += tNonCount;
- }
-if (endInsert > 0)
- {
- aliSize -= endInsert;
- qNonCount = countNonInsert(qString+aliSize, endInsert);
- tNonCount = countNonInsert(tString+aliSize, endInsert);
- qString[aliSize] = 0;
- tString[aliSize] = 0;
- psl->qEnd -= qNonCount;
- psl->tEnd -= tNonCount;
+
+// skip lending indels
+while ((aliSize > 0) && (isDelChar(*qStr) || isDelChar(*tStr)))
+ {
+ if (!isDelChar(*qStr))
+ psl->qStart++;
+ else if (!isDelChar(*tStr))
+ psl->tStart++;
+ qStr++;
+ tStr++;
+ aliSize--;
+ }
+
+// skip trailing indels
+while ((aliSize > 0) && (isDelChar(qStr[aliSize-1]) || isDelChar(tStr[aliSize-1])))
+ {
+ if (!isDelChar(qStr[aliSize-1]))
+ psl->qEnd--;
+ else if (!isDelChar(tStr[aliSize-1]))
+ psl->tEnd--;
+ aliSize--;
}
-*qStringPtr = qString;
-*tStringPtr = tString;
+*qStrPtr = qStr;
+*tStrPtr = tStr;
*aliSizePtr = aliSize;
-/* recursive call to handle double gap */
-if (startInsert > 0 || endInsert > 0)
- trimAlignment(psl, qStringPtr, tStringPtr, aliSizePtr);
}
static void addBlock(struct psl* psl, int qs, int qe, int ts, int te,
int *blockSpace)
/* add a block to the psl, growing if necessary */
{
assert((qe-qs) == (te-ts)); /* same lengths? */
if (psl->blockCount == *blockSpace)
pslGrow(psl, blockSpace);
psl->blockSizes[psl->blockCount] = qe - qs;
psl->qStarts[psl->blockCount] = qs;
psl->tStarts[psl->blockCount] = ts;
psl->blockCount++;
}
static void accumCounts(struct psl *psl, char prevQ, char prevT,
char q, char t, unsigned options)
/* accumulate block and base counts */
{
-if ((q != '-') && (t != '-'))
+if (!isDelChar(q) && !isDelChar(t))
{
/* aligned column. */
char qu = toupper(q);
char tu = toupper(t);
if ((q == 'N') || (t == 'N'))
psl->nCount++;
else if (qu == tu)
{
if ((options & PSL_IS_SOFTMASK) && ((qu != q) || (tu != t)))
psl->repMatch++;
else
psl->match++;
}
else
psl->misMatch++;
}
-else if ((q == '-') && (t != '-'))
+else if (isDelChar(q) && !isDelChar(t))
{
/* target insert */
psl->tBaseInsert++;
- if (prevQ != '-')
+ if (!isDelChar(prevQ))
psl->tNumInsert++;
}
-else if ((t == '-') && (q != '-'))
+else if (isDelChar(t) && !isDelChar(q))
{
/* query insert */
psl->qBaseInsert++;
- if (prevT != '-')
+ if (!isDelChar(prevT))
psl->qNumInsert++;
}
}
struct psl* pslFromAlign(char *qName, int qSize, int qStart, int qEnd, char *qString,
char *tName, int tSize, int tStart, int tEnd, char *tString,
char* strand, unsigned options)
/* Create a PSL from an alignment. Options PSL_IS_SOFTMASK if lower case
* bases indicate repeat masking. Returns NULL if alignment is empty after
* triming leading and trailing indels.*/
{
/* Note, the unit tests for these programs exercise this function:
* hg/embossToPsl
* hg/mouseStuff/axtToPsl
* hg/spideyToPsl
* utils/est2genomeToPsl
*/
int blockSpace = 16;
struct psl* psl = NULL;
int aliSize = strlen(qString);
boolean eitherInsert = FALSE; /* True if either in insert state. */
int i, qs,qe,ts,te;
char prevQ = '\0', prevT = '\0';
AllocVar(psl);
if (strlen(tString) != aliSize)
errAbort("query and target alignment strings are different lengths");
psl = pslNew(qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd,
strand, blockSpace, 0);
trimAlignment(psl, &qString, &tString, &aliSize);
/* Don't create if either query or target is zero length after trim */
if ((psl->qStart == psl->qEnd) || (psl->tStart == psl->tEnd))
{
pslFree(&psl);
return NULL;
}
/* Get range of alignment in strand-specified coordinates */
qs = psl->qStart;
qe = psl->qEnd;
if (strand[0] == '-')
reverseIntRange(&qs, &qe, psl->qSize);
ts = psl->tStart;
te = psl->tEnd;
if (strand[1] == '-')
reverseIntRange(&ts, &te, psl->tSize);
eitherInsert = FALSE;
qe = qs; /* current block coords */
te = ts;
for (i=0; i<aliSize; ++i)
{
char q = qString[i];
char t = tString[i];
- if ((q == '-') && (t == '-'))
+ if (isDelChar(q) && isDelChar(t))
{
continue; /* nothing in this column, just ignore it */
}
- else if ((q == '-') || (t == '-'))
+ else if (isDelChar(q) || isDelChar(t))
{
/* insert in one of the columns */
if (!eitherInsert)
{
/* end of a block */
addBlock(psl, qs, qe, ts, te, &blockSpace);
eitherInsert = TRUE;
}
- if (q != '-')
+ if (!isDelChar(q))
qe += 1;
- if (t != '-')
+ if (!isDelChar(t))
te += 1;
}
else
{
/* columns aligned */
if (eitherInsert)
{
/* start new block */
qs = qe;
ts = te;
eitherInsert = FALSE;
}
qe += 1;
te += 1;
}
accumCounts(psl, prevQ, prevT, q, t, options);
prevQ = q; /* will not include skipped empty columns */
prevT = t;
}
addBlock(psl, qs, qe, ts, te, &blockSpace);
return psl;
}
struct psl* pslNew(char *qName, unsigned qSize, int qStart, int qEnd,
char *tName, unsigned tSize, int tStart, int tEnd,
char *strand, unsigned blockSpace, unsigned opts)
/* create a new psl with space for the specified number of blocks allocated.
* pslGrow maybe used to expand this space if needed. Valid options are
* PSL_XA_FORMAT. */
{
struct psl *psl;
AllocVar(psl);
assert(blockSpace > 0);
psl->qName = cloneString(qName);
psl->qSize = qSize;
psl->qStart = qStart;
psl->qEnd = qEnd;
psl->tName = cloneString(tName);
psl->tSize = tSize;
psl->tStart = tStart;
psl->tEnd = tEnd;
strncpy(psl->strand, strand, 2);
AllocArray(psl->blockSizes, blockSpace);
AllocArray(psl->qStarts, blockSpace);
AllocArray(psl->tStarts, blockSpace);
if (opts & PSL_XA_FORMAT)
{
AllocArray(psl->qSequence, blockSpace);
AllocArray(psl->tSequence, blockSpace);
}
return psl;
}
void pslGrow(struct psl *psl, int *blockSpacePtr)
/* Increase memory allocated to a psl to hold more blocks. blockSpacePtr
* should point the the current maximum number of blocks and will be
* updated to with the new amount of space. */
{
int blockSpace = *blockSpacePtr;
int newSpace = 2 * blockSpace;
ExpandArray(psl->blockSizes, blockSpace, newSpace);
ExpandArray(psl->qStarts, blockSpace, newSpace);
ExpandArray(psl->tStarts, blockSpace, newSpace);
if (psl->qSequence != NULL)
{
ExpandArray(psl->qSequence, blockSpace, newSpace);
ExpandArray(psl->tSequence, blockSpace, newSpace);
}
*blockSpacePtr = newSpace;
}
int pslRangeTreeOverlap(struct psl *psl, struct rbTree *rangeTree)
/* Return amount that psl overlaps (on target side) with rangeTree. */
{
int i;
int overlap = 0;
boolean isRc = (psl->strand[1] == '-');
for (i=0; i<psl->blockCount; ++i)
{
int start = psl->tStarts[i];
int end = start + psl->blockSizes[i];
if (isRc)
reverseIntRange(&start, &end, psl->tSize);
overlap += rangeTreeOverlapSize(rangeTree, start, end);
}
return overlap;
}
float pslIdent(struct psl *psl)
/* computer fraction identity */
{
float aligned = psl->match + psl->misMatch + psl->repMatch;
if (aligned == 0.0)
return 0.0;
else
return ((float)(psl->match + psl->repMatch))/aligned;
}
float pslQueryAligned(struct psl *psl)
/* compute fraction of query that was aligned */
{
float aligned = psl->match + psl->misMatch + psl->repMatch;
return aligned/(float)psl->qSize;
}