src/hg/mouseStuff/lavToPsl/lavToPsl.c 1.11

1.11 2009/04/19 05:44:11 markd
fix bug with name when lastz input is a 2bit
Index: src/hg/mouseStuff/lavToPsl/lavToPsl.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/mouseStuff/lavToPsl/lavToPsl.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -b -B -U 1000000 -r1.10 -r1.11
--- src/hg/mouseStuff/lavToPsl/lavToPsl.c	20 Jun 2006 16:44:17 -0000	1.10
+++ src/hg/mouseStuff/lavToPsl/lavToPsl.c	19 Apr 2009 05:44:11 -0000	1.11
@@ -1,367 +1,370 @@
 /* lavToPsl - Convert blastz lav to psl format. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "axt.h"
 
 static char const rcsid[] = "$Id$";
 
 /* strand to us for target */
 char* targetStrand = "+";
 boolean bed = FALSE;
 char* lavScoreFile = NULL;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "lavToPsl - Convert blastz lav to psl format\n"
   "usage:\n"
   "   lavToPsl in.lav out.psl\n"
   "options:\n"
   "   -target-strand=c set the target strand to c (default is no strand)\n"
   "   -bed output bed instead of psl\n"
   "   -scoreFile=filename  output lav scores to side file, such that\n"
   "                        each psl line in out.psl is matched by a score line.\n"
   );
 }
 
 struct block
 /* A block of an alignment. */
     {
     struct block *next;
     int qStart, qEnd;	/* Query position. */
     int tStart, tEnd;	/* Target position. */
     int percentId;	/* Percentage identity. */
     int score;
     };
 
 void outputBlocks(struct block *blockList, FILE *f, boolean isRc, 
 	char *qName, int qSize, char *tName, int tSize, boolean psl)
 /* Output block list to file. */
 {
 int qNumInsert = 0, qBaseInsert = 0, tNumInsert = 0, tBaseInsert = 0;
 int matchOne, match = 0, mismatch = 0, bases;
 double scale;
 struct block *lastBlock = NULL;
 int score, blockCount = 0;
 int qTotalStart = BIGNUM, qTotalEnd = 0, tTotalStart = BIGNUM, tTotalEnd = 0;
 struct block *block;
 
 if (blockList == NULL)
     return;
 
 /* Count up overall statistics. */
 for (block = blockList; block != NULL; block = block->next)
     {
     ++blockCount;
     scale = 0.01 * block->percentId;
     bases = block->qEnd - block->qStart;
     matchOne = round(scale*bases);
     match += matchOne;
     mismatch += bases - matchOne;
     if (lastBlock != NULL)
 	{
 	if (block->qStart != lastBlock->qEnd)
 	    {
 	    qNumInsert += 1;
 	    qBaseInsert += block->qStart - lastBlock->qEnd;
 	    }
 	if (block->tStart != lastBlock->tEnd)
 	    {
 	    tNumInsert += 1;
 	    tBaseInsert += block->tStart - lastBlock->tEnd;
 	    }
 	qTotalEnd = block->qEnd;
 	tTotalEnd = block->tEnd;
 	}
     else
 	{
 	qTotalStart = block->qStart;
 	qTotalEnd = block->qEnd;
 	tTotalStart = block->tStart;
 	tTotalEnd = block->tEnd;
 	}
     lastBlock = block;
     }
 
 if (psl)
     {
     fprintf(f, "%d\t%d\t0\t0\t", match, mismatch);
     fprintf(f, "%d\t%d\t%d\t%d\t", qNumInsert, qBaseInsert, tNumInsert, tBaseInsert);
     fprintf(f, "%c%s\t", (isRc ? '-' : '+'), targetStrand);
     /* if query is - strand, convert start/end to genomic */
     if (isRc)
         fprintf(f, "%s\t%d\t%d\t%d\t", qName, qSize,
                 (qSize - qTotalEnd), (qSize - qTotalStart));
     else
         fprintf(f, "%s\t%d\t%d\t%d\t", qName, qSize, qTotalStart, qTotalEnd);
     fprintf(f, "%s\t%d\t%d\t%d\t", tName, tSize, tTotalStart, tTotalEnd);
     fprintf(f, "%d\t", blockCount);
     for (block = blockList; block != NULL; block = block->next)
         fprintf(f, "%d,", block->tEnd - block->tStart);
     fprintf(f, "\t");
     for (block = blockList; block != NULL; block = block->next)
         fprintf(f, "%d,", block->qStart);
     fprintf(f, "\t");
     for (block = blockList; block != NULL; block = block->next)
 	fprintf(f, "%d,", block->tStart);
     fprintf(f, "\n");
     }
 else  /* Output bed line. */
     {
     score = block->score;
     fprintf(f, "%s\t%d\t%d\t%s\t%d\t", tName, tTotalStart, tTotalEnd,
 	qName, score);
     fprintf(f, "%c\t", (isRc ? '-' : '+'));
     fprintf(f, "%d\t%d\t0\t%d\t", tTotalStart, tTotalEnd, blockCount);
     for (block = blockList; block != NULL; block = block->next)
 	fprintf(f, "%d,", block->tEnd - block->tStart);
     fprintf(f, "\t");
     for (block = blockList; block != NULL; block = block->next)
 	fprintf(f, "%d,", block->tStart);
     fprintf(f, "\n");
     }
 }
 
 void unexpectedEof(struct lineFile *lf)
 /* Squawk about unexpected end of file. */
 {
 errAbort("Unexpected end of file in %s", lf->fileName);
 }
 
 void seekEndOfStanza(struct lineFile *lf)
 /* find end of stanza */
 {
 char *line;
 for (;;)
     {
     if (!lineFileNext(lf, &line, NULL))
         unexpectedEof(lf);
     if (line[0] == '}')
         break;
     }
 }
 
 void parseS(struct lineFile *lf, int *tSize, int *qSize)
 /* Parse z stanza and return tSize and qSize */
 {
 char *words[3];
 if (!lineFileRow(lf, words))
     unexpectedEof(lf);
 *tSize = lineFileNeedNum(lf, words, 2);
 if (!lineFileRow(lf, words))
     unexpectedEof(lf);
 *qSize = lineFileNeedNum(lf, words, 2);
 seekEndOfStanza(lf);
 }
 
 void parseD(struct lineFile *lf, char **matrix, char **command, FILE *f)
 /* Parse d stanza and return matrix and blastz command line */
 {
 char *line, *words[64];
 int i, size, wordCount = 0;
 struct axtScoreScheme *ss = NULL;
 freez(matrix);
 freez(command);
 if (!lineFileNext(lf, &line, &size))
    unexpectedEof(lf);
-if (stringIn("blastz",line))
+// check for "blastz" or "lastz"
+if (stringIn("lastz",line))
     {
     stripChar(line,'"');
     wordCount = chopLine(line, words);
     fprintf(f, "##aligner=%s",words[0]);
     for (i = 3 ; i <wordCount ; i++)
         fprintf(f, " %s ",words[i]);
     fprintf(f,"\n");
     ss = axtScoreSchemeReadLf(lf);
     axtScoreSchemeDnaWrite(ss, f, "blastz");
     }
 seekEndOfStanza(lf);
 }
 
 char *needNextWord(struct lineFile *lf, char **pLine)
 /* Get next word from line or die trying. */
 {
 char *word = nextWord(pLine);
 if (word == NULL)
    errAbort("Short line %d of %s\n", lf->lineIx, lf->fileName);
 return word;
 }
 
 char *justChrom(char *s)
 /* Simplify mongo nib file thing in axt. */
 {
 char *e = stringIn(".nib:", s);
 if (e == NULL)
     return s;
 *e = 0;
 e = strrchr(s, '/');
 if (e == NULL)
     return s;
 else
     return e+1;
 }
 
 
 void parseH(struct lineFile *lf,  char **tName, char **qName, boolean *isRc)
 /* Parse out H stanza */
 {
 char *line, *word, *e;
 int i;
 
 
 /* Set return variables to default values. */
 freez(qName);
 freez(tName);
 *isRc = FALSE;
 
 for (i=0; ; ++i)
     {
     if (!lineFileNext(lf, &line, NULL))
        unexpectedEof(lf);
     if (line[0] == '#')
        continue;
     if (line[0] == '}')
        {
        if (i < 2)
 	   errAbort("Short H stanza line %d of %s", lf->lineIx, lf->fileName);
        break;
        }
     word = needNextWord(lf, &line);
-    word += 2;  /* Skip over "> */
+    word++;  /* Skip over `"' and optional `>' */
+    if (*word == '>')
+        word++;
     e = strchr(word, '"');
     if (e != NULL) 
         {
 	*e = 0;
 	if (line != NULL)
 	    ++line;
 	}
     if (i == 0)
         *tName = cloneString(justChrom(word));
     else if (i == 1)
         *qName = cloneString(justChrom(word));
     if ((line != NULL) && (stringIn("(reverse", line) != NULL))
         *isRc = TRUE;
     }
 }
 
 
 struct block *parseA(struct lineFile *lf, FILE* ff)
 /* Parse an alignment stanza into a block list. */
 {
 struct block *block = NULL, *blockList = NULL;
 char *line, *words[6];
 int score = 0;
 int wordCount,lavScore;
 
 while (lineFileNext(lf, &line, NULL))
     {
     if (line[0] == '#')
 	continue;
     if (line[0] == '}')
 	break;
     wordCount = chopLine(line, words);
     if (wordCount == 0)
 	continue;
     if (words[0][0] == 's')
         {
 	lineFileExpectWords(lf, 2, wordCount) ;
 	score  = lineFileNeedNum(lf, words, 1) - 1 ;
         }
     if (words[0][0] == 'l')
 	{
 	AllocVar(block);
 	lineFileExpectWords(lf, 6, wordCount);
         block->score = score;
 	block->tStart = lineFileNeedNum(lf, words, 1) - 1;
 	block->tEnd = lineFileNeedNum(lf, words, 3);
 	block->qStart = lineFileNeedNum(lf, words, 2) - 1;
 	block->qEnd = lineFileNeedNum(lf, words, 4);
 	if (block->qEnd - block->qStart != block->tEnd - block->tStart)
 	    errAbort("Block size mismatch line %d of %s", lf->lineIx, lf->fileName);
 	block->percentId = lineFileNeedNum(lf, words, 5);
 	slAddHead(&blockList, block);
 	}
     if ((ff!=NULL) && (words[0][0] == 's'))
        {
        lavScore = lineFileNeedNum(lf, words, 1);
        fprintf(ff,"%d\n", lavScore);
        }
     }
 slReverse(&blockList);
 return blockList;
 }
 
 void parseIntoPsl(char *lavFile, FILE *f, FILE* ff)
 /* Parse a blastz lav file and put it into something .psl like. */
 {
 struct lineFile *lf = lineFileOpen(lavFile, TRUE);
 char *line;
 struct block *blockList = NULL;
 boolean isRc = FALSE;
 char *tName = NULL, *qName = NULL;
 char *matrix = NULL, *command = NULL;
 int qSize = 0, tSize = 0;
 
 /* Check header. */
 if (!lineFileNext(lf, &line, NULL))
    errAbort("%s is empty", lf->fileName);
 if (!startsWith("#:lav", line))
    errAbort("%s is not a lav file\n", lf->fileName);
 
 while (lineFileNext(lf, &line, NULL))
     {
     if (startsWith("s {", line))
         {
 	parseS(lf, &tSize, &qSize);
 	}
     else if (startsWith("h {", line))
         {
 	parseH(lf, &tName, &qName, &isRc);
 	}
     else if (startsWith("d {", line))
         {
 	parseD(lf, &matrix, &command, f);
 	}
     else if (startsWith("a {", line))
         {
 	blockList = parseA(lf,ff);
         bed = optionExists("bed");
 	outputBlocks(blockList, f, isRc, qName, qSize, tName, tSize, !bed);
 	slFreeList(&blockList);
 	}
     }
 lineFileClose(&lf);
 }
 
 
 void lavToPsl(char *lavIn, char *pslOut)
 /* lavToPsl - Convert blastz lav to psl format. */
 {
 FILE *f = mustOpen(pslOut, "w");
 FILE *ff = NULL;
 
 
 if (lavScoreFile!=NULL)
     ff = mustOpen(lavScoreFile, "w");
 
 parseIntoPsl(lavIn, f, ff);
 carefulClose(&f);
 
 if (lavScoreFile!=NULL)
     carefulClose(&ff);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionHash(&argc, argv);
 if (argc != 3)
     usage();
 targetStrand = optionVal("target-strand", "");
 lavScoreFile = optionVal("scoreFile",lavScoreFile);
 lavToPsl(argv[1], argv[2]);
 return 0;
 }