87160c4c6abc63ea1732458d5b58ca8bd420528c
markd
  Wed Aug 30 08:53:24 2017 -0700
added option to pslCheck to not check insert sizes

diff --git src/inc/psl.h src/inc/psl.h
index 5504b31..eac4fb2 100644
--- src/inc/psl.h
+++ src/inc/psl.h
@@ -1,352 +1,363 @@
 /* psl.h was originally generated by the autoSql program, which also 
  * generated psl.c and psl.sql.  This header links the database and 
  * the RAM representation of objects.   Additional functions were
  * added later. 
  *
  * This file is copyright 2002 Jim Kent, but license is hereby
  * granted for all use - public, private or commercial. */
 
 #ifndef PSL_H
 #define PSL_H
 
 #ifndef LOCALMEM_H
 #include "localmem.h"
 #endif 
 
 #ifndef LINEFILE_H
 #include "linefile.h"
 #endif
 
 #ifndef FUZZYFIND_H
 #include "fuzzyFind.h"
 #endif
 
 #ifndef DNASEQ_H
 #include "dnaseq.h"
 #endif
 
 /* Some forward declarations of structures used but not defined here. */
 struct rbTree;
 
 #define PSL_NUM_COLS  21  /* number of columns in a PSL */
 #define PSLX_NUM_COLS 23  /* number of columns in a PSLX */
 
 #define PSL_XA_FORMAT 0x04  /* add XA format columns */
 
 /* options for pslFromAlign */
 #define PSL_IS_SOFTMASK 0x01 /* lower case are mask */
 
+/* options for pslCheck */
+#define PSL_CHECK_IGNORE_INSERT_CNTS 0x01 /* Don't check insert counts in psl */
+
 struct psl
 /* Summary info about a patSpace alignment */
     {
     struct psl *next;  /* Next in singly linked list. */
     unsigned match;	/* Number of bases that match that aren't repeats */
     unsigned misMatch;	/* Number of bases that don't match */
     unsigned repMatch;	/* Number of bases that match but are part of repeats */
     unsigned nCount;	/* Number of 'N' bases */
     unsigned qNumInsert;	/* Number of inserts in query */
     int qBaseInsert;	/* Number of bases inserted in query */
     unsigned tNumInsert;	/* Number of inserts in target */
     int tBaseInsert;	/* Number of bases inserted in target */
     char strand[3];	/* + or - for strand */
     char *qName;	/* Query sequence name */
     unsigned qSize;	/* Query sequence size */
     int qStart;	/* Alignment start position in query */
     int qEnd;	/* Alignment end position in query */
     char *tName;	/* Target sequence name */
     unsigned tSize;	/* Target sequence size */
     int tStart;	/* Alignment start position in target */
     int tEnd;	/* Alignment end position in target */
     unsigned blockCount;	/* Number of blocks in alignment */
     unsigned *blockSizes;	/* Size of each block */
     unsigned *qStarts;	/* Start of each block in query. */
     unsigned *tStarts;	/* Start of each block in target. */
 
     char **qSequence;  /* query sequence for each block */
     char **tSequence;  /* target sequence for each block */
     };
 
 struct psl *pslxLoad(char **row);
 /* Load a pslx from row fetched with select * from psl
  * from database.  Dispose of this with pslFree(). */
 
 struct psl *pslLoad(char **row);
 /* Load a psl from row fetched with select * from psl
  * from database.  Dispose of this with pslFree(). */
 
 struct psl *pslCommaIn(char **pS, struct psl *ret);
 /* Create a psl out of a comma separated string. 
  * This will fill in ret if non-null, otherwise will
  * return a new psl */
 
 void pslFree(struct psl **pEl);
 /* Free a single dynamically allocated psl such as created
  * with pslLoad(). */
 
 void pslFreeList(struct psl **pList);
 /* Free a list of dynamically allocated psl's */
 
 void pslOutput(struct psl *el, FILE *f, char sep, char lastSep);
 /* Print out psl.  Separate fields with sep. Follow last field with lastSep. */
 
 #define pslTabOut(el,f) pslOutput(el,f,'\t','\n')
 /* Print out psl as a line in a tab-separated file. */
 
 #define pslCommaOut(el,f) pslOutput(el,f,',',',')
 /* Print out psl as a comma separated list including final comma. */
 
 /* ----- end autoSql generated part --------------- */
 
 void pslOutFormat(struct psl *el, FILE *f, char sep, char lastSep);
 /* Print out selected psl values.  Separate fields with sep. Follow last field with lastSep. */
 /* Prints out a better format with bold field headings followed by value */
 /* Requires further upstream work to ensure that only the field headers */
 /* declared here are printed if replacing an existing psl print function*/
 
 struct psl *pslLoadAll(char *fileName);
 /* Load all psl's in file. */
 
 struct psl *pslNext(struct lineFile *lf);
 /* Read next line from file and convert it to psl.  Return
  * NULL at eof. */
 
 struct psl *pslxLoadLm(char **row, struct lm *lm);
 /* Load row into local memory pslx. */
 
 struct psl *pslLoadLm(char **row, struct lm *lm);
 /* Load row into local memory psl. */
 
 void pslWriteHead(FILE *f);
 /* Write head of psl. */
 
 void pslxWriteHead(FILE *f, enum gfType qType, enum gfType tType);
 /* Write head of pslx (extended psl). */
 
 void pslWriteAll(struct psl *pslList, char *fileName, boolean writeHeader);
 /* Write a psl file from list. */
 
 struct lineFile *pslFileOpen(char *fileName);
 /* Read header part of psl and make sure it's right. 
  * Return line file handle to it. */
 
 struct lineFile *pslFileOpenWithMeta(char *fileName, FILE *f);
 /* Read header part of psl and make sure it's right. 
  * Return line file handle to it and send meta data to output file f */
 
 struct lineFile *pslFileOpenWithUniqueMeta(char *fileName, FILE *f);
 /* Read header part of psl and make sure it's right. 
 * Set flag to suppress duplicate header comments.
 * Return line file handle to it. */
 
 void pslxFileOpen(char *fileName, enum gfType *retQueryType, 
 	enum gfType *retTargetType, struct lineFile **retLf);
 /* Read header part of psl and make sure it's right.  Return
  * sequence types and file handle. */
 
 void pslxFileOpenWithMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f);
 /* Read header part of psl and make sure it's right.  Return
  * sequence types and file handle and send meta data to output file f */
 
 void pslxFileOpenWithUniqueMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f);
 /* Read header part of psl and make sure it's right.  Return
 * sequence types and file handle and send only unique meta data to output f */
 
 int pslCmpQuery(const void *va, const void *vb);
 /* Compare to sort based on query. */
 
 int pslCmpTarget(const void *va, const void *vb);
 /* Compare to sort based on target. */
 
 int pslCmpTargetStart(const void *va, const void *vb);
 /* Compare to sort based on target start. */
 
 int pslCmpTargetScore(const void *va, const void *vb);
 /* Compare to sort based on target then score. */
 
 int pslCmpTargetAndStrand(const void *va, const void *vb);
 /* Compare to sort based on target, strand,  tStart. */
 
 int pslCmpScore(const void *va, const void *vb);
 /* Compare to sort based on score (descending). */
 
 int pslCmpQueryScore(const void *va, const void *vb);
 /* Compare to sort based on query then score (descending). */
 
 int pslCalcMilliBad(struct psl *psl, boolean isMrna);
 /* Calculate badness in parts per thousand. */
 
 int pslCmpScoreDesc(const void *va, const void *vb);
 /* Compare to sort based on score descending. */
 
 int pslCmpMatch(const void *va, const void *vb);
 /* Compare to sort based on match. */
 
 int pslScore(const struct psl *psl);
 /* Return score for psl. */
 
 struct ffAli *pslToFfAli(struct psl *psl, struct dnaSeq *query, struct dnaSeq *target,
 	int targetOffset);
 /* Convert from psl to ffAli format. */
 
 struct ffAli *pslToFakeFfAli(struct psl *psl, DNA *needle, DNA *haystack);
 /* Convert from psl to ffAli format.  In some cases you can pass NULL
  * for needle and haystack - depending what the post-processing is going
  * to be. */
 
 struct psl *pslFromFakeFfAli(struct ffAli *ff, 
 	DNA *needle, DNA *haystack, char strand,
 	char *qName, int qSize, char *tName, int tSize);
 /* This will create a basic psl structure from a sorted series of ffAli
  * blocks.  The fields that would need actual sequence to be filled in
  * are left zero however - fields including match, repMatch, mismatch. */
 
 int pslOrientation(struct psl *psl);
 /* Translate psl strand + or - to orientation +1 or -1 */
 
 INLINE char pslQStrand(struct psl *psl)
 /* Get query strand. */
 {
 return psl->strand[0];
 }
 
 INLINE char pslTStrand(struct psl *psl)
 /* Get the target strand., Returns implied + when
  * it's not specific  */
 {
 return (psl->strand[1] != '-') ? '+' : '-';
 }
 
 int pslWeightedIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset);
 /* Return >0 if introns make it look like alignment is on + strand,
  *        <0 if introns make it look like alignment is on - strand,
  *        0 if can't tell.  The absolute value of the return indicates
  * how many splice sites we've seen supporting the orientation.
  * Sequence should NOT be reverse complemented.  */
 
 int pslIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset);
 /* Return 1 if introns make it look like alignment is on + strand,
  *       -1 if introns make it look like alignment is on - strand,
  *        0 if can't tell.
  * Sequence should NOT be reverse complemented.  */
 
 boolean pslHasIntron(struct psl *psl, struct dnaSeq *seq, int seqOffset);
 /* Return TRUE if there's a probable intron. Sequence should NOT be
  * reverse complemented. */
 
 void pslTailSizes(struct psl *psl, int *retStartTail, int *retEndTail);
 /* Find the length of "tails" (rather than extensions) implied by psl. */
 
 void pslRc(struct psl *psl);
 /* Reverse-complement a PSL alignment.  This makes the target strand explicit. */
 
 void pslSwap(struct psl *psl, boolean noRc);
 /* swap query and target in psl.  If noRc is TRUE, don't reverse-complement
  * PSL if needed, instead make target strand explict. */
 
 void pslTargetOffset(struct psl *psl, int offset);
 /* Add offset to target positions in psl. */
 
 void pslDump(struct psl *psl, FILE *f);
 /* Dump most of PSL to file - for debugging. */
 
 struct psl *pslTrimToTargetRange(struct psl *oldPsl, int tMin, int tMax);
 /* Return psl trimmed to fit inside tMin/tMax.  Note this does not
  * update the match/misMatch and related fields. */
 
 struct psl *pslTrimToQueryRange(struct psl *oldPsl, int qMin, int qMax);
 /* Return psl trimmed to fit inside qMin/qMax.  Note this does not
  * update the match/misMatch and related fields. */
 
 int pslCheck(char *pslDesc, FILE* out, struct psl* psl);
 /* Validate a PSL for consistency.  pslDesc is printed the error messages
  * to file out (open /dev/null to discard). Return count of errors. */
 
+int pslCheck2(unsigned opts, char *pslDesc, FILE* out, struct psl* psl);
+/* Validate a PSL for consistency.  pslDesc is printed with the error messages
+ * to file out (open /dev/null to discard). Return count of errors.  Option
+ * PSL_CHECK_IGNORE_INSERT_CNTS skips validation of the insert count fields
+ * in each PSL.  Useful because protein PSL doesn't seem to compute these in a
+ * consistent way.
+ */
+
 int pslCountBlocks(struct psl *target, struct psl *query, int maxBlockGap);
 /* count the number of blocks in the query that overlap the target */
 /* merge blocks that are closer than maxBlockGap */
 
 struct hash *readPslToBinKeeper(char *sizeFileName, char *pslFileName);
 /* read a list of psls and return results in hash of binKeeper structure for fast query*/
 
 boolean pslIsProtein(const struct psl *psl);
 /* is psl a protein psl (are it's blockSizes and scores in protein space) */
 
 struct psl* pslFromAlign(char *qName, int qSize, int qStart, int qEnd, char *qString,
                          char *tName, int tSize, int tStart, int tEnd, char *tString,
                          char* strand, unsigned options);
 /* Create a PSL from an alignment.  Options PSL_IS_SOFTMASK if lower case
  * bases indicate repeat masking.  Returns NULL if alignment is empty after
  * triming leading and trailing indels.*/
 
 int pslShowAlignment(struct psl *psl, boolean isProt,
 	char *qName, bioSeq *qSeq, int qStart, int qEnd,
 	char *tName, bioSeq *tSeq, int tStart, int tEnd, FILE *f);
 /* Show protein/DNA alignment or translated DNA alignment in HTML format. */
 
 int pslGenoShowAlignment(struct psl *psl, boolean isProt,
 		      char *qName, bioSeq *qSeq, int qStart, int qEnd,
 		      char *tName, bioSeq *tSeq, int tStart, int tEnd, int exnStarts[], int exnEnds[], int exnCnt, FILE *f);
 /* Show protein/DNA alignment or translated DNA alignment in HTML format. */
 
 struct psl* pslNew(char *qName, unsigned qSize, int qStart, int qEnd,
                    char *tName, unsigned tSize, int tStart, int tEnd,
                    char *strand, unsigned blockSpace, unsigned opts);
 /* create a new psl with space for the specified number of blocks allocated.
  * pslGrow maybe used to expand this space if needed.  Valid options are
  * PSL_XA_FORMAT. */
 
 void pslGrow(struct psl *psl, int *blockSpacePtr);
 /* Increase memory allocated to a psl to hold more blocks.  blockSpacePtr
  * should point the the current maximum number of blocks and will be
  * updated to with the new amount of space. */
 
 void pslComputeInsertCounts(struct psl *psl);
 /* compute numInsert and baseInsert fields from the blocks */
 
 struct psl* pslFromGff3Cigar(char *qName, int qSize, int qStart, int qEnd,
                              char *tName, int tSize, int tStart, int tEnd,
                              char* strand, char *cigar);
 /* create a PSL from a GFF3-style cigar formatted alignment */
 
 int pslRangeTreeOverlap(struct psl *psl, struct rbTree *rangeTree);
 /* Return amount that psl overlaps (on target side) with rangeTree. */
 
 float pslIdent(struct psl *psl);
 /* computer fraction identity */
 
 float pslQueryAligned(struct psl *psl);
 /* compute fraction of query that was aligned */
 
 INLINE unsigned pslQStart(struct psl *psl, int blkIdx)
 /* return query start for the given block */
 {
 return psl->qStarts[blkIdx];
 }
 
 INLINE unsigned pslTStart(struct psl *psl, int blkIdx)
 /* return target start for the given block */
 {
 return psl->tStarts[blkIdx];
 }
 
 INLINE unsigned pslQEnd(struct psl *psl, int blkIdx)
 /* return query end for the given block */
 {
 return psl->qStarts[blkIdx] + psl->blockSizes[blkIdx];
 }
 
 INLINE unsigned pslTEnd(struct psl *psl, int blkIdx)
 /* return target end for the given block */
 {
 return psl->tStarts[blkIdx] + psl->blockSizes[blkIdx];
 }
 
 struct psl* pslClone(struct psl *psl);
 /* clone a psl */
 
 extern char *pslSortList[5];
 
 void pslSortListByVar(struct psl **pslList, char *sort);
 /* Sort a list of psls using the method definied in the sort string. */
 #endif /* PSL_H */