e11b77d06719918edef238edac8d9b9e320b3076 angie Fri Jan 10 15:21:15 2020 -0800 Cleaning up old experimental util to drop trivial self-aligning blocks from PSL like lavToAxt -dropSelf does. refs #24694 diff --git src/hg/pslDropOverlap/pslDropOverlap.c src/hg/pslDropOverlap/pslDropOverlap.c index a9ac20f..2827395 100644 --- src/hg/pslDropOverlap/pslDropOverlap.c +++ src/hg/pslDropOverlap/pslDropOverlap.c @@ -1,234 +1,115 @@ -/* used for cleaning up self alignments - deletes all overlapping self alignments */ +/* used for cleaning up self alignments - deletes all overlapping self alignment blocks */ #include "common.h" #include "linefile.h" #include "psl.h" -#include "dnautil.h" - void usage() /* Explain usage and exit. */ { errAbort( -"pslDropOverlap - deletes all overlapping self alignments. \n" +"pslDropOverlap - deletes all overlapping (trivial/diagonal) self-alignment blocks. \n" "usage:\n" -" pslDropOverlap in.psl out.psl\n"); -} - -void pslOutRev(struct psl *el, FILE *f) -/* print psl with qStarts in fwd coordinates */ -{ -int i; -char sep = ' '; -char lastSep = '\n'; - -fprintf(f, "%u", el->match); -fputc(sep,f); -fprintf(f, "%u", el->misMatch); -fputc(sep,f); -fprintf(f, "%u", el->repMatch); -fputc(sep,f); -fprintf(f, "%u", el->nCount); -fputc(sep,f); -fprintf(f, "%u", el->qNumInsert); -fputc(sep,f); -fprintf(f, "%d", el->qBaseInsert); -fputc(sep,f); -fprintf(f, "%u", el->tNumInsert); -fputc(sep,f); -fprintf(f, "%d", el->tBaseInsert); -fputc(sep,f); -if (sep == ',') fputc('"',f); -fprintf(f, "%s", el->strand); -if (sep == ',') fputc('"',f); -fputc(sep,f); -if (sep == ',') fputc('"',f); -fprintf(f, "%s", el->qName); -if (sep == ',') fputc('"',f); -fputc(sep,f); -fprintf(f, "%u", el->qSize); -fputc(sep,f); -fprintf(f, "%u", el->qStart); -fputc(sep,f); -fprintf(f, "%u", el->qEnd); -fputc(sep,f); -if (sep == ',') fputc('"',f); -fprintf(f, "%s", el->tName); -if (sep == ',') fputc('"',f); -fputc(sep,f); -fprintf(f, "%u", el->tSize); -fputc(sep,f); -fprintf(f, "%u", el->tStart); -fputc(sep,f); -fprintf(f, "%u", el->tEnd); -fputc(sep,f); -fprintf(f, "%u", el->blockCount); -fputc(sep,f); -if (sep == ',') fputc('{',f); -fputc(lastSep,f); -for (i=0; i<el->blockCount; ++i) - { - fprintf(f, "%u", el->blockSizes[i]); - fputc(',', f); - } -if (sep == ',') fputc('}',f); -fputc(sep,f); -fputc(lastSep,f); -if (sep == ',') fputc('{',f); -for (i=0; i<el->blockCount; ++i) - { - int qs = el->qStarts[i]; - int qe = el->qStarts[i]+el->blockSizes[i]; - if (el->strand[0] == '-') - reverseIntRange(&qs, &qe, el->qSize); - fprintf(f, "%u", qs); - fputc(',', f); - } -if (sep == ',') fputc('}',f); -fputc(sep,f); -fputc(lastSep,f); -if (sep == ',') fputc('{',f); -for (i=0; i<el->blockCount; ++i) - { - fprintf(f, "%u", el->tStarts[i]); - fputc(',', f); - } -if (sep == ',') fputc('}',f); -fputc(lastSep,f); -for (i=0; i<el->blockCount; ++i) - { - int qs = el->qStarts[i]; - int qe = el->qStarts[i]+el->blockSizes[i]; - if (el->strand[0] == '-') - reverseIntRange(&qs, &qe, el->qSize); - fprintf(f, "%u", qe); - fputc(',', f); - } -if (sep == ',') fputc('}',f); -fputc(sep,f); -fputc(lastSep,f); -for (i=0; i<el->blockCount; ++i) - { - fprintf(f, "%u", el->tStarts[i]+el->blockSizes[i]); - fputc(',', f); - } -if (sep == ',') fputc('}',f); -fputc(lastSep,f); -if (ferror(f)) - { - perror("Error writing psl file\n"); - errAbort("\n"); - } +" pslDropOverlap in.psl out.psl\n" +"This discards information in mismatch, repMatch and nCount, lumping all into match.\n" +"(all matching bases are counted in the match column).\n"); } void deleteElement(unsigned *list, int i, int total) /* delete element i from array of size total, move elements down by one to fill gap */ { int j; for (j = i ; j < total ; j++) list[j] = list[j+1]; list[total-1] = 0; } void pslDropOverlap(char *inName, char *outName) -/* Simplify psl - print only select non-tab fields. */ +/* Delete all aligned blocks where target and query are at the same position. */ { -struct lineFile *lf = lineFileOpen(inName, TRUE); +struct lineFile *lf = pslFileOpen(inName); FILE *f = mustOpen(outName, "w"); struct psl *psl; -char *line; -int lineSize; int skipMatch = 0; -int totMatch = 0, totMis = 0, totIns = 0, totRepMatch = 0; +int totMatch = 0; int totSkip = 0; int totLines = 0; +int totBlocks = 0; -if (!lineFileNext(lf, &line, &lineSize)) - errAbort("%s is empty\n", inName); -if (startsWith("psLayout version", line)) - { - int i; - uglyf("Skipping header\n"); - for (i=0; i<4; ++i) - lineFileNext(lf, &line, &lineSize); - } -else - lineFileReuse(lf); while ((psl = pslNext(lf)) != NULL) { totLines++; + totBlocks += psl->blockCount; + psl->match += psl->misMatch + psl->repMatch + psl->nCount; + psl->misMatch = psl->repMatch = psl->nCount = 0; totMatch += psl->match; - totMis += psl->misMatch; - totIns += psl->blockCount-1; - totRepMatch += psl->repMatch; if (sameString(psl->qName, psl->tName)) { + boolean changed = FALSE; int i; int newBlockCount = psl->blockCount; - if (psl->tStart == psl->qStart) - { - pslFree(&psl); - continue; - } for (i = psl->blockCount-1 ; i >= 0 ; i--) { int ts = psl->tStarts[i]; int te = psl->tStarts[i]+psl->blockSizes[i]; int qs = psl->qStarts[i]; int qe = psl->qStarts[i]+psl->blockSizes[i]; if (psl->strand[0] == '-') reverseIntRange(&qs, &qe, psl->qSize); if (psl->strand[1] == '-') reverseIntRange(&ts, &te, psl->tSize); - if (ts == qs) - { + if (rangeIntersection(ts, te, qs, qe) > 0) + { + if ((ts != qs || te != qe) && + ((psl->strand[0] == '+' && psl->strand[1] == '\0') || + (psl->strand[0] == psl->strand[1]))) + verbose(2, "line %d: Blocks[%d] overlap but are not identical: " + "t:%d,%d; q:%d,%d, offset:%d, size:%d", + lf->lineIx, i, ts, te, qs, qe, qs - ts, psl->blockSizes[i]); newBlockCount--; + // In unscored PSL, psl->match could be 0; prevent underflow. + if (psl->match >= psl->blockSizes[i]) psl->match -= psl->blockSizes[i]; - printf( "skip block size %d #%d blk %d %d\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\n", - psl->blockSizes[i],i, psl->blockCount, psl->match, psl->misMatch, psl->repMatch, psl->nCount, + else if (psl->match > 0) + lineFileAbort(lf, "psl->match too small (%d) for block %d size %d", + psl->match, i, psl->blockSizes[i]); + totSkip++; + skipMatch += psl->blockSizes[i]; + verbose(2, "skip block size %d #%d blk %d " + "%d\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\n", + psl->blockSizes[i], i, psl->blockCount, + psl->match, psl->misMatch, psl->repMatch, psl->nCount, psl->strand, psl->qName, psl->qSize, psl->qStart, psl->qEnd, - psl->tName, psl->tSize, psl->tStart, psl->tEnd - ); - /* debug */ - if (psl->match > 200000000) - { - printf("blk %d : ", psl->blockCount); - pslOutRev(psl, stdout); - } + psl->tName, psl->tSize, psl->tStart, psl->tEnd); deleteElement(psl->tStarts, i , psl->blockCount); deleteElement(psl->qStarts, i , psl->blockCount); deleteElement(psl->blockSizes, i , psl->blockCount); - totSkip++; - skipMatch += psl->blockSizes[i]; + changed = TRUE; } } + if (changed) + { psl->blockCount = newBlockCount; + pslRecalcBounds(psl); + pslComputeInsertCounts(psl); } - /* fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\n", - psl->match, psl->misMatch, psl->repMatch, psl->nCount, - psl->qNumInsert, psl->qBaseInsert, psl->tNumInsert, psl->tBaseInsert, - psl->strand, - psl->qName, psl->qSize, psl->qStart, psl->qEnd, - psl->tName, psl->tSize, psl->tStart, psl->tEnd, - psl->blockCount-1 ); - */ - + } + if (psl->blockCount > 0) pslTabOut(psl, f); - pslFree(&psl); } -printf( "Total skipped %d blocks out of %d alignments, match %d out of %d, in %s\n", - totSkip, totLines, skipMatch, totMatch, outName ); +verbose(1, "Total skipped %d blocks out of %d blocks in %d alignments, match %d out of %d, in %s\n", + totSkip, totBlocks, totLines, skipMatch, totMatch, outName ); fclose(f); lineFileClose(&lf); } int main(int argc, char *argv[]) /* Process command line. */ { if (argc != 3) usage(); pslDropOverlap(argv[1], argv[2]); return 0; }