6f8a50068e6c1558dbbf286b9a573800c68b1fb1 braney Tue May 28 17:16:22 2024 -0700 ongoing work on pair browser diff --git src/hg/utils/buildPairAssembly/buildPairAssembly.c src/hg/utils/buildPairAssembly/buildPairAssembly.c index bd71c3f..b349804 100644 --- src/hg/utils/buildPairAssembly/buildPairAssembly.c +++ src/hg/utils/buildPairAssembly/buildPairAssembly.c @@ -1,448 +1,500 @@ /* buildPairAssembly - From a single coverage chain build a pairwise assembly including all sequence from both query and target. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "chain.h" #include "twoBit.h" #include "dnaseq.h" #include "psl.h" void usage() /* Explain usage and exit. */ { errAbort( "buildPairAssembly - From a single coverage chain build a pairwise assembly including all sequence from both query and target\n" "usage:\n" " buildPairAssembly input.chain tSequenceName tSeq.2bit qSequenceName qSeq.2bit out.fa query.psl target.psl map.bed dup.bed mismatch.bed baseTarget.bed baseQuery.bed\n" "options:\n" " -xxx=XXX\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; struct linearBlock { struct linearBlock *next; +unsigned pStart, pEnd; unsigned tStart, tEnd; unsigned qStart, qEnd; boolean isFlipped; +unsigned anchor; }; int lbCmpQuery(const void *va, const void *vb) /* Compare to sort based on target start. */ { const struct linearBlock *a = *((struct linearBlock **)va); const struct linearBlock *b = *((struct linearBlock **)vb); return a->qStart - b->qStart; } int lbCmpTarget(const void *va, const void *vb) /* Compare to sort based on target start. */ { const struct linearBlock *a = *((struct linearBlock **)va); const struct linearBlock *b = *((struct linearBlock **)vb); return a->tStart - b->tStart; } void outOverlap(FILE *f, char *tSequenceName,char *qSequenceName, struct cBlock *first, struct cBlock *second, int count) { printf("first %d %d second %d %d\n", first->tStart, first->tEnd, second->tStart, second->tEnd); //if (first->tEnd >= second->tEnd) { second->score = 3; fprintf(f, "%s %d %d %s %d %d %d\n",qSequenceName, second->qStart, second->qEnd, tSequenceName, second->tStart, second->tEnd, count); } //else //errAbort("oops"); /* first->tEnd = second->tStart; if (first->tStart == first->tEnd) first->score = 3; first->qEnd = second->qStart; */ } -void buildPairAssembly(char *inputChain, char *tSequenceName, char *t2bitName, char *qSequenceName, char *q2bitName, char *outFaName, char *outQueryPsl, char *outTargetPsl, char *outMapBed, char *outDupBed, char *outMissBed, char *outBaseTargetBed, char *outBaseQueryBed) +void buildPairAssembly(char *inputChain, char *tSequenceName, char *t2bitName, char *qSequenceName, char *q2bitName, char *outFaName, char *outQueryPsl, char *outTargetPsl, char *outMapBed, char *outDupBed, char *outMissBed, char *outBaseTargetBed, char *outBaseQueryBed, char *outSynBlockBed) /* buildPairAssembly - From a single coverage chain build a pairwise assembly including all sequence from both query and target. */ { struct lineFile *lf = lineFileOpen(inputChain, TRUE); struct chain *chain; struct cBlock *blockList = NULL, *cBlock, *nextBlock; struct twoBitFile *q2bit = twoBitOpen(q2bitName); struct twoBitFile *t2bit = twoBitOpen(t2bitName); FILE *outFa = mustOpen(outFaName, "w"); FILE *outMap = mustOpen(outMapBed, "w"); FILE *outDup = mustOpen(outDupBed, "w"); FILE *outMiss = mustOpen(outMissBed, "w"); FILE *outBaseTarget = mustOpen(outBaseTargetBed, "w"); FILE *outBaseQuery = mustOpen(outBaseQueryBed, "w"); - +FILE *outSynBlock = mustOpen(outSynBlockBed, "w"); fprintf(outFa, ">buildPair\n"); while ((chain = chainRead(lf)) != NULL) { if (!sameString(tSequenceName, chain->tName) || !sameString(qSequenceName, chain->qName)) continue; for (cBlock = chain->blockList; cBlock; cBlock = nextBlock) { nextBlock = cBlock->next; if (chain->qStrand == '-') { unsigned qStart = cBlock->qStart; cBlock->qStart = chain->qSize - cBlock->qEnd; cBlock->qEnd = chain->qSize - qStart; cBlock->score = 1; } else cBlock->score = 0; slAddHead(&blockList, cBlock); } } slSort(&blockList, cBlockCmpTarget); unsigned prevBase = blockList->tEnd; //unsigned prevBase = 0; boolean firstBlock = TRUE; struct cBlock *prevCBlock = NULL; int count = 0; for (cBlock = blockList->next; cBlock; cBlock = cBlock->next) { //if (cBlock->score == 2) //continue; if (!firstBlock) { if (cBlock->tStart < prevBase) { outOverlap(outDup, tSequenceName, qSequenceName, prevCBlock, cBlock, count++); //printf("tBlock overlap %d %d\n", cBlock->tStart, prevBase); //cBlock->score = 3; continue; } } else firstBlock = FALSE; prevBase = cBlock->tEnd; prevCBlock = cBlock; } slSort(&blockList, cBlockCmpQuery); prevBase = blockList->qEnd; struct linearBlock *linearBlock; for (cBlock = blockList->next; cBlock; cBlock = cBlock->next) { if (cBlock->score == 3) continue; if (cBlock->qStart < prevBase) { //printf("qBlock overlap %d %d\n", cBlock->qStart, prevBase); cBlock->score = 2; continue; } if (cBlock->qStart > prevBase) { AllocVar(linearBlock); linearBlock->qStart = prevBase; linearBlock->qEnd = cBlock->qStart; if (cBlock->score == 1) linearBlock->isFlipped = TRUE; cBlock->data = (void *)linearBlock; } prevBase = cBlock->qEnd; } slSort(&blockList, cBlockCmpTarget); prevBase = 0; struct linearBlock *linearBlockList = NULL; firstBlock = TRUE; prevCBlock = NULL; for (cBlock = blockList; cBlock; cBlock = cBlock->next) { if (cBlock->score >= 2) continue; if (!firstBlock) { if (cBlock->tStart > prevBase) { AllocVar(linearBlock); linearBlock->tStart = prevBase; linearBlock->tEnd = cBlock->tStart; slAddHead(&linearBlockList, linearBlock); } } else firstBlock = FALSE; if ((cBlock->data != NULL) && (cBlock->score == 0)) { struct linearBlock *qInsert = (struct linearBlock *)cBlock->data; slAddHead(&linearBlockList, qInsert); } AllocVar(linearBlock); linearBlock->qStart = cBlock->qStart; linearBlock->qEnd = cBlock->qEnd; linearBlock->tStart = cBlock->tStart; linearBlock->tEnd = cBlock->tEnd; if (cBlock->score == 1) linearBlock->isFlipped = TRUE; slAddHead(&linearBlockList, linearBlock); if ((cBlock->data != NULL) && (cBlock->score == 1)) { struct linearBlock *qInsert = (struct linearBlock *)cBlock->data; slAddHead(&linearBlockList, qInsert); } prevBase = cBlock->tEnd; prevCBlock = cBlock; } slReverse(&linearBlockList); #define BOTH_COLOR "20,40,140" #define INVERT_COLOR "140,20,140" #define REFONLY_COLOR "125,125,250" #define QUERYONLY_COLOR "180,125,60" unsigned startAddress = 0, size; boolean firstBlockFlipped = linearBlockList->isFlipped; char strand; char *color; +count = 0; for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) { if (linearBlock->qStart == linearBlock->qEnd) { size = linearBlock->tEnd - linearBlock->tStart; struct dnaSeq *dna = twoBitReadSeqFrag(t2bit, tSequenceName, linearBlock->tStart, linearBlock->tEnd); writeSeqWithBreaks(outFa, dna->dna, size, 50 ); fprintf(outMap,"buildPair %d %d refOnly 0 + %d %d %s %d %d 0 0\n", startAddress, startAddress + size, startAddress, startAddress + size, REFONLY_COLOR, linearBlock->tStart, linearBlock->tEnd); } else if (linearBlock->tStart == linearBlock->tEnd) { size = linearBlock->qEnd - linearBlock->qStart; struct dnaSeq *dna = twoBitReadSeqFrag(q2bit, qSequenceName, linearBlock->qStart, linearBlock->qEnd); if (linearBlock->isFlipped) reverseComplement(dna->dna, size); writeSeqWithBreaks(outFa, dna->dna, size, 50 ); char strand = linearBlock->isFlipped ? '-' : '+'; fprintf(outMap,"buildPair %d %d queryOnly 0 %c %d %d %s 0 0 %d %d\n", startAddress, startAddress + size, strand, startAddress, startAddress + size, QUERYONLY_COLOR, linearBlock->qStart, linearBlock->qEnd); } else { size = linearBlock->qEnd - linearBlock->qStart; struct dnaSeq *dna = twoBitReadSeqFrag(t2bit, tSequenceName, linearBlock->tStart, linearBlock->tEnd); writeSeqWithBreaks(outFa, dna->dna, size, 50 ); struct dnaSeq *dna2 = twoBitReadSeqFrag(q2bit, qSequenceName, linearBlock->qStart, linearBlock->qEnd); if (linearBlock->isFlipped) reverseComplement(dna2->dna, size); int ii; for(ii = 0; ii < size; ii++) { if (toupper(dna->dna[ii]) != toupper(dna2->dna[ii])) fprintf(outMiss, "buildPair %d %d %c->%c\n", startAddress + ii, startAddress + ii + 1, toupper(dna->dna[ii]), toupper(dna2->dna[ii])); } if (linearBlock->isFlipped == firstBlockFlipped) { strand = '+'; color = BOTH_COLOR; //fprintf(outMap,"buildPair %d %d both 0 %c %d %d %s %d %d %d %d\n", startAddress, startAddress + size, strand, startAddress, startAddress + size, color, linearBlock->tStart, linearBlock->tEnd, linearBlock->qStart, linearBlock->qEnd); } else { strand = '-'; color = INVERT_COLOR; fprintf(outMap,"buildPair %d %d invert 0 %c %d %d %s %d %d %d %d\n", startAddress, startAddress + size, strand, startAddress, startAddress + size, color, linearBlock->tStart, linearBlock->tEnd, linearBlock->qStart, linearBlock->qEnd); } + + linearBlock->anchor = ++count; } + linearBlock->pStart = startAddress; + linearBlock->pEnd = startAddress + size; startAddress += size; } +//for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) + //printf("%d %d\n", linearBlock->pStart, linearBlock->pEnd); + unsigned sumSize = startAddress; unsigned tSize = twoBitSeqSize(t2bit, tSequenceName); unsigned qSize = twoBitSeqSize(q2bit, qSequenceName); printf("qSize %d\n", qSize); FILE *outQuery = mustOpen(outQueryPsl, "w"); FILE *outTarget = mustOpen(outTargetPsl, "w"); startAddress = 0; for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) { if (linearBlock->qStart == linearBlock->qEnd) { size = linearBlock->tEnd - linearBlock->tStart; struct psl *psl = pslNew(tSequenceName, tSize, linearBlock->tStart, linearBlock->tEnd, "buildPair", sumSize, startAddress, startAddress + size, "+", 1, 0); psl->qStarts[0] = linearBlock->tStart; psl->tStarts[0] = startAddress; psl->blockSizes[0] = size; psl->blockCount = 1; pslTabOut(psl, outTarget); } else if (linearBlock->tStart == linearBlock->tEnd) { size = linearBlock->qEnd - linearBlock->qStart; struct psl *psl = pslNew(qSequenceName, qSize, linearBlock->qStart, linearBlock->qEnd, "buildPair", sumSize, startAddress, startAddress + size, linearBlock->isFlipped ? "-" : "+", 1, 0); if (linearBlock->isFlipped) psl->qStarts[0] = qSize - linearBlock->qEnd; else psl->qStarts[0] = linearBlock->qStart; psl->tStarts[0] = startAddress; psl->blockSizes[0] = size; psl->blockCount = 1; pslTabOut(psl, outQuery); } else { size = linearBlock->qEnd - linearBlock->qStart; struct psl *psl = pslNew(tSequenceName, tSize, linearBlock->tStart, linearBlock->tEnd, "buildPair", sumSize, startAddress, startAddress + size, "+", 1, 0); psl->qStarts[0] = linearBlock->tStart; psl->tStarts[0] = startAddress; psl->blockSizes[0] = size; psl->blockCount = 1; pslTabOut(psl, outTarget); { struct psl *psl = pslNew(qSequenceName, qSize, linearBlock->qStart, linearBlock->qEnd, "buildPair", sumSize, startAddress, startAddress + size, linearBlock->isFlipped ? "-" : "+", 1, 0); if (linearBlock->isFlipped) psl->qStarts[0] = qSize - linearBlock->qEnd; else psl->qStarts[0] = linearBlock->qStart; psl->tStarts[0] = startAddress; psl->blockSizes[0] = size; psl->blockCount = 1; pslTabOut(psl, outQuery); } } startAddress += size; } startAddress = 0; unsigned currentAddress = 0; unsigned totalSize = 0; unsigned startT = 0; unsigned endT = 0; //unsigned size; //slSort(&linearBlockList, lbCmpTarget); for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) { if (linearBlock->tStart == linearBlock->tEnd) size = linearBlock->qEnd - linearBlock->qStart; else size = linearBlock->tEnd - linearBlock->tStart; if ((linearBlock->tStart == linearBlock->tEnd) || ((endT != 0) && (endT != linearBlock->tStart))) { if (startT != 0) fprintf(outBaseTarget, "buildPair %d %d %d %d\n",startAddress, currentAddress, startT, endT); startT = endT = totalSize = 0; } //else if (linearBlock->tStart != linearBlock->tEnd) { if (startT == 0) { startAddress = currentAddress; startT = linearBlock->tStart; } endT = linearBlock->tEnd; } //totalSize += size; currentAddress += size; } startAddress = 0; currentAddress = 0; totalSize = 0; startT = 0; endT = 0; for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) { if (linearBlock->tStart == linearBlock->tEnd) size = linearBlock->qEnd - linearBlock->qStart; else size = linearBlock->tEnd - linearBlock->tStart; if ((linearBlock->qStart == linearBlock->qEnd) || ((endT != 0) && (endT != linearBlock->qStart))) { if (startT != 0) fprintf(outBaseQuery, "buildPair %d %d %d %d\n",startAddress, currentAddress, startT, endT); startT = endT = totalSize = 0; } //else if (linearBlock->qStart != linearBlock->qEnd) { if (startT == 0) { startAddress = currentAddress; startT = linearBlock->qStart; } endT = linearBlock->qEnd; } //totalSize += size; currentAddress += size; } +slSort(&linearBlockList, lbCmpQuery); +int isLastFlipped = linearBlockList->isFlipped; +unsigned lastEnd = linearBlockList->qEnd; +unsigned lastAnchor; +startAddress = linearBlockList->pStart; +count = 0; +boolean first = TRUE; +for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) + { + if (linearBlock->anchor) + { + if (first) + { + first = FALSE; + startAddress = linearBlock->pStart; + isLastFlipped = linearBlock->isFlipped; + lastEnd = linearBlock->pEnd; + lastAnchor = linearBlock->anchor; + } + else if ((abs(linearBlock->anchor - lastAnchor) != 1 ) || (isLastFlipped != linearBlock->isFlipped)) + { + int signedCount = ++count; + if (isLastFlipped) + signedCount *= -1; + if (startAddress > lastEnd) + fprintf(outSynBlock, "buildPair %d %d %d\n",lastEnd, startAddress, signedCount); + else + fprintf(outSynBlock, "buildPair %d %d %d\n",startAddress, lastEnd, signedCount); + startAddress = linearBlock->pStart; + } + lastEnd = linearBlock->pEnd; + lastAnchor = linearBlock->anchor; + isLastFlipped = linearBlock->isFlipped; + } + } +int signedCount = ++count; +if (isLastFlipped) + signedCount *= -1; +if (startAddress > lastEnd) + fprintf(outSynBlock, "buildPair %d %d %d\n",lastEnd, startAddress, signedCount); +else + fprintf(outSynBlock, "buildPair %d %d %d\n",startAddress, lastEnd, signedCount); // validate slSort(&linearBlockList, lbCmpTarget); firstBlock = TRUE; for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) { if (linearBlock->tStart == linearBlock->tEnd) continue; if (!firstBlock) { if (linearBlock->tStart != startAddress) errAbort("tStart %d %d", startAddress, linearBlock->tStart); } else firstBlock = FALSE; startAddress = linearBlock->tEnd; } firstBlock = TRUE; slSort(&linearBlockList, lbCmpQuery); for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) { if (linearBlock->qStart == linearBlock->qEnd) continue; if (!firstBlock) { if (linearBlock->qStart != startAddress) errAbort("qStart %d %d", startAddress, linearBlock->qStart); } else firstBlock = FALSE; startAddress = linearBlock->qEnd; } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); -if (argc != 14) +if (argc != 15) usage(); -buildPairAssembly(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12], argv[13]); +buildPairAssembly(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12], argv[13], argv[14]); return 0; }