6f8a50068e6c1558dbbf286b9a573800c68b1fb1 braney Tue May 28 17:16:22 2024 -0700 ongoing work on pair browser diff --git src/hg/utils/buildPairAssembly/buildPairAssembly.c src/hg/utils/buildPairAssembly/buildPairAssembly.c index bd71c3f..b349804 100644 --- src/hg/utils/buildPairAssembly/buildPairAssembly.c +++ src/hg/utils/buildPairAssembly/buildPairAssembly.c @@ -16,33 +16,35 @@ "usage:\n" " buildPairAssembly input.chain tSequenceName tSeq.2bit qSequenceName qSeq.2bit out.fa query.psl target.psl map.bed dup.bed mismatch.bed baseTarget.bed baseQuery.bed\n" "options:\n" " -xxx=XXX\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; struct linearBlock { struct linearBlock *next; +unsigned pStart, pEnd; unsigned tStart, tEnd; unsigned qStart, qEnd; boolean isFlipped; +unsigned anchor; }; int lbCmpQuery(const void *va, const void *vb) /* Compare to sort based on target start. */ { const struct linearBlock *a = *((struct linearBlock **)va); const struct linearBlock *b = *((struct linearBlock **)vb); return a->qStart - b->qStart; } int lbCmpTarget(const void *va, const void *vb) /* Compare to sort based on target start. */ { const struct linearBlock *a = *((struct linearBlock **)va); const struct linearBlock *b = *((struct linearBlock **)vb); @@ -58,45 +60,45 @@ fprintf(f, "%s %d %d %s %d %d %d\n",qSequenceName, second->qStart, second->qEnd, tSequenceName, second->tStart, second->tEnd, count); } //else //errAbort("oops"); /* first->tEnd = second->tStart; if (first->tStart == first->tEnd) first->score = 3; first->qEnd = second->qStart; */ } -void buildPairAssembly(char *inputChain, char *tSequenceName, char *t2bitName, char *qSequenceName, char *q2bitName, char *outFaName, char *outQueryPsl, char *outTargetPsl, char *outMapBed, char *outDupBed, char *outMissBed, char *outBaseTargetBed, char *outBaseQueryBed) +void buildPairAssembly(char *inputChain, char *tSequenceName, char *t2bitName, char *qSequenceName, char *q2bitName, char *outFaName, char *outQueryPsl, char *outTargetPsl, char *outMapBed, char *outDupBed, char *outMissBed, char *outBaseTargetBed, char *outBaseQueryBed, char *outSynBlockBed) /* buildPairAssembly - From a single coverage chain build a pairwise assembly including all sequence from both query and target. */ { struct lineFile *lf = lineFileOpen(inputChain, TRUE); struct chain *chain; struct cBlock *blockList = NULL, *cBlock, *nextBlock; struct twoBitFile *q2bit = twoBitOpen(q2bitName); struct twoBitFile *t2bit = twoBitOpen(t2bitName); FILE *outFa = mustOpen(outFaName, "w"); FILE *outMap = mustOpen(outMapBed, "w"); FILE *outDup = mustOpen(outDupBed, "w"); FILE *outMiss = mustOpen(outMissBed, "w"); FILE *outBaseTarget = mustOpen(outBaseTargetBed, "w"); FILE *outBaseQuery = mustOpen(outBaseQueryBed, "w"); - +FILE *outSynBlock = mustOpen(outSynBlockBed, "w"); fprintf(outFa, ">buildPair\n"); while ((chain = chainRead(lf)) != NULL) { if (!sameString(tSequenceName, chain->tName) || !sameString(qSequenceName, chain->qName)) continue; for (cBlock = chain->blockList; cBlock; cBlock = nextBlock) { nextBlock = cBlock->next; if (chain->qStrand == '-') { unsigned qStart = cBlock->qStart; cBlock->qStart = chain->qSize - cBlock->qEnd; cBlock->qEnd = chain->qSize - qStart; @@ -208,30 +210,31 @@ prevBase = cBlock->tEnd; prevCBlock = cBlock; } slReverse(&linearBlockList); #define BOTH_COLOR "20,40,140" #define INVERT_COLOR "140,20,140" #define REFONLY_COLOR "125,125,250" #define QUERYONLY_COLOR "180,125,60" unsigned startAddress = 0, size; boolean firstBlockFlipped = linearBlockList->isFlipped; char strand; char *color; +count = 0; for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) { if (linearBlock->qStart == linearBlock->qEnd) { size = linearBlock->tEnd - linearBlock->tStart; struct dnaSeq *dna = twoBitReadSeqFrag(t2bit, tSequenceName, linearBlock->tStart, linearBlock->tEnd); writeSeqWithBreaks(outFa, dna->dna, size, 50 ); fprintf(outMap,"buildPair %d %d refOnly 0 + %d %d %s %d %d 0 0\n", startAddress, startAddress + size, startAddress, startAddress + size, REFONLY_COLOR, linearBlock->tStart, linearBlock->tEnd); } else if (linearBlock->tStart == linearBlock->tEnd) { size = linearBlock->qEnd - linearBlock->qStart; struct dnaSeq *dna = twoBitReadSeqFrag(q2bit, qSequenceName, linearBlock->qStart, linearBlock->qEnd); if (linearBlock->isFlipped) reverseComplement(dna->dna, size); @@ -255,34 +258,41 @@ fprintf(outMiss, "buildPair %d %d %c->%c\n", startAddress + ii, startAddress + ii + 1, toupper(dna->dna[ii]), toupper(dna2->dna[ii])); } if (linearBlock->isFlipped == firstBlockFlipped) { strand = '+'; color = BOTH_COLOR; //fprintf(outMap,"buildPair %d %d both 0 %c %d %d %s %d %d %d %d\n", startAddress, startAddress + size, strand, startAddress, startAddress + size, color, linearBlock->tStart, linearBlock->tEnd, linearBlock->qStart, linearBlock->qEnd); } else { strand = '-'; color = INVERT_COLOR; fprintf(outMap,"buildPair %d %d invert 0 %c %d %d %s %d %d %d %d\n", startAddress, startAddress + size, strand, startAddress, startAddress + size, color, linearBlock->tStart, linearBlock->tEnd, linearBlock->qStart, linearBlock->qEnd); } + + linearBlock->anchor = ++count; } + linearBlock->pStart = startAddress; + linearBlock->pEnd = startAddress + size; startAddress += size; } +//for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) + //printf("%d %d\n", linearBlock->pStart, linearBlock->pEnd); + unsigned sumSize = startAddress; unsigned tSize = twoBitSeqSize(t2bit, tSequenceName); unsigned qSize = twoBitSeqSize(q2bit, qSequenceName); printf("qSize %d\n", qSize); FILE *outQuery = mustOpen(outQueryPsl, "w"); FILE *outTarget = mustOpen(outTargetPsl, "w"); startAddress = 0; for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) { if (linearBlock->qStart == linearBlock->qEnd) { size = linearBlock->tEnd - linearBlock->tStart; struct psl *psl = pslNew(tSequenceName, tSize, linearBlock->tStart, linearBlock->tEnd, "buildPair", sumSize, startAddress, startAddress + size, "+", 1, 0); @@ -390,30 +400,72 @@ } //else if (linearBlock->qStart != linearBlock->qEnd) { if (startT == 0) { startAddress = currentAddress; startT = linearBlock->qStart; } endT = linearBlock->qEnd; } //totalSize += size; currentAddress += size; } +slSort(&linearBlockList, lbCmpQuery); +int isLastFlipped = linearBlockList->isFlipped; +unsigned lastEnd = linearBlockList->qEnd; +unsigned lastAnchor; +startAddress = linearBlockList->pStart; +count = 0; +boolean first = TRUE; +for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) + { + if (linearBlock->anchor) + { + if (first) + { + first = FALSE; + startAddress = linearBlock->pStart; + isLastFlipped = linearBlock->isFlipped; + lastEnd = linearBlock->pEnd; + lastAnchor = linearBlock->anchor; + } + else if ((abs(linearBlock->anchor - lastAnchor) != 1 ) || (isLastFlipped != linearBlock->isFlipped)) + { + int signedCount = ++count; + if (isLastFlipped) + signedCount *= -1; + if (startAddress > lastEnd) + fprintf(outSynBlock, "buildPair %d %d %d\n",lastEnd, startAddress, signedCount); + else + fprintf(outSynBlock, "buildPair %d %d %d\n",startAddress, lastEnd, signedCount); + startAddress = linearBlock->pStart; + } + lastEnd = linearBlock->pEnd; + lastAnchor = linearBlock->anchor; + isLastFlipped = linearBlock->isFlipped; + } + } +int signedCount = ++count; +if (isLastFlipped) + signedCount *= -1; +if (startAddress > lastEnd) + fprintf(outSynBlock, "buildPair %d %d %d\n",lastEnd, startAddress, signedCount); +else + fprintf(outSynBlock, "buildPair %d %d %d\n",startAddress, lastEnd, signedCount); // validate slSort(&linearBlockList, lbCmpTarget); firstBlock = TRUE; for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) { if (linearBlock->tStart == linearBlock->tEnd) continue; if (!firstBlock) { if (linearBlock->tStart != startAddress) errAbort("tStart %d %d", startAddress, linearBlock->tStart); } else firstBlock = FALSE; @@ -429,20 +481,20 @@ { if (linearBlock->qStart != startAddress) errAbort("qStart %d %d", startAddress, linearBlock->qStart); } else firstBlock = FALSE; startAddress = linearBlock->qEnd; } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); -if (argc != 14) +if (argc != 15) usage(); -buildPairAssembly(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12], argv[13]); +buildPairAssembly(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12], argv[13], argv[14]); return 0; }