83a03d08eb15f3352267026c3d875e2f8ff68bd0 braney Wed May 8 16:09:49 2024 -0700 a utility to build an assembly hub for a sequence that includes both sequences from a pairwise alignment diff --git src/hg/utils/buildPairAssembly/buildPairAssembly.c src/hg/utils/buildPairAssembly/buildPairAssembly.c new file mode 100644 index 0000000..ec6fc83 --- /dev/null +++ src/hg/utils/buildPairAssembly/buildPairAssembly.c @@ -0,0 +1,222 @@ +/* buildPairAssembly - From a single coverage chain build a pairwise assembly including all sequence from both query and target. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "chain.h" +#include "twoBit.h" +#include "dnaseq.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "buildPairAssembly - From a single coverage chain build a pairwise assembly including all sequence from both query and target\n" + "usage:\n" + " buildPairAssembly input.chain tSequenceName tSeq.2bit qSequenceName qSeq.2bit out.fa query.psl target.psl map.bed\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +struct linearBlock +{ +struct linearBlock *next; +unsigned tStart, tEnd; +unsigned qStart, qEnd; +boolean isFlipped; +}; + +int lbCmpQuery(const void *va, const void *vb) +/* Compare to sort based on target start. */ +{ +const struct linearBlock *a = *((struct linearBlock **)va); +const struct linearBlock *b = *((struct linearBlock **)vb); +return a->qStart - b->qStart; +} + +int lbCmpTarget(const void *va, const void *vb) +/* Compare to sort based on target start. */ +{ +const struct linearBlock *a = *((struct linearBlock **)va); +const struct linearBlock *b = *((struct linearBlock **)vb); +return a->tStart - b->tStart; +} + +void buildPairAssembly(char *inputChain, char *tSequenceName, char *t2bitName, char *qSequenceName, char *q2bitName, char *outFaName, char *outQueryPsl, char *outTargetPsl, char *outMapBed) +/* buildPairAssembly - From a single coverage chain build a pairwise assembly including all sequence from both query and target. */ +{ +struct lineFile *lf = lineFileOpen(inputChain, TRUE); +struct chain *chain; +struct cBlock *blockList = NULL, *cBlock, *nextBlock; +struct twoBitFile *q2bit = twoBitOpen(q2bitName); +struct twoBitFile *t2bit = twoBitOpen(t2bitName); +FILE *outFa = mustOpen(outFaName, "w"); + +fprintf(outFa, ">chr1\n"); + +while ((chain = chainRead(lf)) != NULL) + { + if (!sameString(tSequenceName, chain->tName) || !sameString(qSequenceName, chain->qName)) + continue; + + for (cBlock = chain->blockList; cBlock; cBlock = nextBlock) + { + nextBlock = cBlock->next; + if (chain->qStrand == '-') + { + unsigned qStart = cBlock->qStart; + cBlock->qStart = chain->qSize - cBlock->qEnd; + cBlock->qEnd = chain->qSize - qStart; + cBlock->score = 1; + } + else + cBlock->score = 0; + + slAddHead(&blockList, cBlock); + } + } + +slSort(&blockList, cBlockCmpQuery); +unsigned prevBase = blockList->qEnd; +struct linearBlock *linearBlock; +for (cBlock = blockList->next; cBlock; cBlock = cBlock->next) + { + if (cBlock->qStart < prevBase) + { + printf("qBlock overlap %d %d\n", cBlock->qStart, prevBase); + cBlock->score = 2; + continue; + } + if (cBlock->qStart > prevBase) + { + AllocVar(linearBlock); + linearBlock->qStart = prevBase; + linearBlock->qEnd = cBlock->qStart; + if (cBlock->score == 1) + linearBlock->isFlipped = TRUE; + cBlock->data = (void *)linearBlock; + } + prevBase = cBlock->qEnd; + } + +slSort(&blockList, cBlockCmpTarget); +prevBase = 0; +struct linearBlock *linearBlockList = NULL; +boolean firstBlock = TRUE; + +for (cBlock = blockList; cBlock; cBlock = cBlock->next) + { + if (cBlock->tStart < prevBase) + { + //printf("tBlock overlap %d %d\n", cBlock->tStart, prevBase); + continue; + } + if (cBlock->score == 2) + continue; + if (!firstBlock) + { + if (cBlock->tStart > prevBase) + { + AllocVar(linearBlock); + linearBlock->tStart = prevBase; + linearBlock->tEnd = cBlock->tStart; + slAddHead(&linearBlockList, linearBlock); + } + if (cBlock->data != NULL) + { + struct linearBlock *qInsert = (struct linearBlock *)cBlock->data; + slAddHead(&linearBlockList, qInsert); + } + } + else + firstBlock = FALSE; + + AllocVar(linearBlock); + linearBlock->qStart = cBlock->qStart; + linearBlock->qEnd = cBlock->qEnd; + linearBlock->tStart = cBlock->tStart; + linearBlock->tEnd = cBlock->tEnd; + if (cBlock->score == 1) + linearBlock->isFlipped = TRUE; + slAddHead(&linearBlockList, linearBlock); + + prevBase = cBlock->tEnd; + } + +slReverse(&linearBlockList); +unsigned startAddress = 0, size; +for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) + { + if (linearBlock->qStart == linearBlock->qEnd) + { + struct dnaSeq *dna = twoBitReadSeqFrag(t2bit, tSequenceName, linearBlock->tStart, linearBlock->tEnd); + fprintf(outFa, "%s\n", (char *)dna->dna); + size = linearBlock->tEnd - linearBlock->tStart; + printf("chr1 %d %d %d refOnly %d %d\n", startAddress, startAddress + size, linearBlock->isFlipped, linearBlock->tStart, linearBlock->tEnd); + } + else if (linearBlock->tStart == linearBlock->tEnd) + { + struct dnaSeq *dna = twoBitReadSeqFrag(q2bit, qSequenceName, linearBlock->qStart, linearBlock->qEnd); + fprintf(outFa, "%s\n", (char *)dna->dna); + size = linearBlock->qEnd - linearBlock->qStart; + printf("chr1 %d %d %d queryOnly %d %d\n", startAddress, startAddress + size, linearBlock->isFlipped, linearBlock->qStart, linearBlock->qEnd); + } + else + { + struct dnaSeq *dna = twoBitReadSeqFrag(t2bit, tSequenceName, linearBlock->tStart, linearBlock->tEnd); + fprintf(outFa, "%s\n", (char *)dna->dna); + size = linearBlock->qEnd - linearBlock->qStart; + printf("chr1 %d %d %d both %d %d %d %d\n", startAddress, startAddress + size, linearBlock->isFlipped,linearBlock->tStart, linearBlock->tEnd, linearBlock->qStart, linearBlock->qEnd); + } + startAddress += size; + } + +// validate +slSort(&linearBlockList, lbCmpTarget); +firstBlock = TRUE; +for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) + { + if (linearBlock->tStart == linearBlock->tEnd) + continue; + if (!firstBlock) + { + if (linearBlock->tStart != startAddress) + errAbort("tStart %d %d", startAddress, linearBlock->tStart); + } + else + firstBlock = FALSE; + startAddress = linearBlock->tEnd; + } +firstBlock = TRUE; +slSort(&linearBlockList, lbCmpQuery); +for(linearBlock = linearBlockList; linearBlock; linearBlock = linearBlock->next) + { + if (linearBlock->qStart == linearBlock->qEnd) + continue; + if (!firstBlock) + { + if (linearBlock->qStart != startAddress) + errAbort("qStart %d %d", startAddress, linearBlock->qStart); + } + else + firstBlock = FALSE; + startAddress = linearBlock->qEnd; + } + +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 10) + usage(); +buildPairAssembly(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9]); +return 0; +}