c631d4d20178d19ed60da27cd7e34d63872d4a90 markd Thu May 16 10:34:45 2024 -0700 added pslSpliceJunctions to gather information in splice junctions to use in comprative mapping analysis diff --git src/utils/pslSpliceJunctions/pslSpliceJunctions.c src/utils/pslSpliceJunctions/pslSpliceJunctions.c new file mode 100644 index 0000000..8e63e6e --- /dev/null +++ src/utils/pslSpliceJunctions/pslSpliceJunctions.c @@ -0,0 +1,144 @@ +/* pslSpliceJunctions - Extract splice junctions for a PSL file. */ +#include "common.h" +#include "options.h" +#include "twoBit.h" +#include "psl.h" +#include "dnautil.h" + +static void usage() +/* Explain usage and exit. */ +{ +errAbort( + "pslSpliceJunctions - Extract splice junctions from a PSL file\n" + "usage:\n" + " pslSpliceJunctions pslFile genome2bit junctionsTsv\n" + "options:\n" + "\n" + "Output query and target coordinates of target gaps, often introns,\n" + "in alignments. Output is always in query-positive and target-positive coordinates,\n" + "with only gaps in the target reported. Canonical junctions will be in upper cases,\n" + "unknown ones lower case. \n"); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +/* canonical splice junctions */ +static char *canonicalJuncs[][2] = { + {"GT", "AG"}, + {"AT", "AC"}, + {NULL, NULL} +}; + +static boolean isCanonicalJuncs(char *junc5p, char *junc3p) +/* is this canonical junction pair? */ +{ +for (int i = 0; canonicalJuncs[i][0] != NULL; i++) + { + if (sameString(junc5p, canonicalJuncs[i][0]) && sameString(junc3p, canonicalJuncs[i][1])) + return TRUE; + } +return FALSE; +} + +static void getJuncs(struct psl *transPsl, unsigned tStart, unsigned tEnd, unsigned tSize, + struct twoBitFile *genomeFh, char junc5p[3], char junc3p[3]) +/* get sequences at junctions */ +{ +if (pslTStrand(transPsl) == '-') + reverseUnsignedRange(&tStart, &tEnd, transPsl->tSize); +struct dnaSeq *juncA = twoBitReadSeqFrag(genomeFh, transPsl->tName, tStart, tStart + 2); +struct dnaSeq *juncB = twoBitReadSeqFrag(genomeFh, transPsl->tName, tEnd - 2, tEnd); + +if (pslTStrand(transPsl) == '-') + { + struct dnaSeq *hold = juncA; + juncA = juncB; + juncB = hold; + reverseComplement(juncA->dna, 2); + reverseComplement(juncB->dna, 2); + } + +safecpy(junc5p, sizeof(junc5p), juncA->dna); +safecpy(junc3p, sizeof(junc3p), juncB->dna); +freeDnaSeq(&juncA); +freeDnaSeq(&juncB); + +touppers(junc5p); +touppers(junc3p); +if (!isCanonicalJuncs(junc5p, junc3p)) + { + tolowers(junc5p); + tolowers(junc3p); + } +} + +static void writeRow(FILE *juncsTsvFh, struct psl *transPsl, + unsigned qStart, unsigned qEnd, unsigned tStart, unsigned tEnd, + int iBlk, int tGapIdx, char *junc5p, char *junc3p) +/* output information about one junction or gap. */ +{ +if (pslTStrand(transPsl) == '-') + reverseUnsignedRange(&tStart, &tEnd, transPsl->tSize); + +fprintf(juncsTsvFh, "%s\t%u\t%u\t%d\t%s\t%u\t%u\t%d\t%c\t%d\t%d\t%u\t%s\t%s\t%d\n", + transPsl->qName, qStart, qEnd, transPsl->qSize, + transPsl->tName, tStart, tEnd, transPsl->tSize, + pslTStrand(transPsl), iBlk, tGapIdx, tEnd - tStart, junc5p, junc3p, + isCanonicalJuncs(junc5p, junc3p)); +} + +static void processPslGap(struct psl* transPsl, int iBlk, int tGapIdx, struct twoBitFile *genomeFh, FILE *juncsTsvFh) +/* process one target gap; psl is query-positive */ +{ +char junc5p[3], junc3p[3]; +getJuncs(transPsl, pslTEnd(transPsl, iBlk), pslTStart(transPsl, iBlk + 1), pslTStrand(transPsl), + genomeFh, junc5p, junc3p); +writeRow(juncsTsvFh, transPsl, pslQEnd(transPsl, iBlk), pslQStart(transPsl, iBlk + 1), + pslTStart(transPsl, iBlk), pslTEnd(transPsl, iBlk + 1), iBlk, tGapIdx, + junc5p, junc3p); +} + +static void processPsl(struct psl* transPsl, struct twoBitFile *genomeFh, FILE *juncsTsvFh) +/* get splice junctions of one alignment */ +{ +if (pslQStrand(transPsl) == '-') + pslRc(transPsl); +int tGapIdx = 0; +for (int iBlk = 0; iBlk < transPsl->blockCount - 1; iBlk++) + { + if (pslTEnd(transPsl, iBlk) < pslTStart(transPsl, iBlk + 1)) + { + processPslGap(transPsl, iBlk, tGapIdx, genomeFh, juncsTsvFh); + tGapIdx++; + } + } +} + +static void pslSpliceJunctions(char *pslFile, char *genome2bitFile, char *junctionsTsv) +/* pslSpliceJunctions - Extract splice junctions for a PSL file. */ +{ +struct psl *transPsls = pslLoadAll(pslFile); +slSort(&transPsls, pslCmpTarget); + +struct twoBitFile *genomeFh = twoBitOpen(genome2bitFile); +FILE *juncsTsvFh = mustOpen(junctionsTsv, "w"); +fprintf(juncsTsvFh, "qName\tqStart\tqEnd\tqSize\ttName\ttStart\tEnd\tSize\tstrand\tiBlk\ttGapIdx\ttGapSize\tjunc5p\tjunc3p\tisCannon\n"); + +for (struct psl *transPsl = transPsls; transPsl != NULL; transPsl = transPsl->next) + processPsl(transPsl, genomeFh, juncsTsvFh); +carefulClose(&juncsTsvFh); +twoBitClose(&genomeFh); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 4) + usage(); +pslSpliceJunctions(argv[1], argv[2], argv[3]); +return 0; +}