48cc0aa29826d6f6e99357a12eda728d76a7d132 markd Fri May 17 02:31:33 2024 -0700 fixed error in pslSpliceJunctions TSV header diff --git src/utils/pslSpliceJunctions/pslSpliceJunctions.c src/utils/pslSpliceJunctions/pslSpliceJunctions.c index 8e63e6e..c20db7a 100644 --- src/utils/pslSpliceJunctions/pslSpliceJunctions.c +++ src/utils/pslSpliceJunctions/pslSpliceJunctions.c @@ -1,144 +1,144 @@ /* pslSpliceJunctions - Extract splice junctions for a PSL file. */ #include "common.h" #include "options.h" #include "twoBit.h" #include "psl.h" #include "dnautil.h" static void usage() /* Explain usage and exit. */ { errAbort( "pslSpliceJunctions - Extract splice junctions from a PSL file\n" "usage:\n" " pslSpliceJunctions pslFile genome2bit junctionsTsv\n" "options:\n" "\n" "Output query and target coordinates of target gaps, often introns,\n" "in alignments. Output is always in query-positive and target-positive coordinates,\n" "with only gaps in the target reported. Canonical junctions will be in upper cases,\n" "unknown ones lower case. \n"); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; /* canonical splice junctions */ static char *canonicalJuncs[][2] = { {"GT", "AG"}, {"AT", "AC"}, {NULL, NULL} }; static boolean isCanonicalJuncs(char *junc5p, char *junc3p) /* is this canonical junction pair? */ { for (int i = 0; canonicalJuncs[i][0] != NULL; i++) { if (sameString(junc5p, canonicalJuncs[i][0]) && sameString(junc3p, canonicalJuncs[i][1])) return TRUE; } return FALSE; } static void getJuncs(struct psl *transPsl, unsigned tStart, unsigned tEnd, unsigned tSize, struct twoBitFile *genomeFh, char junc5p[3], char junc3p[3]) /* get sequences at junctions */ { if (pslTStrand(transPsl) == '-') reverseUnsignedRange(&tStart, &tEnd, transPsl->tSize); struct dnaSeq *juncA = twoBitReadSeqFrag(genomeFh, transPsl->tName, tStart, tStart + 2); struct dnaSeq *juncB = twoBitReadSeqFrag(genomeFh, transPsl->tName, tEnd - 2, tEnd); if (pslTStrand(transPsl) == '-') { struct dnaSeq *hold = juncA; juncA = juncB; juncB = hold; reverseComplement(juncA->dna, 2); reverseComplement(juncB->dna, 2); } safecpy(junc5p, sizeof(junc5p), juncA->dna); safecpy(junc3p, sizeof(junc3p), juncB->dna); freeDnaSeq(&juncA); freeDnaSeq(&juncB); touppers(junc5p); touppers(junc3p); if (!isCanonicalJuncs(junc5p, junc3p)) { tolowers(junc5p); tolowers(junc3p); } } static void writeRow(FILE *juncsTsvFh, struct psl *transPsl, unsigned qStart, unsigned qEnd, unsigned tStart, unsigned tEnd, int iBlk, int tGapIdx, char *junc5p, char *junc3p) /* output information about one junction or gap. */ { if (pslTStrand(transPsl) == '-') reverseUnsignedRange(&tStart, &tEnd, transPsl->tSize); fprintf(juncsTsvFh, "%s\t%u\t%u\t%d\t%s\t%u\t%u\t%d\t%c\t%d\t%d\t%u\t%s\t%s\t%d\n", transPsl->qName, qStart, qEnd, transPsl->qSize, transPsl->tName, tStart, tEnd, transPsl->tSize, pslTStrand(transPsl), iBlk, tGapIdx, tEnd - tStart, junc5p, junc3p, isCanonicalJuncs(junc5p, junc3p)); } static void processPslGap(struct psl* transPsl, int iBlk, int tGapIdx, struct twoBitFile *genomeFh, FILE *juncsTsvFh) /* process one target gap; psl is query-positive */ { char junc5p[3], junc3p[3]; getJuncs(transPsl, pslTEnd(transPsl, iBlk), pslTStart(transPsl, iBlk + 1), pslTStrand(transPsl), genomeFh, junc5p, junc3p); writeRow(juncsTsvFh, transPsl, pslQEnd(transPsl, iBlk), pslQStart(transPsl, iBlk + 1), pslTStart(transPsl, iBlk), pslTEnd(transPsl, iBlk + 1), iBlk, tGapIdx, junc5p, junc3p); } static void processPsl(struct psl* transPsl, struct twoBitFile *genomeFh, FILE *juncsTsvFh) /* get splice junctions of one alignment */ { if (pslQStrand(transPsl) == '-') pslRc(transPsl); int tGapIdx = 0; for (int iBlk = 0; iBlk < transPsl->blockCount - 1; iBlk++) { if (pslTEnd(transPsl, iBlk) < pslTStart(transPsl, iBlk + 1)) { processPslGap(transPsl, iBlk, tGapIdx, genomeFh, juncsTsvFh); tGapIdx++; } } } static void pslSpliceJunctions(char *pslFile, char *genome2bitFile, char *junctionsTsv) /* pslSpliceJunctions - Extract splice junctions for a PSL file. */ { struct psl *transPsls = pslLoadAll(pslFile); slSort(&transPsls, pslCmpTarget); struct twoBitFile *genomeFh = twoBitOpen(genome2bitFile); FILE *juncsTsvFh = mustOpen(junctionsTsv, "w"); -fprintf(juncsTsvFh, "qName\tqStart\tqEnd\tqSize\ttName\ttStart\tEnd\tSize\tstrand\tiBlk\ttGapIdx\ttGapSize\tjunc5p\tjunc3p\tisCannon\n"); +fprintf(juncsTsvFh, "qName\tqStart\tqEnd\tqSize\ttName\ttStart\ttEnd\ttSize\tstrand\tiBlk\ttGapIdx\ttGapSize\tjunc5p\tjunc3p\tisCannon\n"); for (struct psl *transPsl = transPsls; transPsl != NULL; transPsl = transPsl->next) processPsl(transPsl, genomeFh, juncsTsvFh); carefulClose(&juncsTsvFh); twoBitClose(&genomeFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); pslSpliceJunctions(argv[1], argv[2], argv[3]); return 0; }