148cd685553c26bb02f7d6e9cde931a153faa7d8 hiram Tue Jun 30 14:12:33 2020 -0700 now correctly numbering reverse strand exons refs #25379 diff --git src/hg/genePredToGtf/genePredToGtf.c src/hg/genePredToGtf/genePredToGtf.c index b0bf513..fb8b4f5 100644 --- src/hg/genePredToGtf/genePredToGtf.c +++ src/hg/genePredToGtf/genePredToGtf.c @@ -73,53 +73,61 @@ hashAddInt(dupeHash, root, 1); return root; } else { static char buf[256]; int val = ptToInt(hel->val) + 1; hel->val = intToPt(val); safef(buf, sizeof(buf), "%s_%d", root, val); return buf; } } static void writeGtfLine(FILE *f, char *source, char *name, char *geneName, char *chrom, char strand, char *type, - int start, int end, int exonIx, int frame) + int start, int end, int exonIx, int frame, int exonCount) /* Write a single exon to GTF file */ { assert(start <= end); /* convert frame to phase */ char phase = (frame < 0) ? '.' : (frame == 0) ? '0' : (frame == 1) ? '2' : '1'; fprintf(f, "%s\t", chrom); fprintf(f, "%s\t", source); fprintf(f, "%s\t", type); fprintf(f, "%d\t", start+1); fprintf(f, "%d\t", end); fprintf(f, ".\t"); /* Score. */ fprintf(f, "%c\t", strand); fprintf(f, "%c\t", phase); fprintf(f, "gene_id \"%s\"; ", (isNotEmpty(geneName)) ? geneName : name); fprintf(f, "transcript_id \"%s\"; ", name); if (exonIx >= 0) { + if (strand == '-') + { + fprintf(f, "exon_number \"%d\"; ", exonCount-exonIx); + fprintf(f, "exon_id \"%s.%d\";", name, exonCount-exonIx); + } + else + { fprintf(f, "exon_number \"%d\"; ", exonIx+1); fprintf(f, "exon_id \"%s.%d\";", name, exonIx+1); } + } if (isNotEmpty(geneName)) fprintf(f, " gene_name \"%s\";", geneName); fprintf(f, "\n"); } static boolean inExon(struct genePred *gp, int iExon, int pos) /* determine if pos is in the specified exon */ { return ((gp->exonStarts[iExon] <= pos) && (pos <= gp->exonEnds[iExon])); } static int movePos(struct genePred *gp, int pos, int dist) /* Move a position in an exon by dist, which is positive to move forward, and * negative to move backwards. Introns are skipped. Error if can't move * distance and stay in exon. @@ -179,40 +187,40 @@ int start1, end1; /* first part of codon */ int iExon1; /* exon containing first part */ int start2, end2; /* second part of codon if spliced, or 0,0 */ int iExon2; /* exon containing second part */ }; static struct codonCoords zeroCodonCoords = {0, 0, 0, 0, 0, 0, 0, 0}; static boolean codonComplete(struct codonCoords *codon) /* are all bases of codon defined? */ { return (((codon->end1-codon->start1)+(codon->end2-codon->start2)) == 3); } static void writeCodon(FILE *f, char *source, char *name, char *geneName, char *chrom, char strand, char *type, - struct codonCoords *codon) + struct codonCoords *codon, int exonCount) /* write the location of a codon to the GTF, if it is non-zero and complete */ { writeGtfLine(f, source, name, geneName, chrom, strand, type, - codon->start1, codon->end1, codon->iExon1, 0); + codon->start1, codon->end1, codon->iExon1, 0, exonCount); if (codon->start2 < codon->end2) writeGtfLine(f, source, name, geneName, chrom, strand, type, codon->start2, codon->end2, codon->iExon2, - (codon->end1-codon->start1)); + (codon->end1-codon->start1), exonCount); } static struct codonCoords findFirstCodon(struct genePred *gp, int *frames) /* get the coordinates of the first codon (start or stop). It must be in * correct frame, or zero is returned. */ { if (honorCdsStat && (gp->optFields & genePredCdsStatFld) && (gp->cdsStartStat != cdsComplete)) return zeroCodonCoords; // not flagged as complete // find first CDS exon int iExon, cdsStart = 0, cdsEnd = 0; for (iExon = 0; iExon < gp->exonCount; iExon++) { if (genePredCdsExon(gp, iExon, &cdsStart, &cdsEnd)) @@ -322,45 +330,45 @@ } static void writeFeatures(struct genePred *gp, int i, char *source, char *name, char *chrom, char strand, char *geneName, int firstUtrEnd, int cdsStart, int cdsEnd, int lastUtrStart, int frame, FILE *f) /* write exons CDS/UTR features */ { int exonStart = gp->exonStarts[i]; int exonEnd = gp->exonEnds[i]; if (utr && (exonStart < firstUtrEnd)) { int end = min(exonEnd, firstUtrEnd); writeGtfLine(f, source, name, geneName, chrom, strand, ((strand == '+') ? "5UTR" : "3UTR"), - exonStart, end, i, -1); + exonStart, end, i, -1, gp->exonCount); } if ((cdsStart < exonEnd) && (cdsEnd > exonStart)) { int start = max(exonStart, cdsStart); int end = min(exonEnd, cdsEnd); writeGtfLine(f, source, name, geneName, chrom, strand, "CDS", - start, end, i, frame); + start, end, i, frame, gp->exonCount); } if (utr && (exonEnd > lastUtrStart)) { int start = max(lastUtrStart, exonStart); writeGtfLine(f, source, name, geneName, chrom, strand, ((strand == '+') ? "3UTR" : "5UTR"), - start, exonEnd, i, -1); + start, exonEnd, i, -1, gp->exonCount); } } void genePredWriteToGtf(struct genePred *gp, char *source, struct hash *dupeHash, FILE *f) /* Write out genePredName to GTF file. */ { int i; char *name = findUniqueName(dupeHash, gp->name); char *geneName = gp->name2; char *chrom = gp->chrom; char strand = gp->strand[0]; int *frames = (gp->optFields & genePredExonFramesFld) ? gp->exonFrames : calcFrames(gp); struct codonCoords firstCodon = findFirstCodon(gp, frames); struct codonCoords lastCodon = findLastCodon(gp, frames); @@ -368,57 +376,57 @@ // figure out bounds of CDS and UTR regions, moving stop codon to outside of // CDS. int firstUtrEnd = gp->cdsStart, lastUtrStart = gp->cdsEnd; int cdsStart = gp->cdsStart, cdsEnd = gp->cdsEnd; if ((strand == '+') && codonComplete(&lastCodon)) cdsEnd = movePos(gp, lastUtrStart, -3); if ((strand == '-') && codonComplete(&firstCodon)) cdsStart = movePos(gp, cdsStart, 3); if (addComments) fprintf(f, "###\n# %s %s:%d-%d (%s) CDS: %d-%d\n#\n", gp->name, gp->chrom, gp->txStart, gp->txEnd, gp->strand, gp->cdsStart, gp->cdsEnd); writeGtfLine(f, source, name, geneName, chrom, strand, "transcript", - gp->txStart, gp->txEnd, -1, -1); + gp->txStart, gp->txEnd, -1, -1, gp->exonCount); for (i=0; iexonCount; ++i) { writeGtfLine(f, source, name, geneName, chrom, strand, "exon", - gp->exonStarts[i], gp->exonEnds[i], i, -1); + gp->exonStarts[i], gp->exonEnds[i], i, -1, gp->exonCount); if (cdsStart < cdsEnd) writeFeatures(gp, i, source, name, chrom, strand, geneName, firstUtrEnd, cdsStart, cdsEnd, lastUtrStart, frames[i], f); } if (gp->strand[0] == '+') { if (codonComplete(&firstCodon)) writeCodon(f, source, name, geneName, chrom, strand, "start_codon", - &firstCodon); + &firstCodon, gp->exonCount); if (codonComplete(&lastCodon)) writeCodon(f, source, name, geneName, chrom, strand, "stop_codon", - &lastCodon); + &lastCodon, gp->exonCount); } else { if (codonComplete(&lastCodon)) writeCodon(f, source, name, geneName, chrom, strand, "start_codon", - &lastCodon); + &lastCodon, gp->exonCount); if (codonComplete(&firstCodon)) writeCodon(f, source, name, geneName, chrom, strand, "stop_codon", - &firstCodon); + &firstCodon, gp->exonCount); } if (!(gp->optFields & genePredExonFramesFld)) freeMem(frames); } void genePredToGtf(char *database, char *table, char *gtfOut) /* genePredToGtf - Convert genePredName table or file to gtf.. */ { FILE *f = mustOpen(gtfOut, "w"); struct hash *dupeHash = newHash(16); struct genePred *gpList = NULL, *gp = NULL; if (sameString(database, "file")) { gpList = genePredReaderLoadFile(table, NULL);