677bb75aa7c1cbe71111e1a0ef1462f8277b6c9d max Wed Nov 22 10:20:25 2023 -0800 rescuing YPs back into curated refseq track, refs #32667 diff --git src/hg/utils/gff3ToGenePred/gff3ToGenePred.c src/hg/utils/gff3ToGenePred/gff3ToGenePred.c index ee91335..1c2bac0 100644 --- src/hg/utils/gff3ToGenePred/gff3ToGenePred.c +++ src/hg/utils/gff3ToGenePred/gff3ToGenePred.c @@ -256,48 +256,53 @@ /* set cds start/stop status based on start/stop codon annotation */ { if (gp->strand[0] == '+') { gp->cdsStartStat = haveChildFeature(mrna, gff3FeatStartCodon) ? cdsComplete : cdsIncomplete; gp->cdsEndStat = haveChildFeature(mrna, gff3FeatStopCodon) ? cdsComplete : cdsIncomplete; } else { gp->cdsStartStat = haveChildFeature(mrna, gff3FeatStopCodon) ? cdsComplete : cdsIncomplete; gp->cdsEndStat = haveChildFeature(mrna, gff3FeatStartCodon) ? cdsComplete : cdsIncomplete; } } static boolean isGeneWithCdsChildCase(struct gff3Ann* mrna) -/* is this one of the refseq gene with a direct CDS child? */ +/* is this a refseq gene or mRNA with a direct CDS child? */ { -return sameString(mrna->type, gff3FeatGene) && (mrna->children != NULL) - && (sameString(mrna->children->ann->type, gff3FeatCDS)); +// NCBI changed the format for YPs around 2020 or so, now they're mrna > exon > CDS, originally Gene > CDS +// I don't know why the exon level is missing at this point in the code, but it is. +return ((sameString(mrna->type, gff3FeatGene) || sameString(mrna->type, gff3FeatMRna)) && (mrna->children != NULL) + && (sameString(mrna->children->ann->type, gff3FeatCDS))); } static char* refSeqHacksFindName(struct gff3Ann* mrna) /* return the value to use for the genePred name field under refSeqHacks * rules. 
*/ { -// if this is a gene with CDS child, the get the id out of the CDS if it looks like +// if this is a gene with CDS child or an mRNA > exon > CDS, then get the id out of the CDS if it looks like // a refseq accession if (isGeneWithCdsChildCase(mrna)) { // is name something like YP_203370.1 (don't try too hard) struct gff3Ann *cds = mrna->children->ann; + if (!sameString(cds->type, gff3FeatCDS)) + cds = mrna->children->ann->children->ann; // post-2018-format + // Also checking now for 'Y' as a prefix, as otherwise this would apply to all normal transcripts if ((cds->name != NULL) && (strlen(cds->name) > 4) && isupper(cds->name[0]) && isupper(cds->name[1]) - && (cds->name[2] == '_') && isdigit(cds->name[3])) + && (cds->name[2] == '_') && isdigit(cds->name[3]) && cds->name[0] == 'Y') return cds->name; } return NULL; } static char* getRnaName(struct gff3Ann* mrna) /* return the value to use for the genePred name field */ { char *name = NULL; if (rnaNameAttr != NULL) { struct gff3Attr *attr = gff3AnnFindAttr(mrna, rnaNameAttr); if (attr != NULL) name = attr->vals->name; }