b4f6c6c609d1e6ff8f09c2f42795b1df05f31452 markd Wed Jan 24 20:42:01 2024 -0800 try harder to find a more useful name for GFF3 records that don't follow the most common patterns. These are things like RefSeq IG segments and tRNAs that NHGRI had problems with diff --git src/hg/utils/gff3ToGenePred/gff3ToGenePred.c src/hg/utils/gff3ToGenePred/gff3ToGenePred.c index 1c2bac0..3e994cd 100644 --- src/hg/utils/gff3ToGenePred/gff3ToGenePred.c +++ src/hg/utils/gff3ToGenePred/gff3ToGenePred.c @@ -9,31 +9,31 @@ #include "options.h" #include "gff3.h" #include "genePred.h" #define LEAK_CHECK 0 // set to 1 to free all memory void usage() /* Explain usage and exit. */ { errAbort( "gff3ToGenePred - convert a GFF3 file to a genePred file\n" "usage:\n" " gff3ToGenePred inGff3 outGp\n" "options:\n" " -warnAndContinue - on bad genePreds being created, put out warning but continue\n" - " -useName - rather than using 'id' as name, use the 'name' tag\n" + " -useName - use the 'name' tag as the name, if present\n" " -rnaNameAttr=attr - If this attribute exists on an RNA record, use it as the genePred\n" " name column\n" " -geneNameAttr=attr - If this attribute exists on a gene record, use it as the genePred\n" " name2 column\n" " -attrsOut=file - output attributes of mRNA record to file. These are per-genePred row,\n" " not per-GFF3 record. Thery are derived from GFF3 attributes, not the attributes themselves.\n" " -processAllGeneChildren - output genePred for all children of a gene regardless of feature\n" " -unprocessedRootsOut=file - output GFF3 root records that were not used. This will not be a\n" " valid GFF3 file. It's expected that many non-root records will not be used and they are not\n" " reported.\n" " -bad=file - output genepreds that fail checks to file\n" " -maxParseErrors=50 - Maximum number of parsing errors before aborting. A negative\n" " value will allow an unlimited number of errors. 
Default is 50.\n" " -maxConvertErrors=50 - Maximum number of conversion errors before aborting. A negative\n" " value will allow an unlimited number of errors. Default is 50.\n" @@ -284,46 +284,65 @@ // a refseq accession if (isGeneWithCdsChildCase(mrna)) { // is name something like YP_203370.1 (don't try too hard) struct gff3Ann *cds = mrna->children->ann; if (!sameString(cds->type, gff3FeatCDS)) cds = mrna->children->ann->children->ann; // post-2018-format // Also checking now for 'Y' as a prefix, as otherwise this would apply to all normal transcripts if ((cds->name != NULL) && (strlen(cds->name) > 4) && isupper(cds->name[0]) && isupper(cds->name[1]) && (cds->name[2] == '_') && isdigit(cds->name[3]) && cds->name[0] == 'Y') return cds->name; } return NULL; } +static char* getAttrVal(struct gff3Ann* ann, char *name) +/* return the single value for name or NULL */ +{ +struct gff3Attr *attr = gff3AnnFindAttr(ann, name); +if (attr != NULL) + return attr->vals->name; +else + return NULL; +} + static char* getRnaName(struct gff3Ann* mrna) /* return the value to use for the genePred name field */ { char *name = NULL; if (rnaNameAttr != NULL) - { - struct gff3Attr *attr = gff3AnnFindAttr(mrna, rnaNameAttr); - if (attr != NULL) - name = attr->vals->name; - } + name = getAttrVal(mrna, rnaNameAttr); if (isEmpty(name) && refseqHacks) name = refSeqHacksFindName(mrna); +if (isEmpty(name) && useName) + name = mrna->name; +// try other possible fields +if (isEmpty(name)) + name = getAttrVal(mrna, "transcript_id"); +if (isEmpty(name)) + name = getAttrVal(mrna, "transcript_name"); +if (isEmpty(name)) + name = getAttrVal(mrna, "Name"); +if (isEmpty(name)) + name = getAttrVal(mrna, "standard_name"); // RefSeq use this +if (isEmpty(name)) + name = getAttrVal(mrna, "gene"); // also for RefSeq when no transcript name if (isEmpty(name)) - name = (useName ? 
mrna->name : mrna->id); + name = getAttrVal(mrna, "gene_name"); if (isEmpty(name)) - name = mrna->id; + name = mrna->id; // desperation return name; } static char* getGeneName(struct gff3Ann* gene) /* return the value to use for the genePred name2 field, * or NULL if can't be defined. */ { char *name2 = NULL; if (geneNameAttr != NULL) { struct gff3Attr *attr = gff3AnnFindAttr(gene, geneNameAttr); if (attr != NULL) name2 = attr->vals->name; } if (isEmpty(name2) && useName)