0362494184981193a406895a488aee5e92ece803 markd Tue Feb 7 14:24:25 2023 -0800 support for converting HPRC GFF3 files from Ensembl. Not perfect; however, all genes get converted, so this can be fixed in post-processing diff --git src/hg/utils/gff3ToGenePred/tests/makefile src/hg/utils/gff3ToGenePred/tests/makefile index 30b9b15..c31607c 100644 --- src/hg/utils/gff3ToGenePred/tests/makefile +++ src/hg/utils/gff3ToGenePred/tests/makefile @@ -1,134 +1,138 @@ kentSrc = ../../../.. include ../../../../inc/common.mk gff3ToGenePred = ${DESTBINDIR}/gff3ToGenePred # sh commands to deal with the case where gff3ToGenePred should have failed and didn't cmdShouldFail = echo "Error: command should have failed" >&2; false all:: test:: geneMRnaTest noGeneMRnaTest geneMRnaHonorTest discontinuousTest multCdsOutOfExonTest \ noIdTest errCases1Test bogusQuotesTest noExonsTest geneTranscriptTest transcriptCdsParentTest \ minimalGenesTest geneDefaultStatusUnknownTest useNameTest nameAttrIdTest nameAttrNameTest \ frameShiftTest mm10GencodeTest ncbiSegmentsTest ncbiProblemsTest makeBadTest \ - transcriptOnlyTest + transcriptOnlyTest hprcTest geneMRnaTest: mkout ${gff3ToGenePred} input/geneMRna.gff3 output/$@.gp diff expected/$@.gp output/$@.gp noGeneMRnaTest: mkout ${gff3ToGenePred} input/noGeneMRna.gff3 output/$@.gp diff expected/$@.gp output/$@.gp geneMRnaHonorTest: mkout ${gff3ToGenePred} -honorStartStopCodons input/geneMRna.gff3 output/$@.gp diff expected/$@.gp output/$@.gp geneDefaultStatusUnknownTest: mkout ${gff3ToGenePred} -defaultCdsStatusToUnknown input/geneMRna.gff3 output/$@.gp diff expected/$@.gp output/$@.gp discontinuousTest: mkout ${gff3ToGenePred} input/discontinuous.gff3 output/$@.gp diff expected/$@.gp output/$@.gp # was only reporting the first error multCdsOutOfExonTest: mkout if ! 
${gff3ToGenePred} input/multCdsOutOfExon.gff3 /dev/null >output/$@.out 2>&1 ; then true ; else ${cmdShouldFail} ; fi diff expected/$@.out output/$@.out # some records without ID or Parent attrs noIdTest: mkout ${gff3ToGenePred} input/noId.gff3 output/$@.gp diff expected/$@.gp output/$@.gp # error cases that should be handled errCases1Test: mkout if ! ${gff3ToGenePred} input/errCases1.gff3 /dev/null >output/$@.out 2>&1 ; then true ; else ${cmdShouldFail} ; fi diff expected/$@.out output/$@.out # error cases involving quotes bogusQuotesTest: mkout if ! ${gff3ToGenePred} input/bogusQuotes.gff3 /dev/null >output/$@.out 2>&1 ; then true ; else ${cmdShouldFail} ; fi diff expected/$@.out output/$@.out noExonsTest: mkout ${gff3ToGenePred} input/noExons.gff3 output/$@.gp diff expected/$@.gp output/$@.gp # from PlasmoDB: ncRNAs have gene->transcript->exon geneTranscriptTest: mkout ${gff3ToGenePred} input/geneTranscript.gff3 output/$@.gp diff expected/$@.gp output/$@.gp # it also appears that gene->transcript->exon->cds is valid transcriptCdsParentTest: mkout ${gff3ToGenePred} input/transcriptCdsParent.gff3 output/$@.gp diff expected/$@.gp output/$@.gp # pseudogene annotations tie exons directly to genes and have standalone genes minimalGenesTest: mkout ${gff3ToGenePred} -allowMinimalGenes -unprocessedRootsOut=output/$@.unprocessed input/ncbiRefSeq.pseudoGenes.gff3 output/$@.gp diff expected/$@.gp output/$@.gp diff expected/$@.unprocessed output/$@.unprocessed # set name/name2 using -useName useNameTest: mkout ${gff3ToGenePred} -attrsOut=output/$@.attrs -useName input/transcriptCdsParent.gff3 output/$@.gp diff expected/$@.gp output/$@.gp diff expected/$@.attrs output/$@.attrs nameAttrIdTest: mkout ${gff3ToGenePred} -rnaNameAttr=transcript_id -geneNameAttr=gene_id input/transcriptCdsParent.gff3 output/$@.gp diff expected/$@.gp output/$@.gp nameAttrNameTest: mkout ${gff3ToGenePred} -attrsOut=output/$@.attrs -rnaNameAttr=transcript_name -geneNameAttr=gene_name 
input/transcriptCdsParent.gff3 output/$@.gp diff expected/$@.gp output/$@.gp diff expected/$@.attrs output/$@.attrs frameShiftTest: mkout ${gff3ToGenePred} input/frameShifts.gff3 output/$@.gp diff expected/$@.gp output/$@.gp # gencode conversion mm10GencodeTest: mkout ${gff3ToGenePred} -rnaNameAttr=transcript_id -geneNameAttr=gene_id -honorStartStopCodons -attrsOut=output/$@.attrs input/mm10Gencode.gff3 output/$@.gp diff expected/$@.gp output/$@.gp diff expected/$@.attrs output/$@.attrs # NCBI [CDJV]_gene_segment annotations ncbiSegmentsTest: mkout ${gff3ToGenePred} -rnaNameAttr=transcript_id -geneNameAttr=gene -honorStartStopCodons -attrsOut=output/$@.attrs input/ncbiSegments.gff3 output/$@.gp > output/$@.out 2>&1 diff expected/$@.gp output/$@.gp diff expected/$@.attrs output/$@.attrs diff expected/$@.out output/$@.out # various NCBI problem cases ncbiProblemsTest: mkout ${gff3ToGenePred} -warnAndContinue -geneNameAttr=gene -refseqHacks -attrsOut=output/$@.attrs input/ncbiProblems.gff3 output/$@.gp > output/$@.out 2>&1 diff expected/$@.gp output/$@.gp diff expected/$@.attrs output/$@.attrs diff expected/$@.out output/$@.out # -warnAndContinue on maker file with missing gene/mrna records. This tests the option on # GFF3 parsing, not just genePred conversion makeBadTest: mkout ${gff3ToGenePred} -warnAndContinue -attrsOut=output/$@.attrs input/makerBad.gff3 output/$@.gp > output/$@.out 2>&1 diff expected/$@.gp output/$@.gp diff expected/$@.attrs output/$@.attrs diff expected/$@.out output/$@.out # pseudogene annotations as transcripts-only. Yet another non-standard variation transcriptOnlyTest: mkout ${gff3ToGenePred} -allowMinimalGenes input/transcriptOnly.gff3 output/$@.gp diff expected/$@.gp output/$@.gp +# check features used by Ensembl HPRC annotations +hprcTest: mkout + ${gff3ToGenePred} -geneNameAttr=Name -rnaNameAttr=transcript_id input/hprc.gff3 output/$@.gp + diff expected/$@.gp output/$@.gp mkout: @mkdir -p output clean:: rm -rf output