f6a4d0f9523d06a6aa46e4ee4d96b37a4091d4a5 hiram Thu Sep 30 18:20:31 2021 -0700 correctly filter on chromSizes to eliminate out of range gene definitions no redmine diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index c459827..431f021 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -317,63 +317,71 @@ my $localLiftFile = "$downloadDir/${asmId}To${db}.lift"; if (! -s "$localLiftFile") { $localLiftFile = "../download/${asmId}To${db}.lift" if (-s "../download/${asmId}To${db}.lift"); } $localLiftFile = "" if (! -s "$localLiftFile"); $localLiftFile = $liftFile if (-s $liftFile); my $pslTargetSizes = "-db=\$db"; my $fakePslSizes = ""; if (-s "$target2bit") { $pslTargetSizes = "-targetSizes=\$db.chrom.sizes"; $fakePslSizes = "-chromSize=\$db.chrom.sizes"; } my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit"; $dbTwoBit = $target2bit if (-s "$target2bit"); + my $localSizes = "$HgAutomate::clusterData/$db/chrom.sizes"; $bossScript->add(<<_EOF_ # establish all variables to use here export asmId=$asmId export downloadDir=$downloadDir export ncbiGffGz=\$downloadDir/\${asmId}_genomic.gff.gz export db=$db +export chromSizes="$localSizes" export gff3ToRefLink=$gff3ToRefLink export gbffToCds=$gbffToCds export dateStamp=`date "+%F"` +if [ -s "../../../\$asmId.chrom.sizes" ]; then + chromSizes="../../../\$asmId.chrom.sizes" +fi +if [ -s "../download/\$asmId.ncbi.chrom.sizes" ]; then + chromSizes="../download/\$asmId.ncbi.chrom.sizes" +fi export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //; s/ Updated Annotation Release//;'` if [ "\$annotationRelease" == "" ]; then export annotationRelease=\$asmId fi export versionDate=`ls -L --full-time \$ncbiGffGz | awk '{print \$6;}'` echo "\$annotationRelease (\$versionDate)" > ncbiRefSeqVersion.txt # this produces the genePred in NCBI coordinates # 8/23/17: gff3ToGenePred quits over illegal attribute SO_type... make it legal (so_type): if [ -s ../../../download/\${asmId}.remove.dups.list ]; then zcat \$ncbiGffGz | grep -v -f ../../../download/\${asmId}.remove.dups.list \\ | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin stdout \\ - | genePredFilter -chromSizes=../../../\$asmId.chrom.sizes stdin \$asmId.gp + | genePredFilter -chromSizes=\$chromSizes stdin \$asmId.gp else zcat \$ncbiGffGz \\ | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin stdout \\ - | genePredFilter -chromSizes=../../../\$asmId.chrom.sizes stdin \$asmId.gp + | genePredFilter -chromSizes=\$chromSizes stdin \$asmId.gp fi genePredCheck \$asmId.gp rm -f \$asmId.refseqSelectTranscripts.txt zegrep 'tag=(RefSeq|MANE) Select' \$ncbiGffGz > before.cut9.txt || true if [ -s before.cut9.txt ]; then cut -f9- before.cut9.txt | tr ';' '\\n' \\ | grep 'Name=' | grep -v NP_ | cut -d= -f2 | sort -u \\ > \$asmId.refseqSelectTranscripts.txt fi rm -f before.cut9.txt # extract labels from semi-structured text in gbff COMMENT/description sections: zcat \$downloadDir/\${asmId}_rna.gbff.gz \\