be2fb5e3a2af8d3e1f73c1cc5c6bd5ff05bad8a5 hiram Wed Apr 8 07:52:51 2020 -0700 handle the problem of duplicate contigs and make all download files including archive refs #24547 diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index 06758c7..177d402 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -231,34 +231,36 @@ faToTwoBit \${asmId}_genomic.fna.gz \${asmId}.ncbi.2bit _EOF_ ); } my $haveLiftFile = 0; if ( -s "$liftFile" ) { $haveLiftFile = 1; } elsif ($dbExists) { # check if db.ucscToRefSeq exists? my $sql = "show tables like 'ucscToRefSeq';"; my $result = `hgsql $db -BN -e "$sql"`; chomp $result; if ( $result =~ m/ucscToRefSeq/ ) { $haveLiftFile = 1; +### the sort -k2,2 -u eliminates the problem of duplicate sequences existing +### in the target $db assembly $bossScript->add(<<_EOF_ cd \$runDir hgsql -N -e 'select 0,name,chromEnd,chrom,chromEnd from ucscToRefSeq;' \${db} \\ - > \${asmId}To\${db}.lift + | sort -k2,2 -u > \${asmId}To\${db}.lift _EOF_ ); } } my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit"; $dbTwoBit = $target2bit if (-s "$target2bit"); if (! $haveLiftFile ) { $bossScript->add(<<_EOF_ # generate idKeys for the NCBI sequence to translate names to UCSC equivalents rm -rf \$runDir/idKeys mkdir -p \$runDir/idKeys cd \$runDir/idKeys @@ -319,30 +321,31 @@ $fakePslSizes = "-chromSize=\$db.chrom.sizes"; } my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit"; $dbTwoBit = $target2bit if (-s "$target2bit"); $bossScript->add(<<_EOF_ # establish all variables to use here export asmId=$asmId export downloadDir=$downloadDir export ncbiGffGz=\$downloadDir/\${asmId}_genomic.gff.gz export db=$db export gff3ToRefLink=$gff3ToRefLink export gbffToCds=$gbffToCds +export dateStamp=`date "+%F"` export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //'` if [ "\$annotationRelease" == "" ]; then export annotationRelease=\$asmId fi export versionDate=`ls -L --full-time \$ncbiGffGz | awk '{print \$6;}'` echo "\$annotationRelease (\$versionDate)" > ncbiRefSeqVersion.txt # this produces the genePred in NCBI coordinates # 8/23/17: gff3ToGenePred quits over illegal attribute SO_type... make it legal (so_type): zcat \$ncbiGffGz \\ | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp genePredCheck \$asmId.gp @@ -359,30 +362,34 @@ # extract labels from semi-structured text in gbff COMMENT/description sections: zcat \$downloadDir/\${asmId}_rna.gbff.gz \\ | (grep ' :: ' || true) \\ | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \\ | sort -u \\ > pragmaLabels.txt # extract cross reference text for refLink \$gff3ToRefLink \$downloadDir/\$asmId.raFile.txt \$ncbiGffGz pragmaLabels.txt 2> \$db.refLink.stderr.txt \\ | sort > \$asmId.refLink.tab # converting the NCBI coordinates to UCSC coordinates liftUp -extGenePred -type=.gp stdout $localLiftFile warn \$asmId.gp \\ | gzip -c > \$asmId.\$db.gp.gz $genePredCheckDb \$asmId.\$db.gp.gz +zcat \$asmId.\$db.gp.gz > ncbiRefSeq.\$dateStamp +genePredToGtf -utr file ncbiRefSeq.\$dateStamp stdout | gzip -c \\ + > ../\$db.\$dateStamp.ncbiRefSeq.gtf.gz +rm -f ncbiRefSeq.\$dateStamp # curated subset of all genes (zegrep "^[NY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.curated.gp # may not be any curated genes if [ ! -s \$db.curated.gp ]; then rm -f \$db.curated.gp elif [ -s \$asmId.refseqSelectTranscripts.txt ]; then cat \$db.curated.gp | fgrep -f \$asmId.refseqSelectTranscripts.txt - \\ > \$db.refseqSelect.curated.gp # may not be any refseqSelect.curated genes if [ ! -s \$db.refseqSelect.curated.gp ]; then rm -f \$db.refseqSelect.curated.gp fi fi @@ -720,30 +727,38 @@ pslCat -nohead process/\$asmId.\$db.psl.gz \\ | grep -Fwf \$db.noRna.available.list \\ | grep chrM > missingChrMFa.psl if [ -s missingChrMFa.psl ]; then pslToBed missingChrMFa.psl stdout \\ | twoBitToFa -bed=stdin $dbTwoBit stdout >> \$db.rna.fa fi fi mkdir -p $gbdbDir ln -f -s `pwd`/\$db.rna.fa $gbdbDir/seqNcbiRefSeq.rna.fa hgLoadSeq -drop -seqTbl=seqNcbiRefSeq -extFileTbl=extNcbiRefSeq \$db $gbdbDir/seqNcbiRefSeq.rna.fa hgLoadPsl \$db -table=ncbiRefSeqPsl process/\$asmId.\$db.psl.gz +if [ -d "/usr/local/apache/htdocs-hgdownload/goldenPath/archive" ]; then + gtfFile=`ls \$db.*.ncbiRefSeq.gtf.gz` + mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq + rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq/\$gtfFile + ln -s `pwd`/\$db.*.ncbiRefSeq.gtf.gz \\ + /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq/ +fi + featureBits \$db ncbiRefSeq > fb.ncbiRefSeq.\$db.txt 2>&1 cat fb.ncbiRefSeq.\$db.txt 2>&1 _EOF_ ); } # if ($dbExists) $bossScript->execute(); } # doLoad ######################################################################### # * step: cleanup [fileServer] sub doCleanup { my $runDir = "$buildDir"; my $whatItDoes = "It cleans up or compresses intermediate files."; my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer, $runDir, $whatItDoes);