653eaa635409806f4885b0486125efd4390c0931 hiram Thu Aug 27 14:00:07 2020 -0700 fix scripts to behave properly when not using UCSC sequence names refs #23734 diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index 177d402..702fa7d 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -1,47 +1,42 @@ #!/usr/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl instead. -# HOW TO USE THIS TEMPLATE: -# 1. Global-replace doNcbiRefSeq.pl with your actual script name. -# 2. Search for template and replace each instance with something appropriate. -# Add steps and subroutines as needed. Other do*.pl or make*.pl may have -# useful example code -- this is just a skeleton. - use Getopt::Long; use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; my $doIdKeys = "$Bin/doIdKeys.pl"; my $gff3ToRefLink = "$Bin/gff3ToRefLink.pl"; my $gbffToCds = "$Bin/gbffToCds.pl"; my $ncbiRefSeqOtherIxIxx = "$Bin/ncbiRefSeqOtherIxIxx.pl"; my $ncbiRefSeqOtherAttrs = "$Bin/ncbiRefSeqOtherAttrs.pl"; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_liftFile + $opt_assemblyHub $opt_target2bit $opt_toGpWarnOnly /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'process', func => \&doProcess }, { name => 'load', func => \&doLoad }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; @@ -61,30 +56,32 @@ # Basic help (for incorrect usage): print STDERR " usage: $base [options] asmId db required arguments: asmId - assembly identifier at NCBI FTP site, examples: - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc.. db - database to load with track tables options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Use dir instead of default $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ncbiRefSeq.\$date (necessary when continuing at a later date). + -assemblyHub the processing is taking place in an assembly hub + build -toGpWarnOnly add -warnAndContinue to the gff3ToGenePred operation to avoid gene definitions that will not convert -liftFile pathName a lift file to translate NCBI names to local genome names -target2bit pathName when not a local UCSC genome, use this 2bit file for the target genome sequence _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $defaultWorkhorse, 'fileServer' => $defaultFileServer, 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates UCSC's ncbiRefSeq track build. Steps: @@ -105,39 +102,40 @@ 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for database/assembly \$db. 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from \$db.2bit. NOTE: Override these assumptions with the -target2Bit option " if ($detailed); print "\n"; exit $status; } # Globals: # Command line args: asmId db my ($asmId, $db); # Other: -my ($ftpDir, $buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit); +my ($ftpDir, $buildDir, $assemblyHub, $toGpWarnOnly, $dbExists, $liftFile, $target2bit); my ($secondsStart, $secondsEnd); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'liftFile=s', 'target2bit=s', + 'assemblyHub', 'toGpWarnOnly', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); $workhorse = $opt_workhorse if ($opt_workhorse); $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub); $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub); $fileServer = $opt_fileServer if ($opt_fileServer); } @@ -246,50 +244,54 @@ ### the sort -k2,2 -u eliminates the problem of duplicate sequences existing ### in the target $db assembly $bossScript->add(<<_EOF_ cd \$runDir hgsql -N -e 'select 0,name,chromEnd,chrom,chromEnd from ucscToRefSeq;' \${db} \\ | sort -k2,2 -u > \${asmId}To\${db}.lift _EOF_ ); } } my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit"; $dbTwoBit = $target2bit if (-s "$target2bit"); if (! $haveLiftFile ) { + # if this is an assembly hub and it did NOT supply a lift file, + # then we don't need one, do not attempt to build + if (! $opt_assemblyHub) { $bossScript->add(<<_EOF_ # generate idKeys for the NCBI sequence to translate names to UCSC equivalents rm -rf \$runDir/idKeys mkdir -p \$runDir/idKeys cd \$runDir/idKeys time ($doIdKeys -buildDir=\$runDir/idKeys -twoBit=\$runDir/\$asmId.ncbi.2bit \$db) > idKeys.log 2>&1 cd \$runDir twoBitInfo $dbTwoBit stdout | sort -k2,2nr > \$db.chrom.sizes ln -f -s idKeys/\$db.idKeys.txt ./ncbi.\$asmId.idKeys.txt ln -f -s /hive/data/genomes/\$db/bed/idKeys/\$db.idKeys.txt ./ucsc.\$db.idKeys.txt # joining the idKeys establishes a lift file to translate chrom names join -t\$'\\t' ucsc.\$db.idKeys.txt ncbi.\$asmId.idKeys.txt \\ | cut -f2-3 | sort \\ | join -t\$'\\t' - <(sort \$db.chrom.sizes) \\ | awk -F\$'\\t' '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$2, \$3, \$1, \$3}' \\ | sort -k3nr > \${asmId}To\${db}.lift _EOF_ ); } + } $bossScript->add(<<_EOF_ cd \$runDir twoBitInfo \$asmId.ncbi.2bit stdout | sort -k2nr > \$asmId.ncbi.chrom.sizes zcat \${asmId}_rna.fna.gz | sed -e 's/ .*//;' | gzip -c > \$asmId.rna.fa.gz faSize -detailed \${asmId}.rna.fa.gz | sort -k2nr > rna.sizes # genbank processor extracts infomation about the RNAs /hive/data/outside/genbank/bin/x86_64/gbProcess \\ -inclXMs /dev/null \$asmId.raFile.txt \${asmId}_rna.gbff.gz _EOF_ ); $bossScript->execute(); } # doDownload @@ -301,30 +303,31 @@ &HgAutomate::mustMkdir($runDir); my $whatItDoes = "process NCBI download files into track files."; # Use dbHost since genePredCheck -db=$db needs database my $bossScript = newBash HgRemoteScript("$runDir/doProcess.bash", $dbHost, $runDir, $whatItDoes); my $genePredCheckDb = "genePredCheck -db=\$db"; if (! $dbExists) { $genePredCheckDb = "genePredCheck"; } my $warnOnly = ""; $warnOnly = "-warnAndContinue" if ($toGpWarnOnly); my $localLiftFile = "\$downloadDir/\${asmId}To\${db}.lift"; + $localLiftFile = "" if (! -s "$localLiftFile"); $localLiftFile = $liftFile if (-s $liftFile); my $pslTargetSizes = "-db=\$db"; my $fakePslSizes = ""; if (-s "$target2bit") { $pslTargetSizes = "-targetSizes=\$db.chrom.sizes"; $fakePslSizes = "-chromSize=\$db.chrom.sizes"; } my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit"; $dbTwoBit = $target2bit if (-s "$target2bit"); $bossScript->add(<<_EOF_ # establish all variables to use here export asmId=$asmId @@ -358,33 +361,47 @@ > \$asmId.refseqSelectTranscripts.txt fi rm -f before.cut9.txt # extract labels from semi-structured text in gbff COMMENT/description sections: zcat \$downloadDir/\${asmId}_rna.gbff.gz \\ | (grep ' :: ' || true) \\ | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \\ | sort -u \\ > pragmaLabels.txt # extract cross reference text for refLink \$gff3ToRefLink \$downloadDir/\$asmId.raFile.txt \$ncbiGffGz pragmaLabels.txt 2> \$db.refLink.stderr.txt \\ | sort > \$asmId.refLink.tab +_EOF_ + ); + if ( -s "$localLiftFile") { + $bossScript->add(<<_EOF_ # converting the NCBI coordinates to UCSC coordinates liftUp -extGenePred -type=.gp stdout $localLiftFile warn \$asmId.gp \\ | gzip -c > \$asmId.\$db.gp.gz +_EOF_ + ); + } else { + $bossScript->add(<<_EOF_ +# no lifting necessary +cat \$asmId.gp | gzip -c > \$asmId.\$db.gp.gz +_EOF_ + ); + } + $bossScript->add(<<_EOF_ $genePredCheckDb \$asmId.\$db.gp.gz zcat \$asmId.\$db.gp.gz > ncbiRefSeq.\$dateStamp genePredToGtf -utr file ncbiRefSeq.\$dateStamp stdout | gzip -c \\ > ../\$db.\$dateStamp.ncbiRefSeq.gtf.gz rm -f ncbiRefSeq.\$dateStamp # curated subset of all genes (zegrep "^[NY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.curated.gp # may not be any curated genes if [ ! -s \$db.curated.gp ]; then rm -f \$db.curated.gp elif [ -s \$asmId.refseqSelectTranscripts.txt ]; then cat \$db.curated.gp | fgrep -f \$asmId.refseqSelectTranscripts.txt - \\ > \$db.refseqSelect.curated.gp # may not be any refseqSelect.curated genes @@ -435,58 +452,90 @@ # PSL data will be loaded into a psl type track to show the alignments (zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff if [ -s ../../../download/\${asmId}.remove.dups.list ]; then (zegrep -v "NG_" \$ncbiGffGz || true) \\ | grep -v -f ../../../download/\${asmId}.remove.dups.list \\ | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\ gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl else (zegrep -v "NG_" \$ncbiGffGz || true) \\ | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\ gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl fi +# might be empty result +if [ ! -s \$asmId.psl ]; then + rm -f \$asmId.psl +else +_EOF_ + ); + # note else clause of above if statement is concluded below in these two + # cases + if ( -s "$localLiftFile") { + $bossScript->add(<<_EOF_ simpleChain -outPsl -maxGap=300000 \$asmId.psl stdout | pslSwap stdin stdout \\ | liftUp -type=.psl stdout $localLiftFile warn stdin \\ | gzip -c > \$db.psl.gz +fi +_EOF_ + ); + } else { + $bossScript->add(<<_EOF_ + simpleChain -outPsl -maxGap=300000 \$asmId.psl stdout | pslSwap stdin stdout \\ + | gzip -c > \$db.psl.gz +fi +_EOF_ + ); + } + + $bossScript->add(<<_EOF_ +if [ -s \$db.psl.gz ]; then pslCheck $pslTargetSizes \\ -querySizes=\$downloadDir/rna.sizes \$db.psl.gz +fi # extract RNA CDS information from genbank record # Note: $asmId.raFile.txt could be used instead of _rna.gbff.gz \$gbffToCds \$downloadDir/\${asmId}_rna.gbff.gz | sort > \$asmId.rna.cds # the NCBI _genomic.gff.gz file only contains cDNA_match records for transcripts # that do not *exactly* match the reference genome. For all other transcripts # construct 'fake' PSL records representing the alignments of all cDNAs # that would be perfect matches to the reference genome. The pslFixCdsJoinGap # will fixup those records with unusual alignments due to frameshifts of # various sorts as found in the rna.cds file: genePredToFakePsl -qSizes=\$downloadDir/rna.sizes $fakePslSizes \$db \$db.ncbiRefSeq.gp \\ stdout \$db.fake.cds \\ | pslFixCdsJoinGap stdin \$asmId.rna.cds \$db.fake.psl +if [ -s \$db.psl.gz ]; then pslCat -nohead \$db.psl.gz | cut -f10,14 > \$db.psl.names +fi if [ -s \$db.psl.names ]; then pslSomeRecords -tToo -not \$db.fake.psl \$db.psl.names \$db.someRecords.psl else cp -p \$db.fake.psl \$db.someRecords.psl fi +if [ -s \$db.psl.gz ]; then pslSort dirs stdout \\ - ./tmpdir \$db.psl.gz \$db.someRecords.psl \\ - | (pslCheck -quiet $pslTargetSizes -pass=stdout -fail=\$asmId.\$db.fail.psl stdin || true) \\ + ./tmpdir \$db.psl.gz \$db.someRecords.psl | gzip -c > \$db.sorted.psl.gz +else + pslSort dirs stdout \\ + ./tmpdir \$db.someRecords.psl | gzip -c > \$db.sorted.psl.gz +fi +(pslCheck -quiet $pslTargetSizes -pass=stdout -fail=\$asmId.\$db.fail.psl \$db.sorted.psl.gz || true) \\ | pslPosTarget stdin stdout \\ | pslRecalcMatch -ignoreQMissing stdin $dbTwoBit \$downloadDir/\$asmId.rna.fa.gz stdout \\ | sort -k14,14 -k16,16n | gzip -c > \$asmId.\$db.psl.gz rm -fr ./tmpdir pslCheck $pslTargetSizes \$asmId.\$db.psl.gz _EOF_ ); $bossScript->execute(); } # doProcess ######################################################################### # * step: load [dbHost] sub doLoad { my $runDir = "$buildDir"; &HgAutomate::mustMkdir($runDir); @@ -617,60 +666,96 @@ > \$db.ncbiRefSeqPepTable.tab # and load the fasta peptides, again, only those for items that exist pslCat -nohead process/\$asmId.\$db.psl.gz | cut -f10 \\ | sort -u > \$db.psl.used.rna.list cut -f5 process/\$asmId.\$db.ncbiRefSeqLink.tab \\ | grep . | sort -u > \$db.mrnaAcc.name.list sort -u \$db.psl.used.rna.list \$db.mrnaAcc.name.list > \$db.rna.select.list cut -f1 process/\$db.ncbiRefSeq.gp | sort -u > \$db.gp.name.list comm -12 \$db.rna.select.list \$db.gp.name.list > \$db.toLoad.rna.list comm -23 \$db.rna.select.list \$db.gp.name.list > \$db.not.used.rna.list faSomeRecords download/\$asmId.rna.fa.gz \$db.toLoad.rna.list \$db.rna.fa grep '^>' \$db.rna.fa | sed -e 's/^>//' | sort > \$db.rna.found.list comm -13 \$db.rna.found.list \$db.gp.name.list > \$db.noRna.available.list +_EOF_ + ); + + my $nonNucNames="chrM"; + + my $haveLiftFile = 0; + if ( -s "$liftFile" ) { + $haveLiftFile = 1; + } + # if this is an assembly hub, find out what the non-nuclear names are + if ( $opt_assemblyHub) { + if ( -s "../../sequence/$asmId.nonNucChr.names") { + # with lift file means to use UCSC name in the nonNucChr.names + if ($haveLiftFile ) { + $nonNucNames=`cut -f1 ../../sequence/$asmId.nonNucChr.names | xargs echo | tr ' ' '|'`; + } else { + $nonNucNames=`cut -f2 ../../sequence/$asmId.nonNucChr.names | xargs echo | tr ' ' '|'`; + } + chomp $nonNucNames; + } + } + # if this is an assembly hub and it did NOT supply a lift file, + # then we don't need one, do not attempt to build + $bossScript->add(<<_EOF_ + # If \$db.noRna.available.list is not empty but items are on chrM, # make fake cDNA sequence for them using chrM sequence # since NCBI puts proteins, not coding RNAs, in the GFF. if [ -s \$db.noRna.available.list ]; then pslCat -nohead process/\$asmId.\$db.psl.gz \\ | grep -Fwf \$db.noRna.available.list \\ - | grep chrM > missingChrMFa.psl + | egrep "$nonNucNames" > missingChrMFa.psl if [ -s missingChrMFa.psl ]; then pslToBed missingChrMFa.psl stdout \\ | twoBitToFa -bed=stdin \$target2bit stdout >> \$db.rna.fa fi fi +if [ -s process/\$asmId.rna.cds.gz ]; then + zcat process/\$asmId.rna.cds.gz egrep '[0-9]+\\.\\.[0-9]\\+' \\ + pslMismatchGapToBed -cdsFile=stdin -db=\$db -ignoreQNamePrefix=X \\ + process/\$asmId.\$db.psl.gz \$target2bit \\ + \$db.rna.fa ncbiRefSeqGenomicDiff || true + + wget -O txAliDiff.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/txAliDiff.as' + bedToBigBed -type=bed9+ -tab -as=txAliDiff.as \\ + ncbiRefSeqGenomicDiff.bed \$db.chrom.sizes ncbiRefSeqGenomicDiff.bb +fi + export totalBases=`ave -col=2 \$db.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'` export basesCovered=`bedSingleCover.pl \$db.ncbiRefSeq.bigGp | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'` export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'` printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" \\ "\$totalBases" "\$percentCovered" > fb.ncbiRefSeq.\$db.txt printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$asmId.ncbiRefSeq.txt rm -f \$db.ncbiRefSeq.bigGp \$asmId.exons.bed pslToBigPsl -fa=download/\$asmId.rna.fa.gz -cds=process/\$asmId.rna.cds \\ process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.bigPsl bedToBigBed -type=bed12+13 -tab -as=bigPsl.as -extraIndex=name \\ \$asmId.bigPsl \$db.chrom.sizes \$asmId.bigPsl.bb rm -f \$asmId.bigPsl _EOF_ ); - } else { + } else { # processing for a database genome $bossScript->add(<<_EOF_ # loading the genePred tracks, all genes in one, and subsets hgLoadGenePred -genePredExt \$db ncbiRefSeq process/\$db.ncbiRefSeq.gp $genePredCheckDb ncbiRefSeq if [ -s process/\$db.curated.gp ]; then hgLoadGenePred -genePredExt \$db ncbiRefSeqCurated process/\$db.curated.gp $genePredCheckDb ncbiRefSeqCurated if [ -s process/\$db.refseqSelect.curated.gp ]; then hgLoadGenePred -genePredExt \$db ncbiRefSeqSelect process/\$db.refseqSelect.curated.gp $genePredCheckDb ncbiRefSeqSelect fi fi @@ -727,30 +812,42 @@ pslCat -nohead process/\$asmId.\$db.psl.gz \\ | grep -Fwf \$db.noRna.available.list \\ | grep chrM > missingChrMFa.psl if [ -s missingChrMFa.psl ]; then pslToBed missingChrMFa.psl stdout \\ | twoBitToFa -bed=stdin $dbTwoBit stdout >> \$db.rna.fa fi fi mkdir -p $gbdbDir ln -f -s `pwd`/\$db.rna.fa $gbdbDir/seqNcbiRefSeq.rna.fa hgLoadSeq -drop -seqTbl=seqNcbiRefSeq -extFileTbl=extNcbiRefSeq \$db $gbdbDir/seqNcbiRefSeq.rna.fa hgLoadPsl \$db -table=ncbiRefSeqPsl process/\$asmId.\$db.psl.gz +if [ -s process/\$asmId.rna.cds.gz ]; then + zcat process/\$asmId.rna.cds.gz egrep '[0-9]+\\.\\.[0-9]\\+' \\ + pslMismatchGapToBed -cdsFile=stdin -db=\$db -ignoreQNamePrefix=X \\ + process/\$asmId.\$db.psl.gz $dbTwoBit \\ + \$db.rna.fa ncbiRefSeqGenomicDiff || true + + bedToBigBed -type=bed9+ -tab -as=~/kent/src/hg/lib/txAliDiff.as \\ + ncbiRefSeqGenomicDiff.bed process/\$db.chrom.sizes ncbiRefSeqGenomicDiff.bb + rm -f $gbdbDir/ncbiRefSeqGenomicDiff.bb + ln -s `pwd`/ncbiRefSeqGenomicDiff.bb $gbdbDir/ncbiRefSeqGenomicDiff.bb +fi + if [ -d "/usr/local/apache/htdocs-hgdownload/goldenPath/archive" ]; then gtfFile=`ls \$db.*.ncbiRefSeq.gtf.gz` mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq/\$gtfFile ln -s `pwd`/\$db.*.ncbiRefSeq.gtf.gz \\ /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq/ fi featureBits \$db ncbiRefSeq > fb.ncbiRefSeq.\$db.txt 2>&1 cat fb.ncbiRefSeq.\$db.txt 2>&1 _EOF_ ); } # if ($dbExists) $bossScript->execute(); } # doLoad