deeb37cb48db2bb239792b0ed59889eeb271b0d6 hiram Wed Jan 8 17:10:16 2020 -0800
    do not actually need the targetSizes option, get chrom.sizes from target2bit refs #23891

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index a139ab7..0d9fdc5 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -22,31 +22,30 @@
 my $gff3ToRefLink = "$Bin/gff3ToRefLink.pl";
 my $gbffToCds = "$Bin/gbffToCds.pl";
 my $ncbiRefSeqOtherIxIxx = "$Bin/ncbiRefSeqOtherIxIxx.pl";
 my $ncbiRefSeqOtherAttrs = "$Bin/ncbiRefSeqOtherAttrs.pl";

 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
    $opt_buildDir
    $opt_genbank
    $opt_subgroup
    $opt_species
    $opt_liftFile
    $opt_target2bit
-    $opt_targetSizes
    $opt_toGpWarnOnly
    /;

 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
    [ { name => 'download', func => \&doDownload },
      { name => 'process', func => \&doProcess },
      { name => 'load', func => \&doLoad },
      { name => 'cleanup', func => \&doCleanup },
    ]
 );

 # Option defaults:
 my $dbHost = 'hgwdev';
 my $bigClusterHub = 'ku';
@@ -76,80 +75,77 @@
 db - database to load with track tables

 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
    -buildDir dir         Use dir instead of default
                          $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ncbiRefSeq.\$date
                          (necessary when continuing at a later date).
    -toGpWarnOnly         add -warnAndContinue to the gff3ToGenePred operation
                          to avoid gene definitions that will not convert
    -liftFile pathName    a lift file to translate NCBI names to local genome names
    -target2bit pathName  when not a local UCSC genome, use this 2bit file for
                          the target genome sequence
-    -targetSizes pathName when not a local UCSC genome, use this file for
-                          target genome chrom.sizes file
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
                                 'workhorse' => $defaultWorkhorse,
                                 'fileServer' => $defaultFileServer,
                                 'bigClusterHub' => $bigClusterHub,
                                 'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Automates UCSC's ncbiRefSeq track build.  Steps:
    download: symlink local files or rsync required files from NCBI FTP site
    process: generate the track data files from the download data
    load: load the processed data into database tables
    cleanup: Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ncbiRefSeq.\$date unless -buildDir is given.
 Expects to find already done idKeys procedure and result file in:
    /hive/data/genomes/<db>/bed/idKeys/<db>.idKeys.txt
 See also: doIdKeys.pl command.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes
    from \$db.2bit.
-NOTE: Override these assumptions with -target2Bit and -targetSizes options
+NOTE: Override these assumptions with the -target2Bit option
 " if ($detailed);
   print "\n";
   exit $status;
 }

 # Globals:
 # Command line args: genbankRefseq subGroup species asmId db
 my ($genbankRefseq, $subGroup, $species, $asmId, $db, $ftpDir);
 # Other:
-my ($buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit, $targetSizes);
+my ($buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit);
 my ($secondsStart, $secondsEnd);

 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
                       'buildDir=s',
                       'liftFile=s',
                       'target2bit=s',
-                      'targetSizes=s',
                       'toGpWarnOnly',
                       @HgAutomate::commonOptionSpec,
                       );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
   $workhorse = $opt_workhorse if ($opt_workhorse);
   $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
   $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub);
   $fileServer = $opt_fileServer if ($opt_fileServer);
 }
@@ -194,31 +190,30 @@
 _EOF_
   );
   } else {
   $bossScript->add(<<_EOF_
 # local file copies do not exist, download from NCBI:
 for F in _rna.gbff _rna.fna _protein.faa _genomic.gff
 do
   rsync -a -P \\
     rsync://ftp.ncbi.nlm.nih.gov/\$ftpDir/\$asmId\${F}.gz ./
 done
 _EOF_
   );
 }
-printf STDERR "# checking $local2Bit\n";
 if ( -s $local2Bit ) {
   $bossScript->add(<<_EOF_
 ln -f -s $local2Bit .
 _EOF_
   );
 } elsif ( -s "$outsideCopy/${asmId}_genomic.fna.gz") {
   $bossScript->add(<<_EOF_
 # build \$asmId.ncbi.2bit from local copy of genomic fasta
 faToTwoBit \$outsideCopy/\${asmId}_genomic.fna.gz \${asmId}.ncbi.2bit
 _EOF_
   );
 } else {
   $bossScript->add(<<_EOF_
 # download genomic fasta and build \${asmId}.ncbi.2bit
@@ -236,57 +231,58 @@
 # check if db.ucscToRefSeq exists?
     my $sql = "show tables like 'ucscToRefSeq';";
     my $result = `hgsql $db -BN -e "$sql"`;
     chomp $result;
     if ( $result =~ m/ucscToRefSeq/ ) {
       $haveLiftFile = 1;
       $bossScript->add(<<_EOF_
 cd \$runDir
 hgsql -N -e 'select 0,name,chromEnd,chrom,chromEnd from ucscToRefSeq;' \${db} \\
   > \${asmId}To\${db}.lift
 _EOF_
       );
     }
   }
-  my $hostSizes = "$HgAutomate::clusterData/\$db/chrom.sizes";
-  $hostSizes = $targetSizes if (-s "$targetSizes");
+  my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
+  $dbTwoBit = $target2bit if (-s "$target2bit");
   if (! $haveLiftFile ) {
     $bossScript->add(<<_EOF_
 # generate idKeys for the NCBI sequence to translate names to UCSC equivalents
 rm -rf \$runDir/idKeys
 mkdir -p \$runDir/idKeys
 cd \$runDir/idKeys
 time ($doIdKeys -buildDir=\$runDir/idKeys -twoBit=\$runDir/\$asmId.ncbi.2bit \$db) > idKeys.log 2>&1
 cd \$runDir
+twoBitInfo $dbTwoBit stdout | sort -k2,2nr > \$db.chrom.sizes
 ln -f -s idKeys/\$db.idKeys.txt ./ncbi.\$asmId.idKeys.txt
 ln -f -s /hive/data/genomes/\$db/bed/idKeys/\$db.idKeys.txt ./ucsc.\$db.idKeys.txt
 # joining the idKeys establishes a lift file to translate chrom names
 join -t\$'\\t' ucsc.\$db.idKeys.txt ncbi.\$asmId.idKeys.txt \\
   | cut -f2-3 | sort \\
-    | join -t\$'\\t' - <(sort $hostSizes) \\
+    | join -t\$'\\t' - <(sort \$db.chrom.sizes) \\
       | awk -F\$'\\t' '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$2, \$3, \$1, \$3}' \\
         | sort -k3nr > \${asmId}To\${db}.lift
 _EOF_
     );
   }
   $bossScript->add(<<_EOF_
 cd \$runDir
-twoBitInfo \$asmId.ncbi.2bit stdout | sort -k2nr > $hostSizes
+twoBitInfo \$asmId.ncbi.2bit stdout | sort -k2nr > \$asmId.ncbi.chrom.sizes
 zcat \${asmId}_rna.fna.gz | sed -e 's/ .*//;' | gzip -c > \$asmId.rna.fa.gz
 faSize -detailed \${asmId}.rna.fa.gz | sort -k2nr > rna.sizes
 # genbank processor extracts infomation about the RNAs
 /hive/data/outside/genbank/bin/x86_64/gbProcess \\
     -inclXMs /dev/null \$asmId.raFile.txt \${asmId}_rna.gbff.gz
 _EOF_
   );
   $bossScript->execute();
 } # doDownload

 #########################################################################
 # * step: process [dbHost]
 sub doProcess {
   my $runDir = "$buildDir/process";
@@ -295,42 +291,39 @@
   my $whatItDoes = "process NCBI download files into track files.";
   # Use dbHost since genePredCheck -db=$db needs database
   my $bossScript = newBash HgRemoteScript("$runDir/doProcess.bash", $dbHost,
                                           $runDir, $whatItDoes);

   my $genePredCheckDb = "genePredCheck -db=\$db";
   if (! $dbExists) {
     $genePredCheckDb = "genePredCheck";
   }

   my $warnOnly = "";
   $warnOnly = "-warnAndContinue" if ($toGpWarnOnly);

   my $localLiftFile = "\$downloadDir/\${asmId}To\${db}.lift";
   $localLiftFile = $liftFile if (-s $liftFile);
-  my $hostSizes = "$HgAutomate::clusterData/\$db/chrom.sizes";
-  my $pslTargetSizes = "-targetSizes=$hostSizes -db=\$db";
-  if (-s "$targetSizes") {
-    $hostSizes = $targetSizes;
-    $pslTargetSizes = "-targetSizes=$hostSizes";
+  my $pslTargetSizes = "-db=\$db";
+  my $fakePslSizes = "";
+  if (-s "$target2bit") {
+    $pslTargetSizes = "-targetSizes=\$db.chrom.sizes";
+    $fakePslSizes = "-chromSize=\$db.chrom.sizes";
   }
-  my $fakePslSizes = $targetSizes ? "-chromSize=$targetSizes" : "";
   my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
   $dbTwoBit = $target2bit if (-s "$target2bit");
-printf STDERR "# DBG dbTwoBit: '%s'\n", $dbTwoBit;
-printf STDERR "# DBG: target2bit: '%s'\n", $target2bit;
   $bossScript->add(<<_EOF_
 # establish all variables to use here
 export asmId=$asmId
 export downloadDir=$downloadDir
 export ncbiGffGz=\$downloadDir/\${asmId}_genomic.gff.gz
 export db=$db
 export gff3ToRefLink=$gff3ToRefLink
 export gbffToCds=$gbffToCds
 export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //'`
 if [ "\$annotationRelease" == "" ]; then
   export annotationRelease=\$asmId
 fi
@@ -380,47 +373,48 @@
 if [ -s \$db.curated.gp ]; then
   $genePredCheckDb \$db.curated.gp
 fi
 if [ -s \$db.predicted.gp ]; then
   $genePredCheckDb \$db.predicted.gp
 fi
 if [ -s \$db.other.gp ]; then
   $genePredCheckDb \$db.other.gp
 fi

 # join the refLink metadata with curated+predicted names
 cut -f1 \$db.ncbiRefSeq.gp | sort -u > \$asmId.\$db.name.list
 join -t\$'\\t' \$asmId.\$db.name.list \$asmId.refLink.tab > \$asmId.\$db.ncbiRefSeqLink.tab

 # Make bigBed with attributes in extra columns for ncbiRefSeqOther:
+twoBitInfo $dbTwoBit stdout | sort -k2,2n > \$db.chrom.sizes
 genePredToBed \$db.other.gp stdout | sort -k1,1 -k2n,2n > \$db.other.bed
 $ncbiRefSeqOtherAttrs \$db.other.bed \$asmId.attrs.txt > \$db.other.extras.bed
 bedToBigBed -type=bed12+13 -as=ncbiRefSeqOther.as -tab \\
   -extraIndex=name \\
-    \$db.other.extras.bed $hostSizes \$db.other.bb
+    \$db.other.extras.bed \$db.chrom.sizes \$db.other.bb

 # Make trix index for ncbiRefSeqOther
 $ncbiRefSeqOtherIxIxx \\
   ncbiRefSeqOther.as \$db.other.extras.bed > ncbiRefSeqOther.ix.tab
 ixIxx ncbiRefSeqOther.ix.tab ncbiRefSeqOther.ix{,x}

 # PSL data will be loaded into a psl type track to show the alignments
 (zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff
 (zegrep -v "NG_" \$ncbiGffGz || true) \\
   | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff
-gff3ToPsl -dropT \$downloadDir/\$asmId.chrom.sizes \$downloadDir/rna.sizes \\
+gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\
     gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl
 simpleChain -outPsl -maxGap=300000 \$asmId.psl stdout | pslSwap stdin stdout \\
   | liftUp -type=.psl stdout $localLiftFile drop stdin \\
     | gzip -c > \$db.psl.gz
 pslCheck $pslTargetSizes \\
   -querySizes=\$downloadDir/rna.sizes \$db.psl.gz

 # extract RNA CDS information from genbank record
 # Note: $asmId.raFile.txt could be used instead of _rna.gbff.gz
 \$gbffToCds \$downloadDir/\${asmId}_rna.gbff.gz | sort > \$asmId.rna.cds

 # the NCBI _genomic.gff.gz file only contains cDNA_match records for transcripts
 # that do not *exactly* match the reference genome.  For all other transcripts
 # construct 'fake' PSL records representing the alignments of all cDNAs
 # that would be perfect matches to the reference genome.  The pslFixCdsJoinGap
@@ -464,54 +458,52 @@
   my $genePredCheckDb = "genePredCheck -db=\$db";
   if (! $dbExists) {
     $genePredCheckDb = "genePredCheck";
   }

   $bossScript->add(<<_EOF_
 # establish all variables to use here
 export db=$db
 export asmId=$asmId
 _EOF_
   );
   if (! $dbExists) {
-    my $hostSizes = "$HgAutomate::clusterData/\$db/chrom.sizes";
-    $hostSizes = $targetSizes if (-s "$targetSizes");
     $bossScript->add(<<_EOF_
 export target2bit=$dbTwoBit
-export targetSizes=$hostSizes
+twoBitInfo \$target2bit stdout | sort -k2,2nr > \$db.chrom.sizes

 wget -O bigGenePred.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigGenePred.as'
 wget -O bigPsl.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigPsl.as'

 genePredToBigGenePred process/\$db.ncbiRefSeq.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeq.bigGp
 bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
-  \$db.ncbiRefSeq.bigGp \$targetSizes \\
+  \$db.ncbiRefSeq.bigGp \$db.chrom.sizes \\
     \$db.ncbiRefSeq.bb

 if [ -s process/\$db.curated.gp ]; then
   genePredToBigGenePred process/\$db.curated.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqCurated.bigGp
   bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
-    \$db.ncbiRefSeqCurated.bigGp \$targetSizes \\
+    \$db.ncbiRefSeqCurated.bigGp \$db.chrom.sizes \\
       \$db.ncbiRefSeqCurated.bb
   rm -f \$db.ncbiRefSeqCurated.bigGp
 fi

 if [ -s process/\$db.predicted.gp ]; then
   genePredToBigGenePred process/\$db.predicted.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqPredicted.bigGp
   bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
-    \$db.ncbiRefSeqPredicted.bigGp \$targetSizes \\
+    \$db.ncbiRefSeqPredicted.bigGp \$db.chrom.sizes \\
       \$db.ncbiRefSeqPredicted.bb
   rm -f \$db.ncbiRefSeqPredicted.bigGp
 fi

 if [ -s "process/\$db.other.bb" ]; then
   ln -f -s process/\$db.other.bb \$db.ncbiRefSeqOther.bb
 fi
 if [ -s "process/ncbiRefSeqOther.ix" ]; then
   ln -f -s process/ncbiRefSeqOther.ix ./\$db.ncbiRefSeqOther.ix
   ln -f -s process/ncbiRefSeqOther.ixx ./\$db.ncbiRefSeqOther.ixx
 fi
 ln -f -s process/ncbiRefSeqVersion.txt ./\$db.ncbiRefSeqVersion.txt

 # select only coding genes to have CDS records
 awk -F" " '\$6 != \$7 {print \$1;}' process/\$db.ncbiRefSeq.gp \\
   | sort -u > coding.cds.name.list
@@ -544,42 +536,42 @@
 comm -13 \$db.rna.found.list \$db.gp.name.list > \$db.noRna.available.list

 # If \$db.noRna.available.list is not empty but items are on chrM,
 # make fake cDNA sequence for them using chrM sequence
 # since NCBI puts proteins, not coding RNAs, in the GFF.
 if [ -s \$db.noRna.available.list ]; then
   pslCat -nohead process/\$asmId.\$db.psl.gz \\
     | grep -Fwf \$db.noRna.available.list \\
       | grep chrM > missingChrMFa.psl
   if [ -s missingChrMFa.psl ]; then
     pslToBed missingChrMFa.psl stdout \\
       | twoBitToFa -bed=stdin \$target2bit stdout >> \$db.rna.fa
   fi
 fi

-export totalBases=`ave -col=2 \$targetSizes | grep "^total" | awk '{printf "%d", \$2}'`
+export totalBases=`ave -col=2 \$db.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`
 export basesCovered=`bedSingleCover.pl \$db.ncbiRefSeq.bigGp | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'`
 export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'`
 printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" \\
   "\$totalBases" "\$percentCovered" > fb.ncbiRefSeq.\$db.txt
 rm -f \$db.ncbiRefSeq.bigGp

 pslToBigPsl -fa=download/\$asmId.rna.fa.gz -cds=process/\$asmId.rna.cds \\
   process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.\$db.bigPsl
 bedToBigBed -type=bed12+13 -tab -as=bigPsl.as \\
-  \$asmId.\$db.bigPsl \$targetSizes \$asmId.\$db.bigPsl.bb
+  \$asmId.\$db.bigPsl \$db.chrom.sizes \$asmId.\$db.bigPsl.bb
 _EOF_
   );
   } else {
   $bossScript->add(<<_EOF_
 # loading the genePred tracks, all genes in one, and subsets
 hgLoadGenePred -genePredExt \$db ncbiRefSeq process/\$db.ncbiRefSeq.gp
 $genePredCheckDb ncbiRefSeq
 if [ -s process/\$db.curated.gp ]; then
   hgLoadGenePred -genePredExt \$db ncbiRefSeqCurated process/\$db.curated.gp
   $genePredCheckDb ncbiRefSeqCurated
 fi
 if [ -s process/\$db.predicted.gp ]; then
@@ -674,31 +666,30 @@
 #########################################################################
 # main

 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();

 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 5);
 $toGpWarnOnly = 0;
 $toGpWarnOnly = 1 if ($opt_toGpWarnOnly);
 $liftFile = $opt_liftFile ? $opt_liftFile : "";
-$targetSizes = $opt_targetSizes ? $opt_targetSizes : "";
 $target2bit = $opt_target2bit ? $opt_target2bit : "";

 $secondsStart = `date "+%s"`;
 chomp $secondsStart;

 # expected command line arguments after options are processed
 ($genbankRefseq, $subGroup, $species, $asmId, $db) = @ARGV;
 $ftpDir = "genomes/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId";

 if ( -z "$liftFile" && ! -s "/hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt") {
   die "ERROR: can not find /hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt\n\t need to run doIdKeys.pl for $db before this procedure.";
 }

 # Force debug and verbose until this is looking pretty solid:
 # $opt_debug = 1;
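
Net effect of this change: wherever the generated build scripts previously consumed a chrom.sizes file handed in through -targetSizes, they now derive an equivalent $db.chrom.sizes on the fly from the -target2bit sequence with twoBitInfo. A minimal shell sketch of that derivation and of an invocation after this change; the 2bit path, build directory, and the bracketed positional arguments below are hypothetical placeholders, not values taken from this commit:

    # derive a chrom.sizes file from the target 2bit, as the build scripts now do internally;
    # twoBitInfo emits "sequenceName<TAB>size", sorted here by size, largest first
    twoBitInfo /path/to/target.2bit stdout | sort -k2,2nr > target.chrom.sizes

    # example invocation: only -target2bit is supplied; the -targetSizes option no longer exists
    doNcbiRefSeq.pl -buildDir=/path/to/buildDir -target2bit=/path/to/target.2bit \
        <genbankRefseq> <subGroup> <species> <asmId> <db>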