e87ffcb94edd42fabe93f3057a0719aa92d77381 hiram Wed Jan 8 15:36:13 2020 -0800
now constructing files in assembly hub build environment refs #23891

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index e15b105..a139ab7 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -20,30 +20,33 @@
my $doIdKeys = "$Bin/doIdKeys.pl";
my $gff3ToRefLink = "$Bin/gff3ToRefLink.pl";
my $gbffToCds = "$Bin/gbffToCds.pl";
my $ncbiRefSeqOtherIxIxx = "$Bin/ncbiRefSeqOtherIxIxx.pl";
my $ncbiRefSeqOtherAttrs = "$Bin/ncbiRefSeqOtherAttrs.pl";

# Option variable names, both common and peculiar to this script:
use vars @HgAutomate::commonOptionVars;
use vars @HgStepManager::optionVars;
use vars qw/
    $opt_buildDir
    $opt_genbank
    $opt_subgroup
    $opt_species
+    $opt_liftFile
+    $opt_target2bit
+    $opt_targetSizes
    $opt_toGpWarnOnly
    /;

# Specify the steps supported with -continue / -stop:
my $stepper = new HgStepManager(
    [ { name => 'download', func => \&doDownload },
      { name => 'process', func => \&doProcess },
      { name => 'load', func => \&doLoad },
      { name => 'cleanup', func => \&doCleanup },
    ]
);

# Option defaults:
my $dbHost = 'hgwdev';
my $bigClusterHub = 'ku';

@@ -69,74 +72,84 @@
species - species directory at NCBI FTP site, examples:
        - Homo_sapiens Mus_musculus etc...
asmId - assembly identifier at NCBI FTP site, examples:
        - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc..
db - database to load with track tables

options:
";
  print STDERR $stepper->getOptionHelp();
  print STDERR <<_EOF_
    -buildDir dir         Use dir instead of default
                          $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ncbiRefSeq.\$date
                          (necessary when continuing at a later date).
    -toGpWarnOnly         add -warnAndContinue to the gff3ToGenePred operation
                          to avoid gene definitions that will not convert
+    -liftFile pathName    a lift file to translate NCBI names to local genome
+                          names
+    -target2bit pathName  when not a local UCSC genome, use this 2bit file for
+                          the target genome sequence
+    -targetSizes pathName when not a local UCSC genome, use this file for
+                          target genome chrom.sizes file
_EOF_
  ;
  print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
                                'workhorse' => $defaultWorkhorse,
                                'fileServer' => $defaultFileServer,
                                'bigClusterHub' => $bigClusterHub,
                                'smallClusterHub' => $smallClusterHub);
  print STDERR "
Automates UCSC's ncbiRefSeq track build. Steps:
    download: symlink local files or rsync required files from NCBI FTP site
    process: generate the track data files from the download data
    load: load the processed data into database tables
    cleanup: Removes or compresses intermediate files.
All operations are performed in the build directory which is
$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ncbiRefSeq.\$date unless -buildDir is given.
Expects to find already done idKeys procedure and result file in:
    /hive/data/genomes/<db>/bed/idKeys/<db>.idKeys.txt
See also: doIdKeys.pl command.
";
  # Detailed help (-help):
  print STDERR "
Assumptions:
1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
   database/assembly \$db.
2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names
   and sizes from \$db.2bit.
+NOTE: Override these assumptions with -target2bit and -targetSizes options
" if ($detailed);
  print "\n";
  exit $status;
}

# Globals:
# Command line args: genbankRefseq subGroup species asmId db
my ($genbankRefseq, $subGroup, $species, $asmId, $db, $ftpDir);
# Other:
-my ($buildDir, $toGpWarnOnly, $dbExists);
+my ($buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit, $targetSizes);
my ($secondsStart, $secondsEnd);

sub checkOptions {
  # Make sure command line options are valid/supported.
  my $ok = GetOptions(@HgStepManager::optionSpec,
    'buildDir=s',
+    'liftFile=s',
+    'target2bit=s',
+    'targetSizes=s',
    'toGpWarnOnly',
    @HgAutomate::commonOptionSpec,
    );
  &usage(1) if (!$ok);
  &usage(0, 1) if ($opt_help);
  &HgAutomate::processCommonOptions();
  my $err = $stepper->processOptions();
  usage(1) if ($err);
  $dbHost = $opt_dbHost if ($opt_dbHost);
  $workhorse = $opt_workhorse if ($opt_workhorse);
  $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
  $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub);
  $fileServer = $opt_fileServer if ($opt_fileServer);
}

@@ -205,97 +218,120 @@
faToTwoBit \$outsideCopy/\${asmId}_genomic.fna.gz \${asmId}.ncbi.2bit
_EOF_
    );
  } else {
    $bossScript->add(<<_EOF_
# download genomic fasta and build \${asmId}.ncbi.2bit
rsync -a -P \\
  rsync://ftp.ncbi.nlm.nih.gov/\$ftpDir/\${asmId}_genomic.fna.gz ./
faToTwoBit \${asmId}_genomic.fna.gz \${asmId}.ncbi.2bit
_EOF_
    );
  }
  my $haveLiftFile = 0;
-  if ($dbExists) {
+  if ( -s "$liftFile" ) {
+    $haveLiftFile = 1;
+  } elsif ($dbExists) {
    # check if db.ucscToRefSeq exists?
    my $sql = "show tables like 'ucscToRefSeq';";
    my $result = `hgsql $db -BN -e "$sql"`;
    chomp $result;
    if ( $result =~ m/ucscToRefSeq/ ) {
      $haveLiftFile = 1;
      $bossScript->add(<<_EOF_
cd \$runDir
hgsql -N -e 'select 0,name,chromEnd,chrom,chromEnd from ucscToRefSeq;' \${db} \\
  > \${asmId}To\${db}.lift
_EOF_
      );
    }
  }
+  my $hostSizes = "$HgAutomate::clusterData/\$db/chrom.sizes";
+  $hostSizes = $targetSizes if (-s "$targetSizes");
  if (! $haveLiftFile ) {
    $bossScript->add(<<_EOF_
# generate idKeys for the NCBI sequence to translate names to UCSC equivalents
rm -rf \$runDir/idKeys
mkdir -p \$runDir/idKeys
cd \$runDir/idKeys
time ($doIdKeys -buildDir=\$runDir/idKeys -twoBit=\$runDir/\$asmId.ncbi.2bit \$db) > idKeys.log 2>&1
cd \$runDir
ln -f -s idKeys/\$db.idKeys.txt ./ncbi.\$asmId.idKeys.txt
ln -f -s /hive/data/genomes/\$db/bed/idKeys/\$db.idKeys.txt ./ucsc.\$db.idKeys.txt
# joining the idKeys establishes a lift file to translate chrom names
-join -t'\t' ucsc.\$db.idKeys.txt ncbi.\$asmId.idKeys.txt \\
+join -t\$'\\t' ucsc.\$db.idKeys.txt ncbi.\$asmId.idKeys.txt \\
  | cut -f2-3 | sort \\
-  | join -t'\t' - <(sort $HgAutomate::clusterData/\$db/chrom.sizes) \\
-  | awk -F'\t' '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$2, \$3, \$1, \$3}' \\
+  | join -t\$'\\t' - <(sort $hostSizes) \\
+  | awk -F\$'\\t' '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$2, \$3, \$1, \$3}' \\
  | sort -k3nr > \${asmId}To\${db}.lift
_EOF_
    );
  }

  $bossScript->add(<<_EOF_
cd \$runDir
-twoBitInfo \$asmId.ncbi.2bit stdout | sort -k2nr > \$asmId.chrom.sizes
+twoBitInfo \$asmId.ncbi.2bit stdout | sort -k2nr > $hostSizes
zcat \${asmId}_rna.fna.gz | sed -e 's/ .*//;' | gzip -c > \$asmId.rna.fa.gz
faSize -detailed \${asmId}.rna.fa.gz | sort -k2nr > rna.sizes
# genbank processor extracts information about the RNAs
/hive/data/outside/genbank/bin/x86_64/gbProcess \\
  -inclXMs /dev/null \$asmId.raFile.txt \${asmId}_rna.gbff.gz
_EOF_
  );
  $bossScript->execute();
} # doDownload

#########################################################################
# * step: process [dbHost]
sub doProcess {
  my $runDir = "$buildDir/process";
  my $downloadDir = "$buildDir/download";
  &HgAutomate::mustMkdir($runDir);
  my $whatItDoes = "process NCBI download files into track files.";
  # Use dbHost since genePredCheck -db=$db needs database
  my $bossScript = newBash HgRemoteScript("$runDir/doProcess.bash", $dbHost,
    $runDir, $whatItDoes);
+  my $genePredCheckDb = "genePredCheck -db=\$db";
+  if (! $dbExists) {
+    $genePredCheckDb = "genePredCheck";
+  }
  my $warnOnly = "";
  $warnOnly = "-warnAndContinue" if ($toGpWarnOnly);
+  my $localLiftFile = "\$downloadDir/\${asmId}To\${db}.lift";
+  $localLiftFile = $liftFile if (-s $liftFile);
+  my $hostSizes = "$HgAutomate::clusterData/\$db/chrom.sizes";
+  my $pslTargetSizes = "-targetSizes=$hostSizes -db=\$db";
+  if (-s "$targetSizes") {
+    $hostSizes = $targetSizes;
+    $pslTargetSizes = "-targetSizes=$hostSizes";
+  }
+  my $fakePslSizes = $targetSizes ? "-chromSize=$targetSizes" : "";
+  my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
+  $dbTwoBit = $target2bit if (-s "$target2bit");
+printf STDERR "# DBG dbTwoBit: '%s'\n", $dbTwoBit;
+printf STDERR "# DBG: target2bit: '%s'\n", $target2bit;
+
  $bossScript->add(<<_EOF_
# establish all variables to use here
export asmId=$asmId
export downloadDir=$downloadDir
export ncbiGffGz=\$downloadDir/\${asmId}_genomic.gff.gz
export db=$db
export gff3ToRefLink=$gff3ToRefLink
export gbffToCds=$gbffToCds
export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //'`
if [ "\$annotationRelease" == "" ]; then
  export annotationRelease=\$asmId
fi
export versionDate=`ls -L --full-time \$ncbiGffGz | awk '{print \$6;}'`

@@ -309,183 +345,285 @@
    -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp
genePredCheck \$asmId.gp

# extract labels from semi-structured text in gbff COMMENT/description sections:
zcat \$downloadDir/\${asmId}_rna.gbff.gz \\
  | (grep ' :: ' || true) \\
  | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \\
  | sort -u \\
    > pragmaLabels.txt

# extract cross reference text for refLink
\$gff3ToRefLink \$downloadDir/\$asmId.raFile.txt \$ncbiGffGz pragmaLabels.txt 2> \$db.refLink.stderr.txt \\
  | sort > \$asmId.refLink.tab

# converting the NCBI coordinates to UCSC coordinates
-liftUp -extGenePred -type=.gp stdout \$downloadDir/\${asmId}To\${db}.lift drop \$asmId.gp \\
+liftUp -extGenePred -type=.gp stdout $localLiftFile drop \$asmId.gp \\
  | gzip -c > \$asmId.\$db.gp.gz
-genePredCheck -db=\$db \$asmId.\$db.gp.gz
+$genePredCheckDb \$asmId.\$db.gp.gz

# curated subset of all genes
(zegrep "^[NY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.curated.gp
# may not be any curated genes
if [ ! -s \$db.curated.gp ]; then
  rm -f \$db.curated.gp
fi
# predicted subset of all genes
(zegrep "^X[MR]_" \$asmId.\$db.gp.gz || true) > \$db.predicted.gp
# not curated or predicted subset of all genes, the left overs
(zegrep -v "^[NXY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.other.gp
# curated and predicted without leftovers:
(zegrep "^[NXY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.ncbiRefSeq.gp
if [ -s \$db.curated.gp ]; then
-  genePredCheck -db=\$db \$db.curated.gp
+  $genePredCheckDb \$db.curated.gp
fi
if [ -s \$db.predicted.gp ]; then
-  genePredCheck -db=\$db \$db.predicted.gp
+  $genePredCheckDb \$db.predicted.gp
fi
if [ -s \$db.other.gp ]; then
-  genePredCheck -db=\$db \$db.other.gp
+  $genePredCheckDb \$db.other.gp
fi

# join the refLink metadata with curated+predicted names
cut -f1 \$db.ncbiRefSeq.gp | sort -u > \$asmId.\$db.name.list
-join -t'\t' \$asmId.\$db.name.list \$asmId.refLink.tab > \$asmId.\$db.ncbiRefSeqLink.tab
+join -t\$'\\t' \$asmId.\$db.name.list \$asmId.refLink.tab > \$asmId.\$db.ncbiRefSeqLink.tab

# Make bigBed with attributes in extra columns for ncbiRefSeqOther:
genePredToBed \$db.other.gp stdout | sort -k1,1 -k2n,2n > \$db.other.bed
$ncbiRefSeqOtherAttrs \$db.other.bed \$asmId.attrs.txt > \$db.other.extras.bed
bedToBigBed -type=bed12+13 -as=ncbiRefSeqOther.as -tab \\
  -extraIndex=name \\
-  \$db.other.extras.bed $HgAutomate::clusterData/\$db/chrom.sizes \$db.other.bb
+  \$db.other.extras.bed $hostSizes \$db.other.bb

# Make trix index for ncbiRefSeqOther
$ncbiRefSeqOtherIxIxx \\
  ncbiRefSeqOther.as \$db.other.extras.bed > ncbiRefSeqOther.ix.tab
ixIxx ncbiRefSeqOther.ix.tab ncbiRefSeqOther.ix{,x}

# PSL data will be loaded into a psl type track to show the alignments
(zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff
(zegrep -v "NG_" \$ncbiGffGz || true) \\
-  | awk -F'\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff
+  | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff
gff3ToPsl -dropT \$downloadDir/\$asmId.chrom.sizes \$downloadDir/rna.sizes \\
  gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl
simpleChain -outPsl -maxGap=300000 \$asmId.psl stdout | pslSwap stdin stdout \\
-  | liftUp -type=.psl stdout \$downloadDir/\${asmId}To\${db}.lift drop stdin \\
+  | liftUp -type=.psl stdout $localLiftFile drop stdin \\
  | gzip -c > \$db.psl.gz
-pslCheck -targetSizes=$HgAutomate::clusterData/\$db/chrom.sizes \\
-  -querySizes=\$downloadDir/rna.sizes -db=\$db \$db.psl.gz
+pslCheck $pslTargetSizes \\
+  -querySizes=\$downloadDir/rna.sizes \$db.psl.gz

# extract RNA CDS information from genbank record
# Note: $asmId.raFile.txt could be used instead of _rna.gbff.gz
\$gbffToCds \$downloadDir/\${asmId}_rna.gbff.gz | sort > \$asmId.rna.cds

# the NCBI _genomic.gff.gz file only contains cDNA_match records for transcripts
# that do not *exactly* match the reference genome. For all other transcripts
# construct 'fake' PSL records representing the alignments of all cDNAs
# that would be perfect matches to the reference genome. The pslFixCdsJoinGap
# will fixup those records with unusual alignments due to frameshifts of
# various sorts as found in the rna.cds file:
-genePredToFakePsl -qSizes=\$downloadDir/rna.sizes \$db \$db.ncbiRefSeq.gp \\
+genePredToFakePsl -qSizes=\$downloadDir/rna.sizes $fakePslSizes \$db \$db.ncbiRefSeq.gp \\
  stdout \$db.fake.cds \\
  | pslFixCdsJoinGap stdin \$asmId.rna.cds \$db.fake.psl
pslCat -nohead \$db.psl.gz | cut -f10,14 > \$db.psl.names
if [ -s \$db.psl.names ]; then
  pslSomeRecords -tToo -not \$db.fake.psl \$db.psl.names \$db.someRecords.psl
else
  cp -p \$db.fake.psl \$db.someRecords.psl
fi
pslSort dirs stdout \\
  ./tmpdir \$db.psl.gz \$db.someRecords.psl \\
-  | (pslCheck -quiet -db=\$db -pass=stdout -fail=\$asmId.\$db.fail.psl stdin || true) \\
+  | (pslCheck -quiet $pslTargetSizes -pass=stdout -fail=\$asmId.\$db.fail.psl stdin || true) \\
  | pslPosTarget stdin stdout \\
  | pslRecalcMatch -ignoreQMissing stdin $dbTwoBit \$downloadDir/\$asmId.rna.fa.gz stdout \\
  | sort -k14,14 -k16,16n | gzip -c > \$asmId.\$db.psl.gz
rm -fr ./tmpdir
-pslCheck -db=\$db \$asmId.\$db.psl.gz
+pslCheck $pslTargetSizes \$asmId.\$db.psl.gz
_EOF_
  );
  $bossScript->execute();
} # doProcess

#########################################################################
# * step: load [dbHost]
sub doLoad {
  my $runDir = "$buildDir";
  &HgAutomate::mustMkdir($runDir);
  my $whatItDoes = "load NCBI processed files into track files.";
  my $bossScript = newBash HgRemoteScript("$runDir/doLoad.bash", $dbHost,
    $runDir, $whatItDoes);
  my $gbdbDir = "$HgAutomate::gbdb/\$db/ncbiRefSeq";
  my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
+  $dbTwoBit = $target2bit if (-s "$target2bit");
+
+  my $genePredCheckDb = "genePredCheck -db=\$db";
+  if (! $dbExists) {
+    $genePredCheckDb = "genePredCheck";
+  }

  $bossScript->add(<<_EOF_
# establish all variables to use here
export db=$db
export asmId=$asmId
_EOF_
  );
+  if (! $dbExists) {
+    my $hostSizes = "$HgAutomate::clusterData/\$db/chrom.sizes";
+    $hostSizes = $targetSizes if (-s "$targetSizes");
+    $bossScript->add(<<_EOF_
+export target2bit=$dbTwoBit
+export targetSizes=$hostSizes
+
+wget -O bigGenePred.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigGenePred.as'
+wget -O bigPsl.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigPsl.as'
+
+genePredToBigGenePred process/\$db.ncbiRefSeq.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeq.bigGp
+bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
+  \$db.ncbiRefSeq.bigGp \$targetSizes \\
+  \$db.ncbiRefSeq.bb
+if [ -s process/\$db.curated.gp ]; then
+  genePredToBigGenePred process/\$db.curated.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqCurated.bigGp
+  bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
+    \$db.ncbiRefSeqCurated.bigGp \$targetSizes \\
+    \$db.ncbiRefSeqCurated.bb
+  rm -f \$db.ncbiRefSeqCurated.bigGp
+fi
+if [ -s process/\$db.predicted.gp ]; then
+  genePredToBigGenePred process/\$db.predicted.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqPredicted.bigGp
+  bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
+    \$db.ncbiRefSeqPredicted.bigGp \$targetSizes \\
+    \$db.ncbiRefSeqPredicted.bb
+  rm -f \$db.ncbiRefSeqPredicted.bigGp
+fi
+if [ -s "process/\$db.other.bb" ]; then
+  ln -f -s process/\$db.other.bb \$db.ncbiRefSeqOther.bb
+fi
+if [ -s "process/ncbiRefSeqOther.ix" ]; then
+  ln -f -s process/ncbiRefSeqOther.ix ./\$db.ncbiRefSeqOther.ix
+  ln -f -s process/ncbiRefSeqOther.ixx ./\$db.ncbiRefSeqOther.ixx
+fi
+ln -f -s process/ncbiRefSeqVersion.txt ./\$db.ncbiRefSeqVersion.txt
+# select only coding genes to have CDS records
+
+awk -F" " '\$6 != \$7 {print \$1;}' process/\$db.ncbiRefSeq.gp \\
+  | sort -u > coding.cds.name.list
+
+join -t\$'\t' coding.cds.name.list process/\$asmId.rna.cds \\
+  > \$db.ncbiRefSeqCds.tab
+
+rm -f coding.cds.name.list
+
+ln -f -s process/\$asmId.\$db.ncbiRefSeqLink.tab .
+cut -f6 \$asmId.\$db.ncbiRefSeqLink.tab | grep . \\
+  | sort -u > \$db.ncbiRefSeqLink.protAcc.list
+
+zcat download/\${asmId}_protein.faa.gz \\
+  | sed -e 's/ .*//;' | faToTab -type=protein -keepAccSuffix stdin stdout \\
+  | sort | join -t\$'\\t' \$db.ncbiRefSeqLink.protAcc.list - \\
+  > \$db.ncbiRefSeqPepTable.tab
+
+# and load the fasta peptides, again, only those for items that exist
+pslCat -nohead process/\$asmId.\$db.psl.gz | cut -f10 \\
+  | sort -u > \$db.psl.used.rna.list
+cut -f5 process/\$asmId.\$db.ncbiRefSeqLink.tab \\
+  | grep . | sort -u > \$db.mrnaAcc.name.list
+sort -u \$db.psl.used.rna.list \$db.mrnaAcc.name.list > \$db.rna.select.list
+cut -f1 process/\$db.ncbiRefSeq.gp | sort -u > \$db.gp.name.list
+comm -12 \$db.rna.select.list \$db.gp.name.list > \$db.toLoad.rna.list
+comm -23 \$db.rna.select.list \$db.gp.name.list > \$db.not.used.rna.list
+faSomeRecords download/\$asmId.rna.fa.gz \$db.toLoad.rna.list \$db.rna.fa
+grep '^>' \$db.rna.fa | sed -e 's/^>//' | sort > \$db.rna.found.list
+comm -13 \$db.rna.found.list \$db.gp.name.list > \$db.noRna.available.list
+
+# If \$db.noRna.available.list is not empty but items are on chrM,
+# make fake cDNA sequence for them using chrM sequence
+# since NCBI puts proteins, not coding RNAs, in the GFF.
+if [ -s \$db.noRna.available.list ]; then
+  pslCat -nohead process/\$asmId.\$db.psl.gz \\
+    | grep -Fwf \$db.noRna.available.list \\
+    | grep chrM > missingChrMFa.psl
+  if [ -s missingChrMFa.psl ]; then
+    pslToBed missingChrMFa.psl stdout \\
+      | twoBitToFa -bed=stdin \$target2bit stdout >> \$db.rna.fa
+  fi
+fi
+
+export totalBases=`ave -col=2 \$targetSizes | grep "^total" | awk '{printf "%d", \$2}'`
+export basesCovered=`bedSingleCover.pl \$db.ncbiRefSeq.bigGp | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'`
+export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'`
+printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" \\
+  "\$totalBases" "\$percentCovered" > fb.ncbiRefSeq.\$db.txt
+
+rm -f \$db.ncbiRefSeq.bigGp
+
+pslToBigPsl -fa=download/\$asmId.rna.fa.gz -cds=process/\$asmId.rna.cds \\
+  process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.\$db.bigPsl
+bedToBigBed -type=bed12+13 -tab -as=bigPsl.as \\
+  \$asmId.\$db.bigPsl \$targetSizes \$asmId.\$db.bigPsl.bb
+_EOF_
+  );
+  } else {
  $bossScript->add(<<_EOF_
# loading the genePred tracks, all genes in one, and subsets
hgLoadGenePred -genePredExt \$db ncbiRefSeq process/\$db.ncbiRefSeq.gp
-genePredCheck -db=\$db ncbiRefSeq
+$genePredCheckDb ncbiRefSeq
if [ -s process/\$db.curated.gp ]; then
  hgLoadGenePred -genePredExt \$db ncbiRefSeqCurated process/\$db.curated.gp
-  genePredCheck -db=\$db ncbiRefSeqCurated
+  $genePredCheckDb ncbiRefSeqCurated
fi
if [ -s process/\$db.predicted.gp ]; then
  hgLoadGenePred -genePredExt \$db ncbiRefSeqPredicted process/\$db.predicted.gp
-  genePredCheck -db=\$db ncbiRefSeqPredicted
+  $genePredCheckDb ncbiRefSeqPredicted
fi
mkdir -p $gbdbDir
ln -f -s `pwd`/process/\$db.other.bb $gbdbDir/ncbiRefSeqOther.bb
hgBbiDbLink \$db ncbiRefSeqOther $gbdbDir/ncbiRefSeqOther.bb
ln -f -s `pwd`/process/ncbiRefSeqOther.ix{,x} $gbdbDir/
ln -f -s `pwd`/process/ncbiRefSeqVersion.txt $gbdbDir/

# select only coding genes to have CDS records
-awk -F"\t" '\$6 != \$7 {print \$1;}' process/\$db.ncbiRefSeq.gp \\
+awk -F\$'\\t' '\$6 != \$7 {print \$1;}' process/\$db.ncbiRefSeq.gp \\
  | sort -u > coding.cds.name.list
-join -t'\t' coding.cds.name.list process/\$asmId.rna.cds \\
+join -t\$'\\t' coding.cds.name.list process/\$asmId.rna.cds \\
  | hgLoadSqlTab \$db ncbiRefSeqCds ~/kent/src/hg/lib/cdsSpec.sql stdin

# loading the cross reference data
hgLoadSqlTab \$db ncbiRefSeqLink ~/kent/src/hg/lib/ncbiRefSeqLink.sql \\
  process/\$asmId.\$db.ncbiRefSeqLink.tab

# prepare Pep table, select only those peptides that match loaded items
hgsql -N -e 'select protAcc from ncbiRefSeqLink;' \$db | grep . \\
  | sort -u > \$db.ncbiRefSeqLink.protAcc.list
zcat download/\${asmId}_protein.faa.gz \\
  | sed -e 's/ .*//;' | faToTab -type=protein -keepAccSuffix stdin stdout \\
-  | sort | join -t'\t' \$db.ncbiRefSeqLink.protAcc.list - \\
+  | sort | join -t\$'\\t' \$db.ncbiRefSeqLink.protAcc.list - \\
  > \$db.ncbiRefSeqPepTable.tab
hgLoadSqlTab \$db ncbiRefSeqPepTable ~/kent/src/hg/lib/pepPred.sql \\
  \$db.ncbiRefSeqPepTable.tab

# and load the fasta peptides, again, only those for items that exist
pslCat -nohead process/\$asmId.\$db.psl.gz | cut -f10 \\
  | sort -u > \$db.psl.used.rna.list
cut -f5 process/\$asmId.\$db.ncbiRefSeqLink.tab | grep . \\
  | sort -u > \$db.mrnaAcc.name.list
sort -u \$db.psl.used.rna.list \$db.mrnaAcc.name.list > \$db.rna.select.list
cut -f1 process/\$db.ncbiRefSeq.gp | sort -u > \$db.gp.name.list
comm -12 \$db.rna.select.list \$db.gp.name.list > \$db.toLoad.rna.list
comm -23 \$db.rna.select.list \$db.gp.name.list > \$db.not.used.rna.list
faSomeRecords download/\$asmId.rna.fa.gz \$db.toLoad.rna.list \$db.rna.fa
grep '^>' \$db.rna.fa | sed -e 's/^>//' | sort > \$db.rna.found.list

@@ -501,30 +639,31 @@
    pslToBed missingChrMFa.psl stdout \\
      | twoBitToFa -bed=stdin $dbTwoBit stdout >> \$db.rna.fa
  fi
fi

mkdir -p $gbdbDir
ln -f -s `pwd`/\$db.rna.fa $gbdbDir/seqNcbiRefSeq.rna.fa
hgLoadSeq -drop -seqTbl=seqNcbiRefSeq -extFileTbl=extNcbiRefSeq \$db $gbdbDir/seqNcbiRefSeq.rna.fa

hgLoadPsl \$db -table=ncbiRefSeqPsl process/\$asmId.\$db.psl.gz

featureBits \$db ncbiRefSeq > fb.ncbiRefSeq.\$db.txt 2>&1
cat fb.ncbiRefSeq.\$db.txt 2>&1
_EOF_
  );
+  } # if ($dbExists)

  $bossScript->execute();
} # doLoad

#########################################################################
# * step: cleanup [fileServer]
sub doCleanup {
  my $runDir = "$buildDir";
  my $whatItDoes = "It cleans up or compresses intermediate files.";
  my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer,
    $runDir, $whatItDoes);
  $bossScript->add(<<_EOF_
gzip -f download/{rna.sizes,*.raFile.txt}
gzip -f process/*.{tab,txt,gp,gff,psl,cds,bed}
# Leave this one uncompressed, gbdb links to it:
gunzip process/ncbiRefSeqVersion.txt.gz

@@ -534,39 +673,42 @@
} # doCleanup

#########################################################################
# main

# Prevent "Suspended (tty input)" hanging:
&HgAutomate::closeStdin();

# Make sure we have valid options and exactly 1 argument:
&checkOptions();
&usage(1) if (scalar(@ARGV) != 5);

$toGpWarnOnly = 0;
$toGpWarnOnly = 1 if ($opt_toGpWarnOnly);
+$liftFile = $opt_liftFile ? $opt_liftFile : "";
+$targetSizes = $opt_targetSizes ? $opt_targetSizes : "";
+$target2bit = $opt_target2bit ? $opt_target2bit : "";

$secondsStart = `date "+%s"`;
chomp $secondsStart;

# expected command line arguments after options are processed
($genbankRefseq, $subGroup, $species, $asmId, $db) = @ARGV;
$ftpDir = "genomes/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId";

-if ( ! -s "/hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt") {
+if ( -z "$liftFile" && ! -s "/hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt") {
  die "ERROR: can not find /hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt\n\t need to run doIdKeys.pl for $db before this procedure.";
}

# Force debug and verbose until this is looking pretty solid:
# $opt_debug = 1;
# $opt_verbose = 3 if ($opt_verbose < 3);

# Establish what directory we will work in.
my $date = `date +%Y-%m-%d`;
chomp $date;
$buildDir = $opt_buildDir ? $opt_buildDir :
  "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/ncbiRefSeq.$date";

# may be working on a 2bit file that does not have a database browser
$dbExists = 0;
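
A brief usage sketch of the new options (not part of the commit; every path, assembly name, and GCF accession below is a made-up placeholder). The file named by -liftFile follows the standard five-column liftUp format that the 'select 0,name,chromEnd,chrom,chromEnd from ucscToRefSeq' query and the idKeys join above both produce: offset, NCBI sequence name, its size, local sequence name, its size.

    # hypothetical invocation for an assembly-hub genome that has no local
    # UCSC database; -liftFile may be omitted when idKeys can build the lift
    src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
        -target2bit=/some/path/asmHub.2bit \
        -targetSizes=/some/path/asmHub.chrom.sizes \
        -liftFile=/some/path/ncbiToAsmHub.lift \
        refseq vertebrate_other Some_species GCF_000000000.0_asmHub asmHub

    # illustrative -liftFile line (tab separated):
    # offset  NCBI name     NCBI size  local name  local size
    # 0       NC_000001.11  248956422  chr1        248956422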