531a1da73a9c257444ee0ce9884d271f5f7bcd5d hiram Thu Apr 2 14:21:28 2020 -0700 now correctly making bigZips/genes downloads refs #24524; diff --git src/hg/utils/automation/makeDownloads.pl src/hg/utils/automation/makeDownloads.pl index 5dac8d6..0d30432 100755 --- src/hg/utils/automation/makeDownloads.pl +++ src/hg/utils/automation/makeDownloads.pl @@ -88,62 +88,70 @@ 3. AGP, RepeatMasker .out and trfBig .bed files are in their usual places under $HgAutomate::clusterData/\$db/ . (Will complain if not able to find.) 4. RepeatMasker version information obtained from /hive/data/staging/data/RepeatMasker/ 5. Data use conditions are generic, they may need to be specific. " if ($detailed); print "\n"; exit $status; } # Globals: # Command line args: db my ($db); # Other: my ($topDir, $scriptDir, $trfRunDir, $trfRunDirRel); -my ($chromBased, @chroms, %chromRoots, $chromGz, $geneTable); +my ($chromBased, @chroms, %chromRoots, $chromGz, $geneTable, @geneTableList); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'allowMissedTrfs', 'noChromRoot', 'ignoreRepeatMasker', 'noChromFiles', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } #*** libify? sub dbHasTable { my ($dbHost, $db, $table) = @_; my $rows = `echo show tables like "'$table'" | $HgAutomate::runSSH $dbHost hgsql -N $db | wc -l`; return ($rows > 0); } # dbHasTable +sub getGeneTableList { + # Construct list of gene tables to make gtf file dumps + foreach my $table ('ncbiRefSeq', 'refGene', 'ensGene', 'knownGene') { + if (&dbHasTable($dbHost, $db, $table)) { + push @geneTableList, $table; + } + } +} # getGeneTableList sub getGeneTable { # If there is a suitable table for generating upstream genes, return it. - foreach my $table ('refGene', 'mgcGenes') { + foreach my $table ('ncbiRefSeq', 'refGene', 'ensGene', 'mgcGenes') { if (&dbHasTable($dbHost, $db, $table)) { return $table; } } return undef; } # getGeneTable ######################################################################### # * step: template [workhorse] sub compressChromFiles { # To be called only when assembly is chrom-based. # Expect to find per-chromosome .agp and RepeatMasker .out files in # directories with names distilled from chrom names. @@ -686,30 +694,74 @@ To load one of the tables directly into your local mirror database, for example the table chromInfo: ## create table from the sql definition \$ hgsql $db < chromInfo.sql ## load data from the txt.gz file \$ zcat chromInfo.txt.gz | hgsql $db --local-infile=1 \ -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE chromInfo;' _EOF_ ; &printAssemblyUsage($fh, $Organism, $assemblyLabel); &printTableSpecificUsage($fh); close($fh); } # makeDatabaseReadme +sub makeBigZipsGenesReadme { + # Dump out a README.txt for bigZips/genes + my ($runDir) = @_; + my $fh = &HgAutomate::mustOpen(">$runDir/README.bigZipsGenes.txt"); + print $fh <<_EOF_ +Introduction +^^^^^^^^^^^^ + +This directory contains GTF files for the main gene transcript sets where available. They are +sourced from the following gene model tables: ncbiRefSeq, refGene, ensGene, knownGene + +Not all files are available for every assembly. For more information on the source tables +see the respective data track description page in the assembly. For example: + http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg38&g=refGene + +Information on the different gene models can also be found in our genes FAQ: + https://genome.ucsc.edu/FAQ/FAQgenes.html + +Generation +^^^^^^^^^^ + +The files are created using the genePredToGtf utility with the additional -utr flag. Utilities +can be found in the following directory: + http://hgdownload.soe.ucsc.edu/admin/exe/ + +An example command is as follows: + genePredToGtf -utr hg38 ncbiRefSeq hg38.ncbiRefSeq.gtf + +Additional Resources +^^^^^^^^^^^^^^^^^^^^ + +Information on GTF format and how it is related to GFF format: + https://genome.ucsc.edu/FAQ/FAQformat.html#format4 + +Information about the different gene models available in the Genome Browser: + https://genome.ucsc.edu/FAQ/FAQgenes.html + +More information on how the files were generated: + https://genome.ucsc.edu/FAQ/FAQdownloads.html#download37 +_EOF_ + ; + close($fh); +} # sub makeBigZipsGenesReadme + sub makeBigZipsReadme { # Dump out a README.txt for bigZips/ . my ($runDir) = @_; my ($Organism, $assemblyDate, $assemblyLabel, $organism, $consortium, $sequencingCenter, $projectUrl) = &getDescriptives(); my $rmVersion = ""; if ( ! -s "/hive/data/staging/data/RepeatMasker/RepeatMasker" ) { die "can not read /hive/data/staging/data/RepeatMasker/RepeatMasker\n"; } $rmVersion = `grep -w open /hive/data/staging/data/RepeatMasker/RepeatMasker | grep -w version | grep -w RepeatMasker`; chomp $rmVersion; $rmVersion =~ s/#\s*//; my $emblLib = ""; if ( ! -s "/hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl" ) { @@ -823,32 +875,36 @@ _EOF_ ; } if (&dbHasTable($dbHost, $db, 'refGene')) { print $fh <<_EOF_ refMrna.fa.gz - RefSeq mRNA from the same species as the genome. This sequence data is updated once a week via automatic GenBank updates. _EOF_ ; } my $dunno = '*** ??? ***'; if ($geneTable) { my $geneDesc; - if ($geneTable eq 'refGene') { + if ($geneTable eq 'ncbiRefSeq') { + $geneDesc = 'NCBI RefSeq'; + } elsif ($geneTable eq 'refGene') { $geneDesc = 'RefSeq'; + } elsif ($geneTable eq 'ensGene') { + $geneDesc = 'Ensembl'; } elsif ($geneTable eq 'mgcGenes') { $geneDesc = 'MGC'; } elsif ($geneTable eq 'xenoRefGene') { $geneDesc = 'non-$Organism RefSeq'; } else { $geneDesc = $dunno; } print $fh <<_EOF_ upstream1000.fa.gz - Sequences 1000 bases upstream of annotated transcription starts of $geneDesc genes with annotated 5' UTRs. _EOF_ ; if ($geneDesc ne $dunno) { print $fh <<_EOF_ This file is updated weekly so it might be slightly out of sync with @@ -909,31 +965,30 @@ wget --timestamping \ 'ftp://hgdownload.soe.ucsc.edu/goldenPath/$db/bigZips/chromFa.tar.gz' \ -O chromFa.tar.gz To unpack the *.tar.gz files: tar xvzf .tar.gz To uncompress the fa.gz files: gunzip .fa.gz _EOF_ ; &printAssemblyUsage($fh, $Organism, $assemblyLabel); close($fh); } # makeBigZipsReadme - sub makeChromosomesReadme { # Dump out a README.txt for chromsomes/ . my ($runDir) = @_; my ($Organism, $assemblyDate, $assemblyLabel, $organism, $consortium, $sequencingCenter, $projectUrl) = &getDescriptives(); my $fh = &HgAutomate::mustOpen(">$runDir/README.chromosomes.txt"); print $fh <<_EOF_ This directory contains the $assemblyDate assembly of the $organism genome ($db, $assemblyLabel) in one gzip-compressed FASTA file per chromosome. For more information about this assembly, please note the NCBI resources: https://www.ncbi.nlm.nih.gov/genome/$ncbiGenomeId https://www.ncbi.nlm.nih.gov/genome/assembly/$ncbiAssemblyId @@ -1096,37 +1151,41 @@ &compressScaffoldFiles($runDir, $bossScript); } $bossScript->add(<<_EOF_ # Add md5sum.txt and README.txt to each dir: foreach d (bigZips $chromGz database liftOver) cd $runDir/\$d if (\$d != "database" && \$d != "liftOver") then if (-s $db.2bit) then md5sum $db.2bit $db.chrom.sizes *.gz > md5sum.txt else md5sum *.gz > md5sum.txt endif endif mv $runDir/README.\$d.txt README.txt end - +if (-d "$runDir/bigZips/genes") then + cd $runDir/bigZips/genes + mv $runDir/README.bigZipsGenes.txt README.txt +endif _EOF_ ); # Create README.*.txt files which will be moved into subdirs by the script. &makeDatabaseReadme($runDir); &makeBigZipsReadme($runDir); + &makeBigZipsGenesReadme($runDir); &makeChromosomesReadme($runDir) if ($chromBased); &makeLiftOverReadme($runDir); $bossScript->execute(); } # doCompress ######################################################################### # * step: install [dbHost] sub doInstall { my $runDir = "$topDir/goldenPath"; my $whatItDoes = "It creates links from the web server's goldenPath download area to the actual compressed files."; @@ -1135,43 +1194,69 @@ my $gp = "$HgAutomate::goldenPath/$db"; $bossScript->add(<<_EOF_ mkdir -p $gp foreach d (bigZips $chromGz database) rm -rf $gp/\$d mkdir $gp/\$d ln -s $runDir/\$d/*.{gz,txt,2bit,sizes} $gp/\$d/ end # Don't blow away all of liftOver, just the README -- there may be # pre-existing links that are not regenerated above. mkdir -p $gp/liftOver rm -f $gp/liftOver/README.txt ln -s $runDir/liftOver/README.txt $gp/liftOver/README.txt _EOF_ ); + if ($geneTable) { $bossScript->add(<<_EOF_ cd $runDir/bigZips foreach size (1000 2000 5000) echo \$size featureBits $db $geneTable:upstream:\$size -fa=stdout \\ | gzip -c > upstream\$size.fa.gz end md5sum up*.gz >> md5sum.txt ln -s $runDir/bigZips/up*.gz $gp/bigZips/ _EOF_ ); + } # if ($geneTable) + + if (scalar(@geneTableList) > 0) { + $bossScript->add(<<_EOF_ +cd $runDir/bigZips +mkdir -p genes $gp/bigZips/genes +_EOF_ + ); + foreach my $geneTbl (@geneTableList) { + $bossScript->add(<<_EOF_ +genePredToGtf -utr $db $geneTbl stdout | gzip -c > genes/$geneTbl.gtf.gz +_EOF_ + ); } + $bossScript->add(<<_EOF_ +cd $runDir/bigZips/genes +md5sum *.gtf.gz > md5sum.txt +rm -fr $gp/bigZips/genes +mkdir $gp/bigZips/genes +ln -s $runDir/bigZips/genes/*.gtf.gz $gp/bigZips/genes/ +ln -s $runDir/bigZips/genes/md5sum.txt $gp/bigZips/genes/ +ln -s $runDir/bigZips/genes/README.txt $gp/bigZips/genes/ +_EOF_ + ); + } # if (scalar(@geneTableList) > 0) + $bossScript->execute(); } # doInstall sub requireVar { # Ensure that var is in %config and return its value. # Remove it from %config so we can check for unrecognized contents. my ($var, $config) = @_; my $val = $config->{$var} || die "Error: $configFile is missing required variable \"$var\".\n" . "For a detailed list of required variables, run \"$base -help\".\n"; delete $config->{$var}; return $val; } # requireVar sub optionalVar { @@ -1254,30 +1339,31 @@ # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); &usage(1) if (scalar(@ARGV) != 1); ($db) = @ARGV; $topDir = "$HgAutomate::clusterData/$db"; $configFile= "$topDir/$db.config.ra"; &parseConfig($configFile); $scriptDir = "$topDir/jkStuff"; $trfRunDirRel = "$HgAutomate::trackBuild/simpleRepeat"; $trfRunDir = "$topDir/$trfRunDirRel"; $geneTable = &getGeneTable(); +&getGeneTableList(); if (! -e "$topDir/$db.2bit") { die "Sorry, this script requires $topDir/$db.2bit.\n"; } if (! -e "$topDir/chrom.sizes") { die "Sorry, this script requires $topDir/chrom.sizes.\n"; } @chroms = split("\n", `awk '{print \$1;}' $topDir/chrom.sizes`); $chromBased = (scalar(@chroms) <= $HgAutomate::splitThreshold) && ! $opt_noChromFiles; if ($chromBased) { foreach my $chr (@chroms) { my $chrRoot = $chr; $chrRoot =~ s/^chr//; $chrRoot =~ s/_random$//; if (! $opt_noChromRoot) {