531a1da73a9c257444ee0ce9884d271f5f7bcd5d
hiram
  Thu Apr 2 14:21:28 2020 -0700
now correctly making bigZips/genes downloads refs #24524;

diff --git src/hg/utils/automation/makeDownloads.pl src/hg/utils/automation/makeDownloads.pl
index 5dac8d6..0d30432 100755
--- src/hg/utils/automation/makeDownloads.pl
+++ src/hg/utils/automation/makeDownloads.pl
@@ -88,62 +88,70 @@
 3. AGP, RepeatMasker .out and trfBig .bed files are in their usual places under
    $HgAutomate::clusterData/\$db/ .  (Will complain if not able to find.)
 4. RepeatMasker version information obtained from /hive/data/staging/data/RepeatMasker/
 5. Data use conditions are generic; they may need to be made specific.
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
 # Command line args: db
 my ($db);
 # Other:
 my ($topDir, $scriptDir, $trfRunDir, $trfRunDirRel);
-my ($chromBased, @chroms, %chromRoots, $chromGz, $geneTable);
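+# @geneTableList: gene model tables present in $db, dumped as gtf files in bigZips/genes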
+my ($chromBased, @chroms, %chromRoots, $chromGz, $geneTable, @geneTableList);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'allowMissedTrfs',
 		      'noChromRoot',
 		      'ignoreRepeatMasker',
 		      'noChromFiles',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 
 #*** libify?
 sub dbHasTable {
   my ($dbHost, $db, $table) = @_;
   my $rows = `echo show tables like "'$table'" |
               $HgAutomate::runSSH $dbHost hgsql -N $db | wc -l`;
   return ($rows > 0);
 } # dbHasTable
 
+sub getGeneTableList {
+  # Construct list of gene tables to make gtf file dumps
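+  # Only tables that actually exist in this assembly's database are included.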
+  foreach my $table ('ncbiRefSeq', 'refGene', 'ensGene', 'knownGene') {
+    if (&dbHasTable($dbHost, $db, $table)) {
+      push @geneTableList, $table;
+    }
+  }
+} # getGeneTableList
 
 sub getGeneTable {
   # If there is a suitable table for generating upstream genes, return it.
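+  # Tables are tried in order of preference; the first one found is returned.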
-  foreach my $table ('refGene', 'mgcGenes') {
+  foreach my $table ('ncbiRefSeq', 'refGene', 'ensGene', 'mgcGenes') {
     if (&dbHasTable($dbHost, $db, $table)) {
       return $table;
     }
   }
   return undef;
 } # getGeneTable
 
 
 #########################################################################
 # * step: template [workhorse]
 
 sub compressChromFiles {
   # To be called only when assembly is chrom-based.
   # Expect to find per-chromosome .agp and RepeatMasker .out files in
   # directories with names distilled from chrom names.
@@ -686,30 +694,74 @@
 To load one of the tables directly into your local mirror database,
 for example the table chromInfo:
 ## create table from the sql definition
 \$ hgsql $db < chromInfo.sql
 ## load data from the txt.gz file
 \$ zcat chromInfo.txt.gz | hgsql $db --local-infile=1 \
         -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE chromInfo;'
 
 _EOF_
   ;
   &printAssemblyUsage($fh, $Organism, $assemblyLabel);
   &printTableSpecificUsage($fh);
   close($fh);
 } # makeDatabaseReadme
 
+sub makeBigZipsGenesReadme {
+  # Dump out a README.txt for bigZips/genes
+  my ($runDir) = @_;
+  my $fh = &HgAutomate::mustOpen(">$runDir/README.bigZipsGenes.txt");
+  print $fh <<_EOF_
+Introduction
+^^^^^^^^^^^^
+
+This directory contains GTF files for the main gene transcript sets, where available. They
+are generated from the following gene model tables: ncbiRefSeq, refGene, ensGene, knownGene.
+
+Not all files are available for every assembly. For more information on the source tables,
+see the respective track description page for the assembly. For example:
+    http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg38&g=refGene
+
+Information on the different gene models can also be found in our genes FAQ:
+    https://genome.ucsc.edu/FAQ/FAQgenes.html
+
+Generation
+^^^^^^^^^^
+
+The files are created with the genePredToGtf utility, using the additional -utr flag. The
+utility can be found in the following directory:
+    http://hgdownload.soe.ucsc.edu/admin/exe/
+
+An example command is as follows:
+    genePredToGtf -utr hg38 ncbiRefSeq hg38.ncbiRefSeq.gtf
+
+Additional Resources
+^^^^^^^^^^^^^^^^^^^^
+
+Information on the GTF format and how it relates to the GFF format:
+    https://genome.ucsc.edu/FAQ/FAQformat.html#format4
+
+Information about the different gene models available in the Genome Browser:
+    https://genome.ucsc.edu/FAQ/FAQgenes.html
+
+More information on how the files were generated:
+    https://genome.ucsc.edu/FAQ/FAQdownloads.html#download37
+_EOF_
+  ;
+  close($fh);
+}	#	sub makeBigZipsGenesReadme
+
 sub makeBigZipsReadme {
   # Dump out a README.txt for bigZips/ .
   my ($runDir) = @_;
   my ($Organism, $assemblyDate, $assemblyLabel,
       $organism, $consortium, $sequencingCenter, $projectUrl) =
 	&getDescriptives();
   my $rmVersion = "";
   if ( ! -s "/hive/data/staging/data/RepeatMasker/RepeatMasker" ) {
     die "can not read /hive/data/staging/data/RepeatMasker/RepeatMasker\n";
   }
   $rmVersion = `grep -w open /hive/data/staging/data/RepeatMasker/RepeatMasker | grep -w version | grep -w RepeatMasker`;
   chomp $rmVersion;
   $rmVersion =~ s/#\s*//;
   my $emblLib = "";
   if ( ! -s "/hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl" ) {
@@ -823,32 +875,36 @@
 _EOF_
     ;
   }
   if (&dbHasTable($dbHost, $db, 'refGene')) {
     print $fh <<_EOF_
 refMrna.fa.gz - RefSeq mRNA from the same species as the genome.
     This sequence data is updated once a week via automatic GenBank
     updates.
 
 _EOF_
     ;
   }
   my $dunno = '*** ??? ***';
   if ($geneTable) {
     my $geneDesc;
-    if ($geneTable eq 'refGene') {
+    if ($geneTable eq 'ncbiRefSeq') {
+      $geneDesc = 'NCBI RefSeq';
+    } elsif ($geneTable eq 'refGene') {
       $geneDesc = 'RefSeq';
+    } elsif ($geneTable eq 'ensGene') {
+      $geneDesc = 'Ensembl';
     } elsif ($geneTable eq 'mgcGenes') {
       $geneDesc = 'MGC';
     } elsif ($geneTable eq 'xenoRefGene') {
       $geneDesc = "non-$Organism RefSeq";
     } else {
       $geneDesc = $dunno;
     }
     print $fh <<_EOF_
 upstream1000.fa.gz - Sequences 1000 bases upstream of annotated
     transcription starts of $geneDesc genes with annotated 5' UTRs.
 _EOF_
     ;
     if ($geneDesc ne $dunno) {
       print $fh <<_EOF_
     This file is updated weekly so it might be slightly out of sync with
@@ -909,31 +965,30 @@
     wget --timestamping \
         'ftp://hgdownload.soe.ucsc.edu/goldenPath/$db/bigZips/chromFa.tar.gz' \
         -O chromFa.tar.gz
 
 To unpack the *.tar.gz files:
     tar xvzf <file>.tar.gz
 To uncompress the fa.gz files:
     gunzip <file>.fa.gz
 
 _EOF_
   ;
   &printAssemblyUsage($fh, $Organism, $assemblyLabel);
   close($fh);
 } # makeBigZipsReadme
 
-
 sub makeChromosomesReadme {
   # Dump out a README.txt for chromosomes/ .
   my ($runDir) = @_;
   my ($Organism, $assemblyDate, $assemblyLabel,
       $organism, $consortium, $sequencingCenter, $projectUrl) =
 	&getDescriptives();
   my $fh = &HgAutomate::mustOpen(">$runDir/README.chromosomes.txt");
   print $fh <<_EOF_
 This directory contains the $assemblyDate assembly of the
     $organism genome ($db, $assemblyLabel)
     in one gzip-compressed FASTA file per chromosome.
 
 For more information about this assembly, please note the NCBI resources:
     https://www.ncbi.nlm.nih.gov/genome/$ncbiGenomeId
     https://www.ncbi.nlm.nih.gov/genome/assembly/$ncbiAssemblyId
@@ -1096,37 +1151,41 @@
     &compressScaffoldFiles($runDir, $bossScript);
   }
   $bossScript->add(<<_EOF_
 # Add md5sum.txt and README.txt to each dir:
 foreach d (bigZips $chromGz database liftOver)
   cd $runDir/\$d
   if (\$d != "database" && \$d != "liftOver") then
     if (-s $db.2bit) then
 	md5sum $db.2bit $db.chrom.sizes *.gz > md5sum.txt
     else
 	md5sum *.gz > md5sum.txt
     endif
   endif
   mv $runDir/README.\$d.txt README.txt
 end
-
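+# If a bigZips/genes directory exists, give it its own README.txt: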
+if (-d "$runDir/bigZips/genes") then
+  cd $runDir/bigZips/genes
+  mv $runDir/README.bigZipsGenes.txt README.txt
+endif
 _EOF_
   );
 
   # Create README.*.txt files which will be moved into subdirs by the script.
   &makeDatabaseReadme($runDir);
   &makeBigZipsReadme($runDir);
+  &makeBigZipsGenesReadme($runDir);
   &makeChromosomesReadme($runDir) if ($chromBased);
   &makeLiftOverReadme($runDir);
 
   $bossScript->execute();
 } # doCompress
 
 
 #########################################################################
 # * step: install [dbHost]
 
 sub doInstall {
   my $runDir = "$topDir/goldenPath";
   my $whatItDoes =
 "It creates links from the web server's goldenPath download area to the
 actual compressed files.";
@@ -1135,43 +1194,69 @@
   my $gp = "$HgAutomate::goldenPath/$db";
   $bossScript->add(<<_EOF_
 mkdir -p $gp
 foreach d (bigZips $chromGz database)
   rm -rf $gp/\$d
   mkdir $gp/\$d
   ln -s $runDir/\$d/*.{gz,txt,2bit,sizes} $gp/\$d/
 end
 # Don't blow away all of liftOver, just the README -- there may be
 # pre-existing links that are not regenerated above.
 mkdir -p $gp/liftOver
 rm -f $gp/liftOver/README.txt
 ln -s $runDir/liftOver/README.txt $gp/liftOver/README.txt
 _EOF_
   );
+
   if ($geneTable) {
     $bossScript->add(<<_EOF_
 cd $runDir/bigZips
 foreach size (1000 2000 5000)
   echo \$size
   featureBits $db $geneTable:upstream:\$size -fa=stdout \\
   | gzip -c > upstream\$size.fa.gz
 end
 md5sum up*.gz >> md5sum.txt
 ln -s $runDir/bigZips/up*.gz $gp/bigZips/
 _EOF_
     );
+  }	#	if ($geneTable)
+
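+  # Dump each table in @geneTableList as a gzipped GTF file under bigZips/genes: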
+  if (scalar(@geneTableList) > 0) {
+    $bossScript->add(<<_EOF_
+cd $runDir/bigZips
+mkdir -p genes $gp/bigZips/genes
+_EOF_
+    );
+    foreach my $geneTbl (@geneTableList) {
+      $bossScript->add(<<_EOF_
+genePredToGtf -utr $db $geneTbl stdout | gzip -c > genes/$geneTbl.gtf.gz
+_EOF_
+      );
     }
+    $bossScript->add(<<_EOF_
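+# Checksum the gtf files and link them into the goldenPath download area: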
+cd $runDir/bigZips/genes
+md5sum *.gtf.gz > md5sum.txt
+rm -fr $gp/bigZips/genes
+mkdir $gp/bigZips/genes
+ln -s $runDir/bigZips/genes/*.gtf.gz $gp/bigZips/genes/
+ln -s $runDir/bigZips/genes/md5sum.txt $gp/bigZips/genes/
+ln -s $runDir/bigZips/genes/README.txt $gp/bigZips/genes/
+_EOF_
+    );
+  }	#	if (scalar(@geneTableList) > 0)
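+  # For example, with db=hg38 and the ncbiRefSeq table present, the
+  # generated csh fragment would resemble:
+  #   genePredToGtf -utr hg38 ncbiRefSeq stdout | gzip -c > genes/ncbiRefSeq.gtf.gz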
+
   $bossScript->execute();
 } # doInstall
 
 sub requireVar {
   # Ensure that var is in %config and return its value.
   # Remove it from %config so we can check for unrecognized contents.
   my ($var, $config) = @_;
   my $val = $config->{$var}
     || die "Error: $configFile is missing required variable \"$var\".\n" .
       "For a detailed list of required variables, run \"$base -help\".\n";
   delete $config->{$var};
   return $val;
 } # requireVar
 
 sub optionalVar {
@@ -1254,30 +1339,31 @@
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 ($db) = @ARGV;
 
 $topDir = "$HgAutomate::clusterData/$db";
 $configFile= "$topDir/$db.config.ra";
 &parseConfig($configFile);
 $scriptDir = "$topDir/jkStuff";
 $trfRunDirRel = "$HgAutomate::trackBuild/simpleRepeat";
 $trfRunDir = "$topDir/$trfRunDirRel";
 $geneTable = &getGeneTable();
+&getGeneTableList();
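+# @geneTableList is used by the install step to create the bigZips/genes gtf dumps.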
 
 if (! -e "$topDir/$db.2bit") {
   die "Sorry, this script requires $topDir/$db.2bit.\n";
 }
 if (! -e "$topDir/chrom.sizes") {
   die "Sorry, this script requires $topDir/chrom.sizes.\n";
 }
 @chroms = split("\n", `awk '{print \$1;}' $topDir/chrom.sizes`);
 $chromBased = (scalar(@chroms) <= $HgAutomate::splitThreshold) && ! $opt_noChromFiles;
 if ($chromBased) {
   foreach my $chr (@chroms) {
     my $chrRoot = $chr;
 	$chrRoot =~ s/^chr//;
 	$chrRoot =~ s/_random$//;
     if (! $opt_noChromRoot) {