src/hg/utils/automation/doAssemblyHub.pl 03886581a00e4d626316a657c170287b1f266e35

03886581a00e4d626316a657c170287b1f266e35
hiram
  Mon Sep 12 15:55:14 2022 -0700
beginning a oneAndDone browser build script and therefore adding a new argument to the assembly build for a concise UCSC style dbName refs #29811

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index f709fac..637607e 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -23,30 +23,31 @@
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_sourceDir
     $opt_species
     $opt_rmskSpecies
     $opt_ncbiRmsk
     $opt_noRmsk
     $opt_augustusSpecies
     $opt_noAugustus
     $opt_xenoRefSeq
     $opt_noXenoRefSeq
     $opt_ucscNames
+    $opt_dbName
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'download',   func => \&doDownload },
       { name => 'sequence',   func => \&doSequence },
       { name => 'assemblyGap',   func => \&doAssemblyGap },
       { name => 'chromAlias',   func => \&doChromAlias },
       { name => 'gatewayPage',   func => \&doGatewayPage },
       { name => 'cytoBand',   func => \&doCytoBand },
       { name => 'gc5Base',   func => \&doGc5Base },
       { name => 'repeatMasker',   func => \&doRepeatMasker },
       { name => 'simpleRepeat',   func => \&doSimpleRepeat },
       { name => 'allGaps',   func => \&doAllGaps },
       { name => 'idKeys',   func => \&doIdKeys },
@@ -66,30 +67,31 @@
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $sourceDir = "/hive/data/outside/ncbi/genomes";
 my $species = "";       # usually found in asmId_assembly_report.txt
 my $ftpDir = "";	# will be determined from given asmId
 my $rmskSpecies = "";
 my $noRmsk = 0;	# when RepeatMasker is not possible, such as bacteria
 my $ncbiRmsk = 0;	# when =1 call doRepeatMasker.pl
                         # with -ncbiRmsk=path.out.gz and -liftSpec=...
 my $augustusSpecies = "human";
 my $xenoRefSeq = "/hive/data/genomes/asmHubs/xenoRefSeq";
 my $noAugustus = 0;     # bacteria do *not* create an augustus track
 my $noXenoRefSeq = 0;	# bacteria do *not* create a xenoRefSeq track
 my $ucscNames = 0;  # default 'FALSE' (== 0)
+my $dbName = "";  # default uses NCBI asmId for name, can specify: abcDef1
 my $workhorse = "hgwdev";  # default workhorse when none chosen
 my $fileServer = "hgwdev";  # default when none chosen
 my $bigClusterHub = "ku";  # default when none chosen
 my $smallClusterHub = "ku";  # default when none chosen
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 # key is original accession name from the remove.dups.list, value is 1
 my %dupAccessionList;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
@@ -97,30 +99,31 @@
 usage: $base [options] asmId
 required arguments:
     asmId          - assembly identifier at NCBI FTP site, examples:
                    - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc..
 
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir     Construct assembly hub in dir instead of default
        $HgAutomate::clusterData/asmHubs/refseqBuild/GC[AF]/123/456/789/asmId/
     -sourceDir dir    Find assembly in dir instead of default:
        $sourceDir/GC[AF]/123/456/789/asmId
     -ucscNames        Translate NCBI/INSDC/RefSeq names to UCSC names
                       default is to use the given NCBI/INSDC/RefSeq names
+    -dbName <name>    name for UCSC style database name, e.g. 'abcDef1'
     -species <name>   use this species designation if there is no
                       asmId_assembly_report.txt with an
                       'Organism name:' entry to obtain species
     -rmskSpecies <name> to override default 'species' name for repeat masker
                       the default is found in the asmId_asssembly_report.txt
                       e.g. -rmskSpecies=viruses
     -noRmsk           when RepeatMasker is not possible, such as bacteria
     -ncbiRmsk         use NCBI rm.out.gz file instead of local cluster run
                       for repeat masking
     -augustusSpecies <human|chicken|zebrafish> default 'human'
     -noAugustus       do *not* create the Augustus gene track
     -noXenoRefSeq     do *not* create the Xeno RefSeq gene track
     -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz
                 expanded directory of mrnas/ and xenoRefMrna.sizes, default
                 $xenoRefSeq
@@ -191,30 +194,31 @@
 # Command line args: asmId
 my ( $asmId);
 # Other:
 my ($buildDir, $secondsStart, $secondsEnd, $assemblySource);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'sourceDir=s',
 		      'rmskSpecies=s',
 		      'noRmsk',
 		      'ncbiRmsk',
 		      'augustusSpecies=s',
 		      'xenoRefSeq=s',
+		      'dbName=s',
 		      'noXenoRefSeq',
 		      'noAugustus',
 		      'ucscNames',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
 #########################################################################
@@ -842,35 +846,38 @@
     my $agpSource = "$nonNucAsm/unlocalized_scaffolds/AGP";
     my $agpOutput = "$runDir/$asmId.nonNucUnlocalized.agp.gz";
     my $agpNames = "$runDir/$asmId.nonNucUnlocalized.names";
     my $fastaOut = "$runDir/$asmId.nonNucUnlocalized.fa.gz";
     $partsDone += 1;
     if (needsUpdate($nonNucChr2scaf, $agpOutput)) {
       unlocalizedAgp($nonNucChr2scaf, $agpSource, $agpOutput, $agpNames);
       `touch -r $nonNucChr2scaf $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
       unlocalizedFasta($nonNucChr2scaf, $twoBitFile, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
+  my $defaultName = $asmId;
+  $defaultName = $dbName if (length($dbName));
+
   $bossScript->add(<<_EOF_
 export asmId="$asmId"
+export dbName="$defaultName"
 
-
-if [ -s ../\$asmId.chrom.sizes ]; then
+if [ -s ../\$dbName.chrom.sizes ]; then
   printf "sequence step previously completed\\n" 1>&2
   exit 0
 fi
 
 _EOF_
   );
 
 ### construct sequence when no AGP files exist
   if (0 == $partsDone) {
     printf STDERR "# partsDone is zero, creating fake AGP\n";
     if ($ucscNames) {
       $bossScript->add(<<_EOF_
 twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz
 hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
@@ -880,92 +887,92 @@
       $bossScript->add(<<_EOF_
 twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz
 hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\
   | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\
     | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\
       | awk '{printf "%s\\t%s\\n", \$1, \$2}' | sort > \$asmId.fake.names
 _EOF_
       );
     }
 } else {
 printf STDERR "partsDone: %d\n", $partsDone;
   }
 
   $bossScript->add(<<_EOF_
-zcat *.agp.gz | gzip > ../\$asmId.agp.gz
-faToTwoBit *.fa.gz ../\$asmId.2bit
-faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit
-twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt
+zcat *.agp.gz | gzip > ../\$dbName.agp.gz
+faToTwoBit *.fa.gz ../\$dbName.2bit
+faToTwoBit -noMask *.fa.gz ../\$dbName.unmasked.2bit
+twoBitDup ../\$dbName.unmasked.2bit > \$asmId.dups.txt
 if [ -s "\$asmId.dups.txt" ]; then
-  printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2
+  printf "ERROR: duplicate sequences found in ../\$dbName.unmasked.2bit\\n" 1>&2
   cat \$asmId.dups.txt 1>&2
   awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list
-  mv ../\$asmId.unmasked.2bit ../\$asmId.unmasked.dups.2bit
-  twoBitToFa ../\$asmId.unmasked.dups.2bit stdout | faSomeRecords -exclude \\
+  mv ../\$dbName.unmasked.2bit ../\$dbName.unmasked.dups.2bit
+  twoBitToFa ../\$dbName.unmasked.dups.2bit stdout | faSomeRecords -exclude \\
     stdin \$asmId.remove.dups.list stdout | gzip -c > \$asmId.noDups.fasta.gz
-  rm -f ../\$asmId.2bit ../\$asmId.unmasked.2bit
-  faToTwoBit \$asmId.noDups.fasta.gz ../\$asmId.2bit
-  faToTwoBit -noMask \$asmId.noDups.fasta.gz ../\$asmId.unmasked.2bit
+  rm -f ../\$dbName.2bit ../\$dbName.unmasked.2bit
+  faToTwoBit \$asmId.noDups.fasta.gz ../\$dbName.2bit
+  faToTwoBit -noMask \$asmId.noDups.fasta.gz ../\$dbName.unmasked.2bit
 fi
 gzip -f \$asmId.dups.txt
-touch -r ../download/\$asmId.2bit ../\$asmId.2bit
-touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit
-touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz
-twoBitInfo ../\$asmId.2bit stdout | sort -k2nr > ../\$asmId.chrom.sizes
-touch -r ../\$asmId.2bit ../\$asmId.chrom.sizes
+touch -r ../download/\$asmId.2bit ../\$dbName.2bit
+touch -r ../download/\$asmId.2bit ../\$dbName.unmasked.2bit
+touch -r ../download/\$asmId.2bit ../\$dbName.agp.gz
+twoBitInfo ../\$dbName.2bit stdout | sort -k2nr > ../\$dbName.chrom.sizes
+touch -r ../\$dbName.2bit ../\$dbName.chrom.sizes
 # verify everything is there
 twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes
-export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"`
+export newTotal=`ave -col=2 ../\$dbName.chrom.sizes | grep "^total"`
 export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"`
 if [ "\$newTotal" != "\$oldTotal" ]; then
   printf "# ERROR: sequence construction error: not same totals source vs. new:\\n" 1>&2
   printf "# \$newTotal != \$oldTotal\\n" 1>&2
   exit 255
 fi
 rm source.\$asmId.chrom.sizes
-export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1`
+export checkAgp=`checkAgpAndFa ../\$dbName.agp.gz ../\$dbName.2bit 2>&1 | tail -1`
 if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then
-  printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\\n" 1>&2
+  printf "# ERROR: checkAgpAndFa \$dbName.agp.gz \$dbName.2bit failing\\n" 1>&2
   exit 255
 fi
 _EOF_
   );
   if ($ucscNames) {
     $bossScript->add(<<_EOF_
-join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift
-join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift
+join -t\$'\\t' <(sort ../\$dbName.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift
+join -t\$'\\t' <(sort ../\$dbName.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift
 export c0=`cat \$asmId.ncbiToUcsc.lift | wc -l`
 export c1=`cat \$asmId.ucscToNcbi.lift | wc -l`
-export c2=`cat ../\$asmId.chrom.sizes | wc -l`
+export c2=`cat ../\$dbName.chrom.sizes | wc -l`
 # verify all names are accounted for
 if [ "\$c0" -ne "\$c2" ]; then
   printf "# ERROR: not all names accounted for in \$asmId.ncbiToUcsc.lift" 1>&2
   exit 255
 fi
 if [ "\$c1" -ne "\$c2" ]; then
   printf "# ERROR: not all names accounted for in \$asmId.ucscToNcbi.lift" 1>&2
   exit 255
 fi
 _EOF_
     );
   }
   $bossScript->add(<<_EOF_
-twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz
-touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz
+twoBitToFa ../\$dbName.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz
+touch -r ../\$dbName.2bit \$asmId.faCount.txt.gz
 zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt
-touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt
+touch -r ../\$dbName.2bit \$asmId.faCount.signature.txt
 _EOF_
   );
   $bossScript->execute();
 } # doSequence
 
 #########################################################################
 # * step: assemblyGap [workhorse]
 sub doAssemblyGap {
   my $runDir = "$buildDir/trackData/assemblyGap";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct assembly and gap tracks from AGP file";
   my $bossScript = newBash HgRemoteScript("$runDir/doAssemblyGap.bash",
                     $workhorse, $runDir, $whatItDoes);
 
@@ -2023,46 +2030,50 @@
 
 if (length($species) < 1) {
   if (-s "$asmReport") {
      $species = `grep -i "organism name:" $asmReport`;
      chomp $species;
      $species =~ s/.*organism\s+name:\s+//i;
      $species =~ s/\s+\(.*//;
   } else {
      die "ERROR: no -species specified and can not find $asmReport";
   }
   if (length($species) < 1) {
      die "ERROR: no -species specified and can not find Organism name: in $asmReport";
   }
 }
 
+$dbName = $opt_dbName ? $opt_dbName : $dbName;
 $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species;
 $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies;
 $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq;
 $ucscNames = $opt_ucscNames ? 1 : $ucscNames;   # '1' == 'TRUE'
 $noAugustus = $opt_noAugustus ? 1 : $noAugustus;   # '1' == 'TRUE'
 $noXenoRefSeq = $opt_noXenoRefSeq ? 1 : $noXenoRefSeq;   # '1' == 'TRUE'
 $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse;
 $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub;
 $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub;
 $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer;
 $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0;
 $noRmsk = $opt_noRmsk ? 1 : 0;
 
 die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource);
 printf STDERR "# buildDir: %s\n", $buildDir;
 printf STDERR "# sourceDir %s\n", $sourceDir;
+if (length($dbName)) {
+printf STDERR "# dbName %s - building in /hive/data/genomes/%s\n", $dbName, $dbName;
+}
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq;
 printf STDERR "# assemblySource: %s\n", $assemblySource;
 printf STDERR "# rmskSpecies %s\n", $rmskSpecies;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE";
 printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE";
 printf STDERR "# noAugustus %s\n", $noAugustus ? "TRUE" : "FALSE";
 printf STDERR "# noXenoRefSeq %s\n", $noXenoRefSeq ? "TRUE" : "FALSE";
 printf STDERR "# noRmsk %s\n", $noRmsk ? "TRUE" : "FALSE";
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.