src/hg/utils/automation/makeGenomeDb.pl 94d9d0495ac3ff8446552a7ee48cf7bfd889be3d

94d9d0495ac3ff8446552a7ee48cf7bfd889be3d
hiram
  Fri Jun 12 11:04:38 2020 -0700
catching up trackDb source tree files refs #25720

diff --git src/hg/utils/automation/makeGenomeDb.pl src/hg/utils/automation/makeGenomeDb.pl
index 33378ac..f3bb47c 100755
--- src/hg/utils/automation/makeGenomeDb.pl
+++ src/hg/utils/automation/makeGenomeDb.pl
@@ -1,1510 +1,1512 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/makeGenomeDb.pl instead.
 
 # $Id: makeGenomeDb.pl,v 1.30 2010/04/13 23:18:44 hiram Exp $
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 
 my $stepper = new HgStepManager(
     [ { name => 'seq',     func => \&makeBuildDir },
       { name => 'agp',     func => \&checkAgp },
       { name => 'db',      func => \&makeDb },
       { name => 'dbDb',    func => \&makeDbDb },
       { name => 'trackDb', func => \&makeTrackDb },
     ]
 			       );
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $maxMitoSize = 25000;  ## Can be overridden in the config.ra file
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base config.ra
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => '',
 					        'fileServer' => '');
 print STDERR <<_EOF_
     -splitGoldGap         split the gold/gap tables (default is not split)
     -noGoldGapSplit       default behavior, this option no longer required
     -forceDescription     construct a new description.html when -continue=trackDb
 _EOF_
   ;
   print STDERR "
 Automates the first steps of building a genome database:
     seq:     Translates fasta into /hive/data/genomes/\$db/\$db.unmasked.2bit .
     agp:     Checks consistency of fasta and AGP (or runs hgFakeAgp).
     db:      Creates the genome database and basic tables.
     dbDb:    Adds the genome database to central database tables.
     trackDb: Creates a trackDb/ checkout and adds -- but does not check in --
              template trackDb.ra and assembly description .html files.
 config.ra describes the species, assembly and downloaded files.
 To see detailed information about what should appear in config.ra,
 run \"$base -help\".
 ";
   # Detailed help (-help):
   print STDERR "
 -----------------------------------------------------------------------------
 Required config.ra settings:
 
 db xxxYyyN
   - The UCSC database name and assembly identifier.  xxx is the first three
     letters of the genus and Yyy is the first three letters of the species.
     N is the sequence number of this species' build at UCSC.  For some
     species that we have been processing since before this convention, the
     pattern is xyN.
 
 scientificName Xxxxxx yyyyyy
   - The genus and species name.
 
 assemblyDate Mmm. YYYY
   - The month and year in which this assembly was released by the sequencing
     center.
 
 assemblyLabel XXXXXX
   - The detailed/long form of the sequencing center's label or version
     identifier for this release (e.g. 'Genome Center at Washington University,
     St. Louis, Genus_species 1.2.3').
 
 assemblyShortLabel XXXXXX
   - The abbreviated form of the sequencing center's label or version identifier
     for this release (e.g. 'WUGSC 1.2.3').
 
 photoCreditURL http://some.path/to/photo/credit.html
   - used to construct a courtesy URL reference to the source for the
     photograph on the gateway page
 
 photoCreditName string for photo credit
   - used to label the photoCreditURL under the picture on the gateway page
 
 ncbiGenomeId nnnnn
   - A numeric NCBI identifier for the genome information reference at
     https://www.ncbi.nlm.nih.gov/genome/nnnnn
 
 ncbiAssemblyId nnnnn
   - A numeric NCBI identifier for the assembly. To determine this, do an
     NCBI Assembly query at https://www.ncbi.nlm.nih.gov/assembly/,
     using the scientific name \"Xxxxxx yyyyyy\" and choose a project ID that
     match the assembly name.
 
 ncbiAssemblyName xxxxxrr
   - The assembly name used in the ftp path such as \"catChrV17e\" in
     ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Felis_catus/catChrV17e/
     It is identical to the name returned from the NCBI Assembly query mention above.
     NOTE: this is NOT an NCBI identifier, this name was supplied by the
     assembly provider.
 
 ncbiBioProject nnnnn
   - The NCBI bioproject number to construct the URL:
     https://www.ncbi.nlm.nih.gov/bioproject/nnnnn
 
 ncbiBioSample nnnnn
   - A numeric NCBI identifier for the genome information reference at
     https://www.ncbi.nlm.nih.gov/biosample/nnnnn
 
 genBankAccessionID GCA_nnn
   - The NCBI assembly accession identification from the ASSEMBLY_INFO file
     in the downloads from the FTP site ftp.ncbi.nlm.nih.gov/genbank/genomes
 
 orderKey NN
   - A priority number (for the central database's dbDb.orderKey column)
     that will determine db's relative position in the assembly menu.
 
 mitoAcc XXXXXXX
   - A numeric Genbank identifier (\"gi number\") for the complete
     mitochondrial sequence of this species, or \"none\" to disable fetching
     and inclusion in the build.  To determine this, do an Entrez Nucleotide
     query on \"Xxxxxx yyyyyy mitochondrion complete genome\" and choose a
     result that looks best (there may be multiple complete sequences in
     Genbank, or there may be none in which case just say \"none\").
 
 fastaFiles /path/to/downloaded.fa
   - A complete path, possibly with wildcard characters, to FASTA sequence
     files which have already been downloaded from the sequencing center.
 
 dbDbSpeciesDir dirName
   - The name of the subdirectory of trackDb/ in which the \$db subdirectory
     will be added.  For vertebrates, this is often a lower-cased common name,
     but various patterns have been used especially for non-vertebrates.
 
 taxId ncbiTaxId
   - The NCBI numeric taxonomy id.  If a pseudo-genome, like a reconstruction
     is being built, specify 0.
 
 -----------------------------------------------------------------------------
 Conditionally required config.ra settings:
 
 1. Required when AGP is not included in the assembly.  In this case we run
    hgFakeAgp to deduce gap annotations from runs of Ns in the sequence.
    The faGapSizes program shows a histogram that emphasizes overrepresented
    round-number sizes, which are decent hints for these parameters.
 
 fakeAgpMinContigGap NN
   - hgFakeAgp -minContigGap param.  Minimum size for a run of Ns that
     will be considered a bridged gap with type \"contig\".
 
 fakeAgpMinScaffoldGap NN
   - hgFakeAgp -minScaffoldGap param.  Minimum size for a run of Ns that
     will be considered an unbridged gap with type \"scaffold\".
 
 
 2. Required when the central database table genomeClade does not yet have a
    row for this species:
 
 clade cccccc
   - UCSC's \"clade\" category for this species, as specified in the central
     database tables clade and genomeClade.
 
 genomeCladePriority NN
   - Relative priority of this species compared to others in the same clade.
     This central database query can be helpful in picking a value:
     select * from genomeClade where clade = '\$clade' order by priority;
 
 
 -----------------------------------------------------------------------------
 Optional config.ra settings:
 
 commonName Xxxx
   - Common name to use as the menu label for this species (and in central
     database's genome column) instead of abbreviated scientific name.
 
 agpFiles /path/to/downloaded.agp
   - A complete path, possibly with wildcard characters, to AGP files
     which have already been downloaded from the sequencing center.
 
 qualFiles [/path/to/downloaded.qual | /path/to/qacAgpLift-ed.qac]
   - A complete path, possibly with wildcard characters, to quality score
     files which have already been downloaded from the sequencing center,
     or a complete path to a single .qac file (in case you need to pre-process
     qual files with qaToQac | qacAgpLift, for example).
 
 mitoSize N
   - to override the internal default of max size for mitochondrial
     sequence of $maxMitoSize e.g. for yeast: mitoSize 90000
 
 subsetLittleIds Y
   - ok if agp little ids (col6) are a subset of fasta sequences
     rather than requiring an exact match
 
 doNotCheckDuplicates Y
   - do not stop build if duplicate sequences are found in genome
 
 " if ($detailed);
   print STDERR "\n";
   exit $status;
 } # usage
 
 
 # Globals:
 # Command line argument:
 my $CONFIG;
 # Command line option vars:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 # specific command line options
 use vars qw/
     $opt_splitGoldGap
     $opt_noGoldGapSplit
     $opt_forceDescription
     /;
 
 # Required config parameters:
 my ($db, $scientificName, $assemblyDate, $assemblyLabel, $assemblyShortLabel, $orderKey, $photoCreditURL, $photoCreditName, $ncbiGenomeId, $providerAssemblyName, $ncbiAssemblyId, $ncbiBioProject, $ncbiBioSample, $genBankAccessionID,
     $mitoAcc, $fastaFiles, $dbDbSpeciesDir, $taxId);
 # Conditionally required config parameters:
 my ($fakeAgpMinContigGap, $fakeAgpMinScaffoldGap,
     $clade, $genomeCladePriority);
 # Optional config parameters:
 my ($commonName, $agpFiles, $doNotCheckDuplicates, $qualFiles, $mitoSize, $subsetLittleIds);
 # Other globals:
 my ($gotMito, $gotAgp, $gotQual, $topDir, $chromBased, $forceDescription);
 my ($bedDir, $scriptDir, $endNotes);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      @HgAutomate::commonOptionSpec,
 		      "splitGoldGap",
 		      "noGoldGapSplit",
 		      "forceDescription",
 		     );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if (defined $opt_dbHost);
   $forceDescription = 0;
   $forceDescription = 1 if (defined $opt_forceDescription)
 } # checkOptions
 
 sub requireVar {
   # Ensure that var is in %config and return its value.
   # Remove it from %config so we can check for unrecognized contents.
   my ($var, $config) = @_;
   my $val = $config->{$var}
     || die "Error: $CONFIG is missing required variable \"$var\".\n" .
       "For a detailed list of required variables, run \"$base -help\".\n";
   delete $config->{$var};
   return $val;
 } # requireVar
 
 sub optionalVar {
   # If var has a value in %config, return it.
   # Remove it from %config so we can check for unrecognized contents.
   my ($var, $config) = @_;
   my $val = $config->{$var};
   delete $config->{$var} if ($val);
   return $val;
 } # optionalVar
 
 sub parseConfig {
   # Parse config.ra file, make sure it contains the required variables.
   my ($configFile) = @_;
   my %config = ();
   my $fh = &HgAutomate::mustOpen($configFile);
   while (<$fh>) {
     next if (/^\s*#/ || /^\s*$/);
     if (/^\s*(\w+)\s*(.*)$/) {
       my ($var, $val) = ($1, $2);
       if (! exists $config{$var}) {
 	$config{$var} = $val;
       } else {
 	die "Duplicate definition for $var line $. of config file $configFile.\n";
       }
     } else {
       die "Can't parse line $. of config file $configFile:\n$_\n";
     }
   }
   close($fh);
   # Required variables.
   $db = &requireVar('db', \%config);
   $scientificName = &requireVar('scientificName', \%config);
   $assemblyDate = &requireVar('assemblyDate', \%config);
   $assemblyLabel = &requireVar('assemblyLabel', \%config);
   $assemblyShortLabel = &requireVar('assemblyShortLabel', \%config);
   $orderKey = &requireVar('orderKey', \%config);
   $mitoAcc = &requireVar('mitoAcc', \%config);
   $fastaFiles = &requireVar('fastaFiles', \%config);
   $dbDbSpeciesDir = &requireVar('dbDbSpeciesDir', \%config);
   $taxId = &requireVar('taxId', \%config);
   $photoCreditURL = &requireVar('photoCreditURL', \%config);
   $photoCreditName = &requireVar('photoCreditName', \%config);
   $ncbiGenomeId = &requireVar('ncbiGenomeId', \%config);
   $providerAssemblyName = &requireVar('ncbiAssemblyName', \%config);
   $ncbiAssemblyId = &requireVar('ncbiAssemblyId', \%config);
   $ncbiBioProject = &requireVar('ncbiBioProject', \%config);
   $ncbiBioSample = &requireVar('ncbiBioSample', \%config);
   $genBankAccessionID = &requireVar('genBankAccessionID', \%config);
   # Conditionally required variables -- optional here, but they might be
   # required later on in some cases.
   $fakeAgpMinContigGap = &optionalVar('fakeAgpMinContigGap', \%config);
   $fakeAgpMinScaffoldGap = &optionalVar('fakeAgpMinScaffoldGap', \%config);
   $clade = &optionalVar('clade', \%config);
   $genomeCladePriority = &optionalVar('genomeCladePriority', \%config);
   # Optional variables.
   $commonName = &optionalVar('commonName', \%config);
   $commonName =~ s/^(\w)(.*)/\u$1\L$2/;  # Capitalize only the first word
   $agpFiles = &optionalVar('agpFiles', \%config);
   $doNotCheckDuplicates = &optionalVar('doNotCheckDuplicates', \%config);
   $qualFiles = &optionalVar('qualFiles', \%config);
   $mitoSize = &optionalVar('mitoSize', \%config);
   $subsetLittleIds = &optionalVar('subsetLittleIds', \%config);
   # Make sure no unrecognized variables were given.
   my @stragglers = sort keys %config;
   if (scalar(@stragglers) > 0) {
     die "Error: config file $CONFIG has unrecognized variables:\n" .
       "    " . join(", ", @stragglers) . "\n" .
       "For a detailed list of supported variables, run \"$base -help\".\n";
   }
   $gotMito = ($mitoAcc ne 'none');
   $gotAgp = (defined $agpFiles);
   $gotQual = (defined $qualFiles);
   $topDir = "/cluster/data/$db";
 } # parseConfig
 
 
 #########################################################################
 # * step: seq [workhorse]
 
 sub mkClusterDataLink {
   # Make a /cluster/data/$tDb/ -> /cluster/store?/$tDb/ if it doesn't exist
   if (! -e "$topDir") {
     my $clusterStore = &HgAutomate::choosePermanentStorage();
     &HgAutomate::mustMkdir("$clusterStore/$db");
     # Don't use HgAutomate::run because this needs to happen despite -debug:
     (system("ln -s $clusterStore/$db $topDir") == 0)
       || die "Couldn't ln -s $clusterStore/$db $topDir\n";
   }
 } # mkClusterDataLink
 
 sub getMito {
   # Get the mitochondrion from Genbank (if specified) and rename it chrM
   if ($gotMito) {
     my $whatItDoes =
 	"It fetches mitochondrial sequence from GenBank and renames it chrM.";
     # Use $dbHost because this needs to wget and some of our workhorses
     # can't do that, and this is computationally cheap.
     my $bossScript = new HgRemoteScript("$scriptDir/getMito.csh",
 					$dbHost, $topDir, $whatItDoes,
 					$CONFIG);
     if ($mitoSize) {
 	$maxMitoSize = $mitoSize;
     }
     my $mitoFile = "$topDir/M/$mitoAcc.fa";
     $bossScript->add(<<_EOF_
 mkdir M
 wget -O $mitoFile \\
   'https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc'
 
 # old url  'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=$mitoAcc&retmode=text'
 
 
 # Make sure there's exactly one fasta record:
 if (`grep '^>' $mitoFile | wc -l` != 1) then
   echo "getMito: $mitoFile"
   echo "         does not have exactly one fasta record"
   exit 1
 endif
 
 # Make sure what we got is of about the right size:
 set mSize = `faSize $mitoFile | grep bases | awk '{print \$1;}'`
 if (\$mSize < 10000 || \$mSize > $maxMitoSize) then
   echo "getMito: $mitoFile"
   echo "         fasta size \$mSize is out of expected range [10000, $maxMitoSize]"
   exit 1
 endif
 
 # Make sure the fasta header looks plausible:
 set keyCount = 0
 set header = `grep '^>' $mitoFile`
 foreach keyword ('$scientificName' mitochondr complete genome)
   set count = `echo \$header | grep -i "\$keyword" | wc -l`
   set keyCount = `expr \$keyCount + \$count`
 end
 if (\$keyCount < 2) then
   echo "getMito: $mitoFile"
   echo "         fasta header does not have very many expected keywords"
   echo "         (looking for '$scientificName' mitochondr complete genome)"
   echo "         Here is the header:"
   echo \$header
   exit 1
 endif
 
 # Make chrM.fa with >chrM:
 sed -e 's/^>.*/>chrM/' $mitoFile > $topDir/M/chrM.fa
 rm $mitoFile
 _EOF_
     );
     $bossScript->execute();
   }
 } # getMito
 
 sub makeUnmasked2bit {
   my ($workhorse) = @_;
   my $whatItDoes =
 "It installs assembly fasta (and agp, if given) files in the usual dirs.";
   my $bossScript = new HgRemoteScript("$scriptDir/makeUnmasked2bit.csh",
 				      $workhorse, $topDir, $whatItDoes,
 				      $CONFIG);
 
   my $chrM = $gotMito ? "$topDir/M/chrM.fa" : "";
   # Forget the fasta, just make an unmasked 2bit for now!
   # If we really need a fasta hierarchy, we can make it later, after
   # RepeatMasking!!  The whole point of the thing is to provide a
   # structure for the RM run and subsequent masking of the sequence.
   # But the RM+masking task can structure itself -- it can simply make
   # masked .2bit, and then subsequent jobs can do their own split and
   # distribute if necessary (or better yet just use 2bit specs).
 
   # possibilities for $fastaFiles:
   # {.fa, .fa.gz}
   # {many single-fasta, one multi-fasta, several multi-fasta}
 
   # If AGP is given:
   # 1. if first column of AGP matches sequence names, use as-is.
   # 2. if sixth column of AGP matches seqnames, assemble with agpToFa.
 
   # installation options:
   # 1. chrom-based (numSeqs <= 100): directories containing single-fasta,
   #    per-sequence files; randoms lumped into same directories with reals.
   # 2. scaffold-based: scaffolds.fa
   # Traditionally we have split to 5M in prep for RM -- but let the RM take
   # care of its own splitting (better yet, make it use 2bit specs).
 
   my $acat = "cat";
   my $fcat = "cat";
   my $sli = "";
   if (defined $subsetLittleIds && $subsetLittleIds eq "Y") {
     $sli = "-1 ";
   }
   foreach my $file (`ls $fastaFiles 2> /dev/null`) {
     if ($file =~ m/\.gz$/) {
       $fcat = "zcat";
       last;
     }
   }
   foreach my $file (`ls $agpFiles 2> /dev/null`) {
     if ($file =~ m/\.gz$/) {
       $acat = "zcat";
       last;
     }
   }
   if ($gotAgp) {
     $bossScript->add(<<_EOF_
 # Get sorted IDs from fasta sequence files:
 set fastaIds = `mktemp -p /tmp makeGenomeDb.fastaIds.XXXXXX`
 $fcat $fastaFiles | grep '^>' | sed -e 's/^>.*gb\|/>/; s/\|.*//' | perl -wpe 's/^>(\\S+).*/\$1/' | sort \\
   > \$fastaIds
 
 # Get sorted "big" (1st column) and "little" (6th column) IDs from AGP files:
 set agpBigIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
 $acat $agpFiles | grep -v '^#' | awk '{print \$1;}' | sort -u \\
   > \$agpBigIds
 set agpLittleIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
 $acat $agpFiles | grep -v '^#' | awk '((\$5 != "N") && (\$5 != "U")) {print \$6;}' | sort -u \\
   > \$agpLittleIds
 
 # Compare fasta IDs to first and sixth columns of AGP:
 set diffBigCount = `comm -3 \$fastaIds \$agpBigIds | wc -l`
 set diffLittleCount = `comm $sli-3 \$fastaIds \$agpLittleIds | wc -l`
 
 # If AGP "big" IDs match sequence IDs, use sequence as-is.
 # If AGP "little" IDs match sequence IDs, or are a subset, assemble sequence with agpToFa.
 set bigGenome = ""
 #   big genomes are over 4Gb: 4*1024*1024*1024 = 4294967296
 # requires -long argument to faToTwoBit
 if (\$diffLittleCount == 0) then
   set agpTmp = `mktemp -p /tmp makeGenomeDb.agp.XXXXXX`
   $acat $agpFiles | grep -v '^#' > \$agpTmp
   set genomeSize = `$fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' | agpToFa -simpleMultiMixed \$agpTmp all stdout stdin | faSize stdin | grep -w bases | awk '{print \$1}'`
   if ( \$genomeSize > 4294967295 ) then
     set bigGenome = "-long"
   endif
   $fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' \\
   | agpToFa -simpleMultiMixed \$agpTmp all stdout stdin \\
   | faToTwoBit \$bigGenome -noMask stdin $chrM $db.unmasked.2bit
   rm -f \$agpTmp
 else if (\$diffBigCount == 0) then
   set genomeSize = `$fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' | faSize stdin | grep -w bases | awk '{print \$1}'`
   if ( \$genomeSize > 4294967295 ) then
     set bigGenome = "-long"
   endif
   $fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' \\
   | faToTwoBit \$bigGenome -noMask stdin $chrM $db.unmasked.2bit
 else
   echo "Error: IDs in fastaFiles ($fastaFiles)"
   echo "do not perfectly match IDs in either the first or sixth columns of"
   echo "agpFiles ($agpFiles)"
   echo "Please examine and then remove these temporary files:"
   echo "  \$fastaIds -- fasta IDs"
   echo "  \$agpBigIds -- AGP first column IDs ('big' sequences)"
   echo "  \$agpLittleIds -- AGP sixth column IDs ('little' sequences)"
   exit 1
 endif
 rm -f \$fastaIds \$agpBigIds \$agpLittleIds
 _EOF_
     );
   } else {
     # No AGP -- just make an unmasked 2bit.
     $bossScript->add(<<_EOF_
 set bigGenome = ""
 #   big genomes are over 4Gb: 4*1024*1024*1024 = 4294967296
 # requires -long argument to faToTwoBit
 set genomeSize = `$fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' | faSize stdin | grep -w bases | awk '{print \$1}'`
 if ( \$genomeSize > 4294967295 ) then
   set bigGenome = "-long"
 endif
 $fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' | \\
     faToTwoBit \$bigGenome -noMask stdin $chrM $db.unmasked.2bit
 _EOF_
     );
   }
 
   # Having made the unmasked .2bit, make chrom.sizes and chromInfo.tab:
   # verify no dots allowed in chrom names
   if (! defined $doNotCheckDuplicates || ($doNotCheckDuplicates eq "N")) {
   $bossScript->add(<<_EOF_
 twoBitDup $db.unmasked.2bit > jkStuff/twoBitDup.txt
 if (`wc -l < jkStuff/twoBitDup.txt` > 0) then
   echo "ERROR: duplicate sequence found in $db.unmasked.2bit"
   exit 1
 endif
 _EOF_
   );
   }
 
 
   $bossScript->add(<<_EOF_
 twoBitInfo $db.unmasked.2bit stdout | sort -k2nr > chrom.sizes
 
 # if no dots in chrom names, should have only one kind of field size:
 set noDots = `cut -f 1 chrom.sizes | awk -F'.' '{print NF}' | sort -u | wc -l`
 
 if ( \$noDots != 1 ) then
   echo "ERROR: no dots allowed in chrom names !  e.g.:"
   grep "\." chrom.sizes | head -3
   exit 1
 endif
 # only one kind of field size and it must be simply a one:
 set dotCount = `cut -f 1 chrom.sizes | awk -F'.' '{print NF}' | sort -u`
 if ( \$dotCount != 1 ) then
   echo "ERROR: no dots allowed in chrom names !  e.g.:"
   grep "\." chrom.sizes | head -3
   exit 1
 endif
 
 rm -rf $HgAutomate::trackBuild/chromInfo
 mkdir $HgAutomate::trackBuild/chromInfo
 awk '{print \$1 "\t" \$2 "\t$HgAutomate::gbdb/$db/$db.2bit";}' chrom.sizes \\
   > $HgAutomate::trackBuild/chromInfo/chromInfo.tab
 _EOF_
     );
   if ($gotAgp) {
     my $splitThreshold = $HgAutomate::splitThreshold;
     $bossScript->add(<<_EOF_
 if (`wc -l < chrom.sizes` < $splitThreshold) then
   # Install per-chrom .agp files for download.
   $acat $agpFiles | grep -v '^#' \\
   | splitFileByColumn -col=1 -ending=.agp stdin $topDir -chromDirs
 endif
 _EOF_
       );
   }
   $bossScript->execute();
 
   # Now that we have created chrom.sizes (unless we're in -debug mode),
   # re-evaluate $chromBased for subsequent steps:
   if (-e "$topDir/chrom.sizes") {
     $chromBased = (`wc -l < $topDir/chrom.sizes` < $HgAutomate::splitThreshold);
   }
 } # makeUnmasked2bit
 
 sub makeBuildDir {
   # * step: seq [workhorse]
   &mkClusterDataLink();
   &HgAutomate::checkCleanSlate('seq', 'agp',
 			       "$topDir/M", "$topDir/chrom.sizes");
   &HgAutomate::mustMkdir($scriptDir);
   &HgAutomate::mustMkdir($bedDir);
   my $workhorse = &HgAutomate::chooseWorkhorse();
   &getMito();
   &makeUnmasked2bit($workhorse);
 } # makeBuildDir
 
 
 #########################################################################
 # * step: agp [workhorse]
 
 sub requireFakeAgpParams {
   # When assembly does not include AGP, run hgFakeAgp and require the
   # developer to specify its parameters.  If not specified, run faGapSizes
   # to give some hints.
   my $problem = 0;
   if (! defined $fakeAgpMinContigGap) {
     warn "Error: $CONFIG is missing required variable " .
       "\"fakeAgpMinContigGap\".\n";
     $problem = 1;
   }
   if (! defined $fakeAgpMinScaffoldGap) {
     warn "Error: $CONFIG is missing required variable " .
       "\"fakeAgpMinScaffoldGap\".\n";
     $problem = 1;
   }
   if ($problem) {
     warn "\nWhen the assembly does not include AGP, $CONFIG must specify " .
       "fakeAgpMinContigGap and fakeAgpMinScaffoldGap.\n";
     warn "Printing a gap-size histogram from faGapSizes to stdout... " .
       "Check for overrepresented round numbers.\n";
     warn "See usage messages of hgFakeAgp and faGapSizes for more hints.\n\n";
     my $fileServer = &HgAutomate::chooseFileServer($topDir);
     print "\n";
     &HgAutomate::run("$HgAutomate::runSSH $fileServer nice " .
 		     "twoBitToFa $topDir/$db.unmasked.2bit stdout " .
 		     "\\| faGapSizes stdin -niceSizes=" .
 		     "10,20,25,50,100,1000,2000,5000,10000,20000,50000");
     print "\n";
     exit 1;
     }
 }
 
 sub checkAgp {
   my $workhorse = &HgAutomate::chooseWorkhorse();
   # Check or generate AGP, depending on whether it was provided:
   if ($gotAgp) {
     &HgAutomate::checkCleanSlate('agp', 'db', "$topDir/$db.agp");
     my $whatItDoes = "It checks consistency of AGP and fasta.";
     my $bossScript = new HgRemoteScript("$scriptDir/checkAgpAndFa.csh",
 					$workhorse, $topDir, $whatItDoes,
 					$CONFIG);
     my $allAgp = "$topDir/$db.agp";
     # If we added chrM from GenBank, exclude it from fasta:
     my $exclude = ($mitoAcc eq 'none') ? "" : "-exclude=chrM";
     my $acat = "cat";
     foreach my $file (`ls $agpFiles 2> /dev/null`) {
 	if ($file =~ m/\.gz$/) {
 	    $acat = "zcat";
 	    last;
 	}
     }
     $bossScript->add(<<_EOF_
 # When per-chrom AGP and fasta files are given, it would be much more
 # efficient to run this one chrom at a time.  However, since the filenames
 # are arbitrary, I'm not sure how to identify the pairings of AGP and fa
 # files.  So cat 'em all together and check everything at once:
 
 $acat $agpFiles | grep -v '^#' | sort -k1,1 -k2n,2n > $allAgp
 
 set result = `checkAgpAndFa $exclude $allAgp $db.unmasked.2bit | tail -1`
 
 if ("\$result" != 'All AGP and FASTA entries agree - both files are valid') then
   echo "Error: checkAgpAndFa failed\\!"
   echo "Last line of output: \$result"
   exit 1
 endif
 _EOF_
     );
     $bossScript->execute();
   } else {
     my $runDir = "$bedDir/hgFakeAgp";
     &HgAutomate::mustMkdir($runDir);
     &HgAutomate::checkCleanSlate('agp', 'db', "$runDir/$db.agp");
     &requireFakeAgpParams();
     my $whatItDoes = "It runs hgFakeAgp (because no AGP was provided).";
     my $bossScript = new HgRemoteScript("$runDir/doFakeAgp.csh",
 					$workhorse, $runDir, $whatItDoes,
 					$CONFIG);
     $bossScript->add(<<_EOF_
 twoBitToFa $topDir/$db.unmasked.2bit stdout \\
 | hgFakeAgp stdin $db.agp \\
   -minContigGap=$fakeAgpMinContigGap -minScaffoldGap=$fakeAgpMinScaffoldGap
 _EOF_
     );
     $bossScript->execute();
   }
 
 #*** Produce a gap report so we can say something sensible in gap.html.
 
 } # checkAgp
 
 
 #########################################################################
 # * step: db [dbHost]
 
 sub makeDb {
   # Create a database on hgwdev, grp, chromInfo, gold/gap.
   my $qual = (defined $qualFiles) ? " qual" : "";
   my $whatItDoes =
 "It creates the genome database ($db) and loads the most basic tables:
 chromInfo, grp, gap, gold,$qual and gc5Base.";
   my $bossScript = new HgRemoteScript("$scriptDir/makeDb.csh", $dbHost,
 				      $topDir, $whatItDoes, $CONFIG);
 
   # Actually, build some basic track files on $workhorse, then load.
   $qual = (defined $qualFiles) ? " qual and" : "";
   $whatItDoes = "It generates$qual gc5Base track files for loading.";
   my $workhorse = &HgAutomate::chooseWorkhorse();
   my $horseScript = new HgRemoteScript("$scriptDir/makeTrackFiles.csh",
 				       $workhorse,
 				       $topDir, $whatItDoes, $CONFIG);
   # Build quality wiggle track files (if provided).
   if (defined $qualFiles) {
     $horseScript->add(<<_EOF_
 # Translate qual files to wiggle encoding.  If there is a problem with
 # sequence names, you may need to tweak the original qual sequence names
 # and/or lift using qacAgpLft.
 mkdir -p $bedDir/qual
 cd $bedDir/qual
 _EOF_
       );
     if ($qualFiles =~ /^\S+\.qac$/) {
       # Single .qac file:
       $horseScript->add(<<_EOF_
 qacToWig -fixed $qualFiles stdout | gzip -c > $db.qual.wigVarStep.gz
 _EOF_
         );
     } else {
       # Possible wildcard of qual file(s):
       $horseScript->add(<<_EOF_
 if (`ls $qualFiles | grep \.gz | wc -l`) then
   set qcat = zcat
 else
   set qcat = cat
 endif
 \$qcat $qualFiles \\
 | qaToQac stdin stdout \\
 | qacToWig -fixed stdin stdout | gzip -c > $db.qual.wigVarStep.gz
 _EOF_
       );
     }
     $horseScript->add(<<_EOF_
 wigToBigWig $db.qual.wigVarStep.gz ../../chrom.sizes $db.quality.bw
 _EOF_
     );
   }
 
   # Build gc5Base files.
   $horseScript->add(<<_EOF_
 # Make gc5Base wiggle files.
 mkdir -p $bedDir/gc5Base
 cd $bedDir/gc5Base
 hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 $db \\
   $topDir/$db.unmasked.2bit | gzip -c > $db.gc5Base.wigVarStep.gz
 wigToBigWig $db.gc5Base.wigVarStep.gz ../../chrom.sizes $db.gc5Base.bw
 _EOF_
   );
 
   # Now start the database creation and loading commands.
   $bossScript->add (<<_EOF_
 hgsql '' -e 'create database $db'
 df -h /var/lib/mysql
 hgsql $db < \${HOME}/kent/src/hg/lib/grp.sql
 cut -f1 $HgAutomate::trackBuild/chromInfo/chromInfo.tab | awk '{if (length(\$0)>32) exit(1);}'  || echo Attention annotator: Your chromosome names are longer than 32 character. This will crash this script and lead to error messages by featureBits and everything that uses hdb.c
 cut -f1 $HgAutomate::trackBuild/chromInfo/chromInfo.tab | awk '{print length(\$0)}' | sort -nr > $HgAutomate::trackBuild/chromInfo/t.chrSize
 set chrSize = `head -1 $HgAutomate::trackBuild/chromInfo/t.chrSize`
 sed -e "s/chrom(16)/chrom(\$chrSize)/" \${HOME}/kent/src/hg/lib/chromInfo.sql > $HgAutomate::trackBuild/chromInfo/chromInfo.sql
 rm -f $HgAutomate::trackBuild/chromInfo/t.chrSize
 hgLoadSqlTab $db chromInfo $HgAutomate::trackBuild/chromInfo/chromInfo.sql \\
   $HgAutomate::trackBuild/chromInfo/chromInfo.tab
 _EOF_
   );
 
   my $allAgp = "$topDir/$db.agp";
   $allAgp = "$bedDir/hgFakeAgp/$db.agp" if (! $gotAgp);
   if ($chromBased) {
     if ($opt_splitGoldGap) {
       $bossScript->add(<<_EOF_
 # Split AGP into per-chrom files/dirs so we can load split gold and gap tables.
 cp /dev/null chrom.lst.tmp
 foreach chr (`awk '{print \$1;}' chrom.sizes`)
   set c = `echo \$chr | sed -e 's/^chr//; s/_random\$//;'`
   mkdir -p \$c
   echo \$c >> chrom.lst.tmp
   awk '\$1 == "'\$chr'" {print;}' $allAgp > \$c/\$chr.agp
 end
 sort -u chrom.lst.tmp > chrom.lst
 rm chrom.lst.tmp
 _EOF_
       );
       $bossScript->add("hgGoldGapGl -noGl -chromLst=chrom.lst $db $topDir .\n");
     } else {
       $bossScript->add("hgGoldGapGl -noGl $db $allAgp\n");
     }
   } else {
     $bossScript->add("hgGoldGapGl -noGl $db $allAgp\n");
   }
 
   if ($gotAgp && $gotMito) {
     # When we load gold/gap from assembly AGP, but pull in chrM sequence
     # separately, chrM is conspicuously absent from gold/gap -- so add a fake
     # entry for it in gold (so featureBits gold --> 100%) and if split tables,
     # make a token chrM_gap table.  Use bin=585 (512+ 64 + 8 + 1), the
     # smallest bin that starts at 0.  The smallest bin is 128k bases, which
     # should always cover the entire mitochondrial genome (typically ~16k).
     my $bin = 585;
     my $mitoGold = ($mitoAcc =~ /^\d+$/) ? "gi|$mitoAcc" : $mitoAcc;
     if ($opt_splitGoldGap && ($chromBased || $opt_debug)) {
       my $defaultChrom = `head -1 $topDir/chrom.sizes | cut -f 1`;
       chomp $defaultChrom;
       $bossScript->add(<<_EOF_
 
 # Add fake chrM_{gap,gold} to make featureBits happy.
 set mSize = `faSize $topDir/M/chrM.fa | grep bases | awk '{print \$1;}'`
 hgsql $db -e 'drop table chrM_gap,chrM_gold;'
 hgsql $db -e \\
     'create table chrM_gap select * from ${defaultChrom}_gap where 0; \\
      create table chrM_gold select * from ${defaultChrom}_gold where 0; \\
      insert into chrM_gold \\
        values($bin,"chrM",0,'\$mSize',1,"F","$mitoGold",0,'\$mSize',"+");'
 _EOF_
       );
     } else {
       $bossScript->add(<<_EOF_
 
 # Add fake chrM entry in gold table to make featureBits happy.
 set mSize = `faSize $topDir/M/chrM.fa | grep bases | awk '{print \$1;}'`
 hgsql $db -e \\
     'insert into gold \\
         values($bin,"chrM",0,'\$mSize',1,"F","$mitoGold",0,'\$mSize',"+");'
 _EOF_
       );
     }
     # may as well finally add the chrM entry to the agp file
     $bossScript->add(<<_EOF_
 set lastId = `tail -1 $topDir/$db.agp | awk '{print \$4+1}'`
 /bin/echo -e "chrM\t1\t\$mSize\t\$lastId\tF\t$mitoGold\t1\t\$mSize\t+" >> $topDir/$db.agp
 _EOF_
       );
   }
 
   $bossScript->add(<<_EOF_
 # verify gold and gap tables cover everything
 featureBits -or -countGaps $db gold gap >&fb.$db.gold.gap.txt
 cat fb.$db.gold.gap.txt
 set allCovered = `awk '{print \$4-\$1}' fb.$db.gold.gap.txt`
 if (\$allCovered != 0) then
     echo "ERROR: gold and gap tables do not cover whole genome"
     exit 255
 endif
 _EOF_
       );
 
   $bossScript->add(<<_EOF_
 
 # Load gc5base
 mkdir -p $HgAutomate::gbdb/$db/bbi/gc5BaseBw
 rm -f $HgAutomate::gbdb/$db/bbi/gc5BaseBw/gc5Base.bw
 ln -s $bedDir/gc5Base/$db.gc5Base.bw $HgAutomate::gbdb/$db/bbi/gc5BaseBw/gc5Base.bw
 hgsql $db -e 'drop table if exists gc5BaseBw; \\
             create table gc5BaseBw (fileName varchar(255) not null); \\
             insert into gc5BaseBw values ("$HgAutomate::gbdb/$db/bbi/gc5BaseBw/gc5Base.bw");'
 _EOF_
   );
   if (defined $qualFiles) {
     $bossScript->add(<<_EOF_
 
 # Load qual
 mkdir -p $HgAutomate::gbdb/$db/bbi/qualityBw
 rm -f $HgAutomate::gbdb/$db/bbi/qualityBw/quality.bw
 ln -s $bedDir/qual/$db.quality.bw $HgAutomate::gbdb/$db/bbi/qualityBw/quality.bw
 hgsql $db -e 'drop table if exists qualityBw; \\
             create table qualityBw (fileName varchar(255) not null); \\
             insert into qualityBw values ("$HgAutomate::gbdb/$db/bbi/qualityBw/quality.bw");'
 _EOF_
     );
   }
   $horseScript->execute();
   $bossScript->execute();
 } # makeDb
 
 
 #########################################################################
 # * step: dbDb [dbHost]
 
 sub requireCladeAndPriority {
   # If the genomeClade table in the central database does not have a row
   # for this $db's genome, require user to specify clade and priority.
   my ($genome) = @_;
   my $problem = 0;
   if (! defined $clade) {
     warn "Error: $CONFIG is missing required variable " .
       "\"clade\".\n";
     $problem = 1;
   }
   if (! defined $genomeCladePriority) {
     warn "Error: $CONFIG is missing required variable " .
       "\"genomeCladePriority\".\n";
     $problem = 1;
   }
   if ($problem) {
     warn "\nCentral database table genomeClade does not have a row for " .
       "${db}'s genome \"$genome\",\n";
     warn "so $CONFIG must specify clade and genomeCladePriority.\n";
     warn "Examine the contents of the genomeClade table to get an idea " .
       "where \"$genome\" fits in.\n\n";
     exit 1;
   }
 }
 
 sub getGenome {
   # Return what we'll use in the genome (and organism) column of dbDb.
   # This is the label that appears in the gateway menu.
   my $genome;
   if ($commonName) {
     $genome = $commonName;
   } else {
     $genome = $scientificName;
     $genome =~ s/^(\w)\w+\s+(\w+)$/$1. $2/;
   }
   return $genome;
 }
 
 sub makeDbDb {
 # * step: dbDb [dbHost... or just direct to hgcentraltest??]
 # - create hgcentraltest entry, (if necessary) defaultDb and genomeClade
   my $genome = &getGenome();
   my $defaultPos;
   my ($seq, $size) = $opt_debug ? ("chr1", 1000) :
     split(/\s/, `head -1 "$topDir/chrom.sizes"`);
   my $start = int($size / 2);
   $size = ($start + 9999) if ($size > ($start + 9999));
   $defaultPos = "$seq:$start-$size";
 
   my $dbDbInsert = "$topDir/dbDbInsert.sql";
   my $fh = &HgAutomate::mustOpen(">$dbDbInsert");
   print $fh <<_EOF_
 DELETE from dbDb where name = "$db";
 INSERT INTO dbDb
     (name, description, nibPath, organism,
      defaultPos, active, orderKey, genome, scientificName,
      htmlPath, hgNearOk, hgPbOk, sourceName, taxId)
 VALUES
     ("$db", "$assemblyDate ($assemblyShortLabel/$db)", "$HgAutomate::gbdb/$db", "$genome",
      "$defaultPos", 1, $orderKey, "$genome", "$scientificName",
      "$HgAutomate::gbdb/$db/html/description.html", 0, 0, "$assemblyLabel",
     $taxId);
 _EOF_
   ;
   close($fh);
   my $centDbSql = "$HgAutomate::runSSH $dbHost $HgAutomate::centralDbSql";
   &HgAutomate::run("$centDbSql < $dbDbInsert");
 
   # Add a row to defaultDb if this is the first usage of $genome.
   my $quotedGenome = $genome;
   $quotedGenome =~ s/'/'"'"'/g;
   my $sql = "'select count(*) from defaultDb where genome = \"$quotedGenome\"'";
   if (`echo $sql | $centDbSql` == 0) {
     $sql = "'INSERT INTO defaultDb (genome, name) " .
       "VALUES (\"$quotedGenome\", \"$db\")'";
     &HgAutomate::run("echo $sql | $centDbSql");
   }
 
   # If $genome does not already appear in genomeClade, warn user that
   # they will have to manually add it.
   $sql = "'select count(*) from genomeClade where genome = \"$quotedGenome\"'";
   if (`echo $sql | $centDbSql` == 0) {
     &requireCladeAndPriority($genome);
     $sql = "'INSERT INTO genomeClade (genome, clade, priority) " .
       "VALUES (\"$quotedGenome\", \"$clade\", $genomeCladePriority)'";
     &HgAutomate::run("echo $sql | $centDbSql");
   }
 } # makeDbDb
 
 
 #########################################################################
 # * step: trackDb [dbHost]
 
 sub makeDescription {
   # Make a template description.html (for the browser gateway page).
   my $sciUnderscore = $scientificName;
   $sciUnderscore =~ s/ /_/g;
   my $genome = &getGenome();
   my $anchorRoot = lc($genome);
   $anchorRoot =~ s/\. /_/;
   $anchorRoot =~ s/ /_/;
   my $imgExtn = "jpg";
   my $img = "/usr/local/apache/htdocs/images/$sciUnderscore.$imgExtn";
   if ( ! -s $img ) {
       $imgExtn = "png";
       $img = "/usr/local/apache/htdocs/images/$sciUnderscore.$imgExtn";
       if ( ! -s $img ) {
         $imgExtn = "gif";
         $img = "/usr/local/apache/htdocs/images/$sciUnderscore.$imgExtn";
           if ( ! -s $img ) {
             warn "missing htdocs/images/$sciUnderscore.{jpg|png|gif}\n\n";
             exit 1;
           }
       }
   }
   my $widthHeight = `identify $img | awk '{print \$3}'`;
   chomp $widthHeight;
   my ($width, $height) = split('x', $widthHeight);
   my $borderWidth = $width + 15;
 
   my $fh = &HgAutomate::mustOpen(">$topDir/html/description.html");
   print $fh <<_EOF_
 <!-- Display image in righthand corner -->
 <TABLE ALIGN=RIGHT BORDER=0 WIDTH=$borderWidth>
   <TR><TD ALIGN=RIGHT>
     <A HREF="https://www.ncbi.nlm.nih.gov/genome/$ncbiGenomeId" TARGET=_blank>
     <IMG SRC="../images/$sciUnderscore.$imgExtn" WIDTH=$width HEIGHT=$height ALT="$genome"></A>
   </TD></TR>
   <TR><TD ALIGN=RIGHT>
     <FONT SIZE=-1><em>$scientificName</em><BR>
     </FONT>
     <FONT SIZE=-2>
       (<A HREF="$photoCreditURL"
       TARGET=_blank>$photoCreditName</A>)
     </FONT>
   </TD></TR>
 </TABLE>
 
 <P>
 <B>UCSC Genome Browser assembly ID:</B> $db<BR>
 <B>Sequencing/Assembly provider ID:</B> $assemblyLabel $providerAssemblyName<BR>
 <B>Assembly date:</B> $assemblyDate<BR>
 <B>Accession ID:</B> $genBankAccessionID<BR>
 _EOF_
   ;
   if ($ncbiGenomeId ne "n/a") {
   printf $fh "<B>NCBI Genome ID:</B> <A HREF='https://www.ncbi.nlm.nih.gov/genome/$ncbiGenomeId'
 TARGET='_blank'>$ncbiGenomeId</A> ($scientificName)<BR>\n";
   } else {
   printf $fh "<B>NCBI Genome ID:</B> $ncbiGenomeId<BR>\n";
   }
   printf $fh "<B>NCBI Assembly ID:</B> <A HREF='https://www.ncbi.nlm.nih.gov/assembly/$ncbiAssemblyId'
 TARGET='_blank'>$ncbiAssemblyId</A><BR>
 <B>NCBI BioProject ID:</B> <A HREF='https://www.ncbi.nlm.nih.gov/bioproject/$ncbiBioProject'
 TARGET='_blank'>$ncbiBioProject</A><BR>\n";
   if ($ncbiBioSample ne "n/a") {
     printf $fh "<B>NCBI BioSample ID:</B> <A HREF='https://www.ncbi.nlm.nih.gov/biosample/$ncbiBioSample'
 TARGET='_blank'>$ncbiBioSample</A>\n";
   } else {
     printf $fh "<B>NCBI BioSample ID:</B> $ncbiBioSample<BR>\n";
   }
   print $fh <<_EOF_
 </P>
 <HR>
 <P>
 <B>Search the assembly:</B>
 <UL>
 <LI>
 <B>By position or search term: </B> Use the &quot;position or search term&quot;
 box to find areas of the genome associated with many different attributes, such
 as a specific chromosomal coordinate range; mRNA, EST, or STS marker names; or
 keywords from the GenBank description of an mRNA.
 <A HREF="../goldenPath/help/query.html">More information</A>, including sample
 queries.
 <LI>
 <B>By gene name: </B> Type a gene name into the &quot;search term&quot; box,
 choose your gene from the drop-down list, then press &quot;submit&quot; to go
 directly to the assembly location associated with that gene.
 <A HREF="../goldenPath/help/geneSearchBox.html">More information</A>.
 <LI>
 <B>By track type: </B> Click the &quot;track search&quot; button
 to find Genome Browser tracks that match specific selection criteria.
 <A HREF="../goldenPath/help/trackSearch.html">More information</A>.
 </UL>
 </P>
 <HR>
 <P>
 <B>Download sequence and annotation data:</B>
 <UL>
 <LI><A HREF="../goldenPath/help/ftp.html">Using rsync</A> (recommended)
 <LI><A HREF="ftp://hgdownload.soe.ucsc.edu/goldenPath/$db/">Using FTP</A>
 <LI><A HREF="http://hgdownload.soe.ucsc.edu/downloads.html#$anchorRoot">Using HTTP</A>
 <LI><A HREF="../goldenPath/credits.html#${anchorRoot}_use">Data use conditions and
 restrictions</A>
 <LI><A HREF="../goldenPath/credits.html#${anchorRoot}_credits">Acknowledgments</A>
 </UL>
 </P>
 _EOF_
   ;
   close($fh);
 } # makeDescription
 
 # from Perl Cookbook Recipe 2.17, print out large numbers with comma
 # delimiters:
 sub commify($) {
     my $text = reverse $_[0];
     $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
     return scalar reverse $text
 }
 
 # definition of contig types in the AGP file
 my %goldTypes = (
 'A' => 'active finishing',
 'D' => 'draft sequence',
 'F' => 'finished sequence',
 'O' => 'other sequence',
 'P' => 'pre draft',
 'W' => 'whole genome shotgun'
 );
 # definition of gap types in the AGP file
 my %gapTypes = (
 'clone' => 'gaps between clones in scaffolds',
 'heterochromatin' => 'heterochromatin gaps',
 'short_arm' => 'a gap inserted at the start of an acrocentric chromosome',
 'telomere' => 'telomere gaps',
 'repeat' => 'an unresolvable repeat',
 'centromere' => 'gaps for centromeres are included when they can be reasonably localized',
 'scaffold' => 'gaps between scaffolds in chromosome assemblies',
 'contig' => 'gaps between contigs in scaffolds',
 'other' => 'gaps added at UCSC to annotate strings of <em>N</em>s that were not marked in the AGP file',
 'fragment' => 'gaps between whole genome shotgun contigs'
 );
 
 
 sub makeLocalTrackDbRa {
   # Make an assembly-level trackDb.ra, gap.html and gold.html.
   my @localFiles = qw( trackDb.ra gap.html gold.html );
 
   my $fh = &HgAutomate::mustOpen(">$topDir/html/trackDb.ra");
   print $fh <<_EOF_
 # Local declaration so that local gold.html is picked up.
 track gold override
 html gold
 
 # Local declaration so that local gap.html is picked up.
 track gap override
 html gap
 
 _EOF_
   ;
   close($fh);
 
   $fh = &HgAutomate::mustOpen(">$topDir/html/gap.html");
   my $em = $commonName ? "" : "<em>";
   my $noEm = $commonName ? "" : "</em>";
   my $gapCount = `hgsql -N -e 'select count(*) from gap;' $db`;
   chomp $gapCount;
   if ($gotAgp) {
     print $fh <<_EOF_
 <H2>Description</H2>
 <P>
 This track shows the gaps in the $assemblyDate $em\$organism$noEm genome assembly.
 </P>
 <P>
 Genome assembly procedures are covered in the NCBI
 <A HREF="https://www.ncbi.nlm.nih.gov/assembly/basics/"
 TARGET=_blank>assembly documentation</A>.<BR>
 NCBI also provides
 <A HREF="https://www.ncbi.nlm.nih.gov/assembly/$ncbiAssemblyId"
 TARGET="_blank">specific information about this assembly</A>.
 </P>
 <P>
 The definition of the gaps in this assembly is from the
 <A HREF="ftp://hgdownload.soe.ucsc.edu/goldenPath/$db/bigZips/$db.agp.gz"
 TARGET=_blank>AGP file</A> delivered with the sequence.  The NCBI document
 <A HREF="https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/"
 TARGET=_blank>AGP Specification</A> describes the format of the AGP file.
 </P>
 <P>
 Gaps are represented as black boxes in this track.
 If the relative order and orientation of the contigs on either side
 of the gap is supported by read pair data,
 it is a <em>bridged</em> gap and a white line is drawn
 through the black box representing the gap.
 </P>
 _EOF_
     ;
     if ($gapCount > 0) {
       print $fh "<P>This assembly contains the following principal types of gaps:
 <UL>\n";
     } else {
       print $fh "<P>This assembly has no annotated gaps.\n</P>\n";
     }
     open (GL, "hgsql -N -e 'select type from gap;' $db | sort | uniq -c | sort -n|") or die "can not select type from $db.gap table";
     while (my $line = <GL>) {
         chomp $line;
         $line =~ s/^\s+//;
         my ($count, $type) = split('\s+', $line);
         my $minSize = `hgsql -N -e 'select min(size) from gap where type="$type";' $db`;
         chomp $minSize;
         my $maxSize = `hgsql -N -e 'select max(size) from gap where type="$type";' $db`;
         chomp $maxSize;
         my $sizeMessage = "";
         if ($minSize == $maxSize) {
             $sizeMessage = sprintf ("all of size %s bases", commify($minSize));
         } else {
             $sizeMessage = sprintf ("size range: %s - %s bases",
                 commify($minSize), commify($maxSize));
         }
         if (exists ($gapTypes{$type}) ) {
             printf $fh "<LI><B>%s</B> - %s (count: %s; %s)</LI>\n", $type,
                 $gapTypes{$type}, commify($count), $sizeMessage;
         } else {
             die "makeLocalTrackDbRa: missing AGP gap type definition: $type";
         }
     }
     close (GL);
     if ($gapCount > 0) {
       print $fh "</UL></P>\n";
     }
   } else {
     print $fh <<_EOF_
 <H2>Description</H2>
 This track depicts gaps in the draft assembly ($assemblyDate, $assemblyLabel)
 of the $em\$organism$noEm genome.
 
   *** Developer: remove this statement if no future assemblies are expected:
 
 Many of these gaps &mdash; with the
 exception of intractable heterochromatic gaps &mdash; may be closed during the
 finishing process.
 <P>
 Gaps are represented as black boxes in this track.
 If the relative order and orientation of the contigs on either side
 of the gap is known, it is a <em>bridged</em> gap and a white line is drawn
 through the black box representing the gap.
 </P>
 <P>
 This assembly was sequenced with paired reads taken from
 
   *** Developer: check if this is accurate:
 
 plasmids and fosmids of various sizes.
 
 Overlapping reads were merged into contigs,
 and pairing information was then used to join the contigs into scaffolds.
 The gap sizes are estimated from the size of the
 
 plasmids and fosmids,
 
 with a minimum gap size of $fakeAgpMinContigGap.
 </P>
 <P></P>
 <P>This assembly contains the following types of gaps:
 <UL>
 <LI><B>Contig</B> - gaps of size $fakeAgpMinContigGap to $fakeAgpMinScaffoldGap.
 <LI><B>Scaffold</B> - gaps greater than $fakeAgpMinScaffoldGap in size.
 </UL>
 </P>
 _EOF_
     ;
   }
   close($fh);
 
   $fh = &HgAutomate::mustOpen(">$topDir/html/gold.html");
   if ($gotAgp) {
     print $fh <<_EOF_
 <H2>Description</H2>
 <P>
 This track shows the sequences used in the $assemblyDate $em\$organism$noEm genome assembly.
 </P>
 <P>
 Genome assembly procedures are covered in the NCBI
 <A HREF="https://www.ncbi.nlm.nih.gov/assembly/basics/"
 TARGET=_blank>assembly documentation</A>.<BR>
 NCBI also provides
 <A HREF="https://www.ncbi.nlm.nih.gov/assembly/$ncbiAssemblyId"
 TARGET="_blank">specific information about this assembly</A>.
 </P>
 <P>
 The definition of this assembly is from the
 <A HREF="ftp://hgdownload.soe.ucsc.edu/goldenPath/$db/bigZips/$db.agp.gz"
 TARGET=_blank>AGP file</A> delivered with the sequence.  The NCBI document
 <A HREF="https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/"
 TARGET=_blank>AGP Specification</A> describes the format of the AGP file.
 </P>
 <P>
 In dense mode, this track depicts the contigs that make up the
 currently viewed scaffold.
 Contig boundaries are distinguished by the use of alternating gold and brown
 coloration. Where gaps
 exist between contigs, spaces are shown between the gold and brown
 blocks.  The relative order and orientation of the contigs
 within a scaffold is always known; therefore, a line is drawn in the graphical
 display to bridge the blocks.</P>
 <P>
 Component types found in this track (with counts of that type in parentheses):
 <UL>
 _EOF_
     ;
     open (GL, "hgsql -N -e 'select type from gold;' $db | sort | uniq -c | sort -rn|") or die "can not select type from $db.gold table";
     while (my $line = <GL>) {
         chomp $line;
         $line =~ s/^\s+//;
         my ($count, $type) = split('\s+', $line);
         my $singleMessage = "";
         if ((1 == $count) && (("F" eq $type) || ("O" eq $type))) {
             my $chr = `hgsql -N -e 'select chrom from gold where type=\"$type\";' $db`;
             my $frag = `hgsql -N -e 'select frag from gold where type=\"$type\";' $db`;
             chomp $chr;
             chomp $frag;
             $singleMessage = sprintf("(%s/%s)", $chr, $frag);
         }
         if (exists ($goldTypes{$type}) ) {
             if (length($singleMessage)) {
                 printf $fh "<LI>%s - one %s %s</LI>\n", $type, $goldTypes{$type}, $singleMessage;
             } else {
                 printf $fh "<LI>%s - %s (%s)</LI>\n", $type, $goldTypes{$type}, commify($count);
             }
         } else {
             die "makeLocalTrackDbRa: missing AGP contig type definition: $type";
         }
     }
     close (GL);
     printf $fh "</UL></P>\n";
   } else {
     print $fh <<_EOF_
 <H2>Description</H2>
 <P>
 This track shows the draft assembly ($assemblyDate, $assemblyLabel)
 of the $em\$organism$noEm genome.
 
   *** Developer: check if this is accurate:
 
 Whole-genome shotgun reads were assembled into contigs.  When possible,
 contigs were grouped into scaffolds (also known as &quot;supercontigs&quot;).
 The order, orientation and gap sizes between contigs within a scaffold are
 based on paired-end read evidence. </P>
 <P>
 Locations of contigs and scaffolds were deduced from runs of Ns in the
 assembled sequence.  A run of Ns of more than $fakeAgpMinScaffoldGap bases
 was assumed to be a gap between scaffolds, and a run of Ns between
 $fakeAgpMinContigGap and $fakeAgpMinScaffoldGap was assumed to be a gap
 between contigs.</P>
 <P>
 In dense mode, this track depicts the contigs that make up the
 currently viewed scaffold.
 Contig boundaries are distinguished by the use of alternating gold and brown
 coloration. Where gaps
 exist between contigs, spaces are shown between the gold and brown
 blocks.  The relative order and orientation of the contigs
 within a scaffold is always known; therefore, a line is drawn in the graphical
 display to bridge the blocks.</P>
 <P>
 All components within this track are of fragment type &quot;W&quot;:
 whole genome shotgun contig. </P>
 _EOF_
     ;
   }
   close($fh);
 
   my $localFiles = "$topDir/html/{" . join(',', @localFiles) . '}';
   return $localFiles;
 } # makeLocalTrackDbRa
 
 sub makeTrackDb {
 # * step: trackDb [dbHost]
   my $runDir = "$topDir/TemporaryTrackDbCheckout";
   &HgAutomate::mustMkdir($runDir);
   &HgAutomate::mustMkdir("$topDir/html");
   &makeDescription();
   my $localFiles = &makeLocalTrackDbRa();
   my $whatItDoes =<<_EOF_
 It makes a local checkout of kent/src/hg/makeDb/trackDb/
 and makes *suggested* modifications.  Checking in modified files, and adding
 new files, is left up to the user in case this script has made some wrong
 guesses about proper names and locations.
 _EOF_
   ;
   my $bossScript = new HgRemoteScript("$scriptDir/makeTrackDb.csh", $dbHost,
 				      $runDir, $whatItDoes, $CONFIG);
 
   $bossScript->add(<<_EOF_
 # These directories are necessary for running make in trackDb:
 $HgAutomate::git archive --remote=git://genome-source.soe.ucsc.edu/kent.git \\
   --prefix=kent/ HEAD src/hg/makeDb/trackDb/loadTracks \\
 src/hg/makeDb/trackDb/$dbDbSpeciesDir \\
 src/hg/makeDb/trackDb/trackDb.transMap.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.ra \\
+src/hg/makeDb/trackDb/trackDb.encode3.ra \\
+src/hg/makeDb/trackDb/trackDb.chainNet.asmHub.ra \\
 src/hg/makeDb/trackDb/crisprAll.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.primates.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.other.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.euarchontoglires.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.laurasiatheria.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.afrotheria.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.mammal.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.birds.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.sarcopterygii.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.fish.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.insects.ra \\
 src/hg/makeDb/trackDb/trackDb.chainNet.nematode.ra \\
 src/hg/makeDb/trackDb/chainNetPetMar1.ra \\
 src/hg/makeDb/trackDb/chainNetPetMar2.ra \\
 src/hg/makeDb/trackDb/trackDb.nt.ra \\
 src/hg/makeDb/trackDb/joinedRmskComposite.ra \\
 src/hg/makeDb/trackDb/joinedRmsk.ra \\
 src/hg/makeDb/trackDb/trackDb.genbank.ra \\
 src/hg/makeDb/trackDb/trackDb.genbank.new.ra \\
 src/hg/makeDb/trackDb/trackDb.uniprot.ra \\
 src/hg/makeDb/trackDb/crispr10K.ra \\
 src/hg/makeDb/trackDb/crisprAll.ra \\
 src/hg/makeDb/trackDb/tagTypes.tab \\
 src/hg/lib/trackDb.sql \\
 src/hg/lib/hgFindSpec.sql \\
 src/hg/makeDb/trackDb/trackDb.ra | tar xf -
 
 cd kent/src/hg/makeDb/trackDb
 
 # Create the expected species-level directory for $db if necessary:
 mkdir -p $dbDbSpeciesDir/$db
 
 # Move local description files into place:
 mv $localFiles $dbDbSpeciesDir/$db/
 
 if (1 == $forceDescription) then
   rm -f $dbDbSpeciesDir/$db/description.html
 endif
 # Copy the template description.html for $db into place, and link to it
 # from $HgAutomate::gbdb/$db/html/ .
 if (! -e $dbDbSpeciesDir/$db/description.html) then
   cp -p $topDir/html/description.html $dbDbSpeciesDir/$db/description.html
 endif
 mkdir -p $HgAutomate::gbdb/$db/html
 rm -f $HgAutomate::gbdb/$db/html/description.html
 ln -s $topDir/html/description.html $HgAutomate::gbdb/$db/html/
 
 # Do a test run with the generated files:
 ./loadTracks trackDb_\${USER} hgFindSpec_\${USER} $db
 _EOF_
   );
 
   $bossScript->execute();
 
   $endNotes .=<<_EOF_
 
 Template trackDb.ra and .html's have been created, but they all need editing!
 
 cd $runDir/kent/src/hg/makeDb/trackDb/$dbDbSpeciesDir/$db
 
 Search for '***' notes in each file in and make corrections (sometimes the
 files used for a previous assembly might make a better template):
   description.html $localFiles
 
 Then copy these files to your ~/kent/src/hg/makeDb/trackDb/$dbDbSpeciesDir/$db
  - cd ~/kent/src/hg/makeDb/trackDb
  - edit makefile to add $db to DBS.
  - git add $dbDbSpeciesDir/$db/*.{ra,html}
  - git commit -m "Added $db to DBS." makefile
  - git commit -m "Initial descriptions for $db." $dbDbSpeciesDir/$db/*.{ra,html}
  - git pull; git push
  - Run make update DBS=$db and make alpha when done.
  - (optional) Clean up $runDir
 
 _EOF_
   ;
 } # makeTrackDb
 
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 ($CONFIG) = @ARGV;
 
 #*** Force -verbose until this is really ready to go:
 $opt_verbose = 3 if (! $opt_verbose);
 
 &parseConfig($CONFIG);
 
 $endNotes = "";
 $bedDir = "$topDir/$HgAutomate::trackBuild";
 $scriptDir = "$topDir/jkStuff";
 if (-e "$topDir/chrom.sizes") {
   $chromBased = (`wc -l < $topDir/chrom.sizes` < $HgAutomate::splitThreshold);
 }
 
 $stepper->execute();
 
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'trackDb') ? "" :
   "  (through the '$stopStep' step)";
 
 HgAutomate::verbose(1,
 	"\n *** All done!$upThrough\n");
 
 if ($endNotes) {
   #*** Should mail this to $ENV{'USER'} so it's not so easy to ignore.
   #*** Does mail work on all of our machines??  Even if it works on one,
   #*** we can ssh it.  Should be in an HgAutomate routine.
   HgAutomate::verbose(0,
 		      "\n\nNOTES -- STUFF THAT YOU WILL HAVE TO DO --\n\n" .
 		      "$endNotes\n");
 }