src/hg/utils/automation/makeGenomeDb.pl 1.23

1.23 2009/08/06 18:22:31 hiram
Adding chrM entry to the AGP file
Index: src/hg/utils/automation/makeGenomeDb.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/makeGenomeDb.pl,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/utils/automation/makeGenomeDb.pl	8 Jun 2009 18:38:57 -0000	1.22
+++ src/hg/utils/automation/makeGenomeDb.pl	6 Aug 2009 18:22:31 -0000	1.23
@@ -1,1356 +1,1362 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file -- 
 # edit ~/kent/src/hg/utils/automation/makeGenomeDb.pl instead.
 
 # $Id$
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 
 my $stepper = new HgStepManager(
     [ { name => 'seq',     func => \&makeBuildDir },
       { name => 'agp',     func => \&checkAgp },
       { name => 'db',      func => \&makeDb },
       { name => 'dbDb',    func => \&makeDbDb },
       { name => 'trackDb', func => \&makeTrackDb },
     ]
 			       );
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $maxMitoSize = 25000;  ## Can be overridden in the config.ra file
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base config.ra
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => '',
 					        'fileServer' => '');
 print STDERR <<_EOF_
     -noGoldGapSplit       do not split the gold/gap tables even when chrom based
 _EOF_
   ;
   print STDERR "
 Automates the first steps of building a genome database:
     seq:     Translates fasta into /hive/data/genomes/\$db/\$db.unmasked.2bit .
     agp:     Checks consistency of fasta and AGP (or runs hgFakeAgp).
     db:      Creates the genome database and basic tables.
     dbDb:    Adds the genome database to central database tables.
     trackDb: Creates a trackDb/ checkout and adds -- but does not check in --
              template trackDb.ra and assembly description .html files.
 config.ra describes the species, assembly and downloaded files.
 To see detailed information about what should appear in config.ra,
 run \"$base -help\".
 ";
   # Detailed help (-help):
   print STDERR "
 -----------------------------------------------------------------------------
 Required config.ra settings:
 
 db xxxYyyN
   - The UCSC database name and assembly identifier.  xxx is the first three
     letters of the genus and Yyy is the first three letters of the species.
     N is the sequence number of this species' build at UCSC.  For some
     species that we have been processing since before this convention, the
     pattern is xyN.
 
 scientificName Xxxxxx yyyyyy
   - The genus and species name.
 
 assemblyDate Mmm. YYYY
   - The month and year in which this assembly was released by the sequencing
     center.
 
 assemblyLabel XXXXXX
   - The sequencing center's label or version identifier for this release.
 
 orderKey NN
   - A priority number (for the central database's dbDb.orderKey column)
     that will determine db's relative position in the assembly menu.
 
 mitoAcc XXXXXXX
   - A numeric Genbank identifier (\"gi number\") for the complete
     mitochondrial sequence of this species, or \"none\" to disable fetching
     and inclusion in the build.  To determine this, do an Entrez Nucleotide
     query on \"Xxxxxx yyyyyy mitochondrion complete genome\" and choose a
     result that looks best (there may be multiple complete sequences in
     Genbank, or there may be none in which case just say \"none\").
 
 fastaFiles /path/to/downloaded.fa
   - A complete path, possibly with wildcard characters, to FASTA sequence
     files which have already been downloaded from the sequencing center.
 
 dbDbSpeciesDir dirName
   - The name of the subdirectory of trackDb/ in which the \$db subdirectory
     will be added.  For vertebrates, this is often a lower-cased common name,
     but various patterns have been used especially for non-vertebrates.
 
 taxId ncbiTaxId
   - The NCBI numeric taxonomy id.  If a pseudo-genome, like a reconstruction
     is being built, specify 0.
 
 -----------------------------------------------------------------------------
 Conditionally required config.ra settings:
 
 1. Required when AGP is not included in the assembly.  In this case we run
    hgFakeAgp to deduce gap annotations from runs of Ns in the sequence.
    The faGapSizes program shows a histogram that emphasizes overrepresented
    round-number sizes, which are decent hints for these parameters.
 
 fakeAgpMinContigGap NN
   - hgFakeAgp -minContigGap param.  Minimum size for a run of Ns that
     will be considered a bridged gap with type \"contig\".
 
 fakeAgpMinScaffoldGap NN
   - hgFakeAgp -minScaffoldGap param.  Minimum size for a run of Ns that
     will be considered an unbridged gap with type \"scaffold\".
 
 
 2. Required when the central database table genomeClade does not yet have a
    row for this species:
 
 clade cccccc
   - UCSC's \"clade\" category for this species, as specified in the central
     database tables clade and genomeClade.
 
 genomeCladePriority NN
   - Relative priority of this species compared to others in the same clade.
     This central database query can be helpful in picking a value:
     select * from genomeClade where clade = '\$clade' order by priority;
 
 
 -----------------------------------------------------------------------------
 Optional config.ra settings:
 
 commonName Xxxx
   - Common name to use as the menu label for this species (and in central
     database's genome column) instead of abbreviated scientific name.
 
 agpFiles /path/to/downloaded.agp
   - A complete path, possibly with wildcard characters, to AGP files
     which have already been downloaded from the sequencing center.
 
 qualFiles [/path/to/downloaded.qual | /path/to/qacAgpLift-ed.qac]
   - A complete path, possibly with wildcard characters, to quality score
     files which have already been downloaded from the sequencing center,
     or a complete path to a single .qac file (in case you need to pre-process
     qual files with qaToQac | qacAgpLift, for example).
 
 mitoSize N
   - to override the internal default of max size for mitochondrial
     sequence of $maxMitoSize e.g. for yeast: mitoSize 90000
 " if ($detailed);
   print STDERR "\n";
   exit $status;
 } # usage
 
 
 # Globals:
 # Command line argument:
 my $CONFIG;
 # Command line option vars:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 # specific command line options
 use vars qw/
     $opt_noGoldGapSplit
     /;
 
 # Required config parameters:
 my ($db, $scientificName, $assemblyDate, $assemblyLabel, $orderKey,
     $mitoAcc, $fastaFiles, $dbDbSpeciesDir, $taxId);
 # Conditionally required config parameters:
 my ($fakeAgpMinContigGap, $fakeAgpMinScaffoldGap,
     $clade, $genomeCladePriority);
 # Optional config parameters:
 my ($commonName, $agpFiles, $qualFiles, $mitoSize);
 # Other globals:
 my ($gotMito, $gotAgp, $gotQual, $topDir, $chromBased);
 my ($bedDir, $scriptDir, $endNotes);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      @HgAutomate::commonOptionSpec,
 		      "noGoldGapSplit",
 		     );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if (defined $opt_dbHost);
 } # checkOptions
 
 sub requireVar {
   # Ensure that var is in %config and return its value.
   # Remove it from %config so we can check for unrecognized contents.
   my ($var, $config) = @_;
   my $val = $config->{$var}
     || die "Error: $CONFIG is missing required variable \"$var\".\n" .
       "For a detailed list of required variables, run \"$base -help\".\n";
   delete $config->{$var};
   return $val;
 } # requireVar
 
 sub optionalVar {
   # If var has a value in %config, return it.
   # Remove it from %config so we can check for unrecognized contents.
   my ($var, $config) = @_;
   my $val = $config->{$var};
   delete $config->{$var} if ($val);
   return $val;
 } # optionalVar
 
 sub parseConfig {
   # Parse config.ra file, make sure it contains the required variables.
   my ($configFile) = @_;
   my %config = ();
   my $fh = &HgAutomate::mustOpen($configFile);
   while (<$fh>) {
     next if (/^\s*#/ || /^\s*$/);
     if (/^\s*(\w+)\s*(.*)$/) {
       my ($var, $val) = ($1, $2);
       if (! exists $config{$var}) {
 	$config{$var} = $val;
       } else {
 	die "Duplicate definition for $var line $. of config file $configFile.\n";
       }
     } else {
       die "Can't parse line $. of config file $configFile:\n$_\n";
     }
   }
   close($fh);
   # Required variables.
   $db = &requireVar('db', \%config);
   $scientificName = &requireVar('scientificName', \%config);
   $assemblyDate = &requireVar('assemblyDate', \%config);
   $assemblyLabel = &requireVar('assemblyLabel', \%config);
   $orderKey = &requireVar('orderKey', \%config);
   $mitoAcc = &requireVar('mitoAcc', \%config);
   $fastaFiles = &requireVar('fastaFiles', \%config);
   $dbDbSpeciesDir = &requireVar('dbDbSpeciesDir', \%config);
   $taxId = &requireVar('taxId', \%config);
   # Conditionally required variables -- optional here, but they might be
   # required later on in some cases.
   $fakeAgpMinContigGap = &optionalVar('fakeAgpMinContigGap', \%config);
   $fakeAgpMinScaffoldGap = &optionalVar('fakeAgpMinScaffoldGap', \%config);
   $clade = &optionalVar('clade', \%config);
   $genomeCladePriority = &optionalVar('genomeCladePriority', \%config);
   # Optional variables.
   $commonName = &optionalVar('commonName', \%config);
   $agpFiles = &optionalVar('agpFiles', \%config);
   $qualFiles = &optionalVar('qualFiles', \%config);
   $mitoSize = &optionalVar('mitoSize', \%config);
   # Make sure no unrecognized variables were given.
   my @stragglers = sort keys %config;
   if (scalar(@stragglers) > 0) {
     die "Error: config file $CONFIG has unrecognized variables:\n" .
       "    " . join(", ", @stragglers) . "\n" .
       "For a detailed list of supported variables, run \"$base -help\".\n";
   }
   $gotMito = ($mitoAcc ne 'none');
   $gotAgp = (defined $agpFiles);
   $gotQual = (defined $qualFiles);
   $topDir = "/cluster/data/$db";
 } # parseConfig
 
 
 #########################################################################
 # * step: seq [workhorse]
 
 sub mkClusterDataLink {
   # Make a /cluster/data/$tDb/ -> /cluster/store?/$tDb/ if it doesn't exist
   if (! -e "$topDir") {
     my $clusterStore = &HgAutomate::choosePermanentStorage();
     &HgAutomate::mustMkdir("$clusterStore/$db");
     # Don't use HgAutomate::run because this needs to happen despite -debug:
     (system("ln -s $clusterStore/$db $topDir") == 0)
       || die "Couldn't ln -s $clusterStore/$db $topDir\n";
   }
 } # mkClusterDataLink
 
 sub getMito {
   # Get the mitochondrion from Genbank (if specified) and rename it chrM
   if ($gotMito) {
     my $whatItDoes =
 	"It fetches mitochondrial sequence from GenBank and renames it chrM.";
     # Use $dbHost because this needs to wget and some of our workhorses
     # can't do that, and this is computationally cheap.
     my $bossScript = new HgRemoteScript("$scriptDir/getMito.csh",
 					$dbHost, $topDir, $whatItDoes,
 					$CONFIG);
     if ($mitoSize) {
 	$maxMitoSize = $mitoSize;
     }
     my $mitoFile = "$topDir/M/$mitoAcc.fa";
     $bossScript->add(<<_EOF_
 mkdir M
 wget -O $mitoFile \\
   'http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc'
 
 # old url  'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=$mitoAcc&retmode=text'
 
 
 # Make sure there's exactly one fasta record:
 if (`grep '^>' $mitoFile | wc -l` != 1) then
   echo "getMito: $mitoFile"
   echo "         does not have exactly one fasta record"
   exit 1
 endif
 
 # Make sure what we got is of about the right size:
 set mSize = `faSize $mitoFile | grep bases | awk '{print \$1;}'`
 if (\$mSize < 10000 || \$mSize > $maxMitoSize) then
   echo "getMito: $mitoFile"
   echo "         fasta size \$mSize is out of expected range [10000, $maxMitoSize]"
   exit 1
 endif
 
 # Make sure the fasta header looks plausible:
 set keyCount = 0
 set header = `grep '^>' $mitoFile`
 foreach keyword ('$scientificName' mitochondr complete genome)
   set count = `echo \$header | grep -i "\$keyword" | wc -l`
   set keyCount = `expr \$keyCount + \$count`
 end
 if (\$keyCount < 2) then
   echo "getMito: $mitoFile"
   echo "         fasta header does not have very many expected keywords"
   echo "         (looking for '$scientificName' mitochondr complete genome)"
   echo "         Here is the header:"
   echo \$header
   exit 1
 endif
 
 # Make chrM.fa with >chrM:
 sed -e 's/^>.*/>chrM/' $mitoFile > $topDir/M/chrM.fa
 rm $mitoFile
 _EOF_
     );
     $bossScript->execute();
   }
 } # getMito
 
 sub makeUnmasked2bit {
   my ($workhorse) = @_;
   my $whatItDoes =
 "It installs assembly fasta (and agp, if given) files in the usual dirs.";
   my $bossScript = new HgRemoteScript("$scriptDir/makeUnmasked2bit.csh",
 				      $workhorse, $topDir, $whatItDoes,
 				      $CONFIG);
 
   my $chrM = $gotMito ? "$topDir/M/chrM.fa" : "";
   # Forget the fasta, just make an unmasked 2bit for now!
   # If we really need a fasta hierarchy, we can make it later, after
   # RepeatMasking!!  The whole point of the thing is to provide a
   # structure for the RM run and subsequent masking of the sequence.
   # But the RM+masking task can structure itself -- it can simply make
   # masked .2bit, and then subsequent jobs can do their own split and
   # distribute if necessary (or better yet just use 2bit specs).
 
   # possibilities for $fastaFiles:
   # {.fa, .fa.gz}
   # {many single-fasta, one multi-fasta, several multi-fasta}
 
   # If AGP is given:
   # 1. if first column of AGP matches sequence names, use as-is.
   # 2. if sixth column of AGP matches seqnames, assemble with agpToFa.
 
   # installation options:
   # 1. chrom-based (numSeqs <= 100): directories containing single-fasta,
   #    per-sequence files; randoms lumped into same directories with reals.
   # 2. scaffold-based: scaffolds.fa
   # Traditionally we have split to 5M in prep for RM -- but let the RM take
   # care of its own splitting (better yet, make it use 2bit specs).
 
   my $acat = "cat";
   my $fcat = "cat";
   foreach my $file (`ls $fastaFiles 2> /dev/null`) {
     if ($file =~ m/\.gz$/) {
       $fcat = "zcat";
       last;
     }
   }
   foreach my $file (`ls $agpFiles 2> /dev/null`) {
     if ($file =~ m/\.gz$/) {
       $acat = "zcat";
       last;
     }
   }
   if ($gotAgp) {
     $bossScript->add(<<_EOF_
 # Get sorted IDs from fasta sequence files:
 set fastaIds = `mktemp -p /tmp makeGenomeDb.fastaIds.XXXXXX`
 $fcat $fastaFiles | grep '^>' | perl -wpe 's/^>(\\S+).*/\$1/' | sort \\
   > \$fastaIds
 
 # Get sorted "big" (1st column) and "little" (6th column) IDs from AGP files:
 set agpBigIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
 $acat $agpFiles | awk '{print \$1;}' | sort -u \\
   > \$agpBigIds
 set agpLittleIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
 $acat $agpFiles | awk '\$5 != "N" {print \$6;}' | sort -u \\
   > \$agpLittleIds
 
 # Compare fasta IDs to first and sixth columns of AGP:
 set diffBigCount = `comm -3 \$fastaIds \$agpBigIds | wc -l`
 set diffLittleCount = `comm -3 \$fastaIds \$agpLittleIds | wc -l`
 
 # If AGP "big" IDs match sequence IDs, use sequence as-is.
 # If AGP "little" IDs match sequence IDs, assemble sequence with agpToFa.
 if (\$diffLittleCount == 0) then
   set agpTmp = `mktemp -p /tmp makeGenomeDb.agp.XXXXXX`
   $acat $agpFiles > \$agpTmp
   $fcat $fastaFiles \\
   | agpToFa -simpleMultiMixed \$agpTmp all stdout stdin \\
   | faToTwoBit -noMask stdin $chrM $db.unmasked.2bit
   rm -f \$agpTmp
 else if (\$diffBigCount == 0) then
   faToTwoBit -noMask $fastaFiles $chrM $db.unmasked.2bit
 else
   echo "Error: IDs in fastaFiles ($fastaFiles)"
   echo "do not perfectly match IDs in either the first or sixth columns of"
   echo "agpFiles ($agpFiles)"
   echo "Please examine and then remove these temporary files:"
   echo "  \$fastaIds -- fasta IDs"
   echo "  \$agpBigIds -- AGP first column IDs ('big' sequences)"
   echo "  \$agpLittleIds -- AGP sixth column IDs ('little' sequences)"
   exit 1
 endif
 rm -f \$fastaIds \$agpBigIds \$agpLittleIds
 _EOF_
     );
   } else {
     # No AGP -- just make an unmasked 2bit.
     $bossScript->add(<<_EOF_
 faToTwoBit -noMask $fastaFiles $chrM $db.unmasked.2bit
 _EOF_
     );
   }
 
   # Having made the unmasked .2bit, make chrom.sizes and chromInfo.tab:
   $bossScript->add(<<_EOF_
 
 twoBitInfo $db.unmasked.2bit stdout | sort -k2nr > chrom.sizes
 
 rm -rf $HgAutomate::trackBuild/chromInfo
 mkdir $HgAutomate::trackBuild/chromInfo
 awk '{print \$1 "\t" \$2 "\t$HgAutomate::gbdb/$db/$db.2bit";}' chrom.sizes \\
   > $HgAutomate::trackBuild/chromInfo/chromInfo.tab
 _EOF_
     );
   if ($gotAgp) {
     $bossScript->add(<<_EOF_
 
 if (`wc -l < chrom.sizes` < 1000) then
   # Install per-chrom .agp files for download.
   $acat $agpFiles \\
   | splitFileByColumn -col=1 -ending=.agp stdin $topDir -chromDirs
 endif
 _EOF_
       );
   }
   $bossScript->execute();
 
   # Now that we have created chrom.sizes (unless we're in -debug mode),
   # re-evaluate $chromBased for subsequent steps:
   if (-e "$topDir/chrom.sizes") {
     $chromBased = (`wc -l < $topDir/chrom.sizes` < $HgAutomate::splitThreshold);
   }
 } # makeUnmasked2bit
 
 sub makeBuildDir {
   # * step: seq [workhorse]
   &mkClusterDataLink();
   &HgAutomate::checkCleanSlate('seq', 'agp',
 			       "$topDir/M", "$topDir/chrom.sizes");
   &HgAutomate::mustMkdir($scriptDir);
   &HgAutomate::mustMkdir($bedDir);
   my $workhorse = &HgAutomate::chooseWorkhorse();
   &getMito();
   &makeUnmasked2bit($workhorse);
 } # makeBuildDir
 
 
 #########################################################################
 # * step: agp [workhorse]
 
 sub requireFakeAgpParams {
   # When assembly does not include AGP, run hgFakeAgp and require the
   # developer to specify its parameters.  If not specified, run faGapSizes
   # to give some hints.
   my $problem = 0;
   if (! defined $fakeAgpMinContigGap) {
     warn "Error: $CONFIG is missing required variable " .
       "\"fakeAgpMinContigGap\".\n";
     $problem = 1;
   }
   if (! defined $fakeAgpMinScaffoldGap) {
     warn "Error: $CONFIG is missing required variable " .
       "\"fakeAgpMinScaffoldGap\".\n";
     $problem = 1;
   }
   if ($problem) {
     warn "\nWhen the assembly does not include AGP, $CONFIG must specify " .
       "fakeAgpMinContigGap and fakeAgpMinScaffoldGap.\n";
     warn "Printing a gap-size histogram from faGapSizes to stdout... " .
       "Check for overrepresented round numbers.\n";
     warn "See usage messages of hgFakeAgp and faGapSizes for more hints.\n\n";
     my $fileServer = &HgAutomate::chooseFileServer($topDir);
     print "\n";
     &HgAutomate::run("$HgAutomate::runSSH $fileServer nice " .
 		     "twoBitToFa $topDir/$db.unmasked.2bit stdout " .
 		     "\\| faGapSizes stdin -niceSizes=" .
 		     "10,20,25,50,100,1000,2000,5000,10000,20000,50000");
     print "\n";
     exit 1;
     }
 }
 
 sub checkAgp {
   my $workhorse = &HgAutomate::chooseWorkhorse();
   # Check or generate AGP, depending on whether it was provided:
   if ($gotAgp) {
     &HgAutomate::checkCleanSlate('agp', 'db', "$topDir/$db.agp");
     my $whatItDoes = "It checks consistency of AGP and fasta.";
     my $bossScript = new HgRemoteScript("$scriptDir/checkAgpAndFa.csh",
 					$workhorse, $topDir, $whatItDoes,
 					$CONFIG);
     my $allAgp = "$topDir/$db.agp";
     # If we added chrM from GenBank, exclude it from fasta:
     my $exclude = ($mitoAcc eq 'none') ? "" : "-exclude=chrM";
     my $acat = "cat";
     foreach my $file (`ls $agpFiles 2> /dev/null`) {
 	if ($file =~ m/\.gz$/) {
 	    $acat = "zcat";
 	    last;
 	}
     }
     $bossScript->add(<<_EOF_
 # When per-chrom AGP and fasta files are given, it would be much more
 # efficient to run this one chrom at a time.  However, since the filenames
 # are arbitrary, I'm not sure how to identify the pairings of AGP and fa
 # files.  So cat 'em all together and check everything at once:
 
 $acat $agpFiles | sort -k1,1 -k2n,2n > $allAgp
 
 set result = `checkAgpAndFa $exclude $allAgp $db.unmasked.2bit | tail -1`
 
 if ("\$result" != 'All AGP and FASTA entries agree - both files are valid') then
   echo "Error: checkAgpAndFa failed\\!"
   echo "Last line of output: \$result"
   exit 1
 endif
 _EOF_
     );
     $bossScript->execute();
   } else {
     my $runDir = "$bedDir/hgFakeAgp";
     &HgAutomate::mustMkdir($runDir);
     &HgAutomate::checkCleanSlate('agp', 'db', "$runDir/$db.agp");
     &requireFakeAgpParams();
     my $whatItDoes = "It runs hgFakeAgp (because no AGP was provided).";
     my $bossScript = new HgRemoteScript("$runDir/doFakeAgp.csh",
 					$workhorse, $runDir, $whatItDoes,
 					$CONFIG);
     $bossScript->add(<<_EOF_
 twoBitToFa $topDir/$db.unmasked.2bit stdout \\
 | hgFakeAgp stdin $db.agp \\
   -minContigGap=$fakeAgpMinContigGap -minScaffoldGap=$fakeAgpMinScaffoldGap
 _EOF_
     );
     $bossScript->execute();
   }
 
 #*** Produce a gap report so we can say something sensible in gap.html.
 
 } # checkAgp
 
 
 #########################################################################
 # * step: db [dbHost]
 
 sub makeDb {
   # Create a database on hgwdev, grp, chromInfo, gold/gap.
   my $qual = (defined $qualFiles) ? " qual" : "";
   my $whatItDoes =
 "It creates the genome database ($db) and loads the most basic tables:
 chromInfo, grp, gap, gold,$qual and gc5Base.";
   my $bossScript = new HgRemoteScript("$scriptDir/makeDb.csh", $dbHost,
 				      $topDir, $whatItDoes, $CONFIG);
 
   # Actually, build some basic track files on $workhorse, then load.
   $qual = (defined $qualFiles) ? " qual and" : "";
   $whatItDoes = "It generates$qual gc5Base track files for loading.";
   my $workhorse = &HgAutomate::chooseWorkhorse();
   my $horseScript = new HgRemoteScript("$scriptDir/makeTrackFiles.csh",
 				       $workhorse,
 				       $topDir, $whatItDoes, $CONFIG);
   # Build quality wiggle track files (if provided).
   if (defined $qualFiles) {
     $horseScript->add(<<_EOF_
 # Translate qual files to wiggle encoding.  If there is a problem with
 # sequence names, you may need to tweak the original qual sequence names
 # and/or lift using qacAgpLft.
 mkdir -p $bedDir/qual
 cd $bedDir/qual
 _EOF_
       );
     if ($qualFiles =~ /^\S+\.qac$/) {
       # Single .qac file:
       $horseScript->add(<<_EOF_
 qacToWig -fixed $qualFiles stdout \\
 _EOF_
         );
     } else {
       # Possible wildcard of qual file(s):
       $horseScript->add(<<_EOF_
 if (`ls $qualFiles | grep \.gz | wc -l`) then
   set qcat = zcat
 else
   set qcat = cat
 endif
 \$qcat $qualFiles \\
 | qaToQac stdin stdout \\
 | qacToWig -fixed stdin stdout \\
 _EOF_
       );
     }
     $horseScript->add(<<_EOF_
 | wigEncode stdin qual.{wig,wib}
 
 _EOF_
     );
   }
 
   # Build gc5Base files.
   $horseScript->add(<<_EOF_
 # Make gc5Base wiggle files.
 mkdir -p $bedDir/gc5Base
 cd $bedDir/gc5Base
 hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 $db \\
   $topDir/$db.unmasked.2bit \\
 | wigEncode stdin gc5Base.{wig,wib}
 _EOF_
   );
 
   # Now start the database creation and loading commands.
   $bossScript->add (<<_EOF_
 hgsql '' -e 'create database $db'
 df -h /var/lib/mysql
 hgsql $db < \${HOME}/kent/src/hg/lib/grp.sql
 hgLoadSqlTab $db chromInfo \${HOME}/kent/src/hg/lib/chromInfo.sql \\
   $HgAutomate::trackBuild/chromInfo/chromInfo.tab
 _EOF_
   );
 
   my $allAgp = "$topDir/$db.agp";
   $allAgp = "$bedDir/hgFakeAgp/$db.agp" if (! $gotAgp);
   if ($chromBased) {
     $bossScript->add(<<_EOF_
 # Split AGP into per-chrom files/dirs so we can load split gold and gap tables.
 cp /dev/null chrom.lst.tmp
 foreach chr (`awk '{print \$1;}' chrom.sizes`)
   set c = `echo \$chr | sed -e 's/^chr//; s/_random\$//;'`
   mkdir -p \$c
   echo \$c >> chrom.lst.tmp
   awk '\$1 == "'\$chr'" {print;}' $allAgp > \$c/\$chr.agp
 end
 sort -u chrom.lst.tmp > chrom.lst
 rm chrom.lst.tmp
 _EOF_
     );
     if ($opt_noGoldGapSplit) {
       $bossScript->add("hgGoldGapGl -noGl $db $allAgp\n");
     } else {
       $bossScript->add("hgGoldGapGl -noGl -chromLst=chrom.lst $db $topDir .\n");
     }
   } else {
     $bossScript->add("hgGoldGapGl -noGl $db $allAgp\n");
   }
 
   if ($gotAgp && $gotMito) {
     # When we load gold/gap from assembly AGP, but pull in chrM sequence
     # separately, chrM is conspicuously absent from gold/gap -- so add a fake
     # entry for it in gold (so featureBits gold --> 100%) and if split tables,
     # make a token chrM_gap table.  Use bin=585 (512+ 64 + 8 + 1), the
     # smallest bin that starts at 0.  The smallest bin is 128k bases, which
     # should always cover the entire mitochondrial genome (typically ~16k).
     my $bin = 585;
     my $mitoGold = ($mitoAcc =~ /^\d+$/) ? "gi|$mitoAcc" : $mitoAcc;
     if (!$opt_noGoldGapSplit && ($chromBased || $opt_debug)) {
       my $defaultChrom = `head -1 $topDir/chrom.sizes | cut -f 1`;
       chomp $defaultChrom;
       $bossScript->add(<<_EOF_
 
 # Add fake chrM_{gap,gold} to make featureBits happy.
 set mSize = `faSize $topDir/M/chrM.fa | grep bases | awk '{print \$1;}'`
 hgsql $db -e 'drop table chrM_gap,chrM_gold;'
 hgsql $db -e \\
     'create table chrM_gap select * from ${defaultChrom}_gap where 0; \\
      create table chrM_gold select * from ${defaultChrom}_gold where 0; \\
      insert into chrM_gold \\
        values($bin,"chrM",0,'\$mSize',1,"F","$mitoGold",0,'\$mSize',"+");'
 _EOF_
       );
     } else {
       $bossScript->add(<<_EOF_
 
 # Add fake chrM entry in gold table to make featureBits happy.
 set mSize = `faSize $topDir/M/chrM.fa | grep bases | awk '{print \$1;}'`
 hgsql $db -e \\
     'insert into gold \\
         values($bin,"chrM",0,'\$mSize',1,"F","$mitoGold",0,'\$mSize',"+");'
 _EOF_
       );
     }
+    # may as well finally add the chrM entry to the agp file
+    $bossScript->add(<<_EOF_
+set lastId = `tail -1 $topDir/$db.agp | awk '{print \$4+1}'`
+/bin/echo -e "chrM\t1\t\$mSize\t\$lastId\tF\t$mitoGold\t1\t\$mSize\t+" >> $topDir/$db.agp
+_EOF_
+      );
   }
 
   $bossScript->add(<<_EOF_
 
 # Load gc5base
 mkdir -p $HgAutomate::gbdb/$db/wib
 rm -f $HgAutomate::gbdb/$db/wib/gc5Base.wib
 ln -s $bedDir/gc5Base/gc5Base.wib $HgAutomate::gbdb/$db/wib
 hgLoadWiggle -pathPrefix=$HgAutomate::gbdb/$db/wib \\
   $db gc5Base $bedDir/gc5Base/gc5Base.wig
 _EOF_
   );
   if (defined $qualFiles) {
     $bossScript->add(<<_EOF_
 
 # Load qual
 cd $bedDir/qual
 rm -f $HgAutomate::gbdb/$db/wib/qual.wib
 ln -s $bedDir/qual/qual.wib $HgAutomate::gbdb/$db/wib/
 hgLoadWiggle -pathPrefix=$HgAutomate::gbdb/$db/wib \\
   $db quality qual.wig
 _EOF_
     );
   }
 
   $bossScript->add(<<_EOF_
 rm -f wiggle.tab
 
 _EOF_
   );
   $horseScript->execute();
   $bossScript->execute();
 } # makeDb
 
 
 #########################################################################
 # * step: dbDb [dbHost]
 
 sub requireCladeAndPriority {
   # If the genomeClade table in the central database does not have a row
   # for this $db's genome, require user to specify clade and priority.
   my ($genome) = @_;
   my $problem = 0;
   if (! defined $clade) {
     warn "Error: $CONFIG is missing required variable " .
       "\"clade\".\n";
     $problem = 1;
   }
   if (! defined $genomeCladePriority) {
     warn "Error: $CONFIG is missing required variable " .
       "\"genomeCladePriority\".\n";
     $problem = 1;
   }
   if ($problem) {
     warn "\nCentral database table genomeClade does not have a row for " .
       "${db}'s genome \"$genome\",\n";
     warn "so $CONFIG must specify clade and genomeCladePriority.\n";
     warn "Examine the contents of the genomeClade table to get an idea " .
       "where \"$genome\" fits in.\n\n";
     exit 1;
   }
 }
 
 sub getGenome {
   # Return what we'll use in the genome (and organism) column of dbDb.
   # This is the label that appears in the gateway menu.
   my $genome;
   if ($commonName) {
     $genome = $commonName;
   } else {
     $genome = $scientificName;
     $genome =~ s/^(\w)\w+\s+(\w+)$/$1. $2/;
   }
   return $genome;
 }
 
 sub makeDbDb {
 # * step: dbDb [dbHost... or just direct to hgcentraltest??]
 # - create hgcentraltest entry, (if necessary) defaultDb and genomeClade
   my $genome = &getGenome();
   my $defaultPos;
   my ($seq, $size) = $opt_debug ? ("chr1", 1000) :
     split(/\s/, `head -1 "$topDir/chrom.sizes"`);
   my $start = int($size / 2);
   $size = ($start + 9999) if ($size > ($start + 9999));
   $defaultPos = "$seq:$start-$size";
 
   my $dbDbInsert = "$topDir/dbDbInsert.sql";
   my $fh = &HgAutomate::mustOpen(">$dbDbInsert");
   print $fh <<_EOF_
 DELETE from dbDb where name = "$db";
 INSERT INTO dbDb
     (name, description, nibPath, organism,
      defaultPos, active, orderKey, genome, scientificName,
      htmlPath, hgNearOk, hgPbOk, sourceName, taxId)
 VALUES
     ("$db", "$assemblyDate", "$HgAutomate::gbdb/$db", "$genome",
      "$defaultPos", 1, $orderKey, "$genome", "$scientificName",
      "$HgAutomate::gbdb/$db/html/description.html", 0, 0, "$assemblyLabel",
     $taxId);
 _EOF_
   ;
   close($fh);
   my $centDbSql = "$HgAutomate::runSSH $dbHost $HgAutomate::centralDbSql";
   &HgAutomate::run("$centDbSql < $dbDbInsert");
 
   # Add a row to defaultDb if this is the first usage of $genome.
   my $sql = "'select count(*) from defaultDb where genome = \"$genome\"'";
   if (`echo $sql | $centDbSql` == 0) {
     $sql = "'INSERT INTO defaultDb (genome, name) " .
       "VALUES (\"$genome\", \"$db\")'";
     &HgAutomate::run("echo $sql | $centDbSql");
   }
 
   # If $genome does not already appear in genomeClade, warn user that
   # they will have to manually add it.
   $sql = "'select count(*) from genomeClade where genome = \"$genome\"'";
   if (`echo $sql | $centDbSql` == 0) {
     &requireCladeAndPriority($genome);
     $sql = "'INSERT INTO genomeClade (genome, clade, priority) " .
       "VALUES (\"$genome\", \"$clade\", $genomeCladePriority)'";
     &HgAutomate::run("echo $sql | $centDbSql");
   }
 } # makeDbDb
 
 
 #########################################################################
 # * step: trackDb [dbHost]
 
 sub makeDescription {
   # Make a template description.html (for the browser gateway page).
   my $sciUnderscore = $scientificName;
   $sciUnderscore =~ s/ /_/g;
   my $genome = &getGenome();
   my $anchorRoot = lc($genome);
   $anchorRoot =~ s/\. /_/;
   $anchorRoot =~ s/ /_/;
   my $fh = &HgAutomate::mustOpen(">$topDir/html/description.html");
   print $fh <<_EOF_
 <!-- Display image in righthand corner -->
 <TABLE ALIGN=RIGHT BORDER=0 WIDTH=185>
   <TR><TD ALIGN=RIGHT>
     <A HREF="*** link to sequencing center's project page"
     TARGET=_blank>
     <IMG SRC="/images/$sciUnderscore.jpg" WIDTH=*** HEIGHT=*** ALT="$genome"></A>
   </TD></TR>
   <TR><TD ALIGN=RIGHT> 
     <FONT SIZE=-1><em>$scientificName</em><BR> 
     </FONT>
     <FONT SIZE=-2> 
       Photo courtesy of ***
       (<A HREF="*** link to source" 
       TARGET=_blank>***source</A>)
     </FONT>
   </TD></TR>
 </TABLE>
 
 <P>
 The $assemblyDate <em>$scientificName</em> draft assembly 
 ($assemblyLabel) was produced by the *** sequencing center.
 
 <H3>Sample position queries</H3>
 <P>
 A genome position can be specified by the accession number of a
 sequenced genomic region, an mRNA or EST, a chromosomal coordinate
 range, or keywords from the GenBank description of an mRNA. The
 following list shows examples of valid position queries for the
 $genome genome.  See the
 <A HREF="../goldenPath/help/hgTracksHelp.html"
     TARGET=_blank>User's Guide</A> for more information.
 <P>
 <TABLE border=0 CELLPADDING=0 CELLSPACING=0>
     <TR><TH ALIGN=LEFT>Request:</TH>
         <TH WIDTH=14>&nbsp;</TH>
        	<TH ALIGN=LEFT>Genome Browser Response:</TH></TR>
     <TR><TD VALIGN=Top COLSPAN=3>&nbsp;</TD></TR>
     <TR><TD VALIGN=Top>chrZ</TD>
         <TD WIDTH=14>&nbsp;</TD>
        	<TD VALIGN=Top>Displays all of chromosome Z</TD>
     </TR>
     <TR><TD VALIGN=Top>chr3:1-1000000</TD>
         <TD WIDTH=14>&nbsp;</TD>
        	<TD VALIGN=Top>Displays first million bases of chr 3</TD>
     </TR>
     <TR><TD VALIGN=Top NOWRAP>chr3:1000000+2000</TD>
        	<TD WIDTH=14></TD>
        	<TD VALIGN=Top>Displays a region of chr 3 that spans 2000 bases, starting with position 1000000 </TD>
     </TR>
     <TR><TD VALIGN=Top COLSPAN=3>&nbsp;<br></TD></TR>
     <TR><TD VALIGN=Top>BX950328</TD>
         <TD WIDTH=14>&nbsp;</TD>
        	<TD VALIGN=Top>Displays region of mRNA with GenBank accession
 		number BX950328</TD>
     </TR>
     <TR><TD VALIGN=Top>BU335953</TD>
         <TD WIDTH=14>&nbsp;</TD>
       	<TD VALIGN=Top>Displays region of EST with GenBank accession
 		BU335953 on chr Z</TD>
     </TR>
     <TR><TD VALIGN=Top COLSPAN=3>&nbsp;<br></TD></TR>
     <TR><TD VALIGN=Top>pseudogene mRNA</TD>
         <TD WIDTH=14>&nbsp;</TD>
        	<TD VALIGN=Top>Lists transcribed pseudogenes, but not cDNAs</TD>
     </TR>
     <TR><TD VALIGN=Top>homeobox caudal</TD>
         <TD WIDTH=14>&nbsp;</TD>
        	<TD VALIGN=Top>Lists mRNAs for caudal homeobox genes</TD>
     </TR>
     <TR><TD VALIGN=Top>zinc finger</TD>
         <TD WIDTH=14>&nbsp;</TD>
        	<TD VALIGN=Top>Lists many zinc finger mRNAs</TD>
     </TR>
     <TR><TD VALIGN=Top>kruppel zinc finger</TD>
         <TD WIDTH=14>&nbsp;</TD>
        	<TD VALIGN=Top>Lists only kruppel-like zinc fingers</TD>
     </TR>
     <TR><TD VALIGN=Top>zhang</TD>
         <TD WIDTH=14>&nbsp;</TD>
 	<TD VALIGN=Top>Lists mRNAs deposited by scientist named Zhang</TD>
     </TR>
     <TR><TD VALIGN=Top>Taguchi,T.</TD>
         <TD WIDTH=14>&nbsp;</TD>
        	<TD VALIGN=Top>Lists mRNAs deposited by co-author T. Taguchi</TD>
     </TR>
     <TR><TD VALIGN=Top COLSPAN=3>&nbsp;</TD></TR>
     <TR><TD COLSPAN="3"> Use this last format for author queries. Although
 	GenBank requires the search format <em>Taguchi T</em>, internally it
 	uses the format <em>Taguchi,T.</em>.</TD>
     </TR>
 </TABLE>
 
 <HR>
 <H3>Assembly details</H3>
 <P>
 The genome has been sequenced to ***X coverage.
 Approximately 88% of the sequence has been anchored to chromosomes, 
 which include autosomes 1-24, 26-28, and 32, and sex chromosomes W and Z. 
 (In contrast to mammals, the female chicken is heterogametic (ZW) and the 
 male is homogametic (ZZ).) 
 The remaining unanchored contigs that could be localized to a chromosome 
 have been concatenated into the virtual 
 chromosomes &quot;chr*_random&quot;, separated by gaps of 10,000 bp. 
 Unanchored contigs that could not be localized to a chromosome 
 have been concatenated into the virtual 
 chromosome &quot;chrUn&quot;, separated by gaps of 1,000 bp in order 
 to reduce the total size of chrUn. 
 The chicken 
 mitochondrial sequence is also available as the virtual chromosome 
 &quot;chrM&quot;. 
 </P>
 
 
 <P>
 Bulk downloads of the sequence and annotation data are available via 
 the Genome Browser <A HREF="ftp://hgdownload.cse.ucsc.edu/goldenPath/$db/">FTP
 server</A> or the <A HREF="http://hgdownload.cse.ucsc.edu/downloads.html#$anchorRoot">Downloads</A> page. 
 
   *** Developer -- remove this if no conditions for use:
 These data have
 <A HREF="../goldenPath/credits.html#${anchorRoot}_use">specific 
 conditions for use</A>. 
 
 The $genome browser annotation tracks were generated by UCSC and 
 collaborators worldwide. See the 
 <A HREF="/goldenPath/credits.html#${anchorRoot}_credits">Credits</A> 
 page for a detailed list of the organizations and individuals who contributed 
 to the success of this release. 
 </P>
 _EOF_
   ;
   close($fh);
 } # makeDescription
 
 sub makeLocalTrackDbRa {
   # Make an assembly-level trackDb.ra, gap.html and gold.html.
   my @localFiles = qw( trackDb.ra gap.html gold.html );
 
   my $fh = &HgAutomate::mustOpen(">$topDir/html/trackDb.ra");
   print $fh <<_EOF_
 # Local declaration so that local gold.html is picked up.
 track gold
 shortLabel Assembly
 longLabel Assembly from Fragments
 group map
 priority 10
 visibility hide
 color 150,100,30
 altColor 230,170,40
 type bed 3 +
 
 # Local declaration so that local gap.html is picked up.
 track gap
 shortLabel Gap
 longLabel Gap Locations
 priority 11
 group map
 visibility dense
 type bed 3 +
 
 _EOF_
   ;
   if (! $chromBased) {
     print $fh <<_EOF_
 # Unsplit mRNA track for scaffold-based assembly.
 track all_mrna
 shortLabel \$Organism mRNAs
 longLabel \$Organism mRNAs from GenBank
 group rna
 priority 54
 visibility pack
 spectrum on
 type psl .
 baseColorUseCds genbank
 baseColorUseSequence genbank
 baseColorDefault diffCodons
 showDiffBasesAllScales .
 indelDoubleInsert on
 indelQueryInsert on
 indelPolyA on
 
 # Unsplit EST track for scaffold-based assembly.
 track all_est
 shortLabel \$Organism ESTs
 longLabel \$Organism ESTs Including Unspliced
 group rna
 priority 57
 visibility hide
 spectrum on
 intronGap 30
 type psl est
 baseColorUseSequence genbank
 indelDoubleInsert on
 indelQueryInsert on
 maxItems 300
 
 _EOF_
     ;
   }
   close($fh);
 
   $fh = &HgAutomate::mustOpen(">$topDir/html/gap.html");
   if ($gotAgp) {
     print $fh <<_EOF_
 <H2>Description</H2>
 This track depicts gaps in the assembly.
 
   *** Developer: remove this statement if no future assemblies are expected:
 
 Many of these gaps &mdash; with the
 exception of intractable heterochromatic gaps &mdash; may be closed during the
 finishing process.
 
 <P>
 Gaps are represented as black boxes in this track.
 If the relative order and orientation of the contigs on either side
 of the gap is supported by read pair data, 
 it is a <em>bridged</em> gap and a white line is drawn 
 through the black box representing the gap. 
 </P>
 <P>This assembly contains the following principal types of gaps:
 <UL>
 
   *** Developer: make sure this part is accurate.  Here is a WUSTL template:
 
 <LI><B>Fragment</B> - gaps between the Whole Genome Shotgun contigs of a 
 supercontig.  (In this context, a contig is a set of overlapping sequence reads.  
 A supercontig is a set of contigs ordered and oriented during the 
 Whole Genome Shotgun process using paired-end reads.)
 These are represented by varying numbers of <em>N</em>s  in the assembly.  
 Fragment gap sizes are usually taken from read pair data.  
 <LI><B>Clone</B> - gaps between supercontigs linked by the fingerprint map.  
 In general, these are represented by 10,000 <em>N</em>s  in the assembly.
 <LI><B>Contig</B> - gaps between supercontigs not linked by the fingerprint 
 map, but instead by marker data.  (In this context, the &quot;Contig&quot; gap 
 type refers to a map contig, not a sequence contig.)  
 In general, these are represented by 10,000 <em>N</em>s  in the assembly for all 
 chromosomes except chrUn (concatenation of unplaced supercontigs), where 
 gaps of 1,000 <em>N</em>s  are used.  Gaps of other sizes were used when mRNA or 
 other data suggested possible but not confirmed links between supercontigs.
 <LI><B>Centromere</B> - gaps for centromeres were included when they could be 
 reasonably localized.  These are represented by 1,500,000 <em>N</em>s  in the 
 assembly for the macrochromosomes 1-10 and Z, and by 500,000 <em>N</em>s  for 
 all others (microchromosomes).  
 
   *** Developer: here is a Broad template:
 
 <LI><B>Fragment</B> - gaps between the Whole Genome Shotgun contigs of a 
 supercontig. These are represented by varying numbers of <em>N</em>s in the 
 assembly. In this context, a contig is a set of overlapping sequence reads and  
 a supercontig is a set of contigs ordered and oriented during the 
 Whole Genome Shotgun process using paired-end reads. Fragment gap sizes 
 are usually taken from read pair data.  
 </LI>
 <LI><B>Clone</B> - gaps between supercontigs linked by the physical map.  
 In general, these are represented by 1,000 <em>N</em>s  in the assembly.  
 
 *** Developer: look for this kind of gap placement:
 
 Clone gaps of 3,000,000 have been placed at the end of chrX and at the 
 beginning of all other chromosomes.  
 </LI>
 
 
 </UL></P>
 
 
 _EOF_
     ;
   } else {
     print $fh <<_EOF_
 <H2>Description</H2>
 This track depicts gaps in the assembly.
 
   *** Developer: remove this statement if no future assemblies are expected:
 
 Many of these gaps &mdash; with the
 exception of intractable heterochromatic gaps &mdash; may be closed during the
 finishing process.
 <P>
 Gaps are represented as black boxes in this track.
 If the relative order and orientation of the contigs on either side
 of the gap is known, it is a <em>bridged</em> gap and a white line is drawn 
 through the black box representing the gap.
 </P>
 <P>
 This assembly was sequenced with paired reads taken from
 
   *** Developer: check if this is accurate:
 
 plasmids and fosmids of various sizes.
 
 Overlapping reads were merged into contigs,
 and pairing information was then used to join the contigs into scaffolds.
 The gap sizes are estimated from the size of the 
 
 plasmids and fosmids,
 
 with a minimum gap size of $fakeAgpMinContigGap.
 </P>
 <P></P>
 <P>This assembly contains the following types of gaps:
 <UL>
 <LI><B>Contig</B> - gaps of size $fakeAgpMinContigGap to $fakeAgpMinScaffoldGap.
 <LI><B>Scaffold</B> - gaps greater than $fakeAgpMinScaffoldGap in size.
 </UL>
 </P>
 _EOF_
     ;
   }
   close($fh);
 
   $fh = &HgAutomate::mustOpen(">$topDir/html/gold.html");
   my $em = $commonName ? "" : "<em>";
   my $noEm = $commonName ? "" : "</em>";
   if ($gotAgp) {
     print $fh <<_EOF_
 <H2>Description</H2>
 <P>
 This track shows the draft assembly of the $em\$organism$noEm genome.  
 
   *** Developer: check if this is accurate:
 
 Whole-genome shotgun reads were assembled into contigs.  When possible, 
 contigs were grouped into scaffolds (also known as &quot;supercontigs&quot;).
 The order, orientation and gap sizes between contigs within a scaffold are
 based on paired-end read evidence. </P>
 <P>
 In dense mode, this track depicts the contigs that make up the 
 currently viewed scaffold. 
 Contig boundaries are distinguished by the use of alternating gold and brown 
 coloration. Where gaps
 exist between contigs, spaces are shown between the gold and brown
 blocks.  The relative order and orientation of the contigs
 within a scaffold is always known; therefore, a line is drawn in the graphical
 display to bridge the blocks.</P>
 <P>
 All components within this track are of fragment type &quot;W&quot;: 
 Whole Genome Shotgun contig. </P>
 _EOF_
     ;
   } else {
     print $fh <<_EOF_
 <H2>Description</H2>
 <P>
 This track shows the draft assembly of the $em\$organism$noEm genome.  
 
   *** Developer: check if this is accurate:
 
 Whole-genome shotgun reads were assembled into contigs.  When possible, 
 contigs were grouped into scaffolds (also known as &quot;supercontigs&quot;).
 The order, orientation and gap sizes between contigs within a scaffold are
 based on paired-end read evidence. </P>
 <P>
 Locations of contigs and scaffolds were deduced from runs of Ns in the
 assembled sequence.  A run of Ns of more than $fakeAgpMinScaffoldGap bases
 was assumed to be a gap between scaffolds, and a run of Ns between
 $fakeAgpMinContigGap and $fakeAgpMinScaffoldGap was assumed to be a gap
 between contigs.</P>
 <P>
 In dense mode, this track depicts the contigs that make up the 
 currently viewed scaffold. 
 Contig boundaries are distinguished by the use of alternating gold and brown 
 coloration. Where gaps
 exist between contigs, spaces are shown between the gold and brown
 blocks.  The relative order and orientation of the contigs
 within a scaffold is always known; therefore, a line is drawn in the graphical
 display to bridge the blocks.</P>
 <P>
 All components within this track are of fragment type &quot;W&quot;: 
 Whole Genome Shotgun contig. </P>
 _EOF_
     ;
   }
   close($fh);
 
   my $localFiles = "$topDir/html/{" . join(',', @localFiles) . '}';
   return $localFiles;
 } # makeLocalTrackDbRa
 
 sub makeTrackDb {
 # * step: trackDb [dbHost]
   my $runDir = "$topDir/TemporaryTrackDbCheckout";
   &HgAutomate::mustMkdir($runDir);
   &HgAutomate::mustMkdir("$topDir/html");
   &makeDescription();
   my $localFiles = &makeLocalTrackDbRa();
   my $whatItDoes =<<_EOF_
 It makes a local checkout of kent/src/hg/makeDb/trackDb/
 and makes *suggested* modifications.  Checking in modified files, and adding
 new files, is left up to the user in case this script has made some wrong
 guesses about proper names and locations.
 _EOF_
   ;
   my $bossScript = new HgRemoteScript("$scriptDir/makeTrackDb.csh", $dbHost,
 				      $runDir, $whatItDoes, $CONFIG);
 
   $bossScript->add(<<_EOF_
 # These directories are necessary for running make in trackDb:
 $HgAutomate::cvs -Q co -P \\
   kent/src/inc kent/src/hg/lib kent/src/hg/makeDb/trackDb
 
 cd kent/src/hg/makeDb/trackDb
 
 # Create the expected species-level directory for $db if necessary:
 mkdir -p $dbDbSpeciesDir/$db
 
 # Move local description files into place:
 mv $localFiles $dbDbSpeciesDir/$db/
 
 # Copy the template description.html for $db into place, and link to it
 # from $HgAutomate::gbdb/$db/html/ .
 if (! -e $dbDbSpeciesDir/$db/description.html) then
   cp -p $topDir/html/description.html $dbDbSpeciesDir/$db/description.html
 endif
 mkdir -p $HgAutomate::gbdb/$db/html
 rm -f $HgAutomate::gbdb/$db/html/description.html
 ln -s $topDir/html/description.html $HgAutomate::gbdb/$db/html/
 
 # Do a test run with the generated files:
 make update DBS=$db
 _EOF_
   );
 
   $bossScript->execute();
 
   $endNotes .=<<_EOF_
 
 Template trackDb.ra and .html's have been created, but they all need editing!
 
 cd $runDir/kent/src/hg/makeDb/trackDb/$dbDbSpeciesDir/$db
 
 Search for '***' notes in each file in and make corrections (sometimes the
 files used for a previous assembly might make a better template):
   description.html $localFiles
 
 Then cd ../.. (to trackDb/) and
  - edit makefile to add $db to DBS.
  - (if necessary) cvs add $dbDbSpeciesDir
  - cvs add $dbDbSpeciesDir/$db
  - cvs add $dbDbSpeciesDir/$db/*.{ra,html}
  - cvs ci -m "Added $db to DBS." makefile
  - cvs ci -m "Initial descriptions for $db." $dbDbSpeciesDir/$db
  - (if necessary) cvs ci $dbDbSpeciesDir
  - Run make update DBS=$db and make alpha when done.
  - (optional) Clean up $runDir
  - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.
 
 _EOF_
   ;
 } # makeTrackDb
 
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 ($CONFIG) = @ARGV;
 
 #*** Force -verbose until this is really ready to go:
 $opt_verbose = 3 if (! $opt_verbose);
 
 &parseConfig($CONFIG);
 
 $endNotes = "";
 $bedDir = "$topDir/$HgAutomate::trackBuild";
 $scriptDir = "$topDir/jkStuff";
 if (-e "$topDir/chrom.sizes") {
   $chromBased = (`wc -l < $topDir/chrom.sizes` < $HgAutomate::splitThreshold);
 }
 
 $stepper->execute();
 
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'trackDb') ? "" :
   "  (through the '$stopStep' step)";
 
 HgAutomate::verbose(1,
 	"\n *** All done!$upThrough\n");
 
 if ($endNotes) {
   #*** Should mail this to $ENV{'USER'} so it's not so easy to ignore.
   #*** Does mail work on all of our machines??  Even if it works on one,
   #*** we can ssh it.  Should be in an HgAutomate routine.
   HgAutomate::verbose(0,
 		      "\n\nNOTES -- STUFF THAT YOU WILL HAVE TO DO --\n\n" . 
 		      "$endNotes\n");
 }