94d9d0495ac3ff8446552a7ee48cf7bfd889be3d hiram Fri Jun 12 11:04:38 2020 -0700 catching up trackDb source tree files refs #25720 diff --git src/hg/utils/automation/makeGenomeDb.pl src/hg/utils/automation/makeGenomeDb.pl index 33378ac..f3bb47c 100755 --- src/hg/utils/automation/makeGenomeDb.pl +++ src/hg/utils/automation/makeGenomeDb.pl @@ -1,1510 +1,1512 @@ #!/usr/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/makeGenomeDb.pl instead. # $Id: makeGenomeDb.pl,v 1.30 2010/04/13 23:18:44 hiram Exp $ use Getopt::Long; use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; my $stepper = new HgStepManager( [ { name => 'seq', func => \&makeBuildDir }, { name => 'agp', func => \&checkAgp }, { name => 'db', func => \&makeDb }, { name => 'dbDb', func => \&makeDbDb }, { name => 'trackDb', func => \&makeTrackDb }, ] ); my $base = $0; $base =~ s/^(.*\/)?//; # Option defaults: my $dbHost = 'hgwdev'; my $maxMitoSize = 25000; ## Can be overridden in the config.ra file sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base config.ra options: "; print STDERR $stepper->getOptionHelp(); print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => '', 'fileServer' => ''); print STDERR <<_EOF_ -splitGoldGap split the gold/gap tables (default is not split) -noGoldGapSplit default behavior, this option no longer required -forceDescription construct a new description.html when -continue=trackDb _EOF_ ; print STDERR " Automates the first steps of building a genome database: seq: Translates fasta into /hive/data/genomes/\$db/\$db.unmasked.2bit . agp: Checks consistency of fasta and AGP (or runs hgFakeAgp). db: Creates the genome database and basic tables. dbDb: Adds the genome database to central database tables. trackDb: Creates a trackDb/ checkout and adds -- but does not check in -- template trackDb.ra and assembly description .html files. config.ra describes the species, assembly and downloaded files. To see detailed information about what should appear in config.ra, run \"$base -help\". "; # Detailed help (-help): print STDERR " ----------------------------------------------------------------------------- Required config.ra settings: db xxxYyyN - The UCSC database name and assembly identifier. xxx is the first three letters of the genus and Yyy is the first three letters of the species. N is the sequence number of this species' build at UCSC. For some species that we have been processing since before this convention, the pattern is xyN. scientificName Xxxxxx yyyyyy - The genus and species name. assemblyDate Mmm. YYYY - The month and year in which this assembly was released by the sequencing center. assemblyLabel XXXXXX - The detailed/long form of the sequencing center's label or version identifier for this release (e.g. 'Genome Center at Washington University, St. Louis, Genus_species 1.2.3'). assemblyShortLabel XXXXXX - The abbreviated form of the sequencing center's label or version identifier for this release (e.g. 'WUGSC 1.2.3'). photoCreditURL http://some.path/to/photo/credit.html - used to construct a courtesy URL reference to the source for the photograph on the gateway page photoCreditName string for photo credit - used to label the photoCreditURL under the picture on the gateway page ncbiGenomeId nnnnn - A numeric NCBI identifier for the genome information reference at https://www.ncbi.nlm.nih.gov/genome/nnnnn ncbiAssemblyId nnnnn - A numeric NCBI identifier for the assembly. To determine this, do an NCBI Assembly query at https://www.ncbi.nlm.nih.gov/assembly/, using the scientific name \"Xxxxxx yyyyyy\" and choose a project ID that match the assembly name. ncbiAssemblyName xxxxxrr - The assembly name used in the ftp path such as \"catChrV17e\" in ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Felis_catus/catChrV17e/ It is identical to the name returned from the NCBI Assembly query mention above. NOTE: this is NOT an NCBI identifier, this name was supplied by the assembly provider. ncbiBioProject nnnnn - The NCBI bioproject number to construct the URL: https://www.ncbi.nlm.nih.gov/bioproject/nnnnn ncbiBioSample nnnnn - A numeric NCBI identifier for the genome information reference at https://www.ncbi.nlm.nih.gov/biosample/nnnnn genBankAccessionID GCA_nnn - The NCBI assembly accession identification from the ASSEMBLY_INFO file in the downloads from the FTP site ftp.ncbi.nlm.nih.gov/genbank/genomes orderKey NN - A priority number (for the central database's dbDb.orderKey column) that will determine db's relative position in the assembly menu. mitoAcc XXXXXXX - A numeric Genbank identifier (\"gi number\") for the complete mitochondrial sequence of this species, or \"none\" to disable fetching and inclusion in the build. To determine this, do an Entrez Nucleotide query on \"Xxxxxx yyyyyy mitochondrion complete genome\" and choose a result that looks best (there may be multiple complete sequences in Genbank, or there may be none in which case just say \"none\"). fastaFiles /path/to/downloaded.fa - A complete path, possibly with wildcard characters, to FASTA sequence files which have already been downloaded from the sequencing center. dbDbSpeciesDir dirName - The name of the subdirectory of trackDb/ in which the \$db subdirectory will be added. For vertebrates, this is often a lower-cased common name, but various patterns have been used especially for non-vertebrates. taxId ncbiTaxId - The NCBI numeric taxonomy id. If a pseudo-genome, like a reconstruction is being built, specify 0. ----------------------------------------------------------------------------- Conditionally required config.ra settings: 1. Required when AGP is not included in the assembly. In this case we run hgFakeAgp to deduce gap annotations from runs of Ns in the sequence. The faGapSizes program shows a histogram that emphasizes overrepresented round-number sizes, which are decent hints for these parameters. fakeAgpMinContigGap NN - hgFakeAgp -minContigGap param. Minimum size for a run of Ns that will be considered a bridged gap with type \"contig\". fakeAgpMinScaffoldGap NN - hgFakeAgp -minScaffoldGap param. Minimum size for a run of Ns that will be considered an unbridged gap with type \"scaffold\". 2. Required when the central database table genomeClade does not yet have a row for this species: clade cccccc - UCSC's \"clade\" category for this species, as specified in the central database tables clade and genomeClade. genomeCladePriority NN - Relative priority of this species compared to others in the same clade. This central database query can be helpful in picking a value: select * from genomeClade where clade = '\$clade' order by priority; ----------------------------------------------------------------------------- Optional config.ra settings: commonName Xxxx - Common name to use as the menu label for this species (and in central database's genome column) instead of abbreviated scientific name. agpFiles /path/to/downloaded.agp - A complete path, possibly with wildcard characters, to AGP files which have already been downloaded from the sequencing center. qualFiles [/path/to/downloaded.qual | /path/to/qacAgpLift-ed.qac] - A complete path, possibly with wildcard characters, to quality score files which have already been downloaded from the sequencing center, or a complete path to a single .qac file (in case you need to pre-process qual files with qaToQac | qacAgpLift, for example). mitoSize N - to override the internal default of max size for mitochondrial sequence of $maxMitoSize e.g. for yeast: mitoSize 90000 subsetLittleIds Y - ok if agp little ids (col6) are a subset of fasta sequences rather than requiring an exact match doNotCheckDuplicates Y - do not stop build if duplicate sequences are found in genome " if ($detailed); print STDERR "\n"; exit $status; } # usage # Globals: # Command line argument: my $CONFIG; # Command line option vars: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; # specific command line options use vars qw/ $opt_splitGoldGap $opt_noGoldGapSplit $opt_forceDescription /; # Required config parameters: my ($db, $scientificName, $assemblyDate, $assemblyLabel, $assemblyShortLabel, $orderKey, $photoCreditURL, $photoCreditName, $ncbiGenomeId, $providerAssemblyName, $ncbiAssemblyId, $ncbiBioProject, $ncbiBioSample, $genBankAccessionID, $mitoAcc, $fastaFiles, $dbDbSpeciesDir, $taxId); # Conditionally required config parameters: my ($fakeAgpMinContigGap, $fakeAgpMinScaffoldGap, $clade, $genomeCladePriority); # Optional config parameters: my ($commonName, $agpFiles, $doNotCheckDuplicates, $qualFiles, $mitoSize, $subsetLittleIds); # Other globals: my ($gotMito, $gotAgp, $gotQual, $topDir, $chromBased, $forceDescription); my ($bedDir, $scriptDir, $endNotes); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, @HgAutomate::commonOptionSpec, "splitGoldGap", "noGoldGapSplit", "forceDescription", ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if (defined $opt_dbHost); $forceDescription = 0; $forceDescription = 1 if (defined $opt_forceDescription) } # checkOptions sub requireVar { # Ensure that var is in %config and return its value. # Remove it from %config so we can check for unrecognized contents. my ($var, $config) = @_; my $val = $config->{$var} || die "Error: $CONFIG is missing required variable \"$var\".\n" . "For a detailed list of required variables, run \"$base -help\".\n"; delete $config->{$var}; return $val; } # requireVar sub optionalVar { # If var has a value in %config, return it. # Remove it from %config so we can check for unrecognized contents. my ($var, $config) = @_; my $val = $config->{$var}; delete $config->{$var} if ($val); return $val; } # optionalVar sub parseConfig { # Parse config.ra file, make sure it contains the required variables. my ($configFile) = @_; my %config = (); my $fh = &HgAutomate::mustOpen($configFile); while (<$fh>) { next if (/^\s*#/ || /^\s*$/); if (/^\s*(\w+)\s*(.*)$/) { my ($var, $val) = ($1, $2); if (! exists $config{$var}) { $config{$var} = $val; } else { die "Duplicate definition for $var line $. of config file $configFile.\n"; } } else { die "Can't parse line $. of config file $configFile:\n$_\n"; } } close($fh); # Required variables. $db = &requireVar('db', \%config); $scientificName = &requireVar('scientificName', \%config); $assemblyDate = &requireVar('assemblyDate', \%config); $assemblyLabel = &requireVar('assemblyLabel', \%config); $assemblyShortLabel = &requireVar('assemblyShortLabel', \%config); $orderKey = &requireVar('orderKey', \%config); $mitoAcc = &requireVar('mitoAcc', \%config); $fastaFiles = &requireVar('fastaFiles', \%config); $dbDbSpeciesDir = &requireVar('dbDbSpeciesDir', \%config); $taxId = &requireVar('taxId', \%config); $photoCreditURL = &requireVar('photoCreditURL', \%config); $photoCreditName = &requireVar('photoCreditName', \%config); $ncbiGenomeId = &requireVar('ncbiGenomeId', \%config); $providerAssemblyName = &requireVar('ncbiAssemblyName', \%config); $ncbiAssemblyId = &requireVar('ncbiAssemblyId', \%config); $ncbiBioProject = &requireVar('ncbiBioProject', \%config); $ncbiBioSample = &requireVar('ncbiBioSample', \%config); $genBankAccessionID = &requireVar('genBankAccessionID', \%config); # Conditionally required variables -- optional here, but they might be # required later on in some cases. $fakeAgpMinContigGap = &optionalVar('fakeAgpMinContigGap', \%config); $fakeAgpMinScaffoldGap = &optionalVar('fakeAgpMinScaffoldGap', \%config); $clade = &optionalVar('clade', \%config); $genomeCladePriority = &optionalVar('genomeCladePriority', \%config); # Optional variables. $commonName = &optionalVar('commonName', \%config); $commonName =~ s/^(\w)(.*)/\u$1\L$2/; # Capitalize only the first word $agpFiles = &optionalVar('agpFiles', \%config); $doNotCheckDuplicates = &optionalVar('doNotCheckDuplicates', \%config); $qualFiles = &optionalVar('qualFiles', \%config); $mitoSize = &optionalVar('mitoSize', \%config); $subsetLittleIds = &optionalVar('subsetLittleIds', \%config); # Make sure no unrecognized variables were given. my @stragglers = sort keys %config; if (scalar(@stragglers) > 0) { die "Error: config file $CONFIG has unrecognized variables:\n" . " " . join(", ", @stragglers) . "\n" . "For a detailed list of supported variables, run \"$base -help\".\n"; } $gotMito = ($mitoAcc ne 'none'); $gotAgp = (defined $agpFiles); $gotQual = (defined $qualFiles); $topDir = "/cluster/data/$db"; } # parseConfig ######################################################################### # * step: seq [workhorse] sub mkClusterDataLink { # Make a /cluster/data/$tDb/ -> /cluster/store?/$tDb/ if it doesn't exist if (! -e "$topDir") { my $clusterStore = &HgAutomate::choosePermanentStorage(); &HgAutomate::mustMkdir("$clusterStore/$db"); # Don't use HgAutomate::run because this needs to happen despite -debug: (system("ln -s $clusterStore/$db $topDir") == 0) || die "Couldn't ln -s $clusterStore/$db $topDir\n"; } } # mkClusterDataLink sub getMito { # Get the mitochondrion from Genbank (if specified) and rename it chrM if ($gotMito) { my $whatItDoes = "It fetches mitochondrial sequence from GenBank and renames it chrM."; # Use $dbHost because this needs to wget and some of our workhorses # can't do that, and this is computationally cheap. my $bossScript = new HgRemoteScript("$scriptDir/getMito.csh", $dbHost, $topDir, $whatItDoes, $CONFIG); if ($mitoSize) { $maxMitoSize = $mitoSize; } my $mitoFile = "$topDir/M/$mitoAcc.fa"; $bossScript->add(<<_EOF_ mkdir M wget -O $mitoFile \\ 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc' # old url 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=$mitoAcc&retmode=text' # Make sure there's exactly one fasta record: if (`grep '^>' $mitoFile | wc -l` != 1) then echo "getMito: $mitoFile" echo " does not have exactly one fasta record" exit 1 endif # Make sure what we got is of about the right size: set mSize = `faSize $mitoFile | grep bases | awk '{print \$1;}'` if (\$mSize < 10000 || \$mSize > $maxMitoSize) then echo "getMito: $mitoFile" echo " fasta size \$mSize is out of expected range [10000, $maxMitoSize]" exit 1 endif # Make sure the fasta header looks plausible: set keyCount = 0 set header = `grep '^>' $mitoFile` foreach keyword ('$scientificName' mitochondr complete genome) set count = `echo \$header | grep -i "\$keyword" | wc -l` set keyCount = `expr \$keyCount + \$count` end if (\$keyCount < 2) then echo "getMito: $mitoFile" echo " fasta header does not have very many expected keywords" echo " (looking for '$scientificName' mitochondr complete genome)" echo " Here is the header:" echo \$header exit 1 endif # Make chrM.fa with >chrM: sed -e 's/^>.*/>chrM/' $mitoFile > $topDir/M/chrM.fa rm $mitoFile _EOF_ ); $bossScript->execute(); } } # getMito sub makeUnmasked2bit { my ($workhorse) = @_; my $whatItDoes = "It installs assembly fasta (and agp, if given) files in the usual dirs."; my $bossScript = new HgRemoteScript("$scriptDir/makeUnmasked2bit.csh", $workhorse, $topDir, $whatItDoes, $CONFIG); my $chrM = $gotMito ? "$topDir/M/chrM.fa" : ""; # Forget the fasta, just make an unmasked 2bit for now! # If we really need a fasta hierarchy, we can make it later, after # RepeatMasking!! The whole point of the thing is to provide a # structure for the RM run and subsequent masking of the sequence. # But the RM+masking task can structure itself -- it can simply make # masked .2bit, and then subsequent jobs can do their own split and # distribute if necessary (or better yet just use 2bit specs). # possibilities for $fastaFiles: # {.fa, .fa.gz} # {many single-fasta, one multi-fasta, several multi-fasta} # If AGP is given: # 1. if first column of AGP matches sequence names, use as-is. # 2. if sixth column of AGP matches seqnames, assemble with agpToFa. # installation options: # 1. chrom-based (numSeqs <= 100): directories containing single-fasta, # per-sequence files; randoms lumped into same directories with reals. # 2. scaffold-based: scaffolds.fa # Traditionally we have split to 5M in prep for RM -- but let the RM take # care of its own splitting (better yet, make it use 2bit specs). my $acat = "cat"; my $fcat = "cat"; my $sli = ""; if (defined $subsetLittleIds && $subsetLittleIds eq "Y") { $sli = "-1 "; } foreach my $file (`ls $fastaFiles 2> /dev/null`) { if ($file =~ m/\.gz$/) { $fcat = "zcat"; last; } } foreach my $file (`ls $agpFiles 2> /dev/null`) { if ($file =~ m/\.gz$/) { $acat = "zcat"; last; } } if ($gotAgp) { $bossScript->add(<<_EOF_ # Get sorted IDs from fasta sequence files: set fastaIds = `mktemp -p /tmp makeGenomeDb.fastaIds.XXXXXX` $fcat $fastaFiles | grep '^>' | sed -e 's/^>.*gb\|/>/; s/\|.*//' | perl -wpe 's/^>(\\S+).*/\$1/' | sort \\ > \$fastaIds # Get sorted "big" (1st column) and "little" (6th column) IDs from AGP files: set agpBigIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX` $acat $agpFiles | grep -v '^#' | awk '{print \$1;}' | sort -u \\ > \$agpBigIds set agpLittleIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX` $acat $agpFiles | grep -v '^#' | awk '((\$5 != "N") && (\$5 != "U")) {print \$6;}' | sort -u \\ > \$agpLittleIds # Compare fasta IDs to first and sixth columns of AGP: set diffBigCount = `comm -3 \$fastaIds \$agpBigIds | wc -l` set diffLittleCount = `comm $sli-3 \$fastaIds \$agpLittleIds | wc -l` # If AGP "big" IDs match sequence IDs, use sequence as-is. # If AGP "little" IDs match sequence IDs, or are a subset, assemble sequence with agpToFa. set bigGenome = "" # big genomes are over 4Gb: 4*1024*1024*1024 = 4294967296 # requires -long argument to faToTwoBit if (\$diffLittleCount == 0) then set agpTmp = `mktemp -p /tmp makeGenomeDb.agp.XXXXXX` $acat $agpFiles | grep -v '^#' > \$agpTmp set genomeSize = `$fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' | agpToFa -simpleMultiMixed \$agpTmp all stdout stdin | faSize stdin | grep -w bases | awk '{print \$1}'` if ( \$genomeSize > 4294967295 ) then set bigGenome = "-long" endif $fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' \\ | agpToFa -simpleMultiMixed \$agpTmp all stdout stdin \\ | faToTwoBit \$bigGenome -noMask stdin $chrM $db.unmasked.2bit rm -f \$agpTmp else if (\$diffBigCount == 0) then set genomeSize = `$fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' | faSize stdin | grep -w bases | awk '{print \$1}'` if ( \$genomeSize > 4294967295 ) then set bigGenome = "-long" endif $fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' \\ | faToTwoBit \$bigGenome -noMask stdin $chrM $db.unmasked.2bit else echo "Error: IDs in fastaFiles ($fastaFiles)" echo "do not perfectly match IDs in either the first or sixth columns of" echo "agpFiles ($agpFiles)" echo "Please examine and then remove these temporary files:" echo " \$fastaIds -- fasta IDs" echo " \$agpBigIds -- AGP first column IDs ('big' sequences)" echo " \$agpLittleIds -- AGP sixth column IDs ('little' sequences)" exit 1 endif rm -f \$fastaIds \$agpBigIds \$agpLittleIds _EOF_ ); } else { # No AGP -- just make an unmasked 2bit. $bossScript->add(<<_EOF_ set bigGenome = "" # big genomes are over 4Gb: 4*1024*1024*1024 = 4294967296 # requires -long argument to faToTwoBit set genomeSize = `$fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' | faSize stdin | grep -w bases | awk '{print \$1}'` if ( \$genomeSize > 4294967295 ) then set bigGenome = "-long" endif $fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' | \\ faToTwoBit \$bigGenome -noMask stdin $chrM $db.unmasked.2bit _EOF_ ); } # Having made the unmasked .2bit, make chrom.sizes and chromInfo.tab: # verify no dots allowed in chrom names if (! defined $doNotCheckDuplicates || ($doNotCheckDuplicates eq "N")) { $bossScript->add(<<_EOF_ twoBitDup $db.unmasked.2bit > jkStuff/twoBitDup.txt if (`wc -l < jkStuff/twoBitDup.txt` > 0) then echo "ERROR: duplicate sequence found in $db.unmasked.2bit" exit 1 endif _EOF_ ); } $bossScript->add(<<_EOF_ twoBitInfo $db.unmasked.2bit stdout | sort -k2nr > chrom.sizes # if no dots in chrom names, should have only one kind of field size: set noDots = `cut -f 1 chrom.sizes | awk -F'.' '{print NF}' | sort -u | wc -l` if ( \$noDots != 1 ) then echo "ERROR: no dots allowed in chrom names ! e.g.:" grep "\." chrom.sizes | head -3 exit 1 endif # only one kind of field size and it must be simply a one: set dotCount = `cut -f 1 chrom.sizes | awk -F'.' '{print NF}' | sort -u` if ( \$dotCount != 1 ) then echo "ERROR: no dots allowed in chrom names ! e.g.:" grep "\." chrom.sizes | head -3 exit 1 endif rm -rf $HgAutomate::trackBuild/chromInfo mkdir $HgAutomate::trackBuild/chromInfo awk '{print \$1 "\t" \$2 "\t$HgAutomate::gbdb/$db/$db.2bit";}' chrom.sizes \\ > $HgAutomate::trackBuild/chromInfo/chromInfo.tab _EOF_ ); if ($gotAgp) { my $splitThreshold = $HgAutomate::splitThreshold; $bossScript->add(<<_EOF_ if (`wc -l < chrom.sizes` < $splitThreshold) then # Install per-chrom .agp files for download. $acat $agpFiles | grep -v '^#' \\ | splitFileByColumn -col=1 -ending=.agp stdin $topDir -chromDirs endif _EOF_ ); } $bossScript->execute(); # Now that we have created chrom.sizes (unless we're in -debug mode), # re-evaluate $chromBased for subsequent steps: if (-e "$topDir/chrom.sizes") { $chromBased = (`wc -l < $topDir/chrom.sizes` < $HgAutomate::splitThreshold); } } # makeUnmasked2bit sub makeBuildDir { # * step: seq [workhorse] &mkClusterDataLink(); &HgAutomate::checkCleanSlate('seq', 'agp', "$topDir/M", "$topDir/chrom.sizes"); &HgAutomate::mustMkdir($scriptDir); &HgAutomate::mustMkdir($bedDir); my $workhorse = &HgAutomate::chooseWorkhorse(); &getMito(); &makeUnmasked2bit($workhorse); } # makeBuildDir ######################################################################### # * step: agp [workhorse] sub requireFakeAgpParams { # When assembly does not include AGP, run hgFakeAgp and require the # developer to specify its parameters. If not specified, run faGapSizes # to give some hints. my $problem = 0; if (! defined $fakeAgpMinContigGap) { warn "Error: $CONFIG is missing required variable " . "\"fakeAgpMinContigGap\".\n"; $problem = 1; } if (! defined $fakeAgpMinScaffoldGap) { warn "Error: $CONFIG is missing required variable " . "\"fakeAgpMinScaffoldGap\".\n"; $problem = 1; } if ($problem) { warn "\nWhen the assembly does not include AGP, $CONFIG must specify " . "fakeAgpMinContigGap and fakeAgpMinScaffoldGap.\n"; warn "Printing a gap-size histogram from faGapSizes to stdout... " . "Check for overrepresented round numbers.\n"; warn "See usage messages of hgFakeAgp and faGapSizes for more hints.\n\n"; my $fileServer = &HgAutomate::chooseFileServer($topDir); print "\n"; &HgAutomate::run("$HgAutomate::runSSH $fileServer nice " . "twoBitToFa $topDir/$db.unmasked.2bit stdout " . "\\| faGapSizes stdin -niceSizes=" . "10,20,25,50,100,1000,2000,5000,10000,20000,50000"); print "\n"; exit 1; } } sub checkAgp { my $workhorse = &HgAutomate::chooseWorkhorse(); # Check or generate AGP, depending on whether it was provided: if ($gotAgp) { &HgAutomate::checkCleanSlate('agp', 'db', "$topDir/$db.agp"); my $whatItDoes = "It checks consistency of AGP and fasta."; my $bossScript = new HgRemoteScript("$scriptDir/checkAgpAndFa.csh", $workhorse, $topDir, $whatItDoes, $CONFIG); my $allAgp = "$topDir/$db.agp"; # If we added chrM from GenBank, exclude it from fasta: my $exclude = ($mitoAcc eq 'none') ? "" : "-exclude=chrM"; my $acat = "cat"; foreach my $file (`ls $agpFiles 2> /dev/null`) { if ($file =~ m/\.gz$/) { $acat = "zcat"; last; } } $bossScript->add(<<_EOF_ # When per-chrom AGP and fasta files are given, it would be much more # efficient to run this one chrom at a time. However, since the filenames # are arbitrary, I'm not sure how to identify the pairings of AGP and fa # files. So cat 'em all together and check everything at once: $acat $agpFiles | grep -v '^#' | sort -k1,1 -k2n,2n > $allAgp set result = `checkAgpAndFa $exclude $allAgp $db.unmasked.2bit | tail -1` if ("\$result" != 'All AGP and FASTA entries agree - both files are valid') then echo "Error: checkAgpAndFa failed\\!" echo "Last line of output: \$result" exit 1 endif _EOF_ ); $bossScript->execute(); } else { my $runDir = "$bedDir/hgFakeAgp"; &HgAutomate::mustMkdir($runDir); &HgAutomate::checkCleanSlate('agp', 'db', "$runDir/$db.agp"); &requireFakeAgpParams(); my $whatItDoes = "It runs hgFakeAgp (because no AGP was provided)."; my $bossScript = new HgRemoteScript("$runDir/doFakeAgp.csh", $workhorse, $runDir, $whatItDoes, $CONFIG); $bossScript->add(<<_EOF_ twoBitToFa $topDir/$db.unmasked.2bit stdout \\ | hgFakeAgp stdin $db.agp \\ -minContigGap=$fakeAgpMinContigGap -minScaffoldGap=$fakeAgpMinScaffoldGap _EOF_ ); $bossScript->execute(); } #*** Produce a gap report so we can say something sensible in gap.html. } # checkAgp ######################################################################### # * step: db [dbHost] sub makeDb { # Create a database on hgwdev, grp, chromInfo, gold/gap. my $qual = (defined $qualFiles) ? " qual" : ""; my $whatItDoes = "It creates the genome database ($db) and loads the most basic tables: chromInfo, grp, gap, gold,$qual and gc5Base."; my $bossScript = new HgRemoteScript("$scriptDir/makeDb.csh", $dbHost, $topDir, $whatItDoes, $CONFIG); # Actually, build some basic track files on $workhorse, then load. $qual = (defined $qualFiles) ? " qual and" : ""; $whatItDoes = "It generates$qual gc5Base track files for loading."; my $workhorse = &HgAutomate::chooseWorkhorse(); my $horseScript = new HgRemoteScript("$scriptDir/makeTrackFiles.csh", $workhorse, $topDir, $whatItDoes, $CONFIG); # Build quality wiggle track files (if provided). if (defined $qualFiles) { $horseScript->add(<<_EOF_ # Translate qual files to wiggle encoding. If there is a problem with # sequence names, you may need to tweak the original qual sequence names # and/or lift using qacAgpLft. mkdir -p $bedDir/qual cd $bedDir/qual _EOF_ ); if ($qualFiles =~ /^\S+\.qac$/) { # Single .qac file: $horseScript->add(<<_EOF_ qacToWig -fixed $qualFiles stdout | gzip -c > $db.qual.wigVarStep.gz _EOF_ ); } else { # Possible wildcard of qual file(s): $horseScript->add(<<_EOF_ if (`ls $qualFiles | grep \.gz | wc -l`) then set qcat = zcat else set qcat = cat endif \$qcat $qualFiles \\ | qaToQac stdin stdout \\ | qacToWig -fixed stdin stdout | gzip -c > $db.qual.wigVarStep.gz _EOF_ ); } $horseScript->add(<<_EOF_ wigToBigWig $db.qual.wigVarStep.gz ../../chrom.sizes $db.quality.bw _EOF_ ); } # Build gc5Base files. $horseScript->add(<<_EOF_ # Make gc5Base wiggle files. mkdir -p $bedDir/gc5Base cd $bedDir/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 $db \\ $topDir/$db.unmasked.2bit | gzip -c > $db.gc5Base.wigVarStep.gz wigToBigWig $db.gc5Base.wigVarStep.gz ../../chrom.sizes $db.gc5Base.bw _EOF_ ); # Now start the database creation and loading commands. $bossScript->add (<<_EOF_ hgsql '' -e 'create database $db' df -h /var/lib/mysql hgsql $db < \${HOME}/kent/src/hg/lib/grp.sql cut -f1 $HgAutomate::trackBuild/chromInfo/chromInfo.tab | awk '{if (length(\$0)>32) exit(1);}' || echo Attention annotator: Your chromosome names are longer than 32 character. This will crash this script and lead to error messages by featureBits and everything that uses hdb.c cut -f1 $HgAutomate::trackBuild/chromInfo/chromInfo.tab | awk '{print length(\$0)}' | sort -nr > $HgAutomate::trackBuild/chromInfo/t.chrSize set chrSize = `head -1 $HgAutomate::trackBuild/chromInfo/t.chrSize` sed -e "s/chrom(16)/chrom(\$chrSize)/" \${HOME}/kent/src/hg/lib/chromInfo.sql > $HgAutomate::trackBuild/chromInfo/chromInfo.sql rm -f $HgAutomate::trackBuild/chromInfo/t.chrSize hgLoadSqlTab $db chromInfo $HgAutomate::trackBuild/chromInfo/chromInfo.sql \\ $HgAutomate::trackBuild/chromInfo/chromInfo.tab _EOF_ ); my $allAgp = "$topDir/$db.agp"; $allAgp = "$bedDir/hgFakeAgp/$db.agp" if (! $gotAgp); if ($chromBased) { if ($opt_splitGoldGap) { $bossScript->add(<<_EOF_ # Split AGP into per-chrom files/dirs so we can load split gold and gap tables. cp /dev/null chrom.lst.tmp foreach chr (`awk '{print \$1;}' chrom.sizes`) set c = `echo \$chr | sed -e 's/^chr//; s/_random\$//;'` mkdir -p \$c echo \$c >> chrom.lst.tmp awk '\$1 == "'\$chr'" {print;}' $allAgp > \$c/\$chr.agp end sort -u chrom.lst.tmp > chrom.lst rm chrom.lst.tmp _EOF_ ); $bossScript->add("hgGoldGapGl -noGl -chromLst=chrom.lst $db $topDir .\n"); } else { $bossScript->add("hgGoldGapGl -noGl $db $allAgp\n"); } } else { $bossScript->add("hgGoldGapGl -noGl $db $allAgp\n"); } if ($gotAgp && $gotMito) { # When we load gold/gap from assembly AGP, but pull in chrM sequence # separately, chrM is conspicuously absent from gold/gap -- so add a fake # entry for it in gold (so featureBits gold --> 100%) and if split tables, # make a token chrM_gap table. Use bin=585 (512+ 64 + 8 + 1), the # smallest bin that starts at 0. The smallest bin is 128k bases, which # should always cover the entire mitochondrial genome (typically ~16k). my $bin = 585; my $mitoGold = ($mitoAcc =~ /^\d+$/) ? "gi|$mitoAcc" : $mitoAcc; if ($opt_splitGoldGap && ($chromBased || $opt_debug)) { my $defaultChrom = `head -1 $topDir/chrom.sizes | cut -f 1`; chomp $defaultChrom; $bossScript->add(<<_EOF_ # Add fake chrM_{gap,gold} to make featureBits happy. set mSize = `faSize $topDir/M/chrM.fa | grep bases | awk '{print \$1;}'` hgsql $db -e 'drop table chrM_gap,chrM_gold;' hgsql $db -e \\ 'create table chrM_gap select * from ${defaultChrom}_gap where 0; \\ create table chrM_gold select * from ${defaultChrom}_gold where 0; \\ insert into chrM_gold \\ values($bin,"chrM",0,'\$mSize',1,"F","$mitoGold",0,'\$mSize',"+");' _EOF_ ); } else { $bossScript->add(<<_EOF_ # Add fake chrM entry in gold table to make featureBits happy. set mSize = `faSize $topDir/M/chrM.fa | grep bases | awk '{print \$1;}'` hgsql $db -e \\ 'insert into gold \\ values($bin,"chrM",0,'\$mSize',1,"F","$mitoGold",0,'\$mSize',"+");' _EOF_ ); } # may as well finally add the chrM entry to the agp file $bossScript->add(<<_EOF_ set lastId = `tail -1 $topDir/$db.agp | awk '{print \$4+1}'` /bin/echo -e "chrM\t1\t\$mSize\t\$lastId\tF\t$mitoGold\t1\t\$mSize\t+" >> $topDir/$db.agp _EOF_ ); } $bossScript->add(<<_EOF_ # verify gold and gap tables cover everything featureBits -or -countGaps $db gold gap >&fb.$db.gold.gap.txt cat fb.$db.gold.gap.txt set allCovered = `awk '{print \$4-\$1}' fb.$db.gold.gap.txt` if (\$allCovered != 0) then echo "ERROR: gold and gap tables do not cover whole genome" exit 255 endif _EOF_ ); $bossScript->add(<<_EOF_ # Load gc5base mkdir -p $HgAutomate::gbdb/$db/bbi/gc5BaseBw rm -f $HgAutomate::gbdb/$db/bbi/gc5BaseBw/gc5Base.bw ln -s $bedDir/gc5Base/$db.gc5Base.bw $HgAutomate::gbdb/$db/bbi/gc5BaseBw/gc5Base.bw hgsql $db -e 'drop table if exists gc5BaseBw; \\ create table gc5BaseBw (fileName varchar(255) not null); \\ insert into gc5BaseBw values ("$HgAutomate::gbdb/$db/bbi/gc5BaseBw/gc5Base.bw");' _EOF_ ); if (defined $qualFiles) { $bossScript->add(<<_EOF_ # Load qual mkdir -p $HgAutomate::gbdb/$db/bbi/qualityBw rm -f $HgAutomate::gbdb/$db/bbi/qualityBw/quality.bw ln -s $bedDir/qual/$db.quality.bw $HgAutomate::gbdb/$db/bbi/qualityBw/quality.bw hgsql $db -e 'drop table if exists qualityBw; \\ create table qualityBw (fileName varchar(255) not null); \\ insert into qualityBw values ("$HgAutomate::gbdb/$db/bbi/qualityBw/quality.bw");' _EOF_ ); } $horseScript->execute(); $bossScript->execute(); } # makeDb ######################################################################### # * step: dbDb [dbHost] sub requireCladeAndPriority { # If the genomeClade table in the central database does not have a row # for this $db's genome, require user to specify clade and priority. my ($genome) = @_; my $problem = 0; if (! defined $clade) { warn "Error: $CONFIG is missing required variable " . "\"clade\".\n"; $problem = 1; } if (! defined $genomeCladePriority) { warn "Error: $CONFIG is missing required variable " . "\"genomeCladePriority\".\n"; $problem = 1; } if ($problem) { warn "\nCentral database table genomeClade does not have a row for " . "${db}'s genome \"$genome\",\n"; warn "so $CONFIG must specify clade and genomeCladePriority.\n"; warn "Examine the contents of the genomeClade table to get an idea " . "where \"$genome\" fits in.\n\n"; exit 1; } } sub getGenome { # Return what we'll use in the genome (and organism) column of dbDb. # This is the label that appears in the gateway menu. my $genome; if ($commonName) { $genome = $commonName; } else { $genome = $scientificName; $genome =~ s/^(\w)\w+\s+(\w+)$/$1. $2/; } return $genome; } sub makeDbDb { # * step: dbDb [dbHost... or just direct to hgcentraltest??] # - create hgcentraltest entry, (if necessary) defaultDb and genomeClade my $genome = &getGenome(); my $defaultPos; my ($seq, $size) = $opt_debug ? ("chr1", 1000) : split(/\s/, `head -1 "$topDir/chrom.sizes"`); my $start = int($size / 2); $size = ($start + 9999) if ($size > ($start + 9999)); $defaultPos = "$seq:$start-$size"; my $dbDbInsert = "$topDir/dbDbInsert.sql"; my $fh = &HgAutomate::mustOpen(">$dbDbInsert"); print $fh <<_EOF_ DELETE from dbDb where name = "$db"; INSERT INTO dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, htmlPath, hgNearOk, hgPbOk, sourceName, taxId) VALUES ("$db", "$assemblyDate ($assemblyShortLabel/$db)", "$HgAutomate::gbdb/$db", "$genome", "$defaultPos", 1, $orderKey, "$genome", "$scientificName", "$HgAutomate::gbdb/$db/html/description.html", 0, 0, "$assemblyLabel", $taxId); _EOF_ ; close($fh); my $centDbSql = "$HgAutomate::runSSH $dbHost $HgAutomate::centralDbSql"; &HgAutomate::run("$centDbSql < $dbDbInsert"); # Add a row to defaultDb if this is the first usage of $genome. my $quotedGenome = $genome; $quotedGenome =~ s/'/'"'"'/g; my $sql = "'select count(*) from defaultDb where genome = \"$quotedGenome\"'"; if (`echo $sql | $centDbSql` == 0) { $sql = "'INSERT INTO defaultDb (genome, name) " . "VALUES (\"$quotedGenome\", \"$db\")'"; &HgAutomate::run("echo $sql | $centDbSql"); } # If $genome does not already appear in genomeClade, warn user that # they will have to manually add it. $sql = "'select count(*) from genomeClade where genome = \"$quotedGenome\"'"; if (`echo $sql | $centDbSql` == 0) { &requireCladeAndPriority($genome); $sql = "'INSERT INTO genomeClade (genome, clade, priority) " . "VALUES (\"$quotedGenome\", \"$clade\", $genomeCladePriority)'"; &HgAutomate::run("echo $sql | $centDbSql"); } } # makeDbDb ######################################################################### # * step: trackDb [dbHost] sub makeDescription { # Make a template description.html (for the browser gateway page). my $sciUnderscore = $scientificName; $sciUnderscore =~ s/ /_/g; my $genome = &getGenome(); my $anchorRoot = lc($genome); $anchorRoot =~ s/\. /_/; $anchorRoot =~ s/ /_/; my $imgExtn = "jpg"; my $img = "/usr/local/apache/htdocs/images/$sciUnderscore.$imgExtn"; if ( ! -s $img ) { $imgExtn = "png"; $img = "/usr/local/apache/htdocs/images/$sciUnderscore.$imgExtn"; if ( ! -s $img ) { $imgExtn = "gif"; $img = "/usr/local/apache/htdocs/images/$sciUnderscore.$imgExtn"; if ( ! -s $img ) { warn "missing htdocs/images/$sciUnderscore.{jpg|png|gif}\n\n"; exit 1; } } } my $widthHeight = `identify $img | awk '{print \$3}'`; chomp $widthHeight; my ($width, $height) = split('x', $widthHeight); my $borderWidth = $width + 15; my $fh = &HgAutomate::mustOpen(">$topDir/html/description.html"); print $fh <<_EOF_ <!-- Display image in righthand corner --> <TABLE ALIGN=RIGHT BORDER=0 WIDTH=$borderWidth> <TR><TD ALIGN=RIGHT> <A HREF="https://www.ncbi.nlm.nih.gov/genome/$ncbiGenomeId" TARGET=_blank> <IMG SRC="../images/$sciUnderscore.$imgExtn" WIDTH=$width HEIGHT=$height ALT="$genome"></A> </TD></TR> <TR><TD ALIGN=RIGHT> <FONT SIZE=-1><em>$scientificName</em><BR> </FONT> <FONT SIZE=-2> (<A HREF="$photoCreditURL" TARGET=_blank>$photoCreditName</A>) </FONT> </TD></TR> </TABLE> <P> <B>UCSC Genome Browser assembly ID:</B> $db<BR> <B>Sequencing/Assembly provider ID:</B> $assemblyLabel $providerAssemblyName<BR> <B>Assembly date:</B> $assemblyDate<BR> <B>Accession ID:</B> $genBankAccessionID<BR> _EOF_ ; if ($ncbiGenomeId ne "n/a") { printf $fh "<B>NCBI Genome ID:</B> <A HREF='https://www.ncbi.nlm.nih.gov/genome/$ncbiGenomeId' TARGET='_blank'>$ncbiGenomeId</A> ($scientificName)<BR>\n"; } else { printf $fh "<B>NCBI Genome ID:</B> $ncbiGenomeId<BR>\n"; } printf $fh "<B>NCBI Assembly ID:</B> <A HREF='https://www.ncbi.nlm.nih.gov/assembly/$ncbiAssemblyId' TARGET='_blank'>$ncbiAssemblyId</A><BR> <B>NCBI BioProject ID:</B> <A HREF='https://www.ncbi.nlm.nih.gov/bioproject/$ncbiBioProject' TARGET='_blank'>$ncbiBioProject</A><BR>\n"; if ($ncbiBioSample ne "n/a") { printf $fh "<B>NCBI BioSample ID:</B> <A HREF='https://www.ncbi.nlm.nih.gov/biosample/$ncbiBioSample' TARGET='_blank'>$ncbiBioSample</A>\n"; } else { printf $fh "<B>NCBI BioSample ID:</B> $ncbiBioSample<BR>\n"; } print $fh <<_EOF_ </P> <HR> <P> <B>Search the assembly:</B> <UL> <LI> <B>By position or search term: </B> Use the "position or search term" box to find areas of the genome associated with many different attributes, such as a specific chromosomal coordinate range; mRNA, EST, or STS marker names; or keywords from the GenBank description of an mRNA. <A HREF="../goldenPath/help/query.html">More information</A>, including sample queries. <LI> <B>By gene name: </B> Type a gene name into the "search term" box, choose your gene from the drop-down list, then press "submit" to go directly to the assembly location associated with that gene. <A HREF="../goldenPath/help/geneSearchBox.html">More information</A>. <LI> <B>By track type: </B> Click the "track search" button to find Genome Browser tracks that match specific selection criteria. <A HREF="../goldenPath/help/trackSearch.html">More information</A>. </UL> </P> <HR> <P> <B>Download sequence and annotation data:</B> <UL> <LI><A HREF="../goldenPath/help/ftp.html">Using rsync</A> (recommended) <LI><A HREF="ftp://hgdownload.soe.ucsc.edu/goldenPath/$db/">Using FTP</A> <LI><A HREF="http://hgdownload.soe.ucsc.edu/downloads.html#$anchorRoot">Using HTTP</A> <LI><A HREF="../goldenPath/credits.html#${anchorRoot}_use">Data use conditions and restrictions</A> <LI><A HREF="../goldenPath/credits.html#${anchorRoot}_credits">Acknowledgments</A> </UL> </P> _EOF_ ; close($fh); } # makeDescription # from Perl Cookbook Recipe 2.17, print out large numbers with comma # delimiters: sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } # definition of contig types in the AGP file my %goldTypes = ( 'A' => 'active finishing', 'D' => 'draft sequence', 'F' => 'finished sequence', 'O' => 'other sequence', 'P' => 'pre draft', 'W' => 'whole genome shotgun' ); # definition of gap types in the AGP file my %gapTypes = ( 'clone' => 'gaps between clones in scaffolds', 'heterochromatin' => 'heterochromatin gaps', 'short_arm' => 'a gap inserted at the start of an acrocentric chromosome', 'telomere' => 'telomere gaps', 'repeat' => 'an unresolvable repeat', 'centromere' => 'gaps for centromeres are included when they can be reasonably localized', 'scaffold' => 'gaps between scaffolds in chromosome assemblies', 'contig' => 'gaps between contigs in scaffolds', 'other' => 'gaps added at UCSC to annotate strings of <em>N</em>s that were not marked in the AGP file', 'fragment' => 'gaps between whole genome shotgun contigs' ); sub makeLocalTrackDbRa { # Make an assembly-level trackDb.ra, gap.html and gold.html. my @localFiles = qw( trackDb.ra gap.html gold.html ); my $fh = &HgAutomate::mustOpen(">$topDir/html/trackDb.ra"); print $fh <<_EOF_ # Local declaration so that local gold.html is picked up. track gold override html gold # Local declaration so that local gap.html is picked up. track gap override html gap _EOF_ ; close($fh); $fh = &HgAutomate::mustOpen(">$topDir/html/gap.html"); my $em = $commonName ? "" : "<em>"; my $noEm = $commonName ? "" : "</em>"; my $gapCount = `hgsql -N -e 'select count(*) from gap;' $db`; chomp $gapCount; if ($gotAgp) { print $fh <<_EOF_ <H2>Description</H2> <P> This track shows the gaps in the $assemblyDate $em\$organism$noEm genome assembly. </P> <P> Genome assembly procedures are covered in the NCBI <A HREF="https://www.ncbi.nlm.nih.gov/assembly/basics/" TARGET=_blank>assembly documentation</A>.<BR> NCBI also provides <A HREF="https://www.ncbi.nlm.nih.gov/assembly/$ncbiAssemblyId" TARGET="_blank">specific information about this assembly</A>. </P> <P> The definition of the gaps in this assembly is from the <A HREF="ftp://hgdownload.soe.ucsc.edu/goldenPath/$db/bigZips/$db.agp.gz" TARGET=_blank>AGP file</A> delivered with the sequence. The NCBI document <A HREF="https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/" TARGET=_blank>AGP Specification</A> describes the format of the AGP file. </P> <P> Gaps are represented as black boxes in this track. If the relative order and orientation of the contigs on either side of the gap is supported by read pair data, it is a <em>bridged</em> gap and a white line is drawn through the black box representing the gap. </P> _EOF_ ; if ($gapCount > 0) { print $fh "<P>This assembly contains the following principal types of gaps: <UL>\n"; } else { print $fh "<P>This assembly has no annotated gaps.\n</P>\n"; } open (GL, "hgsql -N -e 'select type from gap;' $db | sort | uniq -c | sort -n|") or die "can not select type from $db.gap table"; while (my $line = <GL>) { chomp $line; $line =~ s/^\s+//; my ($count, $type) = split('\s+', $line); my $minSize = `hgsql -N -e 'select min(size) from gap where type="$type";' $db`; chomp $minSize; my $maxSize = `hgsql -N -e 'select max(size) from gap where type="$type";' $db`; chomp $maxSize; my $sizeMessage = ""; if ($minSize == $maxSize) { $sizeMessage = sprintf ("all of size %s bases", commify($minSize)); } else { $sizeMessage = sprintf ("size range: %s - %s bases", commify($minSize), commify($maxSize)); } if (exists ($gapTypes{$type}) ) { printf $fh "<LI><B>%s</B> - %s (count: %s; %s)</LI>\n", $type, $gapTypes{$type}, commify($count), $sizeMessage; } else { die "makeLocalTrackDbRa: missing AGP gap type definition: $type"; } } close (GL); if ($gapCount > 0) { print $fh "</UL></P>\n"; } } else { print $fh <<_EOF_ <H2>Description</H2> This track depicts gaps in the draft assembly ($assemblyDate, $assemblyLabel) of the $em\$organism$noEm genome. *** Developer: remove this statement if no future assemblies are expected: Many of these gaps — with the exception of intractable heterochromatic gaps — may be closed during the finishing process. <P> Gaps are represented as black boxes in this track. If the relative order and orientation of the contigs on either side of the gap is known, it is a <em>bridged</em> gap and a white line is drawn through the black box representing the gap. </P> <P> This assembly was sequenced with paired reads taken from *** Developer: check if this is accurate: plasmids and fosmids of various sizes. Overlapping reads were merged into contigs, and pairing information was then used to join the contigs into scaffolds. The gap sizes are estimated from the size of the plasmids and fosmids, with a minimum gap size of $fakeAgpMinContigGap. </P> <P></P> <P>This assembly contains the following types of gaps: <UL> <LI><B>Contig</B> - gaps of size $fakeAgpMinContigGap to $fakeAgpMinScaffoldGap. <LI><B>Scaffold</B> - gaps greater than $fakeAgpMinScaffoldGap in size. </UL> </P> _EOF_ ; } close($fh); $fh = &HgAutomate::mustOpen(">$topDir/html/gold.html"); if ($gotAgp) { print $fh <<_EOF_ <H2>Description</H2> <P> This track shows the sequences used in the $assemblyDate $em\$organism$noEm genome assembly. </P> <P> Genome assembly procedures are covered in the NCBI <A HREF="https://www.ncbi.nlm.nih.gov/assembly/basics/" TARGET=_blank>assembly documentation</A>.<BR> NCBI also provides <A HREF="https://www.ncbi.nlm.nih.gov/assembly/$ncbiAssemblyId" TARGET="_blank">specific information about this assembly</A>. </P> <P> The definition of this assembly is from the <A HREF="ftp://hgdownload.soe.ucsc.edu/goldenPath/$db/bigZips/$db.agp.gz" TARGET=_blank>AGP file</A> delivered with the sequence. The NCBI document <A HREF="https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/" TARGET=_blank>AGP Specification</A> describes the format of the AGP file. </P> <P> In dense mode, this track depicts the contigs that make up the currently viewed scaffold. Contig boundaries are distinguished by the use of alternating gold and brown coloration. Where gaps exist between contigs, spaces are shown between the gold and brown blocks. The relative order and orientation of the contigs within a scaffold is always known; therefore, a line is drawn in the graphical display to bridge the blocks.</P> <P> Component types found in this track (with counts of that type in parentheses): <UL> _EOF_ ; open (GL, "hgsql -N -e 'select type from gold;' $db | sort | uniq -c | sort -rn|") or die "can not select type from $db.gold table"; while (my $line = <GL>) { chomp $line; $line =~ s/^\s+//; my ($count, $type) = split('\s+', $line); my $singleMessage = ""; if ((1 == $count) && (("F" eq $type) || ("O" eq $type))) { my $chr = `hgsql -N -e 'select chrom from gold where type=\"$type\";' $db`; my $frag = `hgsql -N -e 'select frag from gold where type=\"$type\";' $db`; chomp $chr; chomp $frag; $singleMessage = sprintf("(%s/%s)", $chr, $frag); } if (exists ($goldTypes{$type}) ) { if (length($singleMessage)) { printf $fh "<LI>%s - one %s %s</LI>\n", $type, $goldTypes{$type}, $singleMessage; } else { printf $fh "<LI>%s - %s (%s)</LI>\n", $type, $goldTypes{$type}, commify($count); } } else { die "makeLocalTrackDbRa: missing AGP contig type definition: $type"; } } close (GL); printf $fh "</UL></P>\n"; } else { print $fh <<_EOF_ <H2>Description</H2> <P> This track shows the draft assembly ($assemblyDate, $assemblyLabel) of the $em\$organism$noEm genome. *** Developer: check if this is accurate: Whole-genome shotgun reads were assembled into contigs. When possible, contigs were grouped into scaffolds (also known as "supercontigs"). The order, orientation and gap sizes between contigs within a scaffold are based on paired-end read evidence. </P> <P> Locations of contigs and scaffolds were deduced from runs of Ns in the assembled sequence. A run of Ns of more than $fakeAgpMinScaffoldGap bases was assumed to be a gap between scaffolds, and a run of Ns between $fakeAgpMinContigGap and $fakeAgpMinScaffoldGap was assumed to be a gap between contigs.</P> <P> In dense mode, this track depicts the contigs that make up the currently viewed scaffold. Contig boundaries are distinguished by the use of alternating gold and brown coloration. Where gaps exist between contigs, spaces are shown between the gold and brown blocks. The relative order and orientation of the contigs within a scaffold is always known; therefore, a line is drawn in the graphical display to bridge the blocks.</P> <P> All components within this track are of fragment type "W": whole genome shotgun contig. </P> _EOF_ ; } close($fh); my $localFiles = "$topDir/html/{" . join(',', @localFiles) . '}'; return $localFiles; } # makeLocalTrackDbRa sub makeTrackDb { # * step: trackDb [dbHost] my $runDir = "$topDir/TemporaryTrackDbCheckout"; &HgAutomate::mustMkdir($runDir); &HgAutomate::mustMkdir("$topDir/html"); &makeDescription(); my $localFiles = &makeLocalTrackDbRa(); my $whatItDoes =<<_EOF_ It makes a local checkout of kent/src/hg/makeDb/trackDb/ and makes *suggested* modifications. Checking in modified files, and adding new files, is left up to the user in case this script has made some wrong guesses about proper names and locations. _EOF_ ; my $bossScript = new HgRemoteScript("$scriptDir/makeTrackDb.csh", $dbHost, $runDir, $whatItDoes, $CONFIG); $bossScript->add(<<_EOF_ # These directories are necessary for running make in trackDb: $HgAutomate::git archive --remote=git://genome-source.soe.ucsc.edu/kent.git \\ --prefix=kent/ HEAD src/hg/makeDb/trackDb/loadTracks \\ src/hg/makeDb/trackDb/$dbDbSpeciesDir \\ src/hg/makeDb/trackDb/trackDb.transMap.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.ra \\ +src/hg/makeDb/trackDb/trackDb.encode3.ra \\ +src/hg/makeDb/trackDb/trackDb.chainNet.asmHub.ra \\ src/hg/makeDb/trackDb/crisprAll.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.primates.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.other.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.euarchontoglires.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.laurasiatheria.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.afrotheria.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.mammal.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.birds.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.sarcopterygii.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.fish.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.insects.ra \\ src/hg/makeDb/trackDb/trackDb.chainNet.nematode.ra \\ src/hg/makeDb/trackDb/chainNetPetMar1.ra \\ src/hg/makeDb/trackDb/chainNetPetMar2.ra \\ src/hg/makeDb/trackDb/trackDb.nt.ra \\ src/hg/makeDb/trackDb/joinedRmskComposite.ra \\ src/hg/makeDb/trackDb/joinedRmsk.ra \\ src/hg/makeDb/trackDb/trackDb.genbank.ra \\ src/hg/makeDb/trackDb/trackDb.genbank.new.ra \\ src/hg/makeDb/trackDb/trackDb.uniprot.ra \\ src/hg/makeDb/trackDb/crispr10K.ra \\ src/hg/makeDb/trackDb/crisprAll.ra \\ src/hg/makeDb/trackDb/tagTypes.tab \\ src/hg/lib/trackDb.sql \\ src/hg/lib/hgFindSpec.sql \\ src/hg/makeDb/trackDb/trackDb.ra | tar xf - cd kent/src/hg/makeDb/trackDb # Create the expected species-level directory for $db if necessary: mkdir -p $dbDbSpeciesDir/$db # Move local description files into place: mv $localFiles $dbDbSpeciesDir/$db/ if (1 == $forceDescription) then rm -f $dbDbSpeciesDir/$db/description.html endif # Copy the template description.html for $db into place, and link to it # from $HgAutomate::gbdb/$db/html/ . if (! -e $dbDbSpeciesDir/$db/description.html) then cp -p $topDir/html/description.html $dbDbSpeciesDir/$db/description.html endif mkdir -p $HgAutomate::gbdb/$db/html rm -f $HgAutomate::gbdb/$db/html/description.html ln -s $topDir/html/description.html $HgAutomate::gbdb/$db/html/ # Do a test run with the generated files: ./loadTracks trackDb_\${USER} hgFindSpec_\${USER} $db _EOF_ ); $bossScript->execute(); $endNotes .=<<_EOF_ Template trackDb.ra and .html's have been created, but they all need editing! cd $runDir/kent/src/hg/makeDb/trackDb/$dbDbSpeciesDir/$db Search for '***' notes in each file in and make corrections (sometimes the files used for a previous assembly might make a better template): description.html $localFiles Then copy these files to your ~/kent/src/hg/makeDb/trackDb/$dbDbSpeciesDir/$db - cd ~/kent/src/hg/makeDb/trackDb - edit makefile to add $db to DBS. - git add $dbDbSpeciesDir/$db/*.{ra,html} - git commit -m "Added $db to DBS." makefile - git commit -m "Initial descriptions for $db." $dbDbSpeciesDir/$db/*.{ra,html} - git pull; git push - Run make update DBS=$db and make alpha when done. - (optional) Clean up $runDir _EOF_ ; } # makeTrackDb ######################################################################### # main # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); &checkOptions(); &usage(1) if (scalar(@ARGV) != 1); ($CONFIG) = @ARGV; #*** Force -verbose until this is really ready to go: $opt_verbose = 3 if (! $opt_verbose); &parseConfig($CONFIG); $endNotes = ""; $bedDir = "$topDir/$HgAutomate::trackBuild"; $scriptDir = "$topDir/jkStuff"; if (-e "$topDir/chrom.sizes") { $chromBased = (`wc -l < $topDir/chrom.sizes` < $HgAutomate::splitThreshold); } $stepper->execute(); my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'trackDb') ? "" : " (through the '$stopStep' step)"; HgAutomate::verbose(1, "\n *** All done!$upThrough\n"); if ($endNotes) { #*** Should mail this to $ENV{'USER'} so it's not so easy to ignore. #*** Does mail work on all of our machines?? Even if it works on one, #*** we can ssh it. Should be in an HgAutomate routine. HgAutomate::verbose(0, "\n\nNOTES -- STUFF THAT YOU WILL HAVE TO DO --\n\n" . "$endNotes\n"); }