7a14294a4cf3e478e7ed6f353af8c482ee6b8934
braney
Wed Jan 13 11:36:28 2021 -0800
search only hg/lib for AS files to build the tableDescriptions table.
diff --git src/test/buildTableDescriptions.pl src/test/buildTableDescriptions.pl
index 3da3d2f..b9b00ad 100755
--- src/test/buildTableDescriptions.pl
+++ src/test/buildTableDescriptions.pl
@@ -1,459 +1,467 @@
#!/usr/bin/env perl
#
# buildTableDescriptions.pl - read .as files in a directory tree and use them to
# create the tableDescriptions table used by hg CGI's
# See usage for a description of parameters.
#
use warnings;
# Figure out path of executable so we can add perllib to the path.
use FindBin qw($Bin);
use lib "$Bin/perllib";
use TrackDb;
use HgConf;
use Getopt::Long;
use DBI;
use Carp;
use strict;
#
# Default behaviors, changeable by command line args:
#
my $kentSrc = "/cluster/home/galt/kentclean/src";
# currently does not get checked-in to git:
my $gbdDPath = "/cluster/home/galt/kentclean/src/hg/htdocs/goldenPath/gbdDescriptions.html";
my $noLoad = 0;
my $verbose = 0;
+my $subDir = "";
# Hard-coded behaviors:
my $debug = 0;
# These are ignored to avoid errors about duplicate table/object definitions.
my %autoSqlIgnore = ( "hg/autoSql/tests/input/polyTest.as" => "",
"hg/autoSql/tests/input/newTest.as" => "",
"hg/autoSql/tests/input/doc.as" => "",
"hg/autoSql/tests/input/doc2.as" => "",
"hg/autoSql/doc.as" => "",
"hg/autoSql/doc2.as" => "",
"hg/instinct/instinctMember/members.as" => "",
"hg/instinct/bioIntegrator/bioIntDb.as" => "",
"hg/lib/bed.as" => "",
"hg/lib/ggDbRep.as" => "",
"hg/lib/genotype.as" => "",
"hg/lib/ispyTables.as" => "",
"hg/lib/gsid/vax003AaCons.as" => "",
"hg/lib/gsid/vax003AaMaf.as" => "",
"hg/lib/gsid/vax003Cons.as" => "",
"hg/lib/gsid/vax003Maf.as" => "",
"hg/lib/gsid/vax004AaCons.as" => "",
"hg/lib/gsid/vax004AaMaf.as" => "",
"hg/lib/gsid/vax004Cons.as" => "",
"hg/lib/gsid/vax004Maf.as" => "",
"hg/makeDb/schema/joinerGraph/all.as" => "",
"hg/makeDb/schema/joinerGraph/swissProt.as" => "",
"hg/protein/spToDb/spDbTables.as" => "",
"hg/ultras/ultraDb/as/udHistory.as" => "",
"hg/bioImage/bioImage.as" => "",
"hg/bioImage/loadMahoney/mahoney.as" => "",
"hg/genePix/genePix.as" => "",
"hg/genePix/loadMahoney/mahoney.as" => "",
"hg/visiGene/visiGene.as" => "",
"hg/visiGene/vgLoadMahoney/mahoney.as" => "",
"lib/pslWScore.as" => "",
"utils/bedToBigBed/tests/input/itemRgb.as" => "",
);
my $basename = $0; $basename =~ s@.*/@@;
#
# usage: Print help message and exit, happy or unhappy.
#
sub usage {
print STDERR "Usage:
$basename [-kentSrc dir] [-gbdDPath f] [-noLoad] [-help]
-kentSrc dir: Use dir as the kent/src checkout.
Default: $kentSrc.
-gbdDPath f: Use f as the gbdDescriptions.html.
Default: $gbdDPath.
-db db: Work only on db, not on all active dbs.
-hgConf file: Use file instead of ~/.hg.conf.
-noLoad: Don't load the database, just create .sql files.
+ -subDir Grab AS files from this sub directory of kentSrc rather than kentSrc
-help: Print this message.
";
exit(@_);
} # end usage
#
# getActiveDbs: connect to central db, get list of active dbs
#
sub getActiveDbs {
my $hgConf = shift;
confess "Too many arguments" if (defined shift);
my $centdb = $hgConf->lookup('central.db');
my $host = $hgConf->lookup('central.host');
my $username = $hgConf->lookup('central.user');
my $password = $hgConf->lookup('central.password');
my $dbh = DBI->connect("DBI:mysql:database=$centdb;host=$host",
$username, $password);
my $results =
$dbh->selectcol_arrayref("select name from dbDb where active = 1;");
$dbh->disconnect();
return @{$results};
}
#
# simplifyFields: trim fieldSpec of occasionally-used prefix/suffix fields
#
sub simplifyFields {
my $fieldSpec = shift;
confess "Too few arguments" if (! defined $fieldSpec);
confess "Too many arguments" if (defined shift);
$fieldSpec =~ s/^bin,//;
$fieldSpec =~ s/,crc,$/,/;
return $fieldSpec;
}
#
# getTableFields: connect to db, get tables and fields
#
sub getTableFields {
my $hgConf = shift;
my $db = shift;
confess "Too few arguments" if (! defined $db);
confess "Too many arguments" if (defined shift);
my $host = $hgConf->lookup('db.host');
if ($host && $host ne 'localhost') {
$host = ";host=$host";
} else {
$host = "";
}
my $username = $hgConf->lookup('db.user');
my $password = $hgConf->lookup('db.password');
my $dbh = DBI->connect("DBI:mysql:database=$db$host", $username, $password);
my %tableFields = ();
my %tableNamesInsens = ();
my $tables = $dbh->selectcol_arrayref("show tables;");
foreach my $t (@{$tables}) {
my $desc = $dbh->selectcol_arrayref("desc $t;");
my $fields = "";
foreach my $f (@{$desc}) {
$fields .= "$f,";
}
$t =~ s/^chr\w+_(\w+_)?/chrN_/;
$fields = &simplifyFields($fields);
if (defined $tableFields{$t} &&
$tableFields{$t} ne $fields) {
warn "table fields differ for split table $db.$t:\n$tableFields{$t} != $fields";
}
my $tableInsens = $t;
$tableInsens =~ tr/A-Z/a-z/;
if (! defined $tableFields{$t} &&
defined $tableNamesInsens{$tableInsens}) {
warn "Case-insensitive duplicate for $db.$t... dropping.";
} else {
$tableFields{$t} = $fields;
$tableNamesInsens{$tableInsens} = 1;
}
}
$dbh->disconnect();
return %tableFields;
}
#
# slurpAutoSql: find all .as files under rootDir, grab contents.
#
sub slurpAutoSql {
my $rootDir = shift;
+ my $subDir = shift;
confess "Too few arguments" if (! defined $rootDir);
confess "Too many arguments" if (defined shift);
- open(P, "find $rootDir -name '*.as' -print |") || die "Can't open pipe";
+ open(P, "find $rootDir/$subDir -name '*.as' -print |") || die "Can't open pipe";
my %tableAS = ();
my %objectAS = ();
my $gotLeftParen = 0;
while (
) {
chop;
my $filename = $_;
my $filetail = $filename; $filetail =~ s/^$kentSrc\///;
next if (defined $autoSqlIgnore{$filetail});
open(F, "$filename") || die "Can't open $filename";
my $as = "";
my $table = "";
my $object = "";
my $fields = "";
while () {
$as .= $_;
if (/^\s*\(/) {
$gotLeftParen = 1;
}
if (/^\s*table\s+(\S+)[^\;]*$/i) {
$table = $1;
$object = "";
} elsif (/^\s*(object|simple)\s+(\S+)/ && !$gotLeftParen) {
$object = $2;
$table = "";
} elsif (/^\s*enum\s*\([^\)]+\)\s*(\S+)\s*;/) {
$fields .= "$1,";
} elsif (/^[^\"]+\s+(\S+)\s*;/) {
$fields .= "$1,";
} elsif (/^\s*\)/) {
if (($table eq "" && $object eq "") || $fields eq "") {
die "Trouble parsing autoSql file $filename:\n$as";
}
if ($table ne "") {
if (defined $tableAS{$table}) {
warn "Duplicate autoSql def for table $table (" .
$tableAS{$table}->{filename} . " vs. $filename)";
}
$tableAS{$table} = { fields => &simplifyFields($fields),
autoSql => $as,
tableName => $table,
filename => $filename, };
} elsif ($object ne "") {
if (defined $objectAS{$table}) {
warn "Duplicate autoSql def for object $object (" .
$objectAS{$object}->{filename} . " vs. $filename)";
}
$objectAS{$object} = { autoSql => $as,
objectName => $object,
filename => $filename, };
}
$as = $table = $object = $fields = "";
$gotLeftParen = 0;
}
} # each line of autoSql file
close(F);
} # each autoSql file found in $rootDir
close(P);
# Add bedN (for when we get the fields from the trackDb.type).
my @bedFields = split(",", $tableAS{"bed"}->{fields});
my @autoSql = split("\n", $tableAS{"bed"}->{autoSql});
my $filename = $tableAS{"bed"}->{filename};
for (my $n=scalar(@bedFields); $n >= 3; $n--) {
$tableAS{"bed$n"} = { fields => join(",", @bedFields) . ",",
autoSql => join("\n", @autoSql) . "\n",
tableName => "bed$n",
filename => $filename, };
my $lastField = pop(@bedFields);
my @newAS = ();
my $nm1 = $n - 1;
foreach my $as (@autoSql) {
$as =~ s/table\s+\S+/table bed$nm1/;
if ($as !~ /\s+$lastField\s*;/) {
push @newAS, $as;
}
}
@autoSql = @newAS;
}
# When table definitions rely on objects, add the object definitions to
# the stored table autoSql (so the autoSql parser doesn't die in hgTables).
foreach my $table (keys %tableAS) {
if ($tableAS{$table}->{autoSql} =~ /\n\s+table (\w+)/) {
my $object = $1;
if (defined $objectAS{$object}) {
$tableAS{$table}->{autoSql} .= ("\n" . $objectAS{$object}->{autoSql});
} elsif (defined $tableAS{$object}) {
$tableAS{$table}->{autoSql} .= ("\n" . $tableAS{$object}->{autoSql});
} else {
warn "Incomplete AutoSql? table $table refers to object $object " .
"but I can't find the definition of $object.";
}
}
}
return %tableAS;
}
#
# indexAutoSqlByFields: make a new AutoSql hash, indexed by fields not table
#
sub indexAutoSqlByFields {
my $tASRef = shift;
confess "Too few arguments" if (! defined $tASRef);
confess "Too many arguments" if (defined shift);
my %fieldsAS = ();
foreach my $t (keys %{$tASRef}) {
my $asRef = $tASRef->{$t};
my $fields = $asRef->{fields};
$fieldsAS{$fields} = $asRef;
}
return %fieldsAS;
}
#
# matchAutoSqlByFields: see if there's an autoSql def for given fields.
#
sub matchAutoSqlByFields {
my $fields = shift;
my $tASRef = shift;
my $fASRef = shift;
confess "Too few arguments" if (! defined $fASRef);
confess "Too many arguments" if (defined shift);
# try standard types first, to save time (and avoid dupl's for std types).
if ($fields eq $tASRef->{"psl"}->{fields}) {
return $tASRef->{"psl"};
} elsif ($fields eq $tASRef->{"genePredExt"}->{fields}) {
return $tASRef->{"genePredExt"};
} elsif ($fields eq $tASRef->{"genePred"}->{fields}) {
return $tASRef->{"genePred"};
} elsif ($fields eq $tASRef->{"lfs"}->{fields}) {
return $tASRef->{"lfs"};
} elsif ($fields eq $tASRef->{"genericNameValue"}->{fields}) {
return $tASRef->{"genericNameValue"};
} elsif ($fields eq $tASRef->{"openChromCombinedPeaks"}->{fields}) {
return $tASRef->{"openChromCombinedPeaks"};
} else {
for (my $n=12; $n >= 3; $n--) {
if ($fields eq $tASRef->{"bed$n"}->{fields}) {
return $tASRef->{"bed$n"};
}
}
return $fASRef->{$fields};
}
}
#
# parseGbdDescriptions: parse anchors and .as out of gbdDescriptions.html
#
sub parseGbdDescriptions {
my $filename = shift;
confess "Too few arguments" if (! defined $filename);
confess "Too many arguments" if (defined shift);
open(F, "$filename") || die "Can't open $filename";
my %tableAnchors = ();
my $anchor = "";
while () {
if (m/\s*table\s+([\w_]+)/) {
$tableAnchors{$1} = $anchor;
}
}
close(F);
return %tableAnchors;
}
###########################################################################
#
# Parse & process command line args
#
# GetOptions will put command line args here:
use vars qw/
$opt_kentSrc
$opt_gbdDPath
$opt_noLoad
$opt_db
$opt_hgConf
$opt_help
+ $opt_subDir
$opt_verbose
/;
my $ok = GetOptions("kentSrc=s",
"gbdDPath=s",
"noLoad",
"db=s",
"hgConf=s",
"help",
+ "subDir=s",
"verbose");
&usage(1) if (! $ok);
&usage(0) if ($opt_help);
$kentSrc = $opt_kentSrc if ($opt_kentSrc);
$gbdDPath = $opt_gbdDPath if ($opt_gbdDPath);
$noLoad = 1 if (defined $opt_noLoad);
$verbose = $opt_verbose if (defined $opt_verbose);
$verbose = 1 if ($debug);
+$subDir = $opt_subDir if ($opt_subDir);
# If -hgConf is given, set HGDB_CONF environment variable so our call to
# hgsql uses the correct file.
if ($opt_hgConf) {
if (! -e $opt_hgConf) {
die "Error: -hgConf file \"$opt_hgConf\" does not exist.\n";
}
$ENV{HGDB_CONF} = $opt_hgConf;
}
############################################################################
# MAIN
-my %tableAutoSql = slurpAutoSql($kentSrc);
+my %tableAutoSql = slurpAutoSql($kentSrc, $subDir);
my %fieldsAutoSql = indexAutoSqlByFields(\%tableAutoSql);
my %tableAnchors = parseGbdDescriptions($gbdDPath);
my $hgConf = HgConf->new($opt_hgConf);
my @auxDbs = ('hgFixed', 'proteome');
my @dbs = (defined $opt_db) ? split(',', $opt_db) :
(&getActiveDbs($hgConf), @auxDbs);
foreach my $db (@dbs) {
my $sqlFile = "$db.tableDescriptions.sql";
open(SQL, ">$sqlFile") || die "Can't open $sqlFile for writing";
print SQL "use $db;\n";
print SQL "drop table if exists tableDescriptions;";
open(F, "$kentSrc/hg/lib/tableDescriptions.sql")
|| die "Can't open $kentSrc/hg/lib/tableDescriptions.sql";
while () {
print SQL;
}
close (F);
my $trackDb = TrackDb->new($db);
my %tableTypes = $trackDb->getTrackNamesTypes();
my %tableFields = &getTableFields($hgConf, $db);
foreach my $table (sort keys %tableFields) {
next if ($table =~ /^(trackDb|hgFindSpec|metaDb)_/);
next if $tableFields{$table} eq "fileName,";
if ((! defined $tableAutoSql{$table}) ||
($tableFields{$table} ne $tableAutoSql{$table}->{fields})) {
my $as =
&matchAutoSqlByFields($tableFields{$table}, \%tableAutoSql,
\%fieldsAutoSql);
if (defined $as) {
$tableAutoSql{$table} = $as;
}
}
if (! defined $tableTypes{$table} &&
defined $tableAutoSql{$table}) {
$tableTypes{$table} = $tableAutoSql{$table}->{tableName};
$tableTypes{$table} =~ s/bed\d+/bed/;
}
my $type = $tableTypes{$table};
if (defined $type && $table ne "mrna") {
my $typeN = $type;
$typeN =~ s/^bed (\d+).*/bed$1/;
$typeN =~ s/^(\w+).*/$1/;
$type =~ s/^(\w+).*/$1/;
if (! defined $tableAutoSql{$table} &&
defined $tableAutoSql{$typeN}) {
$tableAutoSql{$table} = $tableAutoSql{$typeN};
}
if (! defined $tableAnchors{$table} &&
defined $tableAnchors{$type}) {
$tableAnchors{$table} = $tableAnchors{$type};
}
}
my $as = $tableAutoSql{$table};
if (defined $as) {
if ($tableFields{$table} ne $as->{fields}) {
print "$db.$table FIELD MISMATCH:\n";
print "$db.$table table fields: $tableFields{$table}\n";
print "$db.$table autoSql fields: $as->{fields} [$as->{tableName}]\n";
}
} else {
print "$db.$table: No AutoSql.\n";
}
my $anchor = $tableAnchors{$table} || "";
#*** should suggest addition to gbdD of table&.as if not already in there;
#*** should complain about gbdD tables not in any active db.
my $asd = (defined $as) ? $as->{autoSql} : "";
+ # my $asFileName = (defined $as) ? $as->{filename} : "noAs";
$asd =~ s/'/\\'/g;
+ # print "fileName $asFileName $table\n";
print SQL "INSERT INTO tableDescriptions (tableName, autoSqlDef, gbdAnchor)"
. " values ('$table', '$asd', '$anchor');\n";
}
# Thanks Jorge for finding that this fixes a problem with myisamchk complaining
# that the file was not closed properly:
print SQL "FLUSH TABLES tableDescriptions;\n";
close(SQL);
if (! $noLoad) {
(! system("/cluster/bin/x86_64/hgsql $db < $sqlFile")) || die "hgsql error for $sqlFile";
print "Loaded $db.tableDescriptions.\n";
unlink($sqlFile);
}
}