2aa31eacd21082add302adf39d080e47ebc01598 gperez2 Sun Mar 16 15:37:10 2025 -0700 Adding a new script to replace the checkSyncAndMetaDataAll cron. The python script compares hg38, hg19, mm39, and mm10 tables and metadata tables for hgw1 and hgwbeta discrepancies, refs #35172 diff --git src/utils/qa/check_Tables_Metadata.py src/utils/qa/check_Tables_Metadata.py new file mode 100755 index 00000000000..40070f28b02 --- /dev/null +++ src/utils/qa/check_Tables_Metadata.py @@ -0,0 +1,160 @@ +#!rusr/bin/env python3 + +# Program Header +# Name: Gerardo Perez +# Description: A program that compares hg38, hg19, mm39, and mm10 tables and metadata tables for hgw1 and hgwbeta discrepancies +# +# +# +# checkSessionsFromRR.py +# +# +# Development Environment: VIM - Vi IMproved version 7.4.629 + +import subprocess + +def bash(cmd): + """Executes a shell command in Bash and returns the output as a list of lines.""" + rawOutput = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True) + return rawOutput.stdout.split('\n')[0:-1] + + +def run_command(command): + """Executes a shell command and returns its output as a string. Prints an error message if the command fails.""" + result = subprocess.run(command, shell=True, capture_output=True, text=True) + if result.returncode != 0: + print("Error executing: {}".format(command)) + print("Error message: {}".format(result.stderr.strip())) + return result.stdout.strip() # Remove leading/trailing whitespace. + + +def get_data(host, query, database): + """Gets and sorts data from a database using a SQL query.""" + command = '{} -Ne "{}" {} 2>/dev/null | sort'.format(host, query, database) + return run_command(command).splitlines() + + +def filter_blat_servers(data): + """Splits blat*.soe.ucsc.edu after 'blat' and before '.soe.ucsc.edu', then re-formats for comparison.""" + filtered_entries = [] + for entry in data: + fields = entry.split() + if len(fields) > 2: + server_name = fields[1] + port = fields[2] + if server_name.startswith("blat") and ".soe.ucsc.edu" in server_name: + base_name = server_name.split(".soe.ucsc.edu")[0] # Remove domain part + new_entry = "{}\t{}\t{}\t{}".format(fields[0], base_name, port, "\t".join(fields[3:]) if len(fields) > 3 else "") + filtered_entries.append(new_entry) + return filtered_entries + + +def compare_table_lists(beta_tables, hgw1_tables, db): + """Compares table lists from two database sources and prints differences. + + Tables that exist only in one of the sources are displayed. + """ + ignored_beta_tables = {"hgFindSpec_public", "trackDb_public", "relatedTrack_public", "metaDb_public", "metaDb_back", "metaDb_public_back"} + + # Filter out ignored tables from beta_tables + filtered_beta_tables = [] + for table in beta_tables: + if table not in ignored_beta_tables: + filtered_beta_tables.append(table) + + # Convert lists to sets for comparison + set1, set2 = set(filtered_beta_tables), set(hgw1_tables) + only_in_beta_tables = set1 - set2 + only_in_hgw1_tables = set2 - set1 + + # Print differences only if there are discrepancies + if only_in_beta_tables or only_in_hgw1_tables: + print("\nDiscrepancies between hgw1 and beta tables for Database: {}".format(db)) + + if only_in_beta_tables: + print("Tables only in hgwbeta:") + for table in sorted(only_in_beta_tables): + print(table) + print() + + if only_in_hgw1_tables: + print("Tables only in hgw1:") + for table in sorted(only_in_hgw1_tables): + print(table) + print() + + +def compare_metatables(db_list, metatables, host1, host2, centdb1, centdb2): + """Compares metadata tables across two databases in db_list.""" + for db in db_list: + for metatable in metatables: + # Construct different SQL queries depending on the metatable type + if metatable == "liftOverChain": + query = "SELECT * FROM {} WHERE fromDb = '{}' OR toDb = '{}'".format(metatable, db, db) + elif metatable == "blatServers": + query = "SELECT * FROM {} WHERE db = '{}'".format(metatable, db) + else: + query = "SELECT * FROM {} WHERE name = '{}'".format(metatable, db) + + # Gets and sort data from both databases + data1 = get_data(host1, query, centdb1) + data2 = get_data(host2, query, centdb2) + + # Apply filtering to blatServers to remove domain part + if metatable == "blatServers": + data1 = filter_blat_servers(data1) + + # Convert lists to sets for comparison + set1 = set(data1) + set2 = set(data2) + + # Identify unique entries in each database + only_in_centdb1 = set1 - set2 + only_in_centdb2 = set2 - set1 + + # Print differences if any unique entries exist + if only_in_centdb1: + print("Discrepancies between hgw1 and beta metadata tables for Database: {}".format(db)) + print("{} entries only in {}:".format(metatable, centdb1)) + for line in sorted(only_in_centdb1): + print(line) + print() # Add a newline for better readability + + if only_in_centdb2: + print("Discrepancies between hgw1 and beta metadata tables for Database: {}".format(db)) + print("{} entries only in {}:".format(metatable, centdb2)) + for line in sorted(only_in_centdb2): + print(line) + print() # Add a newline for better readability + + +def main(): + """Main function to compare database tables and metadata.""" + db_list = ["hg38", "hg19", "mm39", "mm10"] + centdb1, centdb2 = "hgcentralbeta", "hgcentral" + host1 = "/cluster/bin/x86_64/hgsql -h hgwbeta" + host2 = "/cluster/bin/x86_64/hgsql -h genome-centdb" + metatables = ["liftOverChain", "blatServers", "dbDb"] + + # Compare table lists for each database in db_list + for db in db_list: + beta_tables = bash("/cluster/bin/x86_64/hgsql -h hgwbeta -Ne 'show tables' {}".format(db)) + hgw1_tables = bash("ssh qateam@hgw1 'HGDB_CONF=`pwd`/.hg.local.conf /home/qateam/bin/x86_64/hgsql -N -e \"show tables\" \"{}\"'".format(db)) + compare_table_lists(beta_tables, hgw1_tables, db) + + # Compare metadata tables + compare_metatables(db_list, metatables, host1, host2, centdb1, centdb2) + + +if __name__ == "__main__": + main() + +# Program Output (Commented out) +#Discrepancies between hgw1 and beta tables for Database: rn6 +#Tables only in hgwbeta: +#refSeqSummary +# +#Discrepancies between hgw1 and beta metadata tables for Database: hg38 +#blatServers entries only in hgcentralbeta: +#hg38 blat1c 17780 1 0 0 +#hg38 blat1c 17781 0 1 0