61654162481479361c996500c8dcdae120caae95 angie Fri Dec 20 16:06:08 2024 -0800 March 2024 build: start with Lily Karim's tree and Ash O'Farrell's metadata, add GenBank sequences, combine metadata. diff --git src/hg/utils/otto/mtb/combineMetadata.py src/hg/utils/otto/mtb/combineMetadata.py new file mode 100755 index 0000000..c857f46 --- /dev/null +++ src/hg/utils/otto/mtb/combineMetadata.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python + +import csv +import re +import logging, argparse, sys, math +from warnings import warn +from collections import defaultdict +import pandas as pd +import dateutil.parser + + +def cleanDate(messyDate): + if isinstance(messyDate, float): + if math.isnan(messyDate): + return '' + else: + return int(messyDate) + elif messyDate.endswith('.0'): + return messyDate.removesuffix('.0') + elif messyDate.startswith('between '): + return messyDate.removeprefix('between ') + else: + return dateutil.parser.parse(messyDate).strftime('%Y-%m-%d') + + +def checkEqualOrEmpty(valA, valB): + if valA == 'nan' or (isinstance(valA, float) and math.isnan(valA)): + valA = '' + if valB == 'nan' or (isinstance(valB, float) and math.isnan(valB)): + valB = '' + if valA == '': + return valB + elif valB == '': + return valA + else: + if valA != valB: + warn(f"Mismatching non-empty values '{valA}' != '{valB}'") + return valA + +def useLonger(valA, valB): + if valA == 'nan' or (isinstance(valA, float) and math.isnan(valA)): + valA = '' + if valB == 'nan' or (isinstance(valB, float) and math.isnan(valB)): + valB = '' + if valA == '': + return valB + elif valB == '': + return valA + elif valA.startswith(valB): + return valA + elif valB.startswith(valA): + return valB + else: + if valA != valB: + warn(f"Mismatching non-empty values '{valA}' != '{valB}'") + return valA + + +def useAltIfEmpty(val, alt): + if val == 'nan' or (isinstance(val, float) and math.isnan(val)): + val = '' + if alt == 'nan' or (isinstance(alt, float) and math.isnan(alt)): + alt = '' + if val == '': + return alt + else: + return val + + +def main(): + parser = argparse.ArgumentParser(description=""" +Read in INSDC metadata and metadata from Ash O'Farrell, join into one cohesive +output TSV on stdout. +""" + ) + parser.add_argument('insdc_metadata', help='TSV from INSDC samples with user-friendly tree name, may include BioSample ID') + parser.add_argument('ash_metadata', help="TSV from Ash O'Farrell keyed to BioSample ID") + args = parser.parse_args() + + # Read inputs + insdc_df = pd.read_csv(args.insdc_metadata, sep='\t', dtype='unicode') + ash_df = pd.read_csv(args.ash_metadata, sep='\t', dtype='unicode') + + # Convert arbitrary date strings to ISO dates + ash_df['date_isolation'] = ash_df['date_isolation'].apply(cleanDate) + + both_df = (insdc_df.rename(columns = {'biosample_accession': 'BioSample'}) + .merge(ash_df, on = ['BioSample'], how = 'outer') + ) + + # Make sure that redundant columns agree (if not empty) + both_df['country_x'] = both_df['country_x'].combine(both_df['country_y'], checkEqualOrEmpty) + both_df['date'] = both_df['date'].combine(both_df['date_isolation'], useLonger) + both_df['location'] = both_df['location'].combine(both_df['region'], checkEqualOrEmpty) + both_df['bioproject_accession'] = both_df['bioproject_accession'].combine(both_df['BioProject'], + checkEqualOrEmpty) + both_df['sra_sample_accession'] = both_df['sra_sample_accession'].combine(both_df['ERS'], + checkEqualOrEmpty) + + # If 'strain' column is empty (i.e. BioSample is only in Ash's metadata), set it to BioSample + both_df['strain'] = both_df['strain'].combine(both_df['BioSample'], useAltIfEmpty) + + # Drop unwanted columns and tweak some column names + both_df = (both_df.drop(['year_isolation', 'tba3_no_lineage', 'on_20240221_final', + 'date_isolation', 'country_y', 'region', 'BioProject', 'ERS'], axis=1) + .rename(columns = {'country_x': 'country', + 'location': 'region', + 'bioproject_accession': 'BioProject', + 'sra_sample_accession': 'SraSample'})) + + # Write to stdout + both_df.to_csv(sys.stdout, sep='\t', index=False) + +main()