#!/usr/bin/env python3
"""Convert an NCBI Datasets / Virus biosample.jsonl file to tab-separated text.

Each input line holds the JSON for one BioSample record.  One TSV row is
printed per record with the columns:
gi, acc, name, date, lab, author, country, locale, hostId, sraId, epiId.

Provenance: src/hg/utils/otto/sarscov2phylo/bioSampleJsonToTab.py as of commit
c28af7c6bbe39c880c7e6dcc0f1c3ff2ee5b427c (angie, Fri Sep 3 12:11:22 2021 -0700):
"Grab lab from obscure location found by Nextstrain team in ncov-ingest PR#208."
"""

import json
import re
import logging
import argparse
import sys
from warnings import warn
from collections import defaultdict

# Lower-cased attribute values that mean "no data"; see isReal().
_placeholderValues = frozenset(['missing', 'unknown', 'not applicable', 'not collected',
                                'not provided', 'restricted access'])


def nameValToDDict(attributes):
    """Dunno why, but instead of a plain old JSON object, they used an array of
    { "name": ..., "value": ... } objects for attributes.  Unpack into plain old
    defaultdict(str), stripping whitespace around each value.  An entry without
    a value (missing or null) is stored as the empty string."""
    attrs = defaultdict(str)
    for nameVal in attributes:
        attrs[nameVal['name']] = (nameVal.get('value') or '').strip()
    return attrs


def isReal(val):
    """Return True if val is something real, not empty and not a placeholder
    such as 'missing' or 'not provided' (compared case-insensitively)."""
    if not val:
        return False
    return val.lower() not in _placeholderValues


def tryAttrs(attrs, choices):
    """See if attrs has a real (see isReal) value for anything in choices; if so,
    return the first one encountered, otherwise return empty string."""
    for choice in choices:
        val = attrs.get(choice)
        if isReal(val):
            return val
    return ""


# Prefixes/infixes that labs sometimes add to names; removed in this order.
nameFluff = ['SARS-CoV-2/', 'SARS-Cov-2/', 'hCoV-19/', 'hCov-19/', 'Human/', 'human/',
             'Severe acute respiratory syndrome coronavirus 2/',
             'Severe_acute_respiratory_syndrome_coronavirus2/',
             '/North America/', 'BetaCov/']


def removeNameFluff(name):
    """Return name with every occurrence of each nameFluff string removed."""
    for fluff in nameFluff:
        name = name.replace(fluff, '')
    return name


def scoreName(name):
    """Make up some scores that will help us compare names given in different places
    in the record (because record fields and attributes are used so inconsistently by
    different labs).  We're hoping for a country/isolate/year name but have to settle
    for the most ID-ish thing.  LOWER scores are better.
    removeNameFluff before calling this."""
    if not name:
        return 100
    elif name.isdigit():
        return 70
    elif '/' in name:
        return 10
    elif len(name) > 25:
        return 80
    elif name.isalpha():
        return 75
    elif name == "SARS-CoV-2":
        return 78
    else:
        return 50


def bestName(nameList):
    """Return the best (lowest scoreName) de-fluffed name, which hopefully will be
    the most familiar and useful for matching with GISAID.  Ties go to the name that
    appears first in nameList.  Returns empty string if nameList is empty."""
    defluffedList = [removeNameFluff(name) for name in nameList]
    if not defluffedList:
        # Nothing usable in the record; don't crash on [0] of an empty list.
        return ""
    # min() returns the first minimal element, same as a stable sort followed by [0].
    return min(defluffedList, key=scoreName)


def nameFromRecordAttrs(record, attrs):
    """Try various attributes and other record fields that often contain something
    like the country/isolate/year format that we want.  Strip fluff that is
    sometimes prepended.  Returns empty string if no candidate is found."""
    possibleNames = []
    for attrName in ['sample name', 'Submitter Id', 'strain', 'isolate', 'title',
                     'virus identifier']:
        val = attrs.get(attrName)
        if isReal(val):
            possibleNames.append(val)
    if record.get('description'):
        title = record['description'].get('title')
        if title:
            possibleNames.append(title)
    if record.get('sampleIds'):
        for obj in record['sampleIds']:
            # Tolerate a 'Sample name' entry that lacks a value.
            if obj.get('label') == 'Sample name' and obj.get('value'):
                possibleNames.append(obj['value'])
    return bestName(possibleNames)


def dateFromAttrs(attrs):
    """Try to extract most complete sample collection date from attrs: use the
    receipt date only when it is strictly longer than the collection date (which
    includes the case of no collection date at all)."""
    collectionDate = tryAttrs(attrs, ['collection date', 'collection_date'])
    receiptDate = tryAttrs(attrs, ['receipt date', 'receipt_date'])
    if len(receiptDate) > len(collectionDate):
        return receiptDate
    return collectionDate


def labFromRecordAttrs(record, attrs):
    """Return the submitting/collecting lab from attrs.  If attrs has none and the
    record's owner is the European Bioinformatics Institute, fall back to the 'db'
    of the sampleIds entry labeled "Sample name" (the last such entry wins)."""
    lab = tryAttrs(attrs, ['collecting institution', 'collected by', 'INSDC center name',
                           'collected_by'])
    # HT Jover Lee https://github.com/nextstrain/ncov-ingest/pull/208
    owner = record.get('owner')
    if not lab and owner and owner.get('name') == "European Bioinformatics Institute":
        for sidBlob in record.get('sampleIds') or []:
            # Skip entries without a db instead of raising KeyError.
            if sidBlob.get('label') == "Sample name" and sidBlob.get('db'):
                lab = sidBlob['db']
    return lab


def authorFromAttrs(attrs):
    """Return the 'collector name' attribute if real, else empty string."""
    return tryAttrs(attrs, ['collector name'])


def countryFromAttrs(attrs):
    """Return the geographic location attribute if real, else empty string.
    The value may have the form 'country: locale'; see localeFromAttrs."""
    return tryAttrs(attrs, ['geographic location', 'geo_loc_name'])


def localeFromAttrs(attrs, country):
    """Return (country, locale).  Locale comes from its own attribute if present;
    otherwise, if country looks like 'country: locale', split it at the FIRST colon
    so that a locale which itself contains colons stays intact."""
    locale = tryAttrs(attrs, ['geographic location (region and locality)'])
    if country and not locale and ':' in country:
        # maxsplit=1 guarantees exactly two pieces to unpack.
        country, locale = country.split(':', 1)
        locale = locale.strip()
    return country, locale


def hostIdFromAttrs(attrs):
    """Return the host subject ID attribute if real, else empty string."""
    return tryAttrs(attrs, ['host subject id', 'host_subject_id'])


# DOTALL so that an ID on a later line of a multi-line comment is still found.
# The leading greedy .* means the last EPI_ISL_ ID in the text is the one captured.
epiIslRe = re.compile(r'.*(EPI_ISL_[0-9]+).*', re.DOTALL)


def epiIdFromRecordAttrs(record, attrs):
    """Return the GISAID EPI_ISL_ accession from attrs, or failing that, one
    embedded in the record's description comment.  A value of "0" is treated as
    no ID.  Returns empty string if none is found."""
    epiId = tryAttrs(attrs, ['gisaid_accession', 'gisaid_accession_id', 'gisaid', 'gisaid id',
                             'gisaid accession id', 'subgroup', 'GISAID Accession ID'])
    if not epiId and record.get('description'):
        comment = record['description'].get('comment')
        if comment:
            m = epiIslRe.match(comment)
            if m:
                epiId = m.group(1)
    if epiId == "0":
        epiId = ""
    return epiId


def sraIdFromRecord(record):
    """Return the value of the first sampleIds entry whose db is 'SRA', or empty
    string if there is none."""
    for dbVal in record.get('sampleIds') or []:
        if dbVal.get('db') == 'SRA':
            return dbVal.get('value') or ""
    return ""


def main():
    """Parse the command line, read the JSON-lines file and print one TSV row per
    BioSample record to stdout."""
    parser = argparse.ArgumentParser(description="""
Read in NCBI Datasets / Virus biosample.jsonl file (each line has JSON for a BioSample record),
extract relevant attributes, and output TSV.
"""
    )
    parser.add_argument('jsonFile', help='File with one line of JSON per BioSample entry')
    args = parser.parse_args()

    # Explicit encoding so results don't depend on the locale of the machine.
    with open(args.jsonFile, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Blank lines are not records; json.loads would raise on them.
                continue
            record = json.loads(line)
            attrs = nameValToDDict(record.get('attributes') or [])
            acc = record['accession']
            gi = ""  # always empty; the column is still emitted
            name = nameFromRecordAttrs(record, attrs)
            date = dateFromAttrs(attrs)
            lab = labFromRecordAttrs(record, attrs)
            author = authorFromAttrs(attrs)
            country = countryFromAttrs(attrs)
            country, locale = localeFromAttrs(attrs, country)
            hostId = hostIdFromAttrs(attrs)
            sraId = sraIdFromRecord(record)
            epiId = epiIdFromRecordAttrs(record, attrs)
            print('\t'.join([gi, acc, name, date, lab, author, country, locale, hostId,
                             sraId, epiId]))


# Only run when executed as a script, so the helpers can be imported safely.
if __name__ == "__main__":
    main()