c28af7c6bbe39c880c7e6dcc0f1c3ff2ee5b427c angie Fri Sep 3 12:11:22 2021 -0700 Grab lab from obscure location found by Nextstrain team in ncov-ingest PR#208. diff --git src/hg/utils/otto/sarscov2phylo/bioSampleJsonToTab.py src/hg/utils/otto/sarscov2phylo/bioSampleJsonToTab.py index 74a3680..5d1270a 100755 --- src/hg/utils/otto/sarscov2phylo/bioSampleJsonToTab.py +++ src/hg/utils/otto/sarscov2phylo/bioSampleJsonToTab.py @@ -98,33 +98,41 @@ """Try to extract most complete sample collection date from attrs""" collectionDate = tryAttrs(attrs, ['collection date', 'collection_date']) receiptDate = tryAttrs(attrs, ['receipt date', 'receipt_date']) date = "" if receiptDate and collectionDate: if len(receiptDate) > len(collectionDate): date = receiptDate else: date = collectionDate elif collectionDate: date = collectionDate else: date = receiptDate return date -def labFromAttrs(attrs): - return tryAttrs(attrs, ['collecting institution', 'collected by', 'INSDC center name', +def labFromRecordAttrs(record, attrs): + lab = tryAttrs(attrs, ['collecting institution', 'collected by', 'INSDC center name', 'collected_by']) + # HT Jover Lee https://github.com/nextstrain/ncov-ingest/pull/208 + if not lab and record.get('owner') and record['owner'].get('name') and \ + record['owner']['name'] == "European Bioinformatics Institute": + if record.get('sampleIds'): + for sidBlob in record['sampleIds']: + if sidBlob.get('label') == "Sample name": + lab = sidBlob['db'] + return lab def authorFromAttrs(attrs): return tryAttrs(attrs, ['collector name']) def countryFromAttrs(attrs): return tryAttrs(attrs, ['geographic location', 'geo_loc_name']) def localeFromAttrs(attrs, country): locale = tryAttrs(attrs, ['geographic location (region and locality)']) if country and not locale and ':' in country: country, locale = country.split(':', 2) locale = locale.strip() return country, locale def hostIdFromAttrs(attrs): @@ -160,26 +168,26 @@ Read in NCBI Datasets / Virus biosample.jsonl file (each line has JSON for a BioSample record), extract relevant attributes, and output TSV. """ ) parser.add_argument('jsonFile', help='File with one line of JSON per BioSample entry') args = parser.parse_args() with open(args.jsonFile) as f: for line in f: record = json.loads(line) attrs = nameValToDDict(record['attributes']) acc = record['accession'] gi = "" name = nameFromRecordAttrs(record, attrs) date = dateFromAttrs(attrs) - lab = labFromAttrs(attrs) + lab = labFromRecordAttrs(record, attrs) author = authorFromAttrs(attrs) country = countryFromAttrs(attrs) country, locale = localeFromAttrs(attrs, country) hostId = hostIdFromAttrs(attrs) sraId = sraIdFromRecord(record) epiId = epiIdFromRecordAttrs(record, attrs) print('\t'.join([gi, acc, name, date, lab, author, country, locale, hostId, sraId, epiId])) main()