src/hg/utils/otto/sarscov2phylo/bioSampleJsonToTab.py 86c9c0072ad12699dfdcf8232ad812f1982d23d7

86c9c0072ad12699dfdcf8232ad812f1982d23d7
angie
  Thu Sep 2 17:47:47 2021 -0700
NCBI Datasets now includes a biosample.jsonl file -- when it's stable we won't need EUtils anymore, yay!

diff --git src/hg/utils/otto/sarscov2phylo/bioSampleJsonToTab.py src/hg/utils/otto/sarscov2phylo/bioSampleJsonToTab.py
new file mode 100755
index 0000000..74a3680
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/bioSampleJsonToTab.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+
+import json
+import re
+import logging, argparse, sys
+from warnings import warn
+from collections import defaultdict
+
+def nameValToDDict(attributes):
+    """Unpack a BioSample attribute array into a plain defaultdict(str).
+
+    attributes is an array of { "name": ..., "value": ... } objects rather than a plain
+    JSON object.  Each value is stripped of leading/trailing whitespace.  An entry with no
+    'value' key (or a null/empty value) is stored as an empty string instead of raising
+    KeyError, and a missing (None) attribute array yields an empty dict.  If a name occurs
+    more than once, the last occurrence wins.  Looking up a name that was never seen
+    returns an empty string because the result is a defaultdict(str)."""
+    attrs = defaultdict(str)
+    for nameVal in attributes or []:
+        value = nameVal.get('value')
+        attrs[nameVal['name']] = value.strip() if value else ''
+    return attrs
+
+def isReal(val):
+    """Tell whether val carries actual information.
+
+    Returns False for an empty/None value and for the usual placeholder words
+    (compared case-insensitively); returns True for anything else."""
+    placeholders = ('missing', 'unknown', 'not applicable', 'not collected', 'not provided',
+                    'restricted access')
+    return bool(val) and val.lower() not in placeholders
+
+def tryAttrs(attrs, choices):
+    """Look up each key of choices in attrs, in order, and return the first value that
+    isReal() accepts.  Return an empty string when none of them qualifies."""
+    return next((attrs[key] for key in choices if isReal(attrs[key])), "")
+
+# Substrings that some submitters add to sample names; removeNameFluff() deletes them,
+# in this order, wherever they occur in a name.
+nameFluff = [
+    'SARS-CoV-2/', 'SARS-Cov-2/',
+    'hCoV-19/', 'hCov-19/',
+    'Human/', 'human/',
+    'Severe acute respiratory syndrome coronavirus 2/',
+    'Severe_acute_respiratory_syndrome_coronavirus2/',
+    '/North America/',
+    'BetaCov/',
+]
+
+def removeNameFluff(name):
+    """Return name with every occurrence of each nameFluff substring deleted."""
+    cleaned = name
+    for fluff in nameFluff:
+        if fluff in cleaned:
+            cleaned = cleaned.replace(fluff, '')
+    return cleaned
+
+def scoreName(name):
+    """Rank a candidate name; a lower score means a better candidate.
+
+    Labs use record fields and attributes inconsistently, so names found in different
+    places are compared with these made-up scores.  A name containing '/' (hopefully
+    country/isolate/year) scores best; otherwise we settle for the most ID-ish thing.
+    Call removeNameFluff on the name before scoring it."""
+    # Rules are checked in order; the first one that applies decides the score.
+    rules = (
+        (lambda n: not n, 100),
+        (lambda n: n.isdigit(), 70),
+        (lambda n: '/' in n, 10),
+        (lambda n: len(n) > 25, 80),
+        (lambda n: n.isalpha(), 75),
+        (lambda n: n == "SARS-CoV-2", 78),
+    )
+    for applies, score in rules:
+        if applies(name):
+            return score
+    return 50
+
+def bestName(nameList):
+    """Return the best candidate from nameList after stripping fluff from each name.
+
+    "Best" means the lowest scoreName() value; on a tie the earliest candidate in
+    nameList wins.  Hopefully the result is the most familiar name and useful for
+    matching with GISAID.  Returns an empty string when nameList is empty, so a record
+    that offers no usable name does not abort the run with an IndexError."""
+    if not nameList:
+        return ""
+    # min() returns the first minimal element, like a stable sort followed by [0].
+    return min((removeNameFluff(name) for name in nameList), key=scoreName)
+
+def nameFromRecordAttrs(record, attrs):
+    """Collect every candidate sample name and return the one bestName() picks.
+
+    Candidates come from several attributes, the record's description title, and any
+    sampleIds entry labelled 'Sample name' -- places that often hold something like the
+    country/isolate/year format that we want."""
+    nameAttrs = ('sample name', 'Submitter Id', 'strain', 'isolate', 'title',
+                 'virus identifier')
+    candidates = [attrs[key] for key in nameAttrs if isReal(attrs[key])]
+    description = record.get('description')
+    if description:
+        descTitle = description.get('title')
+        if descTitle:
+            candidates.append(descTitle)
+    for sampleId in record.get('sampleIds') or []:
+        if sampleId.get('label') == 'Sample name':
+            candidates.append(sampleId['value'])
+    return bestName(candidates)
+
+def dateFromAttrs(attrs):
+    """Return the most complete sample date found in attrs.
+
+    The collection date is preferred; the receipt date is used only when it is the sole
+    date present or when it is strictly longer than the collection date.  Returns an
+    empty string when neither is available."""
+    collected = tryAttrs(attrs, ['collection date', 'collection_date'])
+    received = tryAttrs(attrs, ['receipt date', 'receipt_date'])
+    if collected and received and len(received) > len(collected):
+        return received
+    if collected:
+        return collected
+    return received
+
+def labFromAttrs(attrs):
+    """Return the first real value among the lab attributes, or an empty string."""
+    labKeys = ['collecting institution', 'collected by', 'INSDC center name',
+               'collected_by']
+    return tryAttrs(attrs, labKeys)
+
+def authorFromAttrs(attrs):
+    """Return the 'collector name' attribute if it is a real value, else an empty string."""
+    authorKeys = ['collector name']
+    return tryAttrs(attrs, authorKeys)
+
+def countryFromAttrs(attrs):
+    """Return the first real value among the geographic location attributes, or an
+    empty string."""
+    countryKeys = ['geographic location', 'geo_loc_name']
+    return tryAttrs(attrs, countryKeys)
+
+def localeFromAttrs(attrs, country):
+    """Return (country, locale), splitting a combined country value when needed.
+
+    locale is taken from the 'geographic location (region and locality)' attribute.  If
+    that yields nothing and country contains a ':', country is split at the FIRST colon:
+    the left part becomes country and everything after it becomes locale, both stripped
+    of surrounding whitespace.  Otherwise country is returned unchanged."""
+    locale = tryAttrs(attrs, ['geographic location (region and locality)'])
+    if country and not locale and ':' in country:
+        # maxsplit=1 guarantees exactly two parts; maxsplit=2 could produce three and
+        # make the two-name unpacking raise ValueError when there are several colons.
+        country, locale = (part.strip() for part in country.split(':', 1))
+    return country, locale
+
+def hostIdFromAttrs(attrs):
+    """Return the first real value among the host subject id attributes, or an empty
+    string."""
+    hostIdKeys = ['host subject id', 'host_subject_id']
+    return tryAttrs(attrs, hostIdKeys)
+
+# Captures an EPI_ISL_<digits> identifier anywhere in a text.  re.DOTALL lets '.' span
+# line breaks, so an identifier that follows a newline is still found.
+epiIslRe = re.compile(r'.*(EPI_ISL_[0-9]+).*', re.DOTALL)
+
+def epiIdFromRecordAttrs(record, attrs):
+    """Return the GISAID EPI_ISL identifier for a record, or an empty string.
+
+    The first real value among several GISAID-related attributes is used as-is.  If none
+    is present, the record's description comment is searched for EPI_ISL_<digits>.  A
+    value of "0" is treated as no identifier."""
+    epiId = tryAttrs(attrs, ['gisaid_accession', 'gisaid_accession_id', 'gisaid', 'gisaid id',
+                             'gisaid accession id', 'subgroup', 'GISAID Accession ID'])
+    if not epiId and record.get('description'):
+        comment = record['description'].get('comment')
+        if comment:
+            m = epiIslRe.match(comment)
+            if m:
+                epiId = m.group(1)
+    if epiId == "0":
+        epiId = ""
+    return epiId
+
+def sraIdFromRecord(record):
+    """Return the value of the first sampleIds entry whose db is 'SRA', or an empty
+    string when the record has no such entry."""
+    sampleIds = record.get('sampleIds') or []
+    return next((entry['value'] for entry in sampleIds if entry.get('db') == 'SRA'), "")
+
+def main():
+    """Read a biosample.jsonl file and print one tab-separated line per BioSample record.
+
+    Output columns: gi (always empty), accession, name, date, lab, author, country,
+    locale, host id, SRA id, GISAID EPI_ISL id.  Blank input lines are skipped, and a
+    record with no 'attributes' key is treated as having no attributes."""
+    parser = argparse.ArgumentParser(description="""
+Read in NCBI Datasets / Virus biosample.jsonl file (each line has JSON for a BioSample record),
+extract relevant attributes, and output TSV.
+"""
+    )
+    parser.add_argument('jsonFile', help='File with one line of JSON per BioSample entry')
+    args = parser.parse_args()
+
+    # JSON text is UTF-8; do not depend on the locale's default encoding.
+    with open(args.jsonFile, encoding='utf-8') as f:
+        for line in f:
+            if not line.strip():
+                # json.loads would raise on an empty line (e.g. a trailing newline).
+                continue
+            record = json.loads(line)
+            attrs = nameValToDDict(record.get('attributes', []))
+            acc = record['accession']
+            # The first output column is always written empty.
+            gi = ""
+            name = nameFromRecordAttrs(record, attrs)
+            date = dateFromAttrs(attrs)
+            lab = labFromAttrs(attrs)
+            author = authorFromAttrs(attrs)
+            country = countryFromAttrs(attrs)
+            country, locale = localeFromAttrs(attrs, country)
+            hostId = hostIdFromAttrs(attrs)
+            sraId = sraIdFromRecord(record)
+            epiId = epiIdFromRecordAttrs(record, attrs)
+            print('\t'.join([gi, acc, name, date, lab, author, country, locale,
+                             hostId, sraId, epiId]))
+
+# Run only when executed as a script, so importing the module has no side effects.
+if __name__ == '__main__':
+    main()