5c3fb7c203029aae27a69a7104b690b30da7f558 hiram Mon Sep 23 13:05:57 2024 -0700 fixup to allow UCSC database names and reading dbGcaGcf.tsf to get GCA GCF accessions on those UCSC names refs #34337 diff --git src/hg/makeDb/doc/asmHubs/tsvToJson.py src/hg/makeDb/doc/asmHubs/tsvToJson.py index 9548b22..3ab387a 100755 --- src/hg/makeDb/doc/asmHubs/tsvToJson.py +++ src/hg/makeDb/doc/asmHubs/tsvToJson.py @@ -1,22 +1,23 @@ #!/cluster/software/bin/python3 import os import sys import re import site import json +import subprocess #################################################################### ### this is kinda like an environment setting, it gets everything ### into a UTF-8 reading mode #################################################################### def setUtf8Encoding(): """ Set UTF-8 encoding for stdin, stdout, and stderr in Python. """ if sys.stdout.encoding != 'utf-8': sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1) if sys.stderr.encoding != 'utf-8': sys.stderr = open(sys.stderr.fileno(), mode='w', encoding='utf-8', buffering=1) @@ -171,59 +172,90 @@ elif len(isolate): extraStrings = f"{isolate}{asmType} {yearDate}" elif len(cultivar): extraStrings = f"{cultivar}{asmType} {yearDate}" if len(extraStrings) < 1: pat = re.compile(r'^ +') extraStrings = pat.sub('', f"{asmType} {yearDate}") return asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings, genBankAcc, refSeqAcc, identical, taxId except FileNotFoundError: print(f"Error: File '{asmRpt}' not found.", file=sys.stderr) sys.exit(1) ### inFh is a file handle to a list of assembly identifiers, ### might be a multi column file, ignore anything after the first column -def processList(inFh): +def processList(inFh, dbGcaGcfDict): outStr = f"taxId\tasmId\tgenBankAcc\trefSeqAcc\tidentical\tsciName\tcomName" print(outStr, file=sys.stderr) schema = [ {"name": "taxId", "type": "integer"}, {"name": "asmId", "type": "string"}, {"name": "genBank", "type": "string"}, {"name": "refSeq", "type": "string"}, {"name": "identical", "type": "boolean"}, {"name": "sciName", "type": "string"}, {"name": "comName", "type": "string"}, {"name": "ucscBrowser", "type": "string"}, ] jsonSchema = { "columns": [[obj["name"], obj["type"]] for obj in schema] } ## jsonOut = json.dumps(jsonSchema) ## print(jsonOut) ### accumulating array rows dataOut = [] ncbiSrc = "/hive/data/outside/ncbi/genomes" for lineIn in inFh: # rstrip() # eliminates new line at end of string splitByTab = lineIn.rstrip().split("\t") asmId = splitByTab[0] comName = splitByTab[1] if asmId.startswith('#'): # ignore comment lines print(f"{asmId}", file=sys.stderr) continue +# felCat8 *** NOT GC.. +# 0 ['felCat8' +# 1 'Nov. 2014 (ICGSC Felis_catus_8.0/felCat8)' +# 2 '/gbdb/felCat8' +# 3 'Cat' +# 4 'chrA2:53484451-53597660' +# 5 '1' +# 6 '3240' +# 7 'Cat' +# 8 'Felis catus' +# 9 '/gbdb/felCat8/html/description.html' +# 10 '0' +# 11 '0' +# 12 'International Cat Genome Sequencing Consortium' +# 13 '9685'] + if not asmId.startswith('GC'): + sql = f"select * from dbDb where name='{asmId}'" + hgsqlReturn = subprocess.run(['hgsql', 'hgcentraltest', '-N', '-e', sql], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + dbDb = hgsqlReturn.stdout.strip().split('\t') + taxId = dbDb[13] + genBankAcc = "n/a" + refSeqAcc = "n/a" + identical = False + if asmId in dbGcaGcfDict: + genBankAcc = dbGcaGcfDict[asmId][0] + refSeqAcc = dbGcaGcfDict[asmId][1] + identical = dbGcaGcfDict[asmId][2] + sciName = dbDb[8] + comName = f"{dbDb[7]} - {dbDb[1]}" + ucscBrowser = f"https://genome.ucsc.edu/cgi-bin/hgTracks?db={asmId}" + else: asmId = asmId.split(' ',1)[0] # only the first column p = asmId.split('_', 2) # split on _ maximum of three results in list accession = p[0] + '_' + p[1] nPart = p[1] by3 = [nPart[i:i+3] for i in range(0, len(nPart), 3)] gcX = p[0] d0 = by3[0] d1 = by3[1] d2 = by3[2] asmName = "na" if (len(p) == 3): asmName = p[2] # print(f"{accession}\t{asmName}\t{asmId}") srcDir = f"{ncbiSrc}/{gcX}/{d0}/{d1}/{d2}/{asmId}" asmRpt = f"{srcDir}/{asmId}_assembly_report.txt" @@ -244,72 +276,96 @@ ab = hapX.replace(hx,'') hapX = ab asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings, genBankAcc, refSeqAcc, identical, taxId = extractNames(asmRpt, hapX) if len(extraStrings): pat = re.compile(r'[()\[\]+*]') extraStrings = pat.sub('', extraStrings) pat = re.compile(r'\?') extraStrings = pat.sub(' ', extraStrings) extraStrings = re.sub(re.escape(asmNameAbbrev) + ' ', '', extraStrings) extraStrings = re.sub(r'\s+', ' ', extraStrings) words = extraStrings.split() orgList = orgName.split() orgName = " ".join([orgWord for orgWord in orgList if orgWord not in words]) orgName = re.sub(r'=|\s+$|^\s+', '', orgName) orgName = re.sub(r'\s+', ' ', orgName).strip() + ucscBrowser = f"https://genome.ucsc.edu/h/{accession}" + outStr = f"{taxId}\t{asmId}\t{genBankAcc}\t{refSeqAcc}\t{identical}\t{sciName}\t{comName}" - ucscBrowser = f"https://genome.ucsc.edu/h/{accession}" rowData = { "taxId": int(taxId), "asmId": asmId, "genBank": genBankAcc, "refSeq": refSeqAcc, "identical": identical, "sciName": sciName, "comName": comName, "ucscBrowser": ucscBrowser, } dataOut.append(rowData) print(outStr, file=sys.stderr) - dataJson = { - "data": [[obj["taxId"], obj["asmId"], obj["genBank"], obj["refSeq"], obj["identical"], obj["sciName"], obj["comName"], obj["ucscBrowser"]] for obj in dataOut] - } # schemaPlusData = schema + dataOut schemaPlusData = { "columns": schema, "data": dataOut, } jsonOut = json.dumps(schemaPlusData) print(jsonOut) +# read three column file: db gca gcf +# for UCSC db to GCA and GCF accession equivalence +def readDbGcaGcf(tsvFile): + dbDict = {} + + try: + fh = open(tsvFile, 'r', encoding='latin-1') + except FileNotFoundError: + print(f"Error: File '{tsvFile}' not found.", file=sys.stderr) + sys.exit(1) + + for line in fh: + if line.startswith('#'): + continue + db, gca, gcf, identical = line.strip().split('\t') + ident = False + if identical.lower().startswith('yes'): + ident = True + dbDict[db] = [gca, gcf, ident] + + return dbDict + # print(f"asmType: '{asmType}', {sciName}, {orgName}, {yearDate}, {isolate}, {cultivar} {extraStrings}") def main(): - site.ENABLE_USER_SITE = False if len(sys.argv) != 2: # print(f"sys.path: {sys.path}") print("Usage: ./commonNames.py <filename|stdin>", file=sys.stderr) print("e.g.: ./commonNames.py some.asmId.list", file=sys.stderr) print(" where some.asmId.list is a simple list of NCBI assembly ids", file=sys.stderr) print(" will look up the common names for each ID from the assembly_report files", file=sys.stderr) sys.exit(1) # Ensure stdout and stderr use UTF-8 encoding setUtf8Encoding() listFile = sys.argv[1] + dbGcaGcfDict = {} + dbGcaGcf = "dbGcaGcf.tsv" + if os.path.isfile(dbGcaGcf): + dbGcaGcfDict = readDbGcaGcf(dbGcaGcf) + if listFile == 'stdin': fileIn = sys.stdin else: try: fileIn = open(listFile, 'r', encoding='latin-1') except FileNotFoundError: print(f"Error: File '{listFile}' not found.", file=sys.stderr) sys.exit(1) - processList(fileIn) + processList(fileIn, dbGcaGcfDict) if __name__ == "__main__": main()