9cf3f9dfbd2770a517953c7f43ffade95be9a9de hiram Tue Sep 24 15:26:53 2024 -0700 add the ucscBrowser link to the stderr output ref #34337 diff --git src/hg/makeDb/doc/asmHubs/tsvToJson.py src/hg/makeDb/doc/asmHubs/tsvToJson.py index 3ab387a..695c804 100755 --- src/hg/makeDb/doc/asmHubs/tsvToJson.py +++ src/hg/makeDb/doc/asmHubs/tsvToJson.py @@ -173,31 +173,31 @@ extraStrings = f"{isolate}{asmType} {yearDate}" elif len(cultivar): extraStrings = f"{cultivar}{asmType} {yearDate}" if len(extraStrings) < 1: pat = re.compile(r'^ +') extraStrings = pat.sub('', f"{asmType} {yearDate}") return asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings, genBankAcc, refSeqAcc, identical, taxId except FileNotFoundError: print(f"Error: File '{asmRpt}' not found.", file=sys.stderr) sys.exit(1) ### inFh is a file handle to a list of assembly identifiers, ### might be a multi column file, ignore anything after the first column def processList(inFh, dbGcaGcfDict): - outStr = f"taxId\tasmId\tgenBankAcc\trefSeqAcc\tidentical\tsciName\tcomName" + outStr = f"# taxId\tasmId\tgenBankAcc\trefSeqAcc\tidentical\tsciName\tcomName\tucscBrowser" print(outStr, file=sys.stderr) schema = [ {"name": "taxId", "type": "integer"}, {"name": "asmId", "type": "string"}, {"name": "genBank", "type": "string"}, {"name": "refSeq", "type": "string"}, {"name": "identical", "type": "boolean"}, {"name": "sciName", "type": "string"}, {"name": "comName", "type": "string"}, {"name": "ucscBrowser", "type": "string"}, ] jsonSchema = { "columns": [[obj["name"], obj["type"]] for obj in schema] } ## jsonOut = json.dumps(jsonSchema) @@ -278,31 +278,31 @@ asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings, genBankAcc, refSeqAcc, identical, taxId = extractNames(asmRpt, hapX) if len(extraStrings): pat = re.compile(r'[()\[\]+*]') extraStrings = pat.sub('', extraStrings) pat = re.compile(r'\?') extraStrings = pat.sub(' ', extraStrings) extraStrings = re.sub(re.escape(asmNameAbbrev) + ' ', '', 
extraStrings) extraStrings = re.sub(r'\s+', ' ', extraStrings) words = extraStrings.split() orgList = orgName.split() orgName = " ".join([orgWord for orgWord in orgList if orgWord not in words]) orgName = re.sub(r'=|\s+$|^\s+', '', orgName) orgName = re.sub(r'\s+', ' ', orgName).strip() ucscBrowser = f"https://genome.ucsc.edu/h/{accession}" - outStr = f"{taxId}\t{asmId}\t{genBankAcc}\t{refSeqAcc}\t{identical}\t{sciName}\t{comName}" + outStr = f"{taxId}\t{asmId}\t{genBankAcc}\t{refSeqAcc}\t{identical}\t{sciName}\t{comName}\t{ucscBrowser}" rowData = { "taxId": int(taxId), "asmId": asmId, "genBank": genBankAcc, "refSeq": refSeqAcc, "identical": identical, "sciName": sciName, "comName": comName, "ucscBrowser": ucscBrowser, } dataOut.append(rowData) print(outStr, file=sys.stderr) # schemaPlusData = schema + dataOut