#!/cluster/software/bin/python3
#
# Reconstructed from commit 68207fb5ba1960173ec5460bce40789997c80e19
#   author: hiram   date: Wed Oct 9 16:11:26 2024 -0700
#   "fixup some common names and allow only 10 tracks on a fungi refs #29545"
#   path: src/hg/makeDb/doc/asmHubs/commonNames.py (index 589559e..2e37b9a)

import os
import sys
import re
import site

# characters that are stripped from every name before it is used
_SPECIAL_CHARS = re.compile(r'[()\[\]+*]')

# group names for which the scientific name is used unchanged
_USE_SCI_NAME = r'kinetoplastids|firmicutes|proteobacteria|high G|enterobacteria|agent of'

# group names that get replaced by "<group> G.species"
_GROUP_NAMES = r'apicomplexans|bryozoans|bugs|ciliates|crustaceans|diatoms|hydrozoans|gastropods|nematodes|flatworm|pelagophytes|scorpions|ascomycete|basidiomycete|budding|microsporidian|smut|fungi|eukaryotes|flies|beetles|mosquitos|bees|moths|sponges|^mites|ticks|^comb|jellies|jellyfishes|chitons|bivalves|bony fishes|birds|eudicots|snakes|bats|tunicates|tsetse fly|cellular slime molds|stony corals'

# (substring of the first group word, replacement); first match wins,
# so the order of the entries is significant
_ORDER_NAMES = (
    ("budding", "budding yeast"),
    ("smut", "smut fungi"),
    ("bony", "bony fish"),
    ("ascomycete", "ascomycetes"),
    ("eudicots", "eudicot"),
    ("birds", "bird"),
    ("snakes", "snake"),
    ("crustaceans", "crustacean"),
    ("mosquitos", "mosquito"),
    ("mites", "mite/tick"),
    ("comb", "comb jelly"),
    ("jellyfishes", "jellyfish"),
    ("chitons", "chiton"),
    ("bivalves", "bivalve"),
    ("beetles", "beetle"),
    ("bees", "bee"),
    ("bats", "bat"),
    ("moths", "moth"),
    ("sponges", "sponge"),
    ("flatworms", "flatworm"),
    ("nematodes", "nematode"),
    ("basidiomycete", "basidiomycetes"),
)

# (substring of the whole group name, prefix placed before "G.species");
# first match wins, groups not listed here use the order name as prefix
_NAME_PREFIXES = (
    ("eukaryotes", ""),
    ("apicomplexans", "apicomplexans "),
    ("bryozoans", "bryozoans "),
    ("ciliates", "ciliates "),
    ("diatoms", "diatoms "),
    ("hydrozoans", "hydrozoans "),
    ("gastropods", "gastropods "),
    ("pelagophytes", "pelagophytes "),
    ("scorpions", "scorpions "),
    ("flies", "fly "),
    ("tsetse", "tsetse fly "),
    ("cellular slime mold", "cellular slime mold "),
    ("stony corals", "stony coral "),
    ("tunicates", "tunicate "),
)


####################################################################
### this is kinda like an environment setting, it gets everything
###   into a UTF-8 writing mode
####################################################################
def setUtf8Encoding():
    """
    Re-open sys.stdout and sys.stderr as line-buffered UTF-8 text streams
    when they are not already UTF-8.  stdin is left untouched.
    """
    for name in ("stdout", "stderr"):
        stream = getattr(sys, name)
        # the encoding name may be reported as 'UTF-8' or 'utf-8', so the
        # comparison is case-insensitive to avoid a needless re-open
        if (stream.encoding or "").lower() != 'utf-8':
            setattr(sys, name, open(stream.fileno(), mode='w',
                                    encoding='utf-8', buffering=1))


def _parseOrganism(line):
    """
    Parse one '# Organism name:' line of an assembly report.

    Returns (sciName, orgName): the scientific name and the display name
    derived from the parenthesised group / common name on that line.
    """
    # whatever follows the last '(' is the group / common name
    orgName = re.sub(r'.*\(', '', line)
    orgName = _SPECIAL_CHARS.sub('', orgName)
    orgName = orgName.replace('?', '')
    # no parenthesis on the line: fall back to the text after the label
    orgName = re.sub(r'.*ism name:\s+', '', orgName)

    sciName = re.sub(r'.*ganism name:\s+', '', line, flags=re.IGNORECASE)
    sciName = re.sub(r'\s+\(.*\)$', '', sciName)
    sciName = _SPECIAL_CHARS.sub('', sciName)
    sciName = sciName.replace('?', ' ')

    if re.search(_USE_SCI_NAME, orgName):
        return sciName, sciName

    if re.search(_GROUP_NAMES, orgName):
        order = orgName.split()[0]
        for needle, replacement in _ORDER_NAMES:
            if needle in order:
                order = replacement
                break
        words = sciName.split()
        # "Genus species ..." becomes "G.species ..."; empty if no name
        abbrev = ""
        if words:
            abbrev = words[0][0].upper() + "." + " ".join(words[1:])
        for needle, prefix in _NAME_PREFIXES:
            if needle in orgName:
                return sciName, prefix + abbrev
        return sciName, order + " " + abbrev

    if "viruses" in orgName:
        # viruses keep the full organism name, minus the trailing group
        orgName = re.sub(r'.*ism name:\s+', '', line)
        orgName = re.sub(r'\s+\(.*\)$', '', orgName)
    return sciName, orgName


### given a path name to an asmRpt, read the file and get out the names
###   we need
def extractNames(asmRpt, hapX):
    """
    Read the leading '#' header lines of an assembly report.

    asmRpt - path to the *_assembly_report.txt file
    hapX   - haplotype tag used as asmType when the report's assembly type
             is neither 'alternate-pseudohaplotype' nor 'principal pseudo...'

    Returns the tuple
      (asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings)
    Fields that are not found keep their placeholder values.
    Prints an error and exits with status 1 when the file does not exist.
    """
    asmType = "asmType"
    sciName = "sciName"
    yearDate = "yearDate"
    isolate = ""
    cultivar = ""
    orgName = "orgName"

    try:
        readFile = open(asmRpt, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: File '{asmRpt}' not found.", file=sys.stderr)
        sys.exit(1)

    with readFile:      # closes the report even if parsing raises
        for line in readFile:
            line = line.rstrip()
            if not line.startswith('#'):
                break   # the header ends at the first non-comment line
            if "Assembly type:" in line:
                asmType = re.sub(r'.*bly type:\s+', '', line, flags=re.IGNORECASE)
                if 'alternate-pseudohaplotype' in asmType:
                    asmType = " alternate hap"
                elif 'principal pseudo' in asmType:
                    asmType = " primary hap"
                else:
                    asmType = hapX
            elif "Organism name:" in line:
                sciName, orgName = _parseOrganism(line)
            elif "Date:" in line:
                # keep only the year of the last word, e.g. 2020-03-15
                yearDate = re.sub(r'-.*', '', line.split()[-1])
            elif "Isolate:" in line:
                isolate = re.sub(r'.*solate:\s+', '', line)
            elif "Infraspecific name:" in line:
                cultivar = re.sub(r'.*cultivar=|.*ecotype=|.*strain=|.*breed=', '', line)

    # NOTE(review): when asmType comes from hapX (e.g. "hap1") it has no
    # leading blank, so it is glued to the isolate/cultivar ("iso1hap1");
    # confirm whether that is intended before changing it.
    if isolate and cultivar:
        extraStrings = f"{cultivar} {isolate}{asmType} {yearDate}"
    elif isolate:
        extraStrings = f"{isolate}{asmType} {yearDate}"
    elif cultivar:
        extraStrings = f"{cultivar}{asmType} {yearDate}"
    else:
        extraStrings = re.sub(r'^ +', '', f"{asmType} {yearDate}")
    return asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings


### inFh is a file handle to a list of assembly identifiers,
###  might be a multi column file, ignore anything after the first column
def processList(inFh, ncbiSrc="/hive/data/outside/ncbi/genomes"):
    """
    Print '<asmId> TAB <common name>' on stdout for every assembly id.

    inFh    - iterable of lines; only the first whitespace-separated
              column of each line is used, '#' lines are echoed to stderr
    ncbiSrc - root directory holding GCx/ddd/ddd/ddd/<asmId>/ directories
              with the <asmId>_assembly_report.txt files

    Blank lines are skipped; ids that cannot be split into the directory
    parts, or whose report file is missing, are reported on stderr.
    """
    for asmId in inFh:
        asmId = asmId.rstrip()    # eliminate new line at end of string
        if not asmId:             # nothing on this line
            continue
        if asmId.startswith('#'):    # ignore comment lines
            print(f"{asmId}", file=sys.stderr)
            continue
        asmId = asmId.split(None, 1)[0]  # only the first column (blank or tab)
        p = asmId.split('_', 2)   # split on _ maximum of three results in list
        nPart = p[1] if len(p) > 1 else ""
        by3 = [nPart[i:i+3] for i in range(0, len(nPart), 3)]
        if len(by3) < 3:
            print(f"{asmId}\tmalformed assembly id", file=sys.stderr)
            continue
        gcX = p[0]
        d0, d1, d2 = by3[:3]
        asmName = p[2] if len(p) == 3 else "na"
        srcDir = f"{ncbiSrc}/{gcX}/{d0}/{d1}/{d2}/{asmId}"
        asmRpt = f"{srcDir}/{asmId}_assembly_report.txt"
        if not os.path.isfile(asmRpt):
            print(f"{asmId}\tmissing '{asmRpt}'", file=sys.stderr)
            continue

        # separate a haplotype suffix from the assembly name
        asmNameAbbrev = asmName
        hapX = ""
        abbrevPattern = r'hap1|hap2|alternate_haplotype|primary_haplotype'
        if re.search(abbrevPattern, asmNameAbbrev):
            hapX = asmNameAbbrev
            for suffix in ('.hap1', '.hap2', '_alternate_haplotype', '_primary_haplotype'):
                asmNameAbbrev = asmNameAbbrev.replace(suffix, '')
            hapX = hapX.replace(asmNameAbbrev + '.', '')

        asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings = extractNames(asmRpt, hapX)
        outStr = f"{asmId}\t{orgName}"
        if extraStrings:
            extraStrings = _SPECIAL_CHARS.sub('', extraStrings)
            extraStrings = extraStrings.replace('?', ' ')
            extraStrings = re.sub(re.escape(asmNameAbbrev) + ' ', '', extraStrings)
            extraStrings = re.sub(r'\s+', ' ', extraStrings)
            # do not repeat in the name any word already in the extras
            words = extraStrings.split()
            orgName = " ".join(w for w in orgName.split() if w not in words)
            orgName = re.sub(r'=|\s+$|^\s+', '', orgName)
            orgName = re.sub(r'\s+', ' ', orgName).strip()
            if orgName:
                outStr = f"{asmId}\t{orgName} ({extraStrings})"
            else:
                outStr = f"{asmId}\t{extraStrings}"
        print(outStr)


def main():
    """
    Command line entry point: the single argument is the name of a file
    with assembly ids, or the word 'stdin' to read the list from stdin.
    """
    site.ENABLE_USER_SITE = False

    if len(sys.argv) != 2:
        print("Usage: ./commonNames.py ")
        print("e.g.: ./commonNames.py some.asmId.list")
        print("    where some.asmId.list is a simple list of NCBI assembly ids")
        print("    will look up the common names for each ID from the assembly_report files")
        sys.exit(1)

    # Ensure stdout and stderr use UTF-8 encoding
    setUtf8Encoding()

    listFile = sys.argv[1]
    if listFile == 'stdin':
        processList(sys.stdin)
        return

    try:
        fileIn = open(listFile, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: File '{listFile}' not found.", file=sys.stderr)
        sys.exit(1)
    with fileIn:        # the list file is closed once it has been processed
        processList(fileIn)


if __name__ == "__main__":
    main()