215ac86539ac6a12e2db96c777a7a1568817318a hiram Fri Sep 13 16:07:46 2024 -0700 getting the GC accessions into dbDb descriptions when possible refs #33720 diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py index a6c8548..89ca7fa 100755 --- src/hg/hubApi/assemblyList.py +++ src/hg/hubApi/assemblyList.py @@ -370,31 +370,31 @@ #################################################################### def processDbDbData(data, clades, years, ncbi): # Initialize a list to hold the dictionaries dataList = [] # Split the data into lines (rows) rows = data.strip().split('\n') # reverse the rows so that names such as hg19 come before hg18 sortedRows = sorted(rows, key=getFirstWordCaseInsensitive, reverse=True) for row in sortedRows: # Split each row into columns columns = row.split('\t') clade = clades.get(columns[0], "n/a") year = years.get(columns[0], "n/a") - gcAccession = ncbi.get(columns[0], "n/a") + gcAccession = ncbi.get(columns[0], "na") cladeP = cladePriority(clade) # corresponds with the SELECT statement # name,scientificName,organism,taxId,sourceName,description # Create a dictionary for each row dataDict = { "name": columns[0], "scientificName": columns[1], "organism": columns[2], "taxId": columns[3], "sourceName": columns[4], "description": columns[5], "clade": clade, "year": year, "gcAccession": gcAccession, @@ -775,42 +775,45 @@ totalItemCount = 0 itemCount = 0 # Print the dbDb data for entry in dbDbItems: dbDbName = entry['name'] if dbDbName in allPriorities: priority = allPriorities[dbDbName] else: print("no priority for ", dbDbName) sys.exit(255) clade = entry['clade'] year = entry['year'] gcAccession = entry['gcAccession'] + description = entry['description'] refSeqCategory = "" versionStatus = "" assemblyLevel = "" organism = entry['organism'] if "na" not in gcAccession: if gcAccession in allStatus: stat = allStatus[gcAccession] refSeqCategory = stat['refSeqCategory'].lower() versionStatus = stat['versionStatus'].lower() assemblyLevel = stat['assemblyLevel'].lower() + if gcAccession not in description: + description += " " + gcAccession - descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']}" + descr = f"{entry['sourceName']} {entry['taxId']} {description}" if year not in organism and year not in descr: descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']} {year}" description = re.sub(r'\s+', ' ', descr).strip() outLine =f"{entry['name']}\t{priority}\t{organism}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: