src/hg/hubApi/assemblyList.py 48466d5a0bfaa3fa457f5a9a07118c323de32b11

48466d5a0bfaa3fa457f5a9a07118c323de32b11
hiram
  Wed Aug 28 18:08:34 2024 -0700
fixed up to add the new columns refs #33720

diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py
index 5ff3d4f..48d2598 100755
--- src/hg/hubApi/assemblyList.py
+++ src/hg/hubApi/assemblyList.py
@@ -206,69 +206,83 @@
          # key is gcAccession, value is version_status and refseq_category
 
     filePath = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_" + suffix
 
     with open(filePath, 'r', encoding='utf-8') as file:
         reader = csv.reader(file, delimiter='\t')
         for row in reader:
             if len(row) < 1:
                continue
             if row[0].startswith('#'):
                continue
             gcAccession = row[0]
             ### record the year and status here before bailing out so
             ### these can get into genArk if needed
             asmName = row[15]
+            assemblyLevel = re.sub(r'genome', '', row[11], flags=re.IGNORECASE)
             year = re.sub(r'/.*', '', row[14])
             yearDict[gcAccession] = year
             versionStatus = row[10]
-            refseqCategory = re.sub(r'genome', '', row[4])
-            if refseqCategory == "na":
-               refseqCategory = ""
-            genomeStatus = refseqCategory + versionStatus
-            statusDict[gcAccession] = genomeStatus
+            refSeqCategory = re.sub(r'genome', '', row[4])
+            if refSeqCategory == "na":
+               refSeqCategory = ""
+            if versionStatus == "na":
+               versionStatus = ""
+            if assemblyLevel == "na":
+               assemblyLevel = ""
+            genomeStatus = f"{refSeqCategory} {versionStatus} {assemblyLevel}"
+            thisStat = {
+                "refSeqCategory": refSeqCategory,
+                "versionStatus": versionStatus,
+                "assemblyLevel": assemblyLevel,
+            }
+            statusDict[gcAccession] = thisStat
             if gcAccession in prioExists:
                continue
             if len(row) != 38:
                 print(f"ERROR: incorrect number of fields in {file}")
                 sys.exit(255)
             strain = re.sub(r'breed=', '', row[8])
             s0 = re.sub(r'cultivar=', '', strain)
             strain = re.sub(r'ecotype=', '', s0)
             s0 = re.sub(r'strain=', '', strain)
             strain = re.sub(r'na', '', s0)
             asmId = gcAccession + "_" + asmName
             asmSubmitter = row[16]
             asmType = row[23]
             commonName = "n/a"
             if gcAccession in comNames:
                 commonName = comNames[gcAccession]
             clade = row[24]	# almost like GenArk clades
             if asmId in asmIdClade:	# specific GenArk clade
                 clade = asmIdClade[asmId]
             if clade == "plant":
                 clade = "plants"
             cladeP = cladePriority(clade)
 
             dataDict = {
                 "gcAccession": gcAccession,
                 "asmName": asmName,
                 "scientificName": row[7],
                 "commonName": commonName,
                 "taxId": row[5],
                 "clade": clade,
-                "other": asmSubmitter + " " + strain + " " + asmType + " " + year + " " + genomeStatus,
+                "other": f"{asmSubmitter} {strain} {asmType} {year} {genomeStatus}",
+                "year": year,
+                "refSeqCategory": refSeqCategory,
+                "versionStatus": versionStatus,
+                "assemblyLevel": assemblyLevel,
                 "sortOrder": cladeP,
             }
 
             utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()}
             # Append the dictionary to the list
             dataList.append(utf8Encoded)
 
     return sorted(dataList, key=lambda x: x['sortOrder']), yearDict, statusDict
 
 ####################################################################
 ### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt
 ### this sets up the GenArk listing
 ####################################################################
 def readGenArkData(url):
     # Initialize a list to hold the dictionaries
@@ -280,111 +294,123 @@
     fileIo = StringIO(fileContent)
     reader = csv.reader(fileIo, delimiter='\t')
 
     for row in reader:
         if row and row[0].startswith('#'):
             continue
         clade = re.sub(r'\(L\)$', '', row[5])
         cladeP = cladePriority(clade)
         dataDict = {
             "gcAccession": row[0],
             "asmName": row[1],
             "scientificName": row[2],
             "commonName": row[3],
             "taxId": row[4],
             "clade": clade,
+            "year": 0,
+            "refSeqCategory": "na",
+            "versionStatus": "na",
+            "assemblyLevel": "na",
             "sortOrder": cladeP,
         }
 
         utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()}
         # Append the dictionary to the list
         dataList.append(utf8Encoded)
 
     # reset the list so that accessions such as GCF_000001405.40
     # come before GCF_000001405.39
 #    dataList.reverse()
 #    return dataList
     return sorted(dataList, key=lambda x: x['sortOrder'])
 
 ####################################################################
 ### a manually maintained clade listing for UCSC dbDb assemblies
 ####################################################################
 def dbDbCladeList(filePath):
-    returnList = {}
+    returnClade = {}
+    returnYear = {}
+    returnNcbi = {}
 
     with open(filePath, 'r') as file:
         reader = csv.reader(file, delimiter='\t')
         for row in reader:
             if len(row) < 1:
                continue
             if row[0].startswith('#'):
                continue
-            returnList[row[0]] = row[1]
+            returnClade[row[0]] = row[1]
+            returnYear[row[0]] = row[2]
+            returnNcbi[row[0]] = row[3]
 
-    return returnList
+    return returnClade, returnYear, returnNcbi
 
 ####################################################################
 ### out of the genArkData list, extract the given clade
 ####################################################################
 def extractClade(clade, genArkData):
     tmpList = {}
     for item in genArkData:
         if clade != item['clade']:
             continue
         tmpList[item['gcAccession']] = item['commonName']
 
     # return sorted list on the common name, case insensitive
     returnList = dict(sorted(tmpList.items(), key=lambda item:item[1].lower()))
 
     return returnList
 
 ####################################################################
 ### Define a key function for sorting by the first word, case insensitive
 ####################################################################
 def getFirstWordCaseInsensitive(row):
     firstWord = row.split('\t')[0]  # Extract the first word
     return firstWord.lower()  # Convert to lowercase for case-insensitive sorting
 
 ####################################################################
 ### process the hgcentral.dbDb data into a dictionary
 ####################################################################
-def processDbDbData(data, clades):
+def processDbDbData(data, clades, years, ncbi):
     # Initialize a list to hold the dictionaries
     dataList = []
 
     # Split the data into lines (rows)
     rows = data.strip().split('\n')
     # reverse the rows so that names such as hg19 come before hg18
     sortedRows = sorted(rows, key=getFirstWordCaseInsensitive, reverse=True)
 
     for row in sortedRows:
         # Split each row into columns
         columns = row.split('\t')
         clade = clades.get(columns[0], "n/a")
+        year = years.get(columns[0], "n/a")
+        gcAccession = ncbi.get(columns[0], "n/a")
         cladeP = cladePriority(clade)
 
         # corresponds with the SELECT statement
         # name,scientificName,organism,taxId,sourceName,description
         # Create a dictionary for each row
         dataDict = {
             "name": columns[0],
             "scientificName": columns[1],
             "organism": columns[2],
             "taxId": columns[3],
             "sourceName": columns[4],
             "description": columns[5],
             "clade": clade,
+            "year": year,
+            "gcAccession": gcAccession,
             "sortOrder": cladeP,
         }
 
         utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()}
         # Append the dictionary to the list
         dataList.append(utf8Encoded)
 
     return sorted(dataList, key=lambda x: x['sortOrder'])
 
 ####################################################################
 ### Function to remove non-alphanumeric characters
 ### cleans up names to make them better indexing words
 ####################################################################
 def removeNonAlphanumeric(s):
     # Ensure string type
@@ -432,39 +458,43 @@
 
     # Join the parts to form the path
     path = f"{prefix}/{parts[0]}/{parts[1]}/{parts[2]}/{gcAccession}/hub.txt"
 
     return path
 
 
 ####################################################################
 ### scan the genArks items, add years and status if not already there
 ####################################################################
 def addYearsStatus(genArks, years, status):
     for item in genArks:
         gcAccession = item['gcAccession']
         if gcAccession in years:
             year = years[gcAccession]
+            item['year'] = year
             pat = r'\b' + re.escape(year) + r'\b'
             if not re.search(pat, item['commonName']):
                 if not re.search(pat, item['taxId']):
                     item['taxId'] += " " + year
         if gcAccession in status:
             stat = status[gcAccession]
-            pat = r'\b' + re.escape(stat) + r'\b'
-            if not re.search(pat, item['taxId']):
-                item['taxId'] += " " + stat
+            item['refSeqCategory'] = stat['refSeqCategory']
+            item['versionStatus'] = stat['versionStatus']
+            item['assemblyLevel'] = stat['assemblyLevel']
+##            pat = r'\b' + re.escape(stat) + r'\b'
+##            if not re.search(pat, item['taxId']):
+##                item['taxId'] += " " + stat
 
     return
 
 ####################################################################
 ### for the genArk set, establish some ad-hoc priorities
 ####################################################################
 def establishPriorities(dbDb, genArk):
     global topPriorities
     global allPriorities
     global priorityCounter
 
     totalItemCount = 0
 
     expectedTotal = len(dbDb) + len(genArk)
     print(f"### expected total: {expectedTotal:4} = {len(dbDb):4} dbDb genomes + {len(genArk):4} genArk genomes")
@@ -676,136 +706,159 @@
             allPriorities[dbName] = priorityCounter
             priorityCounter += 1
             itemCount += 1
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tthe rest of dbDb count: {itemCount:4}")
 
 ####################################################################
 
 def main():
     global priorityCounter
     global allPriorities
 
     if len(sys.argv) != 2:
         print("assemblyList.py - prepare assemblyList.tsv file from")
         print("    dbDb.hgcentral and UCSC_GI.assemblyHubList.txt file.\n")
-        print("Usage: assemblyList.py dbDb.name.clade.tsv\n")
-        print("the dbDb.name.clade.tsv file is a manually curated file to relate")
+        print("Usage: assemblyList.py dbDb.clade.year.acc.tsv\n")
+        print("the dbDb.clade.year.tsv file is a manually curated file to relate")
         print("    UCSC database names to GenArk clades, in source tree hubApi/")
         print("This script is going to read the dbDb.hgcentral table, and the file")
         print("    UCSC_GI.assemblyHubList.txt from hgdownload.")
         print("Writing an output file assemblyList.tsv to be loaded into")
         print("    assemblyList.hgcentral.  The output file needs to be sorted")
         print("    sort -k2,2n assemblyList.tsv before table load.")
         sys.exit(255)
 
     # Ensure stdout and stderr use UTF-8 encoding
     set_utf8_encoding()
 
     initCladePriority()
 
     dbDbNameCladeFile = sys.argv[1]
 
     # the correspondence of dbDb names to GenArk clade categories
-    dbDbClades = dbDbCladeList(dbDbNameCladeFile)
+    dbDbClades, dbDbYears, dbDbNcbi = dbDbCladeList(dbDbNameCladeFile)
     # Get the dbDb.hgcentral table data
     rawData = dbDbData()
-    dbDbItems = processDbDbData(rawData, dbDbClades)
+    dbDbItems = processDbDbData(rawData, dbDbClades, dbDbYears, dbDbNcbi)
 
     # read the GenArk data from hgdownload into a list of dictionaries
     genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt"
     genArkItems = readGenArkData(genArkUrl)
 
     establishPriorities(dbDbItems, genArkItems)
 
     asmIdClade = readAsmIdClade()
     commonNames = allCommonNames()
     print("# all common names: ", len(commonNames))
 
     refSeqList, refSeqYears, refSeqStatus = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade)
     print("# refSeq assemblies: ", len(refSeqList))
+    refSeqListHist, refSeqYearsHist, refSeqStatusHist = readAsmSummary("refseq_historical.txt", allPriorities, commonNames, asmIdClade)
+    print("# refSeq historical assemblies: ", len(refSeqListHist))
     genBankList, genBankYears, genBankStatus = readAsmSummary("genbank.txt", allPriorities, commonNames, asmIdClade)
     print("# genBank assemblies: ", len(genBankList))
+    genBankListHist, genBankYearsHist, genBankStatusHist = readAsmSummary("genbank_historical.txt", allPriorities, commonNames, asmIdClade)
+    print("# genBank historical  assemblies: ", len(genBankListHist))
     ### dictionary unpacking, combine both dictionaries (Python 3.5+)
-    allYears = {**refSeqYears, **genBankYears}
-    allStatus = {**refSeqStatus, **genBankStatus}
+    allYears = {**refSeqYears, **genBankYears, **refSeqYearsHist, **genBankYearsHist}
+    allStatus = {**refSeqStatus, **genBankStatus, **refSeqStatusHist, **genBankStatusHist}
     addYearsStatus(genArkItems, allYears, allStatus)
 
-    refSeqGenBankList = refSeqList + genBankList
+    refSeqGenBankList = refSeqList + genBankList + refSeqListHist + genBankListHist
     print("# refSeq + genBank assemblies: ", len(refSeqGenBankList))
 
     refSeqGenBankSorted =  sorted(refSeqGenBankList, key=lambda x: x['sortOrder'])
     print("# sorted refSeq + genBank assemblies: ", len(refSeqGenBankSorted))
 
 
     outFile = "assemblyList.tsv"
     fileOut = open(outFile, 'w')
 
     totalItemCount = 0
     itemCount = 0
     # Print the dbDb data
     for entry in dbDbItems:
         dbDbName = entry['name']
         if dbDbName in allPriorities:
             priority = allPriorities[dbDbName]
         else:
             print("no priority for ", dbDbName)
             sys.exit(255)
 
         clade = entry['clade']
+        year = entry['year']
+        gcAccession = entry['gcAccession']
+        refSeqCategory = ""
+        versionStatus = ""
+        assemblyLevel = ""
+        if "na" not in gcAccession:
+            if gcAccession in allStatus:
+              stat = allStatus[gcAccession]
+              refSeqCategory = stat['refSeqCategory']
+              versionStatus = stat['versionStatus']
+              assemblyLevel = stat['assemblyLevel']
 
         descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']}\n"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\n"
+        outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}")
 
     itemCount = 0
     # Print the GenArk data
     for entry in genArkItems:
         gcAccession = entry['gcAccession']
         if gcAccession in allPriorities:
             priority = allPriorities[gcAccession]
         else:
             print("no priority for ", gcAccession)
             sys.exit(255)
 
         hubPath = genarkPath(gcAccession)
         cleanName = removeNonAlphanumeric(entry['commonName'])
         clade = entry['clade']
         descr = f"{entry['asmName']} {entry['taxId']}"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\n"
+        year = entry['year']
+        refSeqCategory = entry['refSeqCategory']
+        versionStatus = entry['versionStatus']
+        assemblyLevel = entry['assemblyLevel']
+        outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}")
 
     incrementPriority = len(allPriorities) + 1
     print("# incrementing priorities from: ", incrementPriority)
 
     itemCount = 0
     # Print the refSeq/genBank data
     for entry in refSeqGenBankSorted:
         gcAccession = entry['gcAccession']
         commonName = entry['commonName']
         scientificName = entry['scientificName']
         asmName = entry['asmName']
         clade = entry['clade']
+        year = entry['year']
+        refSeqCategory = entry['refSeqCategory']
+        versionStatus = entry['versionStatus']
+        assemblyLevel = entry['assemblyLevel']
         descr = f"{asmName} {entry['taxId']} {entry['other']}"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\n"
+        outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n"
         fileOut.write(outLine)
         incrementPriority += 1
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}")
 
     fileOut.close()
 
 if __name__ == "__main__":
     main()