src/hg/hubApi/assemblyList.py 7d62fef94fc8e61b8c4f2b66229d81596e19a44d

7d62fef94fc8e61b8c4f2b66229d81596e19a44d
hiram
  Mon Aug 19 13:50:38 2024 -0700
augment the description on GenArk assemblies to include the year and the assembly status refs #33720

diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py
index 6a547ad..5ff3d4f 100755
--- src/hg/hubApi/assemblyList.py
+++ src/hg/hubApi/assemblyList.py
@@ -73,30 +73,34 @@
 ####################################################################
 ### this common name file is setup weekly via a cron job
 ####################################################################
 def allCommonNames():
     commonNames = {}
 
     filePath = "/hive/data/outside/ncbi/genomes/reports/allCommonNames/asmId.commonName.all.txt"
 
     with open(filePath, 'r', encoding='utf-8') as file:
         reader = csv.reader(file, delimiter='\t')
         for row in reader:
             asmId = row[0]
             gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1]
             commonNames[gcAcc] = row[1]
 
+    if len(commonNames) < 1000000:
+        sys.stderr.write(f"ERROR: assemblyList.py: allCommonNames is failing\n")
+        sys.exit(255)
+
     return commonNames
 
 ####################################################################
 """
      header definitions from assembly_summary_refseq.txt
 
      1  #assembly_accession 20  ftp_path
      2  bioproject          21  excluded_from_refseq
      3  biosample           22  relation_to_type_material
      4  wgs_master          23  asm_not_live_date
      5  refseq_category     24  assembly_type
      6  taxid               25  group
      7  species_taxid       26  genome_size
      8  organism_name       27  genome_size_ungapped
      9  infraspecific_name  28  gc_percent
@@ -176,87 +180,103 @@
 
 ####################################################################
 ### given clade c, return priority from cladePrio
 ####################################################################
 def cladePriority(c):
     global cladePrio
     if c not in cladePrio:
         print(f"ERROR: missing clade {c} in cladePrio")
         sys.exit(255)
 
     return cladePrio[c]
 
 ####################################################################
 ### read one of the NCBI files from
 ### /hive/data/outside/ncbi/genomes/reports/assembly_summary_{suffix}
+### returning three items:
+### 1. sorted list of dictionaries representing all the data read in
+### 2. dictionary: key gcAccession and value year of assembly
+### 3. dictionary: key gcAccession and value genome version and refseq status
 ####################################################################
 def readAsmSummary(suffix, prioExists, comNames, asmIdClade):
 
     # Initialize a list to hold the dictionaries
     dataList = []
+    yearDict = {} # key is gcAccession, value is year
+    statusDict = {}
+         # key is gcAccession, value is version_status and refseq_category
 
     filePath = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_" + suffix
 
     with open(filePath, 'r', encoding='utf-8') as file:
         reader = csv.reader(file, delimiter='\t')
         for row in reader:
             if len(row) < 1:
                continue
             if row[0].startswith('#'):
                continue
-            if row[0] in prioExists:
+            gcAccession = row[0]
+            ### record the year and status here before bailing out so
+            ### these can get into genArk if needed
+            asmName = row[15]
+            year = re.sub(r'/.*', '', row[14])
+            yearDict[gcAccession] = year
+            versionStatus = row[10]
+            refseqCategory = re.sub(r'genome', '', row[4])
+            if refseqCategory == "na":
+               refseqCategory = ""
+            genomeStatus = refseqCategory + versionStatus
+            statusDict[gcAccession] = genomeStatus
+            if gcAccession in prioExists:
                continue
             if len(row) != 38:
                 print(f"ERROR: incorrect number of fields in {file}")
                 sys.exit(255)
-            gcAccession = row[0]
             strain = re.sub(r'breed=', '', row[8])
             s0 = re.sub(r'cultivar=', '', strain)
             strain = re.sub(r'ecotype=', '', s0)
             s0 = re.sub(r'strain=', '', strain)
             strain = re.sub(r'na', '', s0)
-            year = re.sub(r'/.*', '', row[14])
-            asmName = row[15]
             asmId = gcAccession + "_" + asmName
             asmSubmitter = row[16]
             asmType = row[23]
             commonName = "n/a"
             if gcAccession in comNames:
                 commonName = comNames[gcAccession]
             clade = row[24]	# almost like GenArk clades
             if asmId in asmIdClade:	# specific GenArk clade
                 clade = asmIdClade[asmId]
             if clade == "plant":
                 clade = "plants"
             cladeP = cladePriority(clade)
 
             dataDict = {
                 "gcAccession": gcAccession,
                 "asmName": asmName,
                 "scientificName": row[7],
                 "commonName": commonName,
                 "taxId": row[5],
                 "clade": clade,
-                "other": asmSubmitter + " " + strain + " " + asmType + " " + year,
+                "other": asmSubmitter + " " + strain + " " + asmType + " " + year + " " + genomeStatus,
                 "sortOrder": cladeP,
             }
 
             utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()}
             # Append the dictionary to the list
             dataList.append(utf8Encoded)
 
-    return sorted(dataList, key=lambda x: x['sortOrder'])
+    return sorted(dataList, key=lambda x: x['sortOrder']), yearDict, statusDict
 
 ####################################################################
 ### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt
 ### this sets up the GenArk listing
 ####################################################################
 def readGenArkData(url):
     # Initialize a list to hold the dictionaries
     dataList = []
 
     response = requests.get(url)
     response.raise_for_status()
     fileContent = response.text
     fileIo = StringIO(fileContent)
     reader = csv.reader(fileIo, delimiter='\t')
 
@@ -403,30 +423,51 @@
 ###  given:   GCA_000001905.1
 ###  returns: GCA/000/001/905/GCA_000001905.1/hub.txt
 ####################################################################
 def genarkPath(gcAccession):
     # Extract the prefix and the numeric part
     prefix, numPart = gcAccession.split('_')
 
     # Break the numeric part into chunks of three digits
     parts = [numPart[i:i+3] for i in range(0, len(numPart), 3)]
 
     # Join the parts to form the path
     path = f"{prefix}/{parts[0]}/{parts[1]}/{parts[2]}/{gcAccession}/hub.txt"
 
     return path
 
+
+####################################################################
+### scan the genArks items, add years and status if not already there
+####################################################################
+def addYearsStatus(genArks, years, status):
+    for item in genArks:
+        gcAccession = item['gcAccession']
+        if gcAccession in years:
+            year = years[gcAccession]
+            pat = r'\b' + re.escape(year) + r'\b'
+            if not re.search(pat, item['commonName']):
+                if not re.search(pat, item['taxId']):
+                    item['taxId'] += " " + year
+        if gcAccession in status:
+            stat = status[gcAccession]
+            pat = r'\b' + re.escape(stat) + r'\b'
+            if not re.search(pat, item['taxId']):
+                item['taxId'] += " " + stat
+
+    return
+
 ####################################################################
 ### for the genArk set, establish some ad-hoc priorities
 ####################################################################
 def establishPriorities(dbDb, genArk):
     global topPriorities
     global allPriorities
     global priorityCounter
 
     totalItemCount = 0
 
     expectedTotal = len(dbDb) + len(genArk)
     print(f"### expected total: {expectedTotal:4} = {len(dbDb):4} dbDb genomes + {len(genArk):4} genArk genomes")
 
     # first priority are the specific manually selected top items
     itemCount = 0
@@ -668,34 +709,38 @@
     dbDbClades = dbDbCladeList(dbDbNameCladeFile)
     # Get the dbDb.hgcentral table data
     rawData = dbDbData()
     dbDbItems = processDbDbData(rawData, dbDbClades)
 
     # read the GenArk data from hgdownload into a list of dictionaries
     genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt"
     genArkItems = readGenArkData(genArkUrl)
 
     establishPriorities(dbDbItems, genArkItems)
 
     asmIdClade = readAsmIdClade()
     commonNames = allCommonNames()
     print("# all common names: ", len(commonNames))
 
-    refSeqList = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade)
+    refSeqList, refSeqYears, refSeqStatus = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade)
     print("# refSeq assemblies: ", len(refSeqList))
-    genBankList = readAsmSummary("genbank.txt", allPriorities, commonNames, asmIdClade)
+    genBankList, genBankYears, genBankStatus = readAsmSummary("genbank.txt", allPriorities, commonNames, asmIdClade)
     print("# genBank assemblies: ", len(genBankList))
+    ### dictionary unpacking, combine both dictionaries (Python 3.5+)
+    allYears = {**refSeqYears, **genBankYears}
+    allStatus = {**refSeqStatus, **genBankStatus}
+    addYearsStatus(genArkItems, allYears, allStatus)
 
     refSeqGenBankList = refSeqList + genBankList
     print("# refSeq + genBank assemblies: ", len(refSeqGenBankList))
 
     refSeqGenBankSorted =  sorted(refSeqGenBankList, key=lambda x: x['sortOrder'])
     print("# sorted refSeq + genBank assemblies: ", len(refSeqGenBankSorted))
 
 
     outFile = "assemblyList.tsv"
     fileOut = open(outFile, 'w')
 
     totalItemCount = 0
     itemCount = 0
     # Print the dbDb data
     for entry in dbDbItems:
@@ -740,27 +785,27 @@
     print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}")
 
     incrementPriority = len(allPriorities) + 1
     print("# incrementing priorities from: ", incrementPriority)
 
     itemCount = 0
     # Print the refSeq/genBank data
     for entry in refSeqGenBankSorted:
         gcAccession = entry['gcAccession']
         commonName = entry['commonName']
         scientificName = entry['scientificName']
         asmName = entry['asmName']
         clade = entry['clade']
         descr = f"{asmName} {entry['taxId']} {entry['other']}"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\n"
+        outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\n"
         fileOut.write(outLine)
         incrementPriority += 1
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}")
 
     fileOut.close()
 
 if __name__ == "__main__":
     main()