1468e11700d139e452a7e77f9e30b4f87f9f38ec
mspeir
  Fri Jun 13 15:30:58 2025 -0700
Fixing script so that only one entry per date is added to combined.html; also added several options to control the output directories for the various files, which is mostly useful for debugging
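
A typical debug run might point the output at a scratch area instead of the production /hive paths; the option names below are the new argparse flags, but the /tmp paths are only placeholders:

    ./updateNewsSec --run --newsOnly \
        --perDateDir /tmp/newsTest/perDate/ \
        --newsOutDir /tmp/newsTest/

With --newsOnly, the rr.datasets.txt and sitemap writes are skipped, so only the news directories need to be redirected.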

diff --git ucsc/updateNewsSec ucsc/updateNewsSec
index 857a324..7a10469 100755
--- ucsc/updateNewsSec
+++ ucsc/updateNewsSec
@@ -1,52 +1,62 @@
 #! /usr/bin/env python3
 
 import json, sys, operator, argparse, os, urllib.request, subprocess, time, glob
 from datetime import datetime, timedelta, date
 
 parser = argparse.ArgumentParser(
     formatter_class=argparse.RawDescriptionHelpFormatter,
     description="Update 'News' section in desc.conf. Also updates sitemap and list of datasets currently on the RR.")
 parser.add_argument("-r","--run", action='store_true',
     help='run script to update news, rr datasets list, and sitemap.')
+parser.add_argument("-n","--newsOnly", action='store_true',
+    help='only update news html')
+parser.add_argument("-p","--perDateDir", action='store', default="/hive/data/inside/cells/news/perDate/",
+    help='where to find perDate news file')
+parser.add_argument("-o","--newsOutDir", action='store', default="/hive/data/inside/cells/news/",
+    help='where to output news file')
+parser.add_argument("-d","--rrDatasetsOut", action='store', default="/hive/data/inside/cells/",
+    help='directory where to find/output rr.datasets.txt')
+parser.add_argument("-s","--sitemap", action='store', default="/hive/data/inside/cells/",
+    help='directory where to output sitemap file')
 args = parser.parse_args()
 
 def buildSitemapSet():
     """Builds a set of all URLs currently in the sitemap."""
 
-    sitemapPath = "/hive/data/inside/cells/sitemap.cells.txt"
+    sitemapPath = args.sitemap + "/sitemap.cells.txt"
     sitemapSet = set()
 
     # If the sitemap file exists, then open it and add each line to a new set
     if os.path.exists(sitemapPath):
         sitemap = open(sitemapPath, "r")
         for line in sitemap:
             sitemapSet.add(line)
         sitemap.close()
     # Otherwise, just make an empty file?
     # Maybe I should try/except here instead?
     else:
         sitemap = open(sitemapPath, "w")
         sitemap.close()
 
     return sitemapSet
 
 def addSiteToSitemap(entry, sitemapSet):
     """Writes an entry out to the sitemap if it's not already there."""
 
     # Open sitemap file
-    with open("/hive/data/inside/cells/sitemap.cells.txt", "a") as sitemap:
+    with open(args.sitemap + "/sitemap.cells.txt", "a") as sitemap:
         urlline = "https://" + entry + ".cells.ucsc.edu\n"
         if urlline not in sitemapSet:
             sitemap.write(urlline)
 
 def makeDate(aDate):
     """Will turn a string date separated by "-" into a date object."""
 
     # to make a date object, we need at least the year, month, and day
     splitDate = aDate.split("-")
     year = int(splitDate[0])
     month = int(splitDate[1])
     day = int(splitDate[2])
     dateObj = date(year, month, day)
 
     return dateObj
@@ -134,139 +144,138 @@
                 # Do some comparison to see if data for current dataset in collection
                 # is older than the last
                 if day < oldDate:
                     oldDate = day
             # Also add an entry to betaInfo that covers the collection as a whole
             # Cell count for this one is the sum of the cell counts for all subdatasets
             # Date is that for the oldest dataset in the collection
             betaInfo[cname] = [cshort, str(collCellCount), oldDate]
         else:
             dname, bList = processDataset(dataset, bdir)
             betaInfo[dname] = bList
 
     return betaInfo
 
 def combineNews():
+    """ Combines all of the perDate files into a single html file"""
+
     # From https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
-    newsFiles = sorted(glob.glob("/hive/data/inside/cells/news/perDate/*.html"), reverse=True)
+    newsFiles = sorted(glob.glob(args.perDateDir+"*.html"), reverse=True)
     # Basically, we're gathering up all of the individual news files to combine them into one
-    filenames = ['/hive/data/inside/cells/news/basic.html'] + newsFiles
-    with open('/hive/data/inside/cells/news/combined.html','w') as outfile:
+    basicNews = args.newsOutDir + "/basic.html"
+    filenames = [basicNews] + newsFiles
+    with open(args.newsOutDir + '/combined.html','w') as outfile:
         for fname in filenames:
             with open(fname) as infile:
                 outfile.write(infile.read())
 
 def writeNewsHtml(toPrint, dateDir):
     """Takes a list of datasets and writes out an html file per day that lists
        all datasets released that day."""
 
     for day in toPrint:
         dateOut = dateDir + str(day) + ".html"
-        if os.path.exists(dateOut):
-            htmlOut = open(dateOut, "a")
-        else:
         htmlOut = open(dateOut, "w")
 
         # Do some work to get the date into something we can easily grab pieces of
         betterDate = time.strftime('%d-%b-%Y', day.timetuple())
         splitDay = betterDate.split("-")
         # Separate vars for month/day/year
         month=splitDay[1]
         dayNum=splitDay[0]
         year=splitDay[2]
 
         # Write bits out to the news file for the specific day
         htmlOut.write("<p><b>" + month + " " + dayNum + ", " + year + "</b></p>\n")
         htmlOut.write("<p>New datasets:</p>\n<ul>\n")
         for line in toPrint[day]:
             htmlOut.write(line)
         htmlOut.write("</ul>\n")
         htmlOut.close()
 
 def main():
     if args.run == True:
         # From https://stackoverflow.com/questions/19216334/python-give-start-and-end-of-week-data-from-a-given-date
         # and https://www.programiz.com/python-programming/datetime/current-datetime
         # Get date for Monday, so that all datasets added in the last week show up under the same date
         start = mondayBefore(date.today().strftime('%Y-%m-%d'))
 
         # File should contain RR datasets
         # First run of this script will generate this file,
         # move it out of the way to regenerate, though this means that
         # everything will be noted as being released on the same day
-        rrDatasetsPath = "/hive/data/inside/cells/rr.datasets.txt"
-        dateDir = "/hive/data/inside/cells/news/perDate/"
+        rrDatasetsPath = args.rrDatasetsOut + "/rr.datasets.txt"
+        dateDir = args.perDateDir
 
         sitemapSet = buildSitemapSet()
 
         # This should only happen if this is the first time the script is run
         # or if previous version is moved/deleted
         if not os.path.exists(rrDatasetsPath):
             betaInfo = parseBetaDatasets()
             toPrint = dict()
             for entry in betaInfo.keys():
                 label = betaInfo[entry][0]
                 count = betaInfo[entry][1]
                 day = betaInfo[entry][2]
-                line =  str(day) + "\t" + entry + "\t" + label + "\t" + count + "\n"
+                line = str(day) + "\t" + entry + "\t" + str(label) + "\t" + count + "\n"
+                if not args.newsOnly:
                     with open(rrDatasetsPath, "a") as rrDatasets:
                         rrDatasets.write(line)
                 # Check if '/' in shortname, if so means it's a dataset in a collection
                 # and we're not outputting it to the sitemap or to the announcements
                 if "/" not in entry:
+                    if not args.newsOnly:
                         addSiteToSitemap(entry, sitemapSet)
                     outLine = "  <li><a href='?ds=" + entry + "' target='_blank'>" + label + "</a>\n"
                     if day not in toPrint.keys():
                         toPrint[day] = [outLine]
                     else:
                         toPrint[day].append(outLine)
 
             writeNewsHtml(toPrint, dateDir)
             combineNews()
 
         else: # This is the main part of the function that prints out the html for a news update
             betaInfo = parseBetaDatasets()
 
             # Parse the old rr.datasets.txt file so we know what's already out there
             oldNames = set()
             oldDatasets = open(rrDatasetsPath,"r")
             for line in oldDatasets:
                 splitLine = line.strip().split("\t")
                 name = splitLine[1]
-                oldDate = makeDate(splitLine[0])
-                # Remove entry from betaInfo dict if it existed in rrDatasetsFile
-                if name in betaInfo.keys():
-                    del betaInfo[name]
                 oldNames.add(name)
             oldDatasets.close()
 
             sitemapSet = buildSitemapSet()
 
-            if len(betaInfo) > 0:
             allDatasets = open(rrDatasetsPath,"a")
             # Go through and determine which datasets need to be added to the news announcements
             toPrint = dict()
             for entry in betaInfo:
                 label = betaInfo[entry][0]
                 count = betaInfo[entry][1]
                 day = betaInfo[entry][2]
-                    line = str(day) + "\t" + entry + "\t" + label[0] + "\t" + count + "\n"
+                line = str(day) + "\t" + entry + "\t" + str(label) + "\t" + count + "\n"
+                if not args.newsOnly:
                     if entry not in oldNames:
                         allDatasets.write(line)
                 # Check if '/' in shortname, if so means it's a dataset in a collection
                 # and we're not outputting it to the sitemap or to the announcements
                 if "/" not in entry:
+                    if not args.newsOnly:
                         addSiteToSitemap(entry, sitemapSet)
                     outLine = "  <li><a href='?ds=" + entry + "' target='_blank'>" + label + "</a>\n"
                     # If it doesn't already exist in toPrint, add it
                     if day not in toPrint.keys():
                         toPrint[day] = [outLine]
                     else:
                         toPrint[day].append(outLine)
 
             # Print out HTML for new datasets to be put into /hive/data/inside/cells/datasets/desc.conf
             writeNewsHtml(toPrint, dateDir)
             combineNews()
     else:
         parser.print_help(sys.stderr)
         sys.exit(1)
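
For reference, mondayBefore() is called in main() but sits outside this hunk; going by the comment above its call site ("Get date for Monday, so that all datasets added in the last week show up under the same date"), a minimal sketch of what such a helper presumably looks like (the parsing and variable names here are assumptions, not the committed code):

    from datetime import date, timedelta

    def mondayBefore(aDate):
        # Parse a "YYYY-MM-DD" string and step back to that week's Monday;
        # date.weekday() is 0 on Mondays, so the subtraction is a no-op then.
        year, month, day = (int(piece) for piece in aDate.split("-"))
        asDate = date(year, month, day)
        return asDate - timedelta(days=asDate.weekday())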