1468e11700d139e452a7e77f9e30b4f87f9f38ec mspeir Fri Jun 13 15:30:58 2025 -0700

    Fixing the script so that only one entry per date is added to combined.html;
    also added several options to the script to control the output directories
    for the various files, which is mostly useful for debugging.
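A hypothetical debugging invocation using the new options (the /tmp paths are
illustrative, and this assumes --newsOnly skips the sitemap and rr.datasets.txt
steps, as its help text suggests):

    ./updateNewsSec --run --newsOnly \
        --perDateDir /tmp/newsTest/perDate/ --newsOutDir /tmp/newsTest/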
diff --git ucsc/updateNewsSec ucsc/updateNewsSec
index 857a324..7a10469 100755
--- ucsc/updateNewsSec
+++ ucsc/updateNewsSec
@@ -1,274 +1,283 @@
 #! /usr/bin/env python3

 import json, sys, operator, argparse, os, urllib.request, subprocess, time, glob
 from datetime import datetime, timedelta, date

 parser = argparse.ArgumentParser(
     formatter_class=argparse.RawDescriptionHelpFormatter,
     description="Update 'News' section in desc.conf. Also updates sitemap and list of datasets currently on the RR.")
 parser.add_argument("-r","--run", action='store_true',
     help='run script to update news, rr datasets list, and sitemap.')
+parser.add_argument("-n","--newsOnly", action='store_true',
+    help='only update news html')
+parser.add_argument("-p","--perDateDir", action='store', default="/hive/data/inside/cells/news/perDate/",
+    help='where to find perDate news files')
+parser.add_argument("-o","--newsOutDir", action='store', default="/hive/data/inside/cells/news/",
+    help='where to output news files')
+parser.add_argument("-d","--rrDatasetsOut", action='store', default="/hive/data/inside/cells/",
+    help='directory where to find/output rr.datasets.txt')
+parser.add_argument("-s","--sitemap", action='store', default="/hive/data/inside/cells/",
+    help='directory where to output sitemap file')
 args = parser.parse_args()

 def buildSitemapSet():
     """Builds a set of all URLs currently in the sitemap."""
-    sitemapPath = "/hive/data/inside/cells/sitemap.cells.txt"
+    sitemapPath = args.sitemap + "/sitemap.cells.txt"
     sitemapSet = set()
     # If the sitemap file exists, then open it and add each line to a new set
     if os.path.exists(sitemapPath):
         sitemap = open(sitemapPath, "r")
         for line in sitemap:
             sitemapSet.add(line)
         sitemap.close()
     # Otherwise, just make an empty file?
     # Maybe I should try/except here instead?
     else:
         sitemap = open(sitemapPath, "w")
         sitemap.close()

     return sitemapSet

 def addSiteToSitemap(entry, sitemapSet):
     """Writes an entry out to the sitemap if it's not already there."""
     # Open sitemap file
-    with open("/hive/data/inside/cells/sitemap.cells.txt", "a") as sitemap:
+    with open(args.sitemap + "/sitemap.cells.txt", "a") as sitemap:
         urlline = "https://" + entry + ".cells.ucsc.edu\n"
         if urlline not in sitemapSet:
             sitemap.write(urlline)

 def makeDate(aDate):
     """Turns a string date separated by "-" into a date object."""
     # To make a date object, we need at least the year, month, and day
     splitDate = aDate.split("-")
     year = int(splitDate[0])
     month = int(splitDate[1])
     day = int(splitDate[2])
     dateObj = date(year, month, day)
     return dateObj

 def mondayBefore(aDate):
     """Returns the date of the Monday before a given date."""
     # Use the makeDate function to turn the input string into a date object
     dateObj = makeDate(aDate)
     # From https://www.programiz.com/python-programming/datetime
     # weekday() is 0 for Monday, so this gives the Monday of the week
     # in which a given date falls
     monday = dateObj - timedelta(days=dateObj.weekday())
     return monday

 def processDataset(dataset, rootDir):
     """Processes a single dataset and returns the short name and a list of info about that dataset."""
     name = dataset["name"]
     # 'firstBuildTime' was added by Max in early 2022, so not all datasets have it;
     # need to be resilient to it not being there
     try:
         lastmtime = dataset["firstBuildTime"]
     except KeyError:
         lastmtime = time.strftime('%Y-%m-%d', time.localtime(os.path.getmtime(os.path.join(rootDir, name, "exprMatrix.bin"))))
     monday = mondayBefore(lastmtime)
     # Collect info for dataset to return at end of function
     short = dataset["shortLabel"]
     count = str(dataset["sampleCount"])
     dList = [short, count, monday]
     return name, dList

 def processCollection(collection, rootDir):
     """Processes a collection and returns a list of info about that collection.
     If it runs into a nested collection, will also process that."""
     collInfo = list()
     name = collection["name"]
     cjson = json.load(open(os.path.join(rootDir, name, "dataset.json"), "r"))
     subdirs = cjson["datasets"]
     for sub in subdirs:
         if 'isCollection' in sub.keys():
             # If we run into another collection, process it in the same way
             subCollInfo = processCollection(sub, rootDir)
             # Need to do this appending here, otherwise subcollection info disappears
             collInfo = collInfo + subCollInfo
         else:
             # Otherwise, process each dataset in a collection as a normal dataset
             subInfo = processDataset(sub, rootDir)
             subDict = {subInfo[0]:subInfo[1]}
             collInfo.append(subDict)
     return collInfo

 def parseBetaDatasets():
     """Parses dataset.json for cells-beta to get a current list of datasets."""
     # Builds a dictionary of datasets currently on the beta, assuming it's a proxy of the RR
     betaInfo = dict() # a smaller dict that contains only name/shortLabel/count/release date
     bdir = "/usr/local/apache/htdocs-cells-beta/"
     cellsBeta = json.load(open(os.path.join(bdir, "dataset.json"), "r"))
     for dataset in cellsBeta["datasets"]:
         if "isCollection" in dataset.keys():
             collectionInfo = processCollection(dataset, bdir)
             cname = dataset["name"]
             cshort = dataset["shortLabel"]
             oldDate = date.today()
             collCellCount = 0
             # After we process the collection, we want to add the info for that
             # to the betaInfo dictionary
             for d in collectionInfo:
                 dname = list(d.keys())[0]
                 info = d[dname]
                 betaInfo[dname] = info
                 collCellCount += int(info[1])
                 day = info[2]
                 # Compare to see if the date for the current dataset in the
                 # collection is older than the oldest seen so far
                 if day < oldDate:
                     oldDate = day
             # Also add an entry to betaInfo that covers the collection as a whole.
             # Cell count for this one is the sum of the cell counts for all subdatasets;
             # date is that of the oldest dataset in the collection
             betaInfo[cname] = [cshort, str(collCellCount), oldDate]
         else:
             dname, bList = processDataset(dataset, bdir)
             betaInfo[dname] = bList
     return betaInfo

 def combineNews():
+    """Combines all of the perDate files into a single html file."""
+
     # From https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
-    newsFiles = sorted(glob.glob("/hive/data/inside/cells/news/perDate/*.html"), reverse=True)
+    newsFiles = sorted(glob.glob(args.perDateDir+"*.html"), reverse=True)
     # Basically, we're gathering up all of the individual news files to combine them into one
-    filenames = ['/hive/data/inside/cells/news/basic.html'] + newsFiles
-    with open('/hive/data/inside/cells/news/combined.html','w') as outfile:
+    basicNews = args.newsOutDir + "/basic.html"
+    filenames = [basicNews] + newsFiles
+    with open(args.newsOutDir + '/combined.html','w') as outfile:
         for fname in filenames:
             with open(fname) as infile:
                 outfile.write(infile.read())
 def writeNewsHtml(toPrint, dateDir):
     """Takes a list of datasets and writes out an html file per day that
     lists all datasets released that day."""
     for day in toPrint:
         dateOut = dateDir + str(day) + ".html"
-        if os.path.exists(dateOut):
-            htmlOut = open(dateOut, "a")
-        else:
+        htmlOut = open(dateOut, "w")
         # Do some work to get the date into something we can easily grab pieces of
         betterDate = time.strftime('%d-%b-%Y', day.timetuple())
         splitDay = betterDate.split("-")
         # Separate vars for month/day/year
         month=splitDay[1]
         dayNum=splitDay[0]
         year=splitDay[2]
         # Write bits out to the news file for the specific day
         htmlOut.write("<b>" + month + " " + dayNum + ", " + year + "</b><br>\n")
         htmlOut.write("New datasets:<br>\n")
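The duplicate-entry bug came from the removed branch above: when a perDate
file already existed, it was reopened in append mode, so every rerun stacked
another copy of the same day's block, and combineNews then folded those
duplicates into combined.html. A minimal sketch of why always opening with
mode "w" makes reruns idempotent (the file name here is a hypothetical
stand-in for a perDate file):

    import os, tempfile

    def writeOnce(dateOut, body):
        # mode "w" truncates any existing file, so rerunning the script
        # leaves exactly one copy of the day's entry
        with open(dateOut, "w") as htmlOut:
            htmlOut.write(body)

    with tempfile.TemporaryDirectory() as tmp:
        out = os.path.join(tmp, "2025-06-13.html")  # hypothetical perDate file
        for _ in range(3):                          # simulate three runs of the script
            writeOnce(out, "New datasets:<br>\n")
        with open(out) as f:
            assert f.read().count("New datasets:") == 1

Note also that combineNews writes basic.html followed by the perDate files
newest-first: ISO-dated names like 2025-06-13.html sort lexicographically in
date order, and reverse=True inverts that order.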