#! /usr/bin/env python3

import json, sys, operator, argparse, os, urllib.request, subprocess, time, glob
from datetime import datetime, timedelta, date

parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Update 'News' section in desc.conf. Also updates sitemap and list of datasets currently on the RR.")
parser.add_argument("-r","--run", action='store_true',
    help='run script to update news, rr datasets list, and sitemap.')
# Parse the real argv only when executed as a script; when imported (e.g. by
# tests) fall back to the defaults so argparse does not consume — and possibly
# choke on — the importing program's command line.
if __name__ == "__main__":
    args = parser.parse_args()
else:
    args = parser.parse_args([])

def buildSitemapSet():
    """Return a set of all URL lines (trailing newline included) currently in the sitemap.

    If the sitemap file does not exist yet, create it empty so that later
    appends in addSiteToSitemap always have a file to open."""
    sitemapPath = "/hive/data/inside/cells/sitemap.cells.txt"
    sitemapSet = set()
    if os.path.exists(sitemapPath):
        # 'with' guarantees the handle is closed even on error
        with open(sitemapPath, "r") as sitemap:
            for line in sitemap:
                sitemapSet.add(line)
    else:
        # Create an empty sitemap file for later appends
        with open(sitemapPath, "w"):
            pass
    return sitemapSet

def addSiteToSitemap(entry, sitemapSet):
    """Append https://<entry>.cells.ucsc.edu to the sitemap file if it is not
    already present in sitemapSet (the set built by buildSitemapSet)."""
    with open("/hive/data/inside/cells/sitemap.cells.txt", "a") as sitemap:
        urlline = "https://" + entry + ".cells.ucsc.edu\n"
        if urlline not in sitemapSet:
            sitemap.write(urlline)

def makeDate(aDate):
    """Turn a "-"-separated date string (YYYY-MM-DD) into a datetime.date."""
    # only year, month and day are needed to build a date object
    splitDate = aDate.split("-")
    return date(int(splitDate[0]), int(splitDate[1]), int(splitDate[2]))

def mondayBefore(aDate):
    """Return the date of the Monday of the week in which aDate falls."""
    dateObj = makeDate(aDate)
    # weekday() is 0 for Monday, so subtracting it lands on that week's Monday
    # (from https://www.programiz.com/python-programming/datetime)
    return dateObj - timedelta(days=dateObj.weekday())

def processDataset(dataset, rootDir):
    """Process a single dataset dict and return
    (name, [shortLabel, sampleCount, releaseMonday])."""
    name = dataset["name"]
    # 'firstBuildTime' was added by Max in early 2022, so not all datasets
    # have it; be resistant and fall back to the matrix file's mtime.
    try:
        lastmtime = dataset["firstBuildTime"]
    except KeyError:
        lastmtime = time.strftime('%Y-%m-%d',
            time.localtime(os.path.getmtime(os.path.join(rootDir, name, "exprMatrix.bin"))))
    monday = mondayBefore(lastmtime)
    short = dataset["shortLabel"]
    count = str(dataset["sampleCount"])
    return name, [short, count, monday]

def processCollection(collection, rootDir):
    """Return a list of {name: info} dicts, one per dataset in a collection.

    Nested collections are processed recursively and their results merged in."""
    collInfo = list()
    name = collection["name"]
    # 'with' ensures the dataset.json handle is closed (previously leaked)
    with open(os.path.join(rootDir, name, "dataset.json"), "r") as jsonFh:
        cjson = json.load(jsonFh)
    for sub in cjson["datasets"]:
        if 'isCollection' in sub:
            # Nested collection: recurse and merge, otherwise its info is lost
            collInfo = collInfo + processCollection(sub, rootDir)
        else:
            # Plain dataset inside the collection
            subName, subList = processDataset(sub, rootDir)
            collInfo.append({subName: subList})
    return collInfo

def parseBetaDatasets():
    """Parse dataset.json for cells-beta (assumed to be a proxy of the RR) and
    return a dict mapping dataset name -> [shortLabel, cellCount, releaseMonday]."""
    betaInfo = dict()  # smaller dict with only name/shortLabel/count/release date
    bdir = "/usr/local/apache/htdocs-cells-beta/"
    # 'with' ensures the dataset.json handle is closed (previously leaked)
    with open(os.path.join(bdir, "dataset.json"), "r") as jsonFh:
        cellsBeta = json.load(jsonFh)
    for dataset in cellsBeta["datasets"]:
        if "isCollection" in dataset:
            collectionInfo = processCollection(dataset, bdir)
            cname = dataset["name"]
            cshort = dataset["shortLabel"]
            oldDate = date.today()
            collCellCount = 0
            # Add every sub-dataset to betaInfo while accumulating totals
            # for the collection as a whole
            for d in collectionInfo:
                dname = list(d.keys())[0]
                info = d[dname]
                betaInfo[dname] = info
                collCellCount += int(info[1])
                day = info[2]
                # keep the oldest release date seen among the sub-datasets
                if day < oldDate:
                    oldDate = day
            # One entry covering the collection as a whole: cell count is the
            # sum over sub-datasets, date is that of the oldest one
            betaInfo[cname] = [cshort, str(collCellCount), oldDate]
        else:
            dname, bList = processDataset(dataset, bdir)
            betaInfo[dname] = bList
    return betaInfo
def combineNews(newsDir="/hive/data/inside/cells/news"):
    """Rebuild combined.html from basic.html followed by all per-date news files.

    newsDir defaults to the production news directory; it is a parameter so the
    function can be pointed at a scratch directory (e.g. for testing).
    Opening combined.html in 'w' mode truncates it, so a fresh file is always
    produced rather than appending to a stale one."""
    # From https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
    # Per-date filenames sort lexically by date, so reverse=True is newest-first
    newsFiles = sorted(glob.glob(os.path.join(newsDir, "perDate", "*.html")), reverse=True)
    # Gather up all of the individual news files to combine them into one:
    # the static basic.html header first, then the dated entries
    filenames = [os.path.join(newsDir, "basic.html")] + newsFiles
    with open(os.path.join(newsDir, "combined.html"), "w") as outfile:
        for fname in filenames:
            with open(fname) as infile:
                outfile.write(infile.read())
" + month + " " + dayNum + ", " + year + "
\n") htmlOut.write("New datasets:
\n