9f13a83c45f20774ba2f749cc0ce25ba9c64aa73
max
  Fri Jun 27 08:20:16 2025 -0700
removing temp file after datasetDiffs run, because of perms problems

diff --git ucsc/datasetDiffs ucsc/datasetDiffs
index 4d1bd03..ff04dc9 100755
--- ucsc/datasetDiffs
+++ ucsc/datasetDiffs
@@ -1,265 +1,266 @@
 #! /usr/bin/env python3
 
 import argparse, threading, sys, tempfile
 from json import load
 from subprocess import Popen,PIPE
 import subprocess
 from operator import eq
 from os import unlink, mkdir
-import logging
 
 def threader(to_fetch, mach_djson, hidden):
     """
     use multithreading to read the dataset.json files in parallel
     """
     # from: https://stackoverflow.com/questions/16181121/a-very-simple-multithreading-parallel-url-fetching-without-queue
     threads = [threading.Thread(target=fetch_data, args=(to_fetch[d],d,mach_djson, hidden)) for d in to_fetch]
     for thread in threads:
         thread.start()
     for thread in threads:
         thread.join()
 
 def fetch_data(loc, dname, out_dict, hidden):
     """
     Gets content url or and saves to given dict where key is the dataset name and
     value is the json contents
 
     also indicates if dataset is hidden on one of the various machines
     """
     djson = load(open(loc,"r"))
     out_dict[dname] = djson
 
     if 'hide' in djson.values():
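         # a dataset counts as hidden here if any top-level value in its dataset.json is 'hide'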
         hidden.add(dname)
 
 def print_diffs(key, mach1_name, mach1_val, mach2_name, mach2_val):
     """Print out values that differ between two machines, e.g. cells-beta and cells-test"""
     print(key, "\n", mach1_name + ": ", mach1_val, "\n", mach2_name + ": ", mach2_val)
 
 def process_dict(mach1_name, mach1_dict, mach2_name, mach2_dict):
     """Processes a dictionary, printing out differenting values if needed.
        Or continuing to process if it finds another dictionary."""
     for key in mach1_dict.keys():
         mach1_val = mach1_dict[key]
         mach2_val = mach2_dict[key]
         if isinstance(mach1_val, dict):
             process_dict(mach1_name, mach1_val, mach2_name, mach2_val)
         else:
             if mach1_val != mach2_val:
                 print_diffs(key, mach1_name, mach1_val, mach2_name, mach2_val)
 
 def compare_machines(mach1_dict, mach1_name, mach2_dict, mach2_name, verbose=False):
     """
     - Takes a dictionary containing dataset.json for each dataset on two machines
     - Prints those that are different between the two machines and highlights if
       a dataset exists on one machine but not the other
     - 'Verbose' mode prints the exact diffs, otherwise just prints the dataset name
     """
 
     print(f'\nDatasets with diffs between {mach1_name} and {mach2_name}:')
 
     # Sort the union of datasets from both machines alphabetically, so
     # datasets present on only one machine are caught too
     sorted_datasets = sorted(set(mach1_dict) | set(mach2_dict))
 
     for dataset in sorted_datasets:
         # Check if the overall dicts are the same for a dataset on the two machines
         # If not, we'll go through and print out only those values that differ
         try:
             if not eq(mach1_dict[dataset], mach2_dict[dataset]):
                 # Print dataset name
                 print(dataset)
 
                 # Script has option to print only names of datasets that have diffs
                 # This whole chunk gets skipped if that's the case
                 if verbose:
                     for key in mach2_dict[dataset].keys():
                         # If the current value is a dict, recurse into it via process_dict
                         if isinstance(mach2_dict[dataset][key], dict):
                             process_dict(mach2_name, mach2_dict[dataset][key], mach1_name, mach1_dict[dataset][key])
                         # Otherwise check whether the values on the two machines
                         # differ and print only the diffs
                         else:
                             try:
                                 val1 = mach1_dict[dataset][key]
                                 val2 = mach2_dict[dataset][key]
                                 if val1 != val2:
                                     # The value for this key is often large and bloats the results,
                                     # making it difficult to actually see the diffs. Only print a
                                     # message saying the value differs between the two machines.
                                     if key == "metaFields":
                                         print(key, mach1_name + " and " + mach2_name + " differ")
                                     # Otherwise print out the field that differs and what the
                                     # values are on each machine
                                     else:
                                         print_diffs(key, mach2_name, val2, mach1_name, val1)
                             except KeyError:
                                 print(key, f'present on {mach1_name}, but not on {mach2_name} or vice versa')
         except KeyError:
             print(dataset, f'present on {mach1_name}, but not on {mach2_name} or vice versa')
 
 # Set up script arguments
 parser = argparse.ArgumentParser(
     formatter_class=argparse.RawDescriptionHelpFormatter,
     description="Shows diffs between cells-test and cells-beta. By default shows only names ")
 parser.add_argument("-r","--run", action='store_true',
     help='run script to look for diffs')
 parser.add_argument("-s","--stats", action='store_true',
     help='Print stats about the number of datasets/collections on each machine')
 parser.add_argument("-d","--hidden", action='store_true',
     help='Print datasets that are hidden on each machine')
 parser.add_argument("-v","--verbose", action='store_true',
     help='Print the exact diffs rather than just dataset names')
 args = parser.parse_args()
 
 def main():
     """Main function of datasetDiffs. Runs all of the other functions of the program."""
     # Script only runs if option to run is set via -r/--run
     if args.run:
         # Open dataset.json for every dataset on cells-test, cells-beta, and the RR
         # Paths are hard coded since comparing two arbitrary hosts or collections
         # of datasets isn't a realistic use case
         # Each file is loaded as a dictionary via json.load
         ct_hide = set() # hidden cells-test datasets
         ct_djson = dict() # holds contents of json for each cells-test dataset
         ct_loc = dict() # file paths to process via multithreading
         ct_base = "/usr/local/apache/htdocs-cells/"
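         # find every dataset.json under the htdocs dir; cut -f6- drops the five
         # leading path components and sed strips the trailing /dataset.json,
         # leaving dataset names relative to the htdocs dir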
         bash_cmd = 'find ' + ct_base + r' -name dataset.json | cut -f6- -d "/" | sed "s/\/dataset.json//g" | sort'
         p = Popen(bash_cmd, shell=True, stdout=PIPE, stderr=PIPE)
         cmdout, cmderr = p.communicate()
         ct_datasets = set(cmdout.decode().strip().split('\n'))
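         # the htdocs root has its own dataset.json, which isn't a dataset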
         ct_datasets.discard('dataset.json')
         for dname in ct_datasets:
             ct_loc[dname] = ct_base + dname + "/dataset.json"
         threader(ct_loc, ct_djson, ct_hide)
         # dicts/sets to hold everything
         cb_hide = set() # hidden cells-beta datasets
         cb_djson = dict() # holds contents of json for each cells-beta dataset
         cb_loc = dict()
         # get list of all datasets on cells-beta
         cb_base = '/usr/local/apache/htdocs-cells-beta/'
         bash_cmd = 'find ' + cb_base + r' -name dataset.json | cut -f6- -d "/" | sed "s/\/dataset.json//g" | sort'
         p = Popen(bash_cmd, shell=True, stdout=PIPE, stderr=PIPE)
         cmdout, cmderr = p.communicate()
         cb_datasets = set(cmdout.decode().strip().split('\n'))
         cb_datasets.discard('dataset.json')
         for dname in cb_datasets:
             cb_loc[dname] = cb_base + dname + "/dataset.json"
         threader(cb_loc, cb_djson, cb_hide)
 
         # dicts/sets to hold everything
         rr_hide = set() # holds hidden rr datasets
         rr_djson = dict() # holds contents of json files for each rr dataset
         rr_datasets = set() # holds names of all datasets on rr
         rr_paths = dict() # file paths to process via multithreading
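         # The RR lives on a remote host, so we ssh over to list its dataset.json
         # files and then rsync just those files to a local tmp dir for diffing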
         # Get list of dataset.json files for rr
         cmd = ['ssh', 'qateam@hgw0.soe.ucsc.edu', 'find', ct_base, '-name',
                'dataset.json', '|', 'cut', '-f', '6-', '-d', '"/"']
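         # ssh joins its arguments into a single remote command line, so the '|'
         # and the cut run on the remote host rather than locally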
         p = Popen(cmd, shell=False, stdout=PIPE, stderr=PIPE)
         cmdout, cmderr = p.communicate()
         rr_files = set(cmdout.decode().rstrip().split('\n'))
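+        # Write the relative dataset.json paths to a per-run temp file for
+        # rsync; delete=False keeps the file on disk until we remove it below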
-        #with tempfile.NamedTemporaryFile(prefix='cb_', delete=False) as tmp:
-            #tmp.write(cmdout)
-        tmp = open("/data/tmp/filesForRsync.txt", "wb")
-        tmp.write(cmdout)
-        tmp.close()
+        with tempfile.NamedTemporaryFile(prefix='cb_', delete=False) as tmp:
+            tmp.write(cmdout)
+            tmp.flush()
 
         # Wanted to use TemporaryDirectory here but couldn't get it to work
         rr_base = '/data/tmp/cb_tmp_new'
         # Make the new directory, continue on if dir already exists
         try:
             mkdir(rr_base)
         except OSError:
             pass
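         # rsync reads the relative dataset.json paths from our tmp file and
         # copies just those files from the RR htdocs tree into rr_base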
         rsync_cmd = ['rsync', '--files-from=' + tmp.name,
                      'qateam@hgw0.soe.ucsc.edu:/usr/local/apache/htdocs-cells/', rr_base]
         print("running ", rsync_cmd)
-        #rsync_p = Popen(rsync_cmd, shell=True, stdout=PIPE, stderr=PIPE)
-        #rsync_cmdout, rsync_cmderr = p.communicate()
         subprocess.run(rsync_cmd, check=True)
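+        # Remove the tmp file now that rsync is done; leaving it behind caused
+        # perms problems for other users of this script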
+        unlink(tmp.name)
 
-        #if p.returncode != 0:
-            #logging.error("error when running: "+repr(" ".join(rsync_cmd)))
-            #assert(False)
         # go through rr_files
         for fname in rr_files:
             # turn full path name into a simple dataset name
             fsplit = fname.strip().split('/')
             # from https://stackoverflow.com/questions/12453580/how-do-i-concatenate-items-in-a-list-to-a-single-string
             dname = '/'.join(fsplit[:-1])
             if dname != '':
                 # now turn dataset name into url
                 dpath = rr_base + '/' + dname  + '/dataset.json'
                 # add that url to a dict
                 rr_paths[dname] = dpath
                 rr_datasets.add(dname)
         threader(rr_paths, rr_djson, rr_hide)
 
-        # Close out our tmp file
-        tmp.close()
-        #unlink(tmp.name)
 
         if args.hidden:
             if len(ct_hide) != 0:
                 print("cells-test hidden datasets:")
                 for dataset in sorted(list(ct_hide)):
                     print(dataset)
                 print("\n")            
             if len(cb_hide) != 0:
                 print("cells-beta hidden datasets:")
                 for dataset in sorted(list(cb_hide)):
                     print(dataset)
                 print("\n")
             if len(rr_hide) != 0:
                 print("cells hidden datasets:")
                 for dataset in sorted(list(rr_hide)):
                     print(dataset)
                 print("\n")
 
         # Print top-level datasets that exist on cells-test only
         dev_only = sorted(x for x in ct_datasets.difference(cb_datasets) if "/" not in x)
         # Only print something if there are actually datasets on cells-test only
         if len(dev_only) != 0:
             print("Datasets on cells-test only:")
             for dataset in dev_only:
                 print(dataset)
             print("\n")
         # Now top-level datasets on cells-beta but not on the RR
         beta_only = sorted(x for x in cb_datasets.difference(rr_datasets) if "/" not in x)
         if len(beta_only) != 0:
             print("Datasets on cells-beta, but not on RR:")
             for dataset in beta_only:
                 print(dataset)
             print("\n") 
 
         # Compare cells-test to cells-beta datasets
         print('Verbose mode:', args.verbose)
         compare_machines(ct_djson, "cells-test", cb_djson, "cells-beta", args.verbose)
         # And then cells-beta to cells
         compare_machines(cb_djson, "cells-beta", rr_djson, "cells", args.verbose)
 
         # make sets of only top-level datasets
         # from: https://stackoverflow.com/questions/33944647/what-is-the-most-pythonic-way-to-filter-a-set
         ct_top = set(x for x in ct_datasets if "/" not in x)
         cb_top = set(x for x in cb_datasets if "/" not in x)
         rr_top = set(x for x in rr_datasets if "/" not in x)
 
         # Print stats about number of datasets on each machine if arg set
         if args.stats:
             # top-level datasets, or those w/o a '/' in their name
             print("Num of top-level datasets/collections")
             print("\tcells-test:", len(ct_top))
             print("\tcells-beta:", len(cb_top))
             print("\tcells:", len(rr_top))
             # Now all datasets
             print("Num of datasets (including those in collections):")
             print("\tcells-test:", len(ct_datasets))
             print("\tcells-beta:", len(cb_datasets))
             print("\tcells:", len(rr_datasets))
 
     else:
         parser.print_help(sys.stderr)
         sys.exit(1)
 
 if __name__ == "__main__":
     main()