77c5f12e9076b92ee337c80210d44804e3bebafc
max
  Wed Apr 11 10:47:10 2018 -0700
CIRM: making websync faster with many files

diff --git src/utils/webSync src/utils/webSync
index 0015cd2..15be2a8 100755
--- src/utils/webSync
+++ src/utils/webSync
@@ -2,36 +2,35 @@
 
 import logging, sys, optparse, os, time, atexit
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile, isdir
 import string
 
 logDir = "webSyncLog"
 
 flagFname = join(logDir, "isRunning.flag")
 
 # ==== functions =====
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] <url> - download from https server, using files.txt on their end to get the list of files
 
-    To create files.txt on the remote end, run this command:
+    To create files.txt on the remote end, this simple command can be used to create a list of files:
       du -ab > files.txt
-    Or preferably this command (otherwise empty directories will lead to "transmit" errors):
-      find . -type f -exec du -ab {} + > files.txt
-    Or this one if you have symlinks:
-      find -L . -type f -exec du -Lab {} + > files.txt
+    But the above command is slow, includes directories (will lead to warnings) and does not follow
+    symlinks, so rather use this command:
+      find -L . -type f -print0 | du -Lab --files0-from=- > files.txt
 
     Then run this in the download directory:
       webSync https://there.org/
 
     This will create a "webSyncLog" directory in the current directory, compare
     https://there.org/files.txt with the files in the current directory,
     transfer the missing files and write the changes to webSync/transfer.log.
 
     The URL will be saved after the first run and is not necessary from then on. You can add
     cd xxx && webSync to your crontab. It will not start if it's already running (flagfile).
 
     Status files after a run:
     - webSyncLog/biggerHere.txt - list of files that are bigger here. These could be errors or OK.
     - webSyncLog/files.here.txt - the list of files here
     - webSyncLog/files.there.txt - the list of files there, current copy of https://there.org/files.txt
@@ -211,31 +210,32 @@
     filesThereName = join(logDir, "files.there.txt")
     filesHereName = join(logDir, "files.here.txt")
 
     if isfile(filesThereName) and not options.skipScan:
         os.remove(filesThereName)
     if isfile(filesHereName) and not options.skipScan:
         os.remove(filesHereName)
 
     if not isfile(filesThereName):
         fileUrl = join(url, "files.txt")
         logging.debug("Downloading %s" % fileUrl)
         cmd = "wget -q %s --no-check-certificate -O %s" % (fileUrl, filesThereName)
         run(cmd)
 
     if not isfile(filesHereName):
-        cmd = "find -L . -type f -exec du -Lab {} + > %s" % filesHereName
+        #cmd = "find -L . -type f -exec du -Lab {} + > %s" % filesHereName
+        cmd = "find -L . -type f -print0 | du -Lab --files0-from=- > %s" % filesHereName
         run(cmd)
 
     hereFiles, hereDirs = parseFileList(filesHereName)
     thereFiles, thereDirs = parseFileList(filesThereName)
 
     logging.debug("checking %d directories, e.g. %s" % (len(thereDirs), list(thereDirs)[:3]))
 
     for d in thereDirs:
         if not isdir(d):
             os.makedirs(d)
 
     biggerHereFname = join(logDir, "biggerHere.txt")
     missingThereFname = join(logDir, "missingThere.txt")
 
     ariaCmdFname = join(logDir, "aria2c.in.tmp")