77c5f12e9076b92ee337c80210d44804e3bebafc max Wed Apr 11 10:47:10 2018 -0700 CIRM: making websync faster with many files diff --git src/utils/webSync src/utils/webSync index 0015cd2..15be2a8 100755 --- src/utils/webSync +++ src/utils/webSync @@ -2,36 +2,35 @@ import logging, sys, optparse, os, time, atexit from collections import defaultdict from os.path import join, basename, dirname, isfile, isdir import string logDir = "webSyncLog" flagFname = join(logDir, "isRunning.flag") # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] - download from https server, using files.txt on their end to get the list of files - To create files.txt on the remote end, run this command: + To create files.txt on the remote end, this simple command can be used to create a list of files: du -ab > files.txt - Or preferably this command (otherwise empty directories will lead to "transmit" errors): - find . -type f -exec du -ab {} + > files.txt - Or this one if you have symlinks: - find -L . -type f -exec du -Lab {} + > files.txt + But the above command is slow, includes directories (will lead to warnings) and does not follow + symlinks, so rather use this command: + find -L . -type f -print0 | du -Lab --files0-from=- > files.txt Then run this in the download directory: webSync https://there.org/ This will create a "webSyncLog" directory in the current directory, compare https://there.org/files.txt with the files in the current directory, transfer the missing files and write the changes to webSync/transfer.log. The URL will be saved after the first run and is not necessary from then on. You can add cd xxx && webSync to your crontab. It will not start if it's already running (flagfile). Status files after a run: - webSyncLog/biggerHere.txt - list of files that are bigger here. These could be errors or OK. - webSyncLog/files.here.txt - the list of files here - webSyncLog/files.there.txt - the list of files there, current copy of https://there.org/files.txt @@ -211,31 +210,32 @@ filesThereName = join(logDir, "files.there.txt") filesHereName = join(logDir, "files.here.txt") if isfile(filesThereName) and not options.skipScan: os.remove(filesThereName) if isfile(filesHereName) and not options.skipScan: os.remove(filesHereName) if not isfile(filesThereName): fileUrl = join(url, "files.txt") logging.debug("Downloading %s" % fileUrl) cmd = "wget -q %s --no-check-certificate -O %s" % (fileUrl, filesThereName) run(cmd) if not isfile(filesHereName): - cmd = "find -L . -type f -exec du -Lab {} + > %s" % filesHereName + #cmd = "find -L . -type f -exec du -Lab {} + > %s" % filesHereName + cmd = "find -L . -type f -print0 | du -Lab --files0-from=- > %s" % filesHereName run(cmd) hereFiles, hereDirs = parseFileList(filesHereName) thereFiles, thereDirs = parseFileList(filesThereName) logging.debug("checking %d directories, e.g. %s" % (len(thereDirs), list(thereDirs)[:3])) for d in thereDirs: if not isdir(d): os.makedirs(d) biggerHereFname = join(logDir, "biggerHere.txt") missingThereFname = join(logDir, "missingThere.txt") ariaCmdFname = join(logDir, "aria2c.in.tmp")