5538246d3229013664d13136606cc7a3c2fa779a
chmalee
  Wed Nov 20 08:26:47 2019 -0800
Adding a script that assists with some common track archiving operations, refs #21825

diff --git src/hg/utils/archiveTracks.sh src/hg/utils/archiveTracks.sh
new file mode 100755
index 0000000..ca70b4c
--- /dev/null
+++ src/hg/utils/archiveTracks.sh
@@ -0,0 +1,287 @@
+#!/bin/bash
+
+# this script backs up track data (both bigBeds and mysql tables)
+# to a location of your choosing
+
+set -beEu -o pipefail  # -b: report finished jobs at once; -e: exit on error; -E: functions inherit ERR traps; -u: unset variables are errors; pipefail: a pipeline fails if any stage fails
+
+# globals
+archiveDir="/hive/data/inside/archive"  # archive root; replaced by the first positional argument
+tables=""  # value of -t
+files=""  # value of -f
+versionName=""  # value of -v; falls back to the output of 'date +%F'
+trackSetName=""  # third positional argument
+dbList=""  # second positional argument
+verbose="FALSE"  # set to "TRUE" by -s
+EXIT_STATUS=0  # set to 1 when an error is reported; passed to the final 'exit'
+
+usage() { # Print the help text on stdout; callers pick the stream and the exit code.
+cat << EOF
+Usage: $(basename "$0") [-hs] [-t tables] [-f fileList] [-v version] archiveRoot database(s) trackSetName
+
+Required Positional arguments:
+archiveRoot        The root location of the backup directory (/hive/data/inside/archive/).
+database(s)        A single database, a double quoted list of databases, or a file listing databases to back up
+trackSetName       The name of this track archive set. A directory will be created
+                   in the archiveRoot location with the files or tables from the -t
+                   or -f arguments.
+
+Optional arguments (Must precede required args):
+-h                  Display this help and exit.
+-t                  A table name, a double quoted list of tables, or a file with a list of tables to backup.
+-f                  A file listing the files to back up for this track set (/gbdb/ files).
+-v                  Use a specified version string like "v1" instead of the output of 'date +%F'.
+-s                  Print verbose status along the way to stderr.
+
+Backs up a list of tables or files for a track for a single database or list of
+databases. Note the third required argument of what this track set is named. Exits 0 for success
+and 1 on failure. The hierarchy created is:
+\$archiveRoot/\$database/\$trackSetName/\$version/
+
+Example Usages:
+To back up the big files of the crispr track, as well as the crisprRanges table:
+find /gbdb/hg38/crispr/crispr{Details.tab,.bb} -print  > crisprFiles.txt
+$(basename "$0") -t crisprRanges -f crisprFiles.txt /hive/data/inside/archive/ hg38 "CRISPR"
+EOF
+}
+
+printVars() {
+    # Debug helper: dump the parsed globals on stdout, then exit with status 1.
+    # Reads dbList and trackSetName, the names the argument parsing assigns.
+    printf "found variables:\narchiveDir: '%s'\n" "${archiveDir}"
+    printf "versionName: '%s'\ndbs: '%s'\n" "${versionName}" "${dbList}"
+    printf "trackName: '%s'\n" "${trackSetName}"
+    printf "tbls: '%s'\n" "${tables}"
+    printf "files: '%s'\n" "${files}"
+    exit 1
+}
+
+backupOneTable() {
+    # Dump one table into the current directory as <tbl>.txt.gz plus <tbl>.sql
+    # (the output of 'show create table'), then back up its trackDb row, if any.
+    #   $1 - database name
+    #   $2 - table name
+    # A failed dump only warns: the table may exist solely as a trackDb entry.
+    local db=$1 tbl=$2
+    if [[ "${verbose}" == "TRUE" ]]; then
+        printf "backing up %s\n" "${tbl}" 1>&2
+    fi
+    # Testing hgsql inside the 'if' keeps 'set -e' from aborting on a failed dump.
+    if ! hgsql -Ne "select * from ${tbl}" "${db}" > "${tbl}.txt"; then
+        # drop the empty or partial dump left behind by the redirection
+        rm -f -- "${tbl}.txt"
+        printf "WARNING: back up of table '%s' failed. Ignore if only the trackDb entry is being backed up.\n" "${tbl}" 1>&2
+    else
+        # -f forces an overwrite if you are re-running this script
+        gzip -f -- "${tbl}.txt"
+        hgsql --raw -Ne "show create table ${tbl}" "${db}" > "${tbl}.sql"
+    fi
+    backupTrackDb "${db}" "${tbl}"
+}
+
+backupTables() {
+    # dump tables to current directory
+    #   $1 - database name
+    #   $2 - a file listing table names, or a quoted list like "refGene mrna"
+    local db=$1 tbls=$2 tbl
+    local -a tblNames=()
+    if [[ "${verbose}" == "TRUE" ]]; then
+        printf "got tables %s for db %s\n" "${tbls}" "${db}" 1>&2
+    fi
+    # read -a splits on whitespace without glob-expanding the names; -d '' reads
+    # to end of input, where read returns non-zero, hence the '|| true'.
+    if [[ -e "${tbls}" ]]; then
+        read -r -d '' -a tblNames < "${tbls}" || true
+    else
+        read -r -d '' -a tblNames <<< "${tbls}" || true
+    fi
+    # the ${arr[@]+...} form keeps 'set -u' quiet for an empty list on old bash
+    for tbl in ${tblNames[@]+"${tblNames[@]}"}; do
+        backupOneTable "${db}" "${tbl}"
+    done
+}
+
+backupBigFiles() {
+    # Copy every file named in a list file into the current directory.
+    #   $1 - database name (not used here)
+    #   $2 - ASCII text file holding the whitespace separated file names
+    # A missing or non-text list is reported on stderr and recorded in
+    # EXIT_STATUS instead of aborting the run.
+    local fname=$2 f
+    local -a fileNames=()
+    if [[ ! -e "${fname}" ]]; then
+        printf "ERROR: '%s' does not exist\n" "${fname}" 1>&2
+        EXIT_STATUS=1
+    # enforce fname is a list of files
+    elif [[ "$(file -L "${fname}" | cut -d' ' -f2-)" != "ASCII text" ]]; then
+        printf "ERROR: '%s' is not a list of files\n" "${fname}" 1>&2
+        EXIT_STATUS=1
+    else
+        # read -a splits on whitespace without glob-expanding; non-zero at EOF is expected
+        read -r -d '' -a fileNames < "${fname}" || true
+        for f in ${fileNames[@]+"${fileNames[@]}"}; do
+            if [[ "${verbose}" == "TRUE" ]]; then
+                printf "backing up '%s'\n" "${f}" 1>&2
+            fi
+            cp -p -- "${f}" .
+        done
+    fi
+}
+
+backupTrackDb() {
+    # if a table has a trackDb entry, back it up as <tbl>.trackDb.tab.gz
+    #   $1 - database name
+    #   $2 - table name to look up in trackDb.tableName
+    # Exits the whole script with status 1 if the trackDb row cannot be dumped.
+    local db=$1 tbl=$2
+    local where="where tableName='${tbl}'"
+    # An empty count (e.g. hgsql failed) compares as 0, so the backup is skipped.
+    if [[ $(hgsql "${db}" -Ne "select count(*) from trackDb ${where}") -ne 0 ]]; then
+        if [[ "${verbose}" == "TRUE" ]]; then
+            printf "backing up trackDb for %s\n" "${tbl}" 1>&2
+        fi
+        # Testing hgsql inside the 'if' keeps 'set -e' from aborting before the
+        # partial output is cleaned up and the error is reported.
+        if ! hgsql "${db}" -Ne "select * from trackDb ${where}" > "${tbl}.trackDb.tab"; then
+            # drop the empty or partial dump left behind by the redirection
+            rm -f -- "${tbl}.trackDb.tab"
+            printf "ERROR: backup of trackDb failed for %s\n" "${tbl}" 1>&2
+            EXIT_STATUS=1
+            exit 1
+        fi
+        # -f forces in case the script is being re-run
+        gzip -f -- "${tbl}.trackDb.tab"
+    fi
+    return 0
+}
+
+doBackup() {
+    # Back up the requested files and tables for one database ($1) into
+    # <db>/<trackSetName>/<versionName>/ below the current directory, then
+    # change back to archiveDir.
+    local db=$1
+    local outDir="${db}/${trackSetName}/${versionName}"
+    mkdir -p -- "${outDir}"
+    if [[ "${verbose}" == "TRUE" ]]; then
+        printf "created dir %s/%s/%s\n" "${db}" "${trackSetName}" "${versionName}" 1>&2
+    fi
+    cd -- "${outDir}"
+    if [[ -n "${files}" ]]; then
+        backupBigFiles "${db}" "${files}"
+    fi
+    if [[ -n "${tables}" ]]; then
+        backupTables "${db}" "${tables}"
+    fi
+    cd -- "${archiveDir}"
+}
+
+##### Parse command-line input #####
+
+#OPTIND=1 # Reset is necessary if getopts was used previously in the script.  It is a good idea to make this local in a function.
+# The leading ':' selects getopts' silent mode: the offending option letter
+# arrives in OPTARG, and a missing option argument is reported as ':'.
+while getopts ":hst:f:v:" opt; do
+    case "${opt}" in
+        h)
+            usage
+            exit 0
+            ;;
+        s) verbose="TRUE" ;;
+        t) tables="${OPTARG}" ;;
+        f) files="${OPTARG}" ;;
+        v) versionName="${OPTARG}" ;;
+        # -t, -f or -v given without a value
+        :)
+            printf "option -%s requires an argument\n" "${OPTARG}" 1>&2
+            usage >&2
+            exit 1
+            ;;
+        # any letter that is not in the option string
+        \?)
+            printf "unknown option -%s\n" "${OPTARG}" 1>&2
+            usage >&2
+            exit 1
+            ;;
+    esac
+done
+
+shift "$((OPTIND-1))" # Shift off the options and optional --.
+
+# Check number of required arguments
+# Running with no arguments at all is treated as a request for help.
+if (( $# == 0 )); then
+    usage
+    exit 0
+fi
+
+if (( $# != 3 )); then
+    # Output usage message if number of required arguments is wrong
+    # (too few or too many); the message goes to stderr like the usage text.
+    printf "expected 3 required arguments, got %d\n" "$#" 1>&2
+    usage >&2
+    exit 1
+fi
+
+# Set variables with required argument values
+archiveDir=$1
+dbList=$2
+trackSetName=$3
+
+# set the version string
+# (today's date as printed by 'date +%F' unless -v supplied one)
+if [[ -z "${versionName}" ]]; then
+    versionName=$(date +%F)
+fi
+
+# set the tables and files strings. If they are files with relative locations,
+# prefix them with a full path so later functions work
+# (doBackup changes directory before either list is read).
+#
+# -f value: only rewritten when it names an existing path that is not
+# already absolute.
+if [[ -e "${files}" && "${files}" != /* ]]; then
+    files="${PWD}/${files}"
+    if [[ "${verbose}" == "TRUE" ]]; then
+        printf "Fixing up files list to %s\n" "${files}" 1>&2
+    fi
+fi
+
+# -t value: same rule. A bare table name or quoted list of names is left
+# alone unless a path of that name happens to exist in the current directory.
+if [[ -e "${tables}" && "${tables}" != /* ]]; then
+    tables="${PWD}/${tables}"
+    if [[ "${verbose}" == "TRUE" ]]; then
+        printf "Fixing up tables list to %s\n" "${tables}" 1>&2
+    fi
+fi
+
+# set the archive parent directory location
+# (created on demand; the verbose note is only printed when the directory
+# did not exist yet)
+if [[ ! -d "${archiveDir}" ]]; then
+    if [[ "${verbose}" == "TRUE" ]]; then
+        printf "Archiving to %s\n" "${archiveDir}" 1>&2
+    fi
+    mkdir -p -- "${archiveDir}"
+fi
+
+# make a relative archive root absolute, so that it can be returned to with
+# 'cd' from inside the per-database directories
+if [[ "${archiveDir}" != /* ]]; then
+    archiveDir="${PWD}/${archiveDir}"
+fi
+
+# dbList is either a file listing databases or a whitespace separated list of
+# names. It is resolved before the cd below so that a relative list file is
+# found from the invocation directory, like the -t and -f lists.
+dbNames=()
+if [[ -f "${dbList}" ]]; then
+    read -r -d '' -a dbNames < "${dbList}" || true  # non-zero at EOF is expected
+else
+    read -r -d '' -a dbNames <<< "${dbList}" || true
+fi
+cd -- "${archiveDir}"
+for db in ${dbNames[@]+"${dbNames[@]}"}; do
+    doBackup "${db}"
+done
+exit "${EXIT_STATUS}"