df0e81c5c77a7f42182f5d5e7a30905d8b31999b
max
  Thu Apr 16 09:55:41 2020 -0700
adding cronjob code for user annotations track, refs #25297

diff --git src/utils/googleSheetToBigBed src/utils/googleSheetToBigBed
new file mode 100755
index 0000000..de65bd2
--- /dev/null
+++ src/utils/googleSheetToBigBed
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+
+import logging, sys, optparse
+from collections import defaultdict
+from os.path import join, isfile
+import hashlib, re, subprocess, os
+
+chromName = "NC_045512v2"
+chromSizes = "/hive/data/genomes/wuhCor1/chrom.sizes"
+TMPDIR = "/scratch/tmp"
+MAXLINES = 1000
+MAXFIELDLEN = 16000
+DEBUG = False
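+# NC_045512v2 is the single chromosome of the UCSC wuhCor1 (SARS-CoV-2) assembly.
+# MAXLINES and MAXFIELDLEN cap the number of sheet rows and the per-field length,
+# so a runaway spreadsheet cannot produce an oversized track.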
+
+asBed4 = """table bedDetail
+"Browser extensible data, with extended fields "
+    (
+    string chrom;      "Reference sequence chromosome or scaffold"
+    uint   chromStart; "Start position in chromosome"
+    uint   chromEnd;   "End position in chromosome"
+    string name;       "Short Name of item"
+    uint   score;      "Score from 0-1000"
+    char[1] strand;    "+ or -"
+    uint thickStart;   "Start of where display should be thick (start codon)"
+    uint thickEnd;     "End of where display should be thick (stop codon)"
+    uint reserved;     "Used as itemRgb as of 2004-11-22"
+"""
+# ==== functions =====
+
+def parseArgs():
+    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
+    parser = optparse.OptionParser("usage: %prog credentialsJson URL bbFname [options] - convert tsv to bigBed, taking care to skip invalid rows. Credentials JSON is a Google Sheets API json credentials file. For an example see /cluster/home/max/.user-annots-browser-track-c513a43ffff8.json")
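+    # example invocation (illustrative file names, not the deployed paths):
+    #   googleSheetToBigBed ~/.sheets-creds.json 'https://docs.google.com/spreadsheets/d/...' userAnnots.bb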
+
+    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+    (options, args) = parser.parse_args()
+
+    if len(args)!=3:
+        parser.print_help()
+        exit(1)
+
+    if options.debug:
+        logging.basicConfig(level=logging.DEBUG)
+        logging.getLogger().setLevel(logging.DEBUG)
+        global DEBUG
+        DEBUG=True
+    else:
+        logging.basicConfig(level=logging.INFO)
+        logging.getLogger().setLevel(logging.INFO)
+
+    return args, options
+
+def cleanString(s):
+    " remove any non-alnum and space characters from string "
+    pat = re.compile(r'[ \W_-]+')
+    return pat.sub('', s)
+
+def openSheet(credFname, url):
+    import gspread
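+    # gspread/oauth2client log verbosely during auth; silence them unless debugging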
+    if not DEBUG:
+        logging.getLogger().setLevel(logging.CRITICAL)
+
+    from oauth2client.service_account import ServiceAccountCredentials
+    scope = ['https://spreadsheets.google.com/feeds',
+                     'https://www.googleapis.com/auth/drive']
+
+    credentials = ServiceAccountCredentials.from_json_keyfile_name(credFname, scopes=scope)
+    gc = gspread.authorize(credentials)
+    sh = gc.open_by_url(url)
+    wks = sh.get_worksheet(0)
+    rows = wks.get_all_values()
+    if DEBUG:
+        logging.getLogger().setLevel(logging.DEBUG)
+    else:
+        logging.getLogger().setLevel(logging.INFO)
+    return rows
+
+def openTsvRows(credFname, url, oldMd5):
+    " return (headers, rows, newMd5) if the sheet content differs from oldMd5, otherwise (None, None, None) "
+
+    inRows = openSheet(credFname, url)
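+    # flatten the whole sheet into one string and fingerprint it, so the
+    # bigBed is only rebuilt when the sheet content has actually changed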
+    rowsAsStrings = ["|".join(row) for row in inRows]
+    inStr = "||".join(rowsAsStrings)
+    md5 = hashlib.md5(inStr.encode("utf8")).hexdigest()
+    logging.debug("New Md5: %s" % md5)
+
+    if md5==oldMd5:
+        logging.debug("No change. Stop.")
+        return None, None, None
+
+    headers = None
+    rows = []
+
+    i = 0
+    for row in inRows:
+        i+=1
+        if i > MAXLINES:
+            print("too many lines, only the first %d are used" % MAXLINES)
+            break
+
+        if len(row)==0:
+            logging.debug("line %d: empty line" % i)
+            continue
+
+        if str(row[0]).startswith("#"):
+            continue
+
+        if row[0]=="Start" and headers is None:
+            headers = row
+            continue
+
+        assert headers is not None, "line %d: data row appears before the header row" % i
+        if len(row)!=len(headers):
+            print("line %d: number of fields is %d, but have %d headers" % (i, len(row), len(headers)))
+            continue
+
+        if not str(row[0]).isdigit():
+            print("line %d: start pos is not a number" % (i))
+            continue
+        if not str(row[1]).isdigit():
+            print("line %d: end pos is not a number" % (i))
+            continue
+
+        start = int(row[0])
+        end = int(row[1])
+
+        if end < start:
+            print("line %d: end pos is not after start pos" % (i))
+            continue
+        if start < 0 or end < 0:
+            print("line %d: start or end pos is negative" % (i))
+            continue
+
+        row[0] = start
+        row[1] = end
+
+        logging.debug("Accepted row: %s" % row)
+        rows.append(row)
+
+    assert(headers[0]=="Start")
+    assert(headers[1]=="End")
+    return headers, rows, md5
+
+def makeBb(headers, rows, bbFname):
+    " make bed and autosql file and convert to bb file "
+    asFname = join(TMPDIR, "temp.as")
+    afh = open(asFname, "w")
+    afh.write(asBed4)
+
+    otherHeaders = headers[3:]
+    for colIdx, oh in enumerate(otherHeaders):
+        fieldName = "extraField%d" % colIdx
+        fieldDesc = cleanString(oh)
+        fieldDesc = fieldDesc[:150] # limit field description length
+        afh.write('     lstring %s;   "%s"\n' % (fieldName, fieldDesc))
+
+    afh.write(")\n")
+    afh.close()
+
+    bedFname = join(TMPDIR, "temp.bed")
+    bedFh = open(bedFname, "wt")
+
+    rows.sort(key=lambda r: (r[0], r[1])) # start/end are ints now, so this sorts numerically
+
+    for row in rows:
+        newRow = [chromName, row[0], row[1]]
+        bedName = row[2]
+        if len(bedName) > 17:
+            bedName = bedName[:16]+"..."
+        newRow.append(bedName)
+
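+        # fill in the remaining bed9 fields: score, strand, thickStart/thickEnd
+        # (set to the item extent) and itemRgb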
+        newRow.extend(["0", "+", row[0], row[1], "0"])
+
+        otherCols = row[3:]
+        for oc in otherCols:
+            if oc.startswith("http"):
+                oc = "<a target=_blank href='%s'>%s</a>" % (oc, oc)
+            newRow.append(oc[:MAXFIELDLEN])
+
+        newRow = [str(x) for x in newRow]
+
+        bedFh.write("\t".join(newRow))
+        bedFh.write("\n")
+        logging.debug("Wrote bed row: %s" % newRow)
+    bedFh.close()
+
+    bbTmp = bbFname+".tmp"
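+    # -tab: fields are tab-separated; -type=bed9+ with -as: bed9 plus the extra
+    # lstring columns declared in the autoSql file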
+    cmd = ["bedToBigBed", "-tab", "-type=bed9+", "-as="+asFname, bedFname, chromSizes, bbTmp]
+
+    proc = subprocess.run(cmd, stderr=subprocess.DEVNULL)
+    if proc.returncode != 0:
+        print("could not convert to bigBed")
+        sys.exit(1)
+
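+    # write to a temp file first, then rename: the rename is atomic, so readers
+    # never see a half-written bigBed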
+    os.rename(bbTmp, bbFname)
+
+# ----------- main --------------
+def main():
+    args, options = parseArgs()
+
+    credFname, url, bbFname = args
+
+    oldMd5 = None
+    statusFname = bbFname+".md5"
+    if isfile(statusFname) and not options.debug:
+        with open(statusFname) as fh:
+            oldMd5 = fh.read()
+        logging.debug("Old md5: %s" % oldMd5)
+
+    headers, rows, newMd5 = openTsvRows(credFname, url, oldMd5)
+
+    if headers is None:
+        return
+
+    open(statusFname, "wt").write(newMd5)
+
+    logging.debug("rows: %s" % rows)
+    makeBb(headers, rows, bbFname)
+
+if __name__ == "__main__":
+    main()
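+
+# example cron entry (illustrative schedule and paths, adapt to the deployment):
+# */15 * * * * /path/to/googleSheetToBigBed ~/.sheets-creds.json 'https://docs.google.com/spreadsheets/d/...' /gbdb/wuhCor1/userAnnots.bb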