src/hg/makeDb/scripts/variantsMuts/usherDescToBed 99acbcf9be0a3ba5974c153d54e021fd9eb9feda

99acbcf9be0a3ba5974c153d54e021fd9eb9feda
max
  Thu Dec 2 07:44:49 2021 -0800
adding makedoc for omicron mutations, refs #28579

diff --git src/hg/makeDb/scripts/variantsMuts/usherDescToBed src/hg/makeDb/scripts/variantsMuts/usherDescToBed
new file mode 100755
index 0000000..314659b
--- /dev/null
+++ src/hg/makeDb/scripts/variantsMuts/usherDescToBed
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+import logging, sys, optparse, re
+from collections import defaultdict
+import json
+from os import system
+from os.path import join, basename, dirname, isfile
+
+# ==== functions =====
+    
+def parseArgs():
+    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
+    parser = optparse.OptionParser("usage: %prog [options] usherFname jsonFname outputBase - outputs a wuhCor1 BED file given an usher description of nucleotide variants and json file from cov-lineages. creates two files: outBase_nuc.bb and outBase_prot.bb ")
+
+    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+    (options, args) = parser.parse_args()
+
+    if args==[]:
+        parser.print_help()
+        exit(1)
+
+    if options.debug:
+        logging.basicConfig(level=logging.DEBUG)
+        logging.getLogger().setLevel(logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+        logging.getLogger().setLevel(logging.INFO)
+
+    return args, options
+# ----------- main --------------
+def main():
+    args, options = parseArgs()
+
+    usherFname, jsonFname, outBase = args
+    nucBed = outBase+"_nuc.bed"
+    protBed = outBase+"_prot.bed"
+
+    usherText = open(usherFname).read().strip()
+    ofh = open(nucBed, "w")
+    # C14408T > C241T > C3037T > A23403G > G28881A > G28883C > G28882A > C21846T > C23604A > A23063T > T22882G,C22995A,A23013C,C23525T,C24130A > A2832G,T5386G,G8393A,C10029T,C10449A,A11537G,T13195C,C15240T,A18163G,C21762T,G22578A,T22673C,C22674T,T22679C,G22813T,G22898A,G22992A,A23040G,G23048A,A23055G,C23202A,T23599G,C23854A,G23948T,A24424T,T24469A,C24503T,C25000T,C25584T,C26270T,A26530G,C26577G,G26709A,A27259C,C27807T,A28271T,C28311T
+    usherText = usherText.replace(" > ", ",")
+    muts = usherText.split(",")
+    for mut in muts:
+        fromNuc = mut[:1]
+        toNuc = mut[-1:]
+        pos = mut[1:-1]
+        row = ["NC_045512v2", str(int(pos)-1), pos, mut]
+        ofh.write("\t".join(row))
+        ofh.write("\n")
+    ofh.close()
+
+    covData = json.load(open(jsonFname))
+
+    protToStart = {
+            # see https://github.com/nextstrain/ncov/blob/master/defaults/reference_seq.gb
+            "spike" : 21562,
+            "orf1" : 265,
+            "orf1a" : 265,
+            "orf1b" : 13467,
+            "e" : 26244,
+            "m" : 26522,
+            "n" : 28273,
+            }
+
+    ofh = open(protBed, "w")
+    for site in covData["sites"]:
+        #print(site)
+        # nuc:C25000T
+        siteParts = site.split(":")
+        loc = siteParts[0]
+        mut = siteParts[1]
+
+        #print(mut)
+        mo = re.match(r'([A-Z]*)([0-9]+)([A-Z]*)', mut)
+        fromSeq, pos, toSeq = mo.groups()
+
+        if loc=="nuc":
+            # nuc:C25000T
+            row = ("NC_045512v2", str(int(pos)-1), pos, mut)
+        elif loc=="del":
+            # del:11283:9
+            delLen = int(siteParts[2])
+            delStart = int(mut)-1
+            delEnd = delStart+delLen
+            row = ("NC_045512v2", str(delStart), str(delEnd), "del_"+str(delLen))
+        else:
+            # spike:N969K
+            protName = loc
+            protStart = protToStart[protName]
+            nucStart = protStart+ 3*(int(pos)-1)
+            row = ("NC_045512v2", str(nucStart), str(nucStart+3*len(fromSeq)), mut)
+
+        ofh.write("\t".join(row))
+        ofh.write("\n")
+
+    ofh.close()
+
+    # convert to bigBed
+    cmd = "bedSort %s %s" % (nucBed, nucBed)
+    system(cmd)
+    cmd = "bedSort %s %s" % (protBed, protBed)
+    system(cmd)
+
+    nucBb = outBase+"_nuc.bb"
+    protBb = outBase+"_prot.bb"
+    cmd = "bedToBigBed %s /hive/data/genomes/wuhCor1/chrom.sizes %s" % (nucBed, nucBb)
+    system(cmd)
+    cmd = "bedToBigBed %s /hive/data/genomes/wuhCor1/chrom.sizes %s" % (protBed, protBb)
+    system(cmd)
+
+    print("Wrote %s and %s" % (nucBb, protBb))
+
+
+
+main()