src/utils/phegenToBed 7140201cf33552ce442df7f585cda379c2fbb8bf

7140201cf33552ce442df7f585cda379c2fbb8bf
max
  Sun Mar 4 22:13:53 2018 -0800
adding phegeni tools and trackDb, refs #21052

diff --git src/utils/phegenToBed src/utils/phegenToBed
new file mode 100755
index 0000000..df4ce95
--- /dev/null
+++ src/utils/phegenToBed
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+import logging, sys, optparse
+from collections import defaultdict
+from os.path import join, basename, dirname, isfile
+
+# ==== functions =====
+    
+def parseArgs():
+    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
+    parser = optparse.OptionParser("usage: %prog [options] PhenGenFile snpBed - rewrite PhenGen file to a bed8+")
+
+    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+    #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") 
+    #parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
+    (options, args) = parser.parse_args()
+
+    if args==[]:
+        parser.print_help()
+        exit(1)
+
+    if options.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+    return args, options
+# ----------- main --------------
+def main():
+    args, options = parseArgs()
+
+    filename = args[0]
+    snpBed = args[1]
+
+    #	Trait	SNP rs	Context	Gene	Gene ID	Gene 2	Gene ID 2	Chromosome	Location	P-Value	Source	PubMed	Analysis ID	Study ID	Study Name
+    bySnp = defaultdict(list)
+    for line in open(filename):
+        if line.startswith("#"):
+            continue
+        row = line.rstrip("\n").split("\t")
+        _, trait, snpId, context, gene, geneId, gene2, geneId2, chrom, start, pVal, source, pmid, analysisId, studyId, studyName = row
+        #if source=="NHGRI": # remove GWAS catalog for now?
+            #continue
+        # need phsxxxxxx for linkout to NCBI
+        if row[-2]!="":
+            row[-2] = "phs%06d" % int(row[-2])
+        extraCount = len(row)
+        bySnp[snpId].append(row)
+
+    bySnp = dict(bySnp)
+
+    for line in open(snpBed):
+        line = line.rstrip("\n")
+        snpId = line.rstrip("\n").split('\t')[3]
+        chrom, start, end, name = line.split("\t") # has chrom, start, end, name
+        bed = [chrom, start, end, name]
+        rows = bySnp.get(snpId.replace("rs",""), None)
+        if rows==None:
+            continue
+
+        bed.append(str(len(rows)))
+        bed.append(".")
+        bed.append(start)
+        bed.append(end)
+        bed.append("0")
+
+        bedCount = len(bed)
+
+        extras = []
+        for i in range(1, extraCount):
+            extras.append( [] )
+
+        for row in rows:
+            for i in range(1, len(row)):
+                #print row, i, len(row)
+                extras[i-1].append(row[i].replace(",", ";"))
+
+        extras = [", ".join(x) for x in extras]
+
+        bed.extend(extras)
+
+        # fixup a few things
+        bed[10] = "rs"+bed[10]
+        # use trait as name, shorten
+        traitShort = bed[9].split(",")[0].split(';')[0].replace(" Disorders", "").replace(" Tests", "")
+        if len(traitShort) > 17:
+            traitShort = traitShort[:16]+".."
+        bed[3] = traitShort
+
+        print "\t".join(bed)
+
+main()