16720e7dfab79a7f60e91f0cb102a213c3e4738a
max
  Fri Apr 28 15:39:08 2017 -0700
first big commit for hgGeneGraph. Others will follow as QA progresses.  refs #13634

diff --git src/utils/ggSpfToTab src/utils/ggSpfToTab
new file mode 100755
index 0000000..56d3c1f
--- /dev/null
+++ src/utils/ggSpfToTab
@@ -0,0 +1,132 @@
+#!/usr/bin/env python2.7
+
+import logging, sys, optparse, string, os
+from collections import defaultdict
+from os.path import join, basename, dirname, isfile
+
+# links are generated like this:
+# http://biocyc.org/HUMAN/NEW-IMAGE?type=PATHWAY&object=BETA-ALA-DEGRADATION-I-PWY
+# http://www.reactome.org/cgi-bin/link?SOURCE=Reactome&ID=REACT_1
+# http://pantherdb.org/pathway/pathwayDiagram.jsp?catAccession=P04376
+
+# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
+# Runs at module level: 'options' and 'args' are globals read further down the script.
+
+parser = optparse.OptionParser(
+    "usage: %prog [options] superpathwayDirectory - parse Kyrle's superpathway input files into tab format excluding PID")
+parser.add_option("-d", "--debug", action="store_true", dest="debug",
+                  help="show debug messages")
+parser.add_option("-g", "--genes", action="store", dest="toGenes",
+                  help="resolve a name to genes, print result and stop")
+options, args = parser.parse_args()
+
+# default verbosity is INFO; -d/--debug switches the root logger to DEBUG
+logging.basicConfig(level=logging.DEBUG if options.debug else logging.INFO)
+# ==== FUNCTIONS =====
+    
+def parseSpf(inFname, comp, fam, edges, ign):
+    """ parse one spf file into the four containers, which are modified in place and also returned.
+    comp is complex name -> set of members, fam is family name -> set of members, ign is the set of names
+    seen on "rna" lines and edges is a list of (cause, theme, relType, srcDb, acc). srcDb and acc come from the
+    name of the directory holding the file, "<srcDb>_<acc>[.suffix]". A path that is not a file is skipped.
+    """
+    if not isfile(inFname):
+        return comp, fam, edges, ign
+
+    logging.info(inFname)
+    # the pathway reference depends only on the path, so it is worked out once, not once per line
+    pwRef = basename(dirname(inFname)).split(".")[0]
+    # partition() never raises: a directory name without "_" (e.g. single-file mode) gives acc == ""
+    srcDb, _, acc = pwRef.partition("_")
+    with open(inFname) as inFh: # the file is closed even if a line fails to parse
+        for line in inFh:
+            # example line (tab-separated): TRAF2  TNF-alpha_TNF-R1_TRAPP_RIP1_TRAF2_Complex_(complex)  component>
+            parts = line.rstrip("\n").split("\t")
+            lastField = parts[-1]
+            if parts[0]=="rna":
+                ign.add(lastField)
+            if lastField == "component>":
+                member, complexName = parts[:2]
+                comp[complexName].add(member)
+            elif lastField == "member>":
+                member, famName = parts[:2]
+                fam[famName].add(member)
+            elif lastField.startswith("-a"):
+                cause, theme = parts[:2]
+                relType = "activate" if lastField[-1]==">" else "inhibits"
+                edges.append( (cause, theme, relType, srcDb, acc) )
+    return comp, fam, edges, ign
+
+def nameToGenes(name, comp, fam):
+    """ expand a name into the flat list of gene symbols behind it.
+    A name that is a key of comp (checked first) or of fam is replaced by the expansion of
+    each of its members, recursively; any other name is taken to be a gene symbol and
+    returned as a one-element list. Symbols reachable through several members are repeated.
+    NOTE: nothing guards against a complex/family that contains itself, directly or
+    indirectly - such input recurses until the interpreter's recursion limit is hit.
+    """
+    # complexes win over families when a name is a key in both
+    for groups in (comp, fam):
+        if name not in groups:
+            continue
+        expanded = []
+        for member in groups[name]:
+            expanded += nameToGenes(member, comp, fam)
+        return expanded
+    # neither a complex nor a family
+    return [name]
+
+def resolveRef(name, comp, fam):
+    """ classify a name and expand it: returns (entType, name, geneStr), where entType is
+    "complex" if name is a key of comp, else "family" if it is a key of fam, else "gene",
+    and geneStr is the result of nameToGenes() joined with "|".
+    """
+    geneStr = "|".join(nameToGenes(name, comp, fam))
+    if name in comp:
+        return ("complex", name, geneStr)
+    if name in fam:
+        return ("family", name, geneStr)
+    return ("gene", name, geneStr)
+# ----------- MAIN --------------
+# Parses every graph.spf found, then prints one tab-separated row per activation/inhibition edge to stdout.
+if args==[]:
+    parser.print_help()
+    sys.exit(1) # sys.exit as further down; the bare exit() only exists when the site module is loaded
+
+inDir = args[0]
+
+comp = defaultdict(set) # complex name -> members
+fam = defaultdict(set)  # family name -> members
+ign = set()             # names from "rna" lines; edges touching them are dropped
+edges = []              # (cause, theme, relType, srcDb, acc)
+
+# the argument may also be a single spf file instead of a directory of pathway sub-directories
+if isfile(inDir):
+    inFnames = [inDir]
+else:
+    inDirs = os.listdir(inDir)
+    inFnames = [join(inDir, subDir, "graph.spf") for subDir in inDirs]
+
+# parseSpf skips paths that are not files, e.g. sub-directories without a graph.spf
+for inFname in inFnames:
+    comp, fam, edges, ign = parseSpf(inFname, comp, fam, edges, ign)
+
+if options.toGenes:
+    print nameToGenes(options.toGenes, comp, fam)
+    sys.exit(0)
+
+# output columns: cause type, cause name, cause genes, theme type, theme name, theme genes,
+# relation type, (empty), source database, pathway accession, (empty)
+for (cause, theme, relType, srcDb, acc) in edges:
+    if srcDb=="PID":
+        continue
+    if cause in ign or theme in ign:
+        continue
+    row = list(resolveRef(cause, comp, fam))
+    row.extend(resolveRef(theme, comp, fam))
+    row.append(relType)
+    row.append("")
+    row.append(srcDb)
+    row.append(acc)
+    row.append("")
+    print "\t".join(row)