96d63c692fc15357af3800b6a0c363a18d33db89
max
  Sat Jun 3 08:45:23 2017 -0700
adding support and a converter for the OpenBEL dataset, refs #13634

diff --git src/utils/ggBelCsvToTab src/utils/ggBelCsvToTab
new file mode 100755
index 0000000..fc668b6
--- /dev/null
+++ src/utils/ggBelCsvToTab
@@ -0,0 +1,277 @@
+#!/usr/bin/env python2
+
+import logging, sys, optparse, itertools, json, codecs
+from collections import defaultdict, namedtuple
+from os.path import join, basename, dirname, isfile
+#import xml.etree.ElementTree as et
+import xml.etree.cElementTree as et
+
+HGNCFILE="/hive/data/outside/hgnc/current/hgnc_complete_set.txt"
+
+# output file headers
+headers = "eventId,causeType,causeName,causeGenes,themeType,themeName,themeGenes,relType,relSubtype,sourceDb,sourceId,sourceDesc,pmids,evidence".split(",")
+
+# ID of event in output file, goes across input files, so global
+eventId = 0
+
+# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
+parser = optparse.OptionParser("""usage: %prog [options] filename - convert BEL .csv files (pybel export) to the tab-sep pathway format
+""")
+
+parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+#parser.add_option("-k", "--keggDir", dest="keggDir", action="store", help="the KEGG ftp mirror directory on disk, default %default", default="/hive/data/outside/kegg/06032011/ftp.genome.jp/pub/kegg") 
+parser.add_option("-s", "--hgncFname", dest="hgncFname", action="store", help="the HGNC tab file on disk, default %default", default=HGNCFILE)
+#parser.add_option("-u", "--uniprotFname", dest="uniprotFname", action="store", help="the uniprot file from the pubs parser, default %default", default="/hive/data/inside/pubs/parsedDbs/uniprot.9606.tab")
+#parser.add_option("-f", "--file", dest="file", action="store", help="run on file") 
+#parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
+(options, args) = parser.parse_args()
+
+if options.debug:
+    logging.basicConfig(level=logging.DEBUG)
+else:
+    logging.basicConfig(level=logging.INFO)
+# ==== FUNCTIONs =====
+def lineFileNext(fh):
+    """ 
+        parses tab-sep file with headers as field names 
+        yields collection.namedtuples
+        strips "#"-prefix from header line
+    """
+    line1 = fh.readline()
+    line1 = line1.strip("\n").strip("#")
+    headers = line1.split("\t")
+    headers = [h.replace(" ", "_") for h in headers]
+    headers = [h.replace("(", "") for h in headers]
+    headers = [h.replace(")", "") for h in headers]
+    headers = [h.replace(".", "") for h in headers]
+    headers = [h.replace("-", "_") for h in headers]
+    Record = namedtuple('tsvRec', headers)
+
+    for line in fh:
+        line = line.rstrip("\n")
+        fields = line.split("\t")
+        try:
+            rec = Record(*fields)
+        except Exception, msg:
+            logging.error("Exception occured while parsing line, %s" % msg)
+            logging.error("Filename %s" % fh.name)
+            logging.error("Line was: %s" % repr(line))
+            logging.error("Does number of fields match headers?")
+            logging.error("Headers are: %s" % headers)
+            #raise Exception("wrong field count in line %s" % line)
+            continue
+        # convert fields to correct data type
+        yield rec
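+
+# A hypothetical usage sketch of lineFileNext (file name, header and values are made up):
+#   for rec in lineFileNext(open("genes.tab")):   # file starts with "#symbol\tname"
+#       print rec.symbol, rec.name                # rec is e.g. tsvRec(symbol='TP53', name='tumor protein p53')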
+
+def trySymFromName(nameStr, accToSym):
+    # try a few things to get the official symbol of a synonym
+    # handles complexes and returns a | sep list for them
+
+    if "/" in nameStr:
+        parts = nameStr.split("/")
+        syms = []
+        for p in parts:
+            p = p.strip()
+            pSym = trySymFromName(p, accToSym)
+            if pSym!="":
+                syms.append(pSym)
+        if len(syms)!=0:
+            return "|".join(syms)
+
+    if nameStr == "":
+        nameStr = accTonameStr.get(nameStr, "")
+    if nameStr=="":
+        nameStr = accTonameStr.get(nameStr.split("-")[0].lower(), "")
+    if nameStr=="":
+        nameStr = accTonameStr.get(nameStr.split("/")[0].lower(), "")
+    if nameStr=="":
+        nameStr = accTonameStr.get(nameStr.split()[0].lower(), "")
+    # give up
+    return ""
+
+def pipeSplitAddAll(string, dict, key, toLower=False):
+    " split on pipe and add all values to dict with key "
+    for val in string.split("|"):
+        if toLower:
+            val = val.lower()
+        dict[val]=key
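+
+# Example (accessions are illustrative): pipeSplitAddAll("ACC1|ACC2", d, "TP53")
+# sets d["ACC1"] = d["ACC2"] = "TP53".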
+
+def parseHgnc(fname, addEntrez=False):
+    " return two dicts: uniprot -> symbol and alias -> symbol from the HGNC tab-sep file "
+    upToSym = {}
+    skipSyms = set()
+    aliasToSym = defaultdict(set)
+    clashList = []
+    for row in lineFileNext(open(fname)):
+        sym = row.symbol
+        if "withdrawn" in sym:
+            continue
+
+        aliasList = [sym]
+        #aliasList.append(row.Approved_Name)
+        #aliasList.extend(splitQuote(row.Previous_Names))
+        #aliasList.extend(splitQuote(row.Name_Synonyms))
+        aliasList.extend(splitQuote(row.prev_symbol, isSym=True))
+        aliasList.extend(splitQuote(row.alias_symbol, isSym=True))
+
+        for n in aliasList:
+            if n in aliasToSym:
+                oldSyms = aliasToSym[n]
+                if sym not in oldSyms:
+                    clashList.append("%s->%s/%s" % (n, ",".join(oldSyms), sym))
+                #print "clash: %s with %s" % (n, ",".join(oldSyms))
+            else:
+                aliasToSym[n].add(sym)
+
+        upAcc = row.uniprot_ids
+        if upAcc=="" or upAcc=="-":
+            continue
+        if upAcc in upToSym:
+            #logging.debug("uniprot accession %s assigned to %s, but already assigned to symbol %s" % (upAcc, sym, upToSym[upAcc]))
+            skipSyms.add(sym)
+            continue
+        upToSym[upAcc] = sym
+    logging.info("Skipped these symbols due to duplicate uniprot IDs: %s" % ",".join(skipSyms))
+    logging.info("%d symbol clashes: %s" % (len(clashList),  clashList))
+    return upToSym, aliasToSym
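+
+# The two returned dicts map, e.g. (entries are illustrative):
+#   upToSym["P04637"]  == "TP53"
+#   aliasToSym["p53"]  == set(["TP53"])   # defaultdict(set): alias -> approved symbol(s)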
+
+def flattenCsv(member):
+    " convert pybel csv member to type and list of genes, as in our format "
+    mType = member[0]
+
+    if member[1]=="HGNC":
+        #sDb = member[1]
+        #if sDb!="HGNC":
+            #return None
+        sym = member[2]
+        newSyms = aliasToSym[sym]
+        return "gene", "", "|".join(newSyms)
+
+    if mType=="Complex":
+        parts = member[1:]
+        partSyms = []
+        for p in parts:
+            sDb = p[1]
+            if sDb!="HGNC":
+                continue
+
+            sym = p[2]
+            partSyms.extend(aliasToSym[sym])
+        return "complex", "", "|".join(partSyms)
+
+    if mType=="Abundance":
+        # ('Abundance', 'CHEBI', 'metformin')
+        name = member[2]
+        return "compound", name, ""
+
+def parseBelCsv(fname, aliasToSym):
+    " parse BEL csv format and return as rows "
+    skipCount = 0
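+    # two example input lines (tab-separated: subject term, object term, attribute dict):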
+    #('Protein', 'HGNC', 'HOXD3')	('RNA', 'HGNC', 'TSPAN7')	{'subject_effect_namespace': 'bel', 'citation_type': 'PubMed', 'citation_name': 'Oncogene 2002 Jan 24 21(5) 798-808', 'evidence': 'Table 1 - Gene regulated by overexpressing HOXD3 in A549 lung cancer cell lines by more than 1.6 fold.', 'subject_modifier': 'Activity', 'relation': 'increases', 'line': 62002, 'subject_effect_name': 'tscript', 'citation_reference': '11850808'}
+    #('Abundance', 'CHEBI', 'palmitic acid') ('Complex', ('Protein', 'MGI', 'Myd88'), ('Protein', 'MGI', 'Tlr2')) {'citation_type': 'PubMed', 'citation_name': 'J Biol Chem 2006 Sep 15 281(37) 26865-75', 'evidence': 'Treatment with palmitate rapidly induced the association of myeloid differentiation factor 88 (MyD88) with the TLR2 receptor, activated the stress-linked kinases p38, JNK, and protein kinase C, induced degradation of IkappaBalpha, and increased NF-kappaB DNA binding...TLR2 mediates the initial events of fatty acid-induced insulin resistance in muscle', 'relation': 'increases', 'line': 205920, 'citation_reference': '16798732'}
+
+    global eventId
+    for line in codecs.open(fname, "r", encoding="utf8"):
+        #print line, len(line)
+        line = line.rstrip("\n").rstrip(" ")
+        fs = line.split("\t")
+        fs = [x.encode("latin1") for x in fs]
+
+        # some lines have invalid syntax, wrong quote characters;
+        # opened a github issue, but skipping them for now, only 18 lines
+        try:
+            source = eval(fs[0])
+        except:
+            skipCount += 1
+            continue
+        sType = source[0]
+
+        target = eval(fs[1])
+        tType = target[0]
+
+        try:
+            attrs = eval(fs[2])
+        except:
+            skipCount += 1
+            continue
+
+        #print sType, tType
+
+        # hasComponent only describes complex membership, not a regulatory event, so skip it
+        if attrs["relation"]=="hasComponent":
+            continue
+
+        g1 = flattenCsv(source)
+        g2 = flattenCsv(target)
+        if g1 is None or g2 is None:
+            continue
+        g1Type, g1Name, g1Genes = g1
+        g2Type, g2Name, g2Genes = g2
+        pmid = attrs.get("citation_reference", "")
+        evid = attrs.get("evidence", "")
+        if len(evid)>500:
+            evid = evid[:500]+"..."
+
+        relType = attrs.get("relation", "")
+
+        # convert subject/object modifiers to a single descriptive string
+        parts = []
+        mod1 = attrs.get("subject_modifier")
+        mod2 = attrs.get("object_modifier")
+        if mod1 is not None:
+            parts.append("gene1: %s" % mod1)
+        if mod2 is not None:
+            parts.append("gene2: %s" % mod2)
+        relSubtype = ", ".join(parts)
+
+        # fields are:
+        #"eventId,causeType,causeName,causeGenes,themeType,themeName,themeGenes,relType,relSubtype,sourceDb,sourceId,sourceDesc,pmids,evidence
+        # openBel is source -> target
+        # but our format is target <- source, because I copied Hoifung's format
+        row = [
+                "belLarge"+str(i),g2Type, g2Name, g2Genes, g1Type, g1Name, g1Genes,
+                relType, relSubtype,
+                "belLarge","9ea3c170-01ad-11e5-ac0f-000c29cb28fb", "Selventa BEL large corpus",
+                pmid, evid
+              ]
+        yield row
+        eventId += 1
+
+    logging.info("Could not parse %d lines" % skipCount)
+
+def splitQuote(name, isSym=False):
+    """ try to split quoted names on , """
+    if '"' in name:
+        # first split quoted names
+        names = name.split('", "')
+    else:
+        # if there are no quotes, also try to split on just commas
+        names = name.split(",")
+    names = [n.strip('"') for n in names]
+    names = [n.strip() for n in names]
+    names = [n for n in names if n!=""]
+    #names = [unidecode.unidecode(n) for n in names]
+    # make sure there are no commas left, if symbol
+    if isSym:
+        for n in names:
+            assert("," not in n)
+    return names
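+
+# Example (symbols are illustrative): splitQuote('"MRX76", "MRXS14"', isSym=True)
+# returns ['MRX76', 'MRXS14']; unquoted input is simply split on commas.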
+
+# ----------- MAIN --------------
+if args==[]:
+    parser.print_help()
+    exit(1)
+
+filename = args[0]
+
+#uniprotTable = "/hive/data/inside/pubs/parsedDbs/uniprot.9606.tab"
+upToSym, aliasToSym = parseHgnc(options.hgncFname)
+#accToSym = parseUniprot(options.uniprotFname, accToSym)
+
+print "#"+"\t".join(headers)
+
+logging.debug(filename)
+rows = parseBelCsv(filename, aliasToSym)
+for row in rows:
+    l = "\t".join(row)
+    print l
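+
+# Example invocation (file name is hypothetical):
+#   ggBelCsvToTab large_corpus.bel.csv > belLarge.tab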