src/utils/ggBelCsvToTab 884a11762453d7f4141aed366f3d1eaa2a428a20

884a11762453d7f4141aed366f3d1eaa2a428a20
max
  Sun Jun 4 23:59:50 2017 -0700
a few more fixes for openbel and gpml, refs #13634

diff --git src/utils/ggBelCsvToTab src/utils/ggBelCsvToTab
index 48ace46..5965442 100755
--- src/utils/ggBelCsvToTab
+++ src/utils/ggBelCsvToTab
@@ -51,56 +51,30 @@
         line = line.rstrip("\n")
         fields = line.split("\t")
         try:
             rec = Record(*fields)
         except Exception, msg:
             logging.error("Exception occured while parsing line, %s" % msg)
             logging.error("Filename %s" % fh.name)
             logging.error("Line was: %s" % repr(line))
             logging.error("Does number of fields match headers?")
             logging.error("Headers are: %s" % headers)
             #raise Exception("wrong field count in line %s" % line)
             continue
         # convert fields to correct data type
         yield rec
 
-def trySymFromName(nameStr, accToSym):
-    # try a few things to get the official symbol of a synonym
-    # handles complexes and returns a | sep list for them
-
-    if "/" in nameStr:
-        parts = nameStr.split("/")
-        syms = []
-        for p in parts:
-            p = p.strip()
-            pSym = trySymFromName(p, accToSym)
-            if pSym!="":
-                syms.append(pSym)
-        if len(syms)!=0:
-            return "|".join(syms)
-
-    if nameStr == "":
-        nameStr = accTonameStr.get(nameStr, "")
-    if nameStr=="":
-        nameStr = accTonameStr.get(nameStr.split("-")[0].lower(), "")
-    if nameStr=="":
-        nameStr = accTonameStr.get(nameStr.split("/")[0].lower(), "")
-    if nameStr=="":
-        nameStr = accTonameStr.get(nameStr.split()[0].lower(), "")
-    # give up
-    return ""
-
 def pipeSplitAddAll(string, dict, key, toLower=False):
     " split on pipe and add all values to dict with key "
     for val in string.split("|"):
         if toLower:
             val = val.lower()
         dict[val]=key
 
 def parseHgnc(fname, addEntrez=False):
     " return two dicts: uniprot -> symbol, set of validSyms and alias -> symbol from the HGNC tab-sep file "
     upToSym = {}
     skipSyms = set()
     aliasToSym = defaultdict(set)
     clashList = []
     validSyms = set()
     for row in lineFileNext(open(fname)):
@@ -139,57 +113,70 @@
     return validSyms, upToSym, aliasToSym
 
 def flattenCsv(member, validSyms, aliasToSym):
     " convert pybel csv member to type and list of genes, as in our format "
     mType = member[0]
 
     if member[1]=="HGNC":
         #sDb = member[1]
         #if sDb!="HGNC":
             #return None
         sym = member[2]
         if sym in validSyms:
             newSyms = [sym]
         else:
             newSyms = aliasToSym[sym]
-        return "gene", sym, "|".join(newSyms)
+
+        label = sym
+        # only output a label if we changed the gene names
+        newSymStr = "|".join(newSyms)
+        if label==newSymStr:
+            label = ""
+        return "gene", label, newSymStr
 
     if mType=="Complex":
         parts = member[1:]
         partSyms = []
         origNames = []
         for p in parts:
             sDb = p[1]
             sym = p[2]
             origNames.append(sym)
             if sDb!="HGNC":
                 continue
 
             if sym in validSyms:
                 partSyms.append(sym)
             else:
                 partSyms.extend(aliasToSym[sym])
-        return "complex", "/".join(origNames), "|".join(partSyms)
+
+        # only show a label if we changed something
+        label = "/".join(origNames)
+        geneStr = "|".join(partSyms)
+        if set(origNames)==set(partSyms):
+            label = ""
+        return "complex", label, geneStr
 
     if mType=="Abundance":
         # ('Abundance', 'CHEBI', 'metformin')
         name = member[2]
         return "compound", name, ""
 
 def parseBelCsv(fname, validSyms, aliasToSym):
     " parse BEL csv format and return as rows "
     skipCount = 0
+    doneRows = set()
     #('Protein', 'HGNC', 'HOXD3')	('RNA', 'HGNC', 'TSPAN7')	{'subject_effect_namespace': 'bel', 'citation_type': 'PubMed', 'citation_name': 'Oncogene 2002 Jan 24 21(5) 798-808', 'evidence': 'Table 1 - Gene regulated by overexpressing HOXD3 in A549 lung cancer cell lines by more than 1.6 fold.', 'subject_modifier': 'Activity', 'relation': 'increases', 'line': 62002, 'subject_effect_name': 'tscript', 'citation_reference': '11850808'}
     #('Abundance', 'CHEBI', 'palmitic acid') ('Complex', ('Protein', 'MGI', 'Myd88'), ('Protein', 'MGI', 'Tlr2')) {'citation_type': 'PubMed', 'citation_name': 'J Biol Chem 2006 Sep 15 281(37) 26865-75', 'evidence': 'Treatment with palmitate rapidly induced the association of myeloid differentiation factor 88 (MyD88) with the TLR2 receptor, activated the stress-linked kinases p38, JNK, and protein kinase C, induced degradation of IkappaBalpha, and increased NF-kappaB DNA binding...TLR2 mediates the initial events of fatty acid-induced insulin resistance in muscle', 'relation': 'increases', 'line': 205920, 'citation_reference': '16798732'}
 
     i = 0
     for line in codecs.open(fname, "r", encoding="utf8"):
         #print line, len(line)
         line = line.rstrip("\n").rstrip(" ")
         fs = line.split("\t")
         fs = [x.encode("latin1") for x in fs]
 
         # some lines have invalid syntac, wrong quote characters 
         # opened github issue, but skipping for now, only 18 lines
         try:
             source = eval(fs[0])
         except:
@@ -214,42 +201,53 @@
         g1 = flattenCsv(source, validSyms, aliasToSym)
         g2 = flattenCsv(target, validSyms, aliasToSym)
         if g1 is None or g2 is None:
             continue
         g1Type, g1Name, g1Genes = g1
         g2Type, g2Name, g2Genes = g2
         pmid = attrs.get("citation_reference", "")
         evid = attrs.get("evidence", "")
         if len(evid)>500:
             evid = evid[:500]+"..."
 
         relType = attrs.get("relation", "")
 
         # conv modifiers to a single string
         parts = []
-        mod1 = attrs.get("subject_modifier")
-        mod2 = attrs.get("object_modifier")
+        # our format is theme <- cause but every other database 
+        # is cause -> theme. Too late to change our format now.
+        mod1 = attrs.get("object_modifier")
+        mod2 = attrs.get("subject_modifier")
         if mod1!=None:
-            parts.append("gene1: %s" % mod1)
+            g1Label = g1Genes.replace("|", "/") # '/' is a bit more readable
+            parts.append("%s %s" % (g1Label, mod1))
         if mod2!=None:
-            parts.append("gene2: %s" % mod2)
+            g2Label = g2Genes.replace("|", "/")
+            parts.append("%s %s" % (g2Label, mod2))
         relSubtype = ", ".join(parts)
 
+        # skip duplicates
+        mainData = (g2Type, g2Name, g2Genes, g1Type, g1Name, g1Genes, relType, relSubtype, pmid, evid)
+        if mainData in doneRows:
+            continue
+        doneRows.add(mainData)
+
         # fields are:
         #"eventId,causeType,causeName,causeGenes,themeType,themeName,themeGenes,relType,relSubtype,sourceDb,sourceId,sourceDesc,pmids,evidence
         # openBel is source -> target
         # but our format is target <- source, because I copied Hoifung's format
+
         row = [
                 "belLarge"+str(i),g2Type, g2Name, g2Genes, g1Type, g1Name, g1Genes,
                 relType, relSubtype,
                 "belLarge","9ea3c170-01ad-11e5-ac0f-000c29cb28fb", "Selventa BEL large corpus",
                 pmid, evid
               ]
         yield row
         i += 1
 
     logging.info("Could not parse %d lines" % skipCount)
 
 def splitQuote(name, isSym=False):
     """ try to split quoted names on , """
     if '"' in name:
         # first split quoted names