src/utils/ggBelCsvToTab 8f07ea1e9137dc164688def8459f6784879ebbeb

8f07ea1e9137dc164688def8459f6784879ebbeb
max
  Sun Jun 4 08:09:48 2017 -0700
fixes for openbel dataset in hgGeneGraph, refs #13634

diff --git src/utils/ggBelCsvToTab src/utils/ggBelCsvToTab
index fc668b6..48ace46 100755
--- src/utils/ggBelCsvToTab
+++ src/utils/ggBelCsvToTab
@@ -85,99 +85,109 @@
     if nameStr=="":
         nameStr = accTonameStr.get(nameStr.split("/")[0].lower(), "")
     if nameStr=="":
         nameStr = accTonameStr.get(nameStr.split()[0].lower(), "")
     # give up
     return ""
 
 def pipeSplitAddAll(string, dict, key, toLower=False):
     " split on pipe and add all values to dict with key "
     for val in string.split("|"):
         if toLower:
             val = val.lower()
         dict[val]=key
 
 def parseHgnc(fname, addEntrez=False):
-    " return two dicts: uniprot -> symbol and alias -> symbol from the HGNC tab-sep file "
+    " return a set of valid symbols plus two dicts, uniprot -> symbol and alias -> set of symbols, from the HGNC tab-sep file "
     upToSym = {}
     skipSyms = set()
     aliasToSym = defaultdict(set)
     clashList = []
+    validSyms = set()
     for row in lineFileNext(open(fname)):
         sym = row.symbol
         if "withdrawn" in sym:
             continue
+        validSyms.add(sym)
 
         aliasList = [sym]
         #aliasList.append(row.Approved_Name)
         #aliasList.extend(splitQuote(row.Previous_Names))
         #aliasList.extend(splitQuote(row.Name_Synonyms))
         aliasList.extend(splitQuote(row.prev_symbol, isSym=True))
         aliasList.extend(splitQuote(row.alias_symbol, isSym=True))
 
 
         for n in aliasList:
             if n in aliasToSym:
                 oldSym = aliasToSym[n]
                 if oldSym!=n:
                     clashList.append("%s->%s/%s" % (n, aliasToSym[n], sym))
                 #print "clash: %s with %s" % (n, aliasToSym[n])
             else:
                 aliasToSym[n].add(sym)
 
         upAcc = row.uniprot_ids
         if upAcc=="" or upAcc=="-":
             continue
         if upAcc in upToSym:
             #logging.debug("uniprot accession %s assigned to %s, but already assigned to symbol %s" % (upAcc, sym, upToSym[upAcc]))
             skipSyms.add(sym)
             continue
         upToSym[upAcc] = sym
     logging.info("Skipped these symbols due to duplicate uniprot IDs: %s" % ",".join(skipSyms))
     logging.info("%d symbol clashes: %s" % (len(clashList),  clashList))
-    return upToSym, aliasToSym
+    return validSyms, upToSym, aliasToSym
 
-def flattenCsv(member):
+def flattenCsv(member, validSyms, aliasToSym):
     " convert pybel csv member to type and list of genes, as in our format "
     mType = member[0]
 
     if member[1]=="HGNC":
         #sDb = member[1]
         #if sDb!="HGNC":
             #return None
         sym = member[2]
+        if sym in validSyms:
+            newSyms = [sym]
+        else:
             newSyms = aliasToSym[sym]
-        return "gene", "", "|".join(newSyms)
+        return "gene", sym, "|".join(newSyms)
 
     if mType=="Complex":
         parts = member[1:]
         partSyms = []
+        origNames = []
         for p in parts:
             sDb = p[1]
+            sym = p[2]
+            origNames.append(sym)
             if sDb!="HGNC":
                 continue
 
-            sym = p[2]
+            if sym in validSyms:
+                partSyms.append(sym)
+            else:
                 partSyms.extend(aliasToSym[sym])
-        return "complex", "", "|".join(partSyms)
+        return "complex", "/".join(origNames), "|".join(partSyms)
 
     if mType=="Abundance":
         # ('Abundance', 'CHEBI', 'metformin')
         name = member[2]
         return "compound", name, ""
 
-def parseBelCsv(fname, aliasToSym):
+def parseBelCsv(fname, validSyms, aliasToSym):
     " parse BEL csv format and return as rows "
     skipCount = 0
     #('Protein', 'HGNC', 'HOXD3')	('RNA', 'HGNC', 'TSPAN7')	{'subject_effect_namespace': 'bel', 'citation_type': 'PubMed', 'citation_name': 'Oncogene 2002 Jan 24 21(5) 798-808', 'evidence': 'Table 1 - Gene regulated by overexpressing HOXD3 in A549 lung cancer cell lines by more than 1.6 fold.', 'subject_modifier': 'Activity', 'relation': 'increases', 'line': 62002, 'subject_effect_name': 'tscript', 'citation_reference': '11850808'}
     #('Abundance', 'CHEBI', 'palmitic acid') ('Complex', ('Protein', 'MGI', 'Myd88'), ('Protein', 'MGI', 'Tlr2')) {'citation_type': 'PubMed', 'citation_name': 'J Biol Chem 2006 Sep 15 281(37) 26865-75', 'evidence': 'Treatment with palmitate rapidly induced the association of myeloid differentiation factor 88 (MyD88) with the TLR2 receptor, activated the stress-linked kinases p38, JNK, and protein kinase C, induced degradation of IkappaBalpha, and increased NF-kappaB DNA binding...TLR2 mediates the initial events of fatty acid-induced insulin resistance in muscle', 'relation': 'increases', 'line': 205920, 'citation_reference': '16798732'}
 
     i = 0
     for line in codecs.open(fname, "r", encoding="utf8"):
         #print line, len(line)
         line = line.rstrip("\n").rstrip(" ")
         fs = line.split("\t")
         fs = [x.encode("latin1") for x in fs]
 
         # some lines have invalid syntax, wrong quote characters
         # opened github issue, but skipping for now, only 18 lines
         try:
@@ -189,32 +199,32 @@
 
         target = eval(fs[1])
         tType = target[0]
 
         try:
             attrs = eval(fs[2])
         except:
             skipCount += 1
 
         #print sType, tType
 
         # useless relationship
         if attrs["relation"]=="hasComponent":
             continue
 
-        g1 = flattenCsv(source)
-        g2 = flattenCsv(target)
+        g1 = flattenCsv(source, validSyms, aliasToSym)
+        g2 = flattenCsv(target, validSyms, aliasToSym)
         if g1 is None or g2 is None:
             continue
         g1Type, g1Name, g1Genes = g1
         g2Type, g2Name, g2Genes = g2
         pmid = attrs.get("citation_reference", "")
         evid = attrs.get("evidence", "")
         if len(evid)>500:
             evid = evid[:500]+"..."
 
         relType = attrs.get("relation", "")
 
         # conv modifiers to a single string
         parts = []
         mod1 = attrs.get("subject_modifier")
         mod2 = attrs.get("object_modifier")
@@ -253,25 +263,25 @@
     #names = [unidecode.unidecode(n) for n in names]
     # make sure there are no commas left, if symbol
     if isSym:
         for n in names:
             assert("," not in n)
     return names
 
 # ----------- MAIN --------------
 if args==[]:
     parser.print_help()
     exit(1)
 
 filename = args[0]
 
 #uniprotTable = "/hive/data/inside/pubs/parsedDbs/uniprot.9606.tab"
-upToSym, aliasToSym = parseHgnc(options.hgncFname)
+validSyms, upToSym, aliasToSym = parseHgnc(options.hgncFname)
 #accToSym = parseUniprot(options.uniprotFname, accToSym)
 
 print "#"+"\t".join(headers)
 
 logging.debug(filename)
-rows = parseBelCsv(filename, aliasToSym)
+rows = parseBelCsv(filename, validSyms, aliasToSym)
 for row in rows:
     l = "\t".join(row)
     print l