884a11762453d7f4141aed366f3d1eaa2a428a20 max Sun Jun 4 23:59:50 2017 -0700 a few more fixes for openbel and gpml, refs #13634 diff --git src/utils/ggBelCsvToTab src/utils/ggBelCsvToTab index 48ace46..5965442 100755 --- src/utils/ggBelCsvToTab +++ src/utils/ggBelCsvToTab @@ -51,56 +51,30 @@ line = line.rstrip("\n") fields = line.split("\t") try: rec = Record(*fields) except Exception, msg: logging.error("Exception occured while parsing line, %s" % msg) logging.error("Filename %s" % fh.name) logging.error("Line was: %s" % repr(line)) logging.error("Does number of fields match headers?") logging.error("Headers are: %s" % headers) #raise Exception("wrong field count in line %s" % line) continue # convert fields to correct data type yield rec -def trySymFromName(nameStr, accToSym): - # try a few things to get the official symbol of a synonym - # handles complexes and returns a | sep list for them - - if "/" in nameStr: - parts = nameStr.split("/") - syms = [] - for p in parts: - p = p.strip() - pSym = trySymFromName(p, accToSym) - if pSym!="": - syms.append(pSym) - if len(syms)!=0: - return "|".join(syms) - - if nameStr == "": - nameStr = accTonameStr.get(nameStr, "") - if nameStr=="": - nameStr = accTonameStr.get(nameStr.split("-")[0].lower(), "") - if nameStr=="": - nameStr = accTonameStr.get(nameStr.split("/")[0].lower(), "") - if nameStr=="": - nameStr = accTonameStr.get(nameStr.split()[0].lower(), "") - # give up - return "" - def pipeSplitAddAll(string, dict, key, toLower=False): " split on pipe and add all values to dict with key " for val in string.split("|"): if toLower: val = val.lower() dict[val]=key def parseHgnc(fname, addEntrez=False): " return two dicts: uniprot -> symbol, set of validSyms and alias -> symbol from the HGNC tab-sep file " upToSym = {} skipSyms = set() aliasToSym = defaultdict(set) clashList = [] validSyms = set() for row in lineFileNext(open(fname)): @@ -139,57 +113,70 @@ return validSyms, upToSym, aliasToSym def flattenCsv(member, validSyms, aliasToSym): " convert pybel csv member to type and list of genes, as in our format " mType = member[0] if member[1]=="HGNC": #sDb = member[1] #if sDb!="HGNC": #return None sym = member[2] if sym in validSyms: newSyms = [sym] else: newSyms = aliasToSym[sym] - return "gene", sym, "|".join(newSyms) + + label = sym + # only output a label is we did make a change to the gene names + newSymStr = "|".join(newSyms) + if label==newSymStr: + label = "" + return "gene", label, newSymStr if mType=="Complex": parts = member[1:] partSyms = [] origNames = [] for p in parts: sDb = p[1] sym = p[2] origNames.append(sym) if sDb!="HGNC": continue if sym in validSyms: partSyms.append(sym) else: partSyms.extend(aliasToSym[sym]) - return "complex", "/".join(origNames), "|".join(partSyms) + + # only show a label if we changed something + label = "/".join(origNames) + geneStr = "|".join(partSyms) + if set(origNames)==set(partSyms): + label = "" + return "complex", label, geneStr if mType=="Abundance": # ('Abundance', 'CHEBI', 'metformin') name = member[2] return "compound", name, "" def parseBelCsv(fname, validSyms, aliasToSym): " parse BEL csv format and return as rows " skipCount = 0 + doneRows = set() #('Protein', 'HGNC', 'HOXD3') ('RNA', 'HGNC', 'TSPAN7') {'subject_effect_namespace': 'bel', 'citation_type': 'PubMed', 'citation_name': 'Oncogene 2002 Jan 24 21(5) 798-808', 'evidence': 'Table 1 - Gene regulated by overexpressing HOXD3 in A549 lung cancer cell lines by more than 1.6 fold.', 'subject_modifier': 'Activity', 'relation': 'increases', 'line': 62002, 'subject_effect_name': 'tscript', 'citation_reference': '11850808'} #('Abundance', 'CHEBI', 'palmitic acid') ('Complex', ('Protein', 'MGI', 'Myd88'), ('Protein', 'MGI', 'Tlr2')) {'citation_type': 'PubMed', 'citation_name': 'J Biol Chem 2006 Sep 15 281(37) 26865-75', 'evidence': 'Treatment with palmitate rapidly induced the association of myeloid differentiation factor 88 (MyD88) with the TLR2 receptor, activated the stress-linked kinases p38, JNK, and protein kinase C, induced degradation of IkappaBalpha, and increased NF-kappaB DNA binding...TLR2 mediates the initial events of fatty acid-induced insulin resistance in muscle', 'relation': 'increases', 'line': 205920, 'citation_reference': '16798732'} i = 0 for line in codecs.open(fname, "r", encoding="utf8"): #print line, len(line) line = line.rstrip("\n").rstrip(" ") fs = line.split("\t") fs = [x.encode("latin1") for x in fs] # some lines have invalid syntac, wrong quote characters # opened github issue, but skipping for now, only 18 lines try: source = eval(fs[0]) except: @@ -214,42 +201,53 @@ g1 = flattenCsv(source, validSyms, aliasToSym) g2 = flattenCsv(target, validSyms, aliasToSym) if g1 is None or g2 is None: continue g1Type, g1Name, g1Genes = g1 g2Type, g2Name, g2Genes = g2 pmid = attrs.get("citation_reference", "") evid = attrs.get("evidence", "") if len(evid)>500: evid = evid[:500]+"..." relType = attrs.get("relation", "") # conv modifiers to a single string parts = [] - mod1 = attrs.get("subject_modifier") - mod2 = attrs.get("object_modifier") + # our format is theme <- cause but every other database + # is cause -> theme. Too late to change our format now. + mod1 = attrs.get("object_modifier") + mod2 = attrs.get("subject_modifier") if mod1!=None: - parts.append("gene1: %s" % mod1) + g1Label = g1Genes.replace("|", "/") # '/' is a bit more readable + parts.append("%s %s" % (g1Label, mod1)) if mod2!=None: - parts.append("gene2: %s" % mod2) + g2Label = g2Genes.replace("|", "/") + parts.append("%s %s" % (g2Label, mod2)) relSubtype = ", ".join(parts) + # skip duplicates + mainData = (g2Type, g2Name, g2Genes, g1Type, g1Name, g1Genes, relType, relSubtype, pmid, evid) + if mainData in doneRows: + continue + doneRows.add(mainData) + # fields are: #"eventId,causeType,causeName,causeGenes,themeType,themeName,themeGenes,relType,relSubtype,sourceDb,sourceId,sourceDesc,pmids,evidence # openBel is source -> target # but our format is target <- source, because I copied Hoifung's format + row = [ "belLarge"+str(i),g2Type, g2Name, g2Genes, g1Type, g1Name, g1Genes, relType, relSubtype, "belLarge","9ea3c170-01ad-11e5-ac0f-000c29cb28fb", "Selventa BEL large corpus", pmid, evid ] yield row i += 1 logging.info("Could not parse %d lines" % skipCount) def splitQuote(name, isSym=False): """ try to split quoted names on , """ if '"' in name: # first split quoted names