8f07ea1e9137dc164688def8459f6784879ebbeb max Sun Jun 4 08:09:48 2017 -0700 fixes for openbel dataset in hgGeneGraph, refs #13634 diff --git src/utils/ggBelCsvToTab src/utils/ggBelCsvToTab index fc668b6..48ace46 100755 --- src/utils/ggBelCsvToTab +++ src/utils/ggBelCsvToTab @@ -85,99 +85,109 @@ if nameStr=="": nameStr = accTonameStr.get(nameStr.split("/")[0].lower(), "") if nameStr=="": nameStr = accTonameStr.get(nameStr.split()[0].lower(), "") # give up return "" def pipeSplitAddAll(string, dict, key, toLower=False): " split on pipe and add all values to dict with key " for val in string.split("|"): if toLower: val = val.lower() dict[val]=key def parseHgnc(fname, addEntrez=False): - " return two dicts: uniprot -> symbol and alias -> symbol from the HGNC tab-sep file " + " return two dicts: uniprot -> symbol, set of validSyms and alias -> symbol from the HGNC tab-sep file " upToSym = {} skipSyms = set() aliasToSym = defaultdict(set) clashList = [] + validSyms = set() for row in lineFileNext(open(fname)): sym = row.symbol if "withdrawn" in sym: continue + validSyms.add(sym) aliasList = [sym] #aliasList.append(row.Approved_Name) #aliasList.extend(splitQuote(row.Previous_Names)) #aliasList.extend(splitQuote(row.Name_Synonyms)) aliasList.extend(splitQuote(row.prev_symbol, isSym=True)) aliasList.extend(splitQuote(row.alias_symbol, isSym=True)) for n in aliasList: if n in aliasToSym: oldSym = aliasToSym[n] if oldSym!=n: clashList.append("%s->%s/%s" % (n, aliasToSym[n], sym)) #print "clash: %s with %s" % (n, aliasToSym[n]) else: aliasToSym[n].add(sym) upAcc = row.uniprot_ids if upAcc=="" or upAcc=="-": continue if upAcc in upToSym: #logging.debug("uniprot accession %s assigned to %s, but already assigned to symbol %s" % (upAcc, sym, upToSym[upAcc])) skipSyms.add(sym) continue upToSym[upAcc] = sym logging.info("Skipped these symbols due to duplicate uniprot IDs: %s" % ",".join(skipSyms)) logging.info("%d symbol clashes: %s" % (len(clashList), clashList)) - return upToSym, aliasToSym + return validSyms, upToSym, aliasToSym -def flattenCsv(member): +def flattenCsv(member, validSyms, aliasToSym): " convert pybel csv member to type and list of genes, as in our format " mType = member[0] if member[1]=="HGNC": #sDb = member[1] #if sDb!="HGNC": #return None sym = member[2] + if sym in validSyms: + newSyms = [sym] + else: newSyms = aliasToSym[sym] - return "gene", "", "|".join(newSyms) + return "gene", sym, "|".join(newSyms) if mType=="Complex": parts = member[1:] partSyms = [] + origNames = [] for p in parts: sDb = p[1] + sym = p[2] + origNames.append(sym) if sDb!="HGNC": continue - sym = p[2] + if sym in validSyms: + partSyms.append(sym) + else: partSyms.extend(aliasToSym[sym]) - return "complex", "", "|".join(partSyms) + return "complex", "/".join(origNames), "|".join(partSyms) if mType=="Abundance": # ('Abundance', 'CHEBI', 'metformin') name = member[2] return "compound", name, "" -def parseBelCsv(fname, aliasToSym): +def parseBelCsv(fname, validSyms, aliasToSym): " parse BEL csv format and return as rows " skipCount = 0 #('Protein', 'HGNC', 'HOXD3') ('RNA', 'HGNC', 'TSPAN7') {'subject_effect_namespace': 'bel', 'citation_type': 'PubMed', 'citation_name': 'Oncogene 2002 Jan 24 21(5) 798-808', 'evidence': 'Table 1 - Gene regulated by overexpressing HOXD3 in A549 lung cancer cell lines by more than 1.6 fold.', 'subject_modifier': 'Activity', 'relation': 'increases', 'line': 62002, 'subject_effect_name': 'tscript', 'citation_reference': '11850808'} #('Abundance', 'CHEBI', 'palmitic acid') ('Complex', ('Protein', 'MGI', 'Myd88'), ('Protein', 'MGI', 'Tlr2')) {'citation_type': 'PubMed', 'citation_name': 'J Biol Chem 2006 Sep 15 281(37) 26865-75', 'evidence': 'Treatment with palmitate rapidly induced the association of myeloid differentiation factor 88 (MyD88) with the TLR2 receptor, activated the stress-linked kinases p38, JNK, and protein kinase C, induced degradation of IkappaBalpha, and increased NF-kappaB DNA binding...TLR2 mediates the initial events of fatty acid-induced insulin resistance in muscle', 'relation': 'increases', 'line': 205920, 'citation_reference': '16798732'} i = 0 for line in codecs.open(fname, "r", encoding="utf8"): #print line, len(line) line = line.rstrip("\n").rstrip(" ") fs = line.split("\t") fs = [x.encode("latin1") for x in fs] # some lines have invalid syntac, wrong quote characters # opened github issue, but skipping for now, only 18 lines try: @@ -189,32 +199,32 @@ target = eval(fs[1]) tType = target[0] try: attrs = eval(fs[2]) except: skipCount += 1 #print sType, tType # useless relationship if attrs["relation"]=="hasComponent": continue - g1 = flattenCsv(source) - g2 = flattenCsv(target) + g1 = flattenCsv(source, validSyms, aliasToSym) + g2 = flattenCsv(target, validSyms, aliasToSym) if g1 is None or g2 is None: continue g1Type, g1Name, g1Genes = g1 g2Type, g2Name, g2Genes = g2 pmid = attrs.get("citation_reference", "") evid = attrs.get("evidence", "") if len(evid)>500: evid = evid[:500]+"..." relType = attrs.get("relation", "") # conv modifiers to a single string parts = [] mod1 = attrs.get("subject_modifier") mod2 = attrs.get("object_modifier") @@ -253,25 +263,25 @@ #names = [unidecode.unidecode(n) for n in names] # make sure there are no commas left, if symbol if isSym: for n in names: assert("," not in n) return names # ----------- MAIN -------------- if args==[]: parser.print_help() exit(1) filename = args[0] #uniprotTable = "/hive/data/inside/pubs/parsedDbs/uniprot.9606.tab" -upToSym, aliasToSym = parseHgnc(options.hgncFname) +validSyms, upToSym, aliasToSym = parseHgnc(options.hgncFname) #accToSym = parseUniprot(options.uniprotFname, accToSym) print "#"+"\t".join(headers) logging.debug(filename) -rows = parseBelCsv(filename, aliasToSym) +rows = parseBelCsv(filename, validSyms, aliasToSym) for row in rows: l = "\t".join(row) print l