884a11762453d7f4141aed366f3d1eaa2a428a20 max Sun Jun 4 23:59:50 2017 -0700 a few more fixes for openbel and gpml, refs #13634 diff --git src/utils/ggGpmlToTab src/utils/ggGpmlToTab new file mode 100755 index 0000000..ce842db --- /dev/null +++ src/utils/ggGpmlToTab @@ -0,0 +1,329 @@ +#!/usr/bin/env python2.7 + +import logging, sys, optparse +from collections import defaultdict, namedtuple +from os.path import join, basename, dirname, isfile +import xml.etree.ElementTree as et + +intTypeNames = { + "Arrow" : "activation", + "TBar" : "inhibition" +} + +# global var to identify reactions +eventId = 0 + +# output file headers +headers = "eventId,causeType,causeName,causeGenes,themeType,themeName,themeGenes,relType,relSubtype,sourceDb,sourceId,sourceDesc,pmids".split(",") + +# === COMMAND LINE INTERFACE, OPTIONS AND HELP === +parser = optparse.OptionParser("""usage: %prog [options] filename - parse Wikipathways GPML files to tab-sep format of gene/family/complex + gene/family/complex + interaction type + subtype + +links to pathways look like this: http://www.wikipathways.org/index.php/Pathway:WP1991 +""") + +parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") +#parser.add_option("-k", "--keggDir", dest="keggDir", action="store", help="the KEGG ftp mirror directory on disk, default %default", default="/hive/data/outside/kegg/06032011/ftp.genome.jp/pub/kegg") +#parser.add_option("-s", "--hgncFname", dest="hgncFname", action="store", help="the HGNC tab file on disk, default %default", default="/hive/data/outside/hgnc/111413/hgnc_complete_set.txt") +#parser.add_option("-f", "--file", dest="file", action="store", help="run on file") +#parser.add_option("", "--test", dest="test", action="store_true", help="do something") +(options, args) = parser.parse_args() + +if options.debug: + logging.basicConfig(level=logging.DEBUG) +else: + logging.basicConfig(level=logging.INFO) +# ==== FUNCTIONs ===== +def lineFileNext(fh): + """ + parses tab-sep file with headers as field names + yields collection.namedtuples + strips "#"-prefix from header line + """ + line1 = fh.readline() + line1 = line1.strip("\n").strip("#") + headers = line1.split("\t") + Record = namedtuple('tsvRec', headers) + + for line in fh: + line = line.rstrip("\n") + fields = line.split("\t") + try: + rec = Record(*fields) + except Exception, msg: + logging.error("Exception occured while parsing line, %s" % msg) + logging.error("Filename %s" % fh.name) + logging.error("Line was: %s" % repr(line)) + logging.error("Does number of fields match headers?") + logging.error("Headers are: %s" % headers) + #raise Exception("wrong field count in line %s" % line) + continue + # convert fields to correct data type + yield rec + +def symFromPoint(pe, idToGene, idToGroup, groupToSyms): + " given a element with a graphRef, return the symbol for it (can be a group or a gene) " + gi = pe.attrib["GraphRef"] + if gi in idToGene: + return idToGene[gi] + elif gi in idToGroup: + group = idToGroup[gi] + return "/".join(groupToSyms[group]) + else: + logging.debug("Cannot find a symbol for graphRef %s" % gi) + return None + +def parseGpml(filename, accToSym): + """ parse kegg, returns list of tuples: + (gene1, complex1, family1, gene2, complex2, family2, interactionType, interactionSubtype + """ + logging.info(filename) + rows = [] + tree = et.parse(filename) + root = tree.getroot() + + # remove all namespaces + for el in root.getiterator(): + if el.tag[0] == '{': + el.tag = el.tag.split('}', 1)[1] + + + #comment = root.find("Comment").text + + global eventId + + pathway = basename(filename).split("_")[-2] + # DNA Damage Response + title = root.attrib["Name"] + idToGroup = {} + groupToType = {} # graphId -> "complex" or "family" + + for g in root.findall("Group"): + # + #if g.attrib.get("Style", None)=="Complex" and g.attrib.get("GraphId", None)!=None: + if g.attrib.get("GraphId", None)!=None: + idToGroup[g.attrib["GraphId"]] = g.attrib["GroupId"] + style = g.attrib.get("Style", "Group") + if style=="Complex": + groupType = "complex" + elif style=="Group": + groupType = "family" # XX careful - A. Pico email ? + elif style=="Pathway": + #groupType = "pathway" + logging.warn("A group is a pathway?") + continue + else: + assert(False) # unknown group + groupToType[g.attrib["GraphId"]] = groupType + + # get a mapping from reference id to PMID + # + # 20691899 + # PubMed + # Direct reprogramming of fibroblasts into functional card + # Cell + # 2010 + # Ieda M + # + idToPmid = {} + for g in root.findall("Biopax/PublicationXref"): + refId = g.attrib["{http://www.w3.org/1999/02/22-rdf-syntax-ns#}id"] + if g.find("DB").text!="PubMed": + continue + pmid = g.find("ID").text + if pmid==None: + logging.debug("PMID is none?") + continue + idToPmid[refId] = pmid + + idToGene = {} + groupToSyms = defaultdict(list) + + for c in root: + #print c.tag + # + # + # + # + if c.tag=="DataNode": + if "Type" not in c.attrib: + continue + dataType = c.attrib["Type"] + if dataType=="GeneProduct": + sym = c.attrib["TextLabel"] + # often the symbol is not the text label + if sym not in accToSym: + xref = c.find("Xref") + if xref==None: + logging.debug("datanode without xref element, %s" % c) + continue + acc = xref.attrib["ID"] + db = xref.attrib["Database"] + if db=="" and acc=="": # occurs very often, cut/paste error + continue + sym = accToSym.get(acc, None) + if sym==None: + logging.debug("Could not resolve accession %s, db %s" % (acc, db)) + continue + + groupRef = c.attrib.get("GroupRef", None) + if "GraphId" not in c.attrib: # ?? + continue + graphId = c.attrib["GraphId"] + idToGene[graphId] = sym + if groupRef!=None: + groupToSyms[groupRef].append(sym) + #print dataType, sym, groupRef, graphId, groupRef + + + # + # e6f + # + # + # + # + # + # + # + + # + # + # + # + # + # + # + elif c.tag=="Interaction": + # + # + # + + #print c.attrib.get("GraphId", None) + graphEl = c.find("Graphics") + themeSyms = [] + causeSyms = [] + relType = "unknown" + head = None + themeTypes = [] + causeTypes = [] + for pe in graphEl.findall("Point"): + if not "GraphRef" in pe.attrib: + continue + sym = symFromPoint(pe, idToGene, idToGroup, groupToSyms) + if sym==None: + logging.debug("Could not resolve %s to symbol" % pe.attrib) + continue + + graphRef = pe.attrib["GraphRef"] + graphType = groupToType.get(graphRef, "gene") + + if "ArrowHead" in pe.attrib: + themeSyms.append(sym) + head = pe.attrib["ArrowHead"] + if head!=None: + if head=="Arrow": + relType = "activation" + elif head=="TBar": + relType = "inhibition" + else: + relType = head + themeTypes.append(graphType) + else: + causeSyms.append(sym) + causeTypes.append(graphType) + + if len(themeSyms)==1 and len(causeSyms)==1: + theme = themeSyms[0] + cause = causeSyms[0] + elif len(themeSyms)==0 and len(causeSyms)==2: + head = "binding" + cause = causeSyms[0] + theme = causeSyms[1] + else: + logging.debug("Wrong number of interaction partners %s: %s, %s" % (c.attrib, themeSyms, causeSyms)) + continue + + if "/" in cause: + if len(set(causeTypes))!=1: + logging.warn("more than one type of the causes: %s" % str(causeTypes)) + causeType = causeTypes[0] + causeGenes = cause.replace("/", "|") + else: + causeGenes = cause + causeType = "gene" + + if "/" in theme: + if len(set(themeTypes))!=1: + logging.warn("more than one type of the themes: %s" % str(themeTypes)) + if len(themeTypes)==0: + logging.warn("no type for group %s" % str(theme)) + themeType = "family" + else: + themeType = themeTypes[0] + #themeType = "complex" + themeGenes = theme.replace("/", "|") + else: + themeGenes = theme + themeType = "gene" + + if themeGenes=="" or causeGenes=="": + continue + if themeGenes==causeGenes: + continue + + relSubtype = "" + eventId += 1 + eventIdStr = "wiki%d" % eventId + + # get all PMIDs + pmids = [] + for refEl in c.findall("BiopaxRef"): + refId = refEl.text + if not refId in idToPmid: + logging.debug("BiopaxRef %s has no PMID" % refId) + continue + pmids.append(idToPmid[refId]) + + row = [eventIdStr, causeType, "", causeGenes, themeType, "", themeGenes, relType, relSubtype, "wikipathways", pathway, title, "|".join(pmids)] + rows.append(row) + + #print row + + return rows + +def pipeSplitAddAll(string, dict, key): + " split on pipe and add all values to dict with key " + for val in string.split("|"): + dict[val]=key + +def parseUniprot(uniprotTable): + " parse uniprot and return dict with accession -> symbol " + logging.info("Parsing %s" % uniprotTable) + accToSym = {} + for row in lineFileNext(open(uniprotTable)): + sym = row.hgncSym.split("|")[0] + accToSym[sym] = sym # some entries have the HGNC symbol in the xref + pipeSplitAddAll(row.entrezGene, accToSym, sym) + pipeSplitAddAll(row.mainIsoAcc, accToSym, sym) + pipeSplitAddAll(row.accList, accToSym, sym) + pipeSplitAddAll(row.acc, accToSym, sym) + pipeSplitAddAll(row.ensemblGene, accToSym, sym) + return accToSym + +# ----------- MAIN -------------- +if args==[]: + parser.print_help() + exit(1) + +filenames = args + +uniprotTable = "/hive/data/inside/pubs/parsedDbs/uniprot.9606.tab" +accToSym = parseUniprot(uniprotTable) + +print "#"+"\t".join(headers) +for filename in filenames: + logging.debug(filename) + rows = parseGpml(filename, accToSym) + for row in rows: + l = u"\t".join(row) + print l.encode("utf8")