96d63c692fc15357af3800b6a0c363a18d33db89 max Sat Jun 3 08:45:23 2017 -0700 adding support and converter for the openbel dataset, refs #13634 diff --git src/utils/ggPidToTab src/utils/ggPidToTab index 5172aff..7d1c960 100755 --- src/utils/ggPidToTab +++ src/utils/ggPidToTab @@ -24,32 +24,33 @@ # link to interactions like this: # http://pid.nci.nih.gov/search/InteractionPage?atomid=204109 # This site is down as of End 2016 # Ndex copied the data, their links look like this: # http://www.ndexbio.org/#/search?searchType=All&searchString=labels%253Amtor_4pathway # # output file headers headers = "eventId,causeType,causeName,causeGenes,themeType,themeName,themeGenes,relType,relSubtype,sourceDb,sourceId,sourceDesc,pmids,evidence".split(",") # ID of event in output file, goes across input files, so global eventId = 0 # === COMMAND LINE INTERFACE, OPTIONS AND HELP === -parser = optparse.OptionParser("""usage: %prog [options] filename - convert NCI PID xml files to tab-sep format +parser = optparse.OptionParser("""usage: %prog [options] filename - convert NCI PID xml files to tab-sep format. +If filename contains 'BioCarta', adapt the pathway names for biocarta links """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") #parser.add_option("-k", "--keggDir", dest="keggDir", action="store", help="the KEGG ftp mirror directory on disk, default %default", default="/hive/data/outside/kegg/06032011/ftp.genome.jp/pub/kegg") parser.add_option("-s", "--hgncFname", dest="hgncFname", action="store", help="the HGNC tab file on disk, default %default", default="/hive/data/outside/hgnc/111413/hgnc_complete_set.txt") parser.add_option("-u", "--uniprotFname", dest="uniprotFname", action="store", help="the uniprot file from the pubs parser, default %default", default="/hive/data/inside/pubs/parsedDbs/uniprot.9606.tab") #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") #parser.add_option("", "--test", dest="test", action="store_true", help="do something") (options, args) = parser.parse_args() if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) # ==== FUNCTIONs ===== @@ -270,42 +271,49 @@ idToMember[molId] = ("complex", name, symStr) logging.info("pass4 families again") resolveFamilies(root, idToMember) #for id, tup in idToMember.iteritems(): #print id, tup # create a dict with interactionId -> list of pathway names intIdToName = {} # this is only a temporary fix for the NDex database: they do not have interaction pages anymore, unlike the original PID intIdToShortName = {} for pwEl in root.findall("Model/PathwayList/Pathway"): pwName = pwEl.find("LongName").text pwShortName = pwEl.find("ShortName").text + if "BioCarta" in filename: + # bad hack, but uppercase is lost in the xml + pwShortName = "h_"+pwShortName.replace("pathway", "Pathway") #print pwName pwCompEls = pwEl.findall("PathwayComponentList/PathwayComponent") if pwCompEls==None: continue for pwCompEl in pwCompEls: intIdToName[pwCompEl.attrib["interaction_idref"]]=pwName intIdToShortName[pwCompEl.attrib["interaction_idref"]]=pwShortName # now iterate over interactions and output their components as pairs global eventId skipCount = 0 for ic in root.findall("Model/InteractionList/Interaction"): + dbName = "pid" + if "BioCarta" in filename: + dbName = "biocarta" + intId = ic.attrib["id"] iType = ic.attrib["interaction_type"] if iType in ["pathway", "subnet"]: # refs to complete pathways continue #print iType #assert(iType in ["transcription", "modification", "translocation"]) src = ic.find("Source").text # prepr evidence and pmid strings evidList = [] for evidEl in ic.findall("EvidenceList/Evidence"): evidList.append(evidToName[evidEl.text]) evidStr = "|".join(evidList) pmids = [] @@ -338,31 +346,31 @@ if m1 in edges["agent"]: subRel = "activates" elif m1 in edges["inhibitor"]: subRel = "inhibits" elif m1 in edges["output"] and (m2 in edges["agent"] or m2 in edges["inhibitor"]): # we already have act/inhibitor -> output covered, no need to add the reverse continue pwName = intIdToName[intId] row = ["pid%d"%eventId] row.extend(m1) row.extend(m2) row.append(iType) row.append(subRel) - row.append("pid") + row.append(dbName) #row.append(intId) row.append(intIdToShortName[intId]) row.append(pwName) row.append(pmidStr) row.append(evidStr) eventId += 1 yield row #print len(edges), memCount, intId, iType, evidStr, pmidStr, edges logging.warn("Resolved %d members. Could not resolve %d members in interactions" % (len(idToMember), skipCount)) def pipeSplitAddAll(string, dict, key, toLower=False): " split on pipe and add all values to dict with key " for val in string.split("|"):