src/utils/ggPidToTab 96d63c692fc15357af3800b6a0c363a18d33db89

96d63c692fc15357af3800b6a0c363a18d33db89
max
  Sat Jun 3 08:45:23 2017 -0700
adding support and converter for the openbel dataset, refs #13634

diff --git src/utils/ggPidToTab src/utils/ggPidToTab
index 5172aff..7d1c960 100755
--- src/utils/ggPidToTab
+++ src/utils/ggPidToTab
@@ -24,32 +24,33 @@
 
 # link to interactions like this:
 # http://pid.nci.nih.gov/search/InteractionPage?atomid=204109
 # This site is down as of End 2016
 # Ndex copied the data, their links look like this:
 # http://www.ndexbio.org/#/search?searchType=All&searchString=labels%253Amtor_4pathway
 # 
 
 # output file headers
 headers = "eventId,causeType,causeName,causeGenes,themeType,themeName,themeGenes,relType,relSubtype,sourceDb,sourceId,sourceDesc,pmids,evidence".split(",")
 
 # ID of event in output file, goes across input files, so global
 eventId = 0
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
-parser = optparse.OptionParser("""usage: %prog [options] filename - convert NCI PID xml files to tab-sep format
+parser = optparse.OptionParser("""usage: %prog [options] filename - convert NCI PID xml files to tab-sep format.
 
+If filename contains 'BioCarta', adapt the pathway names for biocarta links
 """)
 
 parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") 
 #parser.add_option("-k", "--keggDir", dest="keggDir", action="store", help="the KEGG ftp mirror directory on disk, default %default", default="/hive/data/outside/kegg/06032011/ftp.genome.jp/pub/kegg") 
 parser.add_option("-s", "--hgncFname", dest="hgncFname", action="store", help="the HGNC tab file on disk, default %default", default="/hive/data/outside/hgnc/111413/hgnc_complete_set.txt") 
 parser.add_option("-u", "--uniprotFname", dest="uniprotFname", action="store", help="the uniprot file from the pubs parser, default %default", default="/hive/data/inside/pubs/parsedDbs/uniprot.9606.tab")
 #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") 
 #parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
 (options, args) = parser.parse_args()
 
 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
 else:
     logging.basicConfig(level=logging.INFO)
 # ==== FUNCTIONs =====
@@ -270,42 +271,49 @@
             idToMember[molId] = ("complex", name, symStr)
             
     logging.info("pass4 families again")
     resolveFamilies(root, idToMember)
 
     #for id, tup in idToMember.iteritems():
         #print id, tup
 
     # create a dict with interactionId -> list of pathway names
     intIdToName = {}
     # this is only a temporary fix for the NDex database: they do not have interaction pages anymore, unlike the original PID
     intIdToShortName = {}
     for pwEl in root.findall("Model/PathwayList/Pathway"):
         pwName = pwEl.find("LongName").text
         pwShortName = pwEl.find("ShortName").text
+        if "BioCarta" in filename:
+            # HACK: capitalization is lost in the XML short names, so restore "Pathway" and prepend "h_" by hand
+            pwShortName = "h_"+pwShortName.replace("pathway", "Pathway")
         #print pwName
         pwCompEls = pwEl.findall("PathwayComponentList/PathwayComponent")
         if pwCompEls==None:
             continue
         for pwCompEl in pwCompEls:
             intIdToName[pwCompEl.attrib["interaction_idref"]]=pwName
             intIdToShortName[pwCompEl.attrib["interaction_idref"]]=pwShortName
 
     # now iterate over interactions and output their components as pairs
     global eventId
     skipCount = 0
     for ic in root.findall("Model/InteractionList/Interaction"):
+        dbName = "pid"
+        if "BioCarta" in filename:
+            dbName = "biocarta"
+
         intId = ic.attrib["id"]
         iType = ic.attrib["interaction_type"]
         if iType in ["pathway", "subnet"]: # refs to complete pathways
             continue
         #print iType
         #assert(iType in ["transcription", "modification", "translocation"])
         src = ic.find("Source").text
 
         # prepr evidence and pmid strings
         evidList = []
         for evidEl in ic.findall("EvidenceList/Evidence"):
             evidList.append(evidToName[evidEl.text])
         evidStr = "|".join(evidList)
 
         pmids = []
@@ -338,31 +346,31 @@
             if m1 in edges["agent"]:
                 subRel = "activates"
             elif m1 in edges["inhibitor"]:
                 subRel = "inhibits"
             elif m1 in edges["output"] and (m2 in edges["agent"] or m2 in edges["inhibitor"]):
                 # we already have act/inhibitor -> output covered, no need to add the reverse
                 continue
 
             pwName = intIdToName[intId]
 
             row = ["pid%d"%eventId]
             row.extend(m1)
             row.extend(m2)
             row.append(iType)
             row.append(subRel)
-            row.append("pid")
+            row.append(dbName)
             #row.append(intId)
             row.append(intIdToShortName[intId])
             row.append(pwName)
             row.append(pmidStr)
             row.append(evidStr)
             eventId += 1
 
             yield row
 
         #print len(edges), memCount, intId, iType, evidStr, pmidStr, edges
     logging.warn("Resolved %d members. Could not resolve %d members in interactions" % (len(idToMember), skipCount))
 
 def pipeSplitAddAll(string, dict, key, toLower=False):
     " split on pipe and add all values to dict with key "
     for val in string.split("|"):