16720e7dfab79a7f60e91f0cb102a213c3e4738a max Fri Apr 28 15:39:08 2017 -0700 first big commit for hgGeneGraph. Others will follow as QA progresses. refs #13634 diff --git src/utils/ggGeneClasses src/utils/ggGeneClasses new file mode 100755 index 0000000..c28dfd8 --- /dev/null +++ src/utils/ggGeneClasses @@ -0,0 +1,465 @@ +#!/usr/bin/env python2.7 + +import logging, sys, optparse, itertools +from collections import defaultdict, namedtuple +from os.path import join, basename, dirname, isfile + +# === COMMAND LINE INTERFACE, OPTIONS AND HELP === +parser = optparse.OptionParser("""usage: %prog [options] filename - merge the panther classification PTHR9.0_human and the HPRD classification into the format symbol-main class + +HPRD: +uses the most specific class +just ignores the TREMBL proteins (a few hundred) + +panther data is in /hive/data/outside/panther/3.3 +hprd data is in /hive/data/outside/hprd/081814/ +hprd data received from Arun Patil by email, arun@ibioinformatics.org + +output: +symbol,description + +to load: +create table geneClasses (gene varchar(255), text varchar(30000)); +hgLoadSqlTab publications geneClasses geneClasses.tab +""") + +parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") +#parser.add_option("-k", "--keggDir", dest="keggDir", action="store", help="the KEGG ftp mirror directory on disk, default %default", default="/hive/data/outside/kegg/06032011/ftp.genome.jp/pub/kegg") +parser.add_option("-s", "--hgncFname", dest="hgncFname", action="store", help="the HGNC tab file on disk, default %default", default="/hive/data/outside/hgnc/111413/hgnc_complete_set.txt") +parser.add_option("-p", "--pantherFname", dest="pantherFname", action="store", help="the panther tab file on disk, default %default", default="/hive/data/outside/panther/3.3/PTHR9.0_human") +parser.add_option("", "--hprdFname", dest="hprdFname", action="store", help="the HPRD tab file on disk, default %default", default="/hive/data/outside/hprd/081814/HPRD_molecular_class_081914.txt") +#parser.add_option("-u", "--uniprotFname", dest="uniprotFname", action="store", help="the uniprot file from the pubs parser, default %default", default="/hive/data/inside/pubs/parsedDbs/uniprot.9606.tab") +#parser.add_option("-f", "--file", dest="file", action="store", help="run on file") +#parser.add_option("", "--test", dest="test", action="store_true", help="do something") +(options, args) = parser.parse_args() + +if options.debug: + logging.basicConfig(level=logging.DEBUG) +else: + logging.basicConfig(level=logging.INFO) +# ==== FUNCTIONs ===== +def lineFileNext(fh, headers=None): + """ + parses tab-sep file with headers as field names + yields collection.namedtuples + strips "#"-prefix from header line + """ + line1 = fh.readline() + line1 = line1.strip("\n").strip("#") + if headers==None: + headers = line1.split("\t") + headers = [h.replace(" ", "_") for h in headers] + headers = [h.replace("(", "") for h in headers] + headers = [h.replace(")", "") for h in headers] + Record = namedtuple('tsvRec', headers) + + for line in fh: + line = line.rstrip("\n") + fields = line.split("\t") + try: + rec = Record(*fields) + except Exception, msg: + logging.error("Exception occured while parsing line, %s" % msg) + logging.error("Filename %s" % fh.name) + logging.error("Line was: %s" % repr(line)) + logging.error("Does number of fields match headers?") + logging.error("Headers are: %s" % headers) + #raise Exception("wrong field count in line %s" % line) + continue + # convert fields to correct data type + yield rec + +def resolveFamilies(root, idToMember): + " add id to member tuple to idToMember dict for all families " + logging.info("pass2 - families") + for mEl in root.findall("Model/MoleculeList/Molecule"): + molId = mEl.attrib["id"] + molType = mEl.attrib["molecule_type"] + famMemEl = mEl.find("FamilyMemberList") + if famMemEl!=None: + # <FamilyMemberList> + # <Member member_molecule_idref="200502"> + name = mEl.find("Name").attrib["value"] + symList = [] + for memEl in famMemEl.findall("Member"): + #print name, memEl.attrib + memId = memEl.attrib["member_molecule_idref"] + if memId not in idToMember: + #logging.debug("skipping %s, not defined yet" % memId) + continue + sym = idToMember[memId][-1] + symList.append(sym) + #print symList + symStr = "|".join(symList) + idToMember[molId] = ("family", name, symStr) + + +def trySymFromName(nameStr, accToSym): + # try a few things to get the official symbol of a synonym + # handles complexes and returns a | sep list for them + + if "/" in nameStr: + parts = nameStr.split("/") + syms = [] + for p in parts: + p = p.strip() + pSym = trySymFromName(p, accToSym) + if pSym!="": + syms.append(pSym) + if len(syms)!=0: + return "|".join(syms) + + if nameStr == "": + nameStr = accTonameStr.get(nameStr, "") + if nameStr=="": + nameStr = accTonameStr.get(nameStr.split("-")[0].lower(), "") + if nameStr=="": + nameStr = accTonameStr.get(nameStr.split("/")[0].lower(), "") + if nameStr=="": + nameStr = accTonameStr.get(nameStr.split()[0].lower(), "") + # give up + return "" + +def parseXml(filename, accToSym): + """ parse NCI XML, returns list of tuples, see global "headers" variable + """ + logging.info(filename) + rows = [] + tree = et.parse(filename) + root = tree.getroot() + + + source = basename(filename).split(".")[0] + idToMember = {} # id -> ("complex" or "family" or "gene", name, |-sep list of genes) + idToSym = {} # id -> sym + + logging.info("pass1") + skipUps = [] + noSym = set() + for mEl in root.findall("Model/MoleculeList/Molecule"): + # <Molecule molecule_type="protein" id="201405"> + # <Name name_type="UP" long_name_type="UniProt" value="P98170" /> + famMemEl = mEl.find("FamilyMemberList") + # ignore molecules with families + if famMemEl!=None: + continue + molId = mEl.attrib["id"] + molType = mEl.attrib["molecule_type"] + + names = {} + for nameEl in mEl.findall("Name"): + names[nameEl.attrib["name_type"]] = nameEl.attrib["value"] + + # <ComplexComponentList> + # <ComplexComponent molecule_idref="215568"> + # </ComplexComponentList> + complCompEls = mEl.findall("ComplexComponentList/ComplexComponent") + if molType in ["protein", "rna"] or (molType=="complex" and len(complCompEls)==0): + # weird enough, this can happen: a complex with a protein UP entry... + # and no components -> we treat these like a normal protein + upAcc = None + sym = "" + if "UP" in names: + upAcc = names["UP"] + upAcc = upAcc.split("-")[0] + sym = accToSym.get(upAcc, None) + if sym==None: + skipUps.append(upAcc) + continue + nameStr = names["PF"] + + if sym=="": + sym = trySymFromName(nameStr, accToSym) + compType = "gene" + if "|" in sym: + compType = "complex" + + idToMember[molId] = ("gene", nameStr, sym) + elif molType=="compound": + # <Molecule molecule_type="compound" id="201403"> + # <Name name_type="CA" long_name_type="Chemical Abstracts" value="521-18-6" /> + # <Name name_type="PF" long_name_type="preferred symbol" value="DHT" /> + for nameEl in mEl.findall("Name"): + if nameEl.attrib["name_type"]=="CA": + idStr = "compound:CAS"+nameEl.attrib["value"] + elif nameEl.attrib["name_type"]=="PF": + name = nameEl.attrib["value"] + idToMember[molId] = ("compound", name, idStr) + + logging.info("Could not resolve %d uniprot IDs, out of %d" % (len(skipUps), len(idToMember))) + logging.info("%d molecules without an official symbol" % (len(noSym))) + logging.debug("list: %s" % ",".join(noSym)) + + logging.info("pass2 families") + resolveFamilies(root, idToMember) + + logging.info("pass3 complexes") + for mEl in root.findall("Model/MoleculeList/Molecule"): + molId = mEl.attrib["id"] + molType = mEl.attrib["molecule_type"] + famMemEl = mEl.find("FamilyMemberList") + names = {} + for nameEl in mEl.findall("Name"): + names[nameEl.attrib["name_type"]] = nameEl.attrib["value"] + if molType=="complex" and not "UP" in names: + # <Name name_type="PF" long_name_type="preferred symbol" value="FA complex/FANCD2/Ubiquitin" /> + symList = [] + name = names["PF"] + for compEl in mEl.findall("ComplexComponentList/ComplexComponent"): + compId = compEl.attrib["molecule_idref"] + if compId not in idToMember: + # just skip invalid ones + continue + compType, compName, compSym = idToMember[compId] + if compType=="family": + compSym = compSym.replace("|", "_") + if compSym=="": # some members don't have an offical symbol, skip them + continue + if compSym in symList: + continue + symList.append(compSym) + #assert(symList!=[]) # we have some complexes without member proteins + + # some are not annotated, we resort to guessing + if len(symList)==0: + symStr = trySymFromName(name, accToSym) + else: + symStr = "|".join(symList) + idToMember[molId] = ("complex", name, symStr) + + logging.info("pass4 families again") + resolveFamilies(root, idToMember) + + #for id, tup in idToMember.iteritems(): + #print id, tup + + # create a dict with interactionId -> list of pathways + intIdToName = {} + for pwEl in root.findall("Model/PathwayList/Pathway"): + pwName = pwEl.find("LongName").text + #print pwName + pwCompEls = pwEl.findall("PathwayComponentList/PathwayComponent") + if pwCompEls==None: + continue + for pwCompEl in pwCompEls: + intIdToName[pwCompEl.attrib["interaction_idref"]]=pwName + + # now iterate over interactions and output their components as pairs + eventId = 0 + skipCount = 0 + for ic in root.findall("Model/InteractionList/Interaction"): + intId = ic.attrib["id"] + iType = ic.attrib["interaction_type"] + if iType in ["pathway", "subnet"]: # refs to complete pathways + continue + #print iType + #assert(iType in ["transcription", "modification", "translocation"]) + src = ic.find("Source").text + + evidList = [] + for evidEl in ic.findall("EvidenceList/Evidence"): + evidList.append(evidToName[evidEl.text]) + evidStr = "|".join(evidList) + + pmids = [] + for refEl in ic.findall("ReferenceList/Reference"): + pmids.append(refEl.attrib["pmid"]) + pmidStr = "|".join(pmids) + + edges = defaultdict(set) + allParts = set() + for intCompEl in ic.findall("InteractionComponentList/InteractionComponent"): + idref = intCompEl.attrib["molecule_idref"] + role = intCompEl.attrib["role_type"] + member = idToMember.get(idref, None) + if member==None: + skipCount += 1 + continue + + assert(role in ["input", "output", "inhibitor", "agent"]) + edges[role].add(member) + allParts.add(member) + + memCount = 0 + for compType, iList in edges.iteritems(): + memCount+= len(iList) + + for pair in itertools.combinations(allParts,2 ): + m1, m2 = pair + subRel = "participates" + if m1 in edges["agent"]: + subRel = "activates" + elif m1 in edges["inhibitor"]: + subRel = "inhibits" + + pwName = intIdToName[intId] + + row = ["pid%d"%eventId] + row.extend(m1) + row.extend(m2) + row.append(iType) + row.append(subRel) + row.append("pid") + row.append(intId) + row.append(pwName) + row.append(pmidStr) + row.append(evidStr) + eventId += 1 + + yield row + + #print len(edges), memCount, intId, iType, evidStr, pmidStr, edges + logging.warn("Could not resolve %d members of interactions (probably non-human species)" % skipCount) + +def pipeSplitAddAll(string, dict, key, toLower=False): + " split on pipe and add all values to dict with key " + for val in string.split("|"): + if toLower: + val = val.lower() + dict[val]=key + +def parseUniprot(uniprotTable, accToSym): + " parse uniprot and return dict with accession or synonym -> symbol " + logging.info("Parsing %s" % uniprotTable) + for row in lineFileNext(open(uniprotTable)): + sym = row.hgncSym.split("|")[0] + accToSym[sym] = sym # some entries have the HGNC symbol in the xref + #pipeSplitAddAll(row.entrezGene, accToSym, sym) + pipeSplitAddAll(row.mainIsoAcc, accToSym, sym) + pipeSplitAddAll(row.accList, accToSym, sym) + pipeSplitAddAll(row.acc, accToSym, sym) + + # pull out various synonyms: names & symbols + pipeSplitAddAll(row.geneName, accToSym, sym, toLower=True) + pipeSplitAddAll(row.geneSynonyms, accToSym, sym, toLower=True) + pipeSplitAddAll(row.isoNames, accToSym, sym, toLower=True) + pipeSplitAddAll(row.protFullNames, accToSym, sym, toLower=True) + pipeSplitAddAll(row.protShortNames, accToSym, sym, toLower=True) + pipeSplitAddAll(row.protAltFullNames, accToSym, sym, toLower=True) + pipeSplitAddAll(row.protAltShortNames, accToSym, sym, toLower=True) + #pipeSplitAddAll(row.ensemblGene, accToSym, sym) + return accToSym + +def parseHgnc(fname, addEntrez=False): + " return a uniprotAcc -> hgnc symbol dict from the HGNC tab-sep file " + logging.info("Parsing HGNC table") + upToSym = {} + skipSyms = set() + for row in lineFileNext(open(fname)): + sym = row.Approved_Symbol + if "withdrawn" in sym or "-AS" in sym: + continue + upAcc = row.UniProt_ID_supplied_by_UniProt + if upAcc=="" or upAcc=="-": + continue + if upAcc in upToSym: + #logging.debug("uniprot accession %s assigned to %s, but already assigned to symbol %s" % (upAcc, sym, upToSym[upAcc])) + skipSyms.add(sym) + continue + entrez = row.Entrez_Gene_ID + if addEntrez and entrez!="": + upToSym[entrez] = sym + upToSym[upAcc] = sym + logging.info("Skipped these symbols due to duplicate uniprot IDs: %s" % ",".join(skipSyms)) + return upToSym + +# ----------- MAIN -------------- +#if args==[]: + #parser.print_help() + #exit(1) + +pantherFilename = options.pantherFname + +#uniprotTable = "/hive/data/inside/pubs/parsedDbs/uniprot.9606.tab" +accToSym = parseHgnc(options.hgncFname) + +catCounts = defaultdict(int) + +symToCats = defaultdict(list) + +for line in open(pantherFilename): + fs = line.rstrip("\n").split('\t') + idStr = fs[0] + idStr = idStr.split("|")[-1] + if not idStr.startswith('UniProt'): + continue + acc = idStr.split("=")[-1] + if acc not in accToSym: + logging.warn("Not found: %s" % acc) + continue + sym = accToSym[acc] + + classes = fs[8].split(";") + classes = [c.split("#")[0] for c in classes] + for c in classes: + catCounts[c]+=1 + symToCats[sym].append(c) + +#for cat, count in catCounts.iteritems(): + #print count, cat + +pantherClasses = {} +for sym, cats in symToCats.iteritems(): + scoredCats = [] + for c in cats: + scoredCats.append ( (catCounts[c], c) ) + scoredCats.sort() + bestCat = scoredCats[0][-1] + #if len(scoredCats)>1: + #nextCat = scoredCats[1][-1] + #bestCat = bestCat+", "+nextCat + + otherStr = "" + if len(scoredCats)>1: + otherCats = scoredCats[1:] + otherCats = [c[-1] for c in otherCats] + otherStr = ",".join(otherCats) + #row = [sym, bestCat, otherStr] + #print "\t".join(row) + bestCat = bestCat.replace(" molecule", "") + bestCat = bestCat.replace("G protein", "G-protein") + bestCat = bestCat.replace(";", ", ") + pantherClasses[sym] = bestCat + +# process hprd and write to dict sym -> text +headers = "sym,entrez,desc,refseq,classStr,empty".split(",") +hprdClasses = {} +for row in lineFileNext(open(options.hprdFname), headers=headers): + if row.classStr.startswith("Unclass"): + continue + if row.classStr=="-": + continue + classStr = row.classStr + classStr = classStr.replace("G protein", "G-protein") + classStr = classStr.replace(";", ", ") + hprdClasses[row.sym] = classStr + +# join both dicts +allSyms = set(pantherClasses).union(hprdClasses) +for sym in allSyms: + parts = [] + if sym in pantherClasses: + parts.append(pantherClasses.get(sym, "")) + if sym in hprdClasses: + parts.append(hprdClasses.get(sym, "")) + #row = [sym, ", ".join(parts)] + hprd = hprdClasses.get(sym, "") + panther = pantherClasses.get(sym, "") + parts = [hprd, panther] + + # take longer desc if overlap by one word + if len(set(hprd.lower().split()).intersection(panther.lower().split()))!=0: + if len(hprd)>len(panther): + parts=[hprd] + else: + parts=[panther] + + parts = [p for p in parts if p!=""] + #row = [sym, hprd, panther, ", ".join(parts)] + desc = ", ".join(parts) + if len(desc)>1: + desc = desc[0].upper()+"".join(desc[1:]) + if desc=="": + continue + row = [sym, desc] + print "\t".join(row)