4e5e5c77cb6bac09f0da5ae36a19e4677c65a0e3 max Fri Jul 15 15:38:12 2016 -0700 forgot a line in last commit

diff --git src/utils/uniprotToTab src/utils/uniprotToTab
index 7e7df20..b438132 100755
--- src/utils/uniprotToTab
+++ src/utils/uniprotToTab
@@ -1,1055 +1,1056 @@
#!/usr/bin/env python

# load default python packages
import logging, optparse, sys, glob, gzip, collections, copy, gzip, os, doctest, re
from os.path import *
from collections import defaultdict

try:
    import xml.etree.cElementTree as etree
except:
    raise Exception("lxml library not found. install elementtree with 'sudo apt-get install libxml2-dev libxslt-dev python-dev; pip install lxml' or just 'apt-get install python-lxml'")

debugMode=False

# --- FASTA FILES ---

class FastaReader:
    """ a class to parse a fasta file
    Example:
        fr = FastaReader(filename)
        for (id, seq) in fr.parse():
            print id, seq
    """

    def __init__(self, fname):
        if hasattr(fname, 'read'):
            self.f = fname
        elif fname=="stdin":
            self.f = sys.stdin
        elif fname.endswith(".gz"):
            self.f = gzip.open(fname)
        else:
            self.f = open(fname)
        self.lastId = None

    def parse(self):
        """ Generator: returns sequences as tuple (id, sequence) """
        lines = []
        for line in self.f:
            if line.startswith("\n") or line.startswith("#"):
                continue
            elif not line.startswith(">"):
                lines.append(line.replace(" ","").strip())
                continue
            else:
                if len(lines)!=0: # on first >, seq is empty
                    faseq = (self.lastId, "".join(lines))
                    self.lastId = line.strip(">").strip()
                    lines = []
                    yield faseq
                else:
                    if self.lastId!=None:
                        sys.stderr.write("warning: when reading fasta file: empty sequence, id: %s\n" % line)
                    self.lastId = line.strip(">").strip()
                    lines = []

        # if it's the last sequence in a file, loop will end on the last line
        if len(lines)!=0:
            faseq = (self.lastId, "".join(lines))
            yield faseq
        else:
            yield (None, None)

def parseFastaAsDict(fname, inDict=None):
    if inDict==None:
        inDict = {}
    #logging.info("Parsing %s" % fname)
    fr = FastaReader(fname)
    for (id, seq) in fr.parse():
        if id in inDict:
            print inDict
            print inDict[id]
            raise Exception("%s already seen before" % id)
        inDict[id] = seq
    return inDict

class ProgressMeter:
    """ prints a message "x%" every taskCount/stepCount calls of taskCompleted() """

    def __init__(self, taskCount, stepCount=20, quiet=False):
        self.taskCount = taskCount
        self.stepCount = stepCount
        self.tasksPerMsg = taskCount/stepCount
        self.i = 0
        self.quiet = quiet
        #print "".join(9*["."])

    def taskCompleted(self, count=1):
        if self.quiet and self.taskCount<=5:
            return
        #logging.debug("task completed called, i=%d, tasksPerMsg=%d" % (self.i, self.tasksPerMsg))
        if self.tasksPerMsg!=0 and self.i % self.tasksPerMsg == 0:
            donePercent = (self.i*100) / self.taskCount
            #print "".join(5*[chr(8)]),
            sys.stderr.write("%.2d%% " % donePercent)
            sys.stderr.flush()
        self.i += count
        if self.i==self.taskCount:
            print ""

def setupLogging(progName, options, parser=None, logFileName=None, \
        debug=False, fileLevel=logging.DEBUG, minimumLog=False, fileMode="w"):
    """ direct logging to a file and also to stdout, depending on options (debug, verbose, jobId, etc) """
    assert(progName!=None)
    global debugMode
+    stdoutLevel=logging.INFO
    if options==None:
        stdoutLevel=logging.DEBUG
    elif options.debug or debug:
        stdoutLevel=logging.DEBUG
        debugMode = True

    rootLog = logging.getLogger('')
    rootLog.setLevel(fileLevel)
    logging.root.handlers = []

    # setup file logger
    if logFileName != None and logFileName!="":
        fh = logging.FileHandler(logFileName)
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
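        # the file handler keeps everything at DEBUG level; the console handler set up
        # below is throttled to stdoutLevel (INFO, or DEBUG when --debug is given)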
        rootLog.addHandler(fh)

    # define a handler which writes messages to sys.stderr
    console = logging.StreamHandler()
    # set a format which is simpler for console use
    formatter = logging.Formatter('%(levelname)-8s-%(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    console.setLevel(stdoutLevel)
    # make sure that the root logger gets verbose messages
    logging.getLogger('').setLevel(min(stdoutLevel, fileLevel))
    # add the handler to the root logger
    rootLog.addHandler(console)

# UNIPROT PARSING

# remove these PMIDs from all evidences
pmidBlackList = set([17344846]) # high-throughput study

# only parse these feature types
featTypes = {
    "sequence variant": "variant",
    "mutagenesis site": "mutagen",
    "modified residue": "modif",
    "cross-link": "cross-link",
    "region of interest": "interest",
    "short sequence motif": "motif",
    "metal ion-binding site": "ion-binding",
    "site": "site",
    "topological domain" : "topo",
    "transmembrane region" : "transmem",
    "disulfide bond" : "disulf bond",
    "glycosylation site" : "glyco",
    "binding site" : "bind",
    "active site" : "enzyme act site",
    "signal peptide" : "signal pep",
    "transit peptide" : "trans pep",
    "calcium-binding region" : "calcium bind",
    "lipid moiety-binding region" : "lipid",
    "propeptide" : "propep",
    "intramembrane region" : "intramem",
    "peptide" : "peptide",
    "nucleotide phosphate-binding region" : "nucl phos bind",
    "helix" : "helix",
    "chain" : "chain",
    "coiled-coil region" : "coiled-coil",
    "turn" : "turn",
    "strand" : "beta",
    "domain" : "domain",
    "zinc finger region" : "zinc finger",
    "repeat" : "repeat",
    "compositionally biased region" : "biased",
    "initiator methionine" : "init Met",
    "non-standard amino acid" : "non-std",
    "non-consecutive residues" : "non-consec",
    "unsure residue" : "unsure",
    "DNA-binding region" : "DNA-binding",
    "non-terminal residue" : "nonTerm"
}

# main record info
entryHeaders = ["dataset", "acc", "mainIsoAcc", "orgName", "orgCommon", "taxonId", "name", "accList", \
    "protFullNames", "protShortNames", "protAltFullNames", "protAltShortNames", \
    "geneName", "geneSynonyms", "isoNames", \
    "geneOrdLocus", "geneOrf", \
    "hgncSym", "hgncId", "refSeq", "refSeqProt", "entrezGene", "ensemblGene", "ensemblProt", \
    "kegg", "emblMrna", "emblMrnaProt", "emblDna", "emblDnaProt", \
    "pdb", "ec", \
    "uniGene", "omimGene", "omimPhenotype", "subCellLoc", "functionText", "mainSeq", "isoIds", "isoSeqs"]
EntryRec = collections.namedtuple("uprec", entryHeaders)

# originally only disease associated mutation
# but now all annotations get parsed into this format
mutHeaders = ["acc", "mainIsoAcc", "varId", "featType", "shortFeatType", "begin", "end", "origAa", "mutAa", "dbSnpId", "disRelated", "disease", "disCode", "pmid", "comment"]
MutRec = collections.namedtuple("mutrec", mutHeaders)

# references from record
refHeaders = ["name", "citType", "year", "journal", "vol", "page", \
    "title", "authors", "doi", "pmid", "scopeList"]
RefRec = collections.namedtuple("refRec", refHeaders)
emptyRef = dict(zip(refHeaders, len(refHeaders)*[""]))

def strip_namespace_inplace(etree, namespace=None, remove_from_attr=True):
    """ Takes a parsed ET structure and does an in-place removal of all namespaces,
        or removes a specific namespace (by its URL).

        Can make node searches simpler in structures with unpredictable namespaces
        and in content given to be non-mixed.

        By default does so for node names as well as attribute names.
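        For this script that means a tag like '{http://uniprot.org/uniprot}entry'
        can afterwards be addressed simply as 'entry'.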
        (doesn't remove the namespace definitions, but apparently
         ElementTree serialization omits any that are unused)

        Note that for attributes that are unique only because of namespace,
        this may cause attributes to be overwritten.
        For example: ... would become: ...
        I don't think I've seen any XML where this matters, though.
    """
    if namespace==None: # all namespaces
        for elem in etree.getiterator():
            tagname = elem.tag
            if not isinstance(elem.tag, basestring):
                continue
            if tagname[0]=='{':
                elem.tag = tagname[tagname.index('}',1)+1:]

            if remove_from_attr:
                to_delete=[]
                to_set={}
                for attr_name in elem.attrib:
                    if attr_name[0]=='{':
                        old_val = elem.attrib[attr_name]
                        to_delete.append(attr_name)
                        attr_name = attr_name[attr_name.index('}',1)+1:]
                        to_set[attr_name] = old_val
                for key in to_delete:
                    elem.attrib.pop(key)
                elem.attrib.update(to_set)

    else: # asked to remove specific namespace.
        ns = '{%s}' % namespace
        nsl = len(ns)
        for elem in etree.getiterator():
            if elem.tag.startswith(ns):
                elem.tag = elem.tag[nsl:]

            if remove_from_attr:
                to_delete=[]
                to_set={}
                for attr_name in elem.attrib:
                    if attr_name.startswith(ns):
                        old_val = elem.attrib[attr_name]
                        to_delete.append(attr_name)
                        attr_name = attr_name[nsl:]
                        to_set[attr_name] = old_val
                for key in to_delete:
                    elem.attrib.pop(key)
                elem.attrib.update(to_set)

def parseDiseases(fname):
    " parse the file humdisease.txt from uniprot to resolve disease IDs to disease names "
    dis = {}
    for line in open(fname).read().splitlines():
        if line.startswith("ID"):
            name = line[5:].strip(".")
        if line.startswith("AR"):
            code = line[5:].strip(".")
            dis[code]=name
    return dis

def findSaveList(el, path, dataDict, key, attribKey=None, attribVal=None, useAttrib=None, subSubEl=None):
    """ find all text of subelements matching path, optionally filtered by an attrib, and save into dataDict with key
    You can specify a subSubEl of the element to get the text from.
    """
    l = []
    for se in el.findall(path):
        if attribKey!=None and se.attrib.get(attribKey, None)!=attribVal:
            continue
        if useAttrib:
            val = se.attrib[useAttrib]
        else:
            if subSubEl:
                val = se.find(subSubEl).text
            else:
                val = se.text
        l.append(val)
    s = "|".join(l)
    dataDict[key] = s

def openOutTabFile(subDir, outName, headers):
    " create outdir and open outfile, write headers "
    #subDir = join(outDir, outSubDir)
    if not isdir(subDir):
        logging.info("Creating dir %s" % subDir)
        os.makedirs(subDir)
    outPath = join(subDir, outName)
    logging.info("Writing output to %s" % outPath)
    ofh = open(outPath, "w")
    ofh.write("\t".join(headers)+"\n")
    return ofh

def findDisCodes(text, disToName):
    """ find disease codes in text, return as a set of disease codes
    >>> findDisCodes("Defects in HAL are the cause of histidinemia (HISTID) ", {"HISTID": "histidinemia"})
    set(['HISTID'])
    """
    disSet = set()
    for m in re.finditer("[(]([a-zA-Z0-9- ]+)[)]", text):
        word = m.group(1)
        if word in disToName:
            disSet.add(word)
    return disSet

#def findDiseases(text):
#    """ find disease codes and their names in text, return as dict code -> name
#    >>> findDiseases("Defects in CEACAM16 are the cause of deafness autosomal dominant type 4B (DFNA4B) [MIM:614614].")
#    {'DFNA4B': 'deafness autosomal dominant type 4B'}
#    >>> findDiseases("Defects in ALX4 are the cause of parietal foramina 2 (PFM2) [MIM:609597]; also known as foramina parietalia permagna (FPP). PFM2 is an autosomal dominant disease characterized by oval defects of the parietal bones caused by deficient ossification around the parietal notch, which is normally obliterated during the fifth fetal month. PFM2 is also a clinical feature of Potocki-Shaffer syndrome.")
#    {'PFM2': 'parietal foramina 2', 'FPP': 'foramina parietalia permagna'}
#
#    # disease is only one word, but long enough
#    >>> findDiseases("Defects in HAL are the cause of histidinemia (HISTID) ")
#    {'HISTID': 'histidinemia'}
#    """
#    result = {}
#    phrases = re.split("[;.] ", text)
#    notDisease = set(["of", "with", "to", "as", "or", "also", "in"])
#
#    for phrase in phrases:
#        words = phrase.split()
#        revWords = reversed(words)
#
#        grabWords = False
#        disWords = []
#        disCode = None
#        # go backwords over words and look for acronym, then grab all words before that
#        # until we find a common English word
#        for word in revWords:
#            m = re.match("[(]([A-Z0-9-]+)[)]", word)
#            if m!=None:
#                disCode = m.group(1)
#                grabWords = True
#                continue
#
#            if word in notDisease and (len(disWords)>1 or len("".join(disWords))>=9):
#                disName = " ".join(list(reversed(disWords)))
#                if disCode==None:
#                    logging.debug("Found disease %s, but no code for it" % disName)
#                    continue
#                result[disCode] = disName
#                disCode = None
#                disWords = []
#                grabWords = False
#
#            if grabWords:
#                disWords.append(word)
#
#    return result

#def parseDiseaseComment(entryEl):
#    """ return two dicts
#    one with evidence code -> disease code
#    one with disease code -> disease name
#    """
#    disRefs = {}
#    disCodes = {}
#    for commentEl in entryEl.findall("comment"):
#        textEl = commentEl.find("text")
#        if commentEl.attrib["type"]=="disease":
#            refStr = commentEl.attrib.get("evidence", None)
#            # website xml is different, has evidence attribute on text element
#            if refStr==None:
#                refStr = textEl.attrib.get("evidence", None)
#            if refStr==None:
#                continue
#
#            refs = refStr.split(" ")
#
#            text = textEl.text
#            logging.debug("Disease comment: %s, evidence %s" % (text, refStr))
#            disCodes.update(findDiseases(text))
#
#            for refId in refs:
#                disRefs[refId] = disCodes
#
#    logging.debug("Found disease evidences: %s" % disRefs)
#    logging.debug("Found disease names: %s" % disCodes)
#    return disRefs, disCodes

def parseDiseaseComment(entryEl, disToName):
    """ parse the general comments, disease section from up record
    return evidence codes that refer to diseases
    also return disease codes
    """
    disRefs = {}
    disCodes = set()
    for commentEl in entryEl.findall("comment"):
        textEl = commentEl.find("text")
        if commentEl.attrib["type"]=="disease":
            refStr = commentEl.attrib.get("evidence", None)
            # website xml is different, has evidence attribute on text element
            if refStr==None:
                refStr = textEl.attrib.get("evidence", None)
            if refStr==None:
                continue

            refs = refStr.split(" ")

            text = textEl.text
            logging.debug("Disease comment: %s, evidence %s" % (text, refStr))
            disCodes.update(findDisCodes(text, disToName))

            for refId in refs:
                disRefs[refId] = disCodes

    logging.debug("Found disease evidences: %s" % disRefs)
    #logging.debug("Found disease names: %s" % disCodes)
    return disRefs, disCodes

def parseIsoforms(acc, mainSeq, entryEl, isoSeqs):
    " parse sequences of isoforms, returns lists: isoIds, isoNames, seqs "
    isoDefined = False
    isoIds = []
    isoNames = []
    seqs = []
    for isoEl in entryEl.findall("comment/isoform"):
        isoDefined = True
        # get id
        idEl = isoEl.find("id")
        isoId = idEl.text
        # get names (just as gene synonyms)
        for nameEl in isoEl.find("name"):
            isoNames.append(nameEl.text)
        seqEl = isoEl.find("sequence")
        # get sequences
        seqType = seqEl.attrib["type"]
        if seqType=="displayed":
            seqs.append(mainSeq)
            isoIds.append(isoId)
        elif seqType=="external":
            pass # weird anyways
        else:
            if isoId not in isoSeqs:
                logging.debug("sequence %s does not exist" % isoId)
            else:
                seqs.append(isoSeqs[isoId])
                isoIds.append(isoId)

    if not isoDefined:
        isoIds = [acc]
        seqs = [mainSeq]

    assert(len(seqs)==len(isoIds))
    return isoIds, isoNames, seqs

def parseDbRefs(entryEl):
    " return dict with db -> id (various special cases) "
    dbRefs = defaultdict(set)
    dbRefs["emblMrna"] = []
    dbRefs["emblMrnaProt"] = []
    dbRefs["emblDna"] = []
    dbRefs["emblDnaProt"] = []
    for dbRefEl in entryEl.findall("dbReference"):
        db = dbRefEl.attrib["type"]
        mainId = dbRefEl.attrib["id"]
        if db=="EMBL": # special case, don't add yet
            emblId = mainId
        else:
            dbRefs[db].add(mainId)

        propEls = dbRefEl.findall("property")
        emblProtId = "na"
        id = None
        for propEl in propEls:
            propType = propEl.attrib["type"]
            propDb = db
            if (db, propType) == ("RefSeq", "nucleotide sequence ID"):
                id = propEl.attrib["value"]
                propDb = "refseqNucl"
            elif db=="HGNC" and propType=="gene designation":
                id = propEl.attrib["value"]
                propDb = "hgncGene"
            elif db=="Ensembl" and propType=="gene ID":
                id = propEl.attrib["value"]
                propDb = "ensemblGene"
            elif db=="Ensembl" and propType=="protein sequence ID":
                id = propEl.attrib["value"]
                propDb = "ensemblProt"
            elif db=="EMBL" and propType=="protein sequence ID":
                emblProtId = propEl.attrib["value"]
                continue # don't add yet
            elif db=="MIM" and propType=="type":
                omimCat = propEl.attrib["value"]
                if omimCat=="phenotype":
                    dbRefs["omimPhenotype"].add(mainId)
                elif omimCat=="gene" or omimCat=="gene+phenotype":
                    dbRefs["omimGene"].add(mainId)
                else:
                    assert(False)
            elif db=="EMBL" and propType=="molecule type":
                val = propEl.attrib["value"]
                if val=="mRNA":
                    # add now
                    dbRefs["emblMrna"].append(emblId)
                    dbRefs["emblMrnaProt"].append(emblProtId)
                else:
                    dbRefs["emblDna"].append(emblId)
                    dbRefs["emblDnaProt"].append(emblProtId)
                continue # don't add any id
            else:
                id = dbRefEl.attrib["id"]

            if id!=None:
                dbRefs[propDb].add(id)

    result = {}
    for db, valList in dbRefs.iteritems():
        result[db] = "|".join(valList)
    logging.debug("dbRefs: %s" % result)
    return result

def splitAndResolve(disName, disCodes, splitWord):
    " split at splitWord, try to resolve via disCodes and rejoin again "
    subDises = disName.split(splitWord)
    newDises = []
    for subDis in subDises:
        subDis = subDis.strip()
        if subDis in disCodes:
            newDises.append(disCodes[subDis])
        else:
            newDises.append(subDis)
    disName = ",".join(newDises)
    return disName

def parseFeatDesc(text, disToName):
    """ parse the description of a feature to find code and name of disease, snpId and comments
    return tuple: (disease code, disease name, dbSnpId, otherComments)
    >>> parseFeatDesc("In sporadic cancers; somatic mutation; dbSNP:rs11540654.", {})
    ('sporadic cancers', 'sporadic cancers', 'rs11540654', 'somatic mutation')
    >>> parseFeatDesc("In RIEG1; pointless comment", {"RIEG1" : "Axel-Riegerfeldt syndrome"})
    ('RIEG1', 'Axel-Riegerfeldt syndrome', '', 'pointless comment')
    """
    # find disease name and try to resolve via disToNames
    logging.debug("Feature description: %s " % (text))
    text = text.strip(".").strip()
    parts = text.split("; ")
    disCode = ""
    comments = []
    for part in parts:
        part = part.replace("a patient with", "")
        part = part.replace("in a ", "in ")
        partLow = part.lower()
        if partLow.startswith("in ") and "dbSNP" not in part and "allele" not in part:
            disCode = " ".join(part.split()[1:])
            # some entries contain two disease names
        else:
            if "dbSNP" not in part:
                comments.append(part)

    # we got a plain disease code
    if disCode in disToName:
        disLongName = disToName[disCode]
    # or two dis codes with and
    elif " and " in disCode:
        disLongName = splitAndResolve(disCode, disToName, " and ")
    else:
        # there are dis code somewhere inside the text
        intDisCodes = findDisCodes(disCode, disToName)
        if len(intDisCodes)!=0:
            disLongName = disCode
            disCode = ",".join(intDisCodes)
        # ok nothing worked, keep it as it is
        else:
            disLongName = disCode

    # find snpId
    snpId = ""
    for m in re.finditer("dbSNP:(rs[0-9]+)", text):
        if m!=None:
            #assert(snpId=="")
            snpId = m.group(1)

    logging.debug("Disease: %s, snpId: %s" % (disLongName, snpId))
    return disCode, disLongName, snpId, "; ".join(comments)

ignoredTypes = collections.Counter()

def parseFeatures(entryEl, disRefs, defaultDisCodes, disToName, evidPmids, mainIsoAcc):
    " go over features and yield mutation records "
    acc = entryEl.find("accession").text
    mutations = []
    for featEl in entryEl.findall("feature"):
        featType = featEl.attrib["type"]
        if featType not in featTypes:
            ignoredTypes[featType] += 1
            continue
        if featType in ["sequence variant"]:
            isVariant = True
        else:
            isVariant = False
        shortFeatType = featTypes[featType]
        logging.debug("type: %s" % featType)
        varId = featEl.attrib.get("id", "")
        logging.debug("Variant ID %s" % varId)

        origEl = featEl.find("original")
        if origEl==None:
            #logging.debug("No original residue")
            #continue
            orig = ""
        else:
            orig = origEl.text

        varEl = featEl.find("variation")
        if varEl==None:
            variant = ""
        else:
            variant = varEl.text

        logging.debug("residue change: %s->%s" % (orig, variant))

        posEl = featEl.find("location/position")
        if posEl!=None:
            begin = posEl.attrib["position"]
            end = str(int(begin)+1)
        else:
            beginEl = featEl.find("location/begin")
            begin = beginEl.attrib.get("position", None)
            if begin==None:
                logging.debug("Unknown start, skipping a feature")
                continue
            endEl = featEl.find("location/end")
            end = endEl.attrib.get("position", None)
            if end==None:
                logging.debug("Unknown end, skipping a feature")
                continue
            end = str(int(end)+1)

        desc = featEl.attrib.get("description", None)
        if desc==None:
            #logging.debug("No description")
            #continue
            desc = ""

        if "sulfinic" in desc:
            shortFeatType = "sulfo"
        descWords = desc.split()
        if len(descWords)>0:
            desc1 = descWords[0].lower()
            if "phos" in desc1:
                shortFeatType = "phos"
            elif "acetyl" in desc1:
                shortFeatType = "acetyl"
            elif "methyl" in desc1:
                shortFeatType = "methyl"
            elif "lipo" in desc1:
                shortFeatType = "lipo"
            elif "hydroxy" in desc1:
                shortFeatType = "hydroxy"
            elif "nitro" in desc1:
                shortFeatType = "nitro"

        evidStr = featEl.attrib.get("evidence", "")
        logging.debug("variant pos %s-%s, desc %s, evidence %s" % (begin, end, desc, evidStr))
        desc = desc.strip("() ")
        #if desc=="":
            #logging.debug("No description")
            #continue
        #if evidStr==None:
            #logging.debug("No evidence")
            #continue
        evidList = evidStr.split()

        if isVariant:
            # only do this for natural variants
            disCode, disName, snpId, comments = parseFeatDesc(desc, disToName)
            # if no disease annotated to feature, use the one from the record
            if disCode=="" and len(defaultDisCodes)==1:
                disCode = list(defaultDisCodes)[0]
                disName = disToName.get(disCode, disCode)+" (not annotated on variant but on gene record)"
                disCode = disCode + "?"
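                # the trailing "?" marks a code that was inherited from the record-level
                # disease comment rather than annotated on the variant itself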
        else:
            disCode, disName, snpId, comments = "", "", "", desc

        varPmids = []
        if disCode!="":
            diseaseRelated = "disRelated"
        else:
            diseaseRelated = "noEvidence"

        for evidId in evidList:
            if evidId in disRefs:
                diseaseRelated="disRelated"
            else:
                diseaseRelated="notDisRelated"
                logging.debug("evidence is not a disease evidence or blacklisted, check description")
            pmids = evidPmids.get(evidId, [])
            assert(len(pmids)<=1)
            if len(pmids)>0:
                pmid = list(pmids)[0]
                varPmids.append(pmid)

        if len(varPmids)==1 and len(set(varPmids).intersection(pmidBlackList))==len(varPmids):
            logging.debug("only blacklist pmids, skipping feature")

        mut = MutRec(acc, mainIsoAcc, varId, featType, shortFeatType, begin, end, orig, variant, snpId, diseaseRelated, disName, disCode, ",".join(varPmids), comments)
        logging.debug("Accepted variant: %s" % str(mut))

        # rewrite disulfide bonds to two separate features, one for each cysteine involved
        if featType=="disulfide bond":
            end = mut.end
            comment = mut.comment
            if comment!="":
                comment += "; "
            newComment = comment + "disulfide bond to position %s" % str(int(mut.end)-1)
            mut1 = mut._replace(end=str(int(mut.begin)+1), comment=newComment)
            yield mut1
            newComment = comment + "disulfide bond to position %s" % mut.begin
            mut2 = mut._replace(begin=str(int(mut.end)-1), comment=newComment)
            yield mut2
        else:
            yield mut

def parseEvidence(entryEl):
    " return a dict with evidCode -> PMID "
    result = {}
    for evidEl in entryEl.findall("evidence"):
        evidCode = evidEl.attrib["key"]
        for dbRefEl in evidEl.findall("source/dbReference"):
            dbType = dbRefEl.attrib["type"]
            if dbType=="PubMed":
                pmid = dbRefEl.attrib["id"]
                #if pmid in pmidBlackList:
                    #continue
                result.setdefault(evidCode, [])
                result[evidCode].append(pmid)
    return result

def parseVariants(entryEl, mainIsoAcc, disToName):
    " return MutRecs with disease associated variants "
    # parse the general record comment about diseases
    disRefs, allDiseaseCodes = parseDiseaseComment(entryEl, disToName)

    #if len(disRefs)==0:
        #logging.debug("No disease evidence")
        #return []

    acc = entryEl.find("accession").text
    logging.debug("Diseases in %s" % acc)
    evidPmids = parseEvidence(entryEl)
    mutRecs = list(parseFeatures(entryEl, disRefs, allDiseaseCodes, disToName, evidPmids, mainIsoAcc))
    return mutRecs

def parseRecInfo(entryEl, entry, isoSeqs):
    """parse uniprot general record info into entry dict
    use isoform sequences from isoSeqs
    only process certain taxonIds
    """
    dataset = entryEl.attrib["dataset"]
    entry["dataset"] = dataset
    findSaveList(entryEl, "name", entry, "name")
    findSaveList(entryEl, "accession", entry, "accList")
    entry["acc"] = entry["accList"].split("|")[0]
    logging.debug("Parsing rec info for acc %s" % entry["acc"])

    findSaveList(entryEl, "protein/recommendedName/fullName", entry, "protFullNames")
    findSaveList(entryEl, "protein/recommendedName/shortName", entry, "protShortNames")
    findSaveList(entryEl, "protein/alternativeName/fullName", entry, "protAltFullNames")
    findSaveList(entryEl, "protein/alternativeName/shortName", entry, "protAltShortNames")
    findSaveList(entryEl, "gene/name", entry, "geneName", attribKey="type", attribVal="primary")
    findSaveList(entryEl, "gene/name", entry, "geneSynonyms", attribKey="type", attribVal="synonym")
    findSaveList(entryEl, "gene/name", entry, "geneOrdLocus", attribKey="type", attribVal="ordered locus")
    findSaveList(entryEl, "gene/name", entry, "geneOrf", attribKey="type", attribVal="ORF")
    findSaveList(entryEl, "organism/name", entry, "orgName", attribKey="type", attribVal="scientific")
    findSaveList(entryEl, "organism/name", entry, "orgCommon", attribKey="type", attribVal="common")
    findSaveList(entryEl, "organism/dbReference", entry, "taxonId", useAttrib="id")
    findSaveList(entryEl, "comment/isoform/id", entry, "isoIds")
    findSaveList(entryEl, "comment/isoform/name", entry, "isoNames")
    findSaveList(entryEl, "comment/subcellularLocation/location", entry, "subCellLoc")
    findSaveList(entryEl, "comment", entry, "functionText", attribKey="type", attribVal="function", subSubEl="text")

    mainSeq = entryEl.find("sequence").text
    entry["mainSeq"] = mainSeq
    isoIds, isoNames, seqs = parseIsoforms(entry["acc"], mainSeq, entryEl, isoSeqs)

    dbRefs = parseDbRefs(entryEl)
    entry["mainIsoAcc"] = isoIds[0]

    entry["hgncSym"] = dbRefs.get("hgncGene", "")
    entry["hgncId"] = dbRefs.get("HGNC", "")
    entry["refSeq"] = dbRefs.get("refseqNucl", "")
    entry["refSeqProt"] = dbRefs.get("RefSeq", "")
    entry["ensemblProt"] = dbRefs.get("ensemblProt", "")
    entry["ensemblGene"] = dbRefs.get("ensemblGene", "")
    entry["entrezGene"] = dbRefs.get("GeneID", "")
    entry["kegg"] = dbRefs.get("KEGG", "")
    entry["uniGene"] = dbRefs.get("UniGene", "")
    entry["omimGene"] = dbRefs.get("omimGene", "")
    entry["omimPhenotype"] = dbRefs.get("omimPhenotype", "")
    entry["emblMrna"] = dbRefs.get("emblMrna", "") # mrnas
    entry["emblMrnaProt"] = dbRefs.get("emblMrnaProt", "") # the protein accessions for mrnas
    entry["emblDna"] = dbRefs.get("emblDna", "") # anything not an mrna
    entry["emblDnaProt"] = dbRefs.get("emblDnaProt", "") # protein accessions for non-mrnas
    entry["pdb"] = dbRefs.get("PDB", "")
    entry["ec"] = dbRefs.get("EC", "")

    entry["isoIds"]="|".join(isoIds)
    entry["isoSeqs"]="|".join(seqs)
    entry["isoNames"]="|".join(isoNames)

    entryRow = EntryRec(**entry)
    return entryRow

def parseRefInfo(entryEl, recName):
    for refEl in entryEl.findall("reference"):
        ref = copy.copy(emptyRef)
        ref["name"] = recName
        citEl = refEl.find("citation")
        ref["citType"] = citEl.attrib["type"]
        year = citEl.attrib.get("date", "")
        ref["year"] = year.split("-")[0]
        ref["journal"] = citEl.attrib.get("name", "")
        if ref["journal"]=="":
            ref["journal"] = citEl.attrib.get("db", "") # for submissions
        ref["vol"] = citEl.attrib.get("volume", "")
        ref["page"] = citEl.attrib.get("first", "")
        for titleEl in citEl.findall("title"):
            ref["title"] = titleEl.text
        authorList = []
        for personEl in citEl.findall("authorList/person"):
            if "name" in personEl.attrib:
                name = personEl.attrib["name"]
                name = name.replace(" ", ",", 1)
                authorList.append(name)
        ref["authors"]=";".join(authorList)
        for dbRefEl in citEl.findall("dbReference"):
            if "type" in dbRefEl.attrib:
                if dbRefEl.attrib["type"]=="DOI":
                    ref["doi"] = dbRefEl.attrib["id"]
                if dbRefEl.attrib["type"]=="PubMed":
                    ref["pmid"] = dbRefEl.attrib["id"]
        findSaveList(refEl, "scope", ref, "scopeList")
        refRow = RefRec(**ref)
        yield refRow

def readIsoforms(inDir):
    " return all isoform sequences as dict isoName (eg. P48347-2) -> sequence "
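    # IDs in the varsplic FASTA are assumed to be pipe-separated (e.g. sp|P48347-2|<entryName> ...):
    # the second field, the isoform accession, becomes the dict key; the third is counted as the entry name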
    isoFname = join(inDir, "uniprot_sprot_varsplic.fasta.gz")
    isoFile = gzip.open(isoFname)
    logging.info("reading isoform sequences from %s" % isoFname)
    isoSeqs = parseFastaAsDict(isoFile)
    result = {}
    seqNames = []
    for id, seq in isoSeqs.iteritems():
        idParts = id.split("|")
        isoName = idParts[1]
        result[isoName] = seq
        seqNames.append(idParts[2].split()[0])
    logging.info("Found %d isoform sequences" % len(result))
    return result, len(set(seqNames))

def writeFaSeqs(entry, faFiles, allVariants=False):
    """ write main sequence to faFile with the right taxonId
    base sequence always has accession as ID
    """
    #seqIds = entry.isoIds.split("|")
    if allVariants:
        seqIds = entry.isoIds.split("|")
        seqs = entry.isoSeqs.split("|")
    else:
        seqIds = [entry.acc]
        seqs = [entry.isoSeqs.split("|")[0]]
    taxonId = entry.taxonId
    if "all" in faFiles:
        ofh = faFiles["all"]
    else:
        ofh = faFiles[taxonId]

    #for seqId, seq in zip(seqIds, seqs):
        #ofh.write(">%s\n%s\n" % (seqId, seq))
    c = 0
    for seqId, seq in zip(seqIds, seqs):
        # this was to make sure that the first variant has the uniprot ID
        # sounds like not such a good idea but maybe necessary for the
        # uniprot lifter file?
        #if c==0 and allVariants:
            #seqId = entry.acc
        ofh.write(">%s\n%s\n" % (seqId.strip(), seq.strip()))
        c+=1

def openFaFiles(taxonIds, outDir, outPrefix, seqType="base"):
    faFiles = {}
    if taxonIds == None:
        taxonIds = ["all"]
    for taxonId in taxonIds:
        taxonId = str(taxonId)
        seqQual = ""
        if seqType!="base":
            seqQual = "."+seqType
        faFname = join(outDir, outPrefix+"."+taxonId+seqQual+".fa.gz")
        faFiles[taxonId] = gzip.open(faFname, "w")
        logging.info("Writing fasta seqs for taxon %s to %s (seqType: %s)" % (taxonId, faFname, seqType))
    return faFiles

def parseUniprot(db, inDir, outDir, taxonIds):
    " parse uniprot, write records and refs to outdir "
    if options.parse:
        fname = options.parse
        logging.info("Debug parse of %s" % fname)
        xmlFile = open(fname)
        isoSeqs, recCount = {}, 1
        outDir = "."
        outPrefix = "temp"
    else:
        isoSeqs, recCount = readIsoforms(inDir)
        if db=="uniprot":
            xmlBase = "uniprot_sprot.xml.gz"
            outPrefix = "uniprot"
            recCount = 500000
        elif db=="trembl":
            xmlBase = "uniprot_trembl.xml.gz"
            outPrefix = "uniprotTrembl"
            recCount = 500000*37
        else:
            raise Exception("unknown db")
        xmlFile = gzip.open(join(inDir, xmlBase))

    logging.info("Parsing main XML file %s" % xmlFile.name)

    # create a dict taxonId -> output file handles for record info, pmid reference info and mutation info
    outFhs = {}
    for taxId in taxonIds:
        entryOf = openOutTabFile(outDir, "%s.%s.tab" % (outPrefix, taxId), entryHeaders)
        refOf = openOutTabFile(outDir, "%s.%s.refs.tab" % (outPrefix, taxId), refHeaders)
        mutOf = openOutTabFile(outDir, "%s.%s.annots.tab" % (outPrefix, taxId), mutHeaders)
        outFhs[taxId] = (entryOf, refOf, mutOf)

    disToName = parseDiseases(join(inDir, "humdisease.txt"))

    # base and variant sequence filehandles
    faFiles = openFaFiles(taxonIds, outDir, outPrefix)
    varFaFiles = openFaFiles(taxonIds, outDir, outPrefix, "var")

    emptyEntry = dict(zip(entryHeaders, len(entryHeaders)*[""]))

    pm = ProgressMeter(recCount)
    #for _, entryEl in etree.iterparse(xmlFile.name, tag='{http://uniprot.org/uniprot}entry'):
    for _, entryEl in etree.iterparse(xmlFile):
        if entryEl.tag!="{http://uniprot.org/uniprot}entry":
            continue
        strip_namespace_inplace(entryEl) # die, die stupid namespaces!!
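        # iterparse fires an 'end' event for every element, so the tag test above keeps
        # only complete <entry> elements; with the namespace stripped, the find() calls
        # in the parse functions below can use short paths like "accession"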
        entry = copy.copy(emptyEntry)
        pm.taskCompleted()

        entryTax = int(entryEl.find("organism/dbReference").attrib["id"])
        if taxonIds==['all']:
            taxId = "all"
        else:
            if entryTax not in taxonIds:
                continue
            taxId = entryTax

        entryOf, refOf, mutOf = outFhs[taxId]

        entryRow = parseRecInfo(entryEl, entry, isoSeqs)
        writeFaSeqs(entryRow, faFiles)
        writeFaSeqs(entryRow, varFaFiles, allVariants=True)

        entryOf.write("\t".join(entryRow)+"\n")

        recName = entryRow.name
        refRows = list(parseRefInfo(entryEl, recName))
        for refRow in refRows:
            refOf.write("\t".join(refRow)+"\n")

        mutRecs = parseVariants(entryEl, entryRow.mainIsoAcc, disToName)
        for mutRow in mutRecs:
            logging.debug("writing row %s" % str(mutRow))
            mutOf.write("\t".join(mutRow)+"\n")

        entryEl.clear()

    logging.info("Skipped annotation types: %s" % ignoredTypes.most_common())

def main(args, options):
    #logFname = join(outDir, "dbParse.log")
    if options.test:
        import doctest
        doctest.testmod()
        sys.exit(0)

    if len(args)!=3:
        raise Exception("Invalid command line. Show help with -h")

    setupLogging("pubParseDb", options)

    db = args[0]
    db = options.dbType
    dbDir = args[0]
    taxonIds = args[1]
    if taxonIds=="all":
        taxonIds = ['all']
    else:
        taxonIds=[int(x) for x in taxonIds.split(",")]
    refDir = args[2]

    if not isdir(refDir):
        logging.info("Making directory %s" % refDir)
        os.makedirs(refDir)

    parseUniprot(db, dbDir, refDir, taxonIds)

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] uniprotFtpDir taxonIds outDir - Convert UniProt to tab-sep files

taxonIds can be "all"

To download uniProt, this command is a good idea:
lftp ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/ -e "mirror --use-pget-n=10 --exclude-glob *.dat.gz -P 5"

This parser:
- goes over the disease comment evidences and tries to classify annotations as disease-related or not.
- resolves disease codes to full disease names
- blacklists some PMIDs, annotations from these are skipped

Example:
%prog /hive/data/outside/uniProtCurrent/ 9606 uniprotTab/
""")

parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("", "--test", dest="test", action="store_true", help="run tests")
parser.add_option("-p", "--parse", dest="parse", action="store", help="parse a single uniprot xml file (debugging)")
parser.add_option("-t", "--dbType", dest="dbType", action="store", help="one of 'uniprot' or 'trembl', default %default", default="uniprot")
(options, args) = parser.parse_args()

if args==[] and not options.test:
    parser.print_help()
    exit(1)

main(args, options)
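
# A minimal sketch of how the tables written by this script could be consumed downstream
# (illustration only, not part of the converter; the file name assumes a human run with
# the default "uniprot" prefix and "uniprotTab/" as the output directory, as in the
# usage example above):
#
#   import csv
#   for row in csv.DictReader(open("uniprotTab/uniprot.9606.tab"), delimiter="\t"):
#       print row["acc"], row["hgncSym"], row["omimPhenotype"]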