fe2357d86f58b3f8bc2855a0283ecb0bc455ae0c max Thu Oct 19 15:49:30 2017 -0700 changing comment in uniprot converter, email from Angie, no redmine diff --git src/utils/uniprotToTab src/utils/uniprotToTab index 7782afa..47d06f7 100755 --- src/utils/uniprotToTab +++ src/utils/uniprotToTab @@ -1,1095 +1,1095 @@ #!/usr/bin/env python # load default python packages import logging, optparse, sys, glob, gzip, collections, copy, gzip, os, doctest, re from os.path import * from collections import defaultdict try: from lxml import etree # if this fails, comment out this line and uncomment the next one. Or do the 'pip install' below. #import xml.etree.cElementTree as etree # if using this line, search for cElementTree in this file and comment out the other part except: raise Exception("lxml library not found. install elementtree with 'sudo apt-get install libxml2-dev libxslt-dev python-dev; pip install lxml' or just 'apt-get install python-lxml'. Or read the source code to get rid of the dependency.") debugMode=False # --- FASTA FILES --- class FastaReader: """ a class to parse a fasta file Example: fr = FastaReader(filename) for (id, seq) in fr.parse(): print id,seq """ def __init__(self, fname): if hasattr(fname, 'read'): self.f = fname elif fname=="stdin": self.f=sys.stdin elif fname.endswith(".gz"): self.f=gzip.open(fname) else: self.f=open(fname) self.lastId=None def parse(self): """ Generator: returns sequences as tuple (id, sequence) """ lines = [] for line in self.f: if line.startswith("\n") or line.startswith("#"): continue elif not line.startswith(">"): lines.append(line.replace(" ","").strip()) continue else: if len(lines)!=0: # on first >, seq is empty faseq = (self.lastId, "".join(lines)) self.lastId=line.strip(">").strip() lines = [] yield faseq else: if self.lastId!=None: sys.stderr.write("warning: when reading fasta file: empty sequence, id: %s\n" % line) self.lastId=line.strip(">").strip() lines=[] # if it's the last sequence in a file, loop will end on the last 
class ProgressMeter:
    """ prints a progress message ("x% ") to stderr, one message for every
    taskCount/stepCount calls of taskCompleted()

    taskCount - expected total number of tasks (an estimate is fine)
    stepCount - how many progress messages to print over the whole run
    quiet     - suppress all output for tiny runs (taskCount<=5)
    """
    def __init__(self, taskCount, stepCount=20, quiet=False):
        self.taskCount = taskCount
        self.stepCount = stepCount
        # floor division: identical to the old py2 "/" on ints, but also
        # correct under python3 (plain "/" would yield a float there)
        self.tasksPerMsg = taskCount // stepCount
        self.i = 0      # number of completed tasks registered so far
        self.quiet = quiet

    def taskCompleted(self, count=1):
        " register `count` finished tasks; print 'NN% ' on step boundaries "
        if self.quiet and self.taskCount <= 5:
            return
        if self.tasksPerMsg != 0 and self.i % self.tasksPerMsg == 0:
            # floor division keeps donePercent an int on py2 and py3
            donePercent = (self.i * 100) // self.taskCount
            sys.stderr.write("%.2d%% " % donePercent)
            sys.stderr.flush()
        self.i += count
        if self.i == self.taskCount:
            # finish the progress line (parenthesized print: valid py2+py3)
            print("")
sys.stderr console = logging.StreamHandler() # set a format which is simpler for console use formatter = logging.Formatter('%(levelname)-8s-%(message)s') # tell the handler to use this format console.setFormatter(formatter) console.setLevel(stdoutLevel) # make sure that the root logger gets verbose messages logging.getLogger('').setLevel(min(stdoutLevel, fileLevel)) # add the handler to the root logger rootLog.addHandler(console) # UNIPROT PARSING # only parse these feature types # anything else triggers a warning at the end of the parse # As of 2017, these are all types of annotations featTypes = { "splice variant" : "splicing", "sequence variant": "variant", "sequence conflict": "conflict", "mutagenesis site": "mutagen", "modified residue": "modif", "cross-link": "cross-link", "region of interest": "interest", "short sequence motif": "motif", "metal ion-binding site": "ion-binding", "site": "site", "topological domain" : "topo", "transmembrane region" : "transmem", "disulfide bond" : "disulf bond", "glycosylation site" : "glyco", "binding site" : "bind", "active site" : "enzyme act site", "signal peptide" : "signal pep", "transit peptide" : "trans pep", "calcium-binding region" : "calcium bind", "lipid moiety-binding region" : "lipid", "propeptide" : "propep", "intramembrane region" : "intramem", "peptide" : "peptide", "nucleotide phosphate-binding region" : "nucl phos bind", "helix" : "helix", "chain" : "chain", "coiled-coil region" : "coiled-coil", "turn" : "turn", "strand" : "beta", "domain" : "domain", "zinc finger region" : "zinc finger", "repeat" : "repeat", "compositionally biased region" : "biased", "initiator methionine" : "init Met", "non-standard amino acid" : "non-std", "non-consecutive residues" : "non-consec", "unsure residue" : "unsure", "DNA-binding region" : "DNA-binding", "non-terminal residue" : "nonTerm" } # main record info entryHeaders = ["dataset", "acc", "mainIsoAcc", "orgName", "orgCommon", "taxonId", "name", "accList", \ "protFullNames", 
"protShortNames", "protAltFullNames", "protAltShortNames", \ "geneName", "geneSynonyms", "isoNames", \ "geneOrdLocus", "geneOrf", \ "hgncSym", "hgncId", "refSeq", "refSeqProt", "entrezGene", "ensemblGene", "ensemblProt", "ensemblTrans", \ "kegg", "emblMrna", "emblMrnaProt", "emblDna", "emblDnaProt", \ "pdb", "ec", \ "uniGene", "omimGene", "omimPhenotype", "subCellLoc", "functionText", "isoIds"] EntryRec = collections.namedtuple("uprec", entryHeaders) # all annotations get parsed into this format annotHeaders = ["acc", "mainIsoAcc", "varId", "featType", "shortFeatType", "begin", "end", "origAa", "mutAa", "dbSnpId", "disRelated", "disease", "disCode", "pmid", "comment"] AnnotRec = collections.namedtuple("mutrec", annotHeaders) # references from record refHeaders = ["name", "citType", "year", "journal", "vol", "page", \ "title", "authors", "doi", "pmid", "scopeList"] RefRec = collections.namedtuple("refRec", refHeaders) emptyRef = dict(zip(refHeaders, len(refHeaders)*[""])) def strip_namespace_inplace(etree, namespace=None,remove_from_attr=True): """ Takes a parsed ET structure and does an in-place removal of all namespaces, or removes a specific namespacem (by its URL). Can make node searches simpler in structures with unpredictable namespaces and in content given to be non-mixed. By default does so for node names as well as attribute names. (doesn't remove the namespace definitions, but apparently ElementTree serialization omits any that are unused) Note that for attributes that are unique only because of namespace, this may attributes to be overwritten. For example: <e p:at="bar" at="quu"> would become: <e at="bar"> I don't think I've seen any XML where this matters, though. 
""" if namespace==None: # all namespaces for elem in etree.getiterator(): tagname = elem.tag if not isinstance(elem.tag, basestring): continue if tagname[0]=='{': elem.tag = tagname[ tagname.index('}',1)+1:] if remove_from_attr: to_delete=[] to_set={} for attr_name in elem.attrib: if attr_name[0]=='{': old_val = elem.attrib[attr_name] to_delete.append(attr_name) attr_name = attr_name[attr_name.index('}',1)+1:] to_set[attr_name] = old_val for key in to_delete: elem.attrib.pop(key) elem.attrib.update(to_set) else: # asked to remove specific namespace. ns = '{%s}' % namespace nsl = len(ns) for elem in etree.getiterator(): if elem.tag.startswith(ns): elem.tag = elem.tag[nsl:] if remove_from_attr: to_delete=[] to_set={} for attr_name in elem.attrib: if attr_name.startswith(ns): old_val = elem.attrib[attr_name] to_delete.append(attr_name) attr_name = attr_name[nsl:] to_set[attr_name] = old_val for key in to_delete: elem.attrib.pop(key) elem.attrib.update(to_set) def parseDiseases(fname): " parse the file humanDiseases.txt from uniprot to resolve disease IDs to disease names " logging.info("Parsing %s" % fname) dis = {} for line in open(fname).read().splitlines(): if line.startswith("ID"): name = line[5:].strip(".") if line.startswith("AR"): code = line[5:].strip(".") dis[code]=name logging.info("read %d disease code -> disease name mappings" % len(dis)) return dis def findSaveList(el, path, dataDict, key, attribKey=None, attribVal=None, useAttrib=None, subSubEl=None): """ find all text of subelemets matching path with given optionally attrib and save into dataDict with key You can specify a subSubEl of the element to get the text from. 
""" l = [] for se in el.findall(path): if attribKey!=None and se.attrib.get(attribKey, None)!=attribVal: continue if useAttrib: val = se.attrib[useAttrib] else: if subSubEl: val = se.find(subSubEl).text else: val = se.text l.append(val) s = "|".join(l) dataDict[key] = s def openOutTabFile(subDir, outName, headers): " create outdir and open outfile, write headers " #subDir = join(outDir, outSubDir) if not isdir(subDir): logging.info("Creating dir %s" % subDir) os.makedirs(subDir) outPath = join(subDir, outName) logging.debug("Writing output to %s" % outPath) ofh = open(outPath, "w") ofh.write("\t".join(headers)+"\n") return ofh def findDisCodes(text, disToName): """ find disease codes in text, return as a set of disease codes >>> findDiseases("Defects in HAL are the cause of histidinemia (HISTID) ") set(['HISTID']) """ disSet = set() for m in re.finditer("[(]([a-zA-Z0-9- ]+)[)]", text): word = m.group(1) if word in disToName: disSet.add(word) return disSet # original code tried to guess the acronyms. # these days, UniProt provides a file with most of the acronyms # leaving it here in case that UniProt ever decides to stop updating their acronyms #def findDiseases(text): # """ find disease codes and their names in text, return as dict code -> name # >>> findDiseases("Defects in CEACAM16 are the cause of deafness autosomal dominant type 4B (DFNA4B) [MIM:614614].") # {'DFNA4B': 'deafness autosomal dominant type 4B'} # >>> findDiseases("Defects in ALX4 are the cause of parietal foramina 2 (PFM2) [MIM:609597]; also known as foramina parietalia permagna (FPP). PFM2 is an autosomal dominant disease characterized by oval defects of the parietal bones caused by deficient ossification around the parietal notch, which is normally obliterated during the fifth fetal month. 
PFM2 is also a clinical feature of Potocki-Shaffer syndrome.") # {'PFM2': 'parietal foramina 2', 'FPP': 'foramina parietalia permagna'} # # # disease is only one word, but long enough # >>> findDiseases("Defects in HAL are the cause of histidinemia (HISTID) ") # {'HISTID': 'histidinemia'} # """ # result = {} # phrases = re.split("[;.] ", text) # notDisease = set(["of", "with", "to", "as", "or", "also", "in"]) # # for phrase in phrases: # words = phrase.split() # revWords = reversed(words) # # grabWords = False # disWords = [] # disCode = None # # go backwords over words and look for acronym, then grab all words before that # # until we find a common English word # for word in revWords: # m = re.match("[(]([A-Z0-9-]+)[)]", word) # if m!=None: # disCode = m.group(1) # grabWords = True # continue # # if word in notDisease and (len(disWords)>1 or len("".join(disWords))>=9): # disName = " ".join(list(reversed(disWords))) # if disCode==None: # logging.debug("Found disease %s, but no code for it" % disName) # continue # result[disCode] = disName # disCode = None # disWords = [] # grabWords = False # # if grabWords: # disWords.append(word) # # return result #def parseDiseaseComment(entryEl): # """ return two dicts # one with evidence code -> disease code # one with disease code -> disease name # """ # disRefs = {} # disCodes = {} # for commentEl in entryEl.findall("comment"): # textEl = commentEl.find("text") # if commentEl.attrib["type"]=="disease": # refStr = commentEl.attrib.get("evidence", None) # # website xml is different, has evidence attribute on text element # if refStr==None: # refStr = textEl.attrib.get("evidence", None) # if refStr==None: # continue # # refs = refStr.split(" ") # # text = textEl.text # logging.debug("Disease comment: %s, evidence %s" % (text, refStr)) # disCodes.update(findDiseases(text)) # # for refId in refs: # disRefs[refId] = disCodes # # logging.debug("Found disease evidences: %s" % disRefs) # logging.debug("Found disease names: %s" % 
disCodes) # return disRefs, disCodes def parseDiseaseComment(entryEl, disToName): """ parse the general comments, disease section from up record return evidence codes that refer to diseases also return disease codes """ disRefs = {} disCodes = set() for commentEl in entryEl.findall("comment"): textEl = commentEl.find("text") if commentEl.attrib["type"]=="disease": refStr = commentEl.attrib.get("evidence", None) # website xml is different, has evidence attribute on text element if refStr==None: refStr = textEl.attrib.get("evidence", None) if refStr==None: continue refs = refStr.split(" ") text = textEl.text logging.debug("Disease comment: %s, evidence %s" % (text, refStr)) disCodes.update(findDisCodes(text, disToName)) for refId in refs: disRefs[refId] = disCodes logging.debug("Found disease evidences: %s" % disRefs) return disRefs, disCodes def parseIsoforms(entryEl, mainId): """ parse sequences of isoforms, returns lists: isoIds, isoNames, dispId dispId is mainId<spc>isoId for records with isoforms """ isoDefined = False isoIds = [] isoNames = [] mainIsoId = mainId for isoEl in entryEl.findall("comment/isoform"): isoDefined = True # get id idEl = isoEl.find("id") isoId = idEl.text # get names (aka transcript synonyms), currently a reassignment to transcripts is impossible from my files, I just collect them for now for nameEl in isoEl.find("name"): isoNames.append(nameEl.text) seqEl = isoEl.find("sequence") # get sequences seqType = seqEl.attrib["type"] if seqType=="displayed": # the main sequence also is an "isoform", it's very strange # one of the isoform sequences is "displayed" = main isoform # we have already covered these mainIsoId = isoId elif seqType=="described": isoIds.append(isoId) else: assert(seqType in ["external", "not described"]) # weird Uniprot: they refer to sequences that they do not have #assert(len(seqs)==len(isoNames)) # will often be different, one transcript can have many names return isoIds, isoNames, mainIsoId def parseDbRefs(entryEl): " 
return dict with db -> id (various special cases) " dbRefs = defaultdict(set) dbRefs["emblMrna"] = [] dbRefs["emblMrnaProt"] = [] dbRefs["emblDna"] = [] dbRefs["emblDnaProt"] = [] for dbRefEl in entryEl.findall("dbReference"): db = dbRefEl.attrib["type"] mainId = dbRefEl.attrib["id"] if db=="EMBL": # special case, don't add yet emblId = mainId else: dbRefs[db].add(mainId) propEls = dbRefEl.findall("property") emblProtId = "na" id = None for propEl in propEls: propType = propEl.attrib["type"] propDb = db if (db, propType) ==("RefSeq", "nucleotide sequence ID"): id = propEl.attrib["value"] propDb = "refseqNucl" elif db=="HGNC" and propType=="gene designation": id = propEl.attrib["value"] propDb = "hgncGene" elif db=="Ensembl" and propType=="gene ID": id = propEl.attrib["value"] propDb = "ensemblGene" elif db=="Ensembl" and propType=="protein sequence ID": id = propEl.attrib["value"] propDb = "ensemblProt" elif db=="EMBL" and propType=="protein sequence ID": emblProtId = propEl.attrib["value"] continue # don't add yet elif db=="MIM" and propType=="type": omimCat = propEl.attrib["value"] if omimCat=="phenotype": dbRefs["omimPhenotype"].add(mainId) elif omimCat=="gene" or omimCat=="gene+phenotype": dbRefs["omimGene"].add(mainId) else: assert(False) elif db=="EMBL" and propType=="molecule type": val = propEl.attrib["value"] if val=="mRNA": # add now dbRefs["emblMrna"].append(emblId) dbRefs["emblMrnaProt"].append(emblProtId) else: dbRefs["emblDna"].append(emblId) dbRefs["emblDnaProt"].append(emblProtId) continue # don't add any id else: id = dbRefEl.attrib["id"] if id!=None: dbRefs[propDb].add(id) result = {} for db, valList in dbRefs.iteritems(): result[db] = "|".join(valList) logging.debug("dbRefs: %s" % result) return result def splitAndResolve(disName, disCodes, splitWord): " split and split word, try to resolve via disCodes and rejoin again " subDises = disName.split(splitWord) newDises = [] for subDis in subDises: subDis = subDis.strip() if subDis in disCodes: 
def parseFeatDesc(text, disToName):
    """ parse the description of a feature to find code and name of the
    disease, the dbSNP id and any remaining comments.
    Returns a 4-tuple: (diseaseCode, diseaseName, dbSnpId, otherComments)

    >>> parseFeatDesc("In sporadic cancers; somatic mutation; dbSNP:rs11540654.", {})
    ('sporadic cancers', 'sporadic cancers', 'rs11540654', 'somatic mutation')
    >>> parseFeatDesc("In RIEG1; pointless comment", {"RIEG1" : "Axel-Riegerfeldt syndrome"})
    ('RIEG1', 'Axel-Riegerfeldt syndrome', '', 'pointless comment')
    """
    # find disease name and try to resolve via disToName
    logging.debug("Feature description: %s " % (text))
    text = text.strip(".").strip()
    parts = text.split("; ")
    disCode = ""
    comments = []
    for part in parts:
        part = part.replace("a patient with", "")
        part = part.replace("in a ", "in ")
        partLow = part.lower()
        if partLow.startswith("in ") and "dbSNP" not in part and "allele" not in part:
            # "In XXX" introduces the disease; everything after "In" is the code
            # some entries contain two disease names
            disCode = " ".join(part.split()[1:])
        else:
            if "dbSNP" not in part:
                comments.append(part)

    # we got a plain disease code
    if disCode in disToName:
        disLongName = disToName[disCode]
    # or two disease codes joined with "and"
    elif " and " in disCode:
        disLongName = splitAndResolve(disCode, disToName, " and ")
    else:
        # there may be disease codes somewhere inside the text
        intDisCodes = findDisCodes(disCode, disToName)
        if len(intDisCodes) != 0:
            disLongName = disCode
            disCode = ",".join(intDisCodes)
        # ok nothing worked, keep it as it is
        else:
            disLongName = disCode

    # find the dbSNP identifier, if any
    snpId = ""
    for m in re.finditer("dbSNP:(rs[0-9]+)", text):
        if m != None:
            snpId = m.group(1)

    logging.debug("Disease: %s, snpId: %s" % (disLongName, snpId))
    return disCode, disLongName, snpId, "; ".join(comments)
for featEl in entryEl.findall("feature"): featType = featEl.attrib["type"] if featType not in featTypes: ignoredTypes[featType] += 1 continue if featType in ["sequence variant"]: isVariant = True else: isVariant = False shortFeatType = featTypes[featType] logging.debug("type: %s" % featType) varId = featEl.attrib.get("id", "") logging.debug("Variant ID %s" % varId) origEl = featEl.find("original") if origEl==None: orig = "" else: orig = origEl.text varEl = featEl.find("variation") if varEl==None: variant = "" else: variant = varEl.text logging.debug("residue change: %s->%s" % (orig, variant)) posEl = featEl.find("location/position") # if features have only a single position, they have only <position position="xxx"> # we convert them to begin=xxx and end=xxx+1 if posEl!=None: begin = posEl.attrib["position"] end = str(int(begin)+1) else: beginEl = featEl.find("location/begin") begin = beginEl.attrib.get("position", None) if begin==None: logging.debug("Unknown start, skipping a feature") continue endEl = featEl.find("location/end") end = endEl.attrib.get("position", None) if end==None: logging.debug("Unknown end, skipping a feature") continue - end = str(int(end)+1) # UniProt is 1-based, closed-end + end = str(int(end)+1) # UniProt is 1-based, open-end desc = featEl.attrib.get("description", None) if desc==None: desc = "" if "sulfinic" in desc: shortFeatType = "sulfo" descWords = desc.split() if len(descWords)>0: desc1 = descWords[0].lower() if "phos" in desc1: shortFeatType = "phos" elif "acetyl" in desc1: shortFeatType = "acetyl" elif "methyl" in desc1: shortFeatType = "methyl" elif "lipo" in desc1: shortFeatType = "lipo" elif "hydroxy" in desc1: shortFeatType = "hydroxy" elif "nitro" in desc1: shortFeatType = "nitro" evidStr = featEl.attrib.get("evidence", "") logging.debug("annotation pos %s-%s, desc %s, evidence %s" % (begin, end, desc, evidStr)) desc = desc.strip("() ") evidList = evidStr.split() if isVariant: # only do this for mutations disCode, disName, 
def parseEvidence(entryEl):
    " return a dict with evidence key -> list of PMIDs (only keys that have PubMed refs) "
    keyToPmids = {}
    for evidEl in entryEl.findall("evidence"):
        key = evidEl.attrib["key"]
        # collect only the PubMed-typed dbReferences of this evidence element
        pubMedIds = [refEl.attrib["id"]
                     for refEl in evidEl.findall("source/dbReference")
                     if refEl.attrib["type"] == "PubMed"]
        if pubMedIds:
            keyToPmids.setdefault(key, []).extend(pubMedIds)
    return keyToPmids
dataset = entryEl.attrib["dataset"] entry["dataset"] = dataset findSaveList(entryEl, "name", entry, "name") findSaveList(entryEl, "accession", entry, "accList") acc = entry["accList"].split("|")[0] entry["acc"] = acc logging.debug("Parsing rec info for acc %s" % acc) findSaveList(entryEl, "protein/recommendedName/fullName", entry, "protFullNames") findSaveList(entryEl, "protein/recommendedName/shortName", entry, "protShortNames") findSaveList(entryEl, "protein/alternativeName/fullName", entry, "protAltFullNames") findSaveList(entryEl, "protein/alternativeName/shortName", entry, "protAltShortNames") findSaveList(entryEl, "gene/name", entry, "geneName", attribKey="type", attribVal="primary") findSaveList(entryEl, "gene/name", entry, "geneSynonyms", attribKey="type", attribVal="synonym") findSaveList(entryEl, "gene/name", entry, "geneOrdLocus", attribKey="type", attribVal="ordered locus") findSaveList(entryEl, "gene/name", entry, "geneOrf", attribKey="type", attribVal="ORF") findSaveList(entryEl, "organism/name", entry, "orgName", attribKey="type", attribVal="scientific") findSaveList(entryEl, "organism/name", entry, "orgCommon", attribKey="type", attribVal="common") findSaveList(entryEl, "organism/dbReference", entry, "taxonId", useAttrib="id") findSaveList(entryEl, "comment/isoform/id", entry, "isoIds") findSaveList(entryEl, "comment/isoform/name", entry, "isoNames") findSaveList(entryEl, "comment/subcellularLocation/location", entry, "subCellLoc") findSaveList(entryEl, "comment", entry, "functionText", attribKey="type", attribVal="function", subSubEl="text") mainSeq = entryEl.find("sequence").text #entry["mainSeq"] = mainSeq dbRefs = parseDbRefs(entryEl) isoIds, isoNames, mainIsoId = parseIsoforms(entryEl, acc) entry["mainIsoAcc"] = mainIsoId seqs = [] seqs.append( (mainIsoId+" isRefOf "+acc, mainSeq) ) for isoId in isoIds: if isoId not in isoSeqs: logging.warn("No sequence for isoform %s" % isoId) continue seqs.append( (isoId, isoSeqs[isoId]) ) entry["hgncSym"] = 
dbRefs.get("hgncGene", "") entry["hgncId"] = dbRefs.get("HGNC", "") entry["refSeq"] = dbRefs.get("refseqNucl", "") entry["refSeqProt"] = dbRefs.get("RefSeq", "") entry["ensemblProt"] = dbRefs.get("ensemblProt", "") entry["ensemblGene"] = dbRefs.get("ensemblGene", "") entry["ensemblTrans"] = dbRefs.get("Ensembl", "") entry["entrezGene"] = dbRefs.get("GeneID", "") entry["kegg"] = dbRefs.get("KEGG", "") entry["uniGene"] = dbRefs.get("UniGene", "") entry["omimGene"] = dbRefs.get("omimGene", "") entry["omimPhenotype"] = dbRefs.get("omimPhenotype", "") entry["emblMrna"] = dbRefs.get("emblMrna", "") # mrnas entry["emblMrnaProt"] = dbRefs.get("emblMrnaProt", "") # the protein accessions for mrnas entry["emblDna"] = dbRefs.get("EmblDna", "") # anything not an mrna entry["emblDnaProt"] = dbRefs.get("EmblDnaProt", "") # protein accessions for non-mrnas entry["pdb"] = dbRefs.get("PDB", "") entry["ec"] = dbRefs.get("EC", "") entry["isoIds"]="|".join(isoIds) #entry["isoSeqs"]="|".join(seqs) entry["isoNames"]="|".join(isoNames) entryRow = EntryRec(**entry) return entryRow, seqs def parseRefInfo(entryEl, recName): for refEl in entryEl.findall("reference"): ref = copy.copy(emptyRef) ref["name"] = recName citEl = refEl.find("citation") ref["citType"] = citEl.attrib["type"] year = citEl.attrib.get("date", "") ref["year"] = year.split("-")[0] ref["journal"] = citEl.attrib.get("name", "") if ref["journal"]=="": ref["journal"] = citEl.attrib.get("db", "") # for submissions ref["vol"] = citEl.attrib.get("volume", "") ref["page"] = citEl.attrib.get("first", "") for titleEl in citEl.findall("title"): ref["title"] = titleEl.text authorList = [] for personEl in citEl.findall("authorList/person"): if "name" in personEl.attrib: name = personEl.attrib["name"] name = name.replace(" ", ",", 1) authorList.append(name) ref["authors"]=";".join(authorList) for dbRefEl in citEl.findall("dbReference"): if "type" in dbRefEl.attrib: if dbRefEl.attrib["type"]=="DOI": ref["doi"] = dbRefEl.attrib["id"] if 
def openFaFiles(taxonIds, outDir, outPrefix):
    """ open one gzipped fasta output file per taxon ID.

    taxonIds  - list of taxon IDs (ints or numeric strings), ["all"] or None
                (None is treated like ["all"])
    outDir    - output directory (must exist)
    outPrefix - files are named <outPrefix>.<taxonId>.fa.gz

    Returns dict taxonId -> open gzip file handle. The key is the int taxon
    ID, or the string "all" - writeFaSeqs() checks for the "all" key.
    """
    faFiles = {}
    if taxonIds == None:
        taxonIds = ["all"]
    for taxonId in taxonIds:
        taxonId = str(taxonId)
        faFname = join(outDir, outPrefix+"."+taxonId+".fa.gz")
        # bugfix: int("all") used to raise ValueError for the "all" case;
        # keep "all" as a string key, as expected by writeFaSeqs()
        key = "all" if taxonId == "all" else int(taxonId)
        faFiles[key] = gzip.open(faFname, "w")
        logging.debug("Writing fasta seqs for taxon %s to %s" % (taxonId, faFname))
    return faFiles
lines.append(line) if taxonOk: yield lines else: yield None lines = None else: if taxonOk and lines is not None: lines.append(line) def parseUniprot(db, inDir, outDir, taxonIds): " parse uniprot, write records and refs to outdir " if options.parse: fname = options.parse logging.info("Debug parse of %s" % fname) xmlFile = open(fname) isoSeqs, recCount = {}, 1 outDir = "." outPrefix = "temp" disToName = {} else: isoSeqs = readIsoforms(inDir, db) if db=="swissprot": xmlBase = "uniprot_sprot.xml.gz" outPrefix = "swissprot" recCount = 600000 elif db=="trembl": xmlBase = "uniprot_trembl.xml.gz" outPrefix = "trembl" recCount = 600000*100 else: raise Exception("unknown db") xmlBase2 = xmlBase.replace(".gz", "") if isfile(xmlBase2): logging.debug("Using non-gzipped file %s" % xmlBase2) xmlFile = open(join(inDir, xmlBase2)) else: xmlFile = gzip.open(join(inDir, xmlBase)) logging.info("Parsing main XML file %s" % xmlFile.name) disToName = parseDiseases(join(inDir, "docs", "humdisease.txt")) faFiles = openFaFiles(taxonIds, outDir, outPrefix) logging.debug("Only extracting taxon IDs %s" % str(taxonIds)) # create a dict taxonId -> output file handles for record info, pmid reference info and annotation info outFhs = {} for taxId in taxonIds: entryOf = openOutTabFile(outDir, "%s.%s.tab" % (outPrefix, taxId), entryHeaders) refOf = openOutTabFile(outDir, "%s.%s.refs.tab" % (outPrefix, taxId), refHeaders) annotOf = openOutTabFile(outDir, "%s.%s.annots.tab" % (outPrefix, taxId), annotHeaders) outFhs[taxId] = (entryOf, refOf, annotOf) emptyEntry = dict(zip(entryHeaders, len(entryHeaders)*[""])) pm = ProgressMeter(recCount) for recLines in stupidXmlFilter(xmlFile, taxonIds): # the original solution below did a partial parse, but required 300GB of # RAM #for _, entryEl in etree.iterparse(xmlFile): #if entryEl.tag!="{http://uniprot.org/uniprot}entry": #continue #strip_namespace_inplace(entryEl) # die, die stupid namespaces!! 
def main(args, options):
    """ command line driver: validate arguments, set up logging and run the
    converter. args = [uniprotFtpDir, taxonIds, outDir] """
    if options.test:
        import doctest
        doctest.testmod()
        sys.exit(0)

    # bugfix: check the argument count BEFORE indexing into args, so a wrong
    # command line shows the help hint instead of dying with an IndexError
    # (and before any directory gets created)
    if len(args)!=3:
        raise Exception("Invalid command line. Show help with -h")

    setupLogging("pubParseDb", options)

    # the database is selected by the --trembl switch, not by an argument
    # (removed a dead "db = args[0]" that was immediately overwritten)
    db = "swissprot"
    if options.trembl:
        db = "trembl"

    dbDir = args[0]

    taxonIds = args[1]
    if taxonIds=="all":
        taxonIds = ['all']
    else:
        taxonIds=[int(x) for x in taxonIds.split(",")]

    refDir = args[2]
    if not isdir(refDir):
        logging.info("Making directory %s" % refDir)
        os.makedirs(refDir)

    parseUniprot(db, dbDir, refDir, taxonIds)