src/utils/uniprotToTab 9461ce3c12c7faa7da2480f9629a370284c54605

9461ce3c12c7faa7da2480f9629a370284c54605
max
  Tue Nov 29 15:52:54 2016 -0800
fixing uniprot on hg38, refs #18451

diff --git src/utils/uniprotToTab src/utils/uniprotToTab
index 60aee05..4146d81 100755
--- src/utils/uniprotToTab
+++ src/utils/uniprotToTab
@@ -253,37 +253,39 @@
                 to_delete=[]
                 to_set={}
                 for attr_name in elem.attrib:
                     if attr_name.startswith(ns):
                         old_val = elem.attrib[attr_name]
                         to_delete.append(attr_name)
                         attr_name = attr_name[nsl:]
                         to_set[attr_name] = old_val
                 for key in to_delete:
                     elem.attrib.pop(key)
                 elem.attrib.update(to_set)
 
 
 def parseDiseases(fname):
     " parse the uniprot disease file fname (humdisease.txt) and return a dict: disease code (text of an AR line) -> disease name (text of the most recent ID line) "
+    logging.info("Parsing %s" % fname)
     dis = {}  # AR-line text -> ID-line text
     for line in open(fname).read().splitlines():  # whole file is read into memory; NOTE(review): the handle is never explicitly closed
         if line.startswith("ID"):
             name = line[5:].strip(".")  # text after the 5-character line prefix, leading/trailing dots removed
         if line.startswith("AR"):
             code = line[5:].strip(".")
             dis[code]=name  # NOTE(review): relies on an ID line having been seen before the first AR line, otherwise name is unbound
+    logging.info("read %d disease code -> disease name mappings" % len(dis))
     return dis
 
 def findSaveList(el, path, dataDict, key, attribKey=None, attribVal=None, useAttrib=None, subSubEl=None):
     """ find all text of subelemets matching path with given optionally attrib and save into dataDict with key
     You can specify a subSubEl of the element to get the text from.
     """
     l = []
     for se in el.findall(path):
         if attribKey!=None and se.attrib.get(attribKey, None)!=attribVal:
             continue
         if useAttrib:
             val = se.attrib[useAttrib]
         else:
             if subSubEl:
                 val = se.find(subSubEl).text
@@ -943,31 +945,31 @@
             outPrefix = "uniprotTrembl"
             recCount = 600000*37
         else:
             raise Exception("unknown db")
         xmlFile = gzip.open(join(inDir, xmlBase))
         logging.info("Parsing main XML file %s" % xmlFile.name)
 
     # create a dict taxonId -> output file handles for record info, pmid reference info and mutation info
     outFhs = {}
     for taxId in taxonIds:
         entryOf = openOutTabFile(outDir, "%s.%s.tab" % (outPrefix, taxId), entryHeaders)
         refOf = openOutTabFile(outDir, "%s.%s.refs.tab" % (outPrefix, taxId), refHeaders)
         mutOf = openOutTabFile(outDir, "%s.%s.annots.tab" % (outPrefix, taxId), mutHeaders)
         outFhs[taxId] = (entryOf, refOf, mutOf)
 
-    disToName = parseDiseases(join(inDir, "humdisease.txt"))
+    disToName = parseDiseases(join(inDir, "docs", "humdisease.txt"))
     # base and variant sequence filehandles
     faFiles = openFaFiles(taxonIds, outDir, outPrefix)
     varFaFiles = openFaFiles(taxonIds, outDir, outPrefix, "var")
 
     emptyEntry = dict(zip(entryHeaders, len(entryHeaders)*[""]))
 
     pm = ProgressMeter(recCount)
     #for _, entryEl in etree.iterparse(xmlFile.name, tag='{http://uniprot.org/uniprot}entry'):
     for _, entryEl in etree.iterparse(xmlFile):
         if entryEl.tag!="{http://uniprot.org/uniprot}entry":
             continue
         strip_namespace_inplace(entryEl) # die, die stupid namespaces!!
         entry = copy.copy(emptyEntry)
 
         pm.taskCompleted()