def parseDiseases(fname):
    """Parse UniProt's humdisease.txt and return a dict: disease code -> disease name.

    Only two line types are used, everything else is ignored:
      - lines starting with "ID": the text from column 5 on, with surrounding
        "." characters removed, is remembered as the current disease name
      - lines starting with "AR": the text from column 5 on, with surrounding
        "." characters removed, is the code that is mapped to the current name
        (presumably the disease acronym -- confirm against the UniProt docs)

    An "AR" line that appears before any "ID" line has no name to attach to
    and is skipped instead of raising an UnboundLocalError.
    """
    logging.info("Parsing %s", fname)
    dis = {}
    name = None  # most recently seen disease name, None until the first ID line
    # the context manager makes sure the file handle is closed, even on errors
    with open(fname) as diseaseFh:
        for line in diseaseFh:
            line = line.rstrip("\r\n")
            if line.startswith("ID"):
                name = line[5:].strip(".")
            elif line.startswith("AR"):
                if name is None:
                    # code without a preceding name: nothing sensible to store
                    continue
                code = line[5:].strip(".")
                dis[code] = name
    logging.info("read %d disease code -> disease name mappings", len(dis))
    return dis
""" l = [] for se in el.findall(path): if attribKey!=None and se.attrib.get(attribKey, None)!=attribVal: continue if useAttrib: val = se.attrib[useAttrib] else: if subSubEl: val = se.find(subSubEl).text @@ -943,31 +945,31 @@ outPrefix = "uniprotTrembl" recCount = 600000*37 else: raise Exception("unknown db") xmlFile = gzip.open(join(inDir, xmlBase)) logging.info("Parsing main XML file %s" % xmlFile.name) # create a dict taxonId -> output file handles for record info, pmid reference info and mutation info outFhs = {} for taxId in taxonIds: entryOf = openOutTabFile(outDir, "%s.%s.tab" % (outPrefix, taxId), entryHeaders) refOf = openOutTabFile(outDir, "%s.%s.refs.tab" % (outPrefix, taxId), refHeaders) mutOf = openOutTabFile(outDir, "%s.%s.annots.tab" % (outPrefix, taxId), mutHeaders) outFhs[taxId] = (entryOf, refOf, mutOf) - disToName = parseDiseases(join(inDir, "humdisease.txt")) + disToName = parseDiseases(join(inDir, "docs", "humdisease.txt")) # base and variant sequence filehandles faFiles = openFaFiles(taxonIds, outDir, outPrefix) varFaFiles = openFaFiles(taxonIds, outDir, outPrefix, "var") emptyEntry = dict(zip(entryHeaders, len(entryHeaders)*[""])) pm = ProgressMeter(recCount) #for _, entryEl in etree.iterparse(xmlFile.name, tag='{http://uniprot.org/uniprot}entry'): for _, entryEl in etree.iterparse(xmlFile): if entryEl.tag!="{http://uniprot.org/uniprot}entry": continue strip_namespace_inplace(entryEl) # die, die stupid namespaces!! entry = copy.copy(emptyEntry) pm.taskCompleted()