745524a7cc8f2fb4bceea120fd25201ddeff6acc max Tue Jul 19 13:39:08 2016 -0700 fixing an issue where uniprotToTab would not work with trembl. Email from Eric Ho, refs #17717 diff --git src/utils/uniprotToTab src/utils/uniprotToTab index b438132..60aee05 100755 --- src/utils/uniprotToTab +++ src/utils/uniprotToTab @@ -847,33 +847,37 @@ name = personEl.attrib["name"] name = name.replace(" ", ",", 1) authorList.append(name) ref["authors"]=";".join(authorList) for dbRefEl in citEl.findall("dbReference"): if "type" in dbRefEl.attrib: if dbRefEl.attrib["type"]=="DOI": ref["doi"] = dbRefEl.attrib["id"] if dbRefEl.attrib["type"]=="PubMed": ref["pmid"] = dbRefEl.attrib["id"] findSaveList(refEl, "scope", ref, "scopeList") refRow = RefRec(**ref) yield refRow -def readIsoforms(inDir): +def readIsoforms(inDir, db): " return all isoform sequences as dict isoName (eg. P48347-2) -> sequence " + if db=="uniprot": isoFname = join(inDir, "uniprot_sprot_varsplic.fasta.gz") + else: + isoFname = join(inDir, "uniprot_trembl.fasta.gz") + isoFile = gzip.open(isoFname) logging.info("reading isoform sequences from %s" % isoFname) isoSeqs = parseFastaAsDict(isoFile) result = {} seqNames = [] for id, seq in isoSeqs.iteritems(): idParts = id.split("|") isoName = idParts[1] result[isoName] = seq seqNames.append(idParts[2].split()[0]) logging.info("Found %d isoform sequences" % len(result)) return result, len(set(seqNames)) def writeFaSeqs(entry, faFiles, allVariants=False): """ write main sequence to faFile with the right taxonId @@ -917,39 +921,39 @@ faFiles[taxonId] = gzip.open(faFname, "w") logging.info("Writing fasta seqs for taxon %s to %s (seqType: %s)" % (taxonId, faFname, seqType)) return faFiles def parseUniprot(db, inDir, outDir, taxonIds): " parse uniprot, write records and refs to outdir " if options.parse: fname = options.parse logging.info("Debug parse of %s" % fname) xmlFile = open(fname) isoSeqs, recCount = {}, 1 outDir = "." outPrefix = "temp" else: - isoSeqs, recCount = readIsoforms(inDir) + isoSeqs, recCount = readIsoforms(inDir, db) if db=="uniprot": xmlBase = "uniprot_sprot.xml.gz" outPrefix = "uniprot" - recCount = 500000 + recCount = 600000 elif db=="trembl": xmlBase = "uniprot_trembl.xml.gz" outPrefix = "uniprotTrembl" - recCount = 500000*37 + recCount = 600000*37 else: raise Exception("unknown db") xmlFile = gzip.open(join(inDir, xmlBase)) logging.info("Parsing main XML file %s" % xmlFile.name) # create a dict taxonId -> output file handles for record info, pmid reference info and mutation info outFhs = {} for taxId in taxonIds: entryOf = openOutTabFile(outDir, "%s.%s.tab" % (outPrefix, taxId), entryHeaders) refOf = openOutTabFile(outDir, "%s.%s.refs.tab" % (outPrefix, taxId), refHeaders) mutOf = openOutTabFile(outDir, "%s.%s.annots.tab" % (outPrefix, taxId), mutHeaders) outFhs[taxId] = (entryOf, refOf, mutOf) disToName = parseDiseases(join(inDir, "humdisease.txt")) # base and variant sequence filehandles @@ -1027,30 +1031,32 @@ # === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = optparse.OptionParser("""usage: %prog [options] uniprotFtpDir taxonIds outDir - Convert UniProt to tab-sep files taxonIds can be "all" To download uniProt, this command is a good idea: lftp ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/ -e "mirror --use-pget-n=10 --exclude-glob *.dat.gz -P 5" This parser: - goes over the disease comment evidences and tries to classify annotations as disease-related or not. - resolves disease codes to full disease names - blacklists some PMIDs, annotations from these are skipped Example: - %prog /hive/data/outside/uniProtCurrent/ 9606 uniprotTab/ + +If you get no results from this script, your species may be only in Trembl. +Use the '-t trembl' option to parse UniProt/Trembl instead of UniProt/SwissProt. """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("", "--test", dest="test", action="store_true", help="run tests") parser.add_option("-p", "--parse", dest="parse", action="store", help="parse a single uniprot xml file (debugging)") parser.add_option("-t", "--dbType", dest="dbType", action="store", help="one of 'uniprot' or 'trembl', default %default", default="uniprot") (options, args) = parser.parse_args() if args==[] and not options.test: parser.print_help() exit(1) main(args, options)