src/utils/uniprotToTab 745524a7cc8f2fb4bceea120fd25201ddeff6acc

745524a7cc8f2fb4bceea120fd25201ddeff6acc
max
  Tue Jul 19 13:39:08 2016 -0700
fixing an issue where uniprotToTab would not work with trembl. Email
from Eric Ho, refs #17717

diff --git src/utils/uniprotToTab src/utils/uniprotToTab
index b438132..60aee05 100755
--- src/utils/uniprotToTab
+++ src/utils/uniprotToTab
@@ -847,33 +847,37 @@
                 name = personEl.attrib["name"]
                 name = name.replace(" ", ",", 1)
                 authorList.append(name)
         ref["authors"]=";".join(authorList)
         for dbRefEl in citEl.findall("dbReference"):
             if "type" in dbRefEl.attrib:
                 if dbRefEl.attrib["type"]=="DOI":
                     ref["doi"] = dbRefEl.attrib["id"]
                 if dbRefEl.attrib["type"]=="PubMed":
                     ref["pmid"] = dbRefEl.attrib["id"]
 
         findSaveList(refEl, "scope", ref, "scopeList")
         refRow = RefRec(**ref)
         yield refRow
 
-def readIsoforms(inDir):
+def readIsoforms(inDir, db):
     " return all isoform sequences as dict isoName (eg. P48347-2) -> sequence "
+    if db=="uniprot":
         isoFname = join(inDir, "uniprot_sprot_varsplic.fasta.gz")
+    else:
+        isoFname = join(inDir, "uniprot_trembl.fasta.gz")
+
     isoFile = gzip.open(isoFname)
     logging.info("reading isoform sequences from %s" % isoFname)
     isoSeqs = parseFastaAsDict(isoFile)
     result = {}
     seqNames = []
     for id, seq in isoSeqs.iteritems():
         idParts = id.split("|")
         isoName = idParts[1]
         result[isoName] = seq
         seqNames.append(idParts[2].split()[0])
     logging.info("Found %d isoform sequences" % len(result))
     return result, len(set(seqNames))
 
 def writeFaSeqs(entry, faFiles, allVariants=False):
     """ write main sequence to faFile with the right taxonId 
@@ -917,39 +921,39 @@
         faFiles[taxonId] = gzip.open(faFname, "w")
         logging.info("Writing fasta seqs for taxon %s to %s (seqType: %s)" % (taxonId, faFname, seqType))
     return faFiles
 
 def parseUniprot(db, inDir, outDir, taxonIds):
     " parse uniprot, write records and refs to outdir "
 
     if options.parse:
         fname = options.parse
         logging.info("Debug parse of %s" % fname)
         xmlFile = open(fname)
         isoSeqs, recCount = {}, 1
         outDir = "."
         outPrefix = "temp"
     else:
-        isoSeqs, recCount = readIsoforms(inDir)
+        isoSeqs, recCount = readIsoforms(inDir, db)
         if db=="uniprot":
             xmlBase = "uniprot_sprot.xml.gz"
             outPrefix = "uniprot"
-            recCount = 500000
+            recCount = 600000
         elif db=="trembl":
             xmlBase = "uniprot_trembl.xml.gz"
             outPrefix = "uniprotTrembl"
-            recCount = 500000*37
+            recCount = 600000*37
         else:
             raise Exception("unknown db")
         xmlFile = gzip.open(join(inDir, xmlBase))
         logging.info("Parsing main XML file %s" % xmlFile.name)
 
     # create a dict taxonId -> output file handles for record info, pmid reference info and mutation info
     outFhs = {}
     for taxId in taxonIds:
         entryOf = openOutTabFile(outDir, "%s.%s.tab" % (outPrefix, taxId), entryHeaders)
         refOf = openOutTabFile(outDir, "%s.%s.refs.tab" % (outPrefix, taxId), refHeaders)
         mutOf = openOutTabFile(outDir, "%s.%s.annots.tab" % (outPrefix, taxId), mutHeaders)
         outFhs[taxId] = (entryOf, refOf, mutOf)
 
     disToName = parseDiseases(join(inDir, "humdisease.txt"))
     # base and variant sequence filehandles
@@ -1027,30 +1031,32 @@
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = optparse.OptionParser("""usage: %prog [options] uniprotFtpDir taxonIds outDir - Convert UniProt to tab-sep files
 
 taxonIds can be "all"
 
 To download uniProt, this command is a good idea:
 lftp ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/ -e "mirror --use-pget-n=10 --exclude-glob *.dat.gz -P 5"
 
 This parser:
 - goes over the disease comment evidences and tries to
   classify annotations as disease-related or not.  
 - resolves disease codes to full disease names
 - blacklists some PMIDs, annotations from these are skipped
 
 Example:
-
 %prog /hive/data/outside/uniProtCurrent/ 9606 uniprotTab/
+
+If you get no results from this script, your species may be only in Trembl.
+Use the '-t trembl' option to parse UniProt/Trembl instead of UniProt/SwissProt.
 """)
 
 parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
 parser.add_option("", "--test", dest="test", action="store_true", help="run tests")
 parser.add_option("-p", "--parse", dest="parse", action="store", help="parse a single uniprot xml file (debugging)")
 parser.add_option("-t", "--dbType", dest="dbType", action="store", help="one of 'uniprot' or 'trembl', default %default", default="uniprot")
 (options, args) = parser.parse_args()
 
 if args==[] and not options.test:
     parser.print_help()
     exit(1)
 
 main(args, options)