3887c3b11338a8ed7de6f5805d09019c25979023
max
  Fri Jul 15 15:14:53 2016 -0700
removing dependencies for the uniprot pipeline, moving it into the kent repo,
for #17716. Also preparing for the hg38 uniprot track, #17717

diff --git src/utils/uniprotLift src/utils/uniprotLift
index 61b7a80..fe49425 100755
--- src/utils/uniprotLift
+++ src/utils/uniprotLift
@@ -1,27 +1,25 @@
 #!/usr/bin/env python2.7
 # a script to lift uniprot mutations/annotations created by the publications uniprot parser
 # to a DB and create two bigbed files for them
 
 from os.path import join, isfile
 from os import system
 from collections import defaultdict, namedtuple
 
 # import the publications libraries
 import sys, gzip, optparse, logging,re
-sys.path.append("/cluster/home/max/projects/pubs/tools/lib")
-import maxCommon
 
 # Globals  ---
 DEBUG = False
 
 pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"])
 
 # if --filter option provided, remove these altogether
 removeTypes = set(["helix", "strand", "beta", "turn", "signal"])
 
 # by default, only output identifiers, not html links to them
 # html links are good for custom tracks
 # identifers are good for local spMut tracks
 createLinks = False
 
 urls = {"var": "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?",
@@ -215,87 +213,91 @@
     if len(diseases)!=0:
         commentField = ",".join(diseases) + "; " + commentField
     bed.append(commentField)
     if isMut:
         bed.append(htmlLink('var', varIds))
         dbSnpIds = [id for id in dbSnpIds if id.strip()!=""]
         bed.append(",".join(dbSnpIds))
     bed.append(htmlLink('uniProt', [acc]))
     bed.append(htmlLink('pubmed', pmids))
     bed[5] = "."
     bedLine = "\t".join(bed)+"\n"
     return bedLine
 
 # ------ MAIN ------
 if __name__ == '__main__':
-    parser = optparse.OptionParser("""usage: %prog [options] taxonId dbName liftPslFile - lift uniprot annotations to genome. 
+    parser = optparse.OptionParser("""usage: %prog [options] uniprotTabDir taxonId chromSizesFile liftPslFile - lift uniprot annotations to genome. 
     
-    Needs a mapping PSL from uniprot nucleotide coordinates to genome positions.
+    uniprotTabDir is the directory with the output from uniprotToTab
+    
+    Needs a mapping PSL from uniprot (in faked nucleotide coordinates) to genome positions.
     - for genomes with introns, makeLiftOver.sh from
-      src/hg/utils/uniprotMutations/makeUniProtToHg.sh creates one for hg19
+      kent/src/hg/utils/uniprotMutations/makeUniProtToHg.sh creates one for hg19
     - for a quicker rough version, create one with BLAT and -q=dnax folowed by
       pslProtCnv 
 
-    Input uniprot files have to be created with the pubParseDb uniprot parser
-
     """)
     parser.add_option("", "--annot", dest="annot", action="store_true", help="lift other, non-variant annotations")
     parser.add_option("-d", "", dest="debug", action="store_true", help="debug output")
     parser.add_option("", "--filter", dest="filterAnnots", action="store_true", help="remove certain annotations that are not very useful, like helix,chain,beta, etc")
-    parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store", help="uniprot parser directory, default %default", default="/hive/data/inside/pubs/parsedDbs")
+    #parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store", help="uniprot parser directory, default %default", default="/hive/data/inside/pubs/parsedDbs")
     (options, args) = parser.parse_args()
 
     if options.debug:
         DEBUG=True
 
-    taxonId, db, liftFname = args
+    if len(args)==0:
+        parser.print_help()
+        sys.exit(1)
+
+    uniprotDir, taxonId, chromSizesPath, liftFname = args
 
     mutData = defaultdict(list)
 
     cmd = "rm -rf work; mkdir work"
     run(cmd)
 
     # create seq sizes for all uniprot sequences of this species
-    uniprotFa = join(options.uniprotDir, "uniprot.%s.fa" % taxonId)
+    uniprotFa = join(uniprotDir, "uniprot.%s.fa" % taxonId)
     uniprotFaGz = uniprotFa+".gz"
     if not isfile(uniprotFa):
         print "uncompressing uniprot gz fa"
         open(uniprotFa, "w").write(gzip.open(uniprotFaGz).read())
 
     if not isfile(uniprotFa):
         raise Exception("Not found: %s" % uniprotFa)
 
     print "writing nucleotide lengths of %s to %s" % (uniprotFa, "work/chromSizes")
     cmd = "faSize %s -detailed | gawk '{$2=$2*3; print}'> work/chromSizes" % uniprotFa
     run(cmd)
 
     # keep protein lengths for filtering later, we'll remove everything that 
     # covers the whole protein
     seqSizes = {}
     for line in open("work/chromSizes"):
         seq, size = line.rstrip("\n").split()
         seqSizes[seq] = int(size)/3
         
     # get uniprot IDs 
     speciesAccs = set([line.split()[0] for line in open("work/chromSizes")])
     print "got %d accessions" % len(speciesAccs)
 
     # read data, write bed to file
     # this annotates in protein coordinates
     print "converting tab to bed"
     ofh = open("work/temp.bed", "w")
-    uniProtMutFname = join(options.uniprotDir, "uniprot.%s.mut.tab" % taxonId)
+    uniProtMutFname = join(uniprotDir, "uniprot.%s.annots.tab" % taxonId)
     print "reading %s" % uniProtMutFname
     for mut in iterTsvRows(open(uniProtMutFname)):
         if mut.acc not in speciesAccs:
             continue
         if options.filterAnnots and mut.featType in removeTypes:
             continue
         mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa
 
         # remove annotations that cover the whole protein, probably not very useful
         #print mut.begin, mut.end, mut.acc, seqSizes[mut.acc]
         if int(mut.begin)==1 and int(mut.end)==int(seqSizes[mut.acc])+1:
             continue
 
         mutPos = 3*(int(mut.begin)-1)
         mutPosEnd = 3*(int(mut.end)-1)
@@ -348,34 +350,34 @@
             keyFields = tuple(bedLine.split("\t")[:4])
             if keyFields in doneAnnots:
                 continue
             doneAnnots.add(keyFields)
         ofh.write(bedLine)
         count += 1
 
     print "%d features written to %s" % (count, ofh.name)
     mutBed.close()
     annotBed.close()
     structBed.close()
 
     #print "%d variants not mapped to genome" % len(notMapped)
 
     asFname = "bed12UniProtMut.as"
-    cmd = "bedToBigBed -as=%s spMut.bed /hive/data/genomes/%s/chrom.sizes spMut.bb -type=bed12+ -tab" % \
-        (asFname, db)
+    cmd = "bedToBigBed -as=%s spMut.bed %s spMut.bb -type=bed12+ -tab" % \
+        (asFname, chromSizesPath)
     run(cmd)
 
     asFname = "bed12UniProtAnnot.as"
-    cmd = "bedToBigBed -as=%s spAnnot.bed /hive/data/genomes/%s/chrom.sizes spAnnot.bb -type=bed12+ -tab" % \
-        (asFname, db)
+    cmd = "bedToBigBed -as=%s spAnnot.bed %s spAnnot.bb -type=bed12+ -tab" % \
+        (asFname, chromSizesPath)
     run(cmd)
-    cmd = "bedToBigBed -as=%s spStruct.bed /hive/data/genomes/%s/chrom.sizes spStruct.bb -type=bed12+ -tab" % \
-        (asFname, db)
+    cmd = "bedToBigBed -as=%s spStruct.bed %s spStruct.bb -type=bed12+ -tab" % \
+        (asFname, chromSizesPath)
     run(cmd)
 
     cmd = "rm -rf work"
     run(cmd)
     cmd = "wc -l spMut.bed spAnnot.bed spStruct.bed"
     run(cmd)
     #print "%d features written to spMut.bed and %s.bed" % (count, outFname, outFname)
     print "%d features skipped because of blacklisted PMIDS" % (blackCount)
     print "%d features skipped because no info (no disease evidence, no comment)" % (noInfoCount)