src/utils/uniprotLift 8c2dc989cad61bb0116cd8257cd9b424988eb64a

8c2dc989cad61bb0116cd8257cd9b424988eb64a
max
  Mon Nov 10 13:09:45 2014 -0800
making uniprot annotations track ready for hg19, refs #13688
diff --git src/utils/uniprotLift src/utils/uniprotLift
index bf1cfb0..baddfe7 100755
--- src/utils/uniprotLift
+++ src/utils/uniprotLift
@@ -1,19 +1,18 @@
 #!/usr/bin/env python2.7
 # a script to lift uniprot mutations/annotations created by the publications uniprot parser
-# to a DB and create three bigbed files for them:
-# one for structures, one for mutations, one for other annotations
+# to a DB and create two bigbed files for them
 
 from os.path import join, isfile
 from os import system
 from collections import defaultdict, namedtuple
 
 # import the publications libraries
 import sys, gzip, optparse, logging,re
 sys.path.append("/cluster/home/max/projects/pubs/tools/lib")
 import maxCommon
 
 # Globals  ---
 DEBUG = False
 
 pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"])
 
@@ -133,43 +132,62 @@
 
     disName = ",".join(disCodes)
     if len(disName)>30:
         disName = disName[:30]+"..."
     if len(disCodes)>0 and not "strain" in disName:
         bed[3] += " in %s" % disName
 
     if mut.featType=="sequence variant":
         varType = "Naturally occuring sequence variant"
         bed[8] = "100,0,0"
     elif mut.featType=="mutagenesis site":
         varType = "Experimental mutation of amino acids"
         bed[8] = "0,0,100"
     else:
         bed[3] = mut.shortFeatType
+        if mut.featType in useComment:
+            bed[3] = mut.comment
+
         if mut.featType=="signal peptide":
             bed[3] = "Signal peptide"
         if mut.featType=="lipid moiety-binding region":
             bed[3] = "Lipidation"
         if mut.featType=="transmembrane region":
             bed[3] = "Transmembrane"
-        if mut.featType in useComment:
-            bed[3] = mut.comment
+            
+        if mut.comment=="Nuclear localization signal":
+            bed[3] = "Nuclear loc"
         if mut.comment=="Involved in receptor recognition and/or post-binding events":
             bed[3] = "Recept recog"
+        if mut.comment=="Fibronectin type-III":
+            bed[3] = "FibronectIII"
+
+        # more general rules
+        if mut.comment.startswith("Zinc finger protein "):
+            bed[3] = mut.comment.replace("Zinc finger protein ", "ZF-")
+        if mut.comment.startswith("Necessary for the"):
+            bed[3] = mut.comment.replace("Necessary for the ", "")
+        if mut.comment.startswith("Interaction with "):
+            bed[3] = mut.comment.replace("Interaction with ", "Int:")
+        if mut.comment.startswith("Involved in the"):
+            bed[3] = mut.comment.replace("Involved in the ", "")
+        if mut.comment.startswith("Required for "):
+            bed[3] = mut.comment.replace("Required for ", "")
         if mut.comment.startswith("Cleavage; by host "):
             bed[3] = "Cleave:"+mut.comment.split()[-1]
 
+
         if bed[3] == "Intrinsically disordered":
             bed[3] = "Disordered"
         
         if len(bed[3])>17:
             bed[3] = bed[3][:14]+"..."
 
         varType = mut.featType
         bed[8] = featTypeColors.get(mut.featType, "0,0,0")
 
     bed.append(varType)
     if isMut:
         bed.append(",".join(diseases))
 
     if isMut:
         # create description of mutation
@@ -237,46 +255,60 @@
     run(cmd)
 
     # create seq sizes for all uniprot sequences of this species
     uniprotFa = join(options.uniprotDir, "uniprot.%s.fa" % taxonId)
     uniprotFaGz = uniprotFa+".gz"
     if not isfile(uniprotFa):
         print "uncompressing uniprot gz fa"
         open(uniprotFa, "w").write(gzip.open(uniprotFaGz).read())
 
     if not isfile(uniprotFa):
         raise Exception("Not found: %s" % uniprotFa)
 
     print "writing nucleotide lengths of %s to %s" % (uniprotFa, "work/chromSizes")
     cmd = "faSize %s -detailed | gawk '{$2=$2*3; print}'> work/chromSizes" % uniprotFa
     run(cmd)
+
+    # keep protein lengths for filtering later, we'll remove everything that 
+    # covers the whole protein
+    seqSizes = {}
+    for line in open("work/chromSizes"):
+        seq, size = line.rstrip("\n").split()
+        seqSizes[seq] = int(size)/3
+        
     # get uniprot IDs 
     speciesAccs = set([line.split()[0] for line in open("work/chromSizes")])
     print "got %d accessions" % len(speciesAccs)
 
     # read data, write bed to file
-    # this annotates mutations on the protein sequence
+    # this annotates in protein coordinates
     print "converting tab to bed"
     ofh = open("work/temp.bed", "w")
     uniProtMutFname = join(options.uniprotDir, "uniprot.%s.mut.tab" % taxonId)
     print "reading %s" % uniProtMutFname
     for mut in iterTsvRows(open(uniProtMutFname)):
         if mut.acc not in speciesAccs:
             continue
         if options.filterAnnots and mut.featType in removeTypes:
             continue
         mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa
+
+        # remove annotations that cover the whole protein, probably not very useful
+        #print mut.begin, mut.end, mut.acc, seqSizes[mut.acc]
+        if int(mut.begin)==1 and int(mut.end)==int(seqSizes[mut.acc])+1:
+            continue
+
         mutPos = 3*(int(mut.begin)-1)
         mutPosEnd = 3*(int(mut.end)-1)
         if mutName not in mutData:
             ofh.write("\t".join([mut.acc, str(mutPos), str(mutPosEnd), mutName])+"\n")
         mutData[mutName].append(mut)
     ofh.close()
     
     # lift from protein sequence bed to target bed
     print "lifting"
     cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname
     run(cmd)
 
     print "adding extra fields"
     # read lifted bed and add uniprot annotations to it as additional fields
     mutBed = open("spMut.bed", "w")