8c2dc989cad61bb0116cd8257cd9b424988eb64a
max
  Mon Nov 10 13:09:45 2014 -0800
making uniprot annotations track ready for hg19, refs #13688
diff --git src/utils/uniprotLift src/utils/uniprotLift
index bf1cfb0..baddfe7 100755
--- src/utils/uniprotLift
+++ src/utils/uniprotLift
@@ -1,349 +1,381 @@
 #!/usr/bin/env python2.7
 # a script to lift uniprot mutations/annotations created by the publications uniprot parser
-# to a DB and create three bigbed files for them:
-# one for structures, one for mutations, one for other annotations
+# to a DB and create two bigbed files for them
 
 from os.path import join, isfile
 from os import system
 from collections import defaultdict, namedtuple
 
 # import the publications libraries
 import sys, gzip, optparse, logging,re
 sys.path.append("/cluster/home/max/projects/pubs/tools/lib")
 import maxCommon
 
 # Globals  ---
 DEBUG = False
 
 pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"])
 
 # if --filter option provided, remove these altogether
 removeTypes = set(["helix", "strand", "beta", "turn", "signal"])
 
 # by default, only output identifiers, not html links to them
 # html links are good for custom tracks
 # identifers are good for local spMut tracks
 createLinks = False
 
 urls = {"var": "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?",
         "uniProt" : "http://www.uniprot.org/uniprot/",
         "pubmed" : "http://www.ncbi.nlm.nih.gov/pubmed/"
         }
 
 # some feature types should not go into the bed file name field, but rather be replaced 
 # with the curator's comments
 useComment = set(["domain", "chain","region of interest","topological domain","short sequence motif"])
 
 # -------------
 
 def iterTsvRows(inFile,encoding="utf8", fieldSep="\t", isGzip=False):
     """ 
         parses tab-sep file with headers as field names 
         yields collection.namedtuples
         strips "#"-prefix from header line
     """
 
     fh = inFile
     line1 = fh.readline()
     line1 = line1.strip("\n").strip("#")
     headers = line1.split(fieldSep)
     headers = [re.sub("[^a-zA-Z0-9_]","_", h) for h in headers]
 
     Record = namedtuple('tsvRec', headers)
     for line in fh:
         line = line.rstrip("\n")
         fields = line.split(fieldSep)
         if encoding!=None:
             fields = [f.decode(encoding) for f in fields]
         try:
             rec = Record(*fields)
         except Exception, msg:
             logging.error("Exception occured while parsing line, %s" % msg)
             logging.error("Filename %s" % fh.name)
             logging.error("Line was: %s" % line)
             logging.error("Does number of fields match headers?")
             logging.error("Headers are: %s" % headers)
             raise Exception("wrong field count in line %s" % line)
         # convert fields to correct data type
         yield rec
 
 def htmlLink(urlType, accs):
     if createLinks:
         strList = []
         for acc in accs:
             strList.append('<a href="%s%s">%s</a>' % (urls[urlType], acc, acc))
         return ", ".join(strList)
     else:
         return ",".join(accs)
 
 def run(cmd):
     if DEBUG:
         print "running", cmd
     ret = system(cmd)
     if ret!=0:
         print "Could not run %s" % cmd
         sys.exit(1)
 
 threeToOne = \
     {'Cys': 'C', 'Asp': 'D', 'Ser': 'S', 'Gln': 'Q', 'Lys': 'K',
      'Ile': 'I', 'Pro': 'P', 'Thr': 'T', 'Phe': 'F', 'Asn': 'N',
      'Gly': 'G', 'His': 'H', 'Leu': 'L', 'Arg': 'R', 'Trp': 'W',
      'Ala': 'A', 'Val':'V',  'Glu': 'E', 'Tyr': 'Y', 'Met': 'M',
      'Sec': 'U' # very very rare amino acid (=stop codon)
      }
 
 oneToThree = dict([[v,k] for k,v in threeToOne.items()])
 
 def aaToLong(seq):
     " convert amino acid to three letter code "
     res = []
     for aa in seq:
         longAa = oneToThree.get(aa, aa)
         if longAa==aa:
             print "unknown iupac", aa
         res.append(longAa)
     return "-".join(res)
 
 featTypeColors = {
 "modified residue" : "200,200,0",
 "transmembrane region" : "0,0,100",
 "glycosylation site" : "0,100,100",
 "disulfide bond" : "100,100,100",
 "topological domain" : "100,0,0"
 }
 
 def makeBedLine(mut, bed, isMut):
     disStatus = set([mut.disRelated for mut in muts])
     comments = [mut.comment for mut in muts if mut.comment!=""]
     # skip if it's not disease related 
     #if not "disRelated" in disStatus and mut.featType=="sequence variant" \
         #and len(comments)==0 and mut.disease=="":
         #noInfoCount +=1
         
     diseases = list(set([mut.disease for mut in muts if mut.disease!=""]))
     disCodes = list(set([mut.disCode for mut in muts if mut.disCode!=""]))
     acc = muts[0].acc
     dbSnpIds = [mut.dbSnpId for mut in muts]
 
     # set the bed name field to a disease, to the mutation or something else
     if isMut and mut.origAa=="":
         bed[3] = "%s-%s:del" % (mut.begin, mut.end)
     else:
         bed[3] = "%s->%s" % (mut.origAa, mut.mutAa)
 
     disName = ",".join(disCodes)
     if len(disName)>30:
         disName = disName[:30]+"..."
     if len(disCodes)>0 and not "strain" in disName:
         bed[3] += " in %s" % disName
 
     if mut.featType=="sequence variant":
         varType = "Naturally occuring sequence variant"
         bed[8] = "100,0,0"
     elif mut.featType=="mutagenesis site":
         varType = "Experimental mutation of amino acids"
         bed[8] = "0,0,100"
     else:
         bed[3] = mut.shortFeatType
+        if mut.featType in useComment:
+            bed[3] = mut.comment
+
         if mut.featType=="signal peptide":
             bed[3] = "Signal peptide"
         if mut.featType=="lipid moiety-binding region":
             bed[3] = "Lipidation"
         if mut.featType=="transmembrane region":
             bed[3] = "Transmembrane"
-        if mut.featType in useComment:
-            bed[3] = mut.comment
+            
+        if mut.comment=="Nuclear localization signal":
+            bed[3] = "Nuclear loc"
         if mut.comment=="Involved in receptor recognition and/or post-binding events":
             bed[3] = "Recept recog"
+        if mut.comment=="Fibronectin type-III":
+            bed[3] = "FibronectIII"
+
+        # more general rules
+        if mut.comment.startswith("Zinc finger protein "):
+            bed[3] = mut.comment.replace("Zinc finger protein ", "ZF-")
+        if mut.comment.startswith("Necessary for the"):
+            bed[3] = mut.comment.replace("Necessary for the ", "")
+        if mut.comment.startswith("Interaction with "):
+            bed[3] = mut.comment.replace("Interaction with ", "Int:")
+        if mut.comment.startswith("Involved in the"):
+            bed[3] = mut.comment.replace("Involved in the ", "")
+        if mut.comment.startswith("Required for "):
+            bed[3] = mut.comment.replace("Required for ", "")
         if mut.comment.startswith("Cleavage; by host "):
             bed[3] = "Cleave:"+mut.comment.split()[-1]
 
+
         if bed[3] == "Intrinsically disordered":
             bed[3] = "Disordered"
         
         if len(bed[3])>17:
             bed[3] = bed[3][:14]+"..."
 
         varType = mut.featType
         bed[8] = featTypeColors.get(mut.featType, "0,0,0")
 
     bed.append(varType)
     if isMut:
         bed.append(",".join(diseases))
 
     if isMut:
         # create description of mutation
         if int(mut.end)-int(mut.begin)==1:
             posStr = "position %s" % mut.begin
         else:
             posStr = "position %s-%s" % (mut.begin, mut.end)
 
         if mut.origAa=="":
             bed.append("%s, removal of amino acids" % (posStr))
         else:
             bed.append("%s, %s changed to %s" % \
                 (posStr, aaToLong(mut.origAa), aaToLong(mut.mutAa)))
     else:
         #  just position of feature
         if int(mut.end)-int(mut.begin)==1:
             posStr = "amino acid %s" % str(int(mut.begin)+1)
         else:
             posStr = "amino acids %s-%s" % (mut.begin, str(int(mut.end)-1))
         posStr += " on protein %s" % mut.acc
         bed.append(posStr)
 
     comments = [com for com in comments if com.strip()!=""]
     commentField = ", ".join(comments)
     if len(diseases)!=0:
         commentField = ",".join(diseases) + "; " + commentField
     bed.append(commentField)
     if isMut:
         bed.append(htmlLink('var', varIds))
         dbSnpIds = [id for id in dbSnpIds if id.strip()!=""]
         bed.append(",".join(dbSnpIds))
     bed.append(htmlLink('uniProt', [acc]))
     bed.append(htmlLink('pubmed', pmids))
     bed[5] = "."
     bedLine = "\t".join(bed)+"\n"
     return bedLine
 
 # ------ MAIN ------
 if __name__ == '__main__':
     parser = optparse.OptionParser("""usage: %prog [options] taxonId dbName liftPslFile - lift uniprot annotations to genome. 
     
     Needs a mapping PSL from uniprot nucleotide coordinates to genome positions.
     - for genomes with introns, makeLiftOver.sh from
       src/hg/utils/uniprotMutations/makeUniProtToHg.sh creates one for hg19
     - for a quicker rough version, create one with BLAT and -q=dnax folowed by
       pslProtCnv 
 
     Input uniprot files have to be created with the pubParseDb uniprot parser
 
     """)
     parser.add_option("", "--annot", dest="annot", action="store_true", help="lift other, non-variant annotations")
     parser.add_option("-d", "", dest="debug", action="store_true", help="debug output")
     parser.add_option("", "--filter", dest="filterAnnots", action="store_true", help="remove certain annotations that are not very useful, like helix,chain,beta, etc")
     parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store", help="uniprot parser directory, default %default", default="/hive/data/inside/pubs/parsedDbs")
     (options, args) = parser.parse_args()
 
     if options.debug:
         DEBUG=True
 
     taxonId, db, liftFname = args
 
     mutData = defaultdict(list)
 
     cmd = "rm -rf work; mkdir work"
     run(cmd)
 
     # create seq sizes for all uniprot sequences of this species
     uniprotFa = join(options.uniprotDir, "uniprot.%s.fa" % taxonId)
     uniprotFaGz = uniprotFa+".gz"
     if not isfile(uniprotFa):
         print "uncompressing uniprot gz fa"
         open(uniprotFa, "w").write(gzip.open(uniprotFaGz).read())
 
     if not isfile(uniprotFa):
         raise Exception("Not found: %s" % uniprotFa)
 
     print "writing nucleotide lengths of %s to %s" % (uniprotFa, "work/chromSizes")
     cmd = "faSize %s -detailed | gawk '{$2=$2*3; print}'> work/chromSizes" % uniprotFa
     run(cmd)
+
+    # keep protein lengths for filtering later, we'll remove everything that 
+    # covers the whole protein
+    seqSizes = {}
+    for line in open("work/chromSizes"):
+        seq, size = line.rstrip("\n").split()
+        seqSizes[seq] = int(size)/3
+        
     # get uniprot IDs 
     speciesAccs = set([line.split()[0] for line in open("work/chromSizes")])
     print "got %d accessions" % len(speciesAccs)
 
     # read data, write bed to file
-    # this annotates mutations on the protein sequence
+    # this annotates in protein coordinates
     print "converting tab to bed"
     ofh = open("work/temp.bed", "w")
     uniProtMutFname = join(options.uniprotDir, "uniprot.%s.mut.tab" % taxonId)
     print "reading %s" % uniProtMutFname
     for mut in iterTsvRows(open(uniProtMutFname)):
         if mut.acc not in speciesAccs:
             continue
         if options.filterAnnots and mut.featType in removeTypes:
             continue
         mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa
+
+        # remove annotations that cover the whole protein, probably not very useful
+        #print mut.begin, mut.end, mut.acc, seqSizes[mut.acc]
+        if int(mut.begin)==1 and int(mut.end)==int(seqSizes[mut.acc])+1:
+            continue
+
         mutPos = 3*(int(mut.begin)-1)
         mutPosEnd = 3*(int(mut.end)-1)
         if mutName not in mutData:
             ofh.write("\t".join([mut.acc, str(mutPos), str(mutPosEnd), mutName])+"\n")
         mutData[mutName].append(mut)
     ofh.close()
     
     # lift from protein sequence bed to target bed
     print "lifting"
     cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname
     run(cmd)
 
     print "adding extra fields"
     # read lifted bed and add uniprot annotations to it as additional fields
     mutBed = open("spMut.bed", "w")
     annotBed = open("spAnnot.bed" , "w")
     structBed = open("spStruct.bed" , "w")
 
     count = 0
     blackCount = 0
     noInfoCount = 0
     doneAnnots = set() # to skip overlaps: a set with (seq, start, end, name)
 
     for line in open("work/temp.lifted.bed"):
         bed = line.strip().split()
         muts = mutData[bed[3]]
         varIds = set([mut.varId for mut in muts])
         pmids = set()
         for mut in muts:
             pmids.update(mut.pmid.split(","))
 
         if len(pmidBlackList.intersection(pmids))==len(pmids):
             blackCount +=1
             continue
 
         if mut.featType in ["mutagenesis site", "sequence variant"]:
             isMut = True
             ofh = mutBed
         elif mut.featType in ["strand","helix", "coiled-coil region", "turn", "disulfide bond", "glycosylation site"]:
             isMut = False
             ofh = structBed
         else:
             isMut = False
             ofh = annotBed
 
         bedLine = makeBedLine(mut, bed, isMut)
         # skip annotations that we most likely have covered before
         if not isMut:
             keyFields = tuple(bedLine.split("\t")[:4])
             if keyFields in doneAnnots:
                 continue
             doneAnnots.add(keyFields)
         ofh.write(bedLine)
         count += 1
 
     print "%d features written to %s" % (count, ofh.name)
     mutBed.close()
     annotBed.close()
     structBed.close()
 
     #print "%d variants not mapped to genome" % len(notMapped)
 
     asFname = "bed12UniProtMut.as"
     cmd = "bedToBigBed -as=%s spMut.bed /hive/data/genomes/%s/chrom.sizes spMut.bb -type=bed12+ -tab" % \
         (asFname, db)
     run(cmd)
 
     asFname = "bed12UniProtAnnot.as"
     cmd = "bedToBigBed -as=%s spAnnot.bed /hive/data/genomes/%s/chrom.sizes spAnnot.bb -type=bed12+ -tab" % \
         (asFname, db)
     run(cmd)
     cmd = "bedToBigBed -as=%s spStruct.bed /hive/data/genomes/%s/chrom.sizes spStruct.bb -type=bed12+ -tab" % \
         (asFname, db)
     run(cmd)
 
     cmd = "rm -rf work"
     run(cmd)
     cmd = "wc -l spMut.bed spAnnot.bed spStruct.bed"
     run(cmd)
     #print "%d features written to spMut.bed and %s.bed" % (count, outFname, outFname)
     print "%d features skipped because of blacklisted PMIDS" % (blackCount)
     print "%d features skipped because no info (no disease evidence, no comment)" % (noInfoCount)