8c2dc989cad61bb0116cd8257cd9b424988eb64a max Mon Nov 10 13:09:45 2014 -0800 making uniprot annotations track ready for hg19, refs #13688 diff --git src/utils/uniprotLift src/utils/uniprotLift index bf1cfb0..baddfe7 100755 --- src/utils/uniprotLift +++ src/utils/uniprotLift @@ -1,19 +1,18 @@ #!/usr/bin/env python2.7 # a script to lift uniprot mutations/annotations created by the publications uniprot parser -# to a DB and create three bigbed files for them: -# one for structures, one for mutations, one for other annotations +# to a DB and create two bigbed files for them from os.path import join, isfile from os import system from collections import defaultdict, namedtuple # import the publications libraries import sys, gzip, optparse, logging,re sys.path.append("/cluster/home/max/projects/pubs/tools/lib") import maxCommon # Globals --- DEBUG = False pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"]) @@ -133,43 +132,62 @@ disName = ",".join(disCodes) if len(disName)>30: disName = disName[:30]+"..." if len(disCodes)>0 and not "strain" in disName: bed[3] += " in %s" % disName if mut.featType=="sequence variant": varType = "Naturally occuring sequence variant" bed[8] = "100,0,0" elif mut.featType=="mutagenesis site": varType = "Experimental mutation of amino acids" bed[8] = "0,0,100" else: bed[3] = mut.shortFeatType + if mut.featType in useComment: + bed[3] = mut.comment + if mut.featType=="signal peptide": bed[3] = "Signal peptide" if mut.featType=="lipid moiety-binding region": bed[3] = "Lipidation" if mut.featType=="transmembrane region": bed[3] = "Transmembrane" - if mut.featType in useComment: - bed[3] = mut.comment + + if mut.comment=="Nuclear localization signal": + bed[3] = "Nuclear loc" if mut.comment=="Involved in receptor recognition and/or post-binding events": bed[3] = "Recept recog" + if mut.comment=="Fibronectin type-III": + bed[3] = "FibronectIII" + + # more general rules + if mut.comment.startswith("Zinc finger protein "): + bed[3] = mut.comment.replace("Zinc finger protein ", "ZF-") + if mut.comment.startswith("Necessary for the"): + bed[3] = mut.comment.replace("Necessary for the ", "") + if mut.comment.startswith("Interaction with "): + bed[3] = mut.comment.replace("Interaction with ", "Int:") + if mut.comment.startswith("Involved in the"): + bed[3] = mut.comment.replace("Involved in the ", "") + if mut.comment.startswith("Required for "): + bed[3] = mut.comment.replace("Required for ", "") if mut.comment.startswith("Cleavage; by host "): bed[3] = "Cleave:"+mut.comment.split()[-1] + if bed[3] == "Intrinsically disordered": bed[3] = "Disordered" if len(bed[3])>17: bed[3] = bed[3][:14]+"..." varType = mut.featType bed[8] = featTypeColors.get(mut.featType, "0,0,0") bed.append(varType) if isMut: bed.append(",".join(diseases)) if isMut: # create description of mutation @@ -237,46 +255,60 @@ run(cmd) # create seq sizes for all uniprot sequences of this species uniprotFa = join(options.uniprotDir, "uniprot.%s.fa" % taxonId) uniprotFaGz = uniprotFa+".gz" if not isfile(uniprotFa): print "uncompressing uniprot gz fa" open(uniprotFa, "w").write(gzip.open(uniprotFaGz).read()) if not isfile(uniprotFa): raise Exception("Not found: %s" % uniprotFa) print "writing nucleotide lengths of %s to %s" % (uniprotFa, "work/chromSizes") cmd = "faSize %s -detailed | gawk '{$2=$2*3; print}'> work/chromSizes" % uniprotFa run(cmd) + + # keep protein lengths for filtering later, we'll remove everything that + # covers the whole protein + seqSizes = {} + for line in open("work/chromSizes"): + seq, size = line.rstrip("\n").split() + seqSizes[seq] = int(size)/3 + # get uniprot IDs speciesAccs = set([line.split()[0] for line in open("work/chromSizes")]) print "got %d accessions" % len(speciesAccs) # read data, write bed to file - # this annotates mutations on the protein sequence + # this annotates in protein coordinates print "converting tab to bed" ofh = open("work/temp.bed", "w") uniProtMutFname = join(options.uniprotDir, "uniprot.%s.mut.tab" % taxonId) print "reading %s" % uniProtMutFname for mut in iterTsvRows(open(uniProtMutFname)): if mut.acc not in speciesAccs: continue if options.filterAnnots and mut.featType in removeTypes: continue mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa + + # remove annotations that cover the whole protein, probably not very useful + #print mut.begin, mut.end, mut.acc, seqSizes[mut.acc] + if int(mut.begin)==1 and int(mut.end)==int(seqSizes[mut.acc])+1: + continue + mutPos = 3*(int(mut.begin)-1) mutPosEnd = 3*(int(mut.end)-1) if mutName not in mutData: ofh.write("\t".join([mut.acc, str(mutPos), str(mutPosEnd), mutName])+"\n") mutData[mutName].append(mut) ofh.close() # lift from protein sequence bed to target bed print "lifting" cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname run(cmd) print "adding extra fields" # read lifted bed and add uniprot annotations to it as additional fields mutBed = open("spMut.bed", "w")