commit 8c2dc989cad61bb0116cd8257cd9b424988eb64a
Author: max
Date:   Mon Nov 10 13:09:45 2014 -0800

    making uniprot annotations track ready for hg19, refs #13688

diff --git src/utils/uniprotLift src/utils/uniprotLift
index bf1cfb0..baddfe7 100755
--- src/utils/uniprotLift
+++ src/utils/uniprotLift
@@ -1,349 +1,381 @@
#!/usr/bin/env python2.7
# a script to lift uniprot mutations/annotations created by the publications uniprot parser
-# to a DB and create three bigbed files for them:
-# one for structures, one for mutations, one for other annotations
+# to a DB and create two bigbed files for them

from os.path import join, isfile
from os import system
from collections import defaultdict, namedtuple

# import the publications libraries
import sys, gzip, optparse, logging, re
sys.path.append("/cluster/home/max/projects/pubs/tools/lib")
import maxCommon

# Globals ---

DEBUG = False

pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"])

# if --filter option provided, remove these altogether
removeTypes = set(["helix", "strand", "beta", "turn", "signal"])

# by default, only output identifiers, not html links to them
# html links are good for custom tracks
# identifiers are good for local spMut tracks
createLinks = False

urls = {"var": "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?",
        "uniProt": "http://www.uniprot.org/uniprot/",
        "pubmed": "http://www.ncbi.nlm.nih.gov/pubmed/"
       }

# some feature types should not go into the bed file name field, but rather be replaced
# with the curator's comments
useComment = set(["domain", "chain", "region of interest", "topological domain", "short sequence motif"])

# -------------

def iterTsvRows(inFile, encoding="utf8", fieldSep="\t", isGzip=False):
    """
        parses tab-sep file with headers as field names
        yields collections.namedtuples
        strips "#"-prefix from header line
    """
    fh = inFile
    line1 = fh.readline()
    line1 = line1.strip("\n").strip("#")
    headers = line1.split(fieldSep)
    headers = [re.sub("[^a-zA-Z0-9_]", "_", h) for h in headers]
    Record = namedtuple('tsvRec', headers)
    for line in fh:
        line = line.rstrip("\n")
        fields = line.split(fieldSep)
        if encoding != None:
            fields = [f.decode(encoding) for f in fields]
        try:
            rec = Record(*fields)
        except Exception, msg:
            logging.error("Exception occurred while parsing line, %s" % msg)
            logging.error("Filename %s" % fh.name)
            logging.error("Line was: %s" % line)
            logging.error("Does number of fields match headers?")
            logging.error("Headers are: %s" % headers)
            raise Exception("wrong field count in line %s" % line)
        # convert fields to correct data type
        yield rec

def htmlLink(urlType, accs):
    if createLinks:
        strList = []
        for acc in accs:
            strList.append('<a href="%s%s">%s</a>' % (urls[urlType], acc, acc))
        return ", ".join(strList)
    else:
        return ",".join(accs)

def run(cmd):
    if DEBUG:
        print "running", cmd
    ret = system(cmd)
    if ret!=0:
        print "Could not run %s" % cmd
        sys.exit(1)

threeToOne = \
    {'Cys': 'C', 'Asp': 'D', 'Ser': 'S', 'Gln': 'Q',
     'Lys': 'K', 'Ile': 'I', 'Pro': 'P', 'Thr': 'T',
     'Phe': 'F', 'Asn': 'N', 'Gly': 'G', 'His': 'H',
     'Leu': 'L', 'Arg': 'R', 'Trp': 'W', 'Ala': 'A',
     'Val': 'V', 'Glu': 'E', 'Tyr': 'Y', 'Met': 'M',
     'Sec': 'U'  # very rare amino acid selenocysteine (encoded by a stop codon)
    }

oneToThree = dict([[v, k] for k, v in threeToOne.items()])

def aaToLong(seq):
    " convert amino acid to three letter code "
    res = []
    for aa in seq:
        longAa = oneToThree.get(aa, aa)
        if longAa==aa:
            print "unknown iupac", aa
        res.append(longAa)
    return "-".join(res)

featTypeColors = {
    "modified residue": "200,200,0",
    "transmembrane region": "0,0,100",
    "glycosylation site":
"0,100,100", "disulfide bond" : "100,100,100", "topological domain" : "100,0,0" } def makeBedLine(mut, bed, isMut): disStatus = set([mut.disRelated for mut in muts]) comments = [mut.comment for mut in muts if mut.comment!=""] # skip if it's not disease related #if not "disRelated" in disStatus and mut.featType=="sequence variant" \ #and len(comments)==0 and mut.disease=="": #noInfoCount +=1 diseases = list(set([mut.disease for mut in muts if mut.disease!=""])) disCodes = list(set([mut.disCode for mut in muts if mut.disCode!=""])) acc = muts[0].acc dbSnpIds = [mut.dbSnpId for mut in muts] # set the bed name field to a disease, to the mutation or something else if isMut and mut.origAa=="": bed[3] = "%s-%s:del" % (mut.begin, mut.end) else: bed[3] = "%s->%s" % (mut.origAa, mut.mutAa) disName = ",".join(disCodes) if len(disName)>30: disName = disName[:30]+"..." if len(disCodes)>0 and not "strain" in disName: bed[3] += " in %s" % disName if mut.featType=="sequence variant": varType = "Naturally occuring sequence variant" bed[8] = "100,0,0" elif mut.featType=="mutagenesis site": varType = "Experimental mutation of amino acids" bed[8] = "0,0,100" else: bed[3] = mut.shortFeatType + if mut.featType in useComment: + bed[3] = mut.comment + if mut.featType=="signal peptide": bed[3] = "Signal peptide" if mut.featType=="lipid moiety-binding region": bed[3] = "Lipidation" if mut.featType=="transmembrane region": bed[3] = "Transmembrane" - if mut.featType in useComment: - bed[3] = mut.comment + + if mut.comment=="Nuclear localization signal": + bed[3] = "Nuclear loc" if mut.comment=="Involved in receptor recognition and/or post-binding events": bed[3] = "Recept recog" + if mut.comment=="Fibronectin type-III": + bed[3] = "FibronectIII" + + # more general rules + if mut.comment.startswith("Zinc finger protein "): + bed[3] = mut.comment.replace("Zinc finger protein ", "ZF-") + if mut.comment.startswith("Necessary for the"): + bed[3] = mut.comment.replace("Necessary for the ", "") + if mut.comment.startswith("Interaction with "): + bed[3] = mut.comment.replace("Interaction with ", "Int:") + if mut.comment.startswith("Involved in the"): + bed[3] = mut.comment.replace("Involved in the ", "") + if mut.comment.startswith("Required for "): + bed[3] = mut.comment.replace("Required for ", "") if mut.comment.startswith("Cleavage; by host "): bed[3] = "Cleave:"+mut.comment.split()[-1] + if bed[3] == "Intrinsically disordered": bed[3] = "Disordered" if len(bed[3])>17: bed[3] = bed[3][:14]+"..." 
        varType = mut.featType
        bed[8] = featTypeColors.get(mut.featType, "0,0,0")

    bed.append(varType)
    if isMut:
        bed.append(",".join(diseases))

    if isMut:
        # create description of mutation
        if int(mut.end)-int(mut.begin)==1:
            posStr = "position %s" % mut.begin
        else:
            posStr = "position %s-%s" % (mut.begin, mut.end)
        if mut.origAa=="":
            bed.append("%s, removal of amino acids" % (posStr))
        else:
            bed.append("%s, %s changed to %s" % \
                (posStr, aaToLong(mut.origAa), aaToLong(mut.mutAa)))
    else:
        # just position of feature
        if int(mut.end)-int(mut.begin)==1:
            posStr = "amino acid %s" % str(int(mut.begin)+1)
        else:
            posStr = "amino acids %s-%s" % (mut.begin, str(int(mut.end)-1))
        posStr += " on protein %s" % mut.acc
        bed.append(posStr)

    comments = [com for com in comments if com.strip()!=""]
    commentField = ", ".join(comments)
    if len(diseases)!=0:
        commentField = ",".join(diseases) + "; " + commentField
    bed.append(commentField)

    if isMut:
        bed.append(htmlLink('var', varIds))
        dbSnpIds = [id for id in dbSnpIds if id.strip()!=""]
        bed.append(",".join(dbSnpIds))
    bed.append(htmlLink('uniProt', [acc]))
    bed.append(htmlLink('pubmed', pmids))
    bed[5] = "."
    bedLine = "\t".join(bed)+"\n"
    return bedLine

# ------ MAIN ------

if __name__ == '__main__':
    parser = optparse.OptionParser("""usage: %prog [options] taxonId dbName liftPslFile - lift uniprot annotations to genome.

    Needs a mapping PSL from uniprot nucleotide coordinates to genome positions:
    - for genomes with introns, makeLiftOver.sh from src/hg/utils/uniprotMutations/makeUniProtToHg.sh creates one for hg19
    - for a quicker rough version, create one with BLAT and -q=dnax followed by pslProtCnv

    Input uniprot files have to be created with the pubParseDb uniprot parser
    """)

    parser.add_option("", "--annot", dest="annot", action="store_true",
        help="lift other, non-variant annotations")
    parser.add_option("-d", "", dest="debug", action="store_true",
        help="debug output")
    parser.add_option("", "--filter", dest="filterAnnots", action="store_true",
        help="remove certain annotations that are not very useful, like helix, chain, beta, etc")
    parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store",
        help="uniprot parser directory, default %default", default="/hive/data/inside/pubs/parsedDbs")
    (options, args) = parser.parse_args()

    if options.debug:
        DEBUG = True

    taxonId, db, liftFname = args

    mutData = defaultdict(list)

    cmd = "rm -rf work; mkdir work"
    run(cmd)

    # create seq sizes for all uniprot sequences of this species
    uniprotFa = join(options.uniprotDir, "uniprot.%s.fa" % taxonId)
    uniprotFaGz = uniprotFa+".gz"
    if not isfile(uniprotFa):
        print "uncompressing uniprot gz fa"
        open(uniprotFa, "w").write(gzip.open(uniprotFaGz).read())

    if not isfile(uniprotFa):
        raise Exception("Not found: %s" % uniprotFa)

    print "writing nucleotide lengths of %s to %s" % (uniprotFa, "work/chromSizes")
    cmd = "faSize %s -detailed | gawk '{$2=$2*3; print}' > work/chromSizes" % uniprotFa
    run(cmd)
+
+    # keep protein lengths for filtering later, we'll remove everything that
+    # covers the whole protein
+    seqSizes = {}
+    for line in open("work/chromSizes"):
+        seq, size = line.rstrip("\n").split()
+        seqSizes[seq] = int(size)/3
+
    # get uniprot IDs
    speciesAccs = set([line.split()[0] for line in open("work/chromSizes")])
    print "got %d accessions" % len(speciesAccs)

    # read data, write bed to file
-    # this annotates mutations on the protein sequence
+    # this annotates in protein coordinates
    print "converting tab to bed"
    ofh = open("work/temp.bed", "w")
    uniProtMutFname = join(options.uniprotDir, "uniprot.%s.mut.tab" % taxonId)
print "reading %s" % uniProtMutFname for mut in iterTsvRows(open(uniProtMutFname)): if mut.acc not in speciesAccs: continue if options.filterAnnots and mut.featType in removeTypes: continue mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa + + # remove annotations that cover the whole protein, probably not very useful + #print mut.begin, mut.end, mut.acc, seqSizes[mut.acc] + if int(mut.begin)==1 and int(mut.end)==int(seqSizes[mut.acc])+1: + continue + mutPos = 3*(int(mut.begin)-1) mutPosEnd = 3*(int(mut.end)-1) if mutName not in mutData: ofh.write("\t".join([mut.acc, str(mutPos), str(mutPosEnd), mutName])+"\n") mutData[mutName].append(mut) ofh.close() # lift from protein sequence bed to target bed print "lifting" cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname run(cmd) print "adding extra fields" # read lifted bed and add uniprot annotations to it as additional fields mutBed = open("spMut.bed", "w") annotBed = open("spAnnot.bed" , "w") structBed = open("spStruct.bed" , "w") count = 0 blackCount = 0 noInfoCount = 0 doneAnnots = set() # to skip overlaps: a set with (seq, start, end, name) for line in open("work/temp.lifted.bed"): bed = line.strip().split() muts = mutData[bed[3]] varIds = set([mut.varId for mut in muts]) pmids = set() for mut in muts: pmids.update(mut.pmid.split(",")) if len(pmidBlackList.intersection(pmids))==len(pmids): blackCount +=1 continue if mut.featType in ["mutagenesis site", "sequence variant"]: isMut = True ofh = mutBed elif mut.featType in ["strand","helix", "coiled-coil region", "turn", "disulfide bond", "glycosylation site"]: isMut = False ofh = structBed else: isMut = False ofh = annotBed bedLine = makeBedLine(mut, bed, isMut) # skip annotations that we most likely have covered before if not isMut: keyFields = tuple(bedLine.split("\t")[:4]) if keyFields in doneAnnots: continue doneAnnots.add(keyFields) ofh.write(bedLine) count += 1 print "%d features written to %s" % (count, ofh.name) mutBed.close() annotBed.close() structBed.close() #print "%d variants not mapped to genome" % len(notMapped) asFname = "bed12UniProtMut.as" cmd = "bedToBigBed -as=%s spMut.bed /hive/data/genomes/%s/chrom.sizes spMut.bb -type=bed12+ -tab" % \ (asFname, db) run(cmd) asFname = "bed12UniProtAnnot.as" cmd = "bedToBigBed -as=%s spAnnot.bed /hive/data/genomes/%s/chrom.sizes spAnnot.bb -type=bed12+ -tab" % \ (asFname, db) run(cmd) cmd = "bedToBigBed -as=%s spStruct.bed /hive/data/genomes/%s/chrom.sizes spStruct.bb -type=bed12+ -tab" % \ (asFname, db) run(cmd) cmd = "rm -rf work" run(cmd) cmd = "wc -l spMut.bed spAnnot.bed spStruct.bed" run(cmd) #print "%d features written to spMut.bed and %s.bed" % (count, outFname, outFname) print "%d features skipped because of blacklisted PMIDS" % (blackCount) print "%d features skipped because no info (no disease evidence, no comment)" % (noInfoCount)