9ab0e5cc221ce3e6e0c19b50940eb0a24ea56243 max Thu Sep 25 01:22:04 2014 -0700 moving small tools into utils directory

diff --git src/utils/uniprotLift src/utils/uniprotLift
new file mode 100755
index 0000000..bf1cfb0
--- /dev/null
+++ src/utils/uniprotLift
@@ -0,0 +1,349 @@
#!/usr/bin/env python2.7
# a script to lift UniProt mutations/annotations created by the publications UniProt parser
# onto a genome assembly and create three bigBed files from them:
# one for structures, one for mutations, one for all other annotations

from os.path import join, isfile
from os import system
from collections import defaultdict, namedtuple

# import the publications libraries
import sys, gzip, optparse, logging, re
sys.path.append("/cluster/home/max/projects/pubs/tools/lib")
import maxCommon

# Globals ---
DEBUG = False

# features supported only by these PMIDs are skipped (counted as "blacklisted" below)
pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"])

# if the --filter option is provided, remove these feature types altogether
removeTypes = set(["helix", "strand", "beta", "turn", "signal"])

# by default, only output identifiers, not html links to them
# html links are good for custom tracks
# identifiers are good for local spMut tracks
createLinks = False

urls = {"var"     : "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?",
        "uniProt" : "http://www.uniprot.org/uniprot/",
        "pubmed"  : "http://www.ncbi.nlm.nih.gov/pubmed/"
        }

# some feature types should not go into the bed name field, but rather be replaced
# with the curator's comments
useComment = set(["domain", "chain", "region of interest", "topological domain", "short sequence motif"])

# -------------

def iterTsvRows(inFile, encoding="utf8", fieldSep="\t", isGzip=False):
    """
    parses a tab-separated file whose header line supplies the field names,
    yields collections.namedtuple records,
    strips the "#"-prefix from the header line
    """

    fh = inFile
    line1 = fh.readline()
    line1 = line1.strip("\n").strip("#")
    headers = line1.split(fieldSep)
    headers = [re.sub("[^a-zA-Z0-9_]", "_", h) for h in headers]

    Record = namedtuple('tsvRec', headers)
    for line in fh:
        line = line.rstrip("\n")
        fields = line.split(fieldSep)
        if encoding!=None:
            fields = [f.decode(encoding) for f in fields]
        try:
            rec = Record(*fields)
        except Exception as msg:
            logging.error("Exception occurred while parsing line, %s" % msg)
            logging.error("Filename %s" % fh.name)
            logging.error("Line was: %s" % line)
            logging.error("Does number of fields match headers?")
            logging.error("Headers are: %s" % headers)
            raise Exception("wrong field count in line %s" % line)
        # convert fields to correct data type
        yield rec

def htmlLink(urlType, accs):
    " return accessions either as plain comma-separated IDs or as HTML links, depending on createLinks "
    if createLinks:
        strList = []
        for acc in accs:
            strList.append('<a href="%s%s">%s</a>' % (urls[urlType], acc, acc))
        return ", ".join(strList)
    else:
        return ",".join(accs)

def run(cmd):
    " run a shell command and abort on failure "
    if DEBUG:
        print "running", cmd
    ret = system(cmd)
    if ret!=0:
        print "Could not run %s" % cmd
        sys.exit(1)

threeToOne = \
    {'Cys': 'C', 'Asp': 'D', 'Ser': 'S', 'Gln': 'Q', 'Lys': 'K',
     'Ile': 'I', 'Pro': 'P', 'Thr': 'T', 'Phe': 'F', 'Asn': 'N',
     'Gly': 'G', 'His': 'H', 'Leu': 'L', 'Arg': 'R', 'Trp': 'W',
     'Ala': 'A', 'Val': 'V', 'Glu': 'E', 'Tyr': 'Y', 'Met': 'M',
     'Sec': 'U' # selenocysteine, a very rare amino acid encoded by a recoded stop codon
    }

oneToThree = dict([[v,k] for k,v in threeToOne.items()])

def aaToLong(seq):
    " convert a one-letter amino acid string to dash-separated three-letter codes "
    res = []
    for aa in seq:
        longAa = oneToThree.get(aa, aa)
        if longAa==aa:
            print "unknown iupac", aa
        res.append(longAa)
    return "-".join(res)

# RGB colors for selected feature types in the annotation/structure tracks
featTypeColors = {
    "modified residue"     : "200,200,0",
    "transmembrane region" : "0,0,100",
    "glycosylation site"   : "0,100,100",
    "disulfide bond"       : "100,100,100",
    "topological domain"   : "100,0,0"
}

def makeBedLine(mut, bed, isMut):
    """
    rewrite the name and color fields of a lifted BED12 line and append the extra
    UniProt fields; relies on the module-level muts, varIds and pmids variables
    that the main loop sets up for the current feature
    """
    # disStatus is only used by the (currently disabled) disease filter below
    disStatus = set([mut.disRelated for mut in muts])
    comments = [mut.comment for mut in muts if mut.comment!=""]
    # skip if it's not disease related
    #if not "disRelated" in disStatus and mut.featType=="sequence variant" \
    #and len(comments)==0 and mut.disease=="":
        #noInfoCount +=1

    diseases = list(set([mut.disease for mut in muts if mut.disease!=""]))
    disCodes = list(set([mut.disCode for mut in muts if mut.disCode!=""]))
    acc = muts[0].acc
    dbSnpIds = [mut.dbSnpId for mut in muts]

    # set the bed name field to a disease, to the mutation or something else
    if isMut and mut.origAa=="":
        bed[3] = "%s-%s:del" % (mut.begin, mut.end)
    else:
        bed[3] = "%s->%s" % (mut.origAa, mut.mutAa)

    disName = ",".join(disCodes)
    if len(disName)>30:
        disName = disName[:30]+"..."
    if len(disCodes)>0 and not "strain" in disName:
        bed[3] += " in %s" % disName

    if mut.featType=="sequence variant":
        varType = "Naturally occurring sequence variant"
        bed[8] = "100,0,0"
    elif mut.featType=="mutagenesis site":
        varType = "Experimental mutation of amino acids"
        bed[8] = "0,0,100"
    else:
        bed[3] = mut.shortFeatType
        if mut.featType=="signal peptide":
            bed[3] = "Signal peptide"
        if mut.featType=="lipid moiety-binding region":
            bed[3] = "Lipidation"
        if mut.featType=="transmembrane region":
            bed[3] = "Transmembrane"
        if mut.featType in useComment:
            bed[3] = mut.comment
        if mut.comment=="Involved in receptor recognition and/or post-binding events":
            bed[3] = "Recept recog"
        if mut.comment.startswith("Cleavage; by host "):
            bed[3] = "Cleave:"+mut.comment.split()[-1]

        if bed[3] == "Intrinsically disordered":
            bed[3] = "Disordered"

        if len(bed[3])>17:
            bed[3] = bed[3][:14]+"..."

        varType = mut.featType
        bed[8] = featTypeColors.get(mut.featType, "0,0,0")

    bed.append(varType)
    if isMut:
        bed.append(",".join(diseases))

    if isMut:
        # create description of mutation
        if int(mut.end)-int(mut.begin)==1:
            posStr = "position %s" % mut.begin
        else:
            posStr = "position %s-%s" % (mut.begin, mut.end)

        if mut.origAa=="":
            bed.append("%s, removal of amino acids" % (posStr))
        else:
            bed.append("%s, %s changed to %s" % \
                (posStr, aaToLong(mut.origAa), aaToLong(mut.mutAa)))
    else:
        # just the position of the feature
        if int(mut.end)-int(mut.begin)==1:
            posStr = "amino acid %s" % str(int(mut.begin)+1)
        else:
            posStr = "amino acids %s-%s" % (mut.begin, str(int(mut.end)-1))
        posStr += " on protein %s" % mut.acc
        bed.append(posStr)

    comments = [com for com in comments if com.strip()!=""]
    commentField = ", ".join(comments)
    if len(diseases)!=0:
        commentField = ",".join(diseases) + "; " + commentField
    bed.append(commentField)

    if isMut:
        bed.append(htmlLink('var', varIds))
        dbSnpIds = [id for id in dbSnpIds if id.strip()!=""]
        bed.append(",".join(dbSnpIds))
    bed.append(htmlLink('uniProt', [acc]))
    bed.append(htmlLink('pubmed', pmids))
    bed[5] = "." # clear the strand field
    bedLine = "\t".join(bed)+"\n"
    return bedLine

# ------ MAIN ------
if __name__ == '__main__':
    parser = optparse.OptionParser("""usage: %prog [options] taxonId dbName liftPslFile - lift UniProt annotations to a genome.

    Needs a mapping PSL from UniProt nucleotide coordinates to genome positions.

    - for genomes with introns, makeLiftOver.sh from
      src/hg/utils/uniprotMutations/makeUniProtToHg.sh creates one for hg19
    - for a quicker rough version, create one with BLAT and -q=dnax followed by
      pslProtCnv

    Input UniProt files have to be created with the pubParseDb uniprot parser

    """)
    parser.add_option("", "--annot", dest="annot", action="store_true", help="lift other, non-variant annotations")
    parser.add_option("-d", "", dest="debug", action="store_true", help="debug output")
    parser.add_option("", "--filter", dest="filterAnnots", action="store_true", help="remove annotations that are rarely useful in a browser track, like helix, strand, beta, turn or signal")
    parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store", help="uniprot parser directory, default %default", default="/hive/data/inside/pubs/parsedDbs")
    (options, args) = parser.parse_args()

    if options.debug:
        DEBUG=True

    if len(args)!=3:
        parser.error("expecting exactly three arguments: taxonId dbName liftPslFile")
    taxonId, db, liftFname = args

    mutData = defaultdict(list)

    cmd = "rm -rf work; mkdir work"
    run(cmd)

    # create sequence sizes for all UniProt sequences of this species
    uniprotFa = join(options.uniprotDir, "uniprot.%s.fa" % taxonId)
    uniprotFaGz = uniprotFa+".gz"
    if not isfile(uniprotFa):
        print "uncompressing uniprot gz fa"
        open(uniprotFa, "w").write(gzip.open(uniprotFaGz).read())

    if not isfile(uniprotFa):
        raise Exception("Not found: %s" % uniprotFa)

    print "writing nucleotide lengths of %s to %s" % (uniprotFa, "work/chromSizes")
    # protein length * 3 = length of the virtual coding sequence
    cmd = "faSize %s -detailed | gawk '{$2=$2*3; print}'> work/chromSizes" % uniprotFa
    run(cmd)
    # get uniprot IDs
    speciesAccs = set([line.split()[0] for line in open("work/chromSizes")])
    print "got %d accessions" % len(speciesAccs)

    # read data, write bed to file
    # this annotates mutations on the protein sequence
    print "converting tab to bed"
    ofh = open("work/temp.bed", "w")
    uniProtMutFname = join(options.uniprotDir, "uniprot.%s.mut.tab" % taxonId)
    print "reading %s" % uniProtMutFname
    for mut in iterTsvRows(open(uniProtMutFname)):
        if mut.acc not in speciesAccs:
            continue
        if options.filterAnnots and mut.featType in removeTypes:
            continue
        # all records that share a name are merged into one BED feature later
        mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa
        mutPos = 3*(int(mut.begin)-1)
        mutPosEnd = 3*(int(mut.end)-1)
        if mutName not in mutData:
            ofh.write("\t".join([mut.acc, str(mutPos), str(mutPosEnd), mutName])+"\n")
        mutData[mutName].append(mut)
    ofh.close()
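
    # Editorial illustration, not part of the original commit: the loop above treats each
    # protein as a virtual coding sequence, so work/chromSizes stores 3*proteinLength and an
    # annotation starting at amino acid N begins at base 3*(N-1), 0-based. For example, a
    # variant with begin=10, end=11 becomes the range 27-30, exactly the codon of residue 10;
    # the pslMap step below then projects this range onto the genome.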

    # lift from protein-coordinate bed to genome bed:
    # protein bed -> psl -> map through the lift psl -> genome bed, then clean up overlapping blocks
    print "lifting"
    cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname
    run(cmd)

    print "adding extra fields"
    # read the lifted bed and add the UniProt annotations to it as additional fields
    mutBed = open("spMut.bed", "w")
    annotBed = open("spAnnot.bed", "w")
    structBed = open("spStruct.bed", "w")

    count = 0
    blackCount = 0
    noInfoCount = 0 # only incremented if the disease filter in makeBedLine is re-enabled
    doneAnnots = set() # to skip overlaps: a set with (seq, start, end, name)

    for line in open("work/temp.lifted.bed"):
        bed = line.strip().split()
        muts = mutData[bed[3]]
        varIds = set([mut.varId for mut in muts])
        pmids = set()
        for mut in muts:
            pmids.update(mut.pmid.split(","))

        # skip features whose only support comes from blacklisted PMIDs
        if len(pmidBlackList.intersection(pmids))==len(pmids):
            blackCount +=1
            continue

        # all records of one feature share the same featType (it is part of the name),
        # so the last mut from the loop above is representative
        if mut.featType in ["mutagenesis site", "sequence variant"]:
            isMut = True
            ofh = mutBed
        elif mut.featType in ["strand", "helix", "coiled-coil region", "turn", "disulfide bond", "glycosylation site"]:
            isMut = False
            ofh = structBed
        else:
            isMut = False
            ofh = annotBed

        bedLine = makeBedLine(mut, bed, isMut)
        # skip annotations that we most likely have covered before
        if not isMut:
            keyFields = tuple(bedLine.split("\t")[:4])
            if keyFields in doneAnnots:
                continue
            doneAnnots.add(keyFields)
        ofh.write(bedLine)
        count += 1

    print "%d features written to spMut.bed, spAnnot.bed and spStruct.bed" % count
    mutBed.close()
    annotBed.close()
    structBed.close()

    #print "%d variants not mapped to genome" % len(notMapped)

    asFname = "bed12UniProtMut.as"
    cmd = "bedToBigBed -as=%s spMut.bed /hive/data/genomes/%s/chrom.sizes spMut.bb -type=bed12+ -tab" % \
        (asFname, db)
    run(cmd)

    asFname = "bed12UniProtAnnot.as"
    cmd = "bedToBigBed -as=%s spAnnot.bed /hive/data/genomes/%s/chrom.sizes spAnnot.bb -type=bed12+ -tab" % \
        (asFname, db)
    run(cmd)
    # the structure track re-uses the annotation .as schema
    cmd = "bedToBigBed -as=%s spStruct.bed /hive/data/genomes/%s/chrom.sizes spStruct.bb -type=bed12+ -tab" % \
        (asFname, db)
    run(cmd)

    cmd = "rm -rf work"
    run(cmd)
    cmd = "wc -l spMut.bed spAnnot.bed spStruct.bed"
    run(cmd)
    #print "%d features written to spMut.bed and %s.bed" % (count, outFname, outFname)
    print "%d features skipped because of blacklisted PMIDs" % (blackCount)
    print "%d features skipped because no info (no disease evidence, no comment)" % (noInfoCount)
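
A hypothetical invocation, pieced together from the usage message and option defaults above
(the argument order is taxonId dbName liftPslFile; 9606 is the NCBI taxon ID for human, hg19
the target assembly, and the PSL file name is only illustrative, standing in for the output
of makeUniProtToHg.sh):

    uniprotLift --filter -u /hive/data/inside/pubs/parsedDbs 9606 hg19 uniProtToHg19.over.psl

Note that bed12UniProtMut.as and bed12UniProtAnnot.as must be present in the working directory
for the bedToBigBed calls, and that spMut.bb, spAnnot.bb and spStruct.bb are written to the
current directory; running bigBedInfo on one of them is a quick sanity check of the result.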