0e029fa9d8deb0b20ba6cd85a43844698aa631e0 max Mon Jul 24 01:08:01 2017 -0400 major changes to the uniprot track: splitting by annotation into more subtracks, reorg of the code so it can handle that many files, getting ready for ottomization and multi-org running. refs #19351 diff --git src/utils/uniprotLift src/utils/uniprotLift deleted file mode 100755 index b644feb..0000000 --- src/utils/uniprotLift +++ /dev/null @@ -1,442 +0,0 @@ -#!/usr/bin/env python2.7 -# a script to lift uniprot mutations/annotations created by the publications uniprot parser -# to a DB and create two bigbed files for them - -from os.path import join, isfile -from os import system -from collections import defaultdict, namedtuple - -# import the publications libraries -import sys, gzip, optparse, logging,re - -# Globals --- -DEBUG = False - -pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"]) - -# if --filter option provided, remove these altogether -removeTypes = set(["helix", "strand", "beta", "turn", "signal"]) - -# by default, only output identifiers, not html links to them -# html links are good for custom tracks -# identifers are good for local spMut tracks -createLinks = False - -urls = { -#"var": "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?", -"var": "http://www.uniprot.org/uniprot/", -"uniProt" : "http://www.uniprot.org/uniprot/", -"pubmed" : "http://www.ncbi.nlm.nih.gov/pubmed/" -} - -# some feature types should not go into the bed file name field, but rather be replaced -# with the curator's comments -useComment = set(["domain", "chain","region of interest","topological domain","short sequence motif"]) - -# ------------- - -def iterTsvRows(inFile,encoding="utf8", fieldSep="\t", isGzip=False): - """ - parses tab-sep file with headers as field names - yields collection.namedtuples - strips "#"-prefix from header line - """ - - fh = inFile - line1 = fh.readline() - line1 = line1.strip("\n").strip("#") - headers = line1.split(fieldSep) - headers = [re.sub("[^a-zA-Z0-9_]","_", h) for h in headers] - - Record = namedtuple('tsvRec', headers) - for line in fh: - line = line.rstrip("\n") - fields = line.split(fieldSep) - if encoding!=None: - fields = [f.decode(encoding) for f in fields] - try: - rec = Record(*fields) - except Exception, msg: - logging.error("Exception occured while parsing line, %s" % msg) - logging.error("Filename %s" % fh.name) - logging.error("Line was: %s" % line) - logging.error("Does number of fields match headers?") - logging.error("Headers are: %s" % headers) - raise Exception("wrong field count in line %s" % line) - # convert fields to correct data type - yield rec - -def htmlLink(urlType, accs): - if createLinks: - strList = [] - for acc in accs: - strList.append('%s' % (urls[urlType], acc, acc)) - return ", ".join(strList) - else: - return ",".join(accs) - -def run(cmd): - if DEBUG: - print "running", cmd - ret = system(cmd) - if ret!=0: - print "Could not run %s" % cmd - sys.exit(1) - -threeToOne = \ - {'Cys': 'C', 'Asp': 'D', 'Ser': 'S', 'Gln': 'Q', 'Lys': 'K', - 'Ile': 'I', 'Pro': 'P', 'Thr': 'T', 'Phe': 'F', 'Asn': 'N', - 'Gly': 'G', 'His': 'H', 'Leu': 'L', 'Arg': 'R', 'Trp': 'W', - 'Ala': 'A', 'Val':'V', 'Glu': 'E', 'Tyr': 'Y', 'Met': 'M', - 'Sec': 'U' # very very rare amino acid (=stop codon) - } - -oneToThree = dict([[v,k] for k,v in threeToOne.items()]) - -def aaToLong(seq): - " convert amino acid to three letter code " - res = [] - for aa in seq: - longAa = oneToThree.get(aa, aa) - if longAa==aa: - print "unknown iupac", aa - res.append(longAa) - return "-".join(res) - -featTypeColors = { -"modified residue" : "200,200,0", -"transmembrane region" : "0,0,100", -"glycosylation site" : "0,100,100", -"disulfide bond" : "100,100,100", -"topological domain" : "100,0,0" -} - -disShortNames = { - "hepatocellular carcinoma" : "HepatocC", - "a hepatocellular carcinoma" : "HepatocC", - "a hepatocellular carcinoma sample" : "HepatocC", - "hepatocellular carcinomas" : "HepatocC", - "ovarian cancer" : "OvC", - "colon cancer" : "ColC", - "colorectal cancer" : "ColrectC", - "hepatoblastoma" : "HepatoBl", - "a sporadic cancer": "sporadic", - "sporadic cancers": "sporadic", - "non-small cell lung cancer cell lines": "NSCLC-CL", - "a colon cancer cell line":"ColC-CL" -} - -def removeAnds(disNames): - " remove useless 'and' in disease names " - newNames = [] - for name in disNames: - if " and " in name: - splitNames = name.split(" and ") - newNames.extend(splitNames) - del disCodes[disCodes.index(name)] - disCodes.extend(splitNames) - else: - newNames.append(name) - return newNames - -def shortenDisCodes(disCodes): - " some disease names are not shortened yet by UniProt. Do this now. " - #print "XX", disNames, disCodes - #assert(len(disNames)==len(disCodes)) - newCodes = [] - for code in disCodes: - if " and " in code: - splitCodes = code.split(" and ") - newCodes.append(disShortNames.get(splitCodes[0], splitCodes[0])) - newCodes.append(disShortNames.get(splitCodes[1], splitCodes[1])) - else: - newCodes.append(disShortNames.get(code, code)) - return newCodes - #return list(set(newCodes)) - -xxFh = open("uniprotLiftDbg.txt", "w") - -def makeBedLine(muts, bed, isMut): - mut = muts[-1] - disStatus = set([mut.disRelated for mut in muts]) - comments = [mut.comment for mut in muts if mut.comment!=""] - # skip if it's not disease related - #if not "disRelated" in disStatus and mut.featType=="sequence variant" \ - #and len(comments)==0 and mut.disease=="": - #noInfoCount +=1 - - diseases = list([mut.disease for mut in muts if mut.disease!=""]) - disCodes = list([mut.disCode for mut in muts if mut.disCode!=""]) - #diseases = removeAnds(diseases) - if len(diseases)!=0: - xxFh.write("%s\t%s\n" % ("|".join(diseases), "|".join(disCodes))) - disCodes = shortenDisCodes(disCodes) - - acc = muts[0].acc - dbSnpIds = [mut.dbSnpId for mut in muts] - - # set the bed name field to a disease, to the mutation or something else - if isMut and mut.origAa=="": - bed[3] = "%s-%sdel" % (mut.begin, mut.end) - else: - bed[3] = "%s%s%s" % (mut.origAa, mut.begin, mut.mutAa) - - disName = ",".join(disCodes) - if len(disName)>30: - disName = disName[:30]+"..." - if len(disCodes)>0 and not "strain" in disName: - bed[3] += " in %s" % disName - - if mut.featType=="sequence variant": - varType = "Naturally occurring sequence variant" - bed[8] = "100,0,0" - elif mut.featType=="mutagenesis site": - varType = "Experimental mutation of amino acids" - bed[8] = "0,0,100" - else: - bed[3] = mut.shortFeatType - if mut.featType in useComment: - bed[3] = mut.comment - - if mut.featType=="signal peptide": - bed[3] = "Signal peptide" - if mut.featType=="lipid moiety-binding region": - bed[3] = "Lipidation" - if mut.featType=="transmembrane region": - bed[3] = "Transmembrane" - - if mut.comment=="Nuclear localization signal": - bed[3] = "Nuclear loc" - if mut.comment=="Involved in receptor recognition and/or post-binding events": - bed[3] = "Recept recog" - if mut.comment=="Fibronectin type-III": - bed[3] = "FibronectIII" - - # more general rules - if mut.comment.startswith("Zinc finger protein "): - bed[3] = mut.comment.replace("Zinc finger protein ", "ZF-") - if mut.comment.startswith("Necessary for the"): - bed[3] = mut.comment.replace("Necessary for the ", "") - if mut.comment.startswith("Interaction with "): - bed[3] = mut.comment.replace("Interaction with ", "Int:") - if mut.comment.startswith("Involved in the"): - bed[3] = mut.comment.replace("Involved in the ", "") - if mut.comment.startswith("Required for "): - bed[3] = mut.comment.replace("Required for ", "") - if mut.comment.startswith("Cleavage; by host "): - bed[3] = "Cleave:"+mut.comment.split()[-1] - - - if bed[3] == "Intrinsically disordered": - bed[3] = "Disordered" - - if len(bed[3])>17: - bed[3] = bed[3][:14]+"..." - - varType = mut.featType - bed[8] = featTypeColors.get(mut.featType, "0,0,0") - - bed.append(varType) - if isMut: - bed.append(",".join(diseases)) - - if isMut: - # create description of mutation - if int(mut.end)-int(mut.begin)==1: - posStr = "position %s" % mut.begin - else: - posStr = "position %s-%s" % (mut.begin, mut.end) - - if mut.origAa=="": - bed.append("%s, removal of amino acids" % (posStr)) - else: - bed.append("%s, %s changed to %s" % \ - (posStr, aaToLong(mut.origAa), aaToLong(mut.mutAa))) - else: - # just position of feature - if int(mut.end)-int(mut.begin)==1: - posStr = "amino acid %s" % str(int(mut.begin)) # show a 1-based coordinate - else: - posStr = "amino acids %s-%s" % (mut.begin, str(int(mut.end)-1)) - posStr += " on protein %s" % mut.acc - bed.append(posStr) - - comments = [com for com in comments if com.strip()!=""] - commentField = ", ".join(comments) - if len(diseases)!=0: - commentField = ",".join(diseases) + "; " + commentField - bed.append(commentField) - if isMut: - varIds = [mut.varId for mut in muts] - if varIds!=[""]: - varIds = ["%s#%s|%s" % (acc, x, x) for x in varIds] # Andrew Nightingale request that we link to UniProt, not SwissProt - bed.append(htmlLink('var', varIds)) - dbSnpIds = [id for id in dbSnpIds if id.strip()!=""] - bed.append(",".join(dbSnpIds)) - bed.append(htmlLink('uniProt', [acc])) - bed.append(htmlLink('pubmed', pmids)) - bed[5] = "." - bedLine = "\t".join(bed)+"\n" - return bedLine - -# ------ MAIN ------ -if __name__ == '__main__': - parser = optparse.OptionParser("""usage: %prog [options] uniprotTabDir taxonId chromSizesFile liftPslFile - lift uniprot annotations to genome. - - uniprotTabDir is the directory with the output from uniprotToTab - - Needs a mapping PSL from uniprot (in faked nucleotide coordinates) to genome positions. - - for genomes with introns, makeLiftOver.sh from - kent/src/hg/utils/uniprotMutations/makeUniProtToHg.sh creates one for hg19 - - for a quicker rough version, create one with BLAT and -q=dnax folowed by - pslProtCnv - - """) - parser.add_option("", "--annot", dest="annot", action="store_true", help="lift other, non-variant annotations") - parser.add_option("-d", "", dest="debug", action="store_true", help="debug output") - parser.add_option("", "--filter", dest="filterAnnots", action="store_true", help="remove certain annotations that are not very useful, like helix,chain,beta, etc") - #parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store", help="uniprot parser directory, default %default", default="/hive/data/inside/pubs/parsedDbs") - (options, args) = parser.parse_args() - - if options.debug: - DEBUG=True - - if len(args)==0: - parser.print_help() - sys.exit(1) - - uniprotDir, taxonId, chromSizesPath, liftFname = args - - mutData = defaultdict(list) - - cmd = "rm -rf work; mkdir work" - run(cmd) - - # create seq sizes for all uniprot sequences of this species - uniprotFa = join(uniprotDir, "uniprot.%s.fa" % taxonId) - uniprotFaGz = uniprotFa+".gz" - if not isfile(uniprotFa): - print "uncompressing uniprot gz fa" - open(uniprotFa, "w").write(gzip.open(uniprotFaGz).read()) - - if not isfile(uniprotFa): - raise Exception("Not found: %s" % uniprotFa) - - print "writing nucleotide lengths of %s to %s" % (uniprotFa, "work/chromSizes") - cmd = "faSize %s -detailed | gawk '{$2=$2*3; print}'> work/chromSizes" % uniprotFa - run(cmd) - - # keep protein lengths for filtering later, we'll remove everything that - # covers the whole protein - seqSizes = {} - for line in open("work/chromSizes"): - seq, size = line.rstrip("\n").split() - seqSizes[seq] = int(size)/3 - - # get uniprot IDs - speciesAccs = set([line.split()[0] for line in open("work/chromSizes")]) - print "got %d accessions" % len(speciesAccs) - - # read data, write bed to file - # this annotates in protein coordinates - print "converting tab to bed" - ofh = open("work/temp.bed", "w") - uniProtMutFname = join(uniprotDir, "uniprot.%s.annots.tab" % taxonId) - print "reading %s" % uniProtMutFname - for mut in iterTsvRows(open(uniProtMutFname)): - if mut.acc not in speciesAccs: - logging.debug("skipping %s, not in work/chromSizes" % mut.acc) - continue - if options.filterAnnots and mut.featType in removeTypes: - logging.debug("skipping %s, explicitely removed annotation" % str(mut)) - continue - mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa - - # remove annotations that cover the whole protein, probably not very useful - #print mut.begin, mut.end, mut.acc, seqSizes[mut.acc] - if int(mut.begin)==1 and int(mut.end)==int(seqSizes[mut.acc])+1: - logging.debug("skipping %s, covers whole protein" % str(mut)) - continue - - mutPos = 3*(int(mut.begin)-1) - mutPosEnd = 3*(int(mut.end)-1) - if mutName not in mutData: - ofh.write("\t".join([mut.acc, str(mutPos), str(mutPosEnd), mutName])+"\n") - mutData[mutName].append(mut) - ofh.close() - - # lift from protein sequence bed to target bed - print "lifting" - cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | LC_COLLATE=C sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname - run(cmd) - - print "adding extra fields" - # read lifted bed and add uniprot annotations to it as additional fields - mutBed = open("spMut.bed", "w") - annotBed = open("spAnnot.bed" , "w") - structBed = open("spStruct.bed" , "w") - - count = 0 - blackCount = 0 - noInfoCount = 0 - doneAnnots = set() # to skip overlaps: a set with (seq, start, end, name) - # name is sequence:start: - - for line in open("work/temp.lifted.bed"): - bed = line.strip().split() - muts = mutData[bed[3]] - varIds = set([mut.varId for mut in muts]) - pmids = set() - for mut in muts: - pmids.update(mut.pmid.split(",")) - - if len(pmidBlackList.intersection(pmids))==len(pmids): - blackCount +=1 - continue - - if mut.featType in ["mutagenesis site", "sequence variant"]: - isMut = True - ofh = mutBed - elif mut.featType in ["strand","helix", "coiled-coil region", "turn"]: - isMut = False - ofh = structBed - else: - isMut = False - ofh = annotBed - - bedLine = makeBedLine(muts, bed, isMut) - # skip annotations that we most likely have covered before - keyFields = tuple(bedLine.split("\t")[:4]) - if keyFields in doneAnnots: - continue - doneAnnots.add(keyFields) - ofh.write(bedLine) - count += 1 - - print "%d features written to %s" % (count, ofh.name) - mutBed.close() - annotBed.close() - structBed.close() - - #print "%d variants not mapped to genome" % len(notMapped) - - asFname = "bed12UniProtMut.as" - cmd = "bedToBigBed -as=%s spMut.bed %s spMut.bb -type=bed12+ -tab" % \ - (asFname, chromSizesPath) - run(cmd) - - asFname = "bed12UniProtAnnot.as" - cmd = "bedToBigBed -as=%s spAnnot.bed %s spAnnot.bb -type=bed12+ -tab" % \ - (asFname, chromSizesPath) - run(cmd) - cmd = "bedToBigBed -as=%s spStruct.bed %s spStruct.bb -type=bed12+ -tab" % \ - (asFname, chromSizesPath) - run(cmd) - - #cmd = "rm -rf work" - #run(cmd) - cmd = "wc -l spMut.bed spAnnot.bed spStruct.bed" - run(cmd) - #print "%d features written to spMut.bed and %s.bed" % (count, outFname, outFname) - print "%d features skipped because of blacklisted PMIDS" % (blackCount) - print "%d features skipped because no info (no disease evidence, no comment)" % (noInfoCount)