9461ce3c12c7faa7da2480f9629a370284c54605 max Tue Nov 29 15:52:54 2016 -0800 fixing uniprot on hg38, refs #18451 diff --git src/utils/uniprotLift src/utils/uniprotLift index d29868c..b644feb 100755 --- src/utils/uniprotLift +++ src/utils/uniprotLift @@ -10,31 +10,33 @@ import sys, gzip, optparse, logging,re # Globals --- DEBUG = False pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"]) # if --filter option provided, remove these altogether removeTypes = set(["helix", "strand", "beta", "turn", "signal"]) # by default, only output identifiers, not html links to them # html links are good for custom tracks # identifers are good for local spMut tracks createLinks = False -urls = {"var": "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?", +urls = { +#"var": "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?", +"var": "http://www.uniprot.org/uniprot/", "uniProt" : "http://www.uniprot.org/uniprot/", "pubmed" : "http://www.ncbi.nlm.nih.gov/pubmed/" } # some feature types should not go into the bed file name field, but rather be replaced # with the curator's comments useComment = set(["domain", "chain","region of interest","topological domain","short sequence motif"]) # ------------- def iterTsvRows(inFile,encoding="utf8", fieldSep="\t", isGzip=False): """ parses tab-sep file with headers as field names yields collection.namedtuples strips "#"-prefix from header line @@ -97,48 +99,99 @@ for aa in seq: longAa = oneToThree.get(aa, aa) if longAa==aa: print "unknown iupac", aa res.append(longAa) return "-".join(res) featTypeColors = { "modified residue" : "200,200,0", "transmembrane region" : "0,0,100", "glycosylation site" : "0,100,100", "disulfide bond" : "100,100,100", "topological domain" : "100,0,0" } -def makeBedLine(mut, bed, isMut): +disShortNames = { + "hepatocellular carcinoma" : "HepatocC", + "a hepatocellular carcinoma" : "HepatocC", + "a hepatocellular carcinoma sample" : "HepatocC", + "hepatocellular carcinomas" : "HepatocC", + "ovarian cancer" : "OvC", + "colon cancer" : "ColC", + "colorectal cancer" : "ColrectC", + "hepatoblastoma" : "HepatoBl", + "a sporadic cancer": "sporadic", + "sporadic cancers": "sporadic", + "non-small cell lung cancer cell lines": "NSCLC-CL", + "a colon cancer cell line":"ColC-CL" +} + +def removeAnds(disNames): + " remove useless 'and' in disease names " + newNames = [] + for name in disNames: + if " and " in name: + splitNames = name.split(" and ") + newNames.extend(splitNames) + del disCodes[disCodes.index(name)] + disCodes.extend(splitNames) + else: + newNames.append(name) + return newNames + +def shortenDisCodes(disCodes): + " some disease names are not shortened yet by UniProt. Do this now. " + #print "XX", disNames, disCodes + #assert(len(disNames)==len(disCodes)) + newCodes = [] + for code in disCodes: + if " and " in code: + splitCodes = code.split(" and ") + newCodes.append(disShortNames.get(splitCodes[0], splitCodes[0])) + newCodes.append(disShortNames.get(splitCodes[1], splitCodes[1])) + else: + newCodes.append(disShortNames.get(code, code)) + return newCodes + #return list(set(newCodes)) + +xxFh = open("uniprotLiftDbg.txt", "w") + +def makeBedLine(muts, bed, isMut): + mut = muts[-1] disStatus = set([mut.disRelated for mut in muts]) comments = [mut.comment for mut in muts if mut.comment!=""] # skip if it's not disease related #if not "disRelated" in disStatus and mut.featType=="sequence variant" \ #and len(comments)==0 and mut.disease=="": #noInfoCount +=1 - diseases = list(set([mut.disease for mut in muts if mut.disease!=""])) - disCodes = list(set([mut.disCode for mut in muts if mut.disCode!=""])) + diseases = list([mut.disease for mut in muts if mut.disease!=""]) + disCodes = list([mut.disCode for mut in muts if mut.disCode!=""]) + #diseases = removeAnds(diseases) + if len(diseases)!=0: + xxFh.write("%s\t%s\n" % ("|".join(diseases), "|".join(disCodes))) + disCodes = shortenDisCodes(disCodes) + acc = muts[0].acc dbSnpIds = [mut.dbSnpId for mut in muts] # set the bed name field to a disease, to the mutation or something else if isMut and mut.origAa=="": - bed[3] = "%s-%s:del" % (mut.begin, mut.end) + bed[3] = "%s-%sdel" % (mut.begin, mut.end) else: - bed[3] = "%s->%s" % (mut.origAa, mut.mutAa) + bed[3] = "%s%s%s" % (mut.origAa, mut.begin, mut.mutAa) disName = ",".join(disCodes) if len(disName)>30: disName = disName[:30]+"..." if len(disCodes)>0 and not "strain" in disName: bed[3] += " in %s" % disName if mut.featType=="sequence variant": varType = "Naturally occurring sequence variant" bed[8] = "100,0,0" elif mut.featType=="mutagenesis site": varType = "Experimental mutation of amino acids" bed[8] = "0,0,100" else: bed[3] = mut.shortFeatType @@ -190,42 +243,45 @@ if isMut: # create description of mutation if int(mut.end)-int(mut.begin)==1: posStr = "position %s" % mut.begin else: posStr = "position %s-%s" % (mut.begin, mut.end) if mut.origAa=="": bed.append("%s, removal of amino acids" % (posStr)) else: bed.append("%s, %s changed to %s" % \ (posStr, aaToLong(mut.origAa), aaToLong(mut.mutAa))) else: # just position of feature if int(mut.end)-int(mut.begin)==1: - posStr = "amino acid %s" % str(int(mut.begin)+1) + posStr = "amino acid %s" % str(int(mut.begin)) # show a 1-based coordinate else: posStr = "amino acids %s-%s" % (mut.begin, str(int(mut.end)-1)) posStr += " on protein %s" % mut.acc bed.append(posStr) comments = [com for com in comments if com.strip()!=""] commentField = ", ".join(comments) if len(diseases)!=0: commentField = ",".join(diseases) + "; " + commentField bed.append(commentField) if isMut: + varIds = [mut.varId for mut in muts] + if varIds!=[""]: + varIds = ["%s#%s|%s" % (acc, x, x) for x in varIds] # Andrew Nightingale request that we link to UniProt, not SwissProt bed.append(htmlLink('var', varIds)) dbSnpIds = [id for id in dbSnpIds if id.strip()!=""] bed.append(",".join(dbSnpIds)) bed.append(htmlLink('uniProt', [acc])) bed.append(htmlLink('pubmed', pmids)) bed[5] = "." bedLine = "\t".join(bed)+"\n" return bedLine # ------ MAIN ------ if __name__ == '__main__': parser = optparse.OptionParser("""usage: %prog [options] uniprotTabDir taxonId chromSizesFile liftPslFile - lift uniprot annotations to genome. uniprotTabDir is the directory with the output from uniprotToTab @@ -312,75 +368,75 @@ # lift from protein sequence bed to target bed print "lifting" cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | LC_COLLATE=C sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname run(cmd) print "adding extra fields" # read lifted bed and add uniprot annotations to it as additional fields mutBed = open("spMut.bed", "w") annotBed = open("spAnnot.bed" , "w") structBed = open("spStruct.bed" , "w") count = 0 blackCount = 0 noInfoCount = 0 doneAnnots = set() # to skip overlaps: a set with (seq, start, end, name) + # name is sequence:start: for line in open("work/temp.lifted.bed"): bed = line.strip().split() muts = mutData[bed[3]] varIds = set([mut.varId for mut in muts]) pmids = set() for mut in muts: pmids.update(mut.pmid.split(",")) if len(pmidBlackList.intersection(pmids))==len(pmids): blackCount +=1 continue if mut.featType in ["mutagenesis site", "sequence variant"]: isMut = True ofh = mutBed - elif mut.featType in ["strand","helix", "coiled-coil region", "turn", "disulfide bond", "glycosylation site"]: + elif mut.featType in ["strand","helix", "coiled-coil region", "turn"]: isMut = False ofh = structBed else: isMut = False ofh = annotBed - bedLine = makeBedLine(mut, bed, isMut) + bedLine = makeBedLine(muts, bed, isMut) # skip annotations that we most likely have covered before - if not isMut: keyFields = tuple(bedLine.split("\t")[:4]) if keyFields in doneAnnots: continue doneAnnots.add(keyFields) ofh.write(bedLine) count += 1 print "%d features written to %s" % (count, ofh.name) mutBed.close() annotBed.close() structBed.close() #print "%d variants not mapped to genome" % len(notMapped) asFname = "bed12UniProtMut.as" cmd = "bedToBigBed -as=%s spMut.bed %s spMut.bb -type=bed12+ -tab" % \ (asFname, chromSizesPath) run(cmd) asFname = "bed12UniProtAnnot.as" cmd = "bedToBigBed -as=%s spAnnot.bed %s spAnnot.bb -type=bed12+ -tab" % \ (asFname, chromSizesPath) run(cmd) cmd = "bedToBigBed -as=%s spStruct.bed %s spStruct.bb -type=bed12+ -tab" % \ (asFname, chromSizesPath) run(cmd) - cmd = "rm -rf work" - run(cmd) + #cmd = "rm -rf work" + #run(cmd) cmd = "wc -l spMut.bed spAnnot.bed spStruct.bed" run(cmd) #print "%d features written to spMut.bed and %s.bed" % (count, outFname, outFname) print "%d features skipped because of blacklisted PMIDS" % (blackCount) print "%d features skipped because no info (no disease evidence, no comment)" % (noInfoCount)