3887c3b11338a8ed7de6f5805d09019c25979023 max Fri Jul 15 15:14:53 2016 -0700 removing dependencies for uniprot pipeline, moving it into kent repo, for #17716. Also preparing for hg38 uniprot track, #17717

diff --git src/utils/uniprotLift src/utils/uniprotLift
index 61b7a80..fe49425 100755
--- src/utils/uniprotLift
+++ src/utils/uniprotLift
@@ -1,27 +1,25 @@
 #!/usr/bin/env python2.7

 # a script to lift uniprot mutations/annotations created by the publications uniprot parser
 # to a DB and create two bigbed files for them

 from os.path import join, isfile
 from os import system
 from collections import defaultdict, namedtuple

 # import the publications libraries
 import sys, gzip, optparse, logging,re
-sys.path.append("/cluster/home/max/projects/pubs/tools/lib")
-import maxCommon

 # Globals ---
 DEBUG = False
 pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"])

 # if --filter option provided, remove these altogether
 removeTypes = set(["helix", "strand", "beta", "turn", "signal"])

 # by default, only output identifiers, not html links to them
 # html links are good for custom tracks
 # identifers are good for local spMut tracks
 createLinks = False

 urls = {"var": "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?",
@@ -215,87 +213,91 @@
     if len(diseases)!=0:
         commentField = ",".join(diseases) + "; " + commentField
     bed.append(commentField)

     if isMut:
         bed.append(htmlLink('var', varIds))

     dbSnpIds = [id for id in dbSnpIds if id.strip()!=""]
     bed.append(",".join(dbSnpIds))

     bed.append(htmlLink('uniProt', [acc]))
     bed.append(htmlLink('pubmed', pmids))

     bed[5] = "."
     bedLine = "\t".join(bed)+"\n"
     return bedLine

 # ------ MAIN ------
 if __name__ == '__main__':
-    parser = optparse.OptionParser("""usage: %prog [options] taxonId dbName liftPslFile - lift uniprot annotations to genome.
+    parser = optparse.OptionParser("""usage: %prog [options] uniprotTabDir taxonId chromSizesFile liftPslFile - lift uniprot annotations to genome.

-    Needs a mapping PSL from uniprot nucleotide coordinates to genome positions.
+    uniprotTabDir is the directory with the output from uniprotToTab
+
+    Needs a mapping PSL from uniprot (in faked nucleotide coordinates) to genome positions.
     - for genomes with introns, makeLiftOver.sh from
-      src/hg/utils/uniprotMutations/makeUniProtToHg.sh creates one for hg19
+      kent/src/hg/utils/uniprotMutations/makeUniProtToHg.sh creates one for hg19
     - for a quicker rough version, create one with BLAT and -q=dnax folowed by pslProtCnv
-    Input uniprot files have to be created with the pubParseDb uniprot parser
     """)

     parser.add_option("", "--annot", dest="annot", action="store_true", help="lift other, non-variant annotations")
     parser.add_option("-d", "", dest="debug", action="store_true", help="debug output")
     parser.add_option("", "--filter", dest="filterAnnots", action="store_true", help="remove certain annotations that are not very useful, like helix,chain,beta, etc")
-    parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store", help="uniprot parser directory, default %default", default="/hive/data/inside/pubs/parsedDbs")
+    #parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store", help="uniprot parser directory, default %default", default="/hive/data/inside/pubs/parsedDbs")
     (options, args) = parser.parse_args()

     if options.debug:
         DEBUG=True

-    taxonId, db, liftFname = args
+    if len(args)==0:
+        parser.print_help()
+        sys.exit(1)
+
+    uniprotDir, taxonId, chromSizesPath, liftFname = args

     mutData = defaultdict(list)

     cmd = "rm -rf work; mkdir work"
     run(cmd)

     # create seq sizes for all uniprot sequences of this species
-    uniprotFa = join(options.uniprotDir, "uniprot.%s.fa" % taxonId)
+    uniprotFa = join(uniprotDir, "uniprot.%s.fa" % taxonId)
     uniprotFaGz = uniprotFa+".gz"
     if not isfile(uniprotFa):
         print "uncompressing uniprot gz fa"
         open(uniprotFa, "w").write(gzip.open(uniprotFaGz).read())

     if not isfile(uniprotFa):
         raise Exception("Not found: %s" % uniprotFa)

     print "writing nucleotide lengths of %s to %s" % (uniprotFa, "work/chromSizes")
     cmd = "faSize %s -detailed | gawk '{$2=$2*3; print}'> work/chromSizes" % uniprotFa
     run(cmd)

     # keep protein lengths for filtering later, we'll remove everything that
     # covers the whole protein
     seqSizes = {}
     for line in open("work/chromSizes"):
         seq, size = line.rstrip("\n").split()
         seqSizes[seq] = int(size)/3

     # get uniprot IDs
     speciesAccs = set([line.split()[0] for line in open("work/chromSizes")])
     print "got %d accessions" % len(speciesAccs)

     # read data, write bed to file
     # this annotates in protein coordinates
     print "converting tab to bed"
     ofh = open("work/temp.bed", "w")
-    uniProtMutFname = join(options.uniprotDir, "uniprot.%s.mut.tab" % taxonId)
+    uniProtMutFname = join(uniprotDir, "uniprot.%s.annots.tab" % taxonId)
     print "reading %s" % uniProtMutFname
     for mut in iterTsvRows(open(uniProtMutFname)):
         if mut.acc not in speciesAccs:
             continue
         if options.filterAnnots and mut.featType in removeTypes:
             continue
         mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa
         # remove annotations that cover the whole protein, probably not very useful
         #print mut.begin, mut.end, mut.acc, seqSizes[mut.acc]
         if int(mut.begin)==1 and int(mut.end)==int(seqSizes[mut.acc])+1:
             continue
         mutPos = 3*(int(mut.begin)-1)
         mutPosEnd = 3*(int(mut.end)-1)
@@ -348,34 +350,34 @@
         keyFields = tuple(bedLine.split("\t")[:4])
         if keyFields in doneAnnots:
             continue
         doneAnnots.add(keyFields)
         ofh.write(bedLine)
         count += 1

     print "%d features written to %s" % (count, ofh.name)
     mutBed.close()
     annotBed.close()
     structBed.close()
     #print "%d variants not mapped to genome" % len(notMapped)

     asFname = "bed12UniProtMut.as"
-    cmd = "bedToBigBed -as=%s spMut.bed /hive/data/genomes/%s/chrom.sizes spMut.bb -type=bed12+ -tab" % \
-        (asFname, db)
+    cmd = "bedToBigBed -as=%s spMut.bed %s spMut.bb -type=bed12+ -tab" % \
+        (asFname, chromSizesPath)
     run(cmd)

     asFname = "bed12UniProtAnnot.as"
-    cmd = "bedToBigBed -as=%s spAnnot.bed /hive/data/genomes/%s/chrom.sizes spAnnot.bb -type=bed12+ -tab" % \
-        (asFname, db)
+    cmd = "bedToBigBed -as=%s spAnnot.bed %s spAnnot.bb -type=bed12+ -tab" % \
+        (asFname, chromSizesPath)
     run(cmd)

-    cmd = "bedToBigBed -as=%s spStruct.bed /hive/data/genomes/%s/chrom.sizes spStruct.bb -type=bed12+ -tab" % \
-        (asFname, db)
+    cmd = "bedToBigBed -as=%s spStruct.bed %s spStruct.bb -type=bed12+ -tab" % \
+        (asFname, chromSizesPath)
     run(cmd)

     cmd = "rm -rf work"
     run(cmd)

     cmd = "wc -l spMut.bed spAnnot.bed spStruct.bed"
     run(cmd)

     #print "%d features written to spMut.bed and %s.bed" % (count, outFname, outFname)
     print "%d features skipped because of blacklisted PMIDS" % (blackCount)
     print "%d features skipped because no info (no disease evidence, no comment)" % (noInfoCount)
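
With this commit the script no longer reads the hard-coded /hive chrom.sizes path or the pubs library; all inputs are positional, per the new usage string (uniprotTabDir is the output directory of uniprotToTab, followed by the taxon ID, a chrom.sizes file, and the uniprot-to-genome lift PSL). A hypothetical invocation for a human/hg38 build, where the directory and file names are placeholders for illustration only and not part of this commit, would look roughly like:

    uniprotLift ./uniprotTab 9606 hg38.chrom.sizes uniProtVsHg38.psl   # paths are illustrative placeholders; 9606 = human taxon ID

On success the script runs bedToBigBed three times and leaves spMut.bb, spAnnot.bb and spStruct.bb in the current directory.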