9ab0e5cc221ce3e6e0c19b50940eb0a24ea56243 max Thu Sep 25 01:22:04 2014 -0700 moving small tools into utils directory

diff --git src/utils/uniprotLift src/utils/uniprotLift
new file mode 100755
index 0000000..bf1cfb0
--- /dev/null
+++ src/utils/uniprotLift
@@ -0,0 +1,349 @@
#!/usr/bin/env python2.7
# a script to lift UniProt mutations/annotations created by the publications UniProt parser
# onto a genome assembly and create three bigBed files from them:
# one for structures, one for mutations, one for all other annotations

from os.path import join, isfile
from os import system
from collections import defaultdict, namedtuple

# import the publications libraries
import sys, gzip, optparse, logging, re
sys.path.append("/cluster/home/max/projects/pubs/tools/lib")
import maxCommon

# Globals ---
DEBUG = False

# features supported only by these PMIDs are skipped (counted as "blacklisted" below)
pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"])

# if the --filter option is provided, remove these feature types altogether
removeTypes = set(["helix", "strand", "beta", "turn", "signal"])

# by default, only output identifiers, not html links to them
# html links are good for custom tracks
# identifiers are good for local spMut tracks
createLinks = False

urls = {"var"     : "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?",
        "uniProt" : "http://www.uniprot.org/uniprot/",
        "pubmed"  : "http://www.ncbi.nlm.nih.gov/pubmed/"
        }

# some feature types should not go into the bed name field, but rather be replaced
# with the curator's comments
useComment = set(["domain", "chain", "region of interest", "topological domain", "short sequence motif"])

# -------------

def iterTsvRows(inFile, encoding="utf8", fieldSep="\t", isGzip=False):
    """
    parses a tab-separated file whose header line supplies the field names,
    yields collections.namedtuple records,
    strips the "#"-prefix from the header line
    """

    fh = inFile
    line1 = fh.readline()
    line1 = line1.strip("\n").strip("#")
    headers = line1.split(fieldSep)
    headers = [re.sub("[^a-zA-Z0-9_]", "_", h) for h in headers]

    Record = namedtuple('tsvRec', headers)
    for line in fh:
        line = line.rstrip("\n")
        fields = line.split(fieldSep)
        if encoding!=None:
            fields = [f.decode(encoding) for f in fields]
        try:
            rec = Record(*fields)
        except Exception as msg:
            logging.error("Exception occurred while parsing line, %s" % msg)
            logging.error("Filename %s" % fh.name)
            logging.error("Line was: %s" % line)
            logging.error("Does number of fields match headers?")
            logging.error("Headers are: %s" % headers)
            raise Exception("wrong field count in line %s" % line)
        # convert fields to correct data type
        yield rec

def htmlLink(urlType, accs):
    " return accessions either as plain comma-separated IDs or as HTML links, depending on createLinks "
    if createLinks:
        strList = []
        for acc in accs:
            strList.append('<a href="%s%s">%s</a>' % (urls[urlType], acc, acc))
        return ", ".join(strList)
    else:
        return ",".join(accs)

def run(cmd):
    " run a shell command and abort on failure "
    if DEBUG:
        print "running", cmd
    ret = system(cmd)
    if ret!=0:
        print "Could not run %s" % cmd
        sys.exit(1)

threeToOne = \
    {'Cys': 'C', 'Asp': 'D', 'Ser': 'S', 'Gln': 'Q', 'Lys': 'K',
     'Ile': 'I', 'Pro': 'P', 'Thr': 'T', 'Phe': 'F', 'Asn': 'N',
     'Gly': 'G', 'His': 'H', 'Leu': 'L', 'Arg': 'R', 'Trp': 'W',
     'Ala': 'A', 'Val': 'V', 'Glu': 'E', 'Tyr': 'Y', 'Met': 'M',
     'Sec': 'U' # selenocysteine, a very rare amino acid encoded by a recoded stop codon
    }

oneToThree = dict([[v,k] for k,v in threeToOne.items()])

def aaToLong(seq):
    " convert a one-letter amino acid string to dash-separated three-letter codes "
    res = []
    for aa in seq:
        longAa = oneToThree.get(aa, aa)
        if longAa==aa:
            print "unknown iupac", aa
        res.append(longAa)
    return "-".join(res)

# RGB colors for selected feature types in the annotation/structure tracks
featTypeColors = {
    "modified residue"     : "200,200,0",
    "transmembrane region" : "0,0,100",
    "glycosylation site"   : "0,100,100",
    "disulfide bond"       : "100,100,100",
    "topological domain"   : "100,0,0"
}

def makeBedLine(mut, bed, isMut):
    """
    rewrite the name and color fields of a lifted BED12 line and append the extra
    UniProt fields; relies on the module-level muts, varIds and pmids variables
    that the main loop sets up for the current feature
    """
    # disStatus is only used by the (currently disabled) disease filter below
    disStatus = set([mut.disRelated for mut in muts])
    comments = [mut.comment for mut in muts if mut.comment!=""]
    # skip if it's not disease related
    #if not "disRelated" in disStatus and mut.featType=="sequence variant" \
    #and len(comments)==0 and mut.disease=="":
        #noInfoCount +=1

    diseases = list(set([mut.disease for mut in muts if mut.disease!=""]))
    disCodes = list(set([mut.disCode for mut in muts if mut.disCode!=""]))
    acc = muts[0].acc
    dbSnpIds = [mut.dbSnpId for mut in muts]

    # set the bed name field to a disease, to the mutation or something else
    if isMut and mut.origAa=="":
        bed[3] = "%s-%s:del" % (mut.begin, mut.end)
    else:
        bed[3] = "%s->%s" % (mut.origAa, mut.mutAa)

    disName = ",".join(disCodes)
    if len(disName)>30:
        disName = disName[:30]+"..."
    if len(disCodes)>0 and not "strain" in disName:
        bed[3] += " in %s" % disName

    if mut.featType=="sequence variant":
        varType = "Naturally occurring sequence variant"
        bed[8] = "100,0,0"
    elif mut.featType=="mutagenesis site":
        varType = "Experimental mutation of amino acids"
        bed[8] = "0,0,100"
    else:
        bed[3] = mut.shortFeatType
        if mut.featType=="signal peptide":
            bed[3] = "Signal peptide"
        if mut.featType=="lipid moiety-binding region":
            bed[3] = "Lipidation"
        if mut.featType=="transmembrane region":
            bed[3] = "Transmembrane"
        if mut.featType in useComment:
            bed[3] = mut.comment
        if mut.comment=="Involved in receptor recognition and/or post-binding events":
            bed[3] = "Recept recog"
        if mut.comment.startswith("Cleavage; by host "):
            bed[3] = "Cleave:"+mut.comment.split()[-1]

        if bed[3] == "Intrinsically disordered":
            bed[3] = "Disordered"

        if len(bed[3])>17:
            bed[3] = bed[3][:14]+"..."

        varType = mut.featType
        bed[8] = featTypeColors.get(mut.featType, "0,0,0")

    bed.append(varType)
    if isMut:
        bed.append(",".join(diseases))

    if isMut:
        # create description of mutation
        if int(mut.end)-int(mut.begin)==1:
            posStr = "position %s" % mut.begin
        else:
            posStr = "position %s-%s" % (mut.begin, mut.end)

        if mut.origAa=="":
            bed.append("%s, removal of amino acids" % (posStr))
        else:
            bed.append("%s, %s changed to %s" % \
                (posStr, aaToLong(mut.origAa), aaToLong(mut.mutAa)))
    else:
        # just the position of the feature
        if int(mut.end)-int(mut.begin)==1:
            posStr = "amino acid %s" % str(int(mut.begin)+1)
        else:
            posStr = "amino acids %s-%s" % (mut.begin, str(int(mut.end)-1))
        posStr += " on protein %s" % mut.acc
        bed.append(posStr)

    comments = [com for com in comments if com.strip()!=""]
    commentField = ", ".join(comments)
    if len(diseases)!=0:
        commentField = ",".join(diseases) + "; " + commentField
    bed.append(commentField)

    if isMut:
        bed.append(htmlLink('var', varIds))
        dbSnpIds = [id for id in dbSnpIds if id.strip()!=""]
        bed.append(",".join(dbSnpIds))
    bed.append(htmlLink('uniProt', [acc]))
    bed.append(htmlLink('pubmed', pmids))
    bed[5] = "." # clear the strand field
    bedLine = "\t".join(bed)+"\n"
    return bedLine

# ------ MAIN ------
if __name__ == '__main__':
    parser = optparse.OptionParser("""usage: %prog [options] taxonId dbName liftPslFile - lift UniProt annotations to a genome.

    Needs a mapping PSL from UniProt nucleotide coordinates to genome positions.

    - for genomes with introns, makeLiftOver.sh from
      src/hg/utils/uniprotMutations/makeUniProtToHg.sh creates one for hg19
    - for a quicker rough version, create one with BLAT and -q=dnax followed by
      pslProtCnv

    Input UniProt files have to be created with the pubParseDb uniprot parser

    """)
    parser.add_option("", "--annot", dest="annot", action="store_true", help="lift other, non-variant annotations")
    parser.add_option("-d", "", dest="debug", action="store_true", help="debug output")
    parser.add_option("", "--filter", dest="filterAnnots", action="store_true", help="remove annotations that are rarely useful in a browser track, like helix, strand, beta, turn or signal")
    parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store", help="uniprot parser directory, default %default", default="/hive/data/inside/pubs/parsedDbs")
    (options, args) = parser.parse_args()

    if options.debug:
        DEBUG=True

    if len(args)!=3:
        parser.error("expecting exactly three arguments: taxonId dbName liftPslFile")
    taxonId, db, liftFname = args

    mutData = defaultdict(list)

    cmd = "rm -rf work; mkdir work"
    run(cmd)

    # create sequence sizes for all UniProt sequences of this species
    uniprotFa = join(options.uniprotDir, "uniprot.%s.fa" % taxonId)
    uniprotFaGz = uniprotFa+".gz"
    if not isfile(uniprotFa):
        print "uncompressing uniprot gz fa"
        open(uniprotFa, "w").write(gzip.open(uniprotFaGz).read())

    if not isfile(uniprotFa):
        raise Exception("Not found: %s" % uniprotFa)

    print "writing nucleotide lengths of %s to %s" % (uniprotFa, "work/chromSizes")
    # protein length * 3 = length of the virtual coding sequence
    cmd = "faSize %s -detailed | gawk '{$2=$2*3; print}'> work/chromSizes" % uniprotFa
    run(cmd)
    # get uniprot IDs
    speciesAccs = set([line.split()[0] for line in open("work/chromSizes")])
    print "got %d accessions" % len(speciesAccs)

    # read data, write bed to file
    # this annotates mutations on the protein sequence
    print "converting tab to bed"
    ofh = open("work/temp.bed", "w")
    uniProtMutFname = join(options.uniprotDir, "uniprot.%s.mut.tab" % taxonId)
    print "reading %s" % uniProtMutFname
    for mut in iterTsvRows(open(uniProtMutFname)):
        if mut.acc not in speciesAccs:
            continue
        if options.filterAnnots and mut.featType in removeTypes:
            continue
        # all records that share a name are merged into one BED feature later
        mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa
        mutPos = 3*(int(mut.begin)-1)
        mutPosEnd = 3*(int(mut.end)-1)
        if mutName not in mutData:
            ofh.write("\t".join([mut.acc, str(mutPos), str(mutPosEnd), mutName])+"\n")
        mutData[mutName].append(mut)
    ofh.close()
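
    # Editorial illustration, not part of the original commit: the loop above treats each
    # protein as a virtual coding sequence, so work/chromSizes stores 3*proteinLength and an
    # annotation starting at amino acid N begins at base 3*(N-1), 0-based. For example, a
    # variant with begin=10, end=11 becomes the range 27-30, exactly the codon of residue 10;
    # the pslMap step below then projects this range onto the genome.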

    # lift from protein-coordinate bed to genome bed:
    # protein bed -> psl -> map through the lift psl -> genome bed, then clean up overlapping blocks
    print "lifting"
    cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname
    run(cmd)

    print "adding extra fields"
    # read the lifted bed and add the UniProt annotations to it as additional fields
    mutBed = open("spMut.bed", "w")
    annotBed = open("spAnnot.bed", "w")
    structBed = open("spStruct.bed", "w")

    count = 0
    blackCount = 0
    noInfoCount = 0 # only incremented if the disease filter in makeBedLine is re-enabled
    doneAnnots = set() # to skip overlaps: a set with (seq, start, end, name)

    for line in open("work/temp.lifted.bed"):
        bed = line.strip().split()
        muts = mutData[bed[3]]
        varIds = set([mut.varId for mut in muts])
        pmids = set()
        for mut in muts:
            pmids.update(mut.pmid.split(","))

        # skip features whose only support comes from blacklisted PMIDs
        if len(pmidBlackList.intersection(pmids))==len(pmids):
            blackCount +=1
            continue

        # all records of one feature share the same featType (it is part of the name),
        # so the last mut from the loop above is representative
        if mut.featType in ["mutagenesis site", "sequence variant"]:
            isMut = True
            ofh = mutBed
        elif mut.featType in ["strand", "helix", "coiled-coil region", "turn", "disulfide bond", "glycosylation site"]:
            isMut = False
            ofh = structBed
        else:
            isMut = False
            ofh = annotBed

        bedLine = makeBedLine(mut, bed, isMut)
        # skip annotations that we most likely have covered before
        if not isMut:
            keyFields = tuple(bedLine.split("\t")[:4])
            if keyFields in doneAnnots:
                continue
            doneAnnots.add(keyFields)
        ofh.write(bedLine)
        count += 1

    print "%d features written to spMut.bed, spAnnot.bed and spStruct.bed" % count
    mutBed.close()
    annotBed.close()
    structBed.close()

    #print "%d variants not mapped to genome" % len(notMapped)

    asFname = "bed12UniProtMut.as"
    cmd = "bedToBigBed -as=%s spMut.bed /hive/data/genomes/%s/chrom.sizes spMut.bb -type=bed12+ -tab" % \
        (asFname, db)
    run(cmd)

    asFname = "bed12UniProtAnnot.as"
    cmd = "bedToBigBed -as=%s spAnnot.bed /hive/data/genomes/%s/chrom.sizes spAnnot.bb -type=bed12+ -tab" % \
        (asFname, db)
    run(cmd)
    # the structure track re-uses the annotation .as schema
    cmd = "bedToBigBed -as=%s spStruct.bed /hive/data/genomes/%s/chrom.sizes spStruct.bb -type=bed12+ -tab" % \
        (asFname, db)
    run(cmd)

    cmd = "rm -rf work"
    run(cmd)
    cmd = "wc -l spMut.bed spAnnot.bed spStruct.bed"
    run(cmd)
    #print "%d features written to spMut.bed and %s.bed" % (count, outFname, outFname)
    print "%d features skipped because of blacklisted PMIDs" % (blackCount)
    print "%d features skipped because no info (no disease evidence, no comment)" % (noInfoCount)
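
A hypothetical invocation, pieced together from the usage message and option defaults above
(the argument order is taxonId dbName liftPslFile; 9606 is the NCBI taxon ID for human, hg19
the target assembly, and the PSL file name is only illustrative, standing in for the output
of makeUniProtToHg.sh):

    uniprotLift --filter -u /hive/data/inside/pubs/parsedDbs 9606 hg19 uniProtToHg19.over.psl

Note that bed12UniProtMut.as and bed12UniProtAnnot.as must be present in the working directory
for the bedToBigBed calls, and that spMut.bb, spAnnot.bb and spStruct.bb are written to the
current directory; running bigBedInfo on one of them is a quick sanity check of the result.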