src/utils/uniprotLift 0e029fa9d8deb0b20ba6cd85a43844698aa631e0

0e029fa9d8deb0b20ba6cd85a43844698aa631e0
max
  Mon Jul 24 01:08:01 2017 -0400
major changes to the uniprot track: splitting by annotation into more
subtracks, reorg of the code so it can handle that many files, getting
ready for ottomization and multi-org running. refs #19351

diff --git src/utils/uniprotLift src/utils/uniprotLift
deleted file mode 100755
index b644feb..0000000
--- src/utils/uniprotLift
+++ /dev/null
@@ -1,442 +0,0 @@
-#!/usr/bin/env python2.7
-# a script to lift uniprot mutations/annotations created by the publications uniprot parser
-# to a DB and create two bigbed files for them
-
-from os.path import join, isfile
-from os import system
-from collections import defaultdict, namedtuple
-
-# import the publications libraries
-import sys, gzip, optparse, logging,re
-
-# Globals  ---
-DEBUG = False
-
-pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"])
-
-# if --filter option provided, remove these altogether
-removeTypes = set(["helix", "strand", "beta", "turn", "signal"])
-
-# by default, only output identifiers, not html links to them
-# html links are good for custom tracks
-# identifers are good for local spMut tracks
-createLinks = False
-
-urls = {
-#"var": "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?",
-"var": "http://www.uniprot.org/uniprot/",
-"uniProt" : "http://www.uniprot.org/uniprot/",
-"pubmed" : "http://www.ncbi.nlm.nih.gov/pubmed/"
-}
-
-# some feature types should not go into the bed file name field, but rather be replaced 
-# with the curator's comments
-useComment = set(["domain", "chain","region of interest","topological domain","short sequence motif"])
-
-# -------------
-
-def iterTsvRows(inFile,encoding="utf8", fieldSep="\t", isGzip=False):
-    """ 
-        parses tab-sep file with headers as field names 
-        yields collection.namedtuples
-        strips "#"-prefix from header line
-    """
-
-    fh = inFile
-    line1 = fh.readline()
-    line1 = line1.strip("\n").strip("#")
-    headers = line1.split(fieldSep)
-    headers = [re.sub("[^a-zA-Z0-9_]","_", h) for h in headers]
-
-    Record = namedtuple('tsvRec', headers)
-    for line in fh:
-        line = line.rstrip("\n")
-        fields = line.split(fieldSep)
-        if encoding!=None:
-            fields = [f.decode(encoding) for f in fields]
-        try:
-            rec = Record(*fields)
-        except Exception, msg:
-            logging.error("Exception occured while parsing line, %s" % msg)
-            logging.error("Filename %s" % fh.name)
-            logging.error("Line was: %s" % line)
-            logging.error("Does number of fields match headers?")
-            logging.error("Headers are: %s" % headers)
-            raise Exception("wrong field count in line %s" % line)
-        # convert fields to correct data type
-        yield rec
-
-def htmlLink(urlType, accs):
-    if createLinks:
-        strList = []
-        for acc in accs:
-            strList.append('<a href="%s%s">%s</a>' % (urls[urlType], acc, acc))
-        return ", ".join(strList)
-    else:
-        return ",".join(accs)
-
-def run(cmd):
-    if DEBUG:
-        print "running", cmd
-    ret = system(cmd)
-    if ret!=0:
-        print "Could not run %s" % cmd
-        sys.exit(1)
-
-threeToOne = \
-    {'Cys': 'C', 'Asp': 'D', 'Ser': 'S', 'Gln': 'Q', 'Lys': 'K',
-     'Ile': 'I', 'Pro': 'P', 'Thr': 'T', 'Phe': 'F', 'Asn': 'N',
-     'Gly': 'G', 'His': 'H', 'Leu': 'L', 'Arg': 'R', 'Trp': 'W',
-     'Ala': 'A', 'Val':'V',  'Glu': 'E', 'Tyr': 'Y', 'Met': 'M',
-     'Sec': 'U' # very very rare amino acid (=stop codon)
-     }
-
-oneToThree = dict([[v,k] for k,v in threeToOne.items()])
-
-def aaToLong(seq):
-    " convert amino acid to three letter code "
-    res = []
-    for aa in seq:
-        longAa = oneToThree.get(aa, aa)
-        if longAa==aa:
-            print "unknown iupac", aa
-        res.append(longAa)
-    return "-".join(res)
-
-featTypeColors = {
-"modified residue" : "200,200,0",
-"transmembrane region" : "0,0,100",
-"glycosylation site" : "0,100,100",
-"disulfide bond" : "100,100,100",
-"topological domain" : "100,0,0"
-}
-
-disShortNames = {
-    "hepatocellular carcinoma" : "HepatocC",
-    "a hepatocellular carcinoma" : "HepatocC",
-    "a hepatocellular carcinoma sample" : "HepatocC",
-    "hepatocellular carcinomas" : "HepatocC",
-    "ovarian cancer" : "OvC",
-    "colon cancer" : "ColC",
-    "colorectal cancer" : "ColrectC",
-    "hepatoblastoma" : "HepatoBl",
-    "a sporadic cancer": "sporadic",
-    "sporadic cancers": "sporadic",
-    "non-small cell lung cancer cell lines": "NSCLC-CL",
-    "a colon cancer cell line":"ColC-CL"
-}
-
-def removeAnds(disNames):
-    " remove useless 'and' in disease names "
-    newNames = []
-    for name in disNames:
-        if " and " in name:
-            splitNames = name.split(" and ")
-            newNames.extend(splitNames)
-            del disCodes[disCodes.index(name)]
-            disCodes.extend(splitNames)
-        else:
-            newNames.append(name)
-    return newNames
-
-def shortenDisCodes(disCodes):
-    " some disease names are not shortened yet by UniProt. Do this now. "
-    #print "XX", disNames, disCodes
-    #assert(len(disNames)==len(disCodes))
-    newCodes = []
-    for code in disCodes:
-        if " and " in code:
-            splitCodes = code.split(" and ")
-            newCodes.append(disShortNames.get(splitCodes[0], splitCodes[0]))
-            newCodes.append(disShortNames.get(splitCodes[1], splitCodes[1]))
-        else:
-            newCodes.append(disShortNames.get(code, code))
-    return newCodes
-    #return list(set(newCodes))
-
-xxFh = open("uniprotLiftDbg.txt", "w")
-
-def makeBedLine(muts, bed, isMut):
-    mut = muts[-1]
-    disStatus = set([mut.disRelated for mut in muts])
-    comments = [mut.comment for mut in muts if mut.comment!=""]
-    # skip if it's not disease related 
-    #if not "disRelated" in disStatus and mut.featType=="sequence variant" \
-        #and len(comments)==0 and mut.disease=="":
-        #noInfoCount +=1
-        
-    diseases = list([mut.disease for mut in muts if mut.disease!=""])
-    disCodes = list([mut.disCode for mut in muts if mut.disCode!=""])
-    #diseases = removeAnds(diseases)
-    if len(diseases)!=0:
-        xxFh.write("%s\t%s\n" % ("|".join(diseases), "|".join(disCodes)))
-    disCodes = shortenDisCodes(disCodes)
-
-    acc = muts[0].acc
-    dbSnpIds = [mut.dbSnpId for mut in muts]
-
-    # set the bed name field to a disease, to the mutation or something else
-    if isMut and mut.origAa=="":
-        bed[3] = "%s-%sdel" % (mut.begin, mut.end)
-    else:
-        bed[3] = "%s%s%s" % (mut.origAa, mut.begin, mut.mutAa)
-
-    disName = ",".join(disCodes)
-    if len(disName)>30:
-        disName = disName[:30]+"..."
-    if len(disCodes)>0 and not "strain" in disName:
-        bed[3] += " in %s" % disName
-
-    if mut.featType=="sequence variant":
-        varType = "Naturally occurring sequence variant"
-        bed[8] = "100,0,0"
-    elif mut.featType=="mutagenesis site":
-        varType = "Experimental mutation of amino acids"
-        bed[8] = "0,0,100"
-    else:
-        bed[3] = mut.shortFeatType
-        if mut.featType in useComment:
-            bed[3] = mut.comment
-
-        if mut.featType=="signal peptide":
-            bed[3] = "Signal peptide"
-        if mut.featType=="lipid moiety-binding region":
-            bed[3] = "Lipidation"
-        if mut.featType=="transmembrane region":
-            bed[3] = "Transmembrane"
-            
-        if mut.comment=="Nuclear localization signal":
-            bed[3] = "Nuclear loc"
-        if mut.comment=="Involved in receptor recognition and/or post-binding events":
-            bed[3] = "Recept recog"
-        if mut.comment=="Fibronectin type-III":
-            bed[3] = "FibronectIII"
-
-        # more general rules
-        if mut.comment.startswith("Zinc finger protein "):
-            bed[3] = mut.comment.replace("Zinc finger protein ", "ZF-")
-        if mut.comment.startswith("Necessary for the"):
-            bed[3] = mut.comment.replace("Necessary for the ", "")
-        if mut.comment.startswith("Interaction with "):
-            bed[3] = mut.comment.replace("Interaction with ", "Int:")
-        if mut.comment.startswith("Involved in the"):
-            bed[3] = mut.comment.replace("Involved in the ", "")
-        if mut.comment.startswith("Required for "):
-            bed[3] = mut.comment.replace("Required for ", "")
-        if mut.comment.startswith("Cleavage; by host "):
-            bed[3] = "Cleave:"+mut.comment.split()[-1]
-
-
-        if bed[3] == "Intrinsically disordered":
-            bed[3] = "Disordered"
-        
-        if len(bed[3])>17:
-            bed[3] = bed[3][:14]+"..."
-
-        varType = mut.featType
-        bed[8] = featTypeColors.get(mut.featType, "0,0,0")
-
-    bed.append(varType)
-    if isMut:
-        bed.append(",".join(diseases))
-
-    if isMut:
-        # create description of mutation
-        if int(mut.end)-int(mut.begin)==1:
-            posStr = "position %s" % mut.begin
-        else:
-            posStr = "position %s-%s" % (mut.begin, mut.end)
-
-        if mut.origAa=="":
-            bed.append("%s, removal of amino acids" % (posStr))
-        else:
-            bed.append("%s, %s changed to %s" % \
-                (posStr, aaToLong(mut.origAa), aaToLong(mut.mutAa)))
-    else:
-        #  just position of feature
-        if int(mut.end)-int(mut.begin)==1:
-            posStr = "amino acid %s" % str(int(mut.begin)) # show a 1-based coordinate
-        else:
-            posStr = "amino acids %s-%s" % (mut.begin, str(int(mut.end)-1))
-        posStr += " on protein %s" % mut.acc
-        bed.append(posStr)
-
-    comments = [com for com in comments if com.strip()!=""]
-    commentField = ", ".join(comments)
-    if len(diseases)!=0:
-        commentField = ",".join(diseases) + "; " + commentField
-    bed.append(commentField)
-    if isMut:
-        varIds = [mut.varId for mut in muts]
-        if varIds!=[""]:
-            varIds = ["%s#%s|%s" % (acc, x, x) for x in varIds] # Andrew Nightingale request that we link to UniProt, not SwissProt
-        bed.append(htmlLink('var', varIds))
-        dbSnpIds = [id for id in dbSnpIds if id.strip()!=""]
-        bed.append(",".join(dbSnpIds))
-    bed.append(htmlLink('uniProt', [acc]))
-    bed.append(htmlLink('pubmed', pmids))
-    bed[5] = "."
-    bedLine = "\t".join(bed)+"\n"
-    return bedLine
-
-# ------ MAIN ------
-if __name__ == '__main__':
-    parser = optparse.OptionParser("""usage: %prog [options] uniprotTabDir taxonId chromSizesFile liftPslFile - lift uniprot annotations to genome. 
-    
-    uniprotTabDir is the directory with the output from uniprotToTab
-    
-    Needs a mapping PSL from uniprot (in faked nucleotide coordinates) to genome positions.
-    - for genomes with introns, makeLiftOver.sh from
-      kent/src/hg/utils/uniprotMutations/makeUniProtToHg.sh creates one for hg19
-    - for a quicker rough version, create one with BLAT and -q=dnax folowed by
-      pslProtCnv 
-
-    """)
-    parser.add_option("", "--annot", dest="annot", action="store_true", help="lift other, non-variant annotations")
-    parser.add_option("-d", "", dest="debug", action="store_true", help="debug output")
-    parser.add_option("", "--filter", dest="filterAnnots", action="store_true", help="remove certain annotations that are not very useful, like helix,chain,beta, etc")
-    #parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store", help="uniprot parser directory, default %default", default="/hive/data/inside/pubs/parsedDbs")
-    (options, args) = parser.parse_args()
-
-    if options.debug:
-        DEBUG=True
-
-    if len(args)==0:
-        parser.print_help()
-        sys.exit(1)
-
-    uniprotDir, taxonId, chromSizesPath, liftFname = args
-
-    mutData = defaultdict(list)
-
-    cmd = "rm -rf work; mkdir work"
-    run(cmd)
-
-    # create seq sizes for all uniprot sequences of this species
-    uniprotFa = join(uniprotDir, "uniprot.%s.fa" % taxonId)
-    uniprotFaGz = uniprotFa+".gz"
-    if not isfile(uniprotFa):
-        print "uncompressing uniprot gz fa"
-        open(uniprotFa, "w").write(gzip.open(uniprotFaGz).read())
-
-    if not isfile(uniprotFa):
-        raise Exception("Not found: %s" % uniprotFa)
-
-    print "writing nucleotide lengths of %s to %s" % (uniprotFa, "work/chromSizes")
-    cmd = "faSize %s -detailed | gawk '{$2=$2*3; print}'> work/chromSizes" % uniprotFa
-    run(cmd)
-
-    # keep protein lengths for filtering later, we'll remove everything that 
-    # covers the whole protein
-    seqSizes = {}
-    for line in open("work/chromSizes"):
-        seq, size = line.rstrip("\n").split()
-        seqSizes[seq] = int(size)/3
-        
-    # get uniprot IDs 
-    speciesAccs = set([line.split()[0] for line in open("work/chromSizes")])
-    print "got %d accessions" % len(speciesAccs)
-
-    # read data, write bed to file
-    # this annotates in protein coordinates
-    print "converting tab to bed"
-    ofh = open("work/temp.bed", "w")
-    uniProtMutFname = join(uniprotDir, "uniprot.%s.annots.tab" % taxonId)
-    print "reading %s" % uniProtMutFname
-    for mut in iterTsvRows(open(uniProtMutFname)):
-        if mut.acc not in speciesAccs:
-            logging.debug("skipping %s, not in work/chromSizes" % mut.acc)
-            continue
-        if options.filterAnnots and mut.featType in removeTypes:
-            logging.debug("skipping %s, explicitely removed annotation" % str(mut))
-            continue
-        mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa
-
-        # remove annotations that cover the whole protein, probably not very useful
-        #print mut.begin, mut.end, mut.acc, seqSizes[mut.acc]
-        if int(mut.begin)==1 and int(mut.end)==int(seqSizes[mut.acc])+1:
-            logging.debug("skipping %s, covers whole protein" % str(mut))
-            continue
-
-        mutPos = 3*(int(mut.begin)-1)
-        mutPosEnd = 3*(int(mut.end)-1)
-        if mutName not in mutData:
-            ofh.write("\t".join([mut.acc, str(mutPos), str(mutPosEnd), mutName])+"\n")
-        mutData[mutName].append(mut)
-    ofh.close()
-    
-    # lift from protein sequence bed to target bed
-    print "lifting"
-    cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | LC_COLLATE=C sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname
-    run(cmd)
-
-    print "adding extra fields"
-    # read lifted bed and add uniprot annotations to it as additional fields
-    mutBed = open("spMut.bed", "w")
-    annotBed = open("spAnnot.bed" , "w")
-    structBed = open("spStruct.bed" , "w")
-
-    count = 0
-    blackCount = 0
-    noInfoCount = 0
-    doneAnnots = set() # to skip overlaps: a set with (seq, start, end, name)
-    # name is sequence:start:
-
-    for line in open("work/temp.lifted.bed"):
-        bed = line.strip().split()
-        muts = mutData[bed[3]]
-        varIds = set([mut.varId for mut in muts])
-        pmids = set()
-        for mut in muts:
-            pmids.update(mut.pmid.split(","))
-
-        if len(pmidBlackList.intersection(pmids))==len(pmids):
-            blackCount +=1
-            continue
-
-        if mut.featType in ["mutagenesis site", "sequence variant"]:
-            isMut = True
-            ofh = mutBed
-        elif mut.featType in ["strand","helix", "coiled-coil region", "turn"]:
-            isMut = False
-            ofh = structBed
-        else:
-            isMut = False
-            ofh = annotBed
-
-        bedLine = makeBedLine(muts, bed, isMut)
-        # skip annotations that we most likely have covered before
-        keyFields = tuple(bedLine.split("\t")[:4])
-        if keyFields in doneAnnots:
-            continue
-        doneAnnots.add(keyFields)
-        ofh.write(bedLine)
-        count += 1
-
-    print "%d features written to %s" % (count, ofh.name)
-    mutBed.close()
-    annotBed.close()
-    structBed.close()
-
-    #print "%d variants not mapped to genome" % len(notMapped)
-
-    asFname = "bed12UniProtMut.as"
-    cmd = "bedToBigBed -as=%s spMut.bed %s spMut.bb -type=bed12+ -tab" % \
-        (asFname, chromSizesPath)
-    run(cmd)
-
-    asFname = "bed12UniProtAnnot.as"
-    cmd = "bedToBigBed -as=%s spAnnot.bed %s spAnnot.bb -type=bed12+ -tab" % \
-        (asFname, chromSizesPath)
-    run(cmd)
-    cmd = "bedToBigBed -as=%s spStruct.bed %s spStruct.bb -type=bed12+ -tab" % \
-        (asFname, chromSizesPath)
-    run(cmd)
-
-    #cmd = "rm -rf work"
-    #run(cmd)
-    cmd = "wc -l spMut.bed spAnnot.bed spStruct.bed"
-    run(cmd)
-    #print "%d features written to spMut.bed and %s.bed" % (count, outFname, outFname)
-    print "%d features skipped because of blacklisted PMIDS" % (blackCount)
-    print "%d features skipped because no info (no disease evidence, no comment)" % (noInfoCount)