src/utils/uniprotLift 9461ce3c12c7faa7da2480f9629a370284c54605

9461ce3c12c7faa7da2480f9629a370284c54605
max
  Tue Nov 29 15:52:54 2016 -0800
fixing uniprot on hg38, refs #18451

diff --git src/utils/uniprotLift src/utils/uniprotLift
index d29868c..b644feb 100755
--- src/utils/uniprotLift
+++ src/utils/uniprotLift
@@ -10,31 +10,33 @@
 import sys, gzip, optparse, logging,re
 
 # Globals  ---
 DEBUG = False
 
 pmidBlackList = set(["17344846", "15489334", "16959974", "14702039", "17974005"])
 
 # if --filter option provided, remove these altogether
 removeTypes = set(["helix", "strand", "beta", "turn", "signal"])
 
 # by default, only output identifiers, not html links to them
 # html links are good for custom tracks
 # identifers are good for local spMut tracks
 createLinks = False
 
-urls = {"var": "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?",
+urls = {
+#"var": "http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?",
+"var": "http://www.uniprot.org/uniprot/",
 "uniProt" : "http://www.uniprot.org/uniprot/",
 "pubmed" : "http://www.ncbi.nlm.nih.gov/pubmed/"
 }
 
 # some feature types should not go into the bed file name field, but rather be replaced 
 # with the curator's comments
 useComment = set(["domain", "chain","region of interest","topological domain","short sequence motif"])
 
 # -------------
 
 def iterTsvRows(inFile,encoding="utf8", fieldSep="\t", isGzip=False):
     """ 
         parses tab-sep file with headers as field names 
         yields collection.namedtuples
         strips "#"-prefix from header line
@@ -97,48 +99,99 @@
     for aa in seq:
         longAa = oneToThree.get(aa, aa)
         if longAa==aa:
             print "unknown iupac", aa
         res.append(longAa)
     return "-".join(res)
 
 featTypeColors = {
 "modified residue" : "200,200,0",
 "transmembrane region" : "0,0,100",
 "glycosylation site" : "0,100,100",
 "disulfide bond" : "100,100,100",
 "topological domain" : "100,0,0"
 }
 
-def makeBedLine(mut, bed, isMut):
+disShortNames = {
+    "hepatocellular carcinoma" : "HepatocC",
+    "a hepatocellular carcinoma" : "HepatocC",
+    "a hepatocellular carcinoma sample" : "HepatocC",
+    "hepatocellular carcinomas" : "HepatocC",
+    "ovarian cancer" : "OvC",
+    "colon cancer" : "ColC",
+    "colorectal cancer" : "ColrectC",
+    "hepatoblastoma" : "HepatoBl",
+    "a sporadic cancer": "sporadic",
+    "sporadic cancers": "sporadic",
+    "non-small cell lung cancer cell lines": "NSCLC-CL",
+    "a colon cancer cell line":"ColC-CL"
+}
+
+def removeAnds(disNames):
+    " remove useless 'and' in disease names "
+    newNames = []
+    for name in disNames:
+        if " and " in name:
+            splitNames = name.split(" and ")
+            newNames.extend(splitNames)
+            del disCodes[disCodes.index(name)]
+            disCodes.extend(splitNames)
+        else:
+            newNames.append(name)
+    return newNames
+
+def shortenDisCodes(disCodes):
+    " some disease names are not shortened yet by UniProt. Do this now. "
+    #print "XX", disNames, disCodes
+    #assert(len(disNames)==len(disCodes))
+    newCodes = []
+    for code in disCodes:
+        if " and " in code:
+            splitCodes = code.split(" and ")
+            newCodes.append(disShortNames.get(splitCodes[0], splitCodes[0]))
+            newCodes.append(disShortNames.get(splitCodes[1], splitCodes[1]))
+        else:
+            newCodes.append(disShortNames.get(code, code))
+    return newCodes
+    #return list(set(newCodes))
+
+xxFh = open("uniprotLiftDbg.txt", "w")
+
+def makeBedLine(muts, bed, isMut):
+    mut = muts[-1]
     disStatus = set([mut.disRelated for mut in muts])
     comments = [mut.comment for mut in muts if mut.comment!=""]
     # skip if it's not disease related 
     #if not "disRelated" in disStatus and mut.featType=="sequence variant" \
         #and len(comments)==0 and mut.disease=="":
         #noInfoCount +=1
         
-    diseases = list(set([mut.disease for mut in muts if mut.disease!=""]))
-    disCodes = list(set([mut.disCode for mut in muts if mut.disCode!=""]))
+    diseases = list([mut.disease for mut in muts if mut.disease!=""])
+    disCodes = list([mut.disCode for mut in muts if mut.disCode!=""])
+    #diseases = removeAnds(diseases)
+    if len(diseases)!=0:
+        xxFh.write("%s\t%s\n" % ("|".join(diseases), "|".join(disCodes)))
+    disCodes = shortenDisCodes(disCodes)
+
     acc = muts[0].acc
     dbSnpIds = [mut.dbSnpId for mut in muts]
 
     # set the bed name field to a disease, to the mutation or something else
     if isMut and mut.origAa=="":
-        bed[3] = "%s-%s:del" % (mut.begin, mut.end)
+        bed[3] = "%s-%sdel" % (mut.begin, mut.end)
     else:
-        bed[3] = "%s->%s" % (mut.origAa, mut.mutAa)
+        bed[3] = "%s%s%s" % (mut.origAa, mut.begin, mut.mutAa)
 
     disName = ",".join(disCodes)
     if len(disName)>30:
         disName = disName[:30]+"..."
     if len(disCodes)>0 and not "strain" in disName:
         bed[3] += " in %s" % disName
 
     if mut.featType=="sequence variant":
         varType = "Naturally occurring sequence variant"
         bed[8] = "100,0,0"
     elif mut.featType=="mutagenesis site":
         varType = "Experimental mutation of amino acids"
         bed[8] = "0,0,100"
     else:
         bed[3] = mut.shortFeatType
@@ -190,42 +243,45 @@
     if isMut:
         # create description of mutation
         if int(mut.end)-int(mut.begin)==1:
             posStr = "position %s" % mut.begin
         else:
             posStr = "position %s-%s" % (mut.begin, mut.end)
 
         if mut.origAa=="":
             bed.append("%s, removal of amino acids" % (posStr))
         else:
             bed.append("%s, %s changed to %s" % \
                 (posStr, aaToLong(mut.origAa), aaToLong(mut.mutAa)))
     else:
         #  just position of feature
         if int(mut.end)-int(mut.begin)==1:
-            posStr = "amino acid %s" % str(int(mut.begin)+1)
+            posStr = "amino acid %s" % str(int(mut.begin)) # show a 1-based coordinate
         else:
             posStr = "amino acids %s-%s" % (mut.begin, str(int(mut.end)-1))
         posStr += " on protein %s" % mut.acc
         bed.append(posStr)
 
     comments = [com for com in comments if com.strip()!=""]
     commentField = ", ".join(comments)
     if len(diseases)!=0:
         commentField = ",".join(diseases) + "; " + commentField
     bed.append(commentField)
     if isMut:
+        varIds = [mut.varId for mut in muts]
+        if varIds!=[""]:
+            varIds = ["%s#%s|%s" % (acc, x, x) for x in varIds] # Andrew Nightingale request that we link to UniProt, not SwissProt
         bed.append(htmlLink('var', varIds))
         dbSnpIds = [id for id in dbSnpIds if id.strip()!=""]
         bed.append(",".join(dbSnpIds))
     bed.append(htmlLink('uniProt', [acc]))
     bed.append(htmlLink('pubmed', pmids))
     bed[5] = "."
     bedLine = "\t".join(bed)+"\n"
     return bedLine
 
 # ------ MAIN ------
 if __name__ == '__main__':
     parser = optparse.OptionParser("""usage: %prog [options] uniprotTabDir taxonId chromSizesFile liftPslFile - lift uniprot annotations to genome. 
     
     uniprotTabDir is the directory with the output from uniprotToTab
     
@@ -312,75 +368,75 @@
     # lift from protein sequence bed to target bed
     print "lifting"
     cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | LC_COLLATE=C sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname
     run(cmd)
 
     print "adding extra fields"
     # read lifted bed and add uniprot annotations to it as additional fields
     mutBed = open("spMut.bed", "w")
     annotBed = open("spAnnot.bed" , "w")
     structBed = open("spStruct.bed" , "w")
 
     count = 0
     blackCount = 0
     noInfoCount = 0
     doneAnnots = set() # to skip overlaps: a set with (seq, start, end, name)
+    # name is sequence:start:
 
     for line in open("work/temp.lifted.bed"):
         bed = line.strip().split()
         muts = mutData[bed[3]]
         varIds = set([mut.varId for mut in muts])
         pmids = set()
         for mut in muts:
             pmids.update(mut.pmid.split(","))
 
         if len(pmidBlackList.intersection(pmids))==len(pmids):
             blackCount +=1
             continue
 
         if mut.featType in ["mutagenesis site", "sequence variant"]:
             isMut = True
             ofh = mutBed
-        elif mut.featType in ["strand","helix", "coiled-coil region", "turn", "disulfide bond", "glycosylation site"]:
+        elif mut.featType in ["strand","helix", "coiled-coil region", "turn"]:
             isMut = False
             ofh = structBed
         else:
             isMut = False
             ofh = annotBed
 
-        bedLine = makeBedLine(mut, bed, isMut)
+        bedLine = makeBedLine(muts, bed, isMut)
         # skip annotations that we most likely have covered before
-        if not isMut:
         keyFields = tuple(bedLine.split("\t")[:4])
         if keyFields in doneAnnots:
             continue
         doneAnnots.add(keyFields)
         ofh.write(bedLine)
         count += 1
 
     print "%d features written to %s" % (count, ofh.name)
     mutBed.close()
     annotBed.close()
     structBed.close()
 
     #print "%d variants not mapped to genome" % len(notMapped)
 
     asFname = "bed12UniProtMut.as"
     cmd = "bedToBigBed -as=%s spMut.bed %s spMut.bb -type=bed12+ -tab" % \
         (asFname, chromSizesPath)
     run(cmd)
 
     asFname = "bed12UniProtAnnot.as"
     cmd = "bedToBigBed -as=%s spAnnot.bed %s spAnnot.bb -type=bed12+ -tab" % \
         (asFname, chromSizesPath)
     run(cmd)
     cmd = "bedToBigBed -as=%s spStruct.bed %s spStruct.bb -type=bed12+ -tab" % \
         (asFname, chromSizesPath)
     run(cmd)
 
-    cmd = "rm -rf work"
-    run(cmd)
+    #cmd = "rm -rf work"
+    #run(cmd)
     cmd = "wc -l spMut.bed spAnnot.bed spStruct.bed"
     run(cmd)
     #print "%d features written to spMut.bed and %s.bed" % (count, outFname, outFname)
     print "%d features skipped because of blacklisted PMIDS" % (blackCount)
     print "%d features skipped because no info (no disease evidence, no comment)" % (noInfoCount)