src/utils/bedAppendPaperInfo ffa0c07bfa9b8783b30f5a8ff17b3955bf2701a2

ffa0c07bfa9b8783b30f5a8ff17b3955bf2701a2
max
  Tue Dec 3 08:18:57 2019 -0800
avada update based on QA feedback, refs #24156

diff --git src/utils/bedAppendPaperInfo src/utils/bedAppendPaperInfo
index bdbb3e4..53084d9 100755
--- src/utils/bedAppendPaperInfo
+++ src/utils/bedAppendPaperInfo
@@ -1,106 +1,119 @@
 #!/usr/bin/env python2
 
 import logging, sys, optparse
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile
 
 from unidecode import unidecode # not installed? Install with "pip install unidecode" possibly followed by --user
 
 sys.path.append("/hive/data/inside/pubs/tools/lib/")
 from pubStore import lookupArticleByPmid
 #from pubMap import makeRefString
 from pubGeneric import dictToRefString, firstAuthor
 
 # ==== functions =====
     
 def parseArgs():
     """ Set up logging and parse the command line.
 
     Prints the auto-generated help page and exits with status 1 unless exactly
     two positional arguments (inBed, outBed) are given.
 
     Returns the tuple (args, options): args is the list of positional
     arguments, options is the optparse result with the attributes debug, db,
     pmidColIdx and geneIdx.
     """
     # the PMID column is selected with --pmidIdx, it is not a positional argument
     parser = optparse.OptionParser("usage: %prog [options] inBed outBed - add information on a PMID, journal, author, abstract, to a BED file that has a PMID column. Column indexes (--pmidIdx, --geneIdx) are 0-based.")
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
     parser.add_option("", "--db", dest="db", action="store",
             help="directory where articles.db is stored. Used /dev/shm to speed up lookups by 100x. "
                 "Default will use 'medline' relative to ~/.pubConf directory.", default="medline")
     parser.add_option("", "--pmidIdx", dest="pmidColIdx", action="store", type="int",
             help="field with PMID, default is last field", default=-1)
+    parser.add_option("", "--geneIdx", dest="geneIdx", action="store", type="int",
+            help="field with gene, default is no gene field", default=None)
     (options, args) = parser.parse_args()
 
     # reject every wrong argument count here, not only the empty command line
     if len(args) != 2:
         parser.print_help()
         sys.exit(1)
 
     # basicConfig() does nothing if the root logger already has handlers,
     # so the level is also set on the root logger directly
     logLevel = logging.DEBUG if options.debug else logging.INFO
     logging.basicConfig(level=logLevel)
     logging.getLogger().setLevel(logLevel)
 
     return args, options
 
 def shortenString(s, maxLen=255, onlyFirst=False):
     """ Shorten s if it is longer than maxLen characters, otherwise return it unchanged.
 
     onlyFirst=True:  keep the first maxLen characters and append '...'
                      (the result is maxLen+3 characters long).
     onlyFirst=False: keep the first maxLen//2-3 characters, then '...', then
                      the last maxLen//2 characters (for the default of 255:
                      124 + 3 + 127 = 254 characters).
 
     maxLen below 6 is not meaningful, the index of the head slice would become negative.
     """
     if len(s) > maxLen:
         if onlyFirst:
             s = s[:maxLen] + "..."
         else:
             # floor division: slice indexes must be integers, "/" only
             # returns an integer under Python 2 division rules
             half = maxLen // 2
             s = s[:(half-3)]+"..."+s[-half:]
     return s
 
 # ----------- main --------------
 def main():
     """ Append article information to every row of a tab-separated BED file.
 
     For each input row the PMID is taken from the --pmidIdx column and looked
     up with lookupArticleByPmid(), passing the --db value. The article title,
     authors, reference string, DOI, abstract and a mouse-over text are appended
     as extra columns and the row is written to the output file. If --geneIdx
     is given, the mouse-over text starts with "gene:name" instead of the name.
     """
     args, options = parseArgs()
 
     inBedFname, outBedFname = args
     pmidColIdx = int(options.pmidColIdx)
+    geneIdx = options.geneIdx
 
     ofh = open(outBedFname, "w")
 
     # PMID -> article info, so a PMID that occurs on several rows is looked up only once
     artCache = {}
+    gene = None
 
     lCount = 0
     for line in open(inBedFname):
         line = line.decode("latin1")
         if lCount % 1000 == 0:
             print("%d rows written" % lCount)
             ofh.flush()
         lCount += 1
 
         row = line.rstrip("\n").split("\t")
         pmid = row[pmidColIdx]
+
+        # compare with None: 0 is a valid column index but would be skipped by a truth test
+        if geneIdx is not None:
+            gene = row[geneIdx]
+
         if pmid in artCache:
             artInfo = artCache[pmid] # saves a little time
         else:
             # NOTE(review): assumes the lookup always returns a dict with the keys
             # title, authors, doi, abstract and year - confirm for unknown PMIDs
             artInfo = lookupArticleByPmid([options.db], pmid)
             artCache[pmid] = artInfo
 
         ref = dictToRefString(artInfo)
 
         # The UCSC browser only handles latin1, but the name field can only contain ASCII
         row[3] = unidecode(row[3])
         lineStart = "\t".join(row)
         lineStart = lineStart.encode("latin1")
 
         newRow = []
         newRow.append(shortenString(artInfo["title"], maxLen=10000))
         newRow.append(shortenString(artInfo["authors"]))
         newRow.append(ref)
         newRow.append(artInfo["doi"])
         newRow.append(shortenString(artInfo["abstract"], maxLen=10000))
 
-        mouseOver = row[3]+" in:"+firstAuthor(artInfo["authors"])+ " "+ str(artInfo["year"]) + " - " + \
+        if gene:
+            prefix = gene+ ":"+row[3]
+        else:
+            prefix = row[3]
+
+        mouseOver = prefix + " in: "+firstAuthor(artInfo["authors"])+ " "+ str(artInfo["year"]) + " - " + \
             shortenString(artInfo["title"], maxLen=150, onlyFirst=True)
         newRow.append(mouseOver)
 
         newLine = u"\t".join(newRow)
         try:
             newLine = newLine.encode("latin1")
         except UnicodeEncodeError:
             newLine = unidecode(newLine) # if we cannot encode it in latin1, use the hand-built translation tables
 
         fullLine = lineStart+"\t"+newLine
 
         ofh.write(fullLine)
         ofh.write("\n")
 
     # close explicitly so the last buffered rows are on disk when main() returns
     ofh.close()
 
 # Script entry point: main() is called unconditionally when this file is executed (there is no __name__ guard)
 main()