src/utils/bedAppendPaperInfo cade13fb56eb21ed057867859814ff02ba55457b

cade13fb56eb21ed057867859814ff02ba55457b
max
  Mon Dec 2 08:51:11 2019 -0800
adding medline info to AVADA track, refs #24156

diff --git src/utils/bedAppendPaperInfo src/utils/bedAppendPaperInfo
new file mode 100755
index 0000000..bdbb3e4
--- /dev/null
+++ src/utils/bedAppendPaperInfo
@@ -0,0 +1,106 @@
+#!/usr/bin/env python2
+
+import logging, sys, optparse
+from collections import defaultdict
+from os.path import join, basename, dirname, isfile
+
+from unidecode import unidecode # not installed? Install with "pip install unidecode" possibly followed by --user
+
+sys.path.append("/hive/data/inside/pubs/tools/lib/")
+from pubStore import lookupArticleByPmid
+#from pubMap import makeRefString
+from pubGeneric import dictToRefString, firstAuthor
+
+# ==== functions =====
+    
+def parseArgs():
+    """ Set up logging and parse the command line arguments and options.
+
+    -h shows the auto-generated help page. The help page is also printed, and
+    the program exits with status 1, unless exactly two positional arguments
+    (inBed and outBed) are given.
+
+    Returns the tuple (args, options): the list of the two positional arguments
+    and the optparse values object with the attributes debug, db and pmidColIdx.
+    """
+    parser = optparse.OptionParser("usage: %prog [options] inBed outBed - add information on a PMID, journal, author, abstract, to a BED file that has a PMID column. Use --pmidIdx to select the PMID column.")
+
+    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+    parser.add_option("", "--db", dest="db", action="store",
+            help="directory where articles.db is stored. Use /dev/shm to speed up lookups by 100x. "
+                "Default will use 'medline' relative to ~/.pubConf directory.", default="medline")
+    parser.add_option("", "--pmidIdx", dest="pmidColIdx", action="store", type="int",
+            help="0-based index of the field with the PMID, default is the last field", default=-1)
+    (options, args) = parser.parse_args()
+
+    # main() unpacks exactly two positional arguments, so any other number is a usage error
+    if len(args) != 2:
+        parser.print_help()
+        sys.exit(1)
+
+    # basicConfig() does nothing if the root logger already has handlers,
+    # so the level is also set explicitly on the root logger
+    logLevel = logging.DEBUG if options.debug else logging.INFO
+    logging.basicConfig(level=logLevel)
+    logging.getLogger().setLevel(logLevel)
+
+    return args, options
+
+def shortenString(s, maxLen=255, onlyFirst=False):
+    """ Shorten s if it is longer than maxLen characters, otherwise return it unchanged.
+
+    s         -- the string to shorten
+    maxLen    -- strings with more characters than this get shortened
+    onlyFirst -- if True, keep the first maxLen characters and append '...', so
+                 the result is maxLen+3 characters long. If False, keep the
+                 first maxLen/2-3 and the last maxLen/2 characters with '...'
+                 in between, e.g. first 124+'...'+last 127 chars for maxLen=255.
+    """
+    if len(s) <= maxLen:
+        return s
+
+    if onlyFirst:
+        return s[:maxLen] + "..."
+
+    # floor division: same result as '/' on python2 ints, but stays an integer
+    # (and therefore a valid slice index) under true division
+    half = maxLen // 2
+    # clamp at 0, a negative end index would keep almost the whole string
+    head = s[:max(half - 3, 0)]
+    # s[-0:] is the whole string, so an empty tail has to be handled explicitly
+    tail = s[-half:] if half > 0 else ""
+    return head + "..." + tail
+
+# ----------- main --------------
+def main():
+    """ Append article information to every row of a BED file.
+
+    Reads the input BED file line by line, looks up the article for the PMID
+    found in the configured column and writes the original fields followed by
+    title, authors, reference string, DOI, abstract and a mouseover text to the
+    output BED file.
+    """
+    args, options = parseArgs()
+
+    inBedFname, outBedFname = args
+    pmidColIdx = int(options.pmidColIdx)
+
+    artCache = {} # pmid -> article info, so every PMID is looked up only once
+
+    # 'with' closes both files, and thereby flushes the output, even if a row fails.
+    # The output file is opened first, as before.
+    with open(outBedFname, "w") as ofh, open(inBedFname) as ifh:
+        for lCount, line in enumerate(ifh):
+            line = line.decode("latin1")
+            # progress message and flush every 1000 rows, lCount rows are done at this point
+            if lCount % 1000 == 0:
+                print("%d rows written" % lCount)
+                ofh.flush()
+
+            row = line.rstrip("\n").split("\t")
+            pmid = row[pmidColIdx]
+            if pmid in artCache:
+                artInfo = artCache[pmid] # saves a little time
+            else:
+                # NOTE(review): the code below assumes that the lookup always returns a
+                # dict-like record - confirm what it returns for unknown PMIDs
+                artInfo = lookupArticleByPmid([options.db], pmid)
+                artCache[pmid] = artInfo
+
+            ref = dictToRefString(artInfo)
+
+            # The UCSC browser only handles latin1, but the name field can only contain ASCII
+            row[3] = unidecode(row[3])
+            lineStart = "\t".join(row)
+            lineStart = lineStart.encode("latin1")
+
+            # the fields that get appended to the original row
+            newRow = []
+            newRow.append(shortenString(artInfo["title"], maxLen=10000))
+            newRow.append(shortenString(artInfo["authors"]))
+            newRow.append(ref)
+            newRow.append(artInfo["doi"])
+            newRow.append(shortenString(artInfo["abstract"], maxLen=10000))
+
+            # mouseover text: name, first author, year and the start of the title
+            mouseOver = row[3]+" in:"+firstAuthor(artInfo["authors"])+ " "+ str(artInfo["year"]) + " - " + \
+                shortenString(artInfo["title"], maxLen=150, onlyFirst=True)
+            newRow.append(mouseOver)
+
+            newLine = u"\t".join(newRow)
+            try:
+                newLine = newLine.encode("latin1")
+            except UnicodeEncodeError:
+                newLine = unidecode(newLine) # if we cannot encode it in latin1, use the hand-built translation tables
+
+            fullLine = lineStart+"\t"+newLine
+
+            ofh.write(fullLine)
+            ofh.write("\n")
+
+# run only when executed as a script, so loading the file as a module has no side effects
+if __name__ == "__main__":
+    main()