#!/usr/bin/env python2
# Provenance: commit cade13fb56eb21ed057867859814ff02ba55457b by max,
# Mon Dec 2 08:51:11 2019 -0800, "adding medline info to AVADA track, refs #24156"
# (new file src/utils/bedAppendPaperInfo, mode 100755).
"""Append article information (title, authors, reference, DOI, abstract and a
mouse-over label) to every row of a BED file that has a PMID column."""

import logging
import optparse
import sys
from collections import defaultdict
from os.path import join, basename, dirname, isfile

# ==== functions =====

def parseArgs():
    """Set up logging and parse the command line. -h shows the auto-generated help page.

    Returns a tuple (args, options) where args is the list
    [inBedFname, outBedFname] and options is the optparse result with the
    attributes debug, db and pmidColIdx.

    Prints the help page and exits with status 1 unless exactly two
    positional arguments were given.
    """
    parser = optparse.OptionParser("usage: %prog [options] inBed outBed - add information on a PMID, journal, author, abstract, to a BED file that has a PMID column. Use --pmidIdx to select the PMID column (0-based).")

    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    parser.add_option("", "--db", dest="db", action="store",
        help="directory where articles.db is stored. Used /dev/shm to speed up lookups by 100x. "
        "Default will use 'medline' relative to ~/.pubConf directory.", default="medline")
    parser.add_option("", "--pmidIdx", dest="pmidColIdx", action="store", type="int",
        help="field with PMID, default is last field", default=-1)
    (options, args) = parser.parse_args()

    # main() unpacks exactly two file names, so reject every other count here
    # instead of failing later with a ValueError.
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    level = logging.DEBUG if options.debug else logging.INFO
    logging.basicConfig(level=level)
    # basicConfig() does nothing if the root logger is already configured,
    # so the level is also set explicitly.
    logging.getLogger().setLevel(level)

    return args, options

def shortenString(s, maxLen=255, onlyFirst=False):
    """Shorten s if it is longer than maxLen characters.

    Strings of at most maxLen characters are returned unchanged.
    With onlyFirst, the first maxLen characters are kept and "..." is
    appended, so the result is maxLen+3 characters long.
    Otherwise the first (maxLen//2 - 3) and the last (maxLen//2) characters
    are kept, joined by "...": for maxLen=255 that is 124 + "..." + 127.
    """
    if len(s) <= maxLen:
        return s
    if onlyFirst:
        return s[:maxLen] + "..."

    # Floor division keeps the slice indexes integers on Python 2 and 3.
    half = maxLen // 2
    # For tiny maxLen the head length would go negative and s[:-1] would keep
    # almost the whole string; likewise s[-0:] would keep all of it.
    headLen = max(0, half - 3)
    tail = s[-half:] if half > 0 else ""
    return s[:headLen] + "..." + tail

# ----------- main --------------
def main():
    """Read the input BED, look up the article of every row by its PMID and
    write the row plus six extra fields to the output BED.

    The appended fields are: title, authors, reference string, DOI, abstract
    and a mouse-over label. Field 4 (the BED name) is transliterated to ASCII.
    """
    args, options = parseArgs()

    # Imported here, after argument parsing, so that -h and the argument check
    # work on machines without unidecode or the pubs library tree.
    from unidecode import unidecode # not installed? Install with "pip install unidecode" possibly followed by --user

    sys.path.append("/hive/data/inside/pubs/tools/lib/")
    from pubStore import lookupArticleByPmid
    #from pubMap import makeRefString
    from pubGeneric import dictToRefString, firstAuthor

    inBedFname, outBedFname = args
    pmidColIdx = options.pmidColIdx # already an int, see type="int" in parseArgs

    artCache = {} # pmid -> article info, saves a little time on repeated PMIDs

    # The input is opened first so that a missing input file does not truncate
    # an existing output file. Both files are closed on any exit path.
    with open(inBedFname) as ifh, open(outBedFname, "w") as ofh:
        for lCount, line in enumerate(ifh):
            line = line.decode("latin1")
            if lCount % 1000 == 0:
                print("%d rows written" % lCount)
                ofh.flush()

            # Also strip "\r": the PMID is the last field by default and a
            # trailing carriage return would corrupt it.
            row = line.rstrip("\r\n").split("\t")
            pmid = row[pmidColIdx]
            if pmid in artCache:
                artInfo = artCache[pmid]
            else:
                # NOTE(review): artInfo is indexed like a dict with the keys
                # title, authors, doi, abstract and year below. What the lookup
                # returns for an unknown PMID is not visible here - confirm.
                artInfo = lookupArticleByPmid([options.db], pmid)
                artCache[pmid] = artInfo

            ref = dictToRefString(artInfo)

            # The UCSC browser only handles latin1, but the name field can only contain ASCII
            row[3] = unidecode(row[3])
            lineStart = "\t".join(row).encode("latin1")

            mouseOver = row[3]+" in:"+firstAuthor(artInfo["authors"])+ " "+ str(artInfo["year"]) + " - " + \
                shortenString(artInfo["title"], maxLen=150, onlyFirst=True)

            newRow = [
                shortenString(artInfo["title"], maxLen=10000),
                shortenString(artInfo["authors"]),
                ref,
                artInfo["doi"],
                shortenString(artInfo["abstract"], maxLen=10000),
                mouseOver,
            ]

            newLine = u"\t".join(newRow)
            try:
                newLine = newLine.encode("latin1")
            except UnicodeEncodeError:
                newLine = unidecode(newLine) # if we cannot encode it in latin1, use the hand-built translation tables

            ofh.write(lineStart + "\t" + newLine + "\n")

# Only run when executed as a script, not when the functions are imported.
if __name__ == "__main__":
    main()