# Commit ffa0c07bfa9b8783b30f5a8ff17b3955bf2701a2
# Author: max    Date: Tue Dec 3 08:18:57 2019 -0800
# avada update based on QA feedback, refs #24156
#
# src/utils/bedAppendPaperInfo as of this commit, from line 11 of the file onwards.
# NOTE(review): optparse, logging, unidecode and lookupArticleByPmid are used below
# but are not imported in this part of the file -- presumably they are imported
# above line 11; confirm.

#from pubMap import makeRefString

# ==== functions =====
def parseArgs():
    """ setup logging, parse command line arguments and options. -h shows auto-generated help page

    Returns the tuple (args, options); args is always [inBed, outBed].
    Prints the help page and exits with status 1 if the number of positional
    arguments is not exactly two.
    """
    # The PMID column is selected with --pmidIdx, it is not a positional argument.
    parser = optparse.OptionParser("usage: %prog [options] inBed outBed - add information on a PMID, journal, author, abstract, to a BED file that has a PMID column. Column indexes (--pmidIdx, --geneIdx) are 0-based.")

    parser.add_option("-d", "--debug", dest="debug", action="store_true",
        help="show debug messages")
    parser.add_option("", "--db", dest="db", action="store",
        help="directory where articles.db is stored. Used /dev/shm to speed up lookups by 100x. "
        "Default will use 'medline' relative to ~/.pubConf directory.", default="medline")
    parser.add_option("", "--pmidIdx", dest="pmidColIdx", action="store", type="int",
        help="field with PMID, default is last field", default=-1)
    parser.add_option("", "--geneIdx", dest="geneIdx", action="store", type="int",
        help="field with gene, default is no gene field", default=None)
    (options, args) = parser.parse_args()

    # main() unpacks exactly two file names, so reject anything else here
    # instead of failing later with a ValueError
    if len(args) != 2:
        parser.print_help()
        raise SystemExit(1)

    if options.debug:
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger().setLevel(logging.INFO)
    return args, options

def shortenString(s, maxLen=255, onlyFirst=False):
    """ shorten s if it is longer than maxLen characters, otherwise return it unchanged.

    onlyFirst=False: keep the first (maxLen//2)-3 and the last maxLen//2 characters,
    joined by '...'. With the default maxLen=255 that is 124 + '...' + 127 characters.
    onlyFirst=True: keep the first maxLen characters and append '...', so the
    result is maxLen+3 characters long.
    """
    if len(s) <= maxLen:
        return s

    if onlyFirst:
        return s[:maxLen] + "..."

    # integer division: identical to "/" on Python 2 ints, and still an int on Python 3
    half = maxLen // 2
    # for very small maxLen, avoid a negative slice end (which would count from
    # the end of s) and s[-0:] (which would be the whole string)
    headLen = max(half - 3, 0)
    tail = s[-half:] if half > 0 else ""
    return s[:headLen] + "..." + tail

def geneFromRow(row, geneIdx):
    """ return the field geneIdx of row, or None if no gene column was specified.

    geneIdx is compared against None explicitly, as 0 is a valid 0-based column index.
    """
    if geneIdx is None:
        return None
    return row[geneIdx]

def mouseOverPrefix(name, gene):
    """ return 'gene:name' if gene is a non-empty string, otherwise just name """
    if gene:
        return gene + ":" + name
    return name

# ----------- main --------------
def main():
    """ append title, authors, reference, doi, abstract and a mouse-over text to every row
    of the input BED file and write the result to the output BED file """
    # only main() needs these two functions; importing them here keeps the helper
    # functions above importable without the pub tools on sys.path
    from pubGeneric import dictToRefString, firstAuthor

    args, options = parseArgs()

    inBedFname, outBedFname = args
    pmidColIdx = int(options.pmidColIdx)
    geneIdx = options.geneIdx

    artCache = {}
    lCount = 0

    # Binary mode: the lines are decoded and encoded by hand below, so the file
    # objects only ever see bytes. Both files are closed when the block is left,
    # also in case of an error.
    with open(inBedFname, "rb") as ifh, open(outBedFname, "wb") as ofh:
        for line in ifh:
            line = line.decode("latin1")
            if lCount % 1000 == 0:
                print("%d rows written" % lCount)
                ofh.flush()
            lCount += 1

            row = line.rstrip("\n").split("\t")
            pmid = row[pmidColIdx]
            gene = geneFromRow(row, geneIdx)

            if pmid in artCache:
                artInfo = artCache[pmid] # saves a little time
            else:
                artInfo = lookupArticleByPmid([options.db], pmid)
                artCache[pmid] = artInfo

            ref = dictToRefString(artInfo)

            # The UCSC browser only handles latin1, but the name field can only contain ASCII
            row[3] = unidecode(row[3])

            lineStart = u"\t".join(row)
            lineStart = lineStart.encode("latin1")

            newRow = []
            newRow.append(shortenString(artInfo["title"], maxLen=10000))
            newRow.append(shortenString(artInfo["authors"]))
            newRow.append(ref)
            newRow.append(artInfo["doi"])
            newRow.append(shortenString(artInfo["abstract"], maxLen=10000))

            prefix = mouseOverPrefix(row[3], gene)
            mouseOver = prefix + " in: "+firstAuthor(artInfo["authors"])+ " "+ str(artInfo["year"]) + " - " + \
                shortenString(artInfo["title"], maxLen=150, onlyFirst=True)
            newRow.append(mouseOver)

            newLine = u"\t".join(newRow)
            try:
                newLine = newLine.encode("latin1")
            except UnicodeEncodeError:
                # if we cannot encode it in latin1, use the hand-built translation tables.
                # unidecode output is plain ASCII, encode it so that newLine is bytes either way
                newLine = unidecode(newLine).encode("ascii")

            fullLine = lineStart+b"\t"+newLine
            ofh.write(fullLine)
            ofh.write(b"\n")

if __name__ == "__main__":
    main()