#!/usr/bin/env python2.7
# Provenance: commit 16720e7dfab79a7f60e91f0cb102a213c3e4738a, max, Fri Apr 28 15:39:08 2017 -0700
# "first big commit for hgGeneGraph. Others will follow as QA progresses. refs #13634"
# File: src/utils/ggMsrToTab
"""Convert MSR BioNLP text mining format to our tab-sep gene graph format, write to stdout."""

import logging, sys, optparse
from collections import defaultdict, namedtuple
from os.path import join, basename, dirname, isfile

# names of the output columns, in output order. Every output row must follow this order.
outFields = "eventId,causeType,causeName,causeGenes,themeType,themeName,themeGenes"\
    ",relType,relSubtype,pmid,sentenceId,triggerTokenId,themeTokenStart,themeTokenEnd,"\
    "causeTokenStart,causeTokenEnd,sentence".split(',')

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
def parseArgs():
    " parse the command line, configure logging, return (parser, options, args) "
    parser = optparse.OptionParser("usage: %prog [options] filename - convert MSR BioNLP "
        "text mining format to our tab sep gene graph format, write to stdout")

    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    (options, args) = parser.parse_args()

    if options.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    return parser, options, args

# ==== FUNCTIONs =====
def lineFileNext(fh, headers=None, colCount=None):
    """
    parses tab-sep file with headers as field names
    yields collections.namedtuples, one per data line

    fh       -- open file-like object, read line by line
    headers  -- list of field names. If None, the names are taken from the first
                line of fh: newline and "#" are stripped, spaces are replaced
                with "_" and parentheses are removed.
    colCount -- if not None, only the first colCount fields of a line are used

    Lines that cannot be converted to a record (field count does not match
    the headers) are logged as errors and skipped, they do not stop the parse.
    """
    if headers is None:
        line1 = fh.readline()
        line1 = line1.strip("\n").strip("#")
        headers = line1.split("\t")
        headers = [h.replace(" ","_").replace("(","").replace(")","") for h in headers]
    Record = namedtuple('tsvRec', headers)

    for line in fh:
        line = line.rstrip("\n")
        fields = line.split("\t")
        if colCount is not None:
            fields = fields[:colCount]
        try:
            rec = Record(*fields)
        except TypeError as msg:
            # namedtuple raises TypeError when the number of fields differs from the headers
            logging.error("Exception occured while parsing line, %s" % msg)
            # not every file-like object has a .name (e.g. StringIO)
            logging.error("Filename %s" % getattr(fh, "name", "<unknown>"))
            logging.error("Line was: %s" % repr(line))
            logging.error("Does number of fields match headers?")
            logging.error("Headers are: %s" % headers)
            continue
        yield rec

def convMsr(gene, fam, comp, tokenStart, tokenEnd, tokens):
    """
    given the split msr format, translate to our format type, name, id
    and extract name from snippet

    gene, fam, comp      -- MSR gene, family and complex fields, the first non-empty one is used
    tokenStart, tokenEnd -- token positions, converted with int(), used as tokens[start:end]
    tokens               -- list of the tokens of the sentence

    returns (entType, name, ids):
    entType is "gene", "family" or "complex",
    name is the space-joined token slice,
    ids is the gene field as it is or, for family and complex, the second
    "."-separated part of the field with "_" replaced by "|".
    raises ValueError if gene, fam and comp are all empty.
    """
    name = " ".join(tokens[int(tokenStart):int(tokenEnd)])
    if gene!="":
        entType = "gene"
        ids = gene
    elif fam!="":
        entType = "family"
        ids = fam.split(".")[1].replace("_", "|")
    elif comp!="":
        entType = "complex"
        ids = comp.split(".")[1].replace("_", "|")
    else:
        # without this branch entType and ids would be unbound at the return
        raise ValueError("entity '%s' has no gene, family or complex identifier" % name)
    return entType, name, ids

def convRow(row):
    " convert one parsed MSR record to a list of strings, in the order of outFields "
    eventId = "msr"+row.EventId
    tokens = row.Sentence.split()
    themeType, themeName, themeGenes = \
        convMsr(row.ThemeGene, row.ThemeFamily, row.ThemeComplex, row.ThemeTokenStart, row.ThemeTokenEnd, tokens)
    causeType, causeName, causeGenes = \
        convMsr(row.CauseGene, row.CauseFamily, row.CauseComplex, row.CauseTokenStart, row.CauseTokenEnd, tokens)
    subType = row.EventSubtype.replace("Theme:","")
    # cause fields come before theme fields, as in outFields, so that every
    # value ends up under the header that names it
    return [eventId, causeType, causeName, causeGenes, themeType, themeName, themeGenes,
        row.EventType, subType, row.Pmid, row.SentenceId, row.TriggerTokenId, row.ThemeTokenStart,
        row.ThemeTokenEnd, row.CauseTokenStart, row.CauseTokenEnd, row.Sentence]

# ----------- MAIN --------------
def main():
    " read the MSR file given on the command line, write header and converted rows to stdout "
    parser, options, args = parseArgs()
    if not args:
        parser.print_help()
        sys.exit(1)

    # print() with a single string argument behaves the same under python 2 and 3
    print("\t".join(outFields))
    filename = args[0]
    with open(filename) as ifh:
        for row in lineFileNext(ifh):
            print("\t".join(convRow(row)))

if __name__ == "__main__":
    main()