b88380414ddecaab81f5c19b9f0cf06dbbedc787 max Fri May 26 06:36:28 2017 -0700 changing python hash bang again for hgGeneGraph, tiny wrangling fix, refs #13634

diff --git src/utils/ggTables src/utils/ggTables
index bb07669..7dbe660 100755
--- src/utils/ggTables
+++ src/utils/ggTables
@@ -1,1255 +1,1267 @@
#!/usr/bin/env python2.7

import logging, sys, optparse, glob, itertools, os, tempfile, gzip, operator, re, cPickle, gc
import marshal
from collections import defaultdict, namedtuple, Counter
from itertools import chain, combinations
from os.path import join, basename, dirname, isfile, expanduser

import ujson # saves 20% of time when loading graph marshal
import gc
gc.disable()

#LTPMAXMEMBERS=5 # maximum number of proteins in a complex for an interaction to qualify as low throughput
LTPMAX=5 # maximum number of interactions a PMID can have to be declared low throughput

# don't even write out links with fewer than this number of documents. 2 = weeds out many false positives.
# not used right now, because a few pathway databases do not annotate ANY PMID. Increasing this filter would remove
# all interactions from these pathway databases.
MINSUPP=0

outFields = ["gene1", "gene2", "flags", "refs", "fwWeight", "revWeight", "snip"]

# directory with autoSql descriptions of output tables
autoSqlDir = expanduser("~/kent/src/hg/lib/")

# file with all of medline in short form
allArtFname = "textInfo.tab"

# file with just pmids and events
#pmidEventFname = "temp.pmidToEvent.tab"

# RE to split sentences into words
wordRe = re.compile("[a-zA-Z0-9]+")

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] build|load pathwayDir ppiDir textDir outDir - given various tab-sep files with text-mining, gene-interaction or pathway information, build the tables ggLink, ggDoc, ggDb and ggText

run it like this:
-%prog medline - to reduce the big medline table to something smaller, only needed once
+Reduce the big medline table to something smaller, only needed once:
+    %prog medline
+
+Slowest part: build the big table of interactions, mysql/ggLink.tmp.tab:
 %prog build pathways ppi text mysql
-%prog docs mysql # creates the ggDocs.tab file, slow
+
+Create mysql/ggDocs.tab (very slow):
+    %prog docs mysql
+
+Add the "context" (aka MeSH terms) to mysql/ggLink.tmp.tab
+and write the result to mysql/ggLink.tab:
 %prog context mysql
-%prog load mysql publications
+The ggLink.tab format is:
+gene1, gene2, flags, forwDocCount, revDocCount, allDocCount, databaseList, minimalPairCountPaper, snippet
+
+Load all tables in mysql/ into MySQL:
+    %prog load mysql hgFixed
+
+Create the bigBed file:
 %prog bigBed outDir bigBedFile db
""")

parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
#parser.add_option("-t", "--test", dest="test", action="store_true", help="run tests")
parser.add_option("-t", "--textDir", dest="textDir", action="store", help="directory with the parsed copy of medline, default %default", default="/hive/data/inside/pubs/text/medline")
parser.add_option("-m", "--meshFile", dest="meshFname", action="store", help="An mtrees<year>.bin file, default %default", default="/hive/data/outside/ncbi/mesh/mtrees2015.bin")
parser.add_option("-j", "--journalInfo", dest="journalInfo", action="store", help="tab-sep file with journal info from the NLM Catalog, converted by 'pubPrepCrawl publishers'. Used to shorten the journal names. Optional and not used if the file is not found.
Default %default", default="/cluster/home/max/projects/pubs/tools/data/journals/journals.tab") parser.add_option("-b", "--wordFname", dest="wordFname", action="store", help="a file with common English words", default="/hive/data/outside/pubs/wordFrequency/bnc/bnc.txt") #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") #parser.add_option("", "--test", dest="test", action="store_true", help="do something") (options, args) = parser.parse_args() if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) # ==== FUNCTIONs ===== def parseAutoSql(asFname): " parse auto sql file and return list of field names " headers = [] for line in open(asFname): line = line.replace("(","").replace(")","").strip() if line.startswith('"') or line.startswith("\n"): continue if line.startswith("table"): continue if not ";" in line: continue parts = line.split(";")[0] parts = parts.split() assert(len(parts)>=2) headers.append(parts[1]) return headers def lineFileNext(fh, headers=None, asFname=None): """ parses tab-sep file with headers as field names , assumes that file starts with headers yields collection.namedtuples """ line1 = fh.readline() line1 = line1.strip("\n").strip("#") if headers==None and asFname!=None: headers = parseAutoSql(asFname) elif headers==None: headers = line1.split("\t") headers = [h.replace(" ", "_") for h in headers] headers = [h.replace("(", "") for h in headers] headers = [h.replace(")", "") for h in headers] Record = namedtuple('tsvRec', headers) for line in fh: line = line.rstrip("\n") fields = line.split("\t") try: rec = Record(*fields) except Exception, msg: logging.error("Exception occured while parsing line, %s" % msg) logging.error("Filename %s" % fh.name) logging.error("Line was: %s" % repr(line)) logging.error("Does number of fields match headers?") logging.error("Headers are: %s" % headers) #raise Exception("wrong field count in line %s" % line) continue # convert fields to correct data type yield rec def loadFiles(inDir, prefix=None): """ load .tab files into dict fType -> list of rows and return tuple (ppiRows, textRows) """ #typeRows = defaultdict(list) #jppiRows = list() #jtextRows = list() pairs = defaultdict(list) inFnames = glob.glob(join(inDir, "*.tab")) rows = list() for inFname in inFnames: logging.info("Loading %s" % inFname) #fType = None for row in lineFileNext(open(inFname)): if prefix!=None: row = row._replace(eventId=prefix+row.eventId) rows.append(row) return rows def getResultCounts(pairs): """ Input is a pair -> rows dictionary for each PMID, count how many pairs are assigned to it. This is something like the "resultCount" of a paper, the lower, the better. 
Then, for each pair, get the minimum resultCount and return as a dict pair -> resultCount """ logging.info("Getting low-throughput studies in PPI and pathway data") # create dict doc -> set of gene pairs docToPairs = defaultdict(set) for pair, rows in pairs.iteritems(): #print "pp", pair for row in rows: - #print "r", row #members = row.themeGenes.split("|") #members.extend(row.causeGenes.split("|")) # complexes with more than 5 proteins are not low-throughput anyways # skip these right away #if len(members)>LTPMAXMEMBERS: #continue docIds = row.pmids.split("|") for docId in docIds: if docId=="": continue docToPairs[docId].add(pair) #print "d2p", docToPairs pairMinResultCounts = {} for pair, rows in pairs.iteritems(): #print "pp", pair resultCounts = [] # get all docIds in rows docIds = [] for row in rows: docIds.extend(row.pmids.split("|")) # get the minimal resultCount of all docIds for docId in set(docIds): if docId=="": continue #print "di2", docId #print "d2p", docToPairs[docId] resCount = len(docToPairs[docId]) #print "rc", resCount resultCounts.append(resCount) if len(resultCounts)!=0: minResCount = min(resultCounts) else: minResCount = 0 #print "min", minResCount pairMinResultCounts[pair] = minResCount #ltPairs = set() #ltDocs = [] #for pmid, pairList in docToPairs.iteritems(): #if len(pairList) <= LTPMAX: #ltPairs.update(pairList) #ltDocs.append(pmid) #logging.info("Found %d low-throughput studies out of %d" % (len(ltDocs), len(pmidToPairs))) #logging.info("Found %d low-throughput interactions out of %d" % (len(ltPairs), len(pairs))) #return ltPairs, ltDocs return pairMinResultCounts, docToPairs def allSubPairs(pair): """ given a pair of two strings, where each can be a _-separate list of genes (a family), return all combinations of each member """ x, y = pair xs = x.split("_") ys = y.split("_") for subPair in [(a, b) for a in xs for b in ys]: a, b = subPair if a=="" or b=="" or a=="-" or b=="-" or \ a.startswith("compound") or b.startswith("compound"): continue yield subPair def iterAllPairs(row): """ yield all pairs of interacting genes for a given row. 
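Complexes are expanded into all pairs among their members, cause genes are crossed with theme genes, and family names (gene symbols joined by '_') are expanded member by member via allSubPairs().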
Handles families >>> list(iterAllPairs("gene", ["TP1","TP2"], "complex", ["OMG1","OMG2"])) [('OMG1', 'OMG2'), ('TP1', 'OMG1'), ('TP1', 'OMG2'), ('TP2', 'OMG1'), ('TP2', 'OMG2')] >>> list(iterAllPairs("complex", ["TP1_TEST2","TP2"], "complex", ["OMG1","OMG2"])) """ type1 = row.causeType type2 = row.themeType genes1 = set(row.causeGenes.split("|")) genes2 = set(row.themeGenes.split("|")) # all genes of complexes interact in some way if type1=="complex": for pair in itertools.combinations(genes1, 2): # a complex can contain families for subPair in allSubPairs(pair): yield tuple(subPair) if type2=="complex": for pair in itertools.combinations(genes2, 2): # a complex can contain families for subPair in allSubPairs(pair): yield tuple(subPair) # all genes from the left and the right side interact if type2!="": pairs = list([(aa, bb) for aa in genes1 for bb in genes2]) for pair in pairs: for subPair in allSubPairs(pair): gene1, gene2 = subPair if gene1=="-" or gene1=="" or gene2=="" or gene2=="-": #skipCount += 1 continue if gene1.startswith("compound") or gene2.startswith("compound"): continue yield subPair def indexPairs(ppiRows, desc): """ given rows with theme and cause genes, return a dict with sorted (gene1, gene2) -> list of eventIds """ logging.info("enumerating all interacting pairs: %s" % desc) pairs = defaultdict(list) for row in ppiRows: for pair in iterAllPairs(row): gene1, gene2 = pair if gene1.startswith("compound") or gene2.startswith("compound"): continue pairs[tuple(sorted(pair))].append(row) logging.info("got %d pairs" % len(pairs)) return pairs def mergePairs(dicts): " merge a list of defaultdict(list) into one defaultdict(set) " logging.info("Merging pairs") data = defaultdict(set) for defDict in dicts: for key, valList in defDict.iteritems(): data[key].update(valList) return data def directedPairToDocs(rows): """ get documents of text mining pairs. a dict with pair -> text rows create a dict pair -> set of document IDs. 
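For example, a row with causeGenes 'A', themeGenes 'B' and pmid 123 adds 123 to the set for the key (A, B), but not for (B, A).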
Note that these pairs are DIRECTED - so they can be used to infer the direction of the interaction """ # create a dict with pair -> pmids pairPmids = defaultdict(set) for row in rows: genes1 = set(row.causeGenes.split("|")) genes2 = set(row.themeGenes.split("|")) pairs = list([(aa, bb) for aa in genes1 for bb in genes2]) for cause, theme in pairs: pairPmids[(cause, theme)].add(row.pmid) return pairPmids def writeGraphTable(allPairs, pairDocs, pairToDbs, pairMinResCounts, pwDirPairs, bestSentences, outFname, outFname2): " write the ggLink table " logging.info("writing merged graph to %s" % outFname) rows = [] rows2 = [] for pair,pairRows in allPairs.iteritems(): gene1, gene2 = pair dbs = set() flags = [] if pair in dbPairs: flags.append("ppi") if pair in pwPairs: flags.append("pwy") if pair in textPairs: flags.append("text") refs = [row.eventId for row in pairRows] #if pairMinResultCounts: #flags.append("low") # direction of interaction - only based on pathways if pair in pwDirPairs: flags.append("fwd") if tuple(reversed(pair)) in pwDirPairs: flags.append("rev") forwDocs = pairDocs.get(pair, []) revDocs = pairDocs.get(tuple(reversed(pair)), []) allDocs = set(forwDocs).union(set(revDocs)) if len(allDocs)<MINSUPP and "pwy" not in flags and "ppi" not in flags: # if it's text-mining only and less than X documents, just skip it continue pairMinResCount = pairMinResCounts.get(pair, 0) pairDbs = "|".join(pairToDbs.get(pair, [])) snippet = bestSentences.get(pair, "") row = [gene1, gene2, ",".join(flags), str(len(forwDocs)), str(len(revDocs)), \ str(len(allDocs)), pairDbs, str(pairMinResCount), snippet] rows.append(row) refs = list(refs) refs.sort() for ref in refs: #row2 = [gene1, gene2, ",".join(refs)] row = [gene1, gene2, ref] rows2.append(row) ofh = open(outFname, "w") rows.sort() for row in rows: ofh.write("\t".join(row)) ofh.write("\n") ofh.close() ofh2 = open(outFname2, "w") rows2.sort() for row in rows2: ofh2.write("\t".join(row)) ofh2.write("\n") ofh2.close() def runCmd(cmd): """ run command in shell, exit if not successful """ msg = "Running shell command: %s" % cmd logging.debug(msg) ret = os.system(cmd) if ret!=0: raise Exception("Could not run command (Exitcode %d): %s" % (ret, cmd)) return ret def asToSql(table, sqlDir): " given a table name, return the name of a .sql file with CREATE TABLE for it" asPath = join(autoSqlDir, table+".as") #tempBase = tempfile.mktemp() outBase = join(sqlDir, table) cmd = "autoSql %s %s" % (asPath, outBase) runCmd(cmd) #sql = open("%s.sql" % sqlFname).read() # delete the files that are not needed #assert(len(tempBase)>5) # paranoia check #cmd = "rm -f %s.h %s.c" % (tempBase, tempBase) #runCmd(cmd) return outBase+".sql" def loadTable(db, tableDir, table): " load table into mysql, using autoSql " #sqlFname = join(tableDir, table+".sql") tmpSqlFname = asToSql(table, tableDir) tabFname = join(tableDir, table+".tab") cmd = "hgLoadSqlTab %s %s %s %s" % (db, table, tmpSqlFname, tabFname) try: runCmd(cmd) except: # make sure that the temp file gets deleted os.remove(tmpSqlFname) raise os.remove(tmpSqlFname) def hgsql(db, query): assert('"' not in sql) cmd = "hgsql %s -NBe '%s'" % (db, query) def addIndexes(db): " add the indexes for mysql " query = "ALTER TABLE ggLinkEvent ADD INDEX gene12Idx (gene1, gene2);" hgsql(db, query) query = "ALTER TABLE ggEventText ADD INDEX docIdIdx (docId);" hgsql(db, query) def loadTables(tableDir, db): " load graph tables into mysql " loadTable(db, tableDir, "ggDoc") loadTable(db, tableDir, "ggDocEvent") loadTable(db, tableDir, 
"ggEventDb") loadTable(db, tableDir, "ggEventText") loadTable(db, tableDir, "ggLink") loadTable(db, tableDir, "ggLinkEvent") addIndexes(db) def indexPmids(rowList, textRows): " return dict pmid -> list of event Ids " pmidToIds = defaultdict(set) for rows in rowList: for row in rows: pmidStr = row.pmids if pmidStr=="": continue pmids = pmidStr.split("|") rowId = row.eventId for pmid in pmids: if pmid=="": continue pmidToIds[pmid].add(rowId) for row in textRows: if row.pmid=="": continue pmidToIds[row.pmid].add(row.eventId) return pmidToIds def writeDocEvents(pmidToId, outFname): " write a table with PMID -> list of event Ids " logging.info("Writing docId-eventId %s" % outFname) ofh = open(outFname, "w") for docId, eventIds in pmidToId.iteritems(): eventIds = sorted(list(eventIds)) for eventId in eventIds: ofh.write("%s\t%s\n" % (docId, eventId)) ofh.close() def writeEventTable(rowList, outFname, colCount=None): " write the event table with event details " logging.info("Writing events to %s" % outFname) ofh = open(outFname, "w") for rows in rowList: for row in rows: if colCount: row = row[:colCount] ofh.write("%s\n" % ("\t".join(row))) ofh.close() def pairToDbs(pairs): """ given pairs and data rows, return a dict pair -> int that indicates how many DBs a pair is referenced in """ # first make dict event -> source dbs #eventDbs = defaultdict(set) #for row in pwRows: #eventDbs[row.eventId].add(row.sourceDb) #for row in dbRows: #sourceDbs = row.sourceDbs.split("|") #eventDbs[row.eventId].update(sourceDbs) # construct a dict pair -> source dbs pairDbs = defaultdict(set) for pair, rows in pairs.iteritems(): for row in rows: pairDbs[pair].add(row.sourceDb) return pairDbs def parseMeshContext(fname): " given a medline trees file, return the list of disease and pathway names in it " # ex. 
filename is mtrees2013.bin (it's ascii) # WAGR Syndrome;C10.597.606.643.969 terms = set() lines = open(fname) for line in lines: line = line.strip() term, code = line.split(";") term = term.strip() # all disease terms start with C if code.startswith("C"): terms.add(term) # all signalling pathways a below a very specific branch elif code.startswith("G02.149.115.800") and not code=="G02.149.115.800": terms.add(term) logging.info("Read %d disease/context MESH terms from %s" % (len(terms), fname)) return terms def getDirectedPairs(pwRows): " get the set of directed gene pairs from the rows, keep the direction " pairs = set() for row in pwRows: for pair in iterAllPairs(row): pairs.add(pair) return pairs def writeAllDocInfo(textDir, outFname): " get all author/year/journal/title as tab-sep from a pubtools-style input directory, ~5GB big " mask = join(textDir, "*.articles.gz") ofh = open(outFname, "w") fnames = glob.glob(mask) doneDocs = set() for i, fname in enumerate(fnames): if i % 10 == 0: logging.info("%d out of %d files" % (i, len(fnames))) for row in lineFileNext(gzip.open(fname)): # skip duplicates if row.pmid in doneDocs: continue doneDocs.add(row.pmid) if row.year.isdigit() and int(row.year)>1975: newRow = (row.pmid, row.authors, row.year, row.journal, row.printIssn, \ row.title, row.abstract, row.keywords) ofh.write("\t".join(newRow)) ofh.write("\n") ofh.close() logging.info("Article info written to %s" % outFname) def parseShortNames(journalFname): # get dict ISSN -> short name shortNames = {} if isfile(journalFname): for row in lineFileNext(open(journalFname)): if row.medlineTA!="" and row.pIssn!="": shortNames[row.pIssn] = row.medlineTA logging.info("Read a short journal name for %d ISSNs from %s" % (len(shortNames), journalFname)) else: logging.info("%s not found, not shortening journal names" % journalFname) return shortNames def writeDocsTable(pmidEventPath, medlinePath, shortNames, contextFilter, resCounts, outFname): """ join pmid-Event info and our shortened medline version resCount is a set of docIds with low-throughput data (fewer than LTPMAX interactions per doc) """ # parse the PMIDs to export docIds = set() for row in lineFileNext(open(pmidEventPath), headers=["docId", "eventId"]): docIds.add(row.docId) logging.info("read %d document IDs from %s" % (len(docIds), pmidEventPath)) docContexts = {} logging.info("Writing to %s" % outFname) ofh = open(outFname, "w") # fields are: docId, authors, year, journal, printIssn, title, abstract, keywords foundIds = set() for line in open(medlinePath): fields = line.rstrip("\n").split("\t") docId = fields[0] if docId in docIds: issn = fields[4] shortName = shortNames.get(issn) if shortName!=None: fields[3] = shortName newKeywords = [] for kw in fields[7].split("/"): if kw in contextFilter: newKeywords.append(kw) docContext = "|".join(newKeywords) fields[7] = docContext if docContext!="": docContexts[docId] = docContext # add a field: how many gene-pairs are associated to this paper fields.append(resCounts.get(docId, "0")) line = "\t".join(fields)+"\n" ofh.write(line) foundIds.add(docId) ofh.close() notFoundIds = docIds - foundIds logging.info("No info for %d documents" % len(notFoundIds)) logging.debug("No info for these documents: %s" % notFoundIds) return docContexts def sumBasic(sentences, commonWords): """ given probabilities of words, rank sentences by average prob (removing commonWords). 
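Each sentence is scored as the mean of p(w) over its non-common words w, where p(w) is the relative frequency of w across all candidate sentences (each word counted at most once per sentence).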
Sentences is a list of list of words Algorithm is described in http://ijcai.org/papers07/Papers/IJCAI07-287.pdf Returns sentence with highest score and shortest length, if several have a highest score """ if len(sentences)==0: return "" sentWordsList = [set(wordRe.findall(sentence)) for sentence in sentences] words = list(chain.from_iterable(sentWordsList)) wordProbs = {word: float(count)/len(words) for word, count in Counter(words).items()} scoredSentences = [] for sentWords, sentence in zip(sentWordsList, sentences): mainWords = sentWords - commonWords if len(mainWords)==0: continue avgProb = sum([wordProbs[word] for word in mainWords]) / len(mainWords) scoredSentences.append((avgProb, sentence, sentWords)) # happens rarely: all words are common English words if len(scoredSentences)==0: return "" # get sentences with equally good top score scoredSentences.sort(key=operator.itemgetter(0), reverse=True) topScore = scoredSentences[0][0] topSents = [(sent, words) for score, sent, words in scoredSentences if score >= topScore] # sort these by length and pick shortest one topSentLens = [(len(s), s, w) for s, w in topSents] topSentLens.sort(key=operator.itemgetter(0)) topLen, topSent, topWords = topSentLens[0] # update word frequencies for word in topWords: wordProbs[word] *= wordProbs[word] return topSent def runSumBasic(textPairs, wordFname): """ Get all sentences for an interaction and use sumBasic to pick the best one """ # get list of very common English words bncWords = set([line.split()[0] for line in open(wordFname).read().splitlines()]) logging.info("Loaded %d common English words from %s" % (len(bncWords), wordFname)) logging.info("Running SumBasic on sentences") bestSentences = {} for pair, rows in textPairs.iteritems(): sentSet = set() for row in rows: if row.sentence!="": sentSet.add(row.sentence) sentences = list(sentSet) bestSentences[pair] = sumBasic(sentences, bncWords) return bestSentences def readDictList(fname, reverse=False): " read a key-value tab-sep table and return as a dict of key -> values" logging.info("reading %s" % fname) data = defaultdict(list) for line in open(fname): key, val = line.rstrip("\n").split("\t") if reverse: key, val = val, key data[key].append(val) return data def readDict(fname, reverse=False): " read a key-value tab-sep table and return as a dict of key -> value" logging.info("reading %s" % fname) data = defaultdict(list) for line in open(fname): key, val = line.rstrip("\n").split("\t") if reverse: key, val = val, key data[key] = val return data def readPairEvent(fname): """ read a tab-sep table in format (gene1, gene2, eventId) and return as dict (gene1,gene2) -> list of eventId""" logging.info("reading %s" % fname) data = defaultdict(list) for line in open(fname): gene1, gene2, val = line.rstrip("\n").split("\t") data[(gene1, gene2)].append(val) return data def addContext(ctFname, docEventFname, linkEventFname, linkFname): " read the data from the first three files, and put it into the last field of linkFname " docContext = readDict(ctFname) eventDocs = readDictList(docEventFname, reverse=True) pairEvents = readPairEvent(linkEventFname) logging.info("Reading %s" % linkFname) newLines = [] for line in open(linkFname): #print line fields = line.rstrip("\n").split("\t") pair = (fields[0], fields[1]) contextCounts = Counter() for eventId in pairEvents[pair]: #print "pair %s, event %s" % (pair, eventId) for docId in eventDocs.get(eventId, []): #print "doc %s" % docId contexts = docContext.get(docId, "") for context in contexts.split("|"): #print 
"context %s" % context if context=="" or context==" ": continue contextCounts[context]+=1 # take best three contexts and reformat as a string suffix = "" if len(contextCounts)>3: suffix = "..." bestContexts = contextCounts.most_common() contextStrings = ["%s (%d)" % (ct, count) for ct, count in bestContexts] contextStr = ", ".join(contextStrings) fields.append(contextStr) newLines.append("\t".join(fields)) return newLines def convGraph(outDir): import networkx as nx fname = join(outDir, "ggLink.tab") G=nx.Graph() for line in open(fname): g1, g2 = line.split()[:2] G.add_edge(g1, g2) outFname = join(outDir, "graph.bin") #nx.write_adjlist(G, outFname) cPickle.dump(G, open(outFname, "w"), cPickle.HIGHEST_PROTOCOL) logging.info("Wrote graph to %s" % outFname) def convGraph2(outDir): import igraph as ig fname = join(outDir, "ggLink.tab") G=ig.Graph() geneToId = {} nextId = 0 edges = [] for line in open(fname): g1, g2 = line.split()[:2] if g1 not in geneToId: id1 = geneToId[g1] = nextId nextId += 1 else: id1 = geneToId[g1] if g2 not in geneToId: id2 = geneToId[g2] = nextId nextId += 1 else: id2 = geneToId[g2] edges.append( (id1, id2) ) #print nextId #print edges G.add_vertices(nextId+1) G.add_edges(edges) #outFname = join(outDir, "graph2.bin") #nx.write_adjlist(G, outFname) #cPickle.dump(G, open(outFname, "w"), cPickle.HIGHEST_PROTOCOL) #logging.info("Wrote graph to %s" % outFname) outFname = join(outDir, "graph.lgl") G.write(outFname, "lgl") logging.info("Wrote graph to %s" % outFname) outFname = join(outDir, "graph.genes.txt") ofh = open(outFname, "w") for gene, geneId in geneToId.iteritems(): ofh.write("%s\t%s\n" % (gene, geneId)) ofh.close() logging.info("Wrote nodeId-symbol mapping to %s" % outFname) #outFname = join(outDir, "graph.adj") #G.write(outFname, "adjacency") #outFname = join(outDir, "graph.leda") #G.write(outFname, "leda") #outFname = join(outDir, "graph.dot") #G.write(outFname, "dot") #outFname = join(outDir, "graph.pajek") #G.write(outFname, "pajek") #logging.info("Wrote graph to %s" % outFname) def convGraph3(outDir): """ convert graph to a compact format gene -> list of connected genes """ fname = join(outDir, "ggLink.tab") geneLinks = defaultdict(set) #genes = set() # format: gene1, gene2, flags, forwDocs, revDocs, allDocs, pairDbs, pairMinResCount, snippet for line in open(fname): fields = line.split("\t") g1, g2, flags = fields[:3] #flags = flags.split(",") pairMinResCount = int(fields[7]) docCount = int(fields[5]) # require many documents or manually curated database if docCount < 3 and not "ppi" in flags and not "pwy" in flags: continue #pairDbs = fields[6] # ignore all interactions that are derived from papers # that reported more than 100 interactions, e.g. 
big complexes if pairMinResCount > 100: continue #geneLinks[g1].add((g2, flags)) #geneLinks[g2].add((g1, flags)) #geneLinks[g1].add(g2) #geneLinks[g2].add(g1) geneLinks[g1].add((g2, docCount)) geneLinks[g2].add((g1, docCount)) #genes.add(g1) #genes.add(g2) geneLinks = dict(geneLinks) # rewrite to dict str -> list #geneLinks = {k:list(v) for k,v in geneLinks.iteritems()} # map gene -> integer #geneToId = {} #for geneId, gene in enumerate(genes): #geneToId[gene] = geneId # this is only slightly slower - probably best format #outFname = join(outDir, "graph.txt") #ofh = open(outFname, "w") #for gene, neighbors in geneLinks.iteritems(): #ofh.write("%s\t%s\n" % (gene, ",".join(neighbors))) #ofh.close() #logging.info("Wrote links to %s" % outFname) # this is not faster #intLinks = {} #for gene, neighbors in geneLinks.iteritems(): #intLinks[geneToId[gene]] = [geneToId[n] for n in neighbors] # fastest when taking into account set-building time outFname = join(outDir, "graph.marshal") marshal.dump(geneLinks, open(outFname, "w"), 2) # prot 0, 1 not faster logging.info("Wrote links to %s" % outFname) #outFname = join(outDir, "graph.ujson") #ujson.dump(geneLinks, open(outFname, "w")) #logging.info("Wrote json strings links to %s" % outFname) #outFname = join(outDir, "graphInt.marshal") #idToGene = {v: k for k, v in geneToId.iteritems()} #data = (idToGene, intLinks) #marshal.dump(data, open(outFname, "w"), 2) #logging.info("Wrote integer links to %s" % outFname) #outFname = join(outDir, "graphInt.ujson") #ujson.dump(data, open(outFname, "w")) #logging.info("Wrote json integer links to %s" % outFname) def loadGraph(outDir): import networkx as nx inFname = join(outDir, "graph.bin") gc.disable() # no need for GC here, saves 2 seconds G = cPickle.load(open(inFname)) gc.enable() geneList = ["OTX2", "PITX2", "APOE", "TP53", "TNF", "SP1"] foundPairs = set() G2 = G.subgraph(geneList) print nx.nodes(G) #for g1, g2 in combinations(geneList, 2): #path = nx.shortest_path(G, g1, g2) #print path #for i in range(0, len(path)-1): #pair = tuple(sorted(path[i:i+2])) #foundPairs.add(pair) #print foundPairs def parseGeneToId(inFname): geneToId = {} idToGene = {} for l in open(inFname): gene, geneId = l.rstrip("\n").split("\t") geneId = int(geneId) geneToId[gene] = geneId idToGene[geneId] = gene return geneToId, idToGene def loadGraph2(outDir): import igraph as ig inFname = join(outDir, "graph.lgl") #gc.disable() # no need for GC here, saves 2 seconds #G = cPickle.load(open(inFname)) #gc.enable() #G=ig.Graph() G = ig.load(inFname) #print G inFname = join(outDir, "graph.genes.txt") geneToId, idToGene = parseGeneToId(inFname) geneList = ["OTX2", "PITX2", "APOE", "TP53", "TNF", "SP1","ABCA1", "CD4", "BRCA2", "APP", "SRY", "GAST", "MYOD1"] idList = [geneToId[g] for g in geneList] allSyms = set() for i in range(0, len(idList)-1): fromId = idList[i] fromGene = geneList[i] toIds = idList[i+1:] toGenes = geneList[i+1:] print fromId, fromGene, toIds, toGenes paths = G.get_shortest_paths(fromId, toIds, mode=ig.ALL) genePaths = [] for idPath in paths: genePath = [idToGene[i] for i in idPath] genePaths.append(genePath) allSyms.update(genePath) print "paths", genePaths print ",".join(allSyms) #for i in range(0, len(path)-1): #pair = tuple(sorted(path[i:i+2])) #foundPairs.add(pair) #print foundPairs #geneList = ["OTX2", "PITX2", "APOE", "TP53", "TNF", "SP1"] #G2 = G.subgraph(geneList) def loadGraph3(outDir): inFname = join(outDir, "graph.marshal") #idToGene, idLinks = marshal.load(open(inFname)) #idToGene, idLinks = 
ujson.load(open(inFname)) #geneLinks = ujson.load(open(inFname)) geneLinks = marshal.load(open(inFname)) return geneLinks #idToGene = {int(k):v for k,v in idToGene.iteritems()} #geneLinks = {} #for geneId, linkedIds in idLinks.iteritems(): #geneLinks[idToGene[int(geneId)]] = [idToGene[i] for i in linkedIds] # reversing the list was 25% slower than reading it all from disk # 1.1 seconds, so 20% slower than the marshal version #inFname = join(outDir, "graph.links.txt") #ofh = open(outFname, "w") #graph = {} #for line in open(inFname): #gene, neighbors = line.rstrip("\n").split("\t") #neighbors = set(neighbors.split(",")) #graph[gene] = neighbors ##logging.info("Wrote links to %s" % outFname) def parseLinkTargets(outDir, validSyms): """ parse the ggLink table in outDir and return a dict gene -> Counter() of targetGenes -> count. Count is either the article count or, if there is no text mining hit, the count of databases """ errFh = open("ggLink.errors.tab", "w") inFname = join(outDir, "ggLink.tab") logging.info("Parsing %s" % inFname) asPath = join(autoSqlDir, "ggLink.as") targets = defaultdict(Counter) for row in lineFileNext(open(inFname), asFname=asPath): gene1, gene2 = row.gene1, row.gene2 count = int(row.docCount) if count==0: count = len(row.dbList.split("|")) if gene1 not in validSyms: if gene2 not in validSyms: errFh.write("BothSymsInvalid\t"+"\t".join(row)+"\n") else: errFh.write("sym1Invalid\t"+"\t".join(row)+"\n") if gene2 not in validSyms: errFh.write("sym2Invalid\t"+"\t".join(row)+"\n") targets[gene1][gene2]=count targets[gene2][gene1]=count errFh.close() logging.info("Wrote rows from ggLink.tab with invalid symbols to ggLink.errors.tab") return targets def makeBigBed(inDir, outDir, bedFname, db): " create a file geneInteractions.<db>.bb in outDir from bedFname " validSyms = set() for line in open(bedFname): sym = line.split("\t")[3].rstrip("\n") validSyms.add(sym) # get interactors from our ggLink table geneCounts = parseLinkTargets(inDir, validSyms) # get genes from knownGenes table and write to bed #bedFname = join(outDir, "genes.%s.bed" % db) #logging.info("Writing genes to %s" % bedFname) #cmd = "hgsql %s -NBe 'select chrom, chromStart, chromEnd, geneSymbol from knownCanonical JOIN kgXref ON kgId=transcript' > %s" % (db, bedFname) #runCmd(cmd) #bedFname = "geneModels/gencode19.median.bed" # rewriting bed file and fill with counts bedOutFname = join(outDir, "geneInteractions.%s.bed" % db) ofh = open(bedOutFname, "w") logging.info("Rewriting %s to %s" % (bedFname, bedOutFname)) doneSymbols = set() for line in open(bedFname): row = line.rstrip("\n").split("\t") gene = row[3] counts = geneCounts.get(gene, None) if counts==None: # skip gene if not found continue # create the new name field docCount = 0 strList = [] geneCount = 0 for targetGene, count in counts.most_common(): #strList.append("%s:%d" % (targetGene, count)) if geneCount < 10: strList.append("%s" % (targetGene)) docCount += count geneCount += 1 score = min(docCount, 1000) targetGenes = ",".join(strList) row[3] = gene+": "+targetGenes # why a space? 
see linkIdInName trackDb statement row.append( str(score) ) row.append(".") row.append(row[1]) row.append(row[2]) if docCount > 100: color = "0,0,0" elif docCount > 10: color = "0,0,128" else: color = "173,216,230" row.append(color) ofh.write("\t".join(row)) ofh.write("\n") doneSymbols.add(gene) ofh.close() missingSyms = set(geneCounts) - set(doneSymbols) logging.info("%d symbols in ggLink not found in BED file" % len(missingSyms)) logging.info("missing symbols written to missSym.txt") ofh= open("missSym.txt", "w") ofh.write("\n".join(missingSyms)) ofh.close() cmd = "bedSort %s %s" % (bedOutFname, bedOutFname) runCmd(cmd) bbFname = join(outDir, "geneInteractions.%s.bb" % db) chromSizeFname = "/hive/data/genomes/%s/chrom.sizes" % db cmd = "bedToBigBed -tab %s %s %s" % (bedOutFname, chromSizeFname, bbFname) runCmd(cmd) logging.info("bigBed file written to %s" % bbFname) def findBestPaths(genes, geneLinks): " find best paths of max length 2 between genes using geneLinks. return pairs. " pairs = set() links = defaultdict(list) # dict (from, to) -> list of (docCountSum, path) for gene1 in genes: # search at distance 1 for gene2, docCount2 in geneLinks.get(gene1, []): if gene2 in genes and gene2!=gene1: # stop if found print "%s-%s" % (gene1, gene2) pairs.add( tuple(sorted((gene1, gene2))) ) links[ (gene1, gene2) ].append( (docCount2, [gene1, gene2]) ) continue # search at distance 2 for gene3, docCount3 in geneLinks.get(gene2, []): if gene3 in genes and gene3!=gene2 and gene3!=gene1: # distance = 2 print "%s-%s-%s" % (gene1, gene2, gene3) pairs.add( tuple(sorted((gene1, gene2))) ) pairs.add( tuple(sorted((gene2, gene3))) ) links[ (gene1, gene3) ].append( ((docCount2+docCount3)/2, [gene1, gene2, gene3]) ) for genePair, paths in links.iteritems(): paths.sort(reverse=True) print genePair, paths return pairs # ----------- MAIN -------------- #if options.test: #import doctest #doctest.testmod() #sys.exit(0) if args==[]: parser.print_help() exit(1) cmd = args[0] if cmd == "build": wordFname = options.wordFname pathwayDir, dbDir, textDir, outDir = args[1:] # load the input files into memory dbRows = loadFiles(dbDir, prefix="ppi_") pwRows = loadFiles(pathwayDir) textRows = loadFiles(textDir) # index and merge them dbPairs = indexPairs(dbRows, "ppi databases") pwPairs = indexPairs(pwRows, "pathways") textPairs = indexPairs(textRows, "text mining") pwDirPairs = getDirectedPairs(pwRows) curatedPairs = mergePairs([dbPairs, pwPairs]) pairMinResultCounts, docToPairs = getResultCounts(curatedPairs) bestSentences = runSumBasic(textPairs, wordFname) allPairs = mergePairs([curatedPairs, textPairs]) #ltPairs, ltDocs = getResultCounts(curatedPairs) # keep result counts for the "docs" step ofh = open(join(outDir, "resultCounts.tmp.txt"), "w") for docId, pairs in docToPairs.iteritems(): ofh.write("%s\t%d\n" % (docId, len(pairs))) ofh.close() pairDirDocs = directedPairToDocs(textRows) pairDbs = pairToDbs(curatedPairs) outFname = join(outDir, "ggLink.tmp.txt") # needs the addContext step to complete it eventFname = join(outDir, "ggLinkEvent.tab") writeGraphTable(allPairs, pairDirDocs, pairDbs, pairMinResultCounts, pwDirPairs, \ bestSentences, outFname, eventFname) pmidToId = indexPmids([dbRows,pwRows], textRows) outFname = join(outDir, "ggDocEvent.tab") writeDocEvents(pmidToId, outFname) outFname = join(outDir, "ggEventDb.tab") writeEventTable([dbRows, pwRows], outFname, colCount=13) outFname = join(outDir, "ggEventText.tab") writeEventTable([textRows], outFname) # make sure we don't forget to update the link table 
with context linkFname = join(outDir, "ggLink.tab") if isfile(linkFname): os.remove(linkFname) elif cmd == "medline": outDir = args[1] textDir = options.textDir medlineFname = join(outDir, allArtFname) writeAllDocInfo(textDir, medlineFname) elif cmd == "docs": outDir = args[1] outFname = join(outDir, "ggDoc.tab") pmidEventPath = join(outDir, "ggDocEvent.tab") medlineFname = join(outDir, allArtFname) meshTerms = parseMeshContext(options.meshFname) shortNames = parseShortNames(options.journalInfo) resCountFname = join(outDir, "resultCounts.tmp.txt") resCounts = readDict(resCountFname) docContext = writeDocsTable(pmidEventPath, medlineFname, shortNames, meshTerms, resCounts, outFname) # write docContext to file ctFname = join(outDir, "docContext.txt") ofh = open(ctFname, "w") for docId, context in docContext.iteritems(): ofh.write("%s\t%s\n" % (docId, context)) ofh.close() logging.info("Written document contexts to %s for %d documents" % (ctFname, len(docContext))) elif cmd == "context": outDir = args[1] ctFname = join(outDir, "docContext.txt") docEventFname = join(outDir, "ggDocEvent.tab") linkEventFname = join(outDir, "ggLinkEvent.tab") linkFname = join(outDir, "ggLink.tmp.txt") newLines = addContext(ctFname, docEventFname, linkEventFname, linkFname) outFname = join(outDir, "ggLink.tab") ofh = open(outFname, "w") for l in newLines: ofh.write("%s\n" % l) ofh.close() logging.info("appended document context to %s" % outFname) elif cmd == "bigBed": inDir = args[1] outDir = args[2] geneBedFile = args[3] db = args[4] makeBigBed(inDir, outDir, geneBedFile, db) elif cmd == "load": inDir = args[1] db = args[2] loadTables(inDir, db) # --- DEBUGGING / TESTING ---- elif cmd=="sumBasic": # for debugging inFname = args[1] rows = [] for row in lineFileNext(open(inFname)): rows.append(row) textPairs = indexPairs(rows, "text mining") for pair, sent in runSumBasic(textPairs, options.wordFname).iteritems(): print pair, sent elif cmd == "graph": outDir = args[1] convGraph(outDir) elif cmd == "graph2": outDir = args[1] convGraph2(outDir) elif cmd == "graph3": outDir = args[1] convGraph3(outDir) elif cmd == "loadgraph": outDir = args[1] loadGraph(outDir) elif cmd == "load2": outDir = args[1] loadGraph2(outDir) elif cmd == "load3": outDir = args[1] loadGraph3(outDir) elif cmd == "subnet": outDir, geneFile = args[1:] geneLinks = loadGraph3(outDir) genes = set(open(geneFile).read().splitlines()) print len(findBestPaths(genes, geneLinks)) else: logging.error("unknown command %s" % cmd)
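# --- Illustrative sketch (not part of this script): reading graph.marshal ---
# A minimal example of how the gene -> set of (neighborGene, docCount) dict written by
# convGraph3() could be inspected from another Python 2.7 script. The path and the gene
# symbol below are placeholders for illustration only.
#
#   import marshal
#   geneLinks = marshal.load(open("mysql/graph.marshal", "rb"))
#   neighbors = sorted(geneLinks.get("TP53", []), key=lambda x: x[1], reverse=True)
#   for gene, docCount in neighbors[:10]:
#       print "%s\t%d" % (gene, docCount)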