3b2e14c90156120b53c5031856842de0c57ca58c
max
  Tue May 30 17:34:02 2017 -0700
gene interactions track: add to hgGene, patch GBIC/GBIB for it, refs #13634

diff --git src/utils/ggTables src/utils/ggTables
index 7dbe660..c79207d 100755
--- src/utils/ggTables
+++ src/utils/ggTables
@@ -289,30 +289,31 @@
     pairPmids = defaultdict(set)
     for row in rows:
         genes1 = set(row.causeGenes.split("|"))
         genes2 = set(row.themeGenes.split("|"))
         pairs = list([(aa, bb) for aa in genes1 for bb in genes2])
         for cause, theme in pairs:
             pairPmids[(cause, theme)].add(row.pmid)
     
     return pairPmids
 
 def writeGraphTable(allPairs, pairDocs, pairToDbs, pairMinResCounts, pwDirPairs, bestSentences, outFname, outFname2):
     " write the ggLink table "
     logging.info("writing merged graph to %s" % outFname)
     rows = []
     rows2 = []
+    allSyms = set()
     for pair,pairRows in allPairs.iteritems():
         gene1, gene2 = pair
 
         dbs = set()
         flags = []
         if pair in dbPairs:
             flags.append("ppi")
         if pair in pwPairs:
             flags.append("pwy")
         if pair in textPairs:
             flags.append("text")
         refs = [row.eventId for row in pairRows]
         #if pairMinResultCounts:
             #flags.append("low")
         # direction of interaction - only based on pathways
@@ -324,51 +325,56 @@
         forwDocs = pairDocs.get(pair, [])
         revDocs = pairDocs.get(tuple(reversed(pair)), [])
         allDocs = set(forwDocs).union(set(revDocs))
 
         if len(allDocs)<MINSUPP and "pwy" not in flags and "ppi" not in flags:
             # if it's text-mining only and less than X documents, just skip it
             continue
         pairMinResCount = pairMinResCounts.get(pair, 0)
 
         pairDbs = "|".join(pairToDbs.get(pair, []))
         snippet = bestSentences.get(pair, "")
         row = [gene1, gene2, ",".join(flags), str(len(forwDocs)), str(len(revDocs)), \
             str(len(allDocs)), pairDbs, str(pairMinResCount), snippet]
         rows.append(row)
 
+        allSyms.add(gene1)
+        allSyms.add(gene2)
+
         refs = list(refs)
         refs.sort()
         for ref in refs:
             #row2 = [gene1, gene2, ",".join(refs)]
             row = [gene1, gene2, ref]
             rows2.append(row)
 
     ofh = open(outFname, "w")
     rows.sort()
     for row in rows:
         ofh.write("\t".join(row))
         ofh.write("\n")
     ofh.close()
 
     ofh2 = open(outFname2, "w")
     rows2.sort()
     for row in rows2:
         ofh2.write("\t".join(row))
         ofh2.write("\n")
     ofh2.close()
 
+    return allSyms
+
 def runCmd(cmd):
     """ run command in shell, exit if not successful """
     msg = "Running shell command: %s" % cmd
     logging.debug(msg)
     ret = os.system(cmd)
     if ret!=0:
         raise Exception("Could not run command (Exitcode %d): %s" % (ret, cmd))
     return ret
 
 def asToSql(table, sqlDir):
     " given a table name, return the name of a .sql file with CREATE TABLE for it"
 
     asPath = join(autoSqlDir, table+".as")
     #tempBase = tempfile.mktemp()
     outBase = join(sqlDir, table)
@@ -402,30 +408,31 @@
 def hgsql(db, query):
     assert('"' not in sql)
     cmd = "hgsql %s -NBe '%s'" % (db, query)
 
 def addIndexes(db):
     " add the indexes for mysql "
     query = "ALTER TABLE ggLinkEvent ADD INDEX gene12Idx (gene1, gene2);"
     hgsql(db, query)
 
     query = "ALTER TABLE ggEventText ADD INDEX docIdIdx (docId);"
     hgsql(db, query)
 
 def loadTables(tableDir, db):
     " load graph tables into mysql "
 
+    loadTable(db, tableDir, "ggSymbol")
     loadTable(db, tableDir, "ggDoc")
     loadTable(db, tableDir, "ggDocEvent")
     loadTable(db, tableDir, "ggEventDb")
     loadTable(db, tableDir, "ggEventText")
     loadTable(db, tableDir, "ggLink")
     loadTable(db, tableDir, "ggLinkEvent")
 
     addIndexes(db)
 
 def indexPmids(rowList, textRows):
     " return dict pmid -> list of event Ids "
     pmidToIds = defaultdict(set) 
     for rows in rowList:
         for row in rows:
             pmidStr = row.pmids
@@ -1134,48 +1141,54 @@
     bestSentences = runSumBasic(textPairs, wordFname)
     allPairs = mergePairs([curatedPairs, textPairs])
 
     #ltPairs, ltDocs = getResultCounts(curatedPairs)
     # keep result counts for the "docs" step
     ofh = open(join(outDir, "resultCounts.tmp.txt"), "w")
     for docId, pairs in docToPairs.iteritems():
         ofh.write("%s\t%d\n" % (docId, len(pairs)))
     ofh.close()
 
     pairDirDocs = directedPairToDocs(textRows)
     pairDbs = pairToDbs(curatedPairs)
 
     outFname = join(outDir, "ggLink.tmp.txt") # needs the addContext step to complete it
     eventFname = join(outDir, "ggLinkEvent.tab")
-    writeGraphTable(allPairs, pairDirDocs, pairDbs, pairMinResultCounts, pwDirPairs, \
+    allSyms = writeGraphTable(allPairs, pairDirDocs, pairDbs, pairMinResultCounts, pwDirPairs, \
         bestSentences, outFname, eventFname)
 
     pmidToId = indexPmids([dbRows,pwRows], textRows)
     outFname = join(outDir, "ggDocEvent.tab")
     writeDocEvents(pmidToId, outFname)
 
     outFname = join(outDir, "ggEventDb.tab")
     writeEventTable([dbRows, pwRows], outFname, colCount=13)
 
     outFname = join(outDir, "ggEventText.tab")
     writeEventTable([textRows], outFname)
 
     # make sure we don't forget to update the link table with context
     linkFname = join(outDir, "ggLink.tab")
     if isfile(linkFname):
         os.remove(linkFname)
 
+    # hgGene does not like it if the gene symbols are in two different
+    # columns, so we create a very simple table with just the gene symbols
+    symFname = join(outDir, "ggSymbol.tab")
+    logging.info("Writing %s" % symFname)
+    open(symFname, "w").write("\n".join(allSyms))
+
 elif cmd == "medline":
     outDir = args[1]
     textDir = options.textDir
     medlineFname = join(outDir, allArtFname)
     writeAllDocInfo(textDir, medlineFname)
 
 elif cmd == "docs":
     outDir = args[1]
     outFname = join(outDir, "ggDoc.tab")
     pmidEventPath = join(outDir, "ggDocEvent.tab")
 
     medlineFname = join(outDir, allArtFname)
     meshTerms = parseMeshContext(options.meshFname)
 
     shortNames = parseShortNames(options.journalInfo)