96d63c692fc15357af3800b6a0c363a18d33db89 max Sat Jun 3 08:45:23 2017 -0700 adding support and converter for the openbel dataset, refs #13634 diff --git src/utils/ggTables src/utils/ggTables index c79207d..68e1710 100755 --- src/utils/ggTables +++ src/utils/ggTables @@ -1,24 +1,26 @@ #!/usr/bin/env python2.7 import logging, sys, optparse, glob, itertools, os, tempfile, gzip, operator, re, cPickle, gc import marshal from collections import defaultdict, namedtuple, Counter from itertools import chain, combinations from os.path import join, basename, dirname, isfile, expanduser import ujson +# TODO: change indexPairs so it corrects synonyms to the current gene symbol + # saves 20% of time when loading graph marshal import gc gc.disable() #LTPMAXMEMBERS=5 # maximum number of proteins in a complex for an interaction to quality for low throughput LTPMAX=5 # maximum number of interactions a PMID can have to be declared low throughput # don't even write out links with less than this number of documents. 2 = weeds out many false positives. # not using right now, because a few pathway databases do not annotate ANY PMID. Increasing this filter would remove # all interactions from these pathways databases. MINSUPP=0 outFields = ["gene1", "gene2", "flags", "refs", "fwWeight", "revWeight", "snip"] # directory with autoSql descriptions of output tables @@ -394,41 +396,44 @@ #sqlFname = join(tableDir, table+".sql") tmpSqlFname = asToSql(table, tableDir) tabFname = join(tableDir, table+".tab") cmd = "hgLoadSqlTab %s %s %s %s" % (db, table, tmpSqlFname, tabFname) try: runCmd(cmd) except: # make sure that the temp file gets deleted os.remove(tmpSqlFname) raise os.remove(tmpSqlFname) def hgsql(db, query): - assert('"' not in sql) + assert('"' not in query) cmd = "hgsql %s -NBe '%s'" % (db, query) def addIndexes(db): " add the indexes for mysql " query = "ALTER TABLE ggLinkEvent ADD INDEX gene12Idx (gene1, gene2);" hgsql(db, query) query = "ALTER TABLE ggEventText ADD INDEX docIdIdx (docId);" hgsql(db, query) + query = "alter table ggDocEvent add index eventIdIdx (eventId);" + hgsql(db, query) + def loadTables(tableDir, db): " load graph tables into mysql " loadTable(db, tableDir, "ggSymbol") loadTable(db, tableDir, "ggDoc") loadTable(db, tableDir, "ggDocEvent") loadTable(db, tableDir, "ggEventDb") loadTable(db, tableDir, "ggEventText") loadTable(db, tableDir, "ggLink") loadTable(db, tableDir, "ggLinkEvent") addIndexes(db) def indexPmids(rowList, textRows): " return dict pmid -> list of event Ids " @@ -457,31 +462,35 @@ logging.info("Writing docId-eventId %s" % outFname) ofh = open(outFname, "w") for docId, eventIds in pmidToId.iteritems(): eventIds = sorted(list(eventIds)) for eventId in eventIds: ofh.write("%s\t%s\n" % (docId, eventId)) ofh.close() def writeEventTable(rowList, outFname, colCount=None): " write the event table with event details " logging.info("Writing events to %s" % outFname) ofh = open(outFname, "w") for rows in rowList: for row in rows: if colCount: - row = row[:colCount] + if len(row)+1 == colCount: + row = list(row) + row.append("") + #row = row[:colCount] + assert(len(row)==colCount) ofh.write("%s\n" % ("\t".join(row))) ofh.close() def pairToDbs(pairs): """ given pairs and data rows, return a dict pair -> int that indicates how many DBs a pair is referenced in """ # first make dict event -> source dbs #eventDbs = defaultdict(set) #for row in pwRows: #eventDbs[row.eventId].add(row.sourceDb) #for row in dbRows: #sourceDbs = row.sourceDbs.split("|") #eventDbs[row.eventId].update(sourceDbs) @@ -1149,31 +1158,31 @@ ofh.close() pairDirDocs = directedPairToDocs(textRows) pairDbs = pairToDbs(curatedPairs) outFname = join(outDir, "ggLink.tmp.txt") # needs the addContext step to complete it eventFname = join(outDir, "ggLinkEvent.tab") allSyms = writeGraphTable(allPairs, pairDirDocs, pairDbs, pairMinResultCounts, pwDirPairs, \ bestSentences, outFname, eventFname) pmidToId = indexPmids([dbRows,pwRows], textRows) outFname = join(outDir, "ggDocEvent.tab") writeDocEvents(pmidToId, outFname) outFname = join(outDir, "ggEventDb.tab") - writeEventTable([dbRows, pwRows], outFname, colCount=13) + writeEventTable([dbRows, pwRows], outFname, colCount=14) outFname = join(outDir, "ggEventText.tab") writeEventTable([textRows], outFname) # make sure we don't forget to update the link table with context linkFname = join(outDir, "ggLink.tab") if isfile(linkFname): os.remove(linkFname) # hgGene does not like it if the gene symbols are in two different # columns, so we create a very simple table with just the gene symbols symFname = join(outDir, "ggSymbol.tab") logging.info("Writing %s" % symFname) open(symFname, "w").write("\n".join(allSyms))