3b2e14c90156120b53c5031856842de0c57ca58c max Tue May 30 17:34:02 2017 -0700 gene interactions track: add to hgGene, patch GBIC/GBIB for it, refs #13634 diff --git src/utils/ggTables src/utils/ggTables index 7dbe660..c79207d 100755 --- src/utils/ggTables +++ src/utils/ggTables @@ -289,30 +289,31 @@ pairPmids = defaultdict(set) for row in rows: genes1 = set(row.causeGenes.split("|")) genes2 = set(row.themeGenes.split("|")) pairs = list([(aa, bb) for aa in genes1 for bb in genes2]) for cause, theme in pairs: pairPmids[(cause, theme)].add(row.pmid) return pairPmids def writeGraphTable(allPairs, pairDocs, pairToDbs, pairMinResCounts, pwDirPairs, bestSentences, outFname, outFname2): " write the ggLink table " logging.info("writing merged graph to %s" % outFname) rows = [] rows2 = [] + allSyms = set() for pair,pairRows in allPairs.iteritems(): gene1, gene2 = pair dbs = set() flags = [] if pair in dbPairs: flags.append("ppi") if pair in pwPairs: flags.append("pwy") if pair in textPairs: flags.append("text") refs = [row.eventId for row in pairRows] #if pairMinResultCounts: #flags.append("low") # direction of interaction - only based on pathways @@ -324,51 +325,56 @@ forwDocs = pairDocs.get(pair, []) revDocs = pairDocs.get(tuple(reversed(pair)), []) allDocs = set(forwDocs).union(set(revDocs)) if len(allDocs)<MINSUPP and "pwy" not in flags and "ppi" not in flags: # if it's text-mining only and less than X documents, just skip it continue pairMinResCount = pairMinResCounts.get(pair, 0) pairDbs = "|".join(pairToDbs.get(pair, [])) snippet = bestSentences.get(pair, "") row = [gene1, gene2, ",".join(flags), str(len(forwDocs)), str(len(revDocs)), \ str(len(allDocs)), pairDbs, str(pairMinResCount), snippet] rows.append(row) + allSyms.add(gene1) + allSyms.add(gene2) + refs = list(refs) refs.sort() for ref in refs: #row2 = [gene1, gene2, ",".join(refs)] row = [gene1, gene2, ref] rows2.append(row) ofh = open(outFname, "w") rows.sort() for row in rows: ofh.write("\t".join(row)) ofh.write("\n") ofh.close() ofh2 = open(outFname2, "w") rows2.sort() for row in rows2: ofh2.write("\t".join(row)) ofh2.write("\n") ofh2.close() + return allSyms + def runCmd(cmd): """ run command in shell, exit if not successful """ msg = "Running shell command: %s" % cmd logging.debug(msg) ret = os.system(cmd) if ret!=0: raise Exception("Could not run command (Exitcode %d): %s" % (ret, cmd)) return ret def asToSql(table, sqlDir): " given a table name, return the name of a .sql file with CREATE TABLE for it" asPath = join(autoSqlDir, table+".as") #tempBase = tempfile.mktemp() outBase = join(sqlDir, table) @@ -402,30 +408,31 @@ def hgsql(db, query): assert('"' not in sql) cmd = "hgsql %s -NBe '%s'" % (db, query) def addIndexes(db): " add the indexes for mysql " query = "ALTER TABLE ggLinkEvent ADD INDEX gene12Idx (gene1, gene2);" hgsql(db, query) query = "ALTER TABLE ggEventText ADD INDEX docIdIdx (docId);" hgsql(db, query) def loadTables(tableDir, db): " load graph tables into mysql " + loadTable(db, tableDir, "ggSymbol") loadTable(db, tableDir, "ggDoc") loadTable(db, tableDir, "ggDocEvent") loadTable(db, tableDir, "ggEventDb") loadTable(db, tableDir, "ggEventText") loadTable(db, tableDir, "ggLink") loadTable(db, tableDir, "ggLinkEvent") addIndexes(db) def indexPmids(rowList, textRows): " return dict pmid -> list of event Ids " pmidToIds = defaultdict(set) for rows in rowList: for row in rows: pmidStr = row.pmids @@ -1134,48 +1141,54 @@ bestSentences = runSumBasic(textPairs, wordFname) allPairs = mergePairs([curatedPairs, textPairs]) #ltPairs, ltDocs = getResultCounts(curatedPairs) # keep result counts for the "docs" step ofh = open(join(outDir, "resultCounts.tmp.txt"), "w") for docId, pairs in docToPairs.iteritems(): ofh.write("%s\t%d\n" % (docId, len(pairs))) ofh.close() pairDirDocs = directedPairToDocs(textRows) pairDbs = pairToDbs(curatedPairs) outFname = join(outDir, "ggLink.tmp.txt") # needs the addContext step to complete it eventFname = join(outDir, "ggLinkEvent.tab") - writeGraphTable(allPairs, pairDirDocs, pairDbs, pairMinResultCounts, pwDirPairs, \ + allSyms = writeGraphTable(allPairs, pairDirDocs, pairDbs, pairMinResultCounts, pwDirPairs, \ bestSentences, outFname, eventFname) pmidToId = indexPmids([dbRows,pwRows], textRows) outFname = join(outDir, "ggDocEvent.tab") writeDocEvents(pmidToId, outFname) outFname = join(outDir, "ggEventDb.tab") writeEventTable([dbRows, pwRows], outFname, colCount=13) outFname = join(outDir, "ggEventText.tab") writeEventTable([textRows], outFname) # make sure we don't forget to update the link table with context linkFname = join(outDir, "ggLink.tab") if isfile(linkFname): os.remove(linkFname) + # hgGene does not like it if the gene symbols are in two different + # columns, so we create a very simple table with just the gene symbols + symFname = join(outDir, "ggSymbol.tab") + logging.info("Writing %s" % symFname) + open(symFname, "w").write("\n".join(allSyms)) + elif cmd == "medline": outDir = args[1] textDir = options.textDir medlineFname = join(outDir, allArtFname) writeAllDocInfo(textDir, medlineFname) elif cmd == "docs": outDir = args[1] outFname = join(outDir, "ggDoc.tab") pmidEventPath = join(outDir, "ggDocEvent.tab") medlineFname = join(outDir, allArtFname) meshTerms = parseMeshContext(options.meshFname) shortNames = parseShortNames(options.journalInfo)