b88380414ddecaab81f5c19b9f0cf06dbbedc787 max Fri May 26 06:36:28 2017 -0700 changing python hash bang again for hgGeneGraph, tiny wrangling fix, refs #13634

diff --git src/utils/ggTables src/utils/ggTables
index bb07669..7dbe660 100755
--- src/utils/ggTables
+++ src/utils/ggTables
@@ -25,36 +25,49 @@
 autoSqlDir = expanduser("~/kent/src/hg/lib/")
 # file with all of medline in short form
 allArtFname = "textInfo.tab"
 # file with just pmids and events
 #pmidEventFname = "temp.pmidToEvent.tab"
 # RE to split sentences
 wordRe = re.compile("[a-zA-Z0-9]+")

 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = optparse.OptionParser("""usage: %prog [options] build|load pathwayDir ppiDir textDir outDir - given various tab sep files with text-mining, gene interaction or pathway information, build the table ggLink, ggDoc, ggDb and ggText

 run it like this:
-%prog medline - to reduce the big medline table to something smaller, only needed once
+Reduce the big medline table to something smaller, only needed once:
+    %prog medline
+Slowest part: build the big table of interactions mysql/ggLink.tmp.tab
 %prog build pathways ppi text mysql
-%prog docs mysql # creates the ggDocs.tab file, slow
+
+Create mysql/ggDocs.tab, very slow
+    %prog docs mysql
+
+Add the "context" (aka mesh terms) to mysql/ggLink.tmp.tab
+and write to mysql/ggLink.tab file
 %prog context mysql
-%prog load mysql publications
+format is:
+gene1, gene2, flags, forwDocCount, revDocCount, allDocCount, databaseList, minimalPairCountPaper, snippet
+
+Load all tables in mysql/ into MySql:
+    %prog load mysql hgFixed
+
+Create the bigBed File
 %prog bigBed outDir bigBedFile db
 """)

 parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
 #parser.add_option("-t", "--test", dest="test", action="store_true", help="run tests")
 parser.add_option("-t", "--textDir", dest="textDir", action="store", help="directory with the parsed copy of medline, default %default", default="/hive/data/inside/pubs/text/medline")
 parser.add_option("-m", "--meshFile", dest="meshFname", action="store", help="An mtrees.bin file, default %default", default="/hive/data/outside/ncbi/mesh/mtrees2015.bin")
 parser.add_option("-j", "--journalInfo", dest="journalInfo", action="store", help="tab-sep file with journal info from the NLM Catalog converted by 'pubPrepCrawl publishers'. Used to shorten the journal names. Optional and not used if file is not found. Default %default", default="/cluster/home/max/projects/pubs/tools/data/journals/journals.tab")
 parser.add_option("-b", "--wordFname", dest="wordFname", action="store", help="a file with common English words", default="/hive/data/outside/pubs/wordFrequency/bnc/bnc.txt")
 #parser.add_option("-f", "--file", dest="file", action="store", help="run on file")
 #parser.add_option("", "--test", dest="test", action="store_true", help="do something")
 (options, args) = parser.parse_args()

 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
@@ -131,31 +144,30 @@ def getResultCounts(pairs):
     """
     Input is a pair -> rows dictionary
     for each PMID, count how many pairs are assigned to it.
     This is something like the "resultCount" of a paper, the lower, the better.
     Then, for each pair, get the minimum resultCount and return as a dict pair -> resultCount
     """
     logging.info("Getting low-throughput studies in PPI and pathway data")
     # create dict doc -> set of gene pairs
     docToPairs = defaultdict(set)
     for pair, rows in pairs.iteritems():
         #print "pp", pair
         for row in rows:
-            #print "r", row
             #members = row.themeGenes.split("|")
             #members.extend(row.causeGenes.split("|"))
             # complexes with more than 5 proteins are not low-throughput anyways
             # skip these right away
             #if len(members)>LTPMAXMEMBERS:
                 #continue
             docIds = row.pmids.split("|")
             for docId in docIds:
                 if docId=="":
                     continue
                 docToPairs[docId].add(pair)

     #print "d2p", docToPairs
     pairMinResultCounts = {}
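The second hunk is cut off just after pairMinResultCounts = {} is created. For orientation, here is a minimal self-contained sketch of the computation the docstring describes; the final min-per-pair loop is assumed from the docstring, not copied from the unseen rest of the function, and items() replaces Python 2's iteritems() so the sketch runs on either version:

    from collections import defaultdict, namedtuple

    def getResultCountsSketch(pairs):
        # pairs: dict (gene1, gene2) -> list of rows, each row carrying a
        # pipe-separated PMID string in row.pmids, as in the hunk above
        docToPairs = defaultdict(set)
        for pair, rows in pairs.items():
            for row in rows:
                for docId in row.pmids.split("|"):
                    if docId != "":
                        docToPairs[docId].add(pair)
        # a paper's "resultCount" is the number of pairs it reports, the
        # lower the better; every pair keeps the minimum resultCount seen
        # over all of its papers (this step is the assumed continuation)
        pairMinResultCounts = {}
        for docPairs in docToPairs.values():
            count = len(docPairs)
            for pair in docPairs:
                if pair not in pairMinResultCounts or count < pairMinResultCounts[pair]:
                    pairMinResultCounts[pair] = count
        return pairMinResultCounts

    # tiny usage example: one paper (PMID 123) reporting a single pair
    Row = namedtuple("Row", ["pmids"])
    print(getResultCountsSketch({("BRCA1", "TP53"): [Row("123")]}))
    # {('BRCA1', 'TP53'): 1}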
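The new help text also documents the ggLink.tab column layout, gene1 through snippet. Purely as an illustration of that format, a hypothetical tab-separated reader; the field names are taken from the help text, while readGgLink itself is made up and not part of this commit:

    GGLINK_FIELDS = ["gene1", "gene2", "flags", "forwDocCount", "revDocCount",
                     "allDocCount", "databaseList", "minimalPairCountPaper", "snippet"]

    def readGgLink(fname):
        " yield one field -> value dict per line of a ggLink.tab-style file "
        with open(fname) as fh:
            for line in fh:
                row = line.rstrip("\n").split("\t")
                yield dict(zip(GGLINK_FIELDS, row))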