b88380414ddecaab81f5c19b9f0cf06dbbedc787
max
  Fri May 26 06:36:28 2017 -0700
changing python hash bang again for hgGeneGraph, tiny wrangling fix, refs #13634

diff --git src/utils/ggTables src/utils/ggTables
index bb07669..7dbe660 100755
--- src/utils/ggTables
+++ src/utils/ggTables
@@ -25,36 +25,49 @@
 autoSqlDir = expanduser("~/kent/src/hg/lib/")
 
 # file with all of medline in short form
 allArtFname = "textInfo.tab"
 # file with just pmids and events
 #pmidEventFname = "temp.pmidToEvent.tab"
 
 # RE to split sentences
 wordRe = re.compile("[a-zA-Z0-9]+")
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = optparse.OptionParser("""usage: %prog [options] build|load pathwayDir ppiDir textDir outDir - given various tab-sep files with text-mining, gene interaction or pathway information, build the tables ggLink, ggDoc, ggDb and ggText
 
 run it like this:
 
-%prog medline - to reduce the big medline table to something smaller, only needed once
+Reduce the big medline table to something smaller, only needed once:
+    %prog medline
 
+Slowest part: build the big table of interactions, mysql/ggLink.tmp.tab:
     %prog build pathways ppi text mysql
-%prog docs mysql  # creates the ggDocs.tab file, slow
+
+Create mysql/ggDocs.tab, very slow:
+    %prog docs mysql
+
+Add the "context" (aka mesh terms) to mysql/ggLink.tmp.tab 
+and write to mysql/ggLink.tab file
     %prog context mysql  
-%prog load mysql publications
+The columns of ggLink.tab are:
+    gene1, gene2, flags, forwDocCount, revDocCount, allDocCount, databaseList, minimalPairCountPaper, snippet
+
+Load all tables in mysql/ into MySQL:
+    %prog load mysql hgFixed
+
+Create the bigBed file:
     %prog bigBed outDir bigBedFile db
 """)
 
 parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") 
 #parser.add_option("-t", "--test", dest="test", action="store_true", help="run tests") 
 parser.add_option("-t", "--textDir", dest="textDir", action="store", help="directory with the parsed copy of medline, default %default", default="/hive/data/inside/pubs/text/medline")
 parser.add_option("-m", "--meshFile", dest="meshFname", action="store", help="An mtrees<year>.bin file, default %default", default="/hive/data/outside/ncbi/mesh/mtrees2015.bin")
 parser.add_option("-j", "--journalInfo", dest="journalInfo", action="store", help="tab-sep file with journal info from the NLM Catalog converted by 'pubPrepCrawl publishers'. Used to shorten the journal names. Optional and not used if file is not found. Default %default", default="/cluster/home/max/projects/pubs/tools/data/journals/journals.tab")
 parser.add_option("-b", "--wordFname", dest="wordFname", action="store", help="a file with common English words", default="/hive/data/outside/pubs/wordFrequency/bnc/bnc.txt")
 #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") 
 #parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
 (options, args) = parser.parse_args()
 
 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
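
(Illustrative, not part of the patch: a minimal sketch of reading ggLink.tab
rows in the column order the usage text above lists. GGLinkRow, its field
names, and iterGgLinkRows are hypothetical helpers, not names from ggTables.)

    # Sketch only: parse one tab-separated ggLink.tab line into a named tuple,
    # assuming the column order from the usage text; all names are illustrative.
    from collections import namedtuple

    GGLinkRow = namedtuple("GGLinkRow",
        ["gene1", "gene2", "flags", "forwDocCount", "revDocCount",
         "allDocCount", "databaseList", "minimalPairCountPaper", "snippet"])

    def iterGgLinkRows(fname):
        " yield one GGLinkRow per line of a ggLink.tab-style file "
        for line in open(fname):
            yield GGLinkRow(*line.rstrip("\n").split("\t"))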
@@ -131,31 +144,30 @@
 def getResultCounts(pairs):
     """
     Input is a pair -> rows dictionary.
     For each PMID, count how many pairs are assigned to it. This is
     something like the "resultCount" of a paper: the lower, the better.
     Then, for each pair, get the minimum resultCount over its documents
     and return it as a dict pair -> resultCount.
     """
     logging.info("Getting low-throughput studies in PPI and pathway data")
 
     # create dict doc -> set of gene pairs
     docToPairs = defaultdict(set)
     for pair, rows in pairs.iteritems():
         #print "pp", pair
         for row in rows:
-            #print "r", row
             #members = row.themeGenes.split("|")
             #members.extend(row.causeGenes.split("|"))
             # complexes with more than 5 proteins are not low-throughput anyways
             # skip these right away
             #if len(members)>LTPMAXMEMBERS:
                 #continue
             docIds = row.pmids.split("|")
             for docId in docIds:
                 if docId=="":
                     continue
                 docToPairs[docId].add(pair)
 
     #print "d2p", docToPairs
 
     pairMinResultCounts = {}
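
(The hunk stops just before the minimum counts are computed. A hedged sketch
of the step the docstring describes, under the assumption that a document's
resultCount is the number of pairs assigned to it; this is not the author's
actual continuation.)

    # Sketch only: each pair keeps the smallest resultCount over all
    # documents that mention it, as described in the docstring above.
    for docId, docPairs in docToPairs.iteritems():
        resultCount = len(docPairs)
        for pair in docPairs:
            prev = pairMinResultCounts.get(pair)
            if prev is None or resultCount < prev:
                pairMinResultCounts[pair] = resultCount
    return pairMinResultCounts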