src/utils/ggTables 00e2e1aba8679cb0c869fc09e5dbf0a6942f16f6

00e2e1aba8679cb0c869fc09e5dbf0a6942f16f6
max
  Fri Aug 4 14:15:36 2017 -0700
fixing ghost interactions problem in interactions track, refs #19938

diff --git src/utils/ggTables src/utils/ggTables
index 68e1710..0773578 100755
--- src/utils/ggTables
+++ src/utils/ggTables
@@ -9,30 +9,33 @@
 
 # TODO: change indexPairs so it corrects synonyms to the current gene symbol
 
 # saves 20% of time when loading graph marshal
 import gc
 gc.disable()
 
 #LTPMAXMEMBERS=5 # maximum number of proteins in a complex for an interaction to qualify for low throughput
 LTPMAX=5 # maximum number of interactions a PMID can have to be declared low throughput
 
 # don't even write out links with less than this number of documents. 2 = weeds out many false positives.
 # not using right now, because a few pathway databases do not annotate ANY PMID. Increasing this filter would remove
 # all interactions from these pathways databases.
 MINSUPP=0
 
+# minimum number of documents a text-mining-only link needs in order to show up in the UI
+UI_MINSUPP=2
+
 outFields = ["gene1", "gene2", "flags", "refs", "fwWeight", "revWeight", "snip"]
 
 # directory with autoSql descriptions of output tables
 autoSqlDir = expanduser("~/kent/src/hg/lib/")
 
 # file with all of medline in short form
 allArtFname = "textInfo.tab"
 # file with just pmids and events
 #pmidEventFname = "temp.pmidToEvent.tab"
 
 # RE to split sentences into alphanumeric words
 wordRe = re.compile("[a-zA-Z0-9]+")
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = optparse.OptionParser("""usage: %prog [options] build|load pathwayDir ppiDir textDir outDir - given various tab sep files with text-mining, gene interaction or pathway information, build the  table ggLink, ggDoc, ggDb and ggText
@@ -976,30 +979,33 @@
 
 def parseLinkTargets(outDir, validSyms):
     """ parse the ggLink table in outDir and return a dict gene -> Counter() of targetGenes -> count.
     Count is either the article count or, if there is no text mining hit, the count of databases 
     """
     errFh = open("ggLink.errors.tab", "w")
 
     inFname = join(outDir, "ggLink.tab")
     logging.info("Parsing %s" % inFname)
 
     asPath = join(autoSqlDir, "ggLink.as")
     targets = defaultdict(Counter)
     for row in lineFileNext(open(inFname), asFname=asPath):
         gene1, gene2 = row.gene1, row.gene2
         count = int(row.docCount)
+        # links supported only by text mining need a minimum number of documents
+        if row.linkTypes=="text" and count<UI_MINSUPP:
+            continue
         if count==0:
             count = len(row.dbList.split("|"))
 
         if gene1 not in validSyms:
             if gene2 not in validSyms:
                 errFh.write("BothSymsInvalid\t"+"\t".join(row)+"\n")
             else:
                 errFh.write("sym1Invalid\t"+"\t".join(row)+"\n")
         if gene2 not in validSyms:
             errFh.write("sym2Invalid\t"+"\t".join(row)+"\n")
 
         targets[gene1][gene2]=count
         targets[gene2][gene1]=count
 
     errFh.close()