00e2e1aba8679cb0c869fc09e5dbf0a6942f16f6 max Fri Aug 4 14:15:36 2017 -0700 fixing ghost interactions problem in interactions track, refs #19938 diff --git src/utils/ggTables src/utils/ggTables index 68e1710..0773578 100755 --- src/utils/ggTables +++ src/utils/ggTables @@ -9,30 +9,33 @@ # TODO: change indexPairs so it corrects synonyms to the current gene symbol # saves 20% of time when loading graph marshal import gc gc.disable() #LTPMAXMEMBERS=5 # maximum number of proteins in a complex for an interaction to quality for low throughput LTPMAX=5 # maximum number of interactions a PMID can have to be declared low throughput # don't even write out links with less than this number of documents. 2 = weeds out many false positives. # not using right now, because a few pathway databases do not annotate ANY PMID. Increasing this filter would remove # all interactions from these pathways databases. MINSUPP=0 +# a cutoff on the number of documents required for text mined documents to show up in the UI +UI_MINSUPP=2 + outFields = ["gene1", "gene2", "flags", "refs", "fwWeight", "revWeight", "snip"] # directory with autoSql descriptions of output tables autoSqlDir = expanduser("~/kent/src/hg/lib/") # file with all of medline in short form allArtFname = "textInfo.tab" # file with just pmids and events #pmidEventFname = "temp.pmidToEvent.tab" # RE to split sentences wordRe = re.compile("[a-zA-Z0-9]+") # === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = optparse.OptionParser("""usage: %prog [options] build|load pathwayDir ppiDir textDir outDir - given various tab sep files with text-mining, gene interaction or pathway information, build the table ggLink, ggDoc, ggDb and ggText @@ -976,30 +979,33 @@ def parseLinkTargets(outDir, validSyms): """ parse the ggLink table in outDir and return a dict gene -> Counter() of targetGenes -> count. 
Count is either the article count or, if there is no text mining hit, the count of databases """ errFh = open("ggLink.errors.tab", "w") inFname = join(outDir, "ggLink.tab") logging.info("Parsing %s" % inFname) asPath = join(autoSqlDir, "ggLink.as") targets = defaultdict(Counter) for row in lineFileNext(open(inFname), asFname=asPath): gene1, gene2 = row.gene1, row.gene2 count = int(row.docCount) + # text mining documents need a minimum level of support + if row.linkTypes=="text" and count