src/utils/genePredCompare 15286a4682e55985f6da7de23e7c700eef4bb436

15286a4682e55985f6da7de23e7c700eef4bb436
max
  Fri Feb 6 04:01:09 2026 -0800
adding genePredCompare tool to kent repo, see https://github.com/galaxyproject/brc-analytics/issues/1131

diff --git src/utils/genePredCompare src/utils/genePredCompare
new file mode 100755
index 00000000000..5ab260c8076
--- /dev/null
+++ src/utils/genePredCompare
@@ -0,0 +1,200 @@
+#!/usr/bin/env python
+# compare two genepred files and output some stats about the differences
+
+import sys, gzip, operator, logging
+from collections import defaultdict
+
+CLOSE = 10
+
+STRIPVERSION = True
+
+def main():
+    if len(sys.argv)==1:
+        print("genePredComp file1 file2: compare two gene pred files and output some stats")
+        sys.exit(0)
+
+    logging.basicConfig(level=logging.INFO)
+    fname1, fname2 = sys.argv[1], sys.argv[2]
+    gp1 = parseGp(fname1)
+    gp2 = parseGp(fname2)
+    genePredComp(fname1, fname2, gp1, gp2)
+
+def parseGp(inFname):
+    " return as dict id -> tuple "
+    logging.info("Parsing %s..." % inFname)
+    if inFname.endswith(".gz"):
+        ofunc = gzip.open
+    else:
+        ofunc = open
+    ifh = ofunc(inFname)
+
+    data = defaultdict(list)
+    for line in ifh:
+        if line.startswith("name"):
+            continue
+        row = line.rstrip("\n").split("\t")
+        row = [r.strip(",") for r in row]
+        row = row[:10]
+        #name = row[0].split(".")[0]
+        name = row[0]
+        if STRIPVERSION:
+            name = name.split(".")[0]
+
+        if row[1].isdigit() or row[1]=="Y" or row[1]=="X":
+            row[1] = "chr"+row[1]
+        if row[1]=="MT":
+            row[1]="chrM"
+
+        data[name].append(tuple(row[1:]))
+    return data
+
+#def countVerDiff(ids1, ids2):
+    #for id in ids1:
+        #base, ver = id.split(".")
+
+def coordOverlap(start1, end1, start2, end2):
+    """ returns true if two Features overlap """
+    start1 = int(start1)
+    end1 = int(end1)
+    start2 = int(start2)
+    end2 = int(end2)
+
+    result = (( start2 <= start1 and end2 > start1) or \
+            (start2 < end1 and end2 >= end1) or \
+            (start1 >= start2 and end1 <= end2) or \
+            (start2 >= start1 and end2 <= end1))
+    return result
+    # modified: end2 > end1
+
+def genePredComp(fname1, fname2, gp1, gp2):
+    keys1 = gp1.keys()
+    keys2 = gp2.keys()
+    print("%d IDs in %s" % (len(keys1), fname1))
+    print("%d IDs in %s" % (len(keys2), fname2))
+    commonIds = set(keys1).intersection(keys2)
+    print("%d common IDs" % (len(commonIds)))
+
+    print("For the common IDs:")
+    diffTransCount = 0
+    diffTransList = []
+    exactMatch = 0
+    overlapCount = 0
+    noOverlap = 0
+    txMatch = 0
+    txMatchList = []
+    txNonMatch = 0
+    txClose = 0
+    txStartClose = 0
+    txEndClose = 0
+    cdsMatch = 0
+
+    ofh = open("noOverlap.gp", "w")
+    diffOfh = open("different.gp", "w")
+
+    for id in commonIds:
+        tList1 = gp1[id]
+        tList2 = gp2[id]
+        if len(tList1)!=1 or len(tList2)!=1:
+            diffTransCount +=1
+            diffTransList.append(id)
+        t1 = tList1[0]
+        t2 = tList2[0]
+        
+        #print id
+        #print t1
+        #print t2
+        #print
+        if t1==t2:
+            exactMatch += 1
+        else:
+            txStart1, txEnd1 = int(t1[2]), int(t1[3])
+            txStart2, txEnd2 = int(t2[2]), int(t2[3])
+            if txStart1==txStart2 and txEnd1==txEnd2:
+                txMatch += 1
+                txMatchList.append(id)
+            else:
+                txNonMatch +=1
+                diffOfh.write(id+"\t"+"\t".join(t1)+"\n")
+                diffOfh.write(id+"\t"+"\t".join(t2)+"\n")
+
+                cdsStart1, cdsEnd1 = int(t1[2]), int(t1[3])
+                cdsStart2, cdsEnd2 = int(t2[2]), int(t2[3])
+                if cdsStart1==cdsStart2 and cdsEnd1==cdsEnd2:
+                    cdsMatch +=1
+
+                if abs(txStart1-txStart2)<CLOSE and abs(txEnd1-txEnd2)<CLOSE:
+                    txClose += 1
+                if abs(txStart1-txStart2)<CLOSE and txEnd1==txEnd2:
+                    txStartClose += 1
+                if txStart1==txStart2 and abs(txEnd1-txEnd2)<CLOSE:
+                    txEndClose += 1
+                if coordOverlap(t1[2], t1[3], t2[2], t2[3]):
+                    overlapCount += 1
+                else:
+                    noOverlap += 1
+                    ofh.write("%s\t%s\n" % (id, "\t".join(t1)))
+                    ofh.write("%s\t%s\n" % (id, "\t".join(t2)))
+
+    print("  - %d IDs ignored because number of transcripts/id is > 1, e.g. %s" % (diffTransCount, diffTransList[:3]))
+    #print "  - %d IDs ignored because different version" % diffVerCount
+    if len(commonIds)==0:
+        print("Error: Nothing in common. ")
+        print("First set of keys: %s " % list(keys1)[:3])
+        print("Second set of keys: %s " % list(keys2)[:3])
+        sys.exit(1)
+
+    exactMatchShare = 100*float(exactMatch)/float(len(commonIds))
+    print("  - %d transcripts (%0.00f%%) are exactly identical" % (exactMatch, exactMatchShare))
+    print("For those %d that are not exactly identical:" % (len(commonIds)-exactMatch))
+    print("  - %d transcripts have identical txStart,txEnd (e.g. %s)" % (txMatch, txMatchList[:3]))
+    print("  - %d transcripts have a different txStart,txEnd (written to different.gp)" % txNonMatch)
+    print("For those %d that don't have identical txStart,txEnd:" % txNonMatch)
+    print("  - %d transcripts have identical cdsStart,cdsEnd" % cdsMatch)
+    print("  - %d transcripts have txStart or txEnd off by <%dbp" % (txClose, CLOSE))
+    print("  - %d transcripts have only txStart off by <%d bp" % (txStartClose, CLOSE))
+    print("  - %d transcripts have only txEnd off by <%d bp" % (txEndClose, CLOSE))
+    print("  - %d transcripts overlap" % overlapCount)
+    print("  - %d transcripts have no overlap at all (written to noOverlap.gp)" % noOverlap)
+
+def printData(exonSpans):
+    # for each transcript, 
+    for ctgTrans, exonList in exonSpans.iteritems():
+        contig, transId = ctgTrans
+        # sort on start pos
+        exonList.sort(key=operator.itemgetter(1))
+        # find cdsStart, cdsEnd
+        cdsExons = [e for e in exonList if e[4] == "CDS"]
+        if len(cdsExons)==0:
+            cdsStart, cdsEnd = exonList[0][1], exonList[0][1]
+        else:
+            cdsStart = cdsExons[0][1]
+            cdsEnd = cdsExons[-1][2]
+
+        # convert to (start, end) tuples
+        spans = []
+        for chrom, chromStart, chromEnd, strand, typeGroup in exonList:
+            # if directly adjacent, fuse
+            if len(spans)!=0 and chromStart==spans[-1][1]:
+                spans[-1][1] = chromEnd
+            # otherwise append
+            else:
+                spans.append( [chromStart, chromEnd] )
+
+        exonStarts = [str(s[0]) for s in spans]
+        exonEnds = [str(s[1]) for s in spans]
+
+        txStart = exonStarts[0]
+        txEnd = exonEnds[-1]
+
+        row = [ \
+            transId, "chr"+chrom, strand, \
+            txStart, txEnd, \
+            cdsStart, cdsEnd, \
+            len(exonStarts), \
+            ",".join(exonStarts), \
+            ",".join(exonEnds) 
+            ]
+        row = [str(s) for s in row]
+        print("\t".join(row))
+
+main()