src/utils/mapViewerToGenePred/mapViewerToGenePred 18638f4f1b06edafe7c2b2001c299e232ce11ec6

18638f4f1b06edafe7c2b2001c299e232ce11ec6
max
  Wed Oct 23 13:32:27 2013 -0700
adding ncbi mapviewer to genepred converter
diff --git src/utils/mapViewerToGenePred/mapViewerToGenePred src/utils/mapViewerToGenePred/mapViewerToGenePred
new file mode 100755
index 0000000..6bcb281
--- /dev/null
+++ src/utils/mapViewerToGenePred/mapViewerToGenePred
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# written by Max Haeussler, Aug 2013
+# converts ncbi mapviewer .md files to gene pred
+import sys, gzip, operator, logging, optparse
+from collections import defaultdict
+
+# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
+parser = optparse.OptionParser("""usage: %prog [options] inFileGzOK outFileStdOutOk - converts NCBI mapviewer files to UCSC genePred format
+
+NCBI provides mapviewer files at ftp://ftp.ncbi.nih.gov/genomes/MapView/
+An example for Homo sapiens is seq_gene.md.gz
+""")
+
+parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") 
+parser.add_option("-a", "--assembly", dest="assembly", action="store", help="The assemly to use, default %default", default="Primary_Assembly") 
+#parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
+(options, args) = parser.parse_args()
+
+if options.debug:
+    logging.basicConfig(level=logging.DEBUG)
+else:
+    logging.basicConfig(level=logging.INFO)
+
+if args==[]:
+    parser.print_help()
+    exit(1)
+
+# ============== FUNCTIONS =================
+def main(args, options):
+    inFname, outFname = args
+    if inFname.endswith(".gz"):
+        ofunc = gzip.open
+    else:
+        ofunc = open
+    ifh = ofunc(inFname)
+
+    logging.info("Parsing %s..." % inFname)
+    exonSpans = parseMapViewer(ifh, options.assembly)
+    writeData(exonSpans, outFname)
+
+def parseMapViewer(ifh, filterAssembly):
+    exonSpans = defaultdict(list)
+    for line in ifh:
+        if line.startswith("#"):
+            continue
+        l = line.rstrip("\n")
+        fields = l.split("\t")
+        taxId, chrom, chromStart, chromEnd, strand, contig, \
+            ctgStart, ctgStop, ctgStrand, featureName, geneId, \
+            typeGroup, assembly, transcript, evidence_code = fields
+        chromStart = int(chromStart)-1
+        chromEnd = int(chromEnd)
+        # skip over alternative assemblies
+        if assembly != filterAssembly:
+            continue
+
+        #if typeGroup == "GENE":
+            #geneToSym[geneId] = featureName
+        if typeGroup in ["CDS", "UTR"]:
+            if transcript=="-":
+                continue
+            exon = [chrom, chromStart, chromEnd, strand, typeGroup]
+            # need to keep contig here, for duplicated refseqs
+            exonSpans[(contig, transcript)].append( exon )
+    return exonSpans
+
+def writeData(exonSpans, outFname):
+    # for each transcript, 
+    if outFname=="stdout":
+        ofh = sys.stdout
+    else:
+        ofh = open(outFname, "w")
+
+    for ctgTrans, exonList in exonSpans.iteritems():
+        contig, transId = ctgTrans
+        # sort on start pos
+        exonList.sort(key=operator.itemgetter(1))
+        # find cdsStart, cdsEnd
+        cdsExons = [e for e in exonList if e[4] == "CDS"]
+        if len(cdsExons)==0:
+            cdsStart, cdsEnd = exonList[0][1], exonList[0][1]
+        else:
+            cdsStart = cdsExons[0][1]
+            cdsEnd = cdsExons[-1][2]
+
+        # convert to (start, end) tuples
+        spans = []
+        for chrom, chromStart, chromEnd, strand, typeGroup in exonList:
+            # if directly adjacent, fuse
+            if len(spans)!=0 and chromStart==spans[-1][1]:
+                spans[-1][1] = chromEnd
+            # otherwise append
+            else:
+                spans.append( [chromStart, chromEnd] )
+
+        exonStarts = [str(s[0]) for s in spans]
+        exonEnds = [str(s[1]) for s in spans]
+
+        txStart = exonStarts[0]
+        txEnd = exonEnds[-1]
+
+        row = [ \
+            transId, "chr"+chrom, strand, \
+            txStart, txEnd, \
+            cdsStart, cdsEnd, \
+            len(exonStarts), \
+            ",".join(exonStarts), \
+            ",".join(exonEnds) 
+            ]
+        row = [str(s) for s in row]
+        ofh.write( "\t".join(row))
+        ofh.write("\n")
+    ofh.close()
+
+main(args, options)