5f58b8c336cffcaa1a7792363bbafa82a7d02e9e max Thu Oct 24 15:58:50 2013 -0700 fix typo in help message diff --git src/utils/mapViewerToGenePred/mapViewerToGenePred src/utils/mapViewerToGenePred/mapViewerToGenePred index 6bcb281..193cbaa 100755 --- src/utils/mapViewerToGenePred/mapViewerToGenePred +++ src/utils/mapViewerToGenePred/mapViewerToGenePred @@ -1,115 +1,115 @@ #!/usr/bin/env python # written by Max Haeussler, Aug 2013 # converts ncbi mapviewer .md files to gene pred import sys, gzip, operator, logging, optparse from collections import defaultdict # === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = optparse.OptionParser("""usage: %prog [options] inFileGzOK outFileStdOutOk - converts NCBI mapviewer files to UCSC genePred format NCBI provides mapviewer files at ftp://ftp.ncbi.nih.gov/genomes/MapView/ An example for Homo sapiens is seq_gene.md.gz """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") -parser.add_option("-a", "--assembly", dest="assembly", action="store", help="The assemly to use, default %default", default="Primary_Assembly") +parser.add_option("-a", "--assembly", dest="assembly", action="store", help="The assembly to use, default %default", default="Primary_Assembly") #parser.add_option("", "--test", dest="test", action="store_true", help="do something") (options, args) = parser.parse_args() if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) if args==[]: parser.print_help() exit(1) # ============== FUNCTIONS ================= def main(args, options): inFname, outFname = args if inFname.endswith(".gz"): ofunc = gzip.open else: ofunc = open ifh = ofunc(inFname) logging.info("Parsing %s..." % inFname) exonSpans = parseMapViewer(ifh, options.assembly) writeData(exonSpans, outFname) def parseMapViewer(ifh, filterAssembly): exonSpans = defaultdict(list) for line in ifh: if line.startswith("#"): continue l = line.rstrip("\n") fields = l.split("\t") taxId, chrom, chromStart, chromEnd, strand, contig, \ ctgStart, ctgStop, ctgStrand, featureName, geneId, \ typeGroup, assembly, transcript, evidence_code = fields chromStart = int(chromStart)-1 chromEnd = int(chromEnd) # skip over alternative assemblies if assembly != filterAssembly: continue #if typeGroup == "GENE": #geneToSym[geneId] = featureName if typeGroup in ["CDS", "UTR"]: if transcript=="-": continue exon = [chrom, chromStart, chromEnd, strand, typeGroup] # need to keep contig here, for duplicated refseqs exonSpans[(contig, transcript)].append( exon ) return exonSpans def writeData(exonSpans, outFname): # for each transcript, if outFname=="stdout": ofh = sys.stdout else: ofh = open(outFname, "w") for ctgTrans, exonList in exonSpans.iteritems(): contig, transId = ctgTrans # sort on start pos exonList.sort(key=operator.itemgetter(1)) # find cdsStart, cdsEnd cdsExons = [e for e in exonList if e[4] == "CDS"] if len(cdsExons)==0: cdsStart, cdsEnd = exonList[0][1], exonList[0][1] else: cdsStart = cdsExons[0][1] cdsEnd = cdsExons[-1][2] # convert to (start, end) tuples spans = [] for chrom, chromStart, chromEnd, strand, typeGroup in exonList: # if directly adjacent, fuse if len(spans)!=0 and chromStart==spans[-1][1]: spans[-1][1] = chromEnd # otherwise append else: spans.append( [chromStart, chromEnd] ) exonStarts = [str(s[0]) for s in spans] exonEnds = [str(s[1]) for s in spans] txStart = exonStarts[0] txEnd = exonEnds[-1] row = [ \ transId, "chr"+chrom, strand, \ txStart, txEnd, \ cdsStart, cdsEnd, \ len(exonStarts), \ ",".join(exonStarts), \ ",".join(exonEnds) ] row = [str(s) for s in row] ofh.write( "\t".join(row)) ofh.write("\n") ofh.close() main(args, options)