a0ea77a19c81a4c1d0929d6f97994e1843309adb max Mon May 18 08:06:53 2020 -0700 fixing Orf1ab to ORF1ab, refs #25114 diff --git src/utils/genbankToBigGenePred src/utils/genbankToBigGenePred index 100fee7..93f11d4 100755 --- src/utils/genbankToBigGenePred +++ src/utils/genbankToBigGenePred @@ -1,209 +1,209 @@ #!/usr/bin/env python import logging, sys, optparse, json, subprocess from collections import defaultdict from os.path import join, basename, dirname, isfile, expanduser from Bio import SeqIO # not found? Install with pip --user install biopython --upgrade # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] filename") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") #parser.add_option("", "--test", dest="test", action="store_true", help="do something") (options, args) = parser.parse_args() if args==[]: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG) else: logging.basicConfig(level=logging.INFO) logging.getLogger().setLevel(logging.INFO) return args, options def writeQualifiers(ft, ofh): for key, vals in ft.qualifiers.items(): ofh.write(key+"\t"+"|".join(vals)+"\n") doneProducts = set() # products can be duplicated. let's just skip anything with a duplicated name for now def writeFeatures(ft, chromName, ofh, isGenePred=False, rec=None, nameField="type"): global doneProducts strand = "+" if ft.strand==-1: strand = "-" elif ft.strand==0: strand = "." start = ft.location.start end = ft.location.end size = ft.location.end-ft.location.start geneList = ft.qualifiers.get("gene") geneId = "" if geneList and len(geneList)==1: geneId = geneList[0] # requested by Jason Fernandes, email Apr 29, 2020, 9:34 PM if chromName=="NC_045512v2" and start==265 and end==13483: - geneId = "Orf1a" + geneId = "ORF1a" prodList = ft.qualifiers.get("product") prod = "" if prodList and len(prodList)==1: prod = prodList[0] if nameField=="product" and prod in doneProducts: return doneProducts.add(prod) # set the BED name field if nameField=="gene": name = geneId elif nameField=="product": name = prod geneId = prod else: name = ft.type # extract a few qualifiers and assign to variables protId = ft.qualifiers.get("protein_id") if protId and len(protId)==1: protId = protId[0] dbx = ft.qualifiers.get("db_xref") xrefs = {} if dbx and len(dbx)==1: for part in dbx[0].split(): # "GeneID:43740578" k, v = part.split(":") xrefs[k] = v color = "0" exonSizes = [] exonStarts = [] for p in ft.location.parts: exSize = p.end-p.start exStart = p.start-start exonSizes.append(str(exSize)) exonStarts.append(str(exStart)) exonSizStr = ",".join(exonSizes) exonStStr = ",".join(exonStarts) # XX - what am I going to do here? Exonframe right now are ALL to 0 FIX ONE DAY exonFrames = ",".join(["0"]*len(exonSizes)) # bed 12 row = [chromName, str(start), str(end), name, "0", strand, start, end, color, str(len(exonSizes)), \ exonSizStr, exonStStr] # string name2; "Alternative/human readable name" #string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)" #string cdsEndStat; "Status of CDS end annotation (none, unknown, incomplete, or complete)" #int[blockCount] exonFrames; "Exon frame {0,1,2}, or -1 if no frame for exon" #string type; "Transcript type" #string geneName; "Primary identifier for gene" #string geneName2; "Alternative/human readable gene name" #string geneType; "Gene type" #string note; "Notes" #string product; "Protein Product ID" #string geneId; "NCBI Gene ID" #string _cdnaSeq; "cDNA Sequence" #string _cdnaPsl; "cDNA to genome PSL alignment (or empty)" #string _protSeq; "Protein Sequence" #string _protPsl; "protein to cDNA PSL alignment (or empty)" if isGenePred: row.append(geneId) row.append("cmpl") # ??? row.append("cmpl") row.append(exonFrames) # exonFrames row.append("N.a.") # transcript type row.append(geneId) # Primary identifier for gene row.append(geneId) # Alternative/human readable gene name row.append("N.a.") # Gene type row.append(ft.qualifiers.get("note", [""])[0]) # Notes row.append(protId) # Protein product ID row.append(xrefs.get("GeneID", "")) # NCBI Gene ID row.append(str(ft.extract(rec).seq)) # Gene type row.append("") # cDNA PSL row.append(repr(ft.qualifiers)) # protein sequence row.append("") # prot to cDNA PSL #row.append(repr(ft.qualifiers)) # anythign else else: row.append(ft.qualifiers.get("inference", [""])[0]) row.append(ft.qualifiers.get("function", [""])[0]) #row.append(repr(ft.qualifiers)) # anythign else, for debugging row = [str(x) for x in row] ofh.write("\t".join(row)) ofh.write("\n") def bedToBigBed(fname, chromSizes, asName, bbDir): " convert bed to bigBed " cmd = ["bedSort", fname, fname] assert (subprocess.call(cmd)==0) outFname = basename(fname).replace(".bed", ".bb") cmd = ["/cluster/home/braney/bin/x86_64/bedToBigBed", "-tab", "-type=bed12+", "-as="+asName, fname, chromSizes, join(bbDir, outFname), "-extraIndex=name", "-allow1bpOverlap"] logging.info("Running: "+(" ".join(cmd))) ret = subprocess.call(cmd) assert(ret==0) def gbToBigGenePred(fname, chromName, chromSizes, outDir, bbDir): outFnames = { "source" : join(outDir, "source.ra"), "genes" : join(outDir, "genes.bed"), "peptides" : join(outDir, "peptides.bed"), "other" : join(outDir, "other.bed"), } outFhs = {} for k, val in outFnames.items(): outFhs[k] = open(val, "w") for rec in SeqIO.parse(fname, "genbank"): for ft in rec.features: ftType = ft.type if ftType=="source": writeQualifiers(ft, outFhs["source"]) elif ftType=="mat_peptide": writeFeatures(ft, chromName, outFhs["peptides"], isGenePred=True, rec=rec, nameField="product") elif ftType == "gene": pass # cannot do anything with these right now, are they useful at all? elif ftType in ["CDS"]: writeFeatures(ft, chromName, outFhs["genes"], isGenePred=True, rec=rec, nameField="gene") else: writeFeatures(ft, chromName, outFhs["other"]) for ofh in outFhs.values(): ofh.close() for key, fname in outFnames.items(): if key=="source": continue asName = "~/kent/src/hg/lib/genbankBed12.as" if key in ["genes", "peptides"]: asName = "~/kent/src/hg/lib/genbankBigGenePred.as" asName = expanduser(asName) bedToBigBed(fname, chromSizes, asName, bbDir) # ----------- main -------------- def main(): args, options = parseArgs() fname, chromName, chromSizes, outDir, bbDir = args gbToBigGenePred(fname, chromName, chromSizes, outDir, bbDir) main()