808fa9a4c306d2d85df51d5c7c8ea9976a0fb0f1 max Thu Apr 30 11:41:48 2020 -0700 adding genbankToBigGenePred tool, right now only for wuhCor1 assembly, but possibly useful elsewhere, as per emails with Braney diff --git src/utils/genbankToBigGenePred src/utils/genbankToBigGenePred new file mode 100755 index 0000000..60d6279 --- /dev/null +++ src/utils/genbankToBigGenePred @@ -0,0 +1,205 @@ +#!/usr/bin/env python + +import logging, sys, optparse, json, subprocess +from collections import defaultdict +from os.path import join, basename, dirname, isfile, expanduser + +from Bio import SeqIO # not found? Install with pip --user install biopython --upgrade + +# ==== functions ===== + +def parseArgs(): + " setup logging, parse command line arguments and options. -h shows auto-generated help page " + parser = optparse.OptionParser("usage: %prog [options] filename") + + parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") + #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") + #parser.add_option("", "--test", dest="test", action="store_true", help="do something") + (options, args) = parser.parse_args() + + if args==[]: + parser.print_help() + exit(1) + + if options.debug: + logging.basicConfig(level=logging.DEBUG) + logging.getLogger().setLevel(logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + logging.getLogger().setLevel(logging.INFO) + + return args, options + +def writeQualifiers(ft, ofh): + for key, vals in ft.qualifiers.items(): + ofh.write(key+"\t"+"|".join(vals)+"\n") + +doneProducts = set() # products can be duplicated. let's just skip anything with a duplicated name for now + +def writeFeatures(ft, chromName, ofh, isGenePred=False, rec=None, nameField="type"): + global doneProducts + strand = "+" + if ft.strand==-1: + strand = "-" + elif ft.strand==0: + strand = "." + start = ft.location.start + end = ft.location.end + size = ft.location.end-ft.location.start + + geneList = ft.qualifiers.get("gene") + geneId = "" + if geneList and len(geneList)==1: + geneId = geneList[0] + + prodList = ft.qualifiers.get("product") + prod = "" + if prodList and len(prodList)==1: + prod = prodList[0] + + if nameField=="product" and prod in doneProducts: + return + + doneProducts.add(prod) + + # set the BED name field + if nameField=="gene": + name = geneId + elif nameField=="product": + name = prod + geneId = prod + else: + name = ft.type + + # extract a few qualifiers and assign to variables + protId = ft.qualifiers.get("protein_id") + if protId and len(protId)==1: + protId = protId[0] + + dbx = ft.qualifiers.get("db_xref") + xrefs = {} + if dbx and len(dbx)==1: + for part in dbx[0].split(): + # "GeneID:43740578" + k, v = part.split(":") + xrefs[k] = v + + color = "0" + + exonSizes = [] + exonStarts = [] + for p in ft.location.parts: + exSize = p.end-p.start + exStart = p.start-start + exonSizes.append(str(exSize)) + exonStarts.append(str(exStart)) + + exonSizStr = ",".join(exonSizes) + exonStStr = ",".join(exonStarts) + + # XX - what am I going to do here? Exonframe right now are ALL to 0 FIX ONE DAY + exonFrames = ",".join(["0"]*len(exonSizes)) + + # bed 12 + row = [chromName, str(start), str(end), name, "0", strand, start, end, color, str(len(exonSizes)), \ + exonSizStr, exonStStr] + + # string name2; "Alternative/human readable name" + #string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)" + #string cdsEndStat; "Status of CDS end annotation (none, unknown, incomplete, or complete)" + #int[blockCount] exonFrames; "Exon frame {0,1,2}, or -1 if no frame for exon" + #string type; "Transcript type" + #string geneName; "Primary identifier for gene" + #string geneName2; "Alternative/human readable gene name" + #string geneType; "Gene type" + #string note; "Notes" + #string product; "Protein Product ID" + #string geneId; "NCBI Gene ID" + #string _cdnaSeq; "cDNA Sequence" + #string _cdnaPsl; "cDNA to genome PSL alignment (or empty)" + #string _protSeq; "Protein Sequence" + #string _protPsl; "protein to cDNA PSL alignment (or empty)" + if isGenePred: + row.append(geneId) + row.append("cmpl") # ??? + row.append("cmpl") + row.append(exonFrames) # exonFrames + row.append("N.a.") # transcript type + row.append(geneId) # Primary identifier for gene + row.append(geneId) # Alternative/human readable gene name + row.append("N.a.") # Gene type + row.append(ft.qualifiers.get("note", [""])[0]) # Notes + row.append(protId) # Protein product ID + row.append(xrefs.get("GeneID", "")) # NCBI Gene ID + row.append(str(ft.extract(rec).seq)) # Gene type + row.append("") # cDNA PSL + row.append(repr(ft.qualifiers)) # protein sequence + row.append("") # prot to cDNA PSL + row.append(repr(ft.qualifiers)) # anythign else + else: + row.append(ft.qualifiers.get("inference", [""])[0]) + row.append(ft.qualifiers.get("function", [""])[0]) + #row.append(repr(ft.qualifiers)) # anythign else, for debugging + + row = [str(x) for x in row] + ofh.write("\t".join(row)) + ofh.write("\n") + +def bedToBigBed(fname, chromSizes, asName, bbDir): + " convert bed to bigBed " + cmd = ["bedSort", fname, fname] + assert (subprocess.call(cmd)==0) + + outFname = basename(fname).replace(".bed", ".bb") + cmd = ["/cluster/home/braney/bin/x86_64/bedToBigBed", "-tab", "-type=bed12+", "-as="+asName, fname, chromSizes, join(bbDir, outFname)] + logging.info("Running: "+(" ".join(cmd))) + ret = subprocess.call(cmd) + assert(ret==0) + +def gbToBigGenePred(fname, chromName, chromSizes, outDir, bbDir): + outFnames = { + "source" : join(outDir, "source.ra"), + "genes" : join(outDir, "genes.bed"), + "peptides" : join(outDir, "peptides.bed"), + "other" : join(outDir, "other.bed"), + } + + outFhs = {} + for k, val in outFnames.items(): + outFhs[k] = open(val, "w") + + for rec in SeqIO.parse(fname, "genbank"): + for ft in rec.features: + ftType = ft.type + if ftType=="source": + writeQualifiers(ft, outFhs["source"]) + elif ftType=="mat_peptide": + writeFeatures(ft, chromName, outFhs["peptides"], isGenePred=True, rec=rec, nameField="product") + elif ftType == "gene": + pass # cannot do anything with these right now, are they useful at all? + elif ftType in ["CDS"]: + writeFeatures(ft, chromName, outFhs["genes"], isGenePred=True, rec=rec, nameField="gene") + else: + writeFeatures(ft, chromName, outFhs["other"]) + + for ofh in outFhs.values(): + ofh.close() + + + for key, fname in outFnames.items(): + if key=="source": + continue + asName = "~/kent/src/hg/lib/genbankBed12.as" + if key in ["genes", "peptides"]: + asName = "~/kent/src/hg/lib/genbankBigGenePred.as" + asName = expanduser(asName) + bedToBigBed(fname, chromSizes, asName, bbDir) + +# ----------- main -------------- +def main(): + args, options = parseArgs() + + fname, chromName, chromSizes, outDir, bbDir = args + gbToBigGenePred(fname, chromName, chromSizes, outDir, bbDir) + +main()