7a556d297a7e4cc0871f7189c2235d7422b30656 max Thu Jul 31 07:17:27 2025 -0700 adding the json command to cbGenes diff --git src/cbPyLib/cellbrowser/genes.py src/cbPyLib/cellbrowser/genes.py index cbdac9f..f1890fe 100755 --- src/cbPyLib/cellbrowser/genes.py +++ src/cbPyLib/cellbrowser/genes.py @@ -34,30 +34,31 @@ ls - list all available (built or downloaded) gene models on this machine Examples (common): %prog fetch # show the files that are available for the 'build' command %prog fetch gencode-34 # geneId -> symbol mapping for human gencode relase 34 %prog fetch hg38.gencode-34 # gene -> chrom mapping for human gencode relase 34 %prog ls %prog guess genes.txt mouse # guess the best gencode version for this file %prog check features.tsv.gz gencode-40 # check if the genes match gencode-40 and which ones don't Examples (rare - if you build your own gene models): %prog build # show the files that are available %prog build mm10 gencode-M25 %prog index # used at UCSC to prepare the files for 'guess' %prog allSyms human # build big geneId -> symbol table from all + %prog json ce11 wormbase235 # build JSON file from a .bed.gz file - only needed for model organisms """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") (options, args) = parser.parse_args() if args==[] or (args[0]=="guess" and len(args)==1): parser.print_help() exit(1) setDebug(options.debug) return args, options # ----------- main -------------- def parseSignatures(org, geneIdType): " return dict with gene release -> list of unique signature genes " @@ -500,45 +501,49 @@ logging.info("Finding unique values") uniqSyms, commonSyms = keepOnlyUnique(allSyms) uniqIds, commonIds = keepOnlyUnique(allIds) logging.info("%d symbols and %d geneIds are shared among all releases" % (commonSyms, commonIds)) writeUniqs(uniqSyms, join(dataDir, org+".syms.unique.tsv.gz")) writeUniqs(uniqIds, join(dataDir, org+".ids.unique.tsv.gz")) def bedToJson(db, geneIdType, jsonFname): " convert BED file to more compact json file: chrom -> list of (start, end, strand, gene) " geneToSym = readGeneSymbols(geneIdType) # index transcripts by gene bySym = defaultdict(dict) for row in iterBedRows(db, geneIdType): - chrom, start, end, geneId, score, strand = row[:6] - sym = geneToSym[geneId] + chrom, start, end, transId, score, strand = row[:6] + if not transId in geneToSym: + logging.warn("%s does not have a gene symbol" % transId) + sym = transId + else: + sym = geneToSym[transId] start = int(start) end = int(end) transLen = end-start - rawGeneId = geneId.split(".")[0] # for lookups, we hopefully will never need the version ID... - fullGeneId = rawGeneId+"|"+sym - bySym[fullGeneId].setdefault(chrom, []).append( (transLen, start, end, strand, geneId) ) + rawTransId = transId.split(".")[0] # for lookups, we hopefully will never need the version ID... + fullTransId = rawTransId+"|"+sym + bySym[fullTransId].setdefault(chrom, []).append( (transLen, start, end, strand, transId) ) symLocs = defaultdict(list) - for geneId, chromDict in bySym.items(): + for transId, chromDict in bySym.items(): for chrom, transList in chromDict.items(): transList.sort(reverse=True) # take longest transcript per chrom _, start, end, strand, transId = transList[0] - symLocs[chrom].append( (start, end, strand, geneId) ) + symLocs[chrom].append( (start, end, strand, transId) ) sortedLocs = {} for chrom, geneList in symLocs.items(): geneList.sort() sortedLocs[chrom] = geneList ofh = open(jsonFname, "wt") outs = json.dumps(sortedLocs) #md5 = hashlib.md5(outs.encode("utf8")).hexdigest()[:10] ofh.write(outs) ofh.close() logging.info("Wrote %s" % jsonFname) logging.info("If this is a new .json file and you are on hgwdev, copy it now to /usr/local/apache/htdocs-cells/downloads/cellbrowserData/genes/ and note this directory for the dataset release push to the RR. The reason is that users may want to cbBuild using this gene transcript set and that is easier if we provide the .json file") #fileInfo[code] = {"label":label, "file" : jsonFname, "md5" :md5} @@ -603,21 +608,22 @@ listModelsLocal() elif command=="index": buildGuessIndex() elif command=="allSyms": org = args[1] bigSymTable(org) elif command=="add": addFileLocal(args[1], args[2]) elif command=="push": pushLocal() - elif command=="json": # undocumented - db, geneType, outFname = args[1:] - bedToJson(db, geneType, outFname) + elif command=="json": + db, geneType = args[1:] + jsonFname = getStaticPath(getGeneJsonPath(db, geneType)) + bedToJson(db, geneType, jsonFname) else: errAbort("Unrecognized command: %s" % command)