f5fa2ebc9c1b54033cc804cd249bf6e807757c54 max Thu Aug 14 06:42:16 2025 -0700 allowing gene files for non-human atac seq datasets, for elegans-emb-multiome-atlas, Marc diff --git src/cbPyLib/cellbrowser/genes.py src/cbPyLib/cellbrowser/genes.py index f1890fe..cdb582c 100755 --- src/cbPyLib/cellbrowser/genes.py +++ src/cbPyLib/cellbrowser/genes.py @@ -496,50 +496,53 @@ continue geneId, sym = line.rstrip("\n").split("\t") syms.append(sym) allSyms["entrez"] = set(syms) logging.info("Finding unique values") uniqSyms, commonSyms = keepOnlyUnique(allSyms) uniqIds, commonIds = keepOnlyUnique(allIds) logging.info("%d symbols and %d geneIds are shared among all releases" % (commonSyms, commonIds)) writeUniqs(uniqSyms, join(dataDir, org+".syms.unique.tsv.gz")) writeUniqs(uniqIds, join(dataDir, org+".ids.unique.tsv.gz")) def bedToJson(db, geneIdType, jsonFname): " convert BED file to more compact json file: chrom -> list of (start, end, strand, gene) " - geneToSym = readGeneSymbols(geneIdType) + transToSym = readGeneSymbols(geneIdType) # index transcripts by gene bySym = defaultdict(dict) for row in iterBedRows(db, geneIdType): chrom, start, end, transId, score, strand = row[:6] - if not transId in geneToSym: + if not transId in transToSym: logging.warn("%s does not have a gene symbol" % transId) sym = transId else: - sym = geneToSym[transId] + sym = transToSym[transId] + logging.debug("Mapping %s to symbol %s" % (transId, sym)) start = int(start) end = int(end) transLen = end-start + rawTransId = transId + if rawTransId.startswith("EN") or rawTransId.startswith("NM_"): rawTransId = transId.split(".")[0] # for lookups, we hopefully will never need the version ID... fullTransId = rawTransId+"|"+sym - bySym[fullTransId].setdefault(chrom, []).append( (transLen, start, end, strand, transId) ) + bySym[sym].setdefault(chrom, []).append( (transLen, start, end, strand, fullTransId) ) symLocs = defaultdict(list) - for transId, chromDict in bySym.items(): + for sym, chromDict in bySym.items(): for chrom, transList in chromDict.items(): transList.sort(reverse=True) # take longest transcript per chrom _, start, end, strand, transId = transList[0] symLocs[chrom].append( (start, end, strand, transId) ) sortedLocs = {} for chrom, geneList in symLocs.items(): geneList.sort() sortedLocs[chrom] = geneList ofh = open(jsonFname, "wt") outs = json.dumps(sortedLocs) #md5 = hashlib.md5(outs.encode("utf8")).hexdigest()[:10] ofh.write(outs) ofh.close()