a21a083f8efa63c9e50707291ea65fb49ea05a11 max Fri May 23 09:09:06 2025 -0700 typo again...

diff --git src/cbPyLib/cellbrowser/convert.py src/cbPyLib/cellbrowser/convert.py
index 925c4c2..30b52cd 100644
--- src/cbPyLib/cellbrowser/convert.py
+++ src/cbPyLib/cellbrowser/convert.py
@@ -1,787 +1,787 @@
# various format converters for single cell data:
# - cellranger, mtx to tsv, matCat, metaCat etc

import logging, optparse, io, sys, os, shutil, operator, glob, re, json, subprocess
from collections import defaultdict, OrderedDict

from .cellbrowser import runGzip, openFile, errAbort, setDebug, moveOrGzip, makeDir, iterItems
from .cellbrowser import mtxToTsvGz, writeCellbrowserConf, getAllFields, readMatrixAnndata, adataStringFix
from .cellbrowser import anndataMatrixToTsv, loadConfig, sanitizeName, lineFileNextRow, scanpyToCellbrowser, build
from .cellbrowser import generateHtmls, getObsKeys, renameFile, getMatrixFormat, generateDataDesc
from .cellbrowser import copyFileIfDiffSize
from .cellbrowser import generateQuickGenes

from os.path import join, basename, dirname, isfile, isdir, relpath, abspath, getsize, getmtime, expanduser

def cbToolCli_parseArgs(showHelp=False):
    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
    parser = optparse.OptionParser("""usage: %prog [options] command filenames - convert various single-cell related files

    Command is one of:
    mtx2tsv     - convert matrix market to .tsv.gz
    matCatCells - merge expression matrices into one big tsv.gz matrix. This command adds more cells,
                  appending them as new columns on the right side of the matrix. If you have files from
                  different cells, but the same assay and the same genes, this allows you to merge them
                  into a bigger matrix. The format is one line per gene. Matrices must have identical
                  genes in the same order and the same number of lines. Handles .csv files, otherwise
                  defaults to tab-sep input. gzip OK.
    matCatGenes - concatenate expression matrices. If you have data from the same cells but with
                  different genes (assays, multi-modal), this allows you to merge them and add a prefix
                  to every gene. This command adds more genes, appending them as new rows at the end of
                  the matrix.
    metaCat     - concat/join meta tables on the first (cell ID) field or reorder their fields
    reorder     - reorder the meta fields
    quickgenes  - make quickgenes.tsv from markers.tsv in the current directory

    Examples:
    - %prog mtx2tsv matrix.mtx genes.tsv barcodes.tsv exprMatrix.tsv.gz - convert .mtx to .tsv.gz file
    - %prog matCatCells mat1.tsv.gz mat2.tsv.gz exprMatrix.tsv.gz - merge expression matrices
    - %prog matCatGenes mat1.csv.gz mat2.tsv.gz=atac- exprMatrix.tsv.gz - concatenate expression matrices and prefix all genes in the second file with 'atac-'
    - %prog metaCat meta.tsv seurat/meta.tsv scanpy/meta.tsv newMeta.tsv - merge meta matrices
    - %prog reorder meta.tsv meta.newOrder.tsv --del samId --order=cluster,cellType,age - reorder meta fields
    """)

    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    parser.add_option("", "--fixDots", dest="fixDots", action="store_true", help="for reorder and metaCat: try to fix R's mangling of various special chars to '.' in the cell IDs")
    parser.add_option("", "--order", dest="order", action="store", help="only for reorder and metaCat: new order of fields, comma-sep, e.g. 'disease,age'. Do not include cellId, it is always the first field by definition. Fields that are in the file but not specified here will be appended as the last columns.")
    parser.add_option("", "--only", dest="only", action="store_true", help="when using --order: only use the specified fields, do not append the other fields at the end")
    parser.add_option("", "--del", dest="delFields", action="store", help="only for reorder and metaCat: names of fields to remove")
    (options, args) = parser.parse_args()

    if showHelp:
        parser.print_help()
        exit(1)

    setDebug(options.debug)
    return args, options

def cbToolCli():
    " run various tools from the command line "
    args, options = cbToolCli_parseArgs()

-    if len(args)<=1 and (len(args)!=0 and args[0]!="quickgenes"):
+    if len(args)<=1 and not (len(args)!=0 and args[0]=="quickgenes"):
        cbToolCli_parseArgs(showHelp=True)
        sys.exit(1)

    cmd = args[0]
    cmds = ["mtx2tsv", "matCatCells", "matCatGenes", "metaCat", "reorder", "cxg", "quickgenes"]

    if cmd=="mtx2tsv":
        mtxFname = args[1]
        geneFname = args[2]
        barcodeFname = args[3]
        outFname = args[4]
        mtxToTsvGz(mtxFname, geneFname, barcodeFname, outFname)
    elif cmd=="matCatCells":
        inFnames = args[1:-1]
        outFname = args[-1]
        matCat(inFnames, outFname)
    elif cmd=="matCatGenes":
        inFnames = args[1:-1]
        outFname = args[-1]
        matCatGenes(inFnames, outFname)
    elif cmd=="cxg":
        subCmd = args[1]
        if subCmd=="allDatasets":
            printRows(getCxgAllDatasets())
        elif subCmd=="asset":
            dsIds = args[2:]
            getCxgAssets(dsIds)
        elif subCmd=="allAssets":
            getCxgAllAssets()
        elif subCmd=="cp":
            dsId = args[2]
            assetId = args[3]
            outFname = args[4]
            url = getCxgAssetUrl(dsId, assetId)
            cmd = ["curl", url, "--output", outFname]
            subprocess.run(cmd)
        elif subCmd=="desc":
            collId = args[2]
            getCxgColl(collId)
    elif cmd=="metaCat" or cmd=="reorder":
        inFnames = args[1:-1]
        outFname = args[-1]
        if len(inFnames)==0:
            errAbort("You must provide at least two filenames: one input filename and one output filename")
        if len(inFnames)==1 and isfile(outFname):
            errAbort("You provided only one input file and the output file already exists. To avoid "
                "accidentally overwriting a file, please either remove the output file or provide at least "
                "three filenames: two input files and one output file. The second input file can be 'none', "
                "which will suppress this error message.")
        if len(inFnames)==2 and inFnames[1]=="none":
            inFnames.pop()
        metaCat(inFnames, outFname, options)
    elif cmd=="quickgenes":
        generateQuickGenes(".")
    else:
        errAbort("Command %s is not a valid command. Valid commands are: %s" % (cmd, ", ".join(cmds)))
Valid commands are: %s" % (cmd, ", ".join(cmds))) def getLabels(l): " given a list of dicts, return a |-sep list of 'label' values " labels = [] for d in l: if type(d)==str: labels.append(d) else: labels.append(d["label"]) return "|".join(labels) def printRows(iterator, headDone=False): " print all rows gotten from an iterator " #headDone = False for row in iterator: if headDone is False: print("\t".join(row.keys())) headDone = True print("\t".join(row.values())) def getCxgAssetUrl(datasetId, assetId): " cxg has signed links, get one for an asset ID " url = 'https://api.cellxgene.cziscience.com/dp/v1/datasets/%s/asset/%s' % (datasetId, assetId) import requests ret = requests.post(url=url).json() # keys: dataset_id, file_name, presigned_url return ret["presigned_url"] def getCxgAllDatasets(): " yield all cxg datasets as ordered dicts " import requests url = "https://api.cellxgene.cziscience.com/dp/v1/datasets/index" datasets = requests.get(url=url).json() attrs = ['id', 'name', 'organism', 'collection_id', 'assay', 'explorer_url', 'is_primary_data', \ 'cell_count', 'mean_genes_per_cell', 'development_stage', 'development_stage_ancestors', \ 'disease', \ 'published_at', 'revised_at', 'schema_version', 'self_reported_ethnicity', 'sex', 'tissue', \ 'tissue_ancestors', 'cell_type', 'cell_type_ancestors'] for dataset in datasets: row = OrderedDict() for attr in attrs: val = dataset.get(attr, "UNDEFINED") if type(val)==type([]): strVal = getLabels(val) else: strVal = str(val) row[attr] = strVal yield row def getCxgAllAssets(): "" headDone = False for ds in getCxgAllDatasets(): printRows(getCxgAssets([ds["id"]]), headDone) headDone = True def getCxgAssets(dsIds, headDone=False): " download assets for a list of cxg datasets " import requests attrs = ['dataset_id', 'dataset', 'filename', 'filetype', 'id', 's3_uri', 'created_at', 'updated_at', 'user_submitted'] #if not headDone: #print("\t".join(attrs)) for dsId in dsIds: url = "https://api.cellxgene.cziscience.com/dp/v1/datasets/%s/assets" % dsId ret = requests.get(url=url).json() for asset in ret["assets"]: #row = [] row = OrderedDict() for attr in attrs: val = asset[attr] if attr == "dataset": val = val["name"] #row.append(str(val)) row[attr] = str(val) #print("\t".join(row)) yield row def getCxgColl(collId): " " def matCat(inFnames, outFname): tmpFname = outFname+".tmp" ofh = openFile(tmpFname, "w") ifhs = [] sep = "\t" if ".csv" in inFnames[0]: sep = "," for fn in inFnames: ifhs.append( openFile(fn) ) headers, colCounts = getAllFields(ifhs, sep) ofh.write("\t".join(headers)) ofh.write("\n") for i, ifh in enumerate(ifhs): logging.info("File %s: %d columns with values" % (ifhs[i].name, colCounts[i]-1)) fileCount = len(inFnames) progressStep = 1000 doProcess = True lineCount = 0 while doProcess: geneId = None lineCount += 1 if lineCount % progressStep == 0: logging.info("Processed %d rows" % (lineCount)) for i, ifh in enumerate(ifhs): lineStr = ifh.readline() # check if we're at EOF if lineStr=='': doProcess = False break fields = lineStr.rstrip('\r\n').split(sep) fields = [x.strip('"') for x in fields] if (len(fields)!=colCounts[i]): # check number of columns against header count raise Exception("Illegal number of fields: file %s, line %d, field count: %d, expected %d" % (ifh.name, lineCount, len(fields), colCounts[i])) # check the gene ID if i==0: # get the gene ID from the first file geneId = fields[0] allVals = [geneId] else: #assert(geneId == fields[0]) # if this fails, then the rows are not in the same order if geneId != fields[0]: 
print("Error: File %s has gene IDs that is out of order." % ifh.name) print("Expected geneID: %s, got geneID: %s" % (geneId, fields[0])) sys.exit(1) allVals.extend(fields[1:]) ofh.write("\t".join(allVals)) ofh.write("\n") # make sure that we've read through all files for ifh in ifhs: assert(ifh.readline()=='') # a file has still lines left to read? ofh.close() moveOrGzip(tmpFname, outFname) logging.info("Wrote %d lines (not counting header)" % lineCount) def matCatGenes(inFnames, outFname): " cat the lines several tab-sep or csv files, check headers and skip the header lines " tmpFname = outFname+".tmp" ofh = openFile(tmpFname, "w") otherHeaders = None lineCount = 0 for fn in inFnames: prefix = "" if "=" in fn: fn, prefix = fn.split("=") sep = "\t" if ".csv" in fn: sep = "," headers = None logging.info("Reading %s" % fn) doReorder = False fieldOrder = None for line in openFile(fn): row = line.rstrip("\n\r").split(sep) row = [x.strip('"') for x in row] if otherHeaders is None: otherHeaders = row ofh.write("\t".join(otherHeaders)) ofh.write("\n") if headers is None: headers = row if (headers != otherHeaders): # all files must have identical columns = identical header line logging.warn("Headers are not identical, need to re-order columns, this will be pretty slow") doReorder = True fieldOrder = [0] for h in otherHeaders[1:]: idx = headers.index(h) assert(idx!=0) # internal logic error. Should never happen. fieldOrder.append( idx ) # if this fails, then one file has a column that the other one doesn't have. Not clear what should happen then -> just fail. continue if prefix!="": row[0] = prefix+row[0] if doReorder: ofh.write("\t".join([row[i] for i in fieldOrder])) else: ofh.write("\t".join(row)) ofh.write("\n") lineCount += 1 logging.info("Wrote %d lines to output file (not counting header)" % lineCount) ofh.close() logging.info("Compressing %s", outFname) moveOrGzip(tmpFname, outFname) def reorderFields(row, firstFields, skipFields, onlyFirst): " reorder the row to have firstFields first " #logging.debug("Reordering row to have %s fields first" % firstFields) newRow = [row[0]] for idx in firstFields: newRow.append(row[idx]) if not onlyFirst: for i in range(1, len(row)): if i not in firstFields and i not in skipFields: newRow.append(row[i]) return newRow def metaCat(inFnames, outFname, options): " merge all tsv/csv columns in inFnames into a new file, outFname. Column 1 is ID to join on. 
" logging.debug("In fnames: %s, out fname: %s" % (repr(inFnames), repr(outFname))) allHeaders = ["cellId"] allRows = defaultdict(dict) # cellId -> fileIdx -> list of non-ID fields fieldCounts = {} # fileIdx -> number of non-ID fields allIds = set() # set with all cellIds firstFields = [] if options.order!="" and options.order is not None: firstFields = options.order.split(",") delFields = [] if options.delFields!="" and options.delFields is not None: delFields = options.delFields.split(",") for fileIdx, fname in enumerate(inFnames): logging.info("Reading %s" % fname) headers = None rowCount = 0 for row in lineFileNextRow(fname, headerIsRow=True): if headers is None: headers = row[1:] logging.info("Reading %s, %d columns: %s" % (fname, len(headers), repr(headers))) allHeaders.extend(headers) fieldCounts[fileIdx] = len(headers) continue rowCount +=1 cellId = row[0] allIds.add(cellId) allRows[cellId][fileIdx] = row[1:] logging.info("Read %d rows" % rowCount) nonCharRe = re.compile(r'[^a-zA-Z0-9]') if options.fixDots: # first create a map realId -> rId rToReal = {} for cellId, rowData in iterItems(allRows): rId = nonCharRe.sub(".", cellId) if rId!=cellId and rId in allRows: assert(rId not in rToReal) # Uh-oh! Mapping R <-> realId is not simple. May need some manual work. rToReal[rId]=cellId logging.info("Found %d cell IDs that look like they've been mangled by R" % (len(rToReal))) # now merge the R-id entries into the real ID entries newRows = {} for rId, readlId in iterItems(rToReal): allRows[cellId].update(allRows[rId]) for rId in rToReal.keys(): del allRows[rId] logging.info("Merged back rIds into normal data") # find the field indices of the fields we're putting first firstFieldIdx = [] for h in firstFields: try: idx = allHeaders.index(h) firstFieldIdx.append(idx) except ValueError: logging.warning("Field %s specified on command line --order is not in the meta data file" % repr(h)) # and those of fields we will skip delFieldIdx = [] for h in delFields: try: idx = allHeaders.index(h) except ValueError: errAbort("Field %s specified on command line with --del is not in the meta data file" % repr(h)) delFieldIdx.append(idx) if len(firstFields)!=0: logging.info("Order of fields that come first: %s" % firstFields) if len(delFields)!=0: logging.info("Removing these fields: %s" % delFields) if outFname=="stdout" or outFname=="/dev/stdout": ofh = sys.stdout else: tmpFname = outFname+".tmp" ofh = openFile(tmpFname, "w") onlyFirst = options.only allHeaders = reorderFields(allHeaders, firstFieldIdx, delFieldIdx, onlyFirst) ofh.write("\t".join(allHeaders)) ofh.write("\n") for cellId, rowData in iterItems(allRows): row = [cellId] for fileIdx in range(0, len(inFnames)): if fileIdx in rowData: row.extend(rowData[fileIdx]) else: row.extend([""]*fieldCounts[fileIdx]) row = reorderFields(row, firstFieldIdx, delFieldIdx, onlyFirst) ofh.write("\t".join(row)) ofh.write("\n") ofh.close() if ofh!=sys.stdout: renameFile(tmpFname, outFname) logging.info("Output field order is: %s" % allHeaders) logging.info("Wrote %d lines (not counting header)" % len(allRows)) def importCellrangerMatrix(inDir, outDir): " convert the cellranger 2 or 3 matrix, use the .mtx or .h5 file, whatever is available " outExprFname = join(outDir, "exprMatrix.tsv.gz") # cellranger 2: filtered_feature_bc_matrix/<db>/matrix.mtx and not gzipped mask1 = join(inDir, "filtered_gene_bc_matrices/*/matrix.mtx") # cellranger 3: filtered_gene_bc_matrices and gzipped mask2 = join(inDir, "filtered_feature_bc_matrix/matrix.mtx.gz") # h5 file as fallback, 
def importCellrangerMatrix(inDir, outDir):
    " convert the cellranger 2 or 3 matrix; use the .mtx or .h5 file, whatever is available "
    outExprFname = join(outDir, "exprMatrix.tsv.gz")

    # cellranger 2: filtered_gene_bc_matrices/<db>/matrix.mtx, not gzipped
    mask1 = join(inDir, "filtered_gene_bc_matrices/*/matrix.mtx")
    # cellranger 3: filtered_feature_bc_matrix/matrix.mtx.gz, gzipped
    mask2 = join(inDir, "filtered_feature_bc_matrix/matrix.mtx.gz")
    # h5 file as fallback, requires scanpy
    # cellranger 3: filtered_feature_bc_matrix.h5
    mask3 = join(inDir, "*filtered_*matri*.h5") # should work for cellranger 1, 2 and 3

    matFnames1 = glob.glob(mask1)
    matFnames2 = glob.glob(mask2)
    matFnames3 = glob.glob(mask3)
    logging.info("Looking for %s (cellranger 1/2) or %s (cellranger 3) or %s" % (mask1, mask2, mask3))

    # ugly code warning
    if len(matFnames1)!=0:
        # cellranger 1/2 mtx files are not gzipped
        assert(len(matFnames1)==1)
        matFname = matFnames1[0]
        logging.info("Found %s" % matFname)
        barcodeFname = matFname.replace("matrix.mtx", "barcodes.tsv")
        geneFname = matFname.replace("matrix.mtx", "genes.tsv")
        mtxToTsvGz(matFname, geneFname, barcodeFname, outExprFname)
    elif len(matFnames2)!=0:
        # cellranger 3 mtx files are gzipped
        assert(len(matFnames2)==1)
        matFname = matFnames2[0]
        logging.info("Found %s" % matFname)
        barcodeFname = matFname.replace("matrix.mtx.gz", "barcodes.tsv.gz")
        geneFname = matFname.replace("matrix.mtx.gz", "features.tsv.gz")
        mtxToTsvGz(matFname, geneFname, barcodeFname, outExprFname)
    elif len(matFnames3)!=0:
        # h5 files of cellranger 1/2/3 are read via scanpy
        logging.info("Found %s" % matFnames3)
        assert(len(matFnames3)==1)
        matFname = matFnames3[0]
        logging.info("Reading matrix %s" % matFname)
        adata = readMatrixAnndata(matFname)
        anndataMatrixToTsv(adata, outExprFname)
    else:
        errAbort("Could not find a matrix, neither as %s, %s nor %s" % (mask1, mask2, mask3))

    return matFname

def crangerToCellbrowser(datasetName, inDir, outDir, noMat):
    " convert cellranger output to a cellbrowser directory "
    makeDir(outDir)
    # copy over the clusters
    clustFname = join(inDir, "analysis/clustering/graphclust/clusters.csv")
    if not isfile(clustFname):
        logging.warn("Cannot find %s" % clustFname)
        clustFname = join(inDir, "analysis/kmeans/10_clusters/clusters.csv")
        logging.warn("Using %s instead" % clustFname)
    metaFname = join(outDir, "meta.csv")
    shutil.copy(clustFname, metaFname)

    # copy over the t-SNE coords
    tsneFname = join(inDir, "analysis/tsne/2_components/projection.csv")
    if not isfile(tsneFname):
        tsneFname = join(inDir, "analysis/tsne/projection.csv")
    coordFname = join(outDir, "tsne.coords.csv")
    shutil.copy(tsneFname, coordFname)

    # copy over the markers
    dgeFname = join(inDir, "analysis/diffexp/graphclust/differential_expression.csv")
    if not isfile(dgeFname):
        logging.warn("Not found: %s" % dgeFname)
        dgeFname = join(inDir, "analysis/kmeans/10_clusters/differential_expression.csv")
        logging.warn("Using instead: %s" % dgeFname)
    markerFname = join(outDir, "markers.tsv")
    geneFname = join(outDir, "gene2sym.tsv")
    crangerSignMarkers(dgeFname, markerFname, geneFname, 0.01, 100)

    if not noMat:
        matFname = importCellrangerMatrix(inDir, outDir)
    else:
        matFname = "unknown"

    confName = join(outDir, "cellbrowser.conf")
    coordDescs = [{"file":"tsne.coords.csv", "shortLabel":"CellRanger t-SNE"}]
    confArgs = {"meta" : "meta.csv", "clusterField" : "Cluster", "tags" : ["10x"]}
    writeCellbrowserConf(datasetName, coordDescs, confName, confArgs)

    crangerWriteMethods(inDir, outDir, matFname)
    generateHtmls(datasetName, outDir)
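# For orientation, the input files that crangerToCellbrowser looks for, taken
# from the paths above (a cellranger 2 layout; cellranger 3 uses
# filtered_feature_bc_matrix/ instead):
#
#   inDir/analysis/clustering/graphclust/clusters.csv
#   inDir/analysis/tsne/2_components/projection.csv
#   inDir/analysis/diffexp/graphclust/differential_expression.csv
#   inDir/filtered_gene_bc_matrices/<genome>/matrix.mtx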
nor %s, not writing %s" % (csvMask, jsonFname, htmlFname)) return else: qcVals = list(csv.DictReader(open(csvFnames[0])))[0] ofh = open(htmlFname, "w") ofh.write("<p>This dataset was imported from a CellRanger analysis directory with cbCellranger.</p>\n") ofh.write("<p><b>QC metrics reported by CellRanger:</b></p>\n") for key, value in iterItems(qcVals): ofh.write("<i>%s:</i> %s<br>\n" % (key, str(value))) ofh.close() logging.info("Wrote %s" % ofh.name) def crangerSignMarkers(dgeFname, markerFname, geneFname, maxPval, maxGenes): " convert cellranger diff exp file to markers.tsv file " # Old Cellranger 1 format: # Gene ID,Gene Name,Cluster 1 Weight,Cluster 1 UMI counts/cell,Cluster 2 Weight,Cluster 2 UMI counts/cell,Cluster 3 Weight,Cluster 3 UMI counts/cell,Cluster 4 Weight,Cluster 4 UMI counts/cell,Cluster 5 Weight,Cluster 5 UMI counts/cell,Cluster 6 Weight,Cluster 6 UMI counts/cell,Cluster 7 Weight,Cluster 7 UMI counts/cell,Cluster 8 Weight,Cluster 8 UMI counts/cell,Cluster 9 Weight,Cluster 9 UMI counts/cell,Cluster 10 Weight,Cluster 10 UMI counts/cell ofh = open(markerFname, "w") ofh.write("cluster\tgene\tAdj. P-Value\tLog2 fold change\tMean UMI Counts\n") clusterToGenes = defaultdict(list) # read the significant markers and their p-Values logging.info("Reading %s" % dgeFname) ifh = open(dgeFname) # determine file format line1 = ifh.readline() # cellranger 3 fileVersion = None if line1.startswith("Feature ID"): fieldsProCluster = 3 fileVersion = 3 # cellranger 1 or 2 elif line1.startswith("Gene ID"): if "Cluster 1 Weight" in line1: fieldsProCluster = 2 fileVersion = 1 else: fieldsProCluster = 3 fileVersion = 2 assert(fileVersion is not None) # unknown cellranger version? for line in ifh: row = line.rstrip("\n\r").split(",") geneId = row[0] sym = row[1] clusterCount = int((len(row)-2) / fieldsProCluster) for clusterIdx in range(0, clusterCount): startField = (clusterIdx*fieldsProCluster)+2 if fileVersion>=2: mean = float(row[startField]) fc = float(row[startField+1]) pVal = float(row[startField+2]) else: fc = float(row[startField]) mean = float(row[startField+1]) pVal = 0.0001 if pVal < maxPval: clusterToGenes[clusterIdx+1].append((fc, mean, pVal, sym)) # write out the markers for clusterId, clusterGenes in iterItems(clusterToGenes): clusterGenes.sort(key=operator.itemgetter(2)) # sort by fold change maxIdx = min(maxGenes, len(clusterGenes)) for i in range(0, maxIdx): fc, mean, pVal, sym = clusterGenes[i] ofh.write("%d\t%s\t%g\t%f\t%f\n" % (clusterId, sym, pVal, fc, mean)) ofh.close() logging.info("Wrote %s" % ofh.name) def cbCellrangerCli(): args, options = cbCellrangerCli_parseArgs() if options.outDir is None or options.inDir is None or options.datasetName is None: logging.error("You have to specify at least an input and an output directory and a dataset name.") cbCellrangerCli_parseArgs(showHelp=True) crangerToCellbrowser(options.datasetName, options.inDir, options.outDir, options.noMat) def cbCellrangerCli_parseArgs(showHelp=False): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] -i cellRangerDir -o outputDir - convert the cellranger output to cellbrowser format and create a cellranger.conf file """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-i", "--inDir", dest="inDir", action="store", help="input folder with the cellranger analysis output. 
def cbCellrangerCli():
    args, options = cbCellrangerCli_parseArgs()

    if options.outDir is None or options.inDir is None or options.datasetName is None:
        logging.error("You have to specify at least an input directory, an output directory and a dataset name.")
        cbCellrangerCli_parseArgs(showHelp=True)

    crangerToCellbrowser(options.datasetName, options.inDir, options.outDir, options.noMat)

def cbCellrangerCli_parseArgs(showHelp=False):
    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
    parser = optparse.OptionParser("""usage: %prog [options] -i cellRangerDir -o outputDir - convert the cellranger output to cellbrowser format and create a cellranger.conf file
    """)

    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    parser.add_option("-i", "--inDir", dest="inDir", action="store", help="input folder with the cellranger analysis output. This is the directory with the two subdirectories 'analysis' and 'filtered_gene_bc_matrices'")
    parser.add_option("-o", "--outDir", dest="outDir", action="store", help="output directory")
    #parser.add_option("-g", "--geneSet", dest="geneSet", action="store", help="geneset, e.g. gencode28 or gencode-m13 or similar. Default: %default", default="gencode24")
    parser.add_option("-n", "--name", dest="datasetName", action="store", help="name of the dataset. No spaces or special characters.")
    parser.add_option("-m", "--noMat", dest="noMat", action="store_true", help="do not export the matrix again, saves some time if you changed something small since the last run")
    (options, args) = parser.parse_args()

    if showHelp:
        parser.print_help()
        exit(1)

    setDebug(options.debug)
    return args, options
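# A typical invocation (directory and dataset names are hypothetical):
#
#   cbCellranger -i cellrangerOut/ -o myDataset/ -n my-dataset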
def cbImportScanpy_parseArgs(showHelp=False):
    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
    parser = optparse.OptionParser("""usage: %prog [options] -i inFilename -o outDir - convert a Scanpy AnnData object to cellbrowser format. inFilename can be an .h5ad or .loom file.

    Exports raw.X by default, or .X alternatively, see --proc and --layer. Exports all meta data.
    Writes .mtx.gz by default, or alternatively .tsv, see --matrixFormat.

    Example:
    - %prog -i pbmc3k.h5ad -o pbmc3kScanpy - convert pbmc3k to a directory with tab-separated files
    """)

    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    parser.add_option("-i", "--inFile", dest="inFile", action="store", help="input .h5ad file. Required parameter")
    parser.add_option("-o", "--outDir", dest="outDir", action="store", help="output directory. Required parameter")
    parser.add_option("-n", "--name", dest="datasetName", action="store", help="dataset name for the generated cellbrowser.conf. If not specified, the last component of -o will be used")
    parser.add_option("", "--htmlDir", dest="htmlDir", action="store", help="do not only convert to tab-sep files but also run cbBuild to "
        "convert the data and add the dataset under htmlDir")
    parser.add_option("-p", "--port", dest="port", action="store", type="int", help="only with --htmlDir: start a webserver on this port to serve htmlDir")
    parser.add_option("", "--markerField", dest="markerField", action="store", help="name of the marker genes field, default: %default", default="rank_genes_groups")
    parser.add_option("", "--clusterField", dest="clusterField", action="store", help="if no marker genes are present, use this field to calculate them. Default is to try a list of common field names, like 'Cluster' or 'louvain' and a few others")
    parser.add_option("-f", "--matrixFormat", dest="matrixFormat", action="store", default="mtx", help="output matrix file format, 'mtx' or 'tsv'. default: mtx")
    parser.add_option("-m", "--skipMatrix", dest="skipMatrix", action="store_true", help="do not convert the matrix, saves time if the same one has been exported before to the "
        "same directory")
    parser.add_option("", "--skipMarkers", dest="skipMarkers", action="store_true", help="do not try to calculate cluster-specific marker genes. Only useful for the rare datasets where a bug in scanpy crashes the marker gene calculation.")
    parser.add_option("", "--proc", dest="useProc", action="store_true", help="when exporting, do not use the raw input data, instead use the normalized and corrected matrix, aka 'ad.X'. This has no effect if the anndata.raw attribute is not present in the anndata object")
    parser.add_option("-l", "--layer", dest="layer", action="store", help="specify the layer to export. This takes precedence over --proc.")
    parser.add_option("", "--atac", dest="atac", action="store", help="indicate that this is an ATAC dataset and specify the genome assembly and gene model, for example 'hg38.gencode-42'. Use 'cbGenes ls' to show the list of all available gene models on your disk or 'cbGenes fetch' to download other ones. This will only be passed through to cellbrowser.conf.")
    (options, args) = parser.parse_args()

    if showHelp:
        parser.print_help()
        exit(1)

    setDebug(options.debug)
    return args, options

def cbImportScanpyCli():
    " convert an .h5ad file to a cellbrowser directory "
    args, options = cbImportScanpy_parseArgs()

    if len(args)==0 and not options.inFile and not options.outDir:
        cbImportScanpy_parseArgs(showHelp=True)
        sys.exit(1)

    inFname = options.inFile
    outDir = options.outDir
    if None in [inFname, outDir]:
        errAbort("You need to specify at least an input .h5ad file and an output directory")

    datasetName = options.datasetName
    if datasetName is None:
        datasetName = basename(abspath(outDir))

    markerField = options.markerField
    clusterField = options.clusterField
    skipMarkers = options.skipMarkers
    matrixFormat = getMatrixFormat(options)
    atac = options.atac
    layer = options.layer

    if atac is not None and "." not in atac:
        errAbort("the --atac option needs to be in the format <assembly>.<geneModel>, e.g. hg38.gencode-42")

    ad = readMatrixAnndata(inFname, reqCoords=True)
    scanpyToCellbrowser(ad, outDir, datasetName, skipMatrix=options.skipMatrix,
        useRaw=(not options.useProc), markerField=markerField, clusterField=clusterField,
        skipMarkers=skipMarkers, matrixFormat=matrixFormat, atac=atac, layer=layer)

    copyFileIfDiffSize(inFname, join(outDir, basename(inFname)))
    generateDataDesc(datasetName, outDir, other={"supplFiles": [{"label":"Scanpy H5AD", "file":basename(inFname)}]})

    if options.port and not options.htmlDir:
        errAbort("--port requires --htmlDir")

    if options.htmlDir:
        build(outDir, options.htmlDir, port=options.port)
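# A minimal programmatic sketch of the same conversion that cbImportScanpyCli
# performs (file and directory names are hypothetical; assumes scanpy is
# installed, the .h5ad contains coordinates, and the package is importable as
# 'cellbrowser'):
#
#   import scanpy as sc
#   from cellbrowser.cellbrowser import scanpyToCellbrowser
#
#   ad = sc.read_h5ad("pbmc3k.h5ad")            # hypothetical input file
#   scanpyToCellbrowser(ad, "pbmc3kOut", "pbmc3k")  # outDir, datasetName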