c1bdbd09670416fb6efd05466977c45e2e9c9d95
max
  Fri May 23 08:54:09 2025 -0700
adding cbtools quickgenes command
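
    Usage, assuming the installed command-line entry point is called cbTool;
    run it inside a dataset directory that already contains a markers.tsv:

        cd myDataset/        # contains markers.tsv
        cbTool quickgenes    # writes quickgenes.tsv into the same directory

    For illustration only, a minimal sketch of what a quickgenes conversion of
    this kind can look like. This is not the actual generateQuickGenes code
    (that lives in cellbrowser.py and is only imported below); the sketch
    assumes markers.tsv starts with 'cluster' and 'gene' columns, best markers
    first per cluster, as written by crangerSignMarkers in this file:

        import csv
        from collections import OrderedDict

        def quickGenesSketch(markersFname="markers.tsv", outFname="quickgenes.tsv", topN=5):
            " keep the first topN marker genes per cluster, one gene per output line "
            seen = OrderedDict()   # gene symbol -> cluster that it marks
            perCluster = {}        # cluster -> number of genes kept so far
            with open(markersFname) as ifh:
                for row in csv.DictReader(ifh, delimiter="\t"):
                    clust = row["cluster"]
                    if perCluster.get(clust, 0) < topN and row["gene"] not in seen:
                        seen[row["gene"]] = clust
                        perCluster[clust] = perCluster.get(clust, 0) + 1
            with open(outFname, "w") as ofh:
                for gene, clust in seen.items():
                    ofh.write("%s\t%s\n" % (gene, clust))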

diff --git src/cbPyLib/cellbrowser/convert.py src/cbPyLib/cellbrowser/convert.py
index bedd27f..1f0f13d 100644
--- src/cbPyLib/cellbrowser/convert.py
+++ src/cbPyLib/cellbrowser/convert.py
@@ -1,781 +1,787 @@
 # various format converters for single cell data:
 # - cellranger, mtx to tsv, matcat, metaCat etc
 
 import logging, optparse, io, sys, os, shutil, operator, glob, re, json, subprocess
 from collections import defaultdict, OrderedDict
 
 from .cellbrowser import runGzip, openFile, errAbort, setDebug, moveOrGzip, makeDir, iterItems
 from .cellbrowser import mtxToTsvGz, writeCellbrowserConf, getAllFields, readMatrixAnndata, adataStringFix
 from .cellbrowser import anndataMatrixToTsv, loadConfig, sanitizeName, lineFileNextRow, scanpyToCellbrowser, build
 from .cellbrowser import generateHtmls, getObsKeys, renameFile, getMatrixFormat, generateDataDesc
 from .cellbrowser import copyFileIfDiffSize
+from .cellbrowser import generateQuickGenes
 
 from os.path import join, basename, dirname, isfile, isdir, relpath, abspath, getsize, getmtime, expanduser
 
 def cbToolCli_parseArgs(showHelp=False):
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] command filenames - convert various single-cell related files
 
 Command is one of:
     mtx2tsv   - convert matrix market to .tsv.gz
     matCatCells - merge expression matrices into a big tsv.gz matrix.
         This command adds more cells: the new cells are appended as columns on
         the right side of the matrix. If you have files from different cells,
         but the same assay and the same genes, this allows you to merge them
         into a bigger matrix. The format is one-line-per-gene. Matrices must
         have identical genes in the same order and the same number of lines.
         Handles .csv files, otherwise defaults to tab-sep input. gzip is OK.
     matCatGenes - concatenate expression matrices. If you have data from the
         same cells but with different genes (assays, multi-modal data), this
         allows you to merge them and add a prefix to every gene. This command
         adds more genes: the new genes are appended as rows at the end of the
         matrix.
     metaCat - concat/join meta tables on the first (cell ID) field or reorder their fields
     reorder - reorder the meta fields
+    quickgenes - make quickgenes.tsv from the markers.tsv in the current directory
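+    cxg - query the cellxgene data portal (subcommands: allDatasets, asset, allAssets, cp, desc)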
 
 Examples:
     - %prog mtx2tsv matrix.mtx genes.tsv barcodes.tsv exprMatrix.tsv.gz - convert .mtx to .tsv.gz file
     - %prog matCatCells mat1.tsv.gz mat2.tsv.gz exprMatrix.tsv.gz - merge expression matrices
     - %prog matCatGenes mat1.csv.gz mat2.tsv.gz=atac- exprMatrix.tsv.gz - concatenate expression matrices
            and prefix all genes in the second file with 'atac-'
     - %prog metaCat meta.tsv seurat/meta.tsv scanpy/meta.tsv newMeta.tsv - merge meta matrices
     - %prog reorder meta.tsv meta.newOrder.tsv --del samId --order=cluster,cellType,age - reorder meta fields
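+    - %prog quickgenes - create quickgenes.tsv from the markers.tsv in the current directory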
     """)
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true",
         help="show debug messages")
     parser.add_option("", "--fixDots", dest="fixDots", action="store_true",
             help="for reorder and metaCat: try to fix R's mangling of various special chars to '.' in the cell IDs")
     parser.add_option("", "--order", dest="order", action="store",
         help="only for reorder and metaCat: new order of fields, comma-sep, e.g. 'disease,age'. Do not include cellId, it's always the first field by definition. Fields that are in the file but not specified here will be appended as the last columns.")
     parser.add_option("", "--only", dest="only", action="store_true",
             help="when using --order: only use the specified fields, do not append other fields at the end")
     parser.add_option("", "--del", dest="delFields", action="store",
         help="only for reorder and metaCat: names of fields to remove")
 
 
     (options, args) = parser.parse_args()
 
     if showHelp:
         parser.print_help()
         exit(1)
 
     setDebug(options.debug)
     return args, options
 
 def cbToolCli():
     " run various tools from the command line "
     args, options = cbToolCli_parseArgs()
 
-    if len(args)<=1:
+    if len(args)==0 or (len(args)==1 and args[0] != "quickgenes"):
         cbToolCli_parseArgs(showHelp=True)
         sys.exit(1)
 
     cmd = args[0]
 
-    cmds = ["mtx2tsv", "matCatCells", "matCatGenes" "metaCat", "reorder", "cxg"]
+    cmds = ["mtx2tsv", "matCatCells", "matCatGenes" "metaCat", "reorder", "cxg", "quickgenes"]
 
     if cmd=="mtx2tsv":
         mtxFname = args[1]
         geneFname = args[2]
         barcodeFname = args[3]
         outFname = args[4]
         mtxToTsvGz(mtxFname, geneFname, barcodeFname, outFname)
     elif cmd=="matCatCells":
         inFnames = args[1:-1]
         outFname = args[-1]
         matCat(inFnames, outFname)
     elif cmd=="matCatGenes":
         inFnames = args[1:-1]
         outFname = args[-1]
         matCatGenes(inFnames, outFname)
     elif cmd=="cxg":
         subCmd = args[1]
         if subCmd=="allDatasets":
             printRows(getCxgAllDatasets())
         elif subCmd=="asset":
             dsIds = args[2:]
             printRows(getCxgAssets(dsIds))
         elif subCmd=="allAssets":
             getCxgAllAssets()
         elif subCmd=="cp":
             dsId = args[2]
             assetId = args[3]
             outFname = args[4]
             url = getCxgAssetUrl(dsId, assetId)
             cmd = ["curl", url, "--output", outFname]
             subprocess.run(cmd)
         elif subCmd=="desc":
             collId = args[2]
             getCxgColl(collId)
 
     elif cmd=="metaCat" or cmd=="reorder":
         inFnames = args[1:-1]
         outFname = args[-1]
 
         if len(inFnames)==0:
             errAbort("You must provide at least two filenames: one input filename and one output filename")
 
         if len(inFnames)==1 and isfile(outFname):
             errAbort("You provided only one input file and the output file already exists. To avoid "
             "accidentally overwriting a file, please either remove the output file or provide at least "
             "three filenames: two input files and one output file. The second input file can be 'none', "
             "which will suppress this error message.")
 
         if len(inFnames)==2 and inFnames[1]=="none":
             inFnames.pop()
 
         metaCat(inFnames, outFname, options)
+
+    elif cmd=="quickgenes":
+        generateQuickGenes(".")
+
     else:
         errAbort("Command %s is not a valid command. Valid commands are: %s" % (cmd, ", ".join(cmds)))
 
 def getLabels(l):
     " given a list of dicts, return a |-sep list of 'label' values "
     labels = []
     for d in l:
         if isinstance(d, str):
             labels.append(d)
         else:
             labels.append(d["label"])
     return "|".join(labels)
 
 def printRows(iterator, headDone=False):
     " print all rows from an iterator of OrderedDicts; write the header line unless headDone is set "
     for row in iterator:
         if headDone is False:
             print("\t".join(row.keys()))
             headDone = True
         print("\t".join(row.values()))
 
 def getCxgAssetUrl(datasetId, assetId):
     " cxg has signed links, get one for an asset ID "
     url = 'https://api.cellxgene.cziscience.com/dp/v1/datasets/%s/asset/%s' % (datasetId, assetId)
     import requests
     ret = requests.post(url=url).json()
     # keys: dataset_id, file_name, presigned_url
     return ret["presigned_url"]
 
 def getCxgAllDatasets():
     " yield all cxg datasets as ordered dicts "
     import requests
     url = "https://api.cellxgene.cziscience.com/dp/v1/datasets/index"
     datasets = requests.get(url=url).json()
     attrs = ['id', 'name', 'organism', 'collection_id', 'assay', 'explorer_url', 'is_primary_data', \
             'cell_count', 'mean_genes_per_cell',
             'development_stage', 'development_stage_ancestors', \
             'disease', \
             'published_at', 'revised_at', 'schema_version', 'self_reported_ethnicity', 'sex', 'tissue', \
             'tissue_ancestors', 'cell_type', 'cell_type_ancestors']
     for dataset in datasets:
         row = OrderedDict()
         for attr in attrs:
             val = dataset.get(attr, "UNDEFINED")
             if isinstance(val, list):
                 strVal = getLabels(val)
             else:
                 strVal = str(val)
             row[attr] = strVal
         yield row
 
 def getCxgAllAssets():
     " print the assets of all cxg datasets "
     headDone = False
     for ds in getCxgAllDatasets():
         printRows(getCxgAssets([ds["id"]]), headDone)
         headDone = True
 
 def getCxgAssets(dsIds, headDone=False):
     " yield the assets of a list of cxg datasets as ordered dicts "
     import requests
     attrs = ['dataset_id', 'dataset', 'filename', 'filetype', 'id', 's3_uri', 'created_at', 'updated_at', 'user_submitted']
 
     for dsId in dsIds:
         url = "https://api.cellxgene.cziscience.com/dp/v1/datasets/%s/assets" % dsId
         ret = requests.get(url=url).json()
         for asset in ret["assets"]:
             #row = []
             row = OrderedDict()
             for attr in attrs:
                 val = asset[attr]
                 if attr == "dataset":
                     val = val["name"]
                 #row.append(str(val))
                 row[attr] = str(val)
             #print("\t".join(row))
             yield row
 
 def getCxgColl(collId):
     " describe a cxg collection - stub, not implemented yet "
 
 def matCat(inFnames, outFname):
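     " merge matrices side-by-side: inputs must have identical genes in the same order; the cell columns of each input are appended after those of the previous one "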
     tmpFname = outFname+".tmp"
     ofh = openFile(tmpFname, "w")
 
     ifhs = []
 
     sep = "\t"
     if ".csv" in inFnames[0]:
         sep = ","
 
     for fn in inFnames:
         ifhs.append( openFile(fn) )
 
     headers, colCounts = getAllFields(ifhs, sep)
     ofh.write("\t".join(headers))
     ofh.write("\n")
 
     for i, ifh in enumerate(ifhs):
         logging.info("File %s: %d columns with values" % (ifhs[i].name, colCounts[i]-1)) 
 
     fileCount = len(inFnames)
     progressStep = 1000
 
     doProcess = True
     lineCount = 0
 
     while doProcess:
         geneId = None
 
         lineCount += 1
         if lineCount % progressStep == 0:
             logging.info("Processed %d rows" % (lineCount))
 
         for i, ifh in enumerate(ifhs):
             lineStr = ifh.readline()
             # check if we're at EOF
             if lineStr=='':
                 doProcess = False
                 break
 
             fields = lineStr.rstrip('\r\n').split(sep)
             fields = [x.strip('"') for x in fields]
             if (len(fields)!=colCounts[i]): # check number of columns against header count
                 raise Exception("Illegal number of fields: file %s, line %d, field count: %d, expected %d" % 
                     (ifh.name, lineCount, len(fields), colCounts[i]))
 
             # check the gene ID
             if i==0: # get the gene ID from the first file
                 geneId = fields[0]
                 allVals = [geneId]
             else:
                 #assert(geneId == fields[0]) # if this fails, then the rows are not in the same order
                 if geneId != fields[0]:
                     print("Error: File %s has gene IDs that is out of order." % ifh.name)
                     print("Expected geneID: %s, got geneID: %s" % (geneId, fields[0]))
                     sys.exit(1)
 
             allVals.extend(fields[1:])
 
         ofh.write("\t".join(allVals))
         ofh.write("\n")
 
     # make sure that we've read through all files
     for ifh in ifhs:
         assert(ifh.readline()=='') # if this fails, a file still has lines left to read
 
     ofh.close()
     moveOrGzip(tmpFname, outFname)
     logging.info("Wrote %d lines (not counting header)" % lineCount)
 
 def matCatGenes(inFnames, outFname):
     " cat the lines several tab-sep or csv files, check headers and skip the header lines "
     tmpFname = outFname+".tmp"
     ofh = openFile(tmpFname, "w")
 
     otherHeaders = None
     lineCount = 0
     for fn in inFnames:
         prefix = ""
         if "=" in fn:
             fn, prefix = fn.split("=")
 
         sep = "\t"
         if ".csv" in fn:
             sep = ","
 
         headers = None
         logging.info("Reading %s" % fn)
 
         doReorder = False
         fieldOrder = None
 
         for line in openFile(fn):
             row = line.rstrip("\n\r").split(sep)
             row = [x.strip('"') for x in row]
 
             if otherHeaders is None:
                 otherHeaders = row
                 ofh.write("\t".join(otherHeaders))
                 ofh.write("\n")
             if headers is None:
                 headers = row
                 if (headers != otherHeaders): # all files must have identical columns = identical header line
                     logging.warn("Headers are not identical, need to re-order columns, this will be pretty slow")
                     doReorder = True
 
                     fieldOrder = [0]
                     for h in otherHeaders[1:]:
                         idx = headers.index(h)
                         assert(idx!=0) # internal logic error. Should never happen.
                         fieldOrder.append( idx ) # if this fails, then one file has a column that the other one doesn't have. Not clear what should happen then -> just fail.
                 continue
 
             if prefix!="":
                 row[0] = prefix+row[0]
 
             if doReorder:
                 ofh.write("\t".join([row[i] for i in fieldOrder]))
             else:
                 ofh.write("\t".join(row))
 
             ofh.write("\n")
             lineCount += 1
 
         logging.info("Wrote %d lines to output file (not counting header)" % lineCount)
 
     ofh.close()
     logging.info("Compressing %s", outFname)
     moveOrGzip(tmpFname, outFname)
 
 def reorderFields(row, firstFields, skipFields, onlyFirst):
     " reorder the row to have firstFields first "
     #logging.debug("Reordering row to have %s fields first" % firstFields)
     newRow = [row[0]]
     for idx in firstFields:
         newRow.append(row[idx])
 
     if not onlyFirst:
         for i in range(1, len(row)):
             if i not in firstFields and i not in skipFields:
                 newRow.append(row[i])
 
     return newRow
 
 def metaCat(inFnames, outFname, options):
     " merge all tsv/csv columns in inFnames into a new file, outFname. Column 1 is ID to join on. "
     logging.debug("In fnames: %s, out fname: %s" % (repr(inFnames), repr(outFname)))
     allHeaders = ["cellId"]
     allRows = defaultdict(dict) # cellId -> fileIdx -> list of non-ID fields
     fieldCounts = {} # fileIdx -> number of non-ID fields
     allIds = set() # set with all cellIds
 
     firstFields = []
     if options.order!="" and options.order is not None:
         firstFields = options.order.split(",")
 
     delFields = []
     if options.delFields!="" and options.delFields is not None:
         delFields = options.delFields.split(",")
 
     for fileIdx, fname in enumerate(inFnames):
         logging.info("Reading %s" % fname)
         headers = None
         rowCount = 0
         for row in lineFileNextRow(fname, headerIsRow=True):
             if headers is None:
                 headers = row[1:]
                 logging.info("Reading %s, %d columns:  %s" % (fname, len(headers), repr(headers)))
                 allHeaders.extend(headers)
                 fieldCounts[fileIdx] = len(headers)
                 continue
 
             rowCount +=1
             cellId = row[0]
             allIds.add(cellId)
             allRows[cellId][fileIdx] = row[1:]
         logging.info("Read %d rows" % rowCount)
 
     nonCharRe = re.compile(r'[^a-zA-Z0-9]')
 
     if options.fixDots:
         # first create a map realId -> rId
         rToReal = {}
         for cellId, rowData in iterItems(allRows):
             rId = nonCharRe.sub(".", cellId)
             if rId!=cellId and rId in allRows:
                 assert(rId not in rToReal) # Uh-oh! Mapping R <-> realId is not simple. May need some manual work.
                 rToReal[rId]=cellId
         logging.info("Found %d cell IDs that look like they've been mangled by R" % (len(rToReal)))
 
         # now merge the R-id entries into the real ID entries
         for rId, realId in iterItems(rToReal):
             allRows[realId].update(allRows[rId])
         for rId in rToReal.keys():
             del allRows[rId]
         logging.info("Merged the R-mangled cell IDs back into the real IDs")
 
     # find the field indices of the fields we're putting first
     firstFieldIdx = []
     for h in firstFields:
         try:
             idx = allHeaders.index(h)
             firstFieldIdx.append(idx)
         except ValueError:
             logging.warning("Field %s specified on command line --order is not in the meta data file" % repr(h))
 
     # and those of fields we will skip
     delFieldIdx = []
     for h in delFields:
         try:
             idx = allHeaders.index(h)
         except ValueError:
             errAbort("Field %s specified on command line with --del is not in the meta data file" % repr(h))
         delFieldIdx.append(idx)
 
     if len(firstFields)!=0:
         logging.info("Order of fields that come first: %s" % firstFields)
     if len(delFields)!=0:
         logging.info("Removing these fields: %s" % delFields)
 
 
     if outFname=="stdout" or outFname=="/dev/stdout":
         ofh = sys.stdout
     else:
         tmpFname = outFname+".tmp"
         ofh = openFile(tmpFname, "w")
 
     onlyFirst = options.only
 
     allHeaders = reorderFields(allHeaders, firstFieldIdx, delFieldIdx, onlyFirst)
     ofh.write("\t".join(allHeaders))
     ofh.write("\n")
 
     for cellId, rowData in iterItems(allRows):
         row = [cellId]
         for fileIdx in range(0, len(inFnames)):
             if fileIdx in rowData:
                 row.extend(rowData[fileIdx])
             else:
                 row.extend([""]*fieldCounts[fileIdx])
 
         row = reorderFields(row, firstFieldIdx, delFieldIdx, onlyFirst)
         ofh.write("\t".join(row))
         ofh.write("\n")
 
     if ofh!=sys.stdout:
         ofh.close()
         renameFile(tmpFname, outFname)
     logging.info("Output field order is: %s" % allHeaders)
     logging.info("Wrote %d lines (not counting header)" % len(allRows))
 
 def importCellrangerMatrix(inDir, outDir):
     " convert the cellranger 2 or 3 matrix, use the .mtx or .h5 file, whatever is available "
     outExprFname = join(outDir, "exprMatrix.tsv.gz")
     # cellranger 1/2: filtered_gene_bc_matrices/<db>/matrix.mtx, not gzipped
     mask1 = join(inDir, "filtered_gene_bc_matrices/*/matrix.mtx")
     # cellranger 3: filtered_feature_bc_matrix/matrix.mtx.gz, gzipped
     mask2 = join(inDir, "filtered_feature_bc_matrix/matrix.mtx.gz")
     # h5 file as fallback, requires scanpy
     # cellranger3: filtered_feature_bc_matrix.h5
     mask3 = join(inDir, "*filtered_*matri*.h5") # should work for cr 1, 2 and 3
     matFnames1 = glob.glob(mask1)
     matFnames2 = glob.glob(mask2)
     matFnames3 = glob.glob(mask3)
     logging.info("Looking for %s (cellranger 1/2) or %s (cellranger3) or %s" % (mask1, mask2, mask3))
 
     # ugly code warning
     if len(matFnames1)!=0:
         # cellranger 1/2 mtx files are not gzipped
         assert(len(matFnames1)==1)
         matFname = matFnames1[0]
         logging.info("Found %s" % matFname)
         barcodeFname = matFname.replace("matrix.mtx", "barcodes.tsv")
         geneFname = matFname.replace("matrix.mtx", "genes.tsv")
         mtxToTsvGz(matFname, geneFname, barcodeFname, outExprFname)
     elif len(matFnames2)!=0:
         # cellranger 3 mtx files are gzipped
         assert(len(matFnames2)==1)
         matFname = matFnames2[0]
         logging.info("Found %s" % matFname)
         barcodeFname = matFname.replace("matrix.mtx.gz", "barcodes.tsv.gz")
         geneFname = matFname.replace("matrix.mtx.gz", "features.tsv.gz")
         mtxToTsvGz(matFname, geneFname, barcodeFname, outExprFname)
     elif len(matFnames3)!=0:
         # h5 files of cellranger 1/2/3 are read via scanpy
         logging.info("Found %s" % matFnames3)
         assert(len(matFnames3)==1)
         matFname = matFnames3[0]
 
         logging.info("Reading matrix %s" % matFname)
         adata = readMatrixAnndata(matFname)
         anndataMatrixToTsv(adata, outExprFname)
     else:
         errAbort("Could not find matrix, neither as %s, %s nor %s" % (mask1, mask2, mask3))
         logging.info("Looking for %s" % mask3)
 
     return matFname
 
 def crangerToCellbrowser(datasetName, inDir, outDir, noMat):
     " convert cellranger output to a cellbrowser directory "
     makeDir(outDir)
     # copy over the clusters
     clustFname = join(inDir, "analysis/clustering/graphclust/clusters.csv")
     if not isfile(clustFname):
         logging.warn("Cannot find %s" % clustFname)
         clustFname = join(inDir, "analysis/kmeans/10_clusters/clusters.csv")
         logging.warn("Using%s instead" % clustFname)
     metaFname = join(outDir, "meta.csv")
     shutil.copy(clustFname, metaFname)
 
     # copy over the t-SNE coords
     tsneFname = join(inDir, "analysis/tsne/2_components/projection.csv")
     if not isfile(tsneFname):
         tsneFname = join(inDir, "analysis/tsne/projection.csv")
     coordFname = join(outDir, "tsne.coords.csv")
     shutil.copy(tsneFname, coordFname)
 
     # copy over the markers
     dgeFname = join(inDir, "analysis/diffexp/graphclust/differential_expression.csv")
     if not isfile(dgeFname):
         logging.warn("Not found: %s" % dgeFname)
         dgeFname = join(inDir, "analysis/kmeans/10_clusters/differential_expression.csv")
         logging.warn("Using instead: %s" % dgeFname)
 
     markerFname = join(outDir, "markers.tsv")
     geneFname = join(outDir, "gene2sym.tsv")
     crangerSignMarkers(dgeFname, markerFname, geneFname, 0.01, 100)
 
     if not noMat:
         matFname = importCellrangerMatrix(inDir, outDir)
     else:
         matFname = "unknown"
 
     confName = join(outDir, "cellbrowser.conf")
     coordDescs = [{"file":"tsne.coords.csv", "shortLabel":"CellRanger t-SNE"}]
     confArgs =  {"meta" : "meta.csv", "clusterField" : "Cluster", "tags" : ["10x"]}
     writeCellbrowserConf(datasetName, coordDescs, confName, confArgs)
 
     crangerWriteMethods(inDir, outDir, matFname)
     generateHtmls(datasetName, outDir)
 
 def crangerWriteMethods(inDir, outDir, matFname):
     htmlFname = join(outDir, "methods.html")
     if isfile(htmlFname):
         logging.info("%s exists, not overwriting" % htmlFname)
         return
 
     import csv
     csvMask = join(inDir, "*_metrics_summary.csv")
     csvFnames = glob.glob(csvMask)
 
     jsonFname = join(inDir, "summary.json")
     if len(csvFnames)==0:
         if isfile(jsonFname):
             qcVals = json.load(open(jsonFname))
         else:
             logging.warn("Cannot find %s nor %s, not writing %s" % (csvMask, jsonFname, htmlFname))
             return
     else:
         qcVals = list(csv.DictReader(open(csvFnames[0])))[0]
 
     ofh = open(htmlFname, "w")
     ofh.write("<p>This dataset was imported from a CellRanger analysis directory with cbCellranger.</p>\n")
     ofh.write("<p><b>QC metrics reported by CellRanger:</b></p>\n")
 
     for key, value in iterItems(qcVals):
         ofh.write("<i>%s:</i> %s<br>\n" % (key, str(value)))
     ofh.close()
     logging.info("Wrote %s" % ofh.name)
 
 def crangerSignMarkers(dgeFname, markerFname, geneFname, maxPval, maxGenes):
     " convert cellranger diff exp file to markers.tsv file "
     # Old Cellranger 1 format:
     # Gene ID,Gene Name,Cluster 1 Weight,Cluster 1 UMI counts/cell,Cluster 2 Weight,Cluster 2 UMI counts/cell,Cluster 3 Weight,Cluster 3 UMI counts/cell,Cluster 4 Weight,Cluster 4 UMI counts/cell,Cluster 5 Weight,Cluster 5 UMI counts/cell,Cluster 6 Weight,Cluster 6 UMI counts/cell,Cluster 7 Weight,Cluster 7 UMI counts/cell,Cluster 8 Weight,Cluster 8 UMI counts/cell,Cluster 9 Weight,Cluster 9 UMI counts/cell,Cluster 10 Weight,Cluster 10 UMI counts/cell
 
     ofh = open(markerFname, "w")
     ofh.write("cluster\tgene\tAdj. P-Value\tLog2 fold change\tMean UMI Counts\n")
 
     clusterToGenes = defaultdict(list)
 
     # read the significant markers and their p-Values
     logging.info("Reading %s" % dgeFname)
     ifh = open(dgeFname)
 
     # determine file format
     line1 = ifh.readline()
     # cellranger 3
     fileVersion = None
     if line1.startswith("Feature ID"):
         fieldsProCluster = 3
         fileVersion = 3
     # cellranger 1 or 2
     elif line1.startswith("Gene ID"):
         if "Cluster 1 Weight" in line1:
             fieldsProCluster = 2
             fileVersion = 1
         else:
             fieldsProCluster = 3
             fileVersion = 2
     assert(fileVersion is not None) # unknown cellranger version?
 
     for line in ifh:
         row = line.rstrip("\n\r").split(",")
         geneId = row[0]
         sym = row[1]
         clusterCount = int((len(row)-2) / fieldsProCluster)
         for clusterIdx in range(0, clusterCount):
             startField = (clusterIdx*fieldsProCluster)+2
             if fileVersion>=2:
                 mean = float(row[startField])
                 fc = float(row[startField+1])
                 pVal = float(row[startField+2])
             else:
                 fc = float(row[startField])
                 mean = float(row[startField+1])
                 pVal = 0.0001
 
             if pVal < maxPval:
                 clusterToGenes[clusterIdx+1].append((fc, mean, pVal, sym))
 
     # write out the markers
     for clusterId, clusterGenes in iterItems(clusterToGenes):
         clusterGenes.sort(key=operator.itemgetter(2)) # sort by p-value, most significant markers first
         maxIdx = min(maxGenes, len(clusterGenes))
         for i in range(0, maxIdx):
             fc, mean, pVal, sym = clusterGenes[i]
             ofh.write("%d\t%s\t%g\t%f\t%f\n" % (clusterId, sym, pVal, fc, mean))
 
     ofh.close()
     logging.info("Wrote %s" % ofh.name)
 
 def cbCellrangerCli():
     args, options = cbCellrangerCli_parseArgs()
 
     if options.outDir is None or options.inDir is None or options.datasetName is None:
         logging.error("You have to specify at least an input and an output directory and a dataset name.")
         cbCellrangerCli_parseArgs(showHelp=True)
 
     crangerToCellbrowser(options.datasetName, options.inDir, options.outDir, options.noMat)
 
 def cbCellrangerCli_parseArgs(showHelp=False):
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] -i cellRangerDir -o outputDir - convert the cellranger output to cellbrowser format and create a cellranger.conf file
 
     """)
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true",
         help="show debug messages")
 
     parser.add_option("-i", "--inDir", dest="inDir", action="store", help="input folder with the cellranger analysis output. This is the directory with the two directories 'analysis' and 'filtered_gene_bc_matrices'")
     parser.add_option("-o", "--outDir", dest="outDir", action="store", help="output directory")
     #parser.add_option("-g", "--geneSet", dest="geneSet", action="store", help="geneset, e.g. gencode28 or gencode-m13 or similar. Default: %default", default="gencode24")
     parser.add_option("-n", "--name", dest="datasetName", action="store", help="name of the dataset. No spaces or special characters.")
     parser.add_option("-m", "--noMat", dest="noMat", action="store_true", help="do not export the matrix again, saves some time if you changed something small since the last run")
 
     (options, args) = parser.parse_args()
 
     if showHelp:
         parser.print_help()
         exit(1)
 
     setDebug(options.debug)
 
     return args, options
 
 def cbImportScanpy_parseArgs(showHelp=False):
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] -i inFilename -o outDir - convert Scanpy AnnData object to cellbrowser. inFilename can be an .h5ad or .loom file. 
     Exports raw.X by default, or .X alternatively, see --proc and --layer.
     Exports all meta data. Writes .mtx.gz by default, or alternatively .tsv (see --matrixFormat).
 
     Example:
     - %prog -i pbmc3k.h5ad -o pbmc3kScanpy - convert pbmc3k to directory with tab-separated files
     """)
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true",
         help="show debug messages")
 
     parser.add_option("-i", "--inFile", dest="inFile", action="store",
         help="input .h5ad file. Required parameter")
 
     parser.add_option("-o", "--outDir", dest="outDir", action="store",
         help="Output directory. Required parameter")
 
     parser.add_option("-n", "--name", dest="datasetName", action="store",
         help="Dataset name for generated cellbrowser.conf. If not specified, the last component of -o will be used")
 
     parser.add_option("", "--htmlDir", dest="htmlDir", action="store",
         help="do not only convert to tab-sep files but also run cbBuild to"
             "convert the data and add the dataset under htmlDir")
 
     parser.add_option("-p", "--port", dest="port", action="store", type="int",
             help="only with --htmlDir: start webserver on port to serve htmlDir")
 
     parser.add_option("", "--markerField", dest="markerField", action="store",
             help="name of the marker genes field, default: %default", default="rank_genes_groups")
 
     parser.add_option("", "--clusterField", dest="clusterField", action="store",
             help="if no marker genes are present, use this field to calculate them. Default is to try a list of common field names, like 'Cluster' or 'louvain' and a few others")
 
     parser.add_option("-f", "--matrixFormat", dest="matrixFormat", action="store", default="mtx",
             help="Output matrix file format. 'mtx' or 'tsv'. default: mtx",)
 
     parser.add_option("-m", "--skipMatrix", dest="skipMatrix", action="store_true",
         help="do not convert the matrix, saves time if the same one has been exported before to the "
         "same directory")
 
     parser.add_option("", "--skipMarkers", dest="skipMarkers", action="store_true",
             help="do not try to calculate cluster-specific marker genes. Only useful for the rare datasets where a bug in scanpy crashes the marker gene calculation.")
 
     parser.add_option("", "--proc", dest="useProc", action="store_true",
         help="when exporting, do not use the raw input data, instead use the normalized and corrected matrix, aka 'ad.X'. This has no effect if the anndata.raw attribute is not present in the anndata object")
 
     parser.add_option("-l", "--layer", dest="layer", action="store",
             help="Specify the layer to export. This takes precedence over --proc.")
 
     parser.add_option("", "--atac", dest="atac", action="store",
             help="Indicate that this is an ATAC dataset and specify genome assembly and gene model, for example 'hg38.gencode-42'. Use 'cbGenes ls' to show the list of all available gene models on your disk or cbGenes fetch to download other ones. This will only be passed through to cellbrowser.conf.")
 
     (options, args) = parser.parse_args()
 
     if showHelp:
         parser.print_help()
         exit(1)
 
     setDebug(options.debug)
     return args, options
 
 def cbImportScanpyCli():
     " convert h5ad to directory "
     args, options = cbImportScanpy_parseArgs()
 
     if len(args)==0 and not options.inFile and not options.outDir:
         cbImportScanpy_parseArgs(showHelp=True)
         sys.exit(1)
 
     inFname = options.inFile
     outDir = options.outDir
     if None in [inFname, outDir]:
         errAbort("You need to specify at least an input .h5ad file and an output directory")
 
     datasetName = options.datasetName
     if datasetName is None:
         datasetName = basename(abspath(outDir))
 
     markerField = options.markerField
     clusterField = options.clusterField
     skipMarkers = options.skipMarkers
     matrixFormat = getMatrixFormat(options)
     atac = options.atac
     layer = options.layer
 
     if atac is not None and "." not in atac:
         errAbort("the --atac option needs to be in the format <assembly>.<genemodel> e.g. hg38.gencode-42")
 
     ad = readMatrixAnndata(inFname, reqCoords=True)
 
     scanpyToCellbrowser(ad, outDir, datasetName, skipMatrix=options.skipMatrix, useRaw=(not options.useProc),
             markerField=markerField, clusterField=clusterField, skipMarkers=skipMarkers, matrixFormat=matrixFormat, atac=atac, layer=layer)
 
     copyFileIfDiffSize(inFname, join(outDir, basename(inFname)))
     generateDataDesc(datasetName, outDir, other={"supplFiles": [{"label":"Scanpy H5AD", "file":basename(inFname)}]})
 
     if options.port and not options.htmlDir:
         errAbort("--port requires --htmlDir")
 
     if options.htmlDir:
         build(outDir, options.htmlDir, port=options.port)