7c04a101e90c131afccf4d973d2e76da384543b8 max Fri May 16 11:11:00 2025 -0700 do not stop on invalid genes in quickgenes file. skip invalid genes in marker file diff --git src/cbPyLib/cellbrowser/cellbrowser.py src/cbPyLib/cellbrowser/cellbrowser.py index 925a311..3c77bf4 100755 --- src/cbPyLib/cellbrowser/cellbrowser.py +++ src/cbPyLib/cellbrowser/cellbrowser.py @@ -1,7375 +1,7383 @@ #!/usr/bin/env python # this library mostly contains functions that convert tab-sep/loom/h5ad/mtx files # (=single cell expression matrix and meta data) into the binary format that is read by the # javascript viewer cbWeb/js/cellbrowser.js and cbData.js. # Helper functions here allow importing data from other tools, e.g. cellranger or scanpy. # requires at least python2.6, version tested was 2.6.6 # should work with python2.5, not tested # works on python3, version tested was 3.6.5 # all functions related to cbScanpy() require python3, as scanpy requires at least python3 import logging, sys, optparse, struct, json, os, string, shutil, gzip, re, unicodedata import zlib, math, operator, doctest, copy, bisect, array, glob, io, time, subprocess import hashlib, timeit, datetime, keyword, itertools, os.path, urllib, platform from collections import namedtuple, OrderedDict from os.path import join, basename, dirname, isfile, isdir, relpath, abspath, getsize, getmtime, expanduser from time import gmtime, strftime import csv try: # python3 from urllib.parse import urljoin from urllib.request import urlopen HTTPERR = urllib.error.HTTPError except ImportError: # python2 from urlparse import urljoin from urllib2 import urlopen import urllib2 HTTPERR = urllib2.URLError try: # > python3.3 from collections.abc import Mapping except: # < python 3.3 from collections import Mapping try: # python2.7+ from collections import defaultdict, Counter except ImportError: # python2.6 has no collections.abc, defaultdict or Counter yet from backport_collections import defaultdict, Counter # error? -> pip2 install backport-collections from collections import Mapping # We do not require numpy but numpy is around 30-40% faster in serializing arrays # So use it if it's present numpyLoaded = True try: import numpy as np except: numpyLoaded = False logging.debug("Numpy could not be loaded. This is fine but matrix export may be 1/3 slower.") # older numpy versions don't have tobytes() if numpyLoaded: try: np.ndarray.tobytes except: numpyLoaded = False logging.error("Numpy version too old. Falling back to normal Python array handling.") isPy3 = False if sys.version_info >= (3, 0): isPy3 = True # directory to static data files, e.g. gencode tables # By default, this is ~/cellbrowserData, or alternatively /usr/local/share/cellbrowser # or the directory in the environment variable CBDATA, see findCbData() dataDir = None # the default html dir, used if the --htmlDir option is set but empty # this variable is initialized below (no forward declaration in Python) # just before cbBuild_parseArgs defOutDir = None CBHOMEURL = "https://cells.ucsc.edu/downloads/cellbrowserData/" CBHOMEURL_TEST = "https://cells-test.gi.ucsc.edu/downloads/cellbrowserData/" # a special value that is used for both x and y to indicate that the cell should not be shown # must match the same value in maxPlot.js HIDDENCOORD = 12345 # special value representing NaN in floating point arrays # must match the same value in cellBrowser.js FLOATNAN = float('-inf') # NaN and sorting does not work. 
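# Illustrative sketch of why -inf works as the NaN placeholder: Python cannot
# reliably order lists that contain float('nan'), but float('-inf') compares
# smaller than every real value, so encoded missing values always sort to the
# front, matching the special handling in cellBrowser.js. Toy values below are
# hypothetical.
_nanSortExample = [3.5, FLOATNAN, 0.0, 7.2]      # FLOATNAN stands in for a missing value
assert sorted(_nanSortExample)[0] == FLOATNAN    # the "NaN" bin always comes first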
we want NaN always to be first, so encode as -inf # special value representing NaN in integer arrays, again, we want this to be first after sorting # must match the same value in cellBrowser.js INTNAN = -2**16 # how many md5 characters to keep in version identifiers. We load all files using their md5 to get around # internet browser caching MD5LEN = 10 # list of tags that are required: # for cellbrowser.conf of a dataset reqTagsDataset =['coords', 'meta'] # for cellbrowser.conf of a collection reqTagsColl =['shortLabel'] coordLabels = { # generic igraph neighbor-based layouts "fa": "ForceAtlas2", "fr": "Fruchterman Reingold", "grid_fr": "Grid Fruchterman Reingold", # deactivated for now due to https://github.com/igraph/python-igraph/issues/152 "kk": "Kamadi Kawai", "lgl": "Large Graph Layout", # looks useless "drl": "DrL Distributed Recursive Layout", "rt": "Reingold Tilford tree", # doesn't look useful # special scanpy layouts "tsne" : "t-SNE", "umap" : "UMAP", "pagaFa" : "PAGA/ForceAtlas2", "pagaFr" : "PAGA/Fruchterman-Reingold", "phate" : "PHATE" } # default layouts if you specify "all" in scanpy.conf recommendedLayouts = ["fa", "fr", "kk", "drl", "tsne", "umap", "pagaFa", "phate"] # give some meta fields better names metaLabels = { "louvain" : "Louvain Cluster", "percent_mito" : "Percent Mitochond.", "n_genes" : "Expressed Genes", "n_counts" : "UMI Count" } doForce = False # any meta fields with errors get collected here, so we are not stopping just for one field errorFields = [] # ==== functions ===== debugDone = False debugMode = False def setDebug(doDebug): " activate debugging if needed " global debugDone global debugMode if debugDone: return if doDebug: logging.basicConfig(level=logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG) debugMode = True else: logging.basicConfig(level=logging.INFO) logging.getLogger().setLevel(logging.INFO) debugDone = True def isDebugMode(): return debugMode def sendDebugToFile(fname): return debugMode logger = logging.getLogger('') logger.setLevel(logging.DEBUG) #By default, logs all messages # log to console consLevel = logging.INFO ch = logging.StreamHandler() #StreamHandler logs to console ch.setLevel(consLevel) ch_format = logging.Formatter('%(asctime)s - %(message)s') ch.setFormatter(ch_format) # also always log everything to file fh = logging.FileHandler(fname, "w+") # w+ means to overwrite the old file, "a" = append #fh.setLevel(logging.DEBUG) fh.setLevel(logging.INFO) fh_format = logging.Formatter('%(asctime)s - %(lineno)d - %(levelname)-8s - %(message)s') fh.setFormatter(fh_format) logger.addHandler(fh) logger.addHandler(ch) logging.info("Logging messages to %s" % fname) def makeDir(outDir): if not isdir(outDir): logging.info("Creating %s" % outDir) os.makedirs(outDir) def renameFile(oldName, newName): " windows doesn't accept if newName already exists " logging.debug("Renaming %s -> %s" % (oldName, newName)) if isfile(newName): os.remove(newName) os.rename(oldName, newName) def errAbort(msg): logging.error(msg) sys.exit(1) def iterItems(d): " wrapper for iteritems for all python versions " if isPy3: return d.items() else: return d.iteritems() def nextEl(d): " wrapper for next() for all python versions " if isPy3: return next(d) else: return d.next() def splitOnce(s, sep, splitCount=1): " split s only once on sep, for all python versions " if isPy3: tup = s.split(sep, maxsplit=splitCount) else: tup = string.split(s, sep, maxsplit=splitCount) return tup def which(prog): " return path of program in PATH " try: import distutils.spawn 
return distutils.spawn.find_executable(prog) except: return shutil.which(prog) assert(False) def findCbData(): """ return the name of the dataDir directory: This is the directory in the environment variable CBDATA. If this variable is not set, we use or /usr/local/share/cellbrowser, if any of these exist. If all of that fails, we use ~/cellbrowserData. """ global dataDir if dataDir is not None: return dataDir envCbData = os.environ.get("CBDATA") if envCbData is not None: logging.debug("CBDATA variable found, points to %s" % envCbData) dataDir = envCbData else: baseDir = dirname(__file__) # = directory of this library libRelDir = abspath(join(baseDir, "../../../cellbrowserData")) for altDir in [libRelDir, "/usr/local/share/cellbrowser", "/usr/local/cellbrowser"]: if isdir(altDir): logging.debug("Found data directory in %s" % altDir) dataDir = altDir break if dataDir is None: dataDir = expanduser("~/cellbrowserData") logging.debug("Using %s as data directory" % dataDir) if not isdir(dataDir): makeDir(dataDir) return dataDir def downloadUrlBinary(remoteUrl): " open URL, slurp in all data and return the resulting binary string " try: if sys.version_info >= ( 2, 7, 9 ): # newer python versions check the https certificate. It seems that UCSC uses certificates # that are not part of the cert database on some linux distributions. # To avoid any problems, we're switching off cert verification import ssl data = urlopen(remoteUrl, context=ssl._create_unverified_context()).read() else: data = urlopen(remoteUrl).read() except HTTPERR: logging.error("Cannot download %s" % remoteUrl) data = None return data def downloadUrlLines(url): " open URL, slurp in all data and return a list of the text lines " data = downloadUrlBinary(url) if data is None: errAbort("Cannot download %s" % url) if url.endswith(".gz"): if isPy3: data = gzip.decompress(data) else: # weird hack, but works, from https://stackoverflow.com/questions/41432800/decompress-zip-string-in-python-2-7 from gzip import GzipFile from StringIO import StringIO data = GzipFile(fileobj=StringIO(data)).read() lines = data.splitlines() lines = [l.decode("latin1") for l in lines] return lines def getDownloadsUrl(): " return the big static file downloads URL on cells.ucsc.edu " cbHomeUrl = CBHOMEURL if getConfig("useTest"): logging.debug("Using test server") cbHomeUrl = CBHOMEURL_TEST return cbHomeUrl def downloadStaticFile(remotePath, localPath): " download a file from CBHOMEURL/ to localPath " localDir = dirname(localPath) makeDir(localDir) cbHomeUrl = getDownloadsUrl() remoteUrl = urljoin(cbHomeUrl, remotePath) logging.info("Downloading %s to %s..." % (remoteUrl, localPath)) data = downloadUrlBinary(remoteUrl) if data is None: return False localTmp = localPath+".download" ofh = open(localTmp, "wb") ofh.write(data) ofh.close() renameFile(localTmp, localPath) return True def getStaticPath(relPath): " return full path to a static file in the dataDir directory " dataDir = findCbData() absPath = join(dataDir, relPath) return absPath def getStaticFile(relPath, verbose=False): """ get the full path to a static file in the dataDir directory (~/cellbrowserData or $CBDATA, by default, see above). 
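# Illustrative note on the lookup order implemented by findCbData() above:
#   1. the directory in $CBDATA, if that environment variable is set
#   2. otherwise the first existing directory among
#      <library dir>/../../../cellbrowserData, /usr/local/share/cellbrowser,
#      /usr/local/cellbrowser
#   3. otherwise ~/cellbrowserData, which is created if it does not exist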
If the file is not present, it will be downloaded from https://cells.ucsc.edu/downloads/cellbrowserData/ and copied onto the local disk under dataDir """ absPath = getStaticPath(relPath) if isfile(absPath): if verbose: logging.info("%s already exists" % absPath) else: logging.info("%s not found" % absPath) success = downloadStaticFile(relPath, absPath) if not success: return None return absPath def getGeneSymPath(geneType): " return rel path to a tsv file with geneId, sym " path = join("genes", geneType+".symbols.tsv.gz") return path def getGeneBedPath(db, geneType): " return rel path to a tsv file with geneId, sym " path = join("genes", db+"."+geneType+".bed.gz") return path def getGeneJsonPath(db, geneType): " return rel path to a json.gz file with chrom -> list of (start, end, strand, symbol) " path = join("genes", db+"."+geneType+".json") return path def openStaticFile(relPath, mode="r"): " download static file and return an open file handle to it " absPath = getStaticFile(relPath) fh = openFile(absPath, mode) return fh def staticFileNextRow(relPath): " yield rows from a static file, possibly downloading it first " fh = openStaticFile(relPath) for row in lineFileNextRow(fh): yield row def findPkgFile(relPath): " return filename of file that is part of the pip package or git folder " baseDir = dirname(__file__) # = directory of this script srcPath = join(baseDir, relPath) return srcPath def copyPkgFile(relPath, outDir=None, values=None): """ copy file from directory under the current package directory to outDir or current directory Don't overwrite if the file is already there. """ if outDir is None: outDir = os.getcwd() srcPath = findPkgFile(relPath) destPath = join(outDir, basename(relPath)) if isfile(destPath): logging.info("%s already exists, not overwriting" % destPath) else: if values is None: shutil.copyfile(srcPath, destPath) logging.info("Wrote %s" % destPath) # egg-support commented out for now #s = pkg_resources.resource_string(__name__, srcPath) #ofh = open(destPath, "wb") #ofh.write(s) #ofh.close() else: logging.debug("Using %s as template for %s, vars %s" % (srcPath, destPath, values)) data = open(srcPath).read() dataNew = data % values ofh = open(destPath, "w") ofh.write(dataNew) ofh.close() logging.info("Wrote %s" % destPath) def execfile(filepath, globals=None, locals=None): " version of execfile for both py2 and py3 " logging.debug("Executing %s" % filepath) if globals is None: globals = {} globals.update({ "__file__": filepath, "__name__": "__main__", }) with open(filepath, 'rb') as file: exec(compile(file.read(), filepath, 'exec'), globals, locals) def readLines(lines, fname, encoding): " recursively read lines from fname, understands lines like #include 'filename.conf' " for line in io.open(fname, encoding=encoding): line = line.rstrip("\r\n") if line.startswith("#include"): newFname = splitOnce(line, " ")[1] newFname = newFname.strip('"').strip("'") lines.extend(readLines(lines, newFname, encoding)) else: lines.append(line) return lines def readLinesAnyEncoding(lines, fname): " read lines from fname, auto-detect utf8 / latin1 encoding " try: return readLines(lines, fname, "utf8") except UnicodeDecodeError: logging.error("Could not open file %s with utf8 encoding. Falling back to latin1" % fname) return readLines(lines, fname, "latin1") def loadConfig(fname, ignoreName=False, reqTags=[], addTo=None, addName=False): """ parse python in fname and return variables as dictionary. add the directory of fname to the dict as 'inDir'. 
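# Illustrative sketch of a minimal cellbrowser.conf as parsed by loadConfig():
# the file is plain Python that gets exec'ed, and for a dataset the 'coords' and
# 'meta' statements are required (see reqTagsDataset above). The file names and
# values below are hypothetical placeholders.
#
#   name = "example-dataset"
#   shortLabel = "Example dataset"
#   exprMatrix = "exprMatrix.tsv.gz"
#   meta = "meta.tsv"
#   coords = [{"file": "umap_coords.tsv", "shortLabel": "UMAP"}]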
""" logging.debug("Loading settings from %s" % fname) conf = OrderedDict() if addTo: logging.debug("Adding existing settings") conf.update(addTo) g = {} g["fileBase"] = basename(fname).split('.')[0] g["dirName"] = basename(dirname(fname)) lines = readLinesAnyEncoding([], fname) try: exec("\n".join(lines), g, conf) except: logging.error("Error parsing file %s" % fname) raise for rt in reqTags: if not rt in conf: errAbort("The input configuration has to define the %s statement" % rt) if rt=="tags": if type(conf["tags"])!=type([]): errAbort("'tags' in input config file must be a list") conf["inDir"] = dirname(fname) if "name" in conf and ignoreName: logging.debug("%s: 'name' entry in cellbrowser.conf is ignored" % fname) if (not "name" in conf and addName) or ignoreName: name = basename(dirname(abspath(fname))) logging.debug("Deriving name from directory name: %s -> %s" % (abspath(fname), name)) assert(name!="") conf["name"] = name if "name" in conf and "/" in conf["name"]: errAbort("Config file %s contains a slash in the name. Slashes in names are no allowed" % fname) return conf def maybeLoadConfig(confFname): if isfile(confFname): conf = loadConfig(confFname) else: logging.debug("Could not find %s, not loading config file" % confFname) conf = OrderedDict() return conf cbConf = None def getConfig(tag, defValue=None): " get a global cellbrowser config value from ~/.cellbrowser (old value ~/.cellbrowser.conf still accepted, but that filename was too confusing for people)" global cbConf if cbConf is None: confPath = expanduser("~/.cellbrowser") if not isfile(confPath): confPath = expanduser("~/.cellbrowser.conf") cbConf = maybeLoadConfig(confPath) ret = cbConf.get(tag, defValue) return ret def main_parseArgs(): " arg parser for __main__, only used internally " parser = optparse.OptionParser("""usage: %prog serve outDir port serve outDir via http on port """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") (options, args) = parser.parse_args() if len(args)==0: parser.print_help() exit(1) setDebug(options.debug) return args, options # ---- GLOBAL ---- defOutDir = getConfig("htmlDir") if defOutDir is None: defOutDir = os.environ.get("CBOUT") if defOutDir is not None: defOutDir = expanduser(defOutDir) # ---- GLOBAL END ---- def cbBuild_parseArgs(showHelp=False): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] -i cellbrowser.conf -o outputDir - add a dataset to the single cell viewer directory If you have previously built into the same output directory with the same dataset and the expression matrix has not changed its filesize, this will be detected and the expression matrix will not be copied again. This means that an update of a few meta data attributes is quite quick. Gene symbol/annotation files are downloaded to ~/cellbrowserData when needed. Config defaults can be specified in ~/.cellbrowser. 
See documentation at https://cellbrowser.readthedocs.io/ """) parser.add_option("", "--init", dest="init", action="store_true", help="copy sample cellbrowser.conf and desc.conf to current directory") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-i", "--inConf", dest="inConf", action="append", help="a cellbrowser.conf file that specifies labels and all input files, default is ./cellbrowser.conf, can be specified multiple times") parser.add_option("-o", "--outDir", dest="outDir", action="store", help="output directory, default can be set through the env. variable CBOUT or ~/.cellbrowser, current value: %default", default=defOutDir) parser.add_option("-p", "--port", dest="port", action="store", help="if build is successful, start an http server on this port and serve the result via http://localhost:port", type="int") parser.add_option("-r", "--recursive", dest="recursive", action="store_true", help="run in all subdirectories of the current directory. Useful when rebuilding a full hierarchy. Cannot be used with -p.") parser.add_option("", "--depth", dest="depth", action="store", type="int", help="when using -r: only go this many directories deep") parser.add_option("", "--redo", dest="redo", action="store", default="meta", help="do not use cached old data. Can be: 'meta' or 'matrix' (matrix includes meta).") parser.add_option("", "--force", dest="force", action="store_true", help="ignore errors that usually stop the build and go ahead anyways.") (options, args) = parser.parse_args() if showHelp: parser.print_help() exit(1) setDebug(options.debug) return args, options def cbUpgrade_parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] outDir - update the list of datasets in the output directory, optionally updating the javascript code") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-o", "--outDir", dest="outDir", action="store", help="output directory, default can be set through the env. variable CBOUT, current value: %default", default=defOutDir) parser.add_option("-p", "--port", dest="port", type="int", action="store", help="after upgrade, start HTTP server bound to port and serve ") parser.add_option("", "--code", dest="addCode", action="store_true", help="also update the javascript code") parser.add_option("", "--dev", dest="devMode", action="store_true", help="only for developers: do not add version to js/css links") (options, args) = parser.parse_args() if options.outDir==None: parser.print_help() exit(1) setDebug(options.debug) return args, options def cbScanpy_parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] -e matrixFile -o outDir -n datasetName - run scanpy and output .tsv files """) parser.add_option("-e", "--exprMatrix", dest="exprMatrix", action="store", help="gene-cell expression matrix file, possible formats: .csv, .h5, .mtx, .txt, .tab, .loom, .h5ad. Existing meta data from .loom and .h5ad will be kept and exported.") parser.add_option("-m", "--meta", dest="meta", action="store", help="name of cell meta data table. A table like tsv or csv format, first row has cellId and the cellId must match a sample name in the expression matrix. Optional but required if you use --inCluster. 
'inMeta' in scanpy.conf") parser.add_option("-o", "--outDir", dest="outDir", action="store", help="output directory") parser.add_option("-n", "--name", dest="name", action="store", help="internal name of dataset in cell browser. No spaces or special characters. default: name of output directory (-o)") parser.add_option("", "--init", dest="init", action="store_true", help="copy sample scanpy.conf to current directory") parser.add_option("-s", "--samplesOnRows", dest="samplesOnRows", action="store_true", help="when reading the expression matrix from a text file, assume that samples are on lines (default behavior is one-gene-per-line, one-sample-per-column). Also in scanpy.conf.") parser.add_option("-c", "--confFname", dest="confFname", action="store", default="scanpy.conf", help="config file from which settings are read, default is %default") parser.add_option("", "--inCluster", dest="inCluster", action="store", help="Do not run louvain-clustering, but use this meta field from ad.obs when calculating marker genes. The default is to use the louvain clustering results. Can be specified also in scanpy.conf.") parser.add_option("", "--skipMatrix", dest="skipMatrix", action="store_true", help="do not write the scanpy matrix to the destination directory as a file exprMatrix.tsv.gz") parser.add_option("", "--skipMarkers", dest="skipMarkers", action="store_true", help="do not try to calculate cluster-specific marker genes. Only useful for the rare datasets where a bug in scanpy crashes the marker gene calculation.") parser.add_option("-f", "--matrixFormat", dest="matrixFormat", action="store", help="Output matrix file format. 'mtx' or 'tsv'. default: tsv",) parser.add_option("", "--copyMatrix", dest="copyMatrix", action="store_true", help="Instead of reading the input matrix into scanpy and then writing it back out, just copy the input matrix. Only works if the input matrix is gzipped and in the right format and a tsv or csv file, not mtx or h5-based files.") parser.add_option("-g", "--genome", dest="genome", action="store", help="when reading 10X HDF5 files, the genome to read. Default is %default. Use h5ls to show possible genomes", default="GRCh38") parser.add_option("", "--test", dest="test", action="store_true", help="run doctests") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="open an iPython shell when an exception occurs. also output debug messages") (options, args) = parser.parse_args() if options.test: import doctest doctest.testmod() sys.exit(0) if (options.exprMatrix is None or options.outDir is None) and not options.init: print("Please specify at least the expression matrix (-e) and the output directory (-o)") parser.print_help() exit(1) setDebug(options.debug) return args, options kwSet = set(keyword.kwlist) def sanitizeHeaders(headers): " make headers of tsv/csv file compatible with namedtuple names " headers[0] = headers[0].lstrip("#") #if utfHacks: #line1 = line1.decode("latin1") # skip special chars in meta data and keep only ASCII #line1 = unicodedata.normalize('NFKD', line1).encode('ascii','ignore') if len(headers)>=255: errAbort("Cannot read more than 255 columns. Are you sure that this file is in the correct format?" " It may have the wrong line endings and may require treatment with dos2unix or mac2unix. " " Or it may be the wrong file type for this input, e.g. 
an expression matrix instead of a " " coordinate or meta data file.") # python does unfortunately not accept reserved names as named tuple names # We append a useless string to avoid errors if len(kwSet.intersection(headers))!=0: newHeaders = [] for h in headers: if h in kwSet: h = h+"_Value" newHeaders.append(h) headers = newHeaders #headers = [x if x!="" else "noName" for x in headers] if headers[0]=="": # R does not name the first column by default headers[0]="rowName" if "" in headers: logging.error("Found empty cells in a file header line.") logging.error("This often happens with Excel files. Make sure that the conversion from Excel was done correctly. Use cut -f-lastColumn to remove empty trailing columns.") errAbort("abort") # Python does not accept headers that start with a digit or underscore filtHeads = [] for h in headers: if h[0].isdigit() or h[0] == '_': filtHeads.append("x"+h) else: filtHeads.append(h) headers = filtHeads origHeaders = headers headers = [nonAlphaToUnderscores(h) for h in headers] return headers def csvReader(fh): " yield rows from input file object using Python's csv reader " import csv reader = csv.reader(fh) try: for row in reader: yield row except csv.Error as e: sys.exit('file %s, line %d: %s' % (fh.name, reader.line_num, e)) def tsvReader(fh): " yield rows from input file object " hasQuotes = None for line in fh: row = line.rstrip("\r\n").split("\t") if hasQuotes is None and row[0].startswith('"'): hasQuotes = row[0].startswith('"') if hasQuotes: row = [f.strip('"') for f in row] # this is technically not necessary, but was requested in #130 yield row def textFileRows(inFile): " iterate over lines from tsv or csv file and yield lists " if isinstance(inFile, str): # input file is a string = file name fh = openFile(inFile, mode="rt") sep = sepForFile(inFile) else: fh = inFile sep = "\t" if sep==",": rowReader = csvReader(fh) else: rowReader = tsvReader(fh) for row in rowReader: yield row def lineFileNextRow(inFile, headerIsRow=False, noHeaders=False) : """ parses tab-sep file with headers in first line yields collection.namedtuples strips "#"-prefix from header line Can parse csv files with quotes. headerIsRow: if True, yield the header itself just like any other row noHeaders: file has no headers, construct pseudo-headers "col0", "col1", etc """ ifh = textFileRows(inFile) if noHeaders: # must read first line now to get number of fields row1 = nextEl(ifh) headers = ["col"+str(i) for i in range(0, len(row1))] rawHeaders = headers Record = namedtuple('tsvCsvRec', headers) savedLines = [Record(*row1)] else: savedLines = [] rawHeaders = nextEl(ifh) headers = sanitizeHeaders(rawHeaders) Record = namedtuple('tsvCsvRec', headers) if headerIsRow: yield rawHeaders # the calling script wants the *real* headers, e.g. 
"my.name", not "my_name" instead lineCount = 0 for fields in itertools.chain(savedLines, ifh): lineCount += 1 if fields[0].startswith("#"): continue try: rec = Record(*fields) except Exception as msg: logging.error("Exception occurred while parsing line, %s" % msg) logging.error("Filename %s" % inFile) logging.error("Fields are: %s" % fields) logging.error("Does number of fields match headers?") logging.error("Headers are: %s" % headers) raise Exception("header count: %d != field count: %d wrong field count in line %d" % (len(headers), len(fields), lineCount)) yield rec def parseMetaDesc(inConf): " parse two-row tsv file with meta field descriptions " if "metaDesc" not in inConf: return None metaDesc = {} for row in textFileRows(inConf["metaDesc"]): if len(row)<2: logging.warning("row '%s' in metaDesc file does not have two fields" % row) continue metaDesc[row[0]] = row[1] return metaDesc def parseOneColumn(fname, colName): " return a single column from a tsv as a list, without the header " vals = [] colIdx = None logging.debug("Parsing column %s from file %s" % (colName, fname)) colIdx = None for row in textFileRows(fname): if colIdx is None: try: #row = [x.split("|")[0] for x in row] # we do not remember why this was added so allowing | as a valid character in values again colIdx = row.index(colName) continue except ValueError: raise Exception("There is no column %s in the file %s. This may have to do with special characters in the column name. Try not to use special characters in column names, fix meta.tsv and cellbrowser.conf and try again. It can also happen if you reference a field in cellbrowser.conf that has been excluded from the meta data because it only has a single value. Possible row names: %s" % (repr(colName), fname, row)) vals.append(row[colIdx]) return vals def parseIntoColumns(fname): " parse tab sep file vertically, return as a list of (headerName, list of values) " ifh = open(fname) sep = "\t" headers = ifh.readline().rstrip("\r\n").split(sep) if headers[0]=="": headers[0]="cell_id" # some tolerance, for R headers = [h.split("|")[0] for h in headers] for i, h in enumerate(headers): if h=="": errAbort("Header '%s' of column %d is empty. Please fix the meta data file and give every column a name" % (h, i)) colsToGet = range(len(headers)) columns = [] for h in headers: columns.append([]) for line in ifh: row = line.rstrip("\r\n").split(sep) for colIdx in colsToGet: columns[colIdx].append(row[colIdx]) return zip(headers, columns) def openFile(fname, mode="rt", encoding="utf8"): if fname.endswith(".gz"): mode = mode.replace("U", "") # gzip reader does not support Universal newline mode yet, python bug if isPy3: fh = gzip.open(fname, mode, encoding=encoding) else: fh = gzip.open(fname, mode) else: if isPy3: fh = io.open(fname, mode, encoding=encoding) else: fh = open(fname, mode) return fh def parseDict(fname): """ parse text file in format keyvalue and return as dict key->val """ d = {} fh = openFile(fname) sep = "\t" if fname.endswith(".csv"): sep = "," lno = 0 for line in fh: lno+=1 parts = line.rstrip("\r\n").split(sep) if len(parts)!=2: errAbort("Error in line %d of file %s: Should contain two values, it seems to contain only a single value. " \ "This can also happen if in a .tsv file a space was used instead of a tab." % (lno, fname)) key, val = parts d[key] = val return d def readGeneToSym(fname): " given a file with geneId,symbol return a dict geneId -> symbol. Strips anything after . 
in the geneId " if fname.lower()=="none": return None logging.info("Reading gene,symbol mapping from %s" % fname) # Jim's files and CellRanger files have no headers, they are just key-value line1 = openFile(fname).readline().rstrip("\r\n") fieldCount = len(line1.split('\t')) if "geneId" not in line1 and "symbol" not in line1: logging.debug("geneID,symbol file does not start with 'geneId', parsing as key,value") d = parseDict(fname) # my new files are smaller and have headers elif line1=="#geneId\tsymbol" or fieldCount==2: d = {} for row in lineFileNextRow(fname): if row.symbol=="": continue geneId = row.geneId if geneId.startswith("EN") and "." in geneId: geneId = geneId.split(".")[0] d[geneId]=row.symbol else: assert(False) # symbols file does not have a header like #geneIdsymbol logging.debug("Found symbols for %d genes" % len(d)) return d def getDecilesList_np(values): deciles = np.percentile( values, [0,10,20,30,40,50,60,70,80,90,100] ) return deciles def bytesAndFmt(x): """ how many bytes do we need to store x values and what is the sprintf format string for it? Return tuple of Javascript type and python struct type. See : Javascript typed array names, https://developer.mozilla.org/en-US/docs/Web/JavaScript/Typed_arrays https://docs.python.org/2/library/struct.html """ if x > 65535: return "Uint32", " 255: return "Uint16", ">> l = [0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10] # >>> getDecilesWithZeros(l) # ([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) # """ # nonZeros = [x for x in numVals if x!=0.0] # # zeroCount = len(numVals) - len(nonZeros) # deciles = getDecilesList_np(nonZeros) # # decArr = np.searchsorted(deciles, numVals) # decCounts(deciles, nonZeros) # # decCounts.insert(0, zeroCount) # return deciles, decCounts, newVals def findBins(numVals, breakVals, hasNan): """ find the right bin index defined by breakVals for every value in numVals. Special handling for the last value. The comparison uses "<=". The first break is assumed to be the minimum of numVals and is therefore ignored. Also returns an array with the count for every bin. hasNan triggers a special mode where the first bin is reserved for NaNs (encoded as -inf) >>> findBins([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1, 2,3,5,6]) ([0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3], [6, 2, 4, 2]) """ bArr = [] binCounts = [0]*(len(breakVals)-1) # NaNs mean that the non-NaN bins are all +1 if hasNan: breaks = breakVals[2:] else: breaks = breakVals[1:] if hasNan: for x in numVals: # we use -inf for the NaN value everywhere, sorting is undefined in lists that contain NaN if math.isinf(x): binIdx = 0 else: binIdx = bisect.bisect_left(breaks, x)+1 bArr.append(binIdx) binCounts[binIdx]+=1 else: for x in numVals: binIdx = bisect.bisect_left(breaks, x) # comparison operator is <= bArr.append(binIdx) binCounts[binIdx]+=1 return bArr, binCounts def countBinsBetweenBreaks(numVals, breakVals): """ count how many numVals fall into the bins defined by breakVals. Special handling for the last value. Comparison uses "<=". The first break is assumed to be the minimum of numVals. 
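# Illustrative sketch of the bin assignment used in findBins() above: the first
# break is the minimum and is skipped, bisect_left() then yields the 0-based bin
# index, and a parallel list accumulates the per-bin counts. Toy data below is
# hypothetical.
_exampleBreaks = [1, 2, 3, 5, 6]                     # first element = minimum, ignored
_exampleCounts = [0] * (len(_exampleBreaks) - 1)
for _x in [1, 1, 2, 4, 6]:
    _binIdx = bisect.bisect_left(_exampleBreaks[1:], _x)
    _exampleCounts[_binIdx] += 1
# _exampleCounts is now [3, 0, 1, 1]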
Also returns an array with the bin for every element in numVals >>> countBinsBetweenBreaks([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1,2,3,5,6]) ([6, 2, 4, 2], [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3]) """ binCounts = [] binCount = 0 i = 1 dArr = [] for x in numVals: if x <= breakVals[i]: binCount+=1 else: binCounts.append(binCount) binCount = 1 i += 1 dArr.append(i-1) binCounts.append(binCount) assert(len(dArr)==len(numVals)) assert(len(binCounts)==len(breakVals)-1) return binCounts, dArr def discretizeArray(numVals, fieldMeta): """ discretize numeric values based on quantiles. """ maxBinCount = 10 counts = Counter(numVals).most_common() counts.sort() # sort by value, not count if len(counts) < maxBinCount: # if we have just a few values, do not do any binning binCounts = [y for x,y in counts] values = [x for x,y in counts] valToBin = {} for i, x in enumerate(values): valToBin[x] = i dArr = [valToBin[x] for x in numVals] fieldMeta["binMethod"] = "raw" fieldMeta["values"] = values fieldMeta["binCounts"] = binCounts return dArr, fieldMeta # ten breaks breakPercs = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] countLen = len(counts) breakIndices = [int(round(bp*countLen)) for bp in breakPercs] # as with all histograms, the last break is always a special case (0-based array) breakIndices.append(countLen-1) breakVals = [counts[idx][0] for idx in breakIndices] # NaNs are encoded as -inf so they always are the first break # The first non-NaN value is at index 1 # If we have NaNs, we need one more bin, with the first non-Nan value hasNan = False if math.isinf(breakVals[0]): hasNan = True breakVals.insert(1, counts[1][0]) logging.info("Field has NaN/Unknown values") logging.debug("Breaks are: %s" % breakVals) dArr, binCounts = findBins(numVals, breakVals, hasNan) logging.info("Number of values per decile-bin: %s" % binCounts) # we should have 11 breaks/10 bins, or 12 breaks/11bins if we have NaN elements assert((not hasNan and len(breakVals)==11) or (hasNan and len(breakVals)==12)) assert((not hasNan and len(binCounts)==10) or (hasNan and len(binCounts)==11)) assert((len(binCounts)+1 == len(breakVals))) fieldMeta["binMethod"] = "quantiles" fieldMeta["binCounts"] = binCounts if math.isinf(breakVals[0]): # -infinity is not valid in JSON breakVals[0] = "Unknown" fieldMeta["breaks"] = breakVals return dArr, fieldMeta def discretizeNumField(numVals, fieldMeta, numType): " given a list of numbers, add attributes to fieldMeta that describe the binning scheme " digArr, fieldMeta = discretizeArray(numVals, fieldMeta) #deciles, binCounts, newVals = getDecilesWithZeros(numVals) fieldMeta["arrType"] = "uint8" fieldMeta["_fmt"] = "val and a list of keys, return (key, val) in the order of the list """ logging.debug("sorting by manually specified order") ret = [] doneKeys = set() for key in keyOrder: ret.append((key, valDict[key])) # if this fails, check your order file doneKeys.add(key) missKeys = set(valDict) - doneKeys if len(missKeys)!=0: errAbort("Keys %s are in the meta file but not in the enum order file." % missKeys) return ret def guessFieldMeta(valList, fieldMeta, colors, forceType, enumOrder): """ given a list of strings, determine if they're all int, float or strings. 
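# Illustrative sketch of the per-value type probe described above: each value is
# tried as int, then as float; anything that fails both stays a string, and a
# column containing string values is treated as an enum (unless a type is forced
# through enumFields in cellbrowser.conf). The helper name is hypothetical.
def _exampleProbe(val):
    try:
        return int(val), "int"
    except ValueError:
        try:
            return float(val), "float"
        except ValueError:
            return val, "string"
# _exampleProbe("3") -> (3, 'int'); _exampleProbe("T-cell") -> ('T-cell', 'string')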
Return fieldMeta, as dict, and a new valList, with the correct python type - 'type' can be: 'int', 'float', 'enum' or 'uniqueString' - if int or float: replace 0 or NaN with the FLOATNAN global (-inf) - if uniqueString: 'maxLen' is the length of the longest string - if enum: 'values' is a list of all possible values - if colors is not None: 'colors' is a dict of fieldName -> value -> color. The fieldname "__default__" is always tried to look up colors. - arrType is the Javascript TypedArray type of the array: float32, uint8, uint16, uint32 """ global errorFields unknownCount = 0 intCount = 0 floatCount = 0 valCounts = defaultdict(int) newVals = [] for val in valList: fieldType = "string" newVal = val if likeEmptyString(val): unknownCount += 1 newVal = FLOATNAN else: try: newVal = int(val) intCount += 1 floatCount += 1 except: try: newVal = float(val) floatCount += 1 except: pass newVals.append(newVal) valCounts[val] += 1 valToInt = None assert(len(newVals)==len(valList)) if len(valCounts)==1: logging.warn("Field %s contains only a single value" % fieldMeta["name"]) logging.debug("Total number of different values: %d. - by type: int %d, float %d, unknown %d. Forced type: %s" % (len(valList), intCount, floatCount, unknownCount, forceType)) if intCount+unknownCount==len(valList) and not forceType: # JS supports only 32bit signed ints so we store integers as floats logging.debug("Field type is integer") newVals = [float(x) for x in newVals] fieldMeta["arrType"] = "float32" fieldMeta["_fmt"] = " 0: fieldMeta["colors"] = colArr if len(notFound)!=0: msg = "No colors found for field %s, values %s." % (repr(fieldMeta), list(notFound)) if isDefColors: if foundColors > 0: # otherwise would print warning on every field logging.warn(msg+" These will fall back to palette default colors") else: errAbort(msg) fieldMeta["valCounts"] = valCounts fieldMeta["arrType"], fieldMeta["_fmt"] = bytesAndFmt(len(valArr)) if fieldMeta["arrType"].endswith("32"): logging.error("Meta field %s has more than 32k different values and makes little sense to keep. 
" "Please or remove the field from the meta data table or contact us, cells@ucsc.edu."% fieldMeta["label"]) errorFields.append(fieldMeta["label"]) valToInt = dict([(y[0],x) for (x,y) in enumerate(valCounts)]) # dict with value -> index in valCounts newVals = [valToInt[x] for x in valList] # fieldMeta["diffValCount"] = len(valCounts) return fieldMeta, newVals def writeNum(col, packFmt, ofh): " write a list of numbers to a binary file " def moveOrGzip(inFname, outFname): " if outFname has .gz, runGzip, otherwise just move file over " if outFname.endswith(".gz"): runGzip(inFname, outFname) else: renameFile(inFname, outFname) def runGzip(fname, finalFname=None): " compress fname and move to finalFname when done, to make it atomic " if which("gzip") is None: logging.debug("gzip not found, falling back to Python's gzip") if finalFname is None: finalFname = fname+".gz" ifh = open(fname, "rb") ofh = gzip.open(finalFname, "wb") logging.debug("Compressing %s to %s with built-in gzip" % (fname, finalFname)) ofh.write(ifh.read()) ifh.close() ofh.close() else: if which("pigz") is not None: logging.debug("Compressing %s with pigz" % fname) cmd = "pigz -f %s" % fname else: logging.debug("Compressing %s with gzip" % fname) cmd = "gzip -f %s" % fname runCommand(cmd) gzipFname = fname+".gz" if finalFname==None: return gzipFname renameFile(gzipFname, finalFname) if isfile(fname): os.remove(fname) return finalFname def addLongLabels(acronyms, fieldMeta): """ add a 'longLabel' meta info entry, a list of longer strings for enum fields, if any shortLabels have a longLabel in the acronyms dict of shortLabel = longLabel """ if acronyms is None: return fieldMeta foundCount = 0 for shortLabel, count in fieldMeta["valCounts"]: if shortLabel in acronyms: foundCount+=1 if foundCount > 0: longLabels = [] for shortLabel, count in fieldMeta["valCounts"]: longLabel = acronyms.get(shortLabel) if longLabel is None: logging.warning("Field %s: value %s has no long label through the acronyms" % (fieldMeta["label"], shortLabel)) longLabel = shortLabel #longLabel = longLabel.replace("_", " ") longLabels.append(longLabel) fieldMeta["longLabels"] = longLabels return fieldMeta def addDesc(fieldDescs, fieldMeta): " add a 'desc' attribute to the meta info " if fieldDescs is None or fieldMeta["label"] not in fieldDescs: return fieldMeta desc = fieldDescs[fieldMeta["label"]] if desc!="": fieldMeta["desc"] = desc return fieldMeta def metaToBin(inDir, inConf, outConf, fname, outDir): """ convert meta table to binary files. outputs fields.json and one binary file per field. 
adds names of metadata fields to outConf and returns outConf """ colorFname = inConf.get("colors") enumFields = inConf.get("enumFields") logging.info("Converting to numbers and compressing meta data fields") makeDir(outDir) colData = list(parseIntoColumns(fname)) colors = parseColors(inDir, inConf, outConf, colData) acronyms = readAcronyms(inConf, outConf) # XX no md5 yet for acronyms metaDescs = parseMetaDesc(inConf) enumOrder = inConf.get("enumOrder") # the user inputs the enum fields in cellbrowser.conf as their real names, but internally, unfortunately # we have to strip special chars so fix the user's field names to our format sanEnumFields = [] if enumFields is not None: if "" in enumFields: errAbort("The enumFields statement in cellbrowser.conf must not contain an empty string") sanEnumFields = [sanitizeName(n) for n in enumFields] fieldInfo = [] validFieldNames = set() for colIdx, (fieldName, col) in enumerate(colData): enumOrderList = None if enumOrder and fieldName in enumOrder: orderFname = join(inConf["inDir"], enumOrder[fieldName]) enumOrderList = open(orderFname).read().splitlines() logging.debug("Meta data field index %d: '%s'" % (colIdx, fieldName)) validFieldNames.add(fieldName) cleanFieldName = sanitizeName(fieldName.split("|")[0]) forceType = None if (cleanFieldName in sanEnumFields): forceType = "enum" # very dumb heuristic to recognize fields that should not be treated as numbers but as enums # res.0.6 is the default field name for Seurat clustering. Field header sanitizing changes it to # res_0_6 which is not optimal, but namedtuple doesn't allow dots in names if "eiden" in cleanFieldName or \ "luster" in cleanFieldName or \ "ouvain" in cleanFieldName or (fieldName.startswith("res_") and "_" in fieldName): forceType="enum" if colIdx==0: forceType = "unique" fieldMeta = OrderedDict() fieldMeta["name"] = cleanFieldName nameParts = fieldName.split("|") fieldMeta["label"] = nameParts[0] if len(nameParts)>1: fieldMeta["desc"] = fieldName fieldMeta, binVals = guessFieldMeta(col, fieldMeta, colors, forceType, enumOrderList) if enumOrder: fieldMeta["sortBy"] = "none" else: if inConf.get("sortBy") and fieldName in inConf["sortBy"]: defSortVal = inConf["sortBy"][fieldName] if not defSortVal in ["name", "freq"]: errAbort("sortBy must be a dictionary with fieldName -> value and the value must be 'name' or 'freq'.") fieldMeta["defaultSort"] = defSortVal fieldType = fieldMeta["type"] if fieldType=="enum": fieldMeta = addLongLabels(acronyms, fieldMeta) fieldMeta = addDesc(metaDescs, fieldMeta) if "metaOpt" in inConf and fieldName in inConf["metaOpt"]: fieldMeta["opt"] = inConf["metaOpt"][fieldName] packFmt = fieldMeta["_fmt"] binName = join(outDir, cleanFieldName+".bin") if fieldMeta["type"]=="uniqueString": # unique strings are simply written as-is logging.debug("writing as normal strings to %s" % binName) textFh = openFile(binName, "w") for x in col: textFh.write("%s\n" % x) textFh.close() else: # default case: write data as binary file logging.debug("writing as binary data to %s" % binName) binFh = open(binName, "wb") for x in binVals: binFh.write(struct.pack(packFmt, x)) binFh.close() runGzip(binName) zippedName = binName+".gz" fieldMeta["md5"] = md5WithPython(zippedName)[:MD5LEN] del fieldMeta["_fmt"] fieldInfo.append(fieldMeta) if "type" in fieldMeta: logging.info(("Field %(name)s: type %(type)s, %(diffValCount)d different values" % fieldMeta)) else: logging.info(("Field %(name)s: type %(type)s, %(diffValCount)d different values, max size %(maxSize)d " % fieldMeta)) global 
errorFields if len(errorFields)>0: logging.error("Stopping now, since these fields were found with errors: %s. Run cbBuild with --force " "to ignore these errors and build anyways OR check that number fields are not recognized as enum fields. " % ",".join(errorFields)) if doForce: errorFields = [] else: errAbort("build stopped") return fieldInfo, validFieldNames def iterLineOffsets(ifh): """ parse a text file and yield tuples of (line, startOffset, endOffset). endOffset does not include the newline, but the newline is not stripped from line. """ line = True start = 0 while line!='': line = ifh.readline() end = ifh.tell()-1 if line!="": yield line, start, end start = ifh.tell() def findMtxFiles(fname): """ given the name of a .mtx.gz or directory name, find the .mtx.gz, genes/features and barcode files return three filenames in the order (mtx, genes, barcode) """ logging.debug("Finding mtx/features/barcode filenames for mtx file %s" % fname) if isdir(fname): matDir = fname mtxFname = join(matDir, "matrix.mtx.gz") else: matDir = dirname(fname) mtxFname = fname if not isfile(mtxFname): errAbort("Could not find %s" % mtxFname) prefix = "" if "_" in basename(mtxFname): prefix = basename(mtxFname).split("_")[0]+"_" logging.debug("Basename-prefix of mtx is: %s") genesFname = join(matDir, prefix+"genes.tsv.gz") if not isfile(genesFname): genesFname = join(matDir, prefix+"genes.tsv") if not isfile(genesFname): # zealous cellranger 3 engineers renamed the genes file. Argh. genesFname = join(matDir, prefix+"features.tsv.gz") if not isfile(genesFname): genesFname = join(matDir, prefix+"features.tsv") barcodeFname = join(matDir, prefix+"barcodes.tsv.gz") if not isfile(barcodeFname): barcodeFname = join(matDir, prefix+"barcodes.tsv") if not isfile(genesFname): errAbort("Found file %s, so expected genes file %s (or with .gz) to exist but could not find it. " % (mtxFname, genesFname)) if not isfile(barcodeFname): errAbort("Found file %s, so expected genes file %s (or with .gz) to exist but could not find it. " % (mtxFname, barcodeFname)) logging.debug("mtx filename: %s, %s and %s" % (mtxFname, genesFname, barcodeFname)) return mtxFname, genesFname, barcodeFname class MatrixMtxReader: " open a .mtx file and yield rows via iterRows. gz and csv OK." def __init__(self, geneToSym=None): " can automatically translate to symbols, if dict geneId -> sym is provided " logging.debug(".mtx.gz reader initialized") self.geneToSym = geneToSym def open(self, fname, matType=None): import scipy.io import numpy as np logging.info("Loading %s" % fname) mtxFname, genesFname, barcodeFname = findMtxFiles(fname) self.mat, self.genes, self.barcodes = open10xMtxForRows(mtxFname, genesFname, barcodeFname) def close(self): pass def getMatType(self): " return 'int' or 'float' " dt = str(self.mat.dtype) if dt.startswith('int'): return "int" else: return "float" def getSampleNames(self): return self.barcodes def iterRows(self): " yield (geneId, symbol, array) tuples from gene expression file. " mat = self.mat genes = self.genes skipIds = 0 geneToSym = self.geneToSym for i in range(0, len(self.genes)): geneId = genes[i] geneId, geneSym, skipIds = resolveGene(geneId, geneToSym, skipIds) logging.debug("geneId %s, geneSym %s", geneId, geneSym) if i%1000==0: logging.info("%d genes written..." % i) arr = mat.getrow(i).toarray() if arr.ndim==2: # scipy sparse arrays have changed their entire data model and now all operations # return 2D matrices. So need to unpack it to get the array. Grrr. 
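# Illustrative note: on current scipy, mat.getrow(i).toarray() returns a 2-D
# ndarray of shape (1, nCells), so the single expression row is unpacked with
# arr[0] below.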
arr = arr[0] yield (geneId, geneSym, arr) def resolveGene(gene, geneToSym, skipIds): if "|" in gene: gene, symbol = gene.split("|") else: if geneToSym is None: symbol = gene else: if gene.startswith("EN"): gene = gene.split(".")[0] symbol = geneToSym.get(gene) logging.debug("%s -> %s" % (gene, symbol)) if symbol is None: skipIds += 1 logging.warn("could not find symbol for ID %s, looks like it is not a valid gene ID, check geneIdType setting in cellbrowser.conf or gene symbol mapping tables" % (gene)) symbol = gene if symbol.isdigit(): logging.warn("in gene matrix: gene identifier %s is a number. If this is indeed a gene identifier, you can " \ "ignore this warning. Otherwise, your matrix may have no gene ID in the first column and you will have to fix the matrix. " \ "Another possibility is that your geneIds are entrez gene IDs, but this is very rare." % (symbol)) return gene, symbol, skipIds class MatrixTsvReader: " open a .tsv or .csv file and yield rows via iterRows. gz and csv OK." def __init__(self, geneToSym=None): " can automatically translate to symbols, if dict geneId -> sym is provided " self.geneToSym = geneToSym def open(self, fname, matType=None, usePyGzip=False): " open file and guess field sep and format of numbers (float or int) " if which("gunzip")==None: logging.warn("Gunzip not in PATH, falling back to Python's built-in") usePyGzip = True # Note: The encoding for popen below is utf8. Is this a good choice? # Does performance change if we don't do unicode strings but # byte strings instead? Does unicode eat up the performance gain # of gunzip -c ? logging.debug("Opening %s" % fname) self.fname = fname if fname.endswith(".gz"): # python's gzip is slower, but does not return an error if we read just # a single line if usePyGzip: self.ifh = openFile(fname) else: cmd = ['gunzip', '-c', fname] proc, stdout = popen(cmd, doWait=False) self.ifh = stdout # faster implementation and in addition uses 2 CPUs else: #self.ifh = io.open(fname, "r", encoding="utf8") # utf8 performance? necessary for python3? self.ifh = io.open(fname, "rt") # utf8 performance? necessary for python3? self.sep = sepForFile(fname) logging.debug("Field separator is %s" % repr(self.sep)) headLine = self.ifh.readline() headLine = headLine.rstrip("\r\n") self.headers = headLine.split(self.sep) self.sampleNames = self.headers[1:] self.sampleNames = [x.strip('"') for x in self.sampleNames] assert(len(self.sampleNames)!=0) logging.debug("Read %d sampleNames, e.g. 
%s" % (len(self.sampleNames), self.sampleNames[0])) self.oldRows = [] if matType is None: self.matType = self._autoDetectMatType(10) logging.info("Auto-detect: Numbers in matrix are of type '%s'", self.matType) else: logging.debug("Pre-determined: Numbers in matrix are '%s'" % matType) self.matType = matType def close(self): self.ifh.close() def getMatType(self): return self.matType def getSampleNames(self): return self.sampleNames def _autoDetectMatType(self, n): " check if matrix has 'int' or 'float' data type by looking at the first n genes" # auto-detect the type of the matrix: int vs float logging.info("Auto-detecting number type of %s" % self.fname) geneCount = 0 self.matType = "float" # iterRows needs this attribute matType = "int" for geneId, sym, a in self.iterRows(): self.oldRows.append( (geneId, sym, a) ) geneCount+=1 if numpyLoaded: a_int = a.astype(int) hasOnlyInts = np.array_equal(a, a_int) if not hasOnlyInts: matType = "float" break else: for x in a: frac, whole = math.modf(x) if frac != 0.0: matType = "float" break if matType=="float": break if geneCount==n: break if geneCount==0: errAbort("empty expression matrix?") logging.debug("Matrix type is: %s" % matType) return matType def iterRows(self): " yield (geneId, symbol, array) tuples from gene expression file. " if self.matType == "float": npType = "float32" else: npType = "int32" # during auto-detection, we've already read a few lines logging.debug("spooling back %d saved rows" % len(self.oldRows)) for (geneId, sym, arr) in self.oldRows: # for the integer case though we have to fix up the type now if self.matType=="int" or self.matType=="forceInt": logging.debug("spoolback: fix up integer") if numpyLoaded: arr = arr.astype(int) else: arr = [int(x) for x in arr] logging.debug("Yielding gene %s, sym %s, %d fields" % (geneId, sym, len(arr))) yield (geneId, sym, arr) skipIds = 0 doneGenes = set() lineNo = 0 sep = self.sep sampleCount = len(self.sampleNames) geneToSym = self.geneToSym for line in self.ifh: self.lineLen = len(line) gene, rest = splitOnce(line.rstrip("\r\n"), sep) gene = gene.strip('"') if numpyLoaded: arr = np.fromstring(rest, dtype=npType, sep=sep, count=sampleCount) else: if self.matType=="int": arr = [int(x) for x in rest.split(sep)] elif self.matType=="forceInt": arr = [int(float(x)) for x in rest.split(sep)] else: arr = [] for x in rest.split(sep): if x=="0" or x=="0.0": arr.append(0.0) else: arr.append(float(x)) geneCount = len(arr) if geneCount+1 != len(self.headers): errAbort("The first header line of the matrix file has %d columns. "\ "However, line %d has %d columns. All lines must have the same number of columns." 
% (len(self.headers), lineNo+1, geneCount+1)) gene, symbol, skipIds = resolveGene(gene, geneToSym, skipIds) if symbol in doneGenes: logging.warn("line %d: Gene %s/%s is duplicated in matrix, using only first occurrence for symbol, kept second occurrence with original geneId" % (lineNo, gene, symbol)) symbol = gene doneGenes.add(gene) lineNo += 1 logging.debug("Yielding gene %s, sym %s, %d fields" % (gene, symbol, len(arr))) yield gene, symbol, arr if skipIds!=0: logging.warn("Kept %d genes as original IDs, due to duplication or unknown ID" % skipIds) def iterRowsWithOffsets(self): " like iterRows, but also return offset and line length " offset = self.ifh.tell() for gene, sym, row in self.iterRows(): yield gene, sym, row, offset, self.lineLen offset = self.ifh.tell() def getDecilesList(values): """ given a list of values, return the 10 values that define the 10 ranges for the deciles """ if len(values)==0: return None valCount = len(values) binSize = float(valCount-1) / 10.0; # width of each bin, in number of elements, fractions allowed values = list(sorted(values)) # get deciles from the list of sorted values deciles = [] pos = 0 for i in range(10): # 10 bins means that we need 10 limits, the last limit is at 90% pos = int (binSize * i) if pos > valCount: # this should not happen, but maybe it can, due to floating point issues? logging.warn("decile exceeds 10, binSize %d, i %d, len(values) %d" % (binSize, i, len(values))) pos = len(values) deciles.append ( values[pos] ) return deciles def findBin(ranges, val): """ given an array of values, find the index i where ranges[i] < val <= ranges[i+1] ranges have to be sorted. This is a dumb brute force implementation - maybe binary search is faster, if ever revisit this again Someone said up to 10 binary search is not faster. """ if val==0: # speedup return 0 for i in range(0, len(ranges)): if (val < ranges[i]): return i # if doesn't fit in anywhere, return beyond last possible index return i+1 def discretizeArr_uniform(arr, fieldMeta): """ given an array of numbers, get min/max, create 10 bins between min and max then translate the array to bins and return the list of bins """ arrMin = min(arr) arrMax = max(arr) stepSize = (arrMax-arrMin)/10.0 dArr = [0]*len(arr) binCounts = [0]*10 for i, x in enumerate(arr): binIdx = int(round((x - arrMin)/stepSize)) if x == arrMax: binIdx = 9 assert(binIdx <= 9) dArr[i] = binIdx binCounts[binIdx]+=1 fieldMeta["binMethod"] = "uniform" fieldMeta["minVal"] = arrMin fieldMeta["maxVal"] = arrMax fieldMeta["stepSize"] = stepSize fieldMeta["binCounts"] = binCounts return dArr, fieldMeta def digitize_py(arr, matType): """ calculate deciles ignoring 0s from arr, use these deciles to digitize the whole arr, return (digArr, zeroCount, bins). bins is an array of (min, max, count) There are at most 11 bins and bin0 is just for the value zero. For bin0, min and max are both 0.0 matType can be "int" or "float". If it is 'int' and arr has only <= 11 values, will not calculate deciles, but rather just count the numbers and use them to create bins, one per number. 
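# Illustrative example of the return value (toy data): for an int vector with
# few distinct values each value gets its own bin,
#   digitize_py([0, 0, 1, 1, 2], "int")
#   -> ([0, 0, 1, 1, 2], [(0, 0, 2), (1, 1, 2), (2, 2, 1)])
# whereas for float vectors bin 0 collects the zeros and the remaining bins are
# deciles of the non-zero values, each reported as (min, max, count).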
#>>> digitize_py([1,1,1,1,1,2,3,4,5,6,4,5,5,5,5], "float") """ if matType=="int": valCount = len(set(arr)) if valCount <= 11: # 10 deciles + 0s counts = Counter(arr).most_common() counts.sort() valToIdx = {} for i, (val, count) in enumerate(counts): valToIdx[val] = i digArr = [valToIdx[x] for x in arr] bins = [] for val, count in counts: bins.append( (val, val, count) ) return digArr, bins noZeroArr = [x for x in arr if x!=0] zeroCount = len(arr) - len(noZeroArr) deciles = getDecilesList(noZeroArr) # there are 10 limits for the 10 deciles, 0% - 90% deciles.insert(0, 0) # bin0 is always for the zeros # we now have 11 limits assert(len(deciles)<=11) # digitize and count bins digArr = [] binCounts = len(deciles)*[0] for x in arr: binIdx = findBin(deciles, x) # bin1 is always empty, so move down all other indices if binIdx>0: binIdx-=1 digArr.append(binIdx) binCounts[binIdx]+=1 # create the bin info bins = [] if zeroCount!=0: bins.append( [float(0), float(0), float(zeroCount)]) for i in range(1, len(deciles)): minVal = deciles[i-1] maxVal = deciles[i] count = binCounts[i] # skip empty bins #if count!=0: bins.append( [float(minVal), float(maxVal), float(count)] ) # add the maximum value explicitly, more meaningful bins[-1][1] = np.amax(arr) return digArr, bins def digitizeArr(arr, numType): if numpyLoaded: return digitize_np(arr, numType) else: return digitize_py(arr, numType) def binEncode(bins): " encode a list of at 11 three-tuples into a string of 33 floats (little endian)" # add (0,0,0) elements to bins until it has 11 elements " padBins = copy.copy(bins) for i in range(len(bins), 11): padBins.append( (0.0, 0.0, 0.0) ) #print len(padBins), padBins, len(padBins) assert(len(padBins)==11) strList = [] for xMin, xMax, count in padBins: strList.append( struct.pack(">> digitize_np([1,2,3,4,5,6,4,1,1,1], "int") #>>> digitize_np([0,0,0,1,1,1,1,1,2,3,4,5,6,4,5,5,5,5], "float") #>>> digitize_np([1,1,1,1,1,2,3,4,5,6,4,5,5,5,5], "float") """ # meta data comes in as a list if not isinstance(arr, np.ndarray): arr = np.array(arr) if matType=="int": # raw counts mode: # first try if there are enough unique values in the array # if there are <= 10 values, deciles make no sense, # so simply enumerate the values and map to bins 0-10 binCounts = np.bincount(arr) nonZeroCounts = binCounts[np.nonzero(binCounts)] # remove the 0s if nonZeroCounts.size <= 11: logging.debug("we have read counts and <11 values: not using quantiles, just enumerating") posWithValue = np.where(binCounts != 0)[0] valToBin = {} bins = [] binIdx = 0 #for val, count in enumerate(binCounts): #if count!=0: for val in posWithValue: count = binCounts[val] bins.append( (val, val, count) ) valToBin[val] = binIdx binIdx += 1 # map values to bin indices, from stackoverflow digArr = np.vectorize(valToBin.__getitem__)(arr) return digArr, bins logging.debug("calculating deciles") # calculate the deciles without the zeros, otherwise # the 0s completely distort the deciles #noZero = np.copy(arr) #nonZeroIndices = np.nonzero(arr) noZero = arr[np.nonzero(arr)] # gene not expressed -> do nothing if noZero.size==0: logging.debug("expression vector is all zeroes") return np.zeros(arr.size, dtype=np.int8), [(0.0, 0.0, arr.size)] deciles = np.percentile( noZero, [0,10,20,30,40,50,60,70,80,90] , interpolation="lower") # make sure that we always have a bin for the zeros deciles = np.insert(deciles, 0, 0) logging.debug("deciles are: %s" % str(deciles)) # now we have 10 limits, defining 11 bins # but bin1 will always be empty, as there is nothing between the 
value 0 and the lowest limit digArr = np.searchsorted(deciles, arr, side="right") # so we decrease all bin indices that are not 0 np.putmask(digArr, digArr>0, digArr-1) binCounts = np.bincount(digArr) bins = [] zeroCount = binCounts[0] # bin0 is a bit special if zeroCount!=0: bins.append( [float(0), float(0), zeroCount] ) for i in range(1, len(deciles)): binCount = binCounts[i] #if binCount==0: #continue minVal = deciles[i-1] maxVal = deciles[i] bins.append( [minVal, maxVal, binCount] ) bins[-1][1] = np.amax(arr) #print bins, len(digArr), digArr return digArr, bins def maxVal(a): if numpyLoaded: return np.amax(a) # if MTX old old numpy is loaded, so isNumpy is false but var is still an ndarray -> # weird syntax to avoid syntax error on undefined "np" elif 'np' in dir(): if isinstance(a, np.ndarray): return np.amax(a) elif 'numpy' in sys.modules: import numpy if isinstance(a, numpy.ndarray): return numpy.amax(a) else: return max(a) #def discretExprRowEncode(geneDesc, binInfo, digArr): # " encode geneDesc, deciles and array of decile indixes into a single string that can be read by the .js code " # # The format of a record is: # # - 2 bytes: length of descStr, e.g. gene identifier or else # # - len(descStr) bytes: the descriptive string descStr # # - 132 bytes: 11 deciles, encoded as 11 * 3 floats (=min, max, count) # # - array of n bytes, n = number of cells # decChrList = [chr(x) for x in digArr] # decStr = "".join(decChrList) # geneIdLen = struct.pack(" force to a list if str(type(exprArr))=="": exprArr = exprArr.tolist()[0] # Python 3.9 removed tostring() if sys.version_info >= (3, 2): exprStr = array.array(arrType, exprArr).tobytes() else: exprStr = array.array(arrType, exprArr).tostring() minVal = min(exprArr) if isPy3: geneStr = geneIdLen+bytes(geneDesc, encoding="ascii")+exprStr else: geneStr = geneIdLen+geneDesc+exprStr geneCompr = zlib.compress(geneStr) fact = float(len(geneCompr)) / len(geneStr) logging.debug("raw - compression factor of %s: %f, before %d, after %d"% (geneDesc, fact, len(geneStr), len(geneCompr))) return geneCompr, minVal def indexAtacOffsetsByChrom(exprIndex): """ given a dict with name -> (offset, len) and name being a string of chrom:start-end, reformat the dict to one with chrom -> [ [start, end, offset, len], ... ] and return it. The positions are sorted. """ byChrom = defaultdict(list) for chromRange, (offs, dataLen) in iterItems(exprIndex): # chromRange can be in format chr:start-end or chr_start_end or chr-start-end if ":" in chromRange: chrom, startEnd = chromRange.split(":") start, end = startEnd.split("-") elif "_" in chromRange: chrom, start, end = chromRange.split("_") else: chrom, start, end = chromRange.split("-") # chrom names must be in format chrom:start-end or chrom_start_end or chrom-start-end. Broken? 
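# ---------------------------------------------------------------------------
# Sketch for illustration, not part of cellbrowser.py: how a peak name in one
# of the three accepted spellings (chr1:100-200, chr1_100_200 or chr1-100-200)
# can be split into (chrom, start, end), mirroring what
# indexAtacOffsetsByChrom() does above. parse_range() is a made-up name.
def parse_range(chromRange):
    " return (chrom, start, end) for chr:start-end, chr_start_end or chr-start-end "
    if ":" in chromRange:
        chrom, startEnd = chromRange.split(":")
        start, end = startEnd.split("-")
    elif "_" in chromRange:
        chrom, start, end = chromRange.split("_")
    else:
        chrom, start, end = chromRange.split("-")
    return chrom, int(start), int(end)

# example: parse_range("chr1:1000-2000") == ("chr1", 1000, 2000)
# ---------------------------------------------------------------------------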
Email us at cells@ucsc.edu start = int(start) end = int(end) byChrom[chrom].append( (start, end, offs, dataLen) ) # sort by start for chrom, posArr in iterItems(byChrom): posArr.sort() return dict(byChrom) def matrixToBin(fname, geneToSym, binFname, jsonFname, discretBinFname, discretJsonFname, \ metaSampleNames, matType=None, genesAreRanges=False): """ convert gene expression vectors to vectors of deciles and make json gene symbol -> (file offset, line length) """ logging.info("converting %s to %s and writing index to %s, type %s" % (fname, binFname, jsonFname, matType)) logging.info("Compressing gene expression vectors...") tmpFname = binFname + ".tmp" ofh = open(tmpFname, "wb") discretTmp = discretBinFname + ".tmp" discretOfh = open(discretTmp, "w") discretIndex = {} exprIndex = {} skipIds = 0 highCount = 0 if isMtx(fname): matReader = MatrixMtxReader(geneToSym) else: matReader = MatrixTsvReader(geneToSym) matReader.open(fname, matType=matType) if matType is None: matType = matReader.getMatType() sampleNames = matReader.getSampleNames() # filter matrix: find the indices of sample names that are in the matrix metaSet = set(metaSampleNames) matrixSet = set(sampleNames) idxList = None if len(matrixSet - metaSet)!=0: idxList = [] for i in range(len(sampleNames)): sampleName = sampleNames[i] if sampleName in metaSet: idxList.append(i) if numpyLoaded: idxList = np.array(idxList) logging.debug("Filtering %d matrix samples down to %d" % (len(matrixSet), len(idxList))) assert(len(metaSet-matrixSet)==0) # at this stage, samples with meta but not in matrix cannot happen dataType = "genes" if genesAreRanges: dataType = "genome-peaks" symCounts = defaultdict(int) geneCount = 0 atacChromCount = 0 allMin = 99999999 noSymFound = 0 for geneId, sym, exprArr in matReader.iterRows(): geneCount += 1 key = sym if sym is None: noSymFound +=1 else: symCounts[key]+=1 if geneId!=sym and sym is not None: key = geneId+"|"+sym if geneId.startswith("chr"): atacChromCount+=1 if symCounts[key] > 1000: errAbort("The gene ID/symbol %s appears more than 1000 times in the expression matrix. " "Are you sure that the matrix is in the right format? Each gene should be on a row. " "The gene ID must be in the first column and " "can optionally include the gene symbol, e.g. 'ENSG00000142168|SOD1'. " % key) if maxVal(exprArr) > 100: highCount += 1 logging.debug("Processing %s, symbol %s" % (geneId, sym)) # filter the row down to the meta-samples if idxList is not None: if numpyLoaded: exprArr = exprArr[idxList] else: exprArr = [exprArr[i] for i in idxList] exprStr, minVal = exprEncode(geneId, exprArr, matType) exprIndex[key] = (ofh.tell(), len(exprStr)) ofh.write(exprStr) if geneCount % 1000 == 0: logging.info("Wrote compressed expression values for %d %s" % (geneCount, dataType)) allMin = min(allMin, minVal) discretOfh.close() ofh.close() if noSymFound!=0: logging.warn("No symbol found for %d genes" % noSymFound) if genesAreRanges: logging.info("ATAC-mode is one. Assuming that genes are in format chrom:start-end or chrom_start_end or chrom-start-end") exprIndex = indexAtacOffsetsByChrom(exprIndex) else: if atacChromCount > 100: errAbort("There are more than 100 genes that look like a chrom_start_end range but the atacSearch cellbrowser.conf" " is not set. Please add this option or contact us if you are confused about this error.") if highCount==0: logging.warn("No single value in the matrix is > 100. It looks like this " "matrix has been log'ed. 
Our recommendation for visual inspection is to not transform matrices") if len(exprIndex)==0: errAbort("No genes from the expression matrix could be mapped to symbols." "Are you sure these are Ensembl IDs? Adapt geneIdType in cellbrowser.conf.") # keep a flag so the client later can figure out if the expression matrix contains any negative values # this is important for handling the 0-value exprIndex["_range"] = (int(allMin),0) logging.info("Global minimum in matrix is: %f" % allMin) jsonOfh = open(jsonFname, "w") json.dump(exprIndex, jsonOfh) jsonOfh.close() jsonOfh = open(discretJsonFname, "w") json.dump(discretIndex, jsonOfh) jsonOfh.close() renameFile(tmpFname, binFname) renameFile(discretTmp, discretBinFname) return matType def sepForFile(fname): if fname.endswith(".csv") or fname.endswith(".csv.gz") or fname.endswith(".csv.Z"): sep = "," elif ".tsv" in fname or ".tab" in fname: sep = "\t" else: ifh = openFile(fname) line1 = ifh.readline() if "\t" in line1: sep = "\t" else: sep = "," logging.debug("Separator for %s is %s" % (fname, repr(sep))) return sep def indexMeta(fname, outFname): """ index a tsv by its first field. Writes binary data to outFname. binary data is (offset/4 bytes, line length/2 bytes) """ # traditionally, I always used four bytes. The biggest datasets have meta data # that is bigger than 4GB, however indexFormat = " 4294967295: indexFormat = " pip install webcolors try: color = webcolors.name_to_hex(color, spec='css3').lstrip("#") except ValueError: # R knows more colors, like deeppink4. We simply map to deeppink for now # there does not seem to be a good table with R colors in Python yet color = "".join([c for c in color if not c.isdigit()]) color = webcolors.name_to_hex(color, spec='css3').lstrip("#") newDict[metaVal] = color return newDict def parseColors(inDir, inConf, outConf, colData): """ parse color table and return as dict fieldName -> value -> color. Can be a single filename (backwards compat) or a dict fieldName -> filename """ if not "colors" in inConf: return {} metaFieldNames = [x for x,y in colData] # so we can check that fieldnames are valid colorConf = inConf["colors"] if type(colorConf)==type(""): logging.debug("Color config is a single string, falling back to old behavior") colorConf = { "__default__":colorConf } colors = {} for fieldName, fname in colorConf.items(): if fieldName != "__default__" and fieldName not in metaFieldNames: logging.warn("fieldName %s from 'colors' specification is not a valid meta data field name. Possible names are: %s" % (repr(fieldName), repr(metaFieldNames))) continue fname = abspath(join(inDir, fname)) if isfile(fname): outConf["fileVersions"]["colors:"+fieldName] = getFileVersion(fname) else: if fieldName=="__default__": # old behavior was to tolerate missing color files. Not sure why. Tolerating this now only for the default field. logging.warn("Color file %s does not exist, skipping it" % fname) continue else: errAbort("Color file %s does not exist, skipping it" % fname) colors[fieldName] = parseOneColorFile(fname) return colors def scalePoint(scaleX, scaleY, minX, maxX, minY, maxY, flipY, useTwoBytes, x, y): " scale a point to the new coordinate system. 
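# ---------------------------------------------------------------------------
# Sketch for illustration, not part of cellbrowser.py: the core of the
# two-byte coordinate encoding done by scalePoint()/calcScaleFact(), i.e.
# shifting each point by the axis minimum and scaling it into 0..65535 so it
# fits an unsigned 16-bit integer. The real calcScaleFact() additionally
# preserves the aspect ratio, which this sketch skips. scale_to_uint16() is a
# made-up name.
def scale_to_uint16(x, y, minX, maxX, minY, maxY):
    " map (x, y) from its data range into 0..65535 on both axes "
    scaleX = 65535.0 / (maxX - minX)
    scaleY = 65535.0 / (maxY - minY)
    return int(scaleX * (x - minX)), int(scaleY * (y - minY))

# example: scale_to_uint16(5.0, -2.5, 0.0, 10.0, -5.0, 5.0) == (32767, 16383)
# ---------------------------------------------------------------------------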
used for either lines or points " if useTwoBytes or scaleY!=1.0 or scaleX!=1.0: x = int(scaleX * (x - minX)) y = int(scaleY * (y - minY)) if flipY: y = maxY - y return x,y def scaleCoords(coords, limits): " scale a list of named 2D-coords to be between min-max limits, return as a dict cellId -> point " logging.debug("scaling coords, limits are: %s" % repr(limits)) minX, maxX, minY, maxY, scaleX, scaleY, useTwoBytes, flipY = limits newCoords = {} for cellId, x, y in coords: x, y = scalePoint(scaleX, scaleY, minX, maxX, minY, maxY, flipY, useTwoBytes, x, y) newCoords[cellId] = (x, y) return newCoords def calcScaleFact(minX, maxX, minY, maxY, useTwoBytes): logging.debug("making scale factors from limits: %f, %f, %f, %f" % (minX, maxX, minY, maxY)) scaleX = 1.0 scaleY = 1.0 if useTwoBytes: aspect = float(maxX-minX)/float(maxY-minY) # the usual case: wider than high if (aspect > 1.0): scaleX = 65535/(maxX-minX) scaleY = (1/aspect)*65535/(maxY-minY) else: # higher than wide scaleX = aspect*65535/(maxX-minX) scaleY = 65535/(maxY-minY) minX = scaleX * minX maxX = scaleX * maxX minY = scaleY * minY maxY = scaleY * maxY logging.debug("2-byte mode: aspect ratio %.12f, scaleX %.12f, scaleY %.12f" % (aspect, scaleX, scaleY)) return scaleX, scaleY, minX, maxX, minY, maxY def parseCoordsAsDict(fname): """ parse tsv file in format cellId, x, y and return as list of (cellId, x, y) """ logging.debug("Parsing coordinates from %s." % (fname)) coords = [] skipCount = 0 # parse and find the max values warn1Done = False warn2Done = False headDone = False for row in lineFileNextRow(fname, noHeaders=True): if (len(row)<3): if not warn1Done: errAbort("file %s needs to have at least three columns" % fname) warn1Done = True if (len(row)>3): # coord file has to have three rows (cellId, x, y), we just ignore the headers if not warn2Done: logging.warn("file %s has more than three columns. Everything beyond column 3 will be ignored" % fname) warn2Done = True cellId = row[0] try: x = float(row[1]) y = float(row[2]) except: if headDone: logging.warn("file %s: cannot parse x,y coords, skipping line %s" % (fname, row)) headDone = True continue coords.append( (cellId, x, y) ) return coords def sliceRow(row, skipFields): " yield all fields, except the ones with an index in skipFields " for i, val in enumerate(row): if i not in skipFields: yield val def readMatrixSampleNames(fname): " return a list of the sample names of a matrix fname " if fname.endswith(".mtx.gz"): matrixPath = dirname(fname) barcodePath = findMtxFiles(fname)[2] logging.info("Reading sample names for %s -> %s" % (matrixPath, barcodePath)) lines = openFile(barcodePath).read().splitlines() ret = [] for l in lines: ret.append(l.split("\t")[0]) return ret else: names = readHeaders(fname)[1:] return names def metaReorderFilter(matrixFname, metaFname, fixedMetaFname, keepFields): """ check and reorder the meta data, has to be in the same order as the expression matrix, write to fixedMetaFname. Remove single value fields from the meta data, except if they are in keepFields. """ logging.info("Checking and reordering meta data to %s" % fixedMetaFname) metaSampleNames = readSampleNames(metaFname) matrixSampleNames = readMatrixSampleNames(matrixFname) if len(matrixSampleNames)==0: errAbort("Could not read a single sample name from the matrix. 
Internal error") # check that there is a 1:1 sampleName relationship mat = set(matrixSampleNames) meta = set(metaSampleNames) if len(meta)!=len(metaSampleNames): logging.error("sample names in the meta data differ in length from the sample names in the matrix: %d sample names in the meta data, %d sample names in the matrix" % (len(meta), len(metaSampleNames))) sys.exit(1) if len(mat.intersection(meta))==0: logging.error("Meta data and expression matrix have no single sample name in common. Sure that the expression matrix has one gene per row? Example Meta ID: %s, Example matrix ID: %s" % (list(sorted(metaSampleNames))[0], list(sorted(matrixSampleNames))[0])) sys.exit(1) metaNotMatrix = meta - mat matrixNotMeta = mat - meta stop = False mustFilterMatrix = False if len(matrixNotMeta)!=0: logging.warn("%d sample names are in the expression matrix, but not in the meta data. Examples: %s" % (len(matrixNotMeta), list(matrixNotMeta)[:10])) logging.warn("These samples will be removed from the expression matrix, if possible") matrixSampleNames = [x for x in matrixSampleNames if x in meta] mustFilterMatrix = True if len(metaNotMatrix)!=0: logging.warn("%d sample names are in the meta data, but not in the expression matrix. Examples: %s" % (len(metaNotMatrix), list(metaNotMatrix)[:10])) logging.warn("These samples will be removed from the meta data.") # filter the meta data file logging.info("Data contains %d samples/cells" % len(matrixSampleNames)) # slurp in the whole meta data, keep track of which fields contain only a single value tmpFname = fixedMetaFname+".tmp" ofh = open(tmpFname, "w") metaToRow = {} fieldValues = defaultdict(set) headers = None for row in textFileRows(metaFname): if headers is None: headers = row continue metaToRow[row[0]] = row for fieldIdx, val in enumerate(row): fieldValues[fieldIdx].add(val) # find fields that contain only a single value skipFields = set() for fieldIdx, values in iterItems(fieldValues): #logging.debug("fieldIdx %d, values %s" % (fieldIdx, values)) if len(values)==1: fieldName = headers[fieldIdx] logging.info("Field %d, '%s', has only a single value. Removing this field from meta data." % (fieldIdx, fieldName)) if fieldName in keepFields: logging.warn("Actually, keeping field '%s' since its the default label or color field, " "but check with submitter if this makes sense." % fieldName) else: skipFields.add(fieldIdx) # write the header line, removing unused fields ofh.write("\t".join(sliceRow(headers, skipFields))) ofh.write("\n") # and write the rows in the right order, also removing unused fields for matrixName in matrixSampleNames: ofh.write("\t".join(sliceRow(metaToRow[matrixName], skipFields))) ofh.write("\n") ofh.close() renameFile(tmpFname, fixedMetaFname) return matrixSampleNames, mustFilterMatrix def writeCoords(coordName, coords, sampleNames, coordBinFname, coordInfo, textOutName, limits): """ write coordinates given as a dictionary to coordBin, in the order of sampleNames Also return as a list. 
""" tmpFname = coordBinFname+".tmp" logging.info("Writing coordinates for %s" % (coordName)) binFh = open(tmpFname, "wb") debugFh = None if debugMode: debugFname = coordBinFname+".txt" debugFh = open(debugFname, "w") minX, maxX, minY, maxY, scaleX, scaleY, useTwoBytes, flipY = limits xVals = [] yVals = [] textOutTmp = textOutName+".tmp" textOfh = open(textOutTmp, "w") missNames = [] for sampleName in sampleNames: coordTuple = coords.get(sampleName) if coordTuple is None: missNames.append(sampleName) x = HIDDENCOORD y = HIDDENCOORD else: x, y = coordTuple textOfh.write("%s\t%f\t%f\n" % (sampleName, x, y)) # all little endian if useTwoBytes: x = int(x) y = int(y) binFh.write(struct.pack(" %s" % (inFname, outFname) ret = runCommand(cmd) if ret!=0 and isfile(outFname): os.remove(outFname) sys.exit(1) return matType sep = "\t" logging.info("Copying+reordering+trimming %s to %s, keeping %d columns with sample ID in meta" % (inFname, outFname, len(filtSampleNames))) logging.debug("matrix type: %s" % matType) matIter = MatrixTsvReader(geneToSym) matIter.open(inFname, matType=matType) sampleNames = matIter.getSampleNames() keepFields = set(filtSampleNames) keepIdx = [] doneSampleNames = set() for i, name in enumerate(sampleNames): if name in keepFields: if name in doneSampleNames: logging.warn("sample '%s' appears at least twice in expression matrix. Only the first one is used now" % name) else: keepIdx.append(i) doneSampleNames.add(name) assert(len(keepIdx)!=0) logging.debug("Keeping %d fields" % len(keepIdx)) tmpFname = outFname+".tmp" ofh = openFile(tmpFname, "w") ofh.write("gene\t") ofh.write("\t".join(filtSampleNames)) ofh.write("\n") count = 0 for geneId, sym, exprArr in matIter.iterRows(): newRow = [geneId+"|"+sym] for idx in keepIdx: newRow.append(str(exprArr[idx])) ofh.write("\t".join(newRow)) ofh.write("\n") count += 1 if count%1000==0: logging.info("Wrote %d text rows" % count) ofh.close() matIter.close() runGzip(tmpFname, outFname) return matIter.getMatType() def convIdToSym(geneToSym, geneId, printWarning=True): if "|" in geneId: return geneId.split("|")[1] if geneToSym is None: return geneId else: if geneId not in geneToSym: if printWarning: logging.warn("Could not convert geneId %s to symbol" % geneId) return geneId return geneToSym[geneId] def to_camel_case(snake_str): components = snake_str.split('_') # We capitalize the first letter of each component except the first one # with the 'title' method and join them together. return components[0] + ''.join(x.title() for x in components[1:]) nonAscii = re.compile(r'[^a-zA-Z_0-9+]') def sanitizeName(name): " remove all nonalpha chars, allow underscores, special treatment for %, + and -. Makes a valid file name. " assert(name!=None) origName = name # some characters are actually pretty common and there we have seen fields where the only # difference are these characters # if this continues to be aproblem, maybe append the MD5 of a raw field name to the sanitized name # IMPORTANT: There is a javascript version of this in cellBrowser.js. The function has the same # name and must return the same results, otherwise the front end can't find the filename # for a given cluster name. 
name = name.replace("+", "Plus").replace("-", "Minus").replace("%", "Perc") #newName = ''.join([ch for ch in name if (ord(chr) > 255 or ch.isalnum() or ch=="_")]) #newName = ''.join([ch for ch in name if (ord(chr) > 255 or ch.isalnum() or ch=="_")]) newName = nonAscii.sub("", name) if newName!=name: logging.debug("Sanitized %s -> %s" % (repr(name), newName)) if (len(newName)==0): errAbort("Field name %s has only special chars?" % repr(origName)) return newName def nonAlphaToUnderscores(name): " for tab-sep tables: replace nonalpha chars with underscores. Makes a valid name for namedtuple. " assert(name!=None) newName = name.replace("+", "Plus").replace("-", "Minus").replace("%", "Perc") newName = re.sub("[^a-zA-Z0-9_]","_", newName) newName = re.sub("^_","", newName) # remove _ prefix logging.debug("Sanitizing %s -> %s" % (repr(name), newName)) return newName def guessMarkerFields(headers): """ return a tuple (newHeaders, clusterIdx, geneIdx, otherStart, otherEnd), all of these are 0-based indexes of the respective fields. Should recognize Seurat2 and Seurat3 formats. """ # note: with seurat2, field 0 is not the gene ID, it has some weird suffix appended. seurat2Headers = ['', 'p_val', 'avg_logFC', 'pct.1', 'pct.2', 'p_val_adj', 'cluster', 'gene'] seurat3Headers = ["Gene","p-value","log2(FoldChange)","pct.1","pct.2","adjusted p-value","Cluster"] seurat4Headers = ['', 'p_val', 'avg_log2FC', 'pct.1', 'pct.2', 'p_val_adj', 'cluster', 'gene'] # the name of the first field varies depending now people use table.write, so we ignore it for the # header comparison if headers[1:8] == seurat2Headers[1:8] or headers[1:8]==seurat4Headers[1:8]: logging.info("Cluster marker file was recognized to be in Seurat2/4 format") geneIdx = 7 # see note above: field0 is not the actual geneId, but something else, a unique row ID scoreIdx = 1 clusterIdx = 6 otherStart = 2 otherEnd = len(headers) newHeaders = ["rowNameFromR", "pVal", "avg. logFC", "PCT1", "PCT2", "pVal adj.", "Cluster", "Gene"] newHeaders.extend(headers[8:]) headers = newHeaders elif headers[1:7] == seurat3Headers[1:]: logging.info("Cluster marker file was recognized to be in Seurat3 format") geneIdx = 0 scoreIdx = 1 clusterIdx = 6 otherStart = 2 otherEnd = len(headers) else: logging.info("Assuming non-Seurat marker file format (cluster, gene, score) + any other fields") clusterIdx = 0 geneIdx = 1 scoreIdx = 2 otherStart = 3 otherEnd = 9999 return headers, clusterIdx, geneIdx, scoreIdx, otherStart, otherEnd def parseMarkerTable(filename, geneToSym): " parse marker gene table and return dict clusterName -> list of rows (geneId, geneSym, otherFields...)" logging.debug("Reading cluster markers from %s" % (filename)) ifh = openFile(filename) headerLine = ifh.readline().rstrip("\r\n") sep = sepForFile(filename) headers = headerLine.split(sep) headers, clusterIdx, geneIdx, scoreIdx, otherStart, otherEnd = guessMarkerFields(headers) otherHeaders = headers[otherStart:otherEnd] logging.debug("Other headers: %s" % otherHeaders) reader = csv.reader(ifh, delimiter=sep, quotechar='"') data = defaultdict(list) otherColumns = defaultdict(list) for row in reader: if row[0].startswith("#"): continue clusterName = row[clusterIdx] if clusterName=="": errAbort("Found empty string as cluster name in cluster marker file. Please fix the marker file.") geneId = row[geneIdx] try: scoreVal = float(row[scoreIdx]) except ValueError: logging.error("File %s, column %d: value is not a number but we expect a number in this colum." "Columns should be: cluster, gene, score. 
'%s' is not a score`" % (filename, scoreIdx+1, row[scoreIdx])) raise otherFields = row[otherStart:otherEnd] for colIdx, val in enumerate(otherFields): otherColumns[colIdx].append(val) geneSym = convIdToSym(geneToSym, geneId, printWarning=False) if "|" in geneId: geneId = geneId.split("|")[0] newRow = [] newRow.append(geneId) newRow.append(geneSym) newRow.append(scoreVal) newRow.extend(otherFields) data[clusterName].append(newRow) # annotate otherColumns with their data type, separated by |. This is optional and only used for sorting # in the user interface otherColType = {} for colIdx, vals in iterItems(otherColumns): otherColType[colIdx] = typeForStrings(vals) otherHeadersWithType = [] for colIdx, header in enumerate(otherHeaders): if otherColType[colIdx]!="string": header = header+"|"+otherColType[colIdx] otherHeadersWithType.append(header) logging.debug("Other headers with type: %s" % otherHeadersWithType) newHeaders = ["id", "symbol", headers[scoreIdx]+"|float"] newHeaders.extend(otherHeadersWithType) if len(data) > 1000: errAbort("Your marker file has more than 1000 clusters. Are you sure that this is correct? The input format is (clusterName, geneSymName, Score), is it possible that you have inversed the order of cluster and gene?") # determine if the score field is most likely a p-value, needed for sorting revSort = True scoreHeader = headers[scoreIdx].lower() logging.debug("score field has name '%s'" % scoreHeader) if scoreHeader in ["p_val", "p-val", "p.val", "pval", "fdr"]: logging.debug("score field name '%s' looks like a p-Value, sorting normally" % scoreHeader) revSort = False for clusterName, rows in iterItems(data): rows.sort(key=operator.itemgetter(2), reverse=revSort) return data, newHeaders -def splitMarkerTable(filename, geneToSym, outDir): +def splitMarkerTable(filename, geneToSym, matrixGeneIds, outDir): """ split .tsv on first field and create many files in outDir with columns 2-end. Returns the names of the clusters and a dict topMarkers with clusterName -> list of five top marker genes. """ topMarkerCount = 5 if filename is None: return data, newHeaders = parseMarkerTable(filename, geneToSym) logging.debug("Splitting cluster markers into directory %s" % (outDir)) fileCount = 0 sanNames = set() topMarkers = {} for clusterName, rows in iterItems(data): logging.debug("Cluster: %s" % clusterName) sanName = sanitizeName(clusterName) assert(sanName not in sanNames) # after removing special chars, cluster names must still be unique. this is most likely due to typos in your meta annotation table. If it is not, we need to change the code here. 
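# ---------------------------------------------------------------------------
# Sketch for illustration, not part of cellbrowser.py: the behavior added in
# this commit, where marker rows whose gene is not present in the expression
# matrix are skipped and reported instead of aborting the build.
# write_cluster_markers() is a made-up name; rows follow the
# (geneId, symbol, score, ...) layout produced by parseMarkerTable().
import logging

def write_cluster_markers(rows, matrixGeneIds):
    " return (keptRows, missingGenes) after filtering rows against the matrix genes "
    kept = []
    missing = set()
    for row in rows:
        if row[1] not in matrixGeneIds:
            missing.add(row[1])
            continue
        kept.append(row)
    if missing:
        logging.error("Marker table contains these genes, they were skipped, "
                      "they are not in the matrix: %s" % ",".join(sorted(missing)))
    return kept, missing

# example:
# write_cluster_markers([("ENSG1", "SOD1", 0.01), ("ENSG2", "FAKE1", 0.2)], {"SOD1"})
# -> keeps only the SOD1 row and logs FAKE1 as skipped
# ---------------------------------------------------------------------------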
sanNames.add(sanName) outFname = join(outDir, sanName+".tsv") logging.debug("Writing %s" % outFname) ofh = open(outFname, "w") ofh.write("\t".join(newHeaders)) ofh.write("\n") + missGeneIds = set() for row in rows: row[2] = "%0.5E" % row[2] # limit score to 5 digits + geneId = row[1] + if geneId not in matrixGeneIds: + missGeneIds.add(geneId) + continue + ofh.write("\t".join(row)) ofh.write("\n") + if len(missGeneIds)!=0: + logging.error("Marker table contains these genes, they were skipped, they are not in the matrix: %s" % (",".join(missGeneIds))) + #logging.error("Use --force to accept this.") + topSyms = [row[1] for row in rows[:topMarkerCount]] topMarkers[clusterName] = topSyms ofh.close() runGzip(outFname) fileCount += 1 logging.info("Wrote %d .tsv.gz files into directory %s" % (fileCount, outDir)) return data.keys(), topMarkers def syncFiles(inFnames, outDir): " compare filesizes and copy to outDir, if needed " logging.info("Syncing %s to %s" % (inFnames, outDir)) for inFname in inFnames: outFname = join(outDir, basename(inFname)) if not isfile(outFname) or getsize(inFname)!=getsize(outFname): logging.debug("Copying %s to %s" % (inFname, outFname)) shutil.copyfile(inFname, outFname) def copyDatasetHtmls(inDir, outConf, datasetDir): " copy dataset description html files to output directory. Add md5s to outConf. " filesToCopy = ["summary.html", "methods.html", "downloads.html", "thumb.png", "protocol.pdf", "desc.conf"] outConf["descMd5s"] = {} for fileBase in filesToCopy: inFname = makeAbs(inDir, fileBase) if not isfile(inFname): logging.debug("%s does not exist" % inFname) else: outPath = join(datasetDir, fileBase) logging.debug("Copying %s -> %s" % (inFname, outPath)) shutil.copyfile(inFname, outPath) outConf["descMd5s"][fileBase.split(".")[0]] = md5ForFile(inFname)[:MD5LEN] def getImageSize(inFname): " return width, height of image file " cmd = ["file", inFname] proc, stdout = popen(cmd) imgLine = stdout.readline() # thumb.png: PNG image data, 400 x 267, 8-bit/color RGBA, non-interlaced sizeStr = imgLine.split(":")[1].split(", ")[1] if " x " in sizeStr: width, height = sizeStr.split(" x ") else: cmd = ["identify", inFname] proc, stdout = popen(cmd) imgLine = stdout.readline() # thumb.jpg JPEG 400x566 400x566+0+0 8-bit DirectClass 39.1KB 0.000u 0:00.010 sizeStr = imgLine.split(" ")[2] width, height = sizeStr.split("x") return int(width), int(height) def copyImage(inDir, summInfo, datasetDir): """ copy image to datasetDir and write size to summInfo["imageWidth"] and "imageHeight" """ if summInfo["image"]=="": logging.warn("empty string for the 'image' tag in desc.conf, removing it entirely") del summInfo["image"] return summInfo inFname = join(inDir, summInfo["image"]) logging.debug("Copying %s to %s" % (inFname, datasetDir)) shutil.copyfile(inFname, join(datasetDir, basename(inFname))) width, height = getImageSize(inFname) summInfo["image"] = (summInfo["image"], width, height) summInfo["imageMd5"] = md5ForFile(inFname) if "imageMapFile" in summInfo: mapFname = join(inDir, summInfo["imageMapFile"]) logging.debug("Loading image map from %s" % mapFname) areaData = open(mapFname, "r").read() summInfo["imageMap"] = areaData del summInfo["imageMapFile"] return summInfo def readFile(fname, encoding="utf8"): " read entire file with encoding " if isPy3: fh = open(fname, "r", encoding=encoding) text = fh.read() else: fh = open(fname, "r") text = fh.read().decode(encoding) fh.close() return text def readFileIntoDict(summInfo, key, inDir, fname, mustExist=False, encoding="utf8"): " return file 
with encoding as string into dictionary " fname = join(inDir, fname) if not isfile(fname): if mustExist: errAbort("%s does not exist" % fname) else: return logging.debug("Reading file into dict. %s, inDir %s, file name %s" % (key, inDir, fname)) if fname.endswith(".json"): data = readJson(fname) summInfo[key] = data else: text = readFile(fname) summInfo[key] = text def copyImageFile(inDir, data, outDir, doneNames, imageSetFnames): """ given data as a dict, try to copy files with key 'file' and 'thumb' to outDir. Also, strip the whole directory from the file name and keep only the base name. doneNames and imageSetFnames are used to catch duplicate-filename cases. """ for key in ["file", "thumb"]: if key not in data: continue rawInPath = join(inDir, data[key]) base = basename(rawInPath) # strip directory part if base in doneNames and base not in imageSetFnames: logging.warn("The basename %s is used twice in the images.json definition. Note that all image names " "must be unique as users may want to download images. Often you can make them unique by " "making their subdirectory part of the filename." % base) data[key] = base rawOutPath = join(outDir, base) if not isfile(rawOutPath) or getsize(rawInPath)!=getsize(rawOutPath): logging.info("Copying %s to %s" % (rawInPath, rawOutPath)) shutil.copyfile(rawInPath, rawOutPath) else: logging.debug("Not copying %s again, already in output directory" % rawInPath) doneNames.add(base) imageSetFnames.add(base) def copyBackgroundImages(inDir, inConf, outConf, datasetDir): " copy over spatial images and get their dimensions " if not "images" in inConf: return newImages = [] for imageInfo in inConf["images"]: inFname = join(inDir, imageInfo["file"]) #outConf["fileVersions"]["spatial_"+imageInfo["label"]] = getFileVersion(inFname) logging.info("Copying %s to %s" % (inFname, datasetDir)) shutil.copyfile(inFname, join(datasetDir, basename(inFname))) #width, height = getImageSize(inFname) #imageInfo["width"] = width #imageInfo["height"] = height addMd5(imageInfo, inFname) newImages.append(imageInfo) outConf["images"] = newImages def parseImageTsv(fname): """ parse a tsv file with images to a nested dict structure and return it input is a four or five column tsv file with mandatory headers: section, set, label, fname, (isDownload, optional) output is a nested dict: list of categories (dict) Each category has "categoryLabel" and "categoryImageSets" (list) Each imageSet is a dict with keys: setLabel and images and downloads. Each image has keys "file" and "label". Each download has keys "file" and "label". """ headerOk = False sections = OrderedDict() headers = None for row in textFileRows(open(fname)): row[0] = row[0].lstrip("#") if headers is None: headers = row if not headers[:4]==["section", "set", "label", "fname"]: headerOk = True if headerOk: errAbort("imagesFile files must be .json or .tsv. If .tsv, must have at least these columns in this order: section, set, label, fname. 
Header found was: %s" % headers) continue section, imgSet, label, fname = row isDownload = False if len(row)>4: isDownload = bool(row[4]) sections.setdefault(section, OrderedDict()) sections[section].setdefault(imgSet, OrderedDict()) sections[section][imgSet].setdefault(isDownload, []).append( (label, fname) ) out = [] for sectionName, setDict in sections.items(): sectDict = {} sectDict["categoryLabel"] = sectionName for setName, downDict in setDict.items(): setDict = {} setDict["setLabel"] = setName for isDownload, imgs in downDict.items(): for img in imgs: label, fname = img if isDownload: keyName = "downloads" else: keyName = "images" setDict.setdefault(keyName, []).append({"file":fname, "label":label}) sectDict.setdefault("categoryImageSets", []).append(setDict) out.append(sectDict) return out def writeDatasetDesc(inDir, outConf, datasetDir, coordFiles=None, matrixFname=None): " write a json file that describes the dataset abstract/methods/downloads " confFname = join(inDir, "datasetDesc.conf") if not isfile(confFname): confFname = join(inDir, "desc.conf") # old files don't have the outConf object yet if "fileVersions" not in outConf: outConf["fileVersions"] = {} if isfile(confFname): outConf["fileVersions"]["desc"] = getFileVersion(confFname) summInfo = loadConfig(confFname) outConf["hasFiles"] = ["datasetDesc"] else: logging.debug("Could not find %s" % confFname) summFname = join(inDir, "summary.html") if not summFname: logging.debug("no summary.html found") else: summInfo = {} outPath = join(datasetDir, "desc.json") if coordFiles: summInfo["coordFiles"] = coordFiles if matrixFname: summInfo["matrixFile"] = matrixFname # try various ways to get the abstract and methods html text readFileIntoDict(summInfo, "abstract", inDir, "abstract.html") readFileIntoDict(summInfo, "methods", inDir, "methods.html") # this is only for very old datasets with a desc.conf - deprecated readFileIntoDict(summInfo, "abstract", inDir, "summary.html") readFileIntoDict(summInfo, "downloads", inDir, "downloads.html") if "abstractFile" in summInfo: readFileIntoDict(summInfo, "abstract", inDir, summInfo["abstractFile"], mustExist=True) del summInfo["abstractFile"] if "methodsFile" in summInfo: readFileIntoDict(summInfo, "methods", inDir, summInfo["methodsFile"], mustExist=True) del summInfo["methodsFile"] if "imagesFile" in summInfo: imgFname = summInfo["imagesFile"] if imgFname.endswith(".json"): readFileIntoDict(summInfo, "imageSets", inDir, imgFname, mustExist=True) else: summInfo["imageSets"] = parseImageTsv(imgFname) outConf["fileVersions"]["supplImageConf"] = getFileVersion(imgFname) #else: #readFileIntoDict(summInfo, "imageSets", inDir, "images.json") # import the unit description from cellbrowser.conf if "unit" in outConf and not "unitDesc" in summInfo: summInfo["unitDesc"] = outConf["unit"] # import the atachSearch attribute from cellbrowser.conf if "atacSearch" in outConf and not "atacSearch" in summInfo: summInfo["atacSearch"] = outConf["atacSearch"] # copy over the raw matrix file, usually this is a zip or gzip file if "rawMatrixFile" in summInfo: rawInPath = join(inDir, summInfo["rawMatrixFile"]) rawOutPath = join(datasetDir, basename(summInfo["rawMatrixFile"])) if not isfile(rawOutPath) or getsize(rawInPath)!=getsize(rawOutPath): logging.info("Copying %s to %s" % (rawInPath, rawOutPath)) shutil.copyfile(rawInPath, rawOutPath) else: logging.info("Not copying %s again, already in output directory" % rawInPath) # copy over any other supplemental files if "supplFiles" in summInfo: for sf in 
summInfo["supplFiles"]: if not "file" in sf or not "label" in sf: errAbort("The supplFiles entries in desc.conf all must include at least a 'file' and a 'label' key. " "The entry %s doesn't seem to include one." % sf) rawInPath = join(inDir, sf["file"]) rawOutPath = join(datasetDir, basename(sf["file"])) sf["file"] = basename(sf["file"]) # strip input directory if not isfile(rawOutPath) or getsize(rawInPath)!=getsize(rawOutPath): logging.info("Copying %s to %s" % (rawInPath, rawOutPath)) shutil.copyfile(rawInPath, rawOutPath) else: logging.info("Not copying %s again, already in output directory" % rawInPath) # copy the main image if "image" in summInfo: summInfo = copyImage(inDir, summInfo, datasetDir) # copy over the imageSet files. This can take a while doneNames = set() if "imageSets" in summInfo: imageDir = join(datasetDir, "images") makeDir(imageDir) for catInfo in summInfo["imageSets"]: for imageSet in catInfo["categoryImageSets"]: imageSetFnames = set() if "downloads" in imageSet: for dl in imageSet.get("downloads"): copyImageFile(inDir, dl, imageDir, doneNames, imageSetFnames) if "images" in imageSet: for img in imageSet.get("images"): copyImageFile(inDir, img, imageDir, doneNames, imageSetFnames) # if we have a desc.conf: with so much data now in other files, generate the md5 from the data # itself not just the desc.conf if "desc" in outConf["fileVersions"]: outConf["fileVersions"]["desc"]["md5"] = md5ForDict(summInfo)[:MD5LEN] writeJson(summInfo, outPath) # it's easier to have a single field that tells us if the desc.json is present if not "hasFiles" in outConf: outConf["hasFiles"] = ["datasetDesc"] logging.debug("Wrote dataset description to %s" % outPath) return True def makeAbs(inDir, fname): " return absolute path of fname under inDir " if fname is None: return None return abspath(join(inDir, fname)) def makeAbsDict(conf, key, fnameKey="file"): " given list of dicts with key 'file', assume they are relative to inDir and make their paths absolute " inDir = conf["inDir"] dicts = conf[key] for d in dicts: if fnameKey in d: d[fnameKey] = makeAbs(inDir, d[fnameKey]) return dicts def parseTsvColumn(fname, colName): " parse a tsv file and return a single column as a pair (values, assignment row -> index in values) " logging.info("Parsing column %s from %s" % (colName, fname)) vals = parseOneColumn(fname, colName) newVals = [] valToInt = {} maxIdx = -1 for v in vals: if v not in valToInt: maxIdx+=1 valToInt[v] = maxIdx idx = valToInt[v] newVals.append(idx) # inverse key/val dict intToVal = {} for k, v in iterItems(valToInt): intToVal[v] = k valArr = [] for i in range(0, maxIdx+1): valArr.append(intToVal[i]) return newVals, valArr def makeMids(xVals, yVals, labelVec, labelVals, coordInfo): """ calculate the positions (centers) for the cluster labels given a coord list and a vector of the same size with the label indices, return a list of [x, y, coordLabel] """ logging.debug("Making cluster labels for %s" % labelVals) assert(len(xVals)==len(labelVec)==len(yVals)) # prep the arrays clusterXVals = [] clusterYVals = [] for i in range(len(labelVals)): clusterXVals.append([]) clusterYVals.append([]) assert(len(clusterXVals)==len(labelVals)) # sort the coords into separate arrays, one per cluster for i in range(len(labelVec)): #for (x, y), clusterIdx in zip(coords, labelVec): clusterIdx = labelVec[i] x = xVals[i] y = yVals[i] if x==HIDDENCOORD and y==HIDDENCOORD: continue clusterXVals[clusterIdx].append(x) clusterYVals[clusterIdx].append(y) midInfo = [] for clustIdx, xList in 
enumerate(clusterXVals): clusterName = labelVals[clustIdx] if len(xList)==0: midInfo.append([HIDDENCOORD, HIDDENCOORD, clusterName]) continue yList = clusterYVals[clustIdx] # get the midpoint of this cluster midX = sum(xList) / float(len(xList)) midY = sum(yList) / float(len(yList)) if len(xList)<3: midInfo.append([midX, midY, clusterName]) continue # take only the best 70% of the points closest to the midpoints xyDist = [] for x, y in zip(xList, yList): dist = math.sqrt((x-midX)**2+(y-midY)**2) xyDist.append( (dist, x, y) ) xyDist.sort() xyDistBest = xyDist[:int(0.7*len(xyDist))] # now recalc the midpoint xSum = sum([x for dist, x, y in xyDistBest]) ySum = sum([y for dist, x, y in xyDistBest]) fixMidX = xSum / float(len(xyDistBest)) fixMidY = ySum / float(len(xyDistBest)) midInfo.append([fixMidX, fixMidY, clusterName]) # make some minimal effort to reduce overlaps #spanX = coordInfo['maxX'] - coordInfo['minX'] #spanY = coordInfo['maxY'] - coordInfo['minY'] #tickX = spanX / 1000 # rough guess how much one pixel could be on #tickY = spanY / 1000 # the final screen #for i, (midX1, midY1, clusterName1) in enumerate(midInfo): #print "first", i, midX1, midY1, clusterName1 #for j, (midX2, midY2, clusterName2) in enumerate(midInfo[i+1:]): #print "second", j, midX2, midY2, clusterName1, clusterName2 #distX = abs(midX2-midX1) #distY = abs(midY2-midY1) #print distX, distY ## if distance between two labels too short: #dist = math.sqrt((((midX2-midX1)/tickX)**2+((midY2-midY1)/tickY)**2)) #print "dist in pixels", dist #if dist< 30: #print "moving" #print "before", midInfo[j] ## move the first label slightly downwards #midInfo[j][1] = midY1 + 5 * tickY #print "after", midInfo[j] return midInfo def readHeaders(fname): " return headers of a file " logging.info("Reading headers from file %s" % fname) ifh = openFile(fname, "rt") line1 = ifh.readline().rstrip("\r\n") sep = sepForFile(fname) row = line1.split(sep) row = [x.rstrip('"').lstrip('"') for x in row] # Excel sometimes adds quotes logging.debug("Found %d fields, e.g. %s" % (len(row), row[:3])) if len(row)==0: errAbort("Could not read headers from file %s" % fname) return row def removeBom(line): " strip BOM from line " return line.replace('\ufeff', '') #def indexRangesByChrom(stringList): # " given a list of strings in format chr:start-end, return dict chrom -> sorted list of (start, end) " # byChrom = defaultdict(list) # for rangeStr in stringList: # chrom, pos = rangeStr.split(":") # start, end = pos.split("-") # start = int(start) # end = int(end) # byChrom[chrom].append( (start,end) ) # # # sort in-place # for chrom, chromList in byChrom.items(): # chromList.sort() # # return byChrom def findOverlappingRanges(atacByChrom, posStr): """ atacByChrom is an ATAC dict with chr -> list of (start, end, offset, len), as returned by indexAtacOffsetsByChrom() and posStr is a string chrom:start-end Returns a string like chr1|start1|end1+chr1|start2|end2 start1|end1, start2|end2, etc all are within start-end. Returns an empty string if nothing found. """ chrom, pos = posStr.split(":") start, end = pos.split("-") start = int(start) end = int(end) # this would be a lot faster with binary search but also a lot more complicated... if chrom not in atacByChrom: errAbort("The quickGenes file contains %s, but this chromosome has no features in the matrix." 
% repr(posStr)) chromRanges = atacByChrom[chrom] foundRanges = [] for rangeStart, rangeEnd, offset, dataLen in chromRanges: if start <= rangeStart and rangeEnd <= end: foundRanges.append( "|".join( [chrom, str(rangeStart), str(rangeEnd)] ) ) return "+".join(foundRanges) def parseGeneInfo(geneToSym, fname, matrixSyms, matrixGeneIds): """ parse quick genes file with three columns: symbol or geneId, desc (optional), pmid (optional). Return as a dict geneId|symbol -> description """ if fname is None: return {} logging.debug("Parsing %s" % fname) symToGene = None nonUniqueSyms = defaultdict(set) if geneToSym is not None: symToGene = dict() for gene, sym in iterItems(geneToSym): if sym in symToGene: #logging.warning("Symbol %s is not unique, geneID %s" % (sym, gene)) nonUniqueSyms[sym].add(gene) nonUniqueSyms[sym].add(symToGene[sym]) symToGene[sym] = gene atacByChrom = None # only filled lazily if we find any ATAC quick ranges sep = sepForFile(fname) geneInfo = [] for line in openFile(fname): if line.startswith("#"): continue line = removeBom(line) line = line.rstrip("\r\n") if len(line)==0: continue hasDesc = False hasPmid = False if line.startswith("symbol"): continue row = line.split(sep) geneOrSym = row[0] # case 1: user provides both geneId and symbol. Rare. # Necessary when symbol <-> geneId is not unique if "|" in geneOrSym: geneId, sym = geneOrSym.split("|") if geneId not in matrixGeneIds: - errAbort("geneId %s in quickgenes file is not in expression matrix" % repr(geneId)) + logging.info("case 1: geneId %s in quickgenes file is not in expression matrix" % repr(geneId)) + continue geneStr = geneOrSym # case 2: matrix has only symbols and user provides symbol. This is our legacy format for old datasets. # store only the symbol. We could look up the geneId but that's data inference, # which we try not to do. The lookup could be wrong. - elif matrixSyms is not None and geneOrSym in matrixSyms and len(matrixGeneIds)==0: + elif matrixSyms is not None and geneOrSym in matrixSyms: geneStr = geneOrSym + if geneStr not in matrixGeneIds: + logging.info("case 2: geneId %s in quickgenes file is not in expression matrix" % repr(geneStr)) + continue # case 3: matrix has geneIds and user provides a geneId. add the symbol from our mapping # that's data inference, but that should be OK elif geneOrSym in matrixGeneIds: geneId = geneOrSym if not geneToSym: logging.info("quick gene %s has a geneId but we have no geneId/symbol table. You can use " "the format geneId|symbol in the quick genes file to manually assign a label" % repr(geneId)) geneStr = geneId else: if geneId in geneToSym: geneStr = geneId+"|"+geneToSym[geneId] else: geneStr = geneId # case 4: matrix has geneIds and user provides geneId or symbol. Store both. elif geneToSym: # if we have a gene symbol mapping, we can resolve the symbols to geneIds # one can provide either geneIDs or symbols in the quick genes file if geneOrSym in geneToSym: geneId = geneOrSym sym = geneToSym[geneId] elif geneOrSym in symToGene: sym = geneOrSym geneId = symToGene[sym] else: errAbort("Gene %s in quickgenes file is neither a symbol nor a geneId" % repr(geneOrSym)) if sym in nonUniqueSyms: logging.warn("Symb %s in quick genes files is not unique. Which of these genes do you mean: %s. " "Use geneId|sym in quickgenes file to solve this." 
% (sym, ",".join(nonUniqueSyms[sym]))) geneStr = geneId+"|"+sym # case 5: it is an ATAC dataset and the quickgenes file has ranges elif ":" in geneOrSym and "-" in geneOrSym: #geneStr = geneOrSym.replace(":", "|").replace("-", "|") geneStr = findOverlappingRanges(matrixGeneIds, geneOrSym) if geneStr=="": logging.error("quickGene ATAC range %s does not contain any peak in the matrix, skipping it" % geneOrSym) continue else: - errAbort("Gene '%s' in quickgenes file is not in expr matrix and there is no geneId<->symbol mapping " + logging.info("Gene %s in quickgenes file is not in expr matrix and there is no geneId<->symbol mapping " "to resolve it to a geneId in the expression matrix and it is not an ATAC range" % repr(geneOrSym)) + continue # if we had no geneToSym, we'll check the symbol later if it's valid info = [geneStr] if len(row)>1: info.append(row[1]) if len(row)>2: info.append(row[2]) geneInfo.append(info) return geneInfo def readSampleNames(fname): " read only the first column of fname, strip the headers " logging.info("Reading sample names from %s" % fname) sampleNames = [] i = 1 doneNames = set() for row in lineFileNextRow(fname, noHeaders=False): metaName = row[0] if metaName=="": errAbort("invalid sample name - line %d in %s: sample name (first field) is empty" % (i, fname)) if metaName in doneNames: logging.warn("sample name duplicated - line %d in %s: sample name %s (first field) has been seen before" % (i, fname, metaName)) continue else: doneNames.add(metaName) sampleNames.append(row[0]) i+=1 logging.debug("Found %d sample names, e.g. %s" % (len(sampleNames), sampleNames[:3])) return sampleNames def guessGeneIdType(inputObj): """ Accepts a list of gene IDs or a matrix file name. returns 'gencode-human', 'gencode-mouse' or 'symbols' depending on the first gene """ if type(inputObj)==type(list()): geneIds = inputObj else: matrixFname = inputObj if matrixFname.endswith(".mtx.gz"): _mtxFname, geneFname, barcodeFname = findMtxFiles(matrixFname) geneIds, barcodes = readGenesBarcodes(geneFname, barcodeFname) else: matIter = MatrixTsvReader() matIter.open(matrixFname, usePyGzip=True) geneId, sym, exprArr = nextEl(matIter.iterRows()) matIter.close() geneIds = [geneId] # only use the first geneId for the guessing geneId = str(geneIds[0]) # when reading from h5ad, sometimes the geneId is a byte string if geneId.startswith("ENSG"): geneType = "gencode-human" elif geneId.startswith("ENSMUSG"): geneType = "gencode-mouse" elif geneId.startswith("KH2013:"): geneType = "ciona-kh2013" elif geneId.startswith("ENS"): errAbort("geneID starts with ENS but is neither mouse nor human. Auto-detection doesn't work. Please specify the geneIdType in cellbrowser.conf. 
If you do not want geneIDs to be resolved to symbols specify 'symbol' or 'raw'.") elif geneId.isdigit(): geneType = "entrez" # currently contains human and mouse symbols else: geneType = "symbols" logging.info("Auto-detected gene IDs type: %s" % (geneType)) return geneType def scaleLines(lines, limits, flipY): " scale line points to 0-1.0 or 0-65535 " logging.debug("Scaling lines, flip is %s" % flipY) minX, maxX, minY, maxY, scaleX, scaleY, useTwoBytes, _ = limits scaledLines = [] for lineRow in lines: x1, y1, x2, y2 = lineRow[:4] x1, y1 = scalePoint(scaleX, scaleY, minX, maxX, minY, maxY, flipY, useTwoBytes, x1, y1) x2, y2 = scalePoint(scaleX, scaleY, minX, maxX, minY, maxY, flipY, useTwoBytes, x2, y2) # if the line has a label, append it scaledRow = [x1, y1, x2, y2] if len(lineRow)>4: scaledRow.append(lineRow[4]) scaledLines.append( scaledRow ) return scaledLines def parseLineInfo(inFname, limits): " parse a tsv or csv file and use the first four columns as x1,y1,x2,y2 for straight lines " lines = [] endPoints = [] for row in lineFileNextRow(inFname): if len(row) not in [4,5]: errAbort("The line file %s does not have 4 or 5 columns. Format is (x1, y1, x2, y2) with an optional label as the last field" % inFname) lineRow = [ float(row.x1), float(row.y1), float(row.x2), float(row.y2) ] if len(row)>4: lineRow.append(row[4]) lines.append(lineRow) for lineRow in lines: x1, y1, x2, y2 = lineRow[:4] # we need a list of coordinates so we can determine at the end what the total min/max ranges are # and lines can go outside the min/max of the umap coords. endPoints.append( (x1, y1) ) endPoints.append( (x2, y2) ) logging.info("Read lines from %s, got %d lines" % (inFname, len(lines))) return lines, endPoints def convertExprMatrix(inConf, outMatrixFname, outConf, metaSampleNames, geneToSym, outDir, needFilterMatrix): """ trim a copy of the expression matrix for downloads, also create an indexed and compressed version """ matType = inConf.get("matrixType") if matType=="auto": matType = None if isMtx(outMatrixFname) and needFilterMatrix: #errAbort("Some cell identifiers are in the matrix but not in the meta. trimming .mtx files is not supported, only for the tsv.gz format. Consider using cbTool mtx2tsv and edit cellbrowser.conf or remove unannotated cells with Seurat or Scanpy from the expression matrix.Or manually add the missing cells to the meta.tsv. Usually 'cbTool mtx2tsv' is the easiest way forward.") logging.warn("Some cell identifiers are in the matrix but not in the meta. trimming .mtx files is not supported, " "only for the tsv.gz format. This means that the matrix.mtx.gz file that the user can download has more data " "than the meta file, but that may not be a problem for most users. You can use cbTool mtx2tsv and edit " "cellbrowser.conf or remove unannotated cells with Seurat or Scanpy from the expression matrix. Or manually " "add the missing cells to the meta.tsv. In most cases, this is fine to leave as it is.") # step1: copy expression matrix, so people can download it. If needed, # remove sample data from the matrix that are not in the meta data matrixFname = getExprMatrixFname(inConf) try: matType = copyMatrixTrim(matrixFname, outMatrixFname, metaSampleNames, needFilterMatrix, geneToSym, outConf, matType) except ValueError: logging.warn("This is rare: mis-guessed the matrix data type, trying again and using floating point numbers. 
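# ---------------------------------------------------------------------------
# Sketch for illustration, not part of cellbrowser.py: the prefix-based
# auto-detection that guessGeneIdType() above applies to the first gene
# identifier of the matrix. The real function also aborts for other ENS*
# prefixes it cannot map. guess_id_type() is a made-up name.
def guess_id_type(geneId):
    " return a coarse gene ID type based on the identifier prefix "
    if geneId.startswith("ENSG"):
        return "gencode-human"
    if geneId.startswith("ENSMUSG"):
        return "gencode-mouse"
    if geneId.startswith("KH2013:"):
        return "ciona-kh2013"
    if geneId.isdigit():
        return "entrez"
    return "symbols"

# example: guess_id_type("ENSMUSG00000051951") == "gencode-mouse"
# ---------------------------------------------------------------------------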
To avoid this message in the " \ "future, you can set matrixType='float' in cellbrowser.conf.") matType = copyMatrixTrim(matrixFname, outMatrixFname, metaSampleNames, needFilterMatrix, geneToSym, outConf, "float") # step2: compress matrix and index to file binMat = join(outDir, "exprMatrix.bin") binMatIndex = join(outDir, "exprMatrix.json") discretBinMat = join(outDir, "discretMat.bin") discretMatrixIndex = join(outDir, "discretMat.json") genesAreRanges = (inConf.get("atacSearch")!=None) matType = matrixToBin(outMatrixFname, geneToSym, binMat, binMatIndex, discretBinMat, \ discretMatrixIndex, metaSampleNames, matType=matType, genesAreRanges=genesAreRanges) # these are the Javascript type names, not the python ones (they are also better to read than the Python ones) if matType=="int" or matType=="forceInt": outConf["matrixArrType"] = "Uint32" elif matType=="float": outConf["matrixArrType"] = "Float32" else: assert(False) outConf["fileVersions"]["outMatrix"] = getFileVersion(outMatrixFname) def copyConf(inConf, outConf, keyName): " copy value of keyName from inConf dict to outConf dict " if keyName in inConf: outConf[keyName] = inConf[keyName] def getLimits(coordsList, extLimits): " get x-y-limits, either from data or from externally provided dictionary " minX = 2**32 minY = 2**32 maxX = -2**32 maxY = -2**32 for coords in coordsList: for x, y in coords: #logging.debug("x %f y %f, maxY %f" % (x, y, maxY)) # special values (12345,12345) mean "unknown cellId" if x!=HIDDENCOORD and y!=HIDDENCOORD: minX = min(x, minX) minY = min(y, minY) maxX = max(x, maxX) maxY = max(y, maxY) if "minX" in extLimits: minX = extLimits["minX"] if "maxX" in extLimits: maxX = extLimits["maxX"] if "minY" in extLimits: minY = extLimits["minY"] if "maxY" in extLimits: maxY = extLimits["maxY"] logging.debug("Coordinate limits are: minX=%s, maxX=%s, minY=%s, maxY=%s" % (str(minX), str(maxX), str(minY), str(maxY))) return (minX, maxX, minY, maxY) def justPoints(coords): " given id -> (x,y) only return the list of (x,y)" points = [] for cellId, x, y, in coords: points.append( (x,y) ) return points def convertCoords(inDir, inConf, outConf, sampleNames, outMeta, outDir): " convert the coordinates " coordFnames = makeAbsDict(inConf, "coords") coordFnames = makeAbsDict(inConf, "coords", fnameKey="lineFile") useTwoBytes = False # to save space, coordinates are reduced to the range 0-65535 hasLabels = False if "labelField" in inConf and inConf["labelField"] is not None: hasLabels = True clusterLabelField = inConf["labelField"] labelVec, labelVals = parseTsvColumn(outMeta, clusterLabelField) outConf["labelField"] = clusterLabelField outFnames = [] coordConf = [] for coordIdx, inCoordInfo in enumerate(coordFnames): coordFname = inCoordInfo["file"] coordLabel = inCoordInfo["shortLabel"] logging.info("Parsing coordinates for "+coordLabel+" from "+coordFname) # 'limits' is everything needed to transform coordinates to the final 0-1.0 or 0-65535 coord system flipY = inConf.get("flipY", False) # R has a bottom screen 0 different than most drawing libraries flipY = bool(inCoordInfo.get("flipY", flipY)) coords = parseCoordsAsDict(coordFname) if useTwoBytes is None: if len(coords)>100000: useTwoBytes = True logging.info("More than 100k cells, so automatically enabling two-byte encoding for coordinates") else: logging.debug("Not using 2byte mode") useTwoBytes = False hasLines = False allPoints = [justPoints(coords)] minX, maxX, minY, maxY = getLimits(allPoints, inCoordInfo) # now that we have the global limits, scale everything scaleX, 
scaleY, minX, maxX, minY, maxY = calcScaleFact(minX, maxX, minY, maxY, useTwoBytes) scaleYFact = inCoordInfo.get("scaleYFact") if scaleYFact: scaleY = scaleYFact logging.debug("Override scale factor on y by config: %f" % scaleY) limits = (minX, maxX, minY, maxY, scaleX, scaleY, useTwoBytes, flipY) # parse lines, updating the min-max ranges if "lineFile" in inCoordInfo: lineCoords, linePoints = parseLineInfo(inCoordInfo["lineFile"], limits) allPoints.append(linePoints) hasLines = True coordDict = scaleCoords(coords, limits) coordName = "coords_%d" % coordIdx coordDir = join(outDir, "coords", coordName) makeDir(coordDir) coordBin = join(coordDir, "coords.bin") coordInfo = OrderedDict() coordInfo["name"] = coordName coordInfo["shortLabel"] = coordLabel if "radius" in inCoordInfo: coordInfo["radius"] = inCoordInfo["radius"] if "annots" in inCoordInfo: coordInfo["annots"] = inCoordInfo["annots"] if "colorOnMeta" in inCoordInfo: coordInfo["colorOnMeta"] = inCoordInfo["colorOnMeta"] copyBackgroundImages(inDir, inCoordInfo, coordInfo, outDir) cleanName = sanitizeName(coordLabel.replace(" ", "_")) textOutBase = cleanName+".coords.tsv.gz" textOutName = join(outDir, textOutBase) outFnames.append(textOutBase) coordInfo, xVals, yVals = writeCoords(coordLabel, coordDict, sampleNames, coordBin, \ coordInfo, textOutName, limits) clusterInfo = {} if hasLines: lineFlipY = inCoordInfo.get("lineFlipY", flipY) lineData = scaleLines(lineCoords, limits, lineFlipY) clusterInfo["lines"] = lineData if hasLabels: logging.debug("Calculating cluster midpoints for "+coordLabel) clusterMids= makeMids(xVals, yVals, labelVec, labelVals, coordInfo) clusterOrder = orderClusters(clusterMids, outConf) clusterInfo["labels"] = clusterMids clusterInfo["order"] = clusterOrder else: labelVals = [] if hasLabels: clusterLabelFname = join(coordDir, "clusterLabels.json") midFh = open(clusterLabelFname, "w") json.dump(clusterInfo, midFh, indent=2) logging.debug("Wrote cluster labels, midpoints and order to %s" % clusterLabelFname) addMd5(coordInfo, clusterLabelFname, keyName="labelMd5") coordConf.append( coordInfo ) outConf["coords"] = coordConf copyConf(inConf, outConf, "labelField") copyConf(inConf, outConf, "useTwoBytes") copyConf(inConf, outConf, "useRaw") return outFnames, labelVals def readAcronyms(inConf, outConf): " read the acronyms and save them into the config " inDir = inConf["inDir"] fname = inConf.get("acroFname") if fname is None: fname = inConf.get("acronymFile") if fname is not None: fname = makeAbs(inDir, fname) if not isfile(fname): logging.warn("%s specified in config file, but does not exist, skipping" % fname) acronyms = None else: acronyms = parseDict(fname) logging.info("Read %d acronyms from %s" % (len(acronyms), fname)) return acronyms def checkClusterNames(markerFname, clusterNames, clusterLabels, doAbort): " make sure that the cluster names from the meta.tsv match the ones from markers.tsv " markerClusters = set(clusterNames) labelClusters = set(clusterLabels) notInLabels = markerClusters - labelClusters notInMarkers = labelClusters - markerClusters if len(notInMarkers)!=0: msg = ("%s: the following cluster names are in the meta file but not in the marker file: %s. "+ "Please fix one of the files, clicks onto a label will otherwise not work.") % (markerFname, notInMarkers) if doAbort: errAbort(msg) else: logging.warn(msg) if len(notInLabels)!=0: logging.warn(("%s: the following cluster names are in the marker file but not in the meta file: %s. 
"+ "Users may not notice the problem, but it may indicate an erroneous meta data file.") % \ (markerFname, notInLabels)) -def convertMarkers(inConf, outConf, geneToSym, clusterLabels, outDir): +def convertMarkers(inConf, outConf, geneToSym, clusterLabels, matrixGeneIds, outDir): """ split the marker tables into one file per cluster and add filenames as 'markers' in outConf also add the 'topMarkers' to outConf, the top five markers for every cluster. """ markerFnames = [] if "markers" in inConf: markerFnames = makeAbsDict(inConf, "markers") newMarkers = [] #doAbort = True # only the first marker file leads to abort, we're more tolerant for the others doAbort = False # temp hack # because of single cell cluster filtering in cbScanpy topMarkersDone = False for markerIdx, markerInfo in enumerate(markerFnames): if type(markerInfo)!=type(dict()): errAbort("The 'markers' setting in cellbrowser.conf is not a dictionary but must be a dictionary with keys 'file' and 'label'.") markerFname = markerInfo["file"] markerLabel = markerInfo["shortLabel"] clusterName = "markers_%d" % markerIdx # use sha1 of input file ? markerDir = join(outDir, "markers", clusterName) makeDir(markerDir) - clusterNames, topMarkers = splitMarkerTable(markerFname, geneToSym, markerDir) + clusterNames, topMarkers = splitMarkerTable(markerFname, geneToSym, matrixGeneIds, markerDir) # only use the top markers of the first marker file if not topMarkersDone: outConf["topMarkers"] = topMarkers topMarkersDone = True checkClusterNames(markerFname, clusterNames, clusterLabels, doAbort) doAbort = False newDict = {"name" : sanitizeName(clusterName), "shortLabel" : markerLabel} if "selectOnClick" in markerInfo: newDict["selectOnClick"] = markerInfo["selectOnClick"] newMarkers.append( newDict ) outConf["markers"] = newMarkers def areProbablyGeneIds(ids): " if 80% of 'ids' start with the same letter, they are probably gene IDs, not symbols " counts = Counter() numCount = 0 for s in ids: counts[s[0]]+=1 if s.isnumeric(): numCount += 1 cutoff = 0.8* len(ids) if counts.most_common()[0][1] >= cutoff or numCount >= cutoff: logging.info("GeneIds in matrix are identifiers, not symbols") return True else: logging.info("GeneIds in matrix are symbols, not identifiers") return False def readValidGenes(outDir, inConf): """ the output directory contains an exprMatrix.json file that contains all valid gene symbols. We're reading them here, so we do not have to read the entire .tsv.gz file. 
""" matrixJsonFname = join(outDir, "exprMatrix.json") validGenes = readJson(matrixJsonFname) if "atacSearch" in inConf: return None, validGenes, None # in ATAC mode, no need to do anything else validGenes = list(validGenes) syms = [] geneIds = [] geneToSym = {} hasBoth = False symsOrGeneIds = [] for g in validGenes: if "|" in g: parts = g.split("|") geneId = parts[0] sym = parts[1] syms.append( sym ) geneIds.append(geneId) hasBoth = True geneToSym[geneId] = sym else: symsOrGeneIds.append( g ) if hasBoth: logging.debug("Matrix has both geneIds and symbols") geneIds.extend(symsOrGeneIds) else: - #logging.debug("Matrix does not have both geneIds and symbols, it contains either geneIds only or symbols only") - #if areProbablyGeneIds(syms): - #logging("80% look like geneIds: Using only the identifiers from the matrix") - #geneIds = symsOrGeneIds - #syms = geneToSym.values() - #else: - #logging("matrix identifiers do not look like geneIds: assume they are all symbols") logging.debug("Matrix does not have both geneIds and symbols, assuming it contains only geneIds") geneIds = symsOrGeneIds syms = [] if len(geneToSym)==0: logging.info("There are no gene/symbol pairs in the matrix") geneToSym = None symSet = set(syms) #symCounts = Counter(syms).most_common() #for sym, count in symCounts: #if count > 1: #logging.warn("Symbol %d is not unique" % count) return symSet, set(geneIds), geneToSym -def readQuickGenes(inConf, geneToSym, outDir, outConf): +def readQuickGenes(inConf, geneToSym, matrixSyms, matrixGeneIds, geneToSymFromMatrix, outDir, outConf): " read quick genes file and make sure that the genes in it are in the matrix " quickGeneFname = inConf.get("quickGenesFile") if not quickGeneFname: return - matrixSyms, matrixGeneIds, geneToSymFromMatrix = readValidGenes(outDir, inConf) - # prefer the symbols from the matrix over our own symbol tables if geneToSymFromMatrix is not None: logging.info("Matrix has gene symbols, these are used for quick gene file parsing") geneToSym = geneToSymFromMatrix fname = getAbsPath(inConf, "quickGenesFile") quickGenes = parseGeneInfo(geneToSym, fname, matrixSyms, matrixGeneIds) outConf["quickGenes"] = quickGenes logging.info("Read %d quick genes from %s" % (len(quickGenes), fname)) def getFileVersion(fname): data = {} data["fname"] = fname addMd5(data, fname, shortMd5=False) data["size"] = getsize(fname) data["mtime"] = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(getmtime(fname))) return data def checkFieldNames(outConf, fieldNames, validFieldNames, metaConf, labelField): " make sure that all fieldNames in outConf are valid field names. errAbort is not. " for fn in fieldNames: if fn not in outConf: continue if not outConf[fn] in validFieldNames: errAbort("Config statement '%s' contains an invalid field name, '%s'. Valid meta field names are: %s" % \ (fn, outConf[fn], ", ".join(validFieldNames))) #if labelField!=None: #for fieldConf in metaConf: #if fieldConf["name"]==labelField: #if fieldConf["type"]!=="enum": #errAbort("labelField column '%s' is not an enum field def convertMeta(inDir, inConf, outConf, outDir, finalMetaFname): """ convert the meta data to binary files. The new meta is re-ordered, so it's in the same order as the samples in the expression matrix. 
returns: sampleNames, whether sample names have changed and file name of meta info """ if not "fileVersions" in outConf: outConf["fileVersions"] = {} metaFname = getAbsPath(inConf, "meta") outConf["fileVersions"]["inMeta"] = getFileVersion(metaFname) metaDir = join(outDir, "metaFields") makeDir(metaDir) metaIdxFname = join(outDir, "meta.index") matrixFname = getExprMatrixFname(inConf) keepFields = [] for fieldName in ["labelField", "defColorField"]: if fieldName in inConf: keepFields.append( inConf[fieldName] ) sampleNames, needFilterMatrix = metaReorderFilter(matrixFname, metaFname, finalMetaFname, keepFields) outConf["sampleCount"] = len(sampleNames) outConf["matrixWasFiltered"] = needFilterMatrix fieldConf, validFieldNames = metaToBin(inDir, inConf, outConf, finalMetaFname, metaDir) outConf["metaFields"] = fieldConf labelField = outConf.get("labelField") checkFieldNames(outConf, ["violinField", "clusterField", "defColorField", "labelField"], validFieldNames, \ fieldConf, labelField) useLongLongMeta = indexMeta(finalMetaFname, metaIdxFname) if useLongLongMeta: outConf["metaNeedsEightBytes"] = True logging.info("Kept %d cells present in both meta data file and expression matrix" % len(sampleNames)) outConf["fileVersions"]["outMeta"] = getFileVersion(finalMetaFname) return sampleNames, needFilterMatrix def readGeneSymbols(geneIdType, matrixFnameOrGeneIds=None): " return geneToSym, based on gene tables. Returns None if no translation is necessary. " logging.debug("Reading gene symbols, geneIdType %s, example list of symbols: %s" % (geneIdType, matrixFnameOrGeneIds)) if geneIdType==None or geneIdType=="auto": geneIdType = guessGeneIdType(matrixFnameOrGeneIds) if geneIdType.startswith('symbol') or geneIdType=="raw": return None geneIdTable = getStaticFile(getGeneSymPath(geneIdType)) if geneIdTable is None: errAbort("The geneIdType setting '%s' is not a table provided by UCSC." "Use 'cbGenes avail' to list all available gene models or create new ones." % geneIdTable) geneToSym = readGeneToSym(geneIdTable) if geneIdType=="entrez" and len(geneToSym)<100000: logging.warn("You are using entrez gene IDs. Note that the default mapping table only works for human. "\ "To get the big mapping table for all NCBI organisms, please run this command: "\ "wget https://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz -O - | zcat | cut -f2,3 | gzip -c > %s" % \ geneIdTable) return geneToSym def getSymToGene(geneIdType): " return a dict with symbol -> geneId from our internal system " geneToSym = readGeneSymbols(geneIdType, None) d = {} for key, val in iterItems(geneToSym): if val in d and d[val]!=key: logging.debug("%s symbol is assigned to two gene IDs: %s and %s" % (val, d[val], key)) d[val] = key return d def readMitos(idType): ' return the gene IDs of all mitochondrial genes. 37 for human for all gencode versions ' logging.debug("Creating mito list for gene ID type %s" % idType) if idType.startswith("symbol"): human = readGeneSymbols("gencode-human", None) mouse = readGeneSymbols("gencode-human", None) mitos = [] for sym in human.values(): if sym.startswith("MT-"): mitos.append(sym) for sym in mouse.values(): if sym.startswith("mt-"): mitos.append(sym) logging.info("Built list of human and mouse mitochondrial symbols, %d symbols found" % len(mitos)) if len(mitos)>100: errAbort("Too many mitos found. 
%s" % mitos) return mitos geneToSym = readGeneSymbols(idType, None) mitos = [] allGeneIds = [] for geneId, sym in iterItems(geneToSym): allGeneIds.append(geneId) if sym.lower().startswith("mt-"): mitos.append(geneId) if len(mitos)==0: errAbort("Could not find any mitochondrial genes in cell browser gene lists for gene ID type %s. Example gene IDs from input file: %s" % (idType, ",".join(allGeneIds[:10]))) logging.debug("Found %d mitochondrial genes for %s, e.g. %s" % (len(mitos), idType, mitos[0])) return mitos def getAbsPath(conf, key): " get assume that value of key in conf is a filename and use the inDir value to make it absolute " return abspath(join(conf["inDir"], conf[key])) def popen(cmd, shell=False, useStderr=False, doWait=True): " run command and return proc object with its stdout attribute " if useStderr: if isPy3: proc = subprocess.Popen(cmd, stderr=subprocess.PIPE, encoding="utf8", shell=shell) else: proc = subprocess.Popen(cmd, stderr=subprocess.PIPE, shell=shell) else: if isPy3: proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, encoding="utf8", shell=shell) else: proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=shell) if doWait: proc.wait() if doWait and proc.returncode!=0: logging.debug("Cmd %s return non-zero exit code %s" % (cmd, str(proc.returncode))) return None, None else: if useStderr: return proc, proc.stderr else: return proc, proc.stdout def getMd5Using(md5Cmd, fname): " posix command line tool is much faster than python " logging.debug("Getting md5 of %s using %s command line tool" % (fname, md5Cmd)) cmd = [md5Cmd, fname] logging.debug("Cmd: %s" % cmd) proc, stdout = popen(cmd) if stdout is None: errAbort("File %s does not exist (command failed: %s)" % (fname, cmd)) md5 = stdout.readline() #proc.stdout.close() #stat = os.waitpid(proc.pid, 0) #err = stat[1] #assert(err==0) return md5 def md5ForList(l): " given a list of strings, return their md5 " hash_md5 = hashlib.md5() for s in l: hash_md5.update(s.encode("utf8")) md5 = hash_md5.hexdigest() return md5 def md5ForDict(summInfo): " given a dict of various objects, return the md5 " md5 = md5ForList([json.dumps(summInfo)]) return md5 def md5WithPython(fname): " get md5 using python lib. OK for small files, very slow for big files. " hash_md5 = hashlib.md5() with open(fname, "rb") as f: for chunk in iter(lambda: f.read(65536), b""): hash_md5.update(chunk) md5 = hash_md5.hexdigest() return md5 def md5ForFile(fname, isSmall=False): " return the md5sum of a file. Use a command line tool, if possible. 
" logging.debug("Getting md5 of %s" % fname) if which("md5sum")!=None and not isSmall: md5 = getMd5Using("md5sum", fname).split()[0] elif which("md5")!=None and not isSmall: md5 = getMd5Using("md5", fname).split()[-1] else: md5 = md5WithPython(fname) return md5 def readOldSampleNames(datasetDir, lastConf): """ reads the old cell identifiers from the dataset directory """ # this obscure command gets file with the the cell identifiers in the dataset directory " if len(lastConf["metaFields"])==0: sampleNameFname = "does_not_exist" # this almost never happens, only if the output dataset is broken else: sampleNameFname = join(datasetDir, "metaFields", lastConf["metaFields"][0]["name"]+".bin.gz") logging.debug("Reading meta sample names from %s" % sampleNameFname) # python3's gzip has 'text mode' but python2 doesn't have that so decode explicitly metaSampleNames = [] if isfile(sampleNameFname): for line in openFile(sampleNameFname): metaSampleNames.append(line.rstrip("\n")) else: oldMetaFname = join(datasetDir, "meta.tsv") headDone = False for line in open(oldMetaFname, "r"): if not headDone: headDone = True continue metaSampleNames.append(splitOnce(line, "\t")[0]) return metaSampleNames def matrixOrSamplesHaveChanged(datasetDir, inMatrixFname, outMatrixFname, outConf): """ compare filesize stored in datasetDir/cellbrowser.json.bak with file size of inMatrixFname and also compare the sample names with the sample names in outMatrixFname """ logging.info("Determining if %s needs to be created" % outMatrixFname) if not isfile(outMatrixFname): logging.info("%s does not exist. Must build matrix now." % outMatrixFname) return True, None confName = join(datasetDir, "dataset.json") if not isfile(confName): logging.debug("%s does not exist. This looks like the first run with this output directory" % confName) return True, None try: lastConf = readJson(confName) except json.decoder.JSONDecodeError: errAbort("Is the file %s broken? Please remove the file and run this command again." % confName) if not "fileVersions" in lastConf or not "inMatrix" in lastConf["fileVersions"] \ or not "outMatrix" in lastConf["fileVersions"]: logging.warn("Internal error? Missing 'fileVersions' tag in %s" % confName) return True, None oldMatrixInfo = lastConf["fileVersions"]["inMatrix"] origSize = oldMatrixInfo ["size"] nowSize = getsize(inMatrixFname) if not "fileVersions" in outConf: outConf["fileVersions"] = {} # mtx has three files, so have to add their sizes to the total now if isMtx(inMatrixFname) and "barcodes" in lastConf["fileVersions"]: oldBarInfo = lastConf["fileVersions"]["barcodes"] oldFeatInfo = lastConf["fileVersions"]["features"] origSize += oldBarInfo["size"] + oldFeatInfo["size"] # very old files do not have a fileVersions object if not "fileVersions" in outConf: outConf["fileVersions"] = {} outConf["fileVersions"]["barcodes"] = oldBarInfo outConf["fileVersions"]["features"] = oldFeatInfo mtxFname, geneFname, barFname = findMtxFiles(inMatrixFname) nowSize += getsize(geneFname) + getsize(barFname) matrixIsSame = (origSize==nowSize) if not matrixIsSame: logging.info("input matrix has input file size that is different from previously " "processed matrix. Expression matrix must be reindexed. 
Old file(s): %s, current file: %d" % (oldMatrixInfo, nowSize)) return True, None outConf["fileVersions"]["inMatrix"] = oldMatrixInfo outConf["fileVersions"]["outMatrix"] = lastConf["fileVersions"]["outMatrix"] outConf["matrixArrType"] = lastConf["matrixArrType"] metaSampleNames = readOldSampleNames(datasetDir, lastConf) if metaSampleNames is None: return False, None # this can only happen if the old dataset directory is broken somehow matrixSampleNames = readMatrixSampleNames(outMatrixFname) assert(len(matrixSampleNames)!=0) if set(metaSampleNames)!=set(matrixSampleNames): logging.info("meta sample samples from previous run are different from sample names in current matrix, have to reindex the matrix. Counts: %d vs. %d" % (len(metaSampleNames), len(matrixSampleNames))) return True, None logging.info("current input matrix looks identical to previously processed matrix, same file size, same sample names") return False, metaSampleNames def readJson(fname, keepOrder=False): " read .json and return as a dict " logging.debug("parsing json file %s" % fname) if not isfile(fname): logging.debug("%s does not exist, returning empty dict" % fname) return OrderedDict() if keepOrder: customdecoder = json.JSONDecoder(object_pairs_hook=OrderedDict) inStr = readFile(fname) if not isPy3: inStr = inStr.encode("utf8") data = customdecoder.decode(inStr) else: data = json.load(open(fname)) return data def orderClusters(labelCoords, outConf): " given the cluster label coordinates, order them by similarity " # create dict with label1 -> list of (dist, label2) dists = defaultdict(list) for i, (x1, y1, label1) in enumerate(labelCoords): for x2, y2, label2 in labelCoords[i:]: dist = math.sqrt((x2-x1)**2+(y2-y1)**2) dists[label1].append((dist, label2)) dists[label2].append((dist, label1)) labelOrder = [] doneLabels = set() currLabel = labelCoords[0][2] for i in range(0, len(labelCoords)): otherLabels = dists[currLabel] otherLabels.sort() for dist, label in otherLabels: if not label in doneLabels: currLabel = label labelOrder.append(currLabel) doneLabels.add(currLabel) break return labelOrder def metaHasChanged(datasetDir, metaOutFname): """ return true if md5 of metaOutFname is different from the one in datasetDir/dataset.json:fileVersions -> outMeta -> md5""" oldJsonFname = join(datasetDir, "dataset.json") if not isfile(oldJsonFname): logging.debug("%s not found, assuming that meta data has to be recreated" % oldJsonFname) return True oldData = readJson(oldJsonFname) if not "outMeta" in oldData["fileVersions"]: logging.debug("Old config file has no meta data MD5, re-converting meta data") return True elif not isfile(metaOutFname): logging.info("file %s does not exist, reconverting it" % metaOutFname) return True else: oldMd5 = oldData["fileVersions"]["outMeta"]["md5"] newMd5 = md5ForFile(metaOutFname) if oldMd5 != newMd5[:len(oldMd5)]: logging.debug("%s has a different md5 now, rebuilding meta data" % metaOutFname) return True logging.info("%s has the same md5 as in %s, no need to rebuild meta data" % (metaOutFname, oldJsonFname)) return False def checkConfig(inConf, isTopLevel): # at UCSC, we enforce some required dataset tags global reqTagsDataset reqTags = getConfig("reqTags") if reqTags: reqTagsDataset.extend(reqTags) if isTopLevel: if not "facets" in inConf: # tolerate if there are no facets in the dataset at all - third party installs may use collections but not facets. logging.warn("No 'facets' declared in cellbrowser.conf, but is a top-level dataset and using dataset hierarchies. 
If this is an old dataset, "\ "consider moving facets into their own object.") else: facets = inConf["facets"] reqFacets = ["body_parts", "diseases", "domains", "life_stages", "organisms", "assays", "sources"] missFacets = [] for rf in reqFacets: if rf not in facets: missFacets.append(rf) if len(missFacets)!=0: errAbort("The following facets were not defined: %s" % ",".join(missFacets)) for tag in reqTagsDataset: if tag not in inConf: errAbort("As required by ~/.cellbrowser: tag '%s' must be defined in cellbrowser.conf" % tag) if tag=="visibility": if tag in inConf and inConf[tag] not in ["hide", "show"]: errAbort("Error in cellbrowser.conf: '%s' can only have values: 'hide' or 'show'" % (tag)) if "exprMatrix" not in inConf and "matrices" not in inConf: errAbort("The tag 'matrices' must be defined in cellbrowser.conf, with multiple matrices, or alternatively " "the tag 'exprMatrix', with a single expression matrix file") if "name" in inConf and (" " in inConf["name"] or "/" in inConf["name"]): errAbort("whitespace or slashes in the dataset 'name' in cellbrowser.conf are not allowed") def copyGenes(inConf, outConf, outDir): " handle the ATAC closest-gene search - copy a json.gz file to datasetDir/genes " if not "atacSearch" in inConf: return geneDir = join(outDir, "genes") makeDir(geneDir) dbGene = inConf["atacSearch"] db, geneIdType = dbGene.split(".") inFname = getStaticFile(getGeneJsonPath(db, geneIdType)) if inFname is None: errAbort("The gene model file %s could not be found on the UCSC cell browser downloads server. " "Unless your internet is down, this means that at UCSC we do not have a prebuilt " "gene model file for this genome assembly " "with this name. Check if this name is indeed in the format <db>.<geneIdType>, " "then you can either contact us or use cbGenes to build the file yourself.
" % dbGene) syncFiles([inFname], geneDir) #outFname = join(geneDir, getGeneJsonPath(db, geneIdType).replace(".gz", "")) #open(outFname, "wb").write(gzip.open(inFname).read()) outConf["fileVersions"]["geneLocs"] = getFileVersion(inFname) outConf["atacSearch"] = inConf["atacSearch"] def makeFilesTxt(outConf, datasetDir): " create a file files.txt with all important download files in the current dataset, for curl " outFname = join(datasetDir, "files.txt") # 'outMatrix': {'fname': '/usr/local/apache/htdocs-cells/mg-models/organoids/exprMatrix.tsv.gz' metaName = basename(outConf["fileVersions"]["outMeta"]["fname"]) matrixName = basename(outConf["fileVersions"]["outMatrix"]["fname"]) with open(outFname, "wt") as ofh: ofh.write(matrixName+"\n") ofh.write(metaName+"\n") ofh.write("dataset.json\n") ofh.write("desc.json\n") for coords in outConf["coords"]: ofh.write(coords["textFname"]+"\n") def copyFacets(inConf, outConf): """ for backwards compatibility with old Javascript UI, we copy the inConf["facets"] directly into outConf """ if "facets" in inConf: logging.debug("Copying facets into object") for key, val in inConf["facets"].items(): assert(type(val)==type([])) # facets must be lists outConf[key] = val outConf["facets"] = inConf["facets"] return outConf def convertTraces(inConf, sampleNames, datasetDir, outConf): " read ephys traces in format 'cellIdcomma-sep list of values', one per line and write to outDir " if not "traceFile" in inConf: return fname = inConf["traceFile"] traces = {} for rowTuple in lineFileNextRow(fname): cellId = rowTuple[0] arrStr = rowTuple[1] strArr = arrStr.split(",") floatArr = [] for x in strArr: x = float(x) if x > 1.0: x = round(x, 3) floatArr.append(x) if all(i != i for i in floatArr): continue # do not store if all elements are Nan traces[cellId] = floatArr traceOutFn = join(datasetDir, "traces.json") with open(traceOutFn, "w") as ofh: json.dump(traces, ofh) logging.info("Wrote %s" % traceOutFn) outConf["fileVersions"]["traces"] = getFileVersion(traceOutFn) def getExprMatrixFname(inConf): if "exprMatrix" in inConf: inMatrixFname = getAbsPath(inConf, "exprMatrix") else: inMatrixFname = abspath(join(inConf["inDir"], inConf["matrices"][0]["fileName"])) return inMatrixFname def convertDataset(inDir, inConf, outConf, datasetDir, redo, isTopLevel): """ convert everything needed for a dataset to datasetDir, write config to outConf. If the expression matrix has not changed since the last run, and the sampleNames are the same, the matrix won't be converted again, which saves a lot of time. 
""" checkConfig(inConf, isTopLevel) inMatrixFname = getExprMatrixFname(inConf) # outMetaFname/outMatrixFname are reordered & trimmed tsv versions of the matrix/meta data if isMtx(inMatrixFname): baseName = basename(inMatrixFname) # our cbImportSeurat script uses the format _matrix.mtx.gz, keep this format internally if baseName.endswith("_matrix.mtx.gz") and len(baseName.split("_"))==2: baseName = basename(findMtxFiles(inMatrixFname)[0]) outMatrixFname = join(datasetDir, baseName) else: outMatrixFname = join(datasetDir, "matrix.mtx.gz") else: outMatrixFname = join(datasetDir, "exprMatrix.tsv.gz") outMetaFname = join(datasetDir, "meta.tsv") # try not to recreate files that have been created before, as it is all quite slow (=Python) doMatrix, sampleNames = matrixOrSamplesHaveChanged(datasetDir, inMatrixFname, outMatrixFname, outConf) doMeta = metaHasChanged(datasetDir, outMetaFname) geneToSym = -1 # -1 = "we have not read any", "None" would mean "there are no gene symbols to map to" if not doMeta: sampleNames = readSampleNames(outMetaFname) needFilterMatrix = True oldConfFname = join(datasetDir, "dataset.json") logging.info("Loading old config from %s" % oldConfFname) oldConf = readJson(oldConfFname, keepOrder=True) if doMeta or doMatrix or redo in ["meta", "matrix"]: # convertMeta also compares the sample IDs between meta and matrix to determine if the meta file # needs reordering or trimming (=if the meta contains more cells than the matrix) sampleNames, needFilterMatrix = convertMeta(inDir, inConf, outConf, datasetDir, outMetaFname) else: sampleNames = readSampleNames(outMetaFname) needFilterMatrix = False outConf.update(oldConf) if doMatrix or redo=='matrix': geneToSym = readGeneSymbols(inConf.get("geneIdType"), inMatrixFname) convertExprMatrix(inConf, outMatrixFname, outConf, sampleNames, geneToSym, datasetDir, needFilterMatrix) # in case we crash after this, keep the current state of the config, as matrix ops are so slow writeConfig(inConf, outConf, datasetDir) else: logging.info("Matrix and meta sample names have not changed, not indexing matrix again") convertTraces(inConf, sampleNames, datasetDir, outConf) coordFiles, clusterLabels = convertCoords(inDir, inConf, outConf, sampleNames, outMetaFname, datasetDir) foundConf = writeDatasetDesc(inConf["inDir"], outConf, datasetDir, coordFiles, outMatrixFname) if geneToSym==-1: geneToSym = readGeneSymbols(inConf.get("geneIdType"), inMatrixFname) - convertMarkers(inConf, outConf, geneToSym, clusterLabels, datasetDir) + matrixSyms, matrixGeneIds, geneToSymFromMatrix = readValidGenes(datasetDir, inConf) + + convertMarkers(inConf, outConf, geneToSym, clusterLabels, matrixGeneIds, datasetDir) - readQuickGenes(inConf, geneToSym, datasetDir, outConf) + readQuickGenes(inConf, geneToSym, matrixSyms, matrixGeneIds, geneToSymFromMatrix, datasetDir, outConf) copyBackgroundImages(inDir, inConf, outConf, datasetDir) # a few settings are passed through to the Javascript as they are for tag in ["shortLabel", "radius", "alpha", "priority", "tags", "sampleDesc", "geneLabel", "clusterField", "defColorField", "xenaPhenoId", "xenaId", "hubUrl", "showLabels", "ucscDb", "unit", "violinField", "visibility", "coordLabel", "lineWidth", "hideDataset", "hideDownload", "metaBarWidth", "supplFiles", "defQuantPal", "defCatPal", "clusterPngDir", "wrangler", "shepherd", "binStrategy", "split", "lineAlpha", "lineWidth", "lineColor", # the following are there only for old datasets, they are now nested under "facets" # they are just here for backwards-compatibility and will 
eventually get removed "body_parts", "organisms", "diseases", "projects", "life_stages", "domains", "sources", "assays", # facets are taking their place now "facets", "multiModal"]: copyConf(inConf, outConf, tag) if "name" not in outConf: copyConf(inConf, outConf, "name") # for the news generator and for future logging, note when this dataset was built for the first time into this directory # and also when it was built most recently if "firstBuildTime" in oldConf: outConf["firstBuildTime"] = oldConf["firstBuildTime"] else: outConf["firstBuildTime"] = datetime.datetime.now().isoformat() outConf["lastBuildTime"] = datetime.datetime.now().isoformat() makeFilesTxt(outConf, datasetDir) def filterFields(anndata, coordFields): " remove all fields with more than 5 columns from anndata.obsm " filtCoordFields = [] droppedKeys = [] for fieldName in coordFields: colCount = anndata.obsm[fieldName].shape[1] if colCount < 5: filtCoordFields.append(fieldName) else: droppedKeys.append(fieldName) logging.info("These obsm entries were dropped, too many columns: %s" % droppedKeys) coordFields = filtCoordFields return coordFields def writeAnndataCoords(anndata, coordFields, outDir, desc): " write all embedding coordinates from anndata object to outDir, the new filename is _coords.tsv " import pandas as pd if "spatial" in anndata.uns and "is_single" in anndata.uns["spatial"]: logging.warn("Data fix: Found 'is_single' in ad.uns, removing this field") del anndata.uns["spatial"]["is_single"] if len(anndata.uns["spatial"])==0: logging.warn("Data fix: No spatial data left, removing ad.uns['spatial']") del anndata.uns["spatial"] if coordFields=="all" or coordFields is None: coordFields = getObsmKeys(anndata) # spatial datasets have a ton of obsm attributes that are usually not coordinates coordFields = filterFields(anndata, coordFields) # move over a field to obs: # ad.obs = pd.concat([ad.obs, ad.obsm["ctype_props"]], axis=1) coordFields.sort(reverse=True) # umap first, then t-sne, then PCA... sorting works to make a good order! for fieldName in coordFields: # examples: # X_draw_graph_tsne - old versions # X_tsne - newer versions # also seen in the wild: X_Compartment_tSNE if fieldName=="spatial" and "spatial" in anndata.uns: logging.info("Not exporting spatial coords now, because there are also spatial images. Will export these coords later.") continue coordName = fieldName.replace("X_draw_graph_","").replace("X_","") fullName = coordLabels.get(coordName, coordName) fileBase = coordName+"_coords.tsv" fname = join(outDir, fileBase) logging.info("Writing %s coords to %s" % (fullName, fname)) coordDf=pd.DataFrame(anndata.obsm[fieldName],index=anndata.obs.index) # they usually only have (x,y), but some objects like PCA have more than 2 dimensions if len(coordDf.columns)==2: coordDf.columns=['x','y'] coordDf.to_csv(fname,sep='\t') desc.append( {'file':fileBase, 'shortLabel': fullName} ) return coordFields def writeCellbrowserConf(name, coordsList, fname, addMarkers=True, args={}): checkDsName(name) metaFname = args.get("meta", "meta.tsv") clusterField = args.get("clusterField", "Louvain Cluster") coordStr = json.dumps(coordsList, indent=4) matrixFname = args.get("exprMatrix", "exprMatrix.tsv.gz") conf = """ # This is a bare-bones, auto-generated Cell Browser config file. 
# Look at https://github.com/maximilianh/cellBrowser/blob/master/src/cbPyLib/cellbrowser/sampleConfig/cellbrowser.conf # for a list of possible options # You can also write a default template into the current directory with cbBuild --init name='%(name)s' shortLabel='%(name)s' exprMatrix='%(matrixFname)s' #tags = ["10x", 'smartseq2'] meta='%(metaFname)s' geneIdType='auto' defColorField='%(clusterField)s' labelField='%(clusterField)s' enumFields=['%(clusterField)s'] coords=%(coordStr)s #alpha=0.3 #body_parts=["embryo", "heart", "brain"] #radius=2 """ % locals() if addMarkers: conf += 'markers = [{"file": "markers.tsv", "shortLabel":"Cluster Markers"}]\n' if "quickGenesFile" in args: conf += 'quickGenesFile="%s"\n' % args["quickGenesFile"] else: conf += "#quickGenesFile='quickGenes.tsv'\n" if "geneToSym" in args: conf += "geneToSym='%s'\n" % args["geneToSym"] if "spatial" in args: conf += "spatial=%s\n" % json.dumps(args["spatial"], indent=4) if "spatialMeta" in args: conf += "spatialMeta=%s\n" % json.dumps(args["spatialMeta"], indent=4) if "colors" in args: conf += "colors=%s\n" % json.dumps(args["colors"], indent=4) if isfile(fname): logging.info("Not overwriting %s, file already exists." % fname) return ofh = open(fname, "w") ofh.write(conf) ofh.close() logging.info("Wrote %s" % ofh.name) def geneSeriesToStrings(var, geneKey, indexFirst=False, sep="|", justValues=False): " convert a pandas data series to a list of |-separated strings " geneIdSeries = var[geneKey] logging.info("To get the gene names, using the field %s of the .var attribute" % geneKey) if justValues: genes = geneIdSeries.values genes = [str(x) for x in genes] else: if indexFirst: geneIdAndSyms = list(zip(geneIdSeries.index, geneIdSeries.values)) else: geneIdAndSyms = list(zip(geneIdSeries.values, geneIdSeries.index)) genes = [str(x)+sep+str(y) for (x,y) in geneIdAndSyms] return genes def geneStringsFromVar(ad, sep="|"): " return a list of strings in format geneId|geneSymbol " var = ad.var if "gene_ids" in var: logging.debug("Found 'gene_ids' attribute in var") genes = geneSeriesToStrings(var, "gene_ids", indexFirst=False, sep=sep) elif "gene_symbols" in var: logging.debug("Found 'gene_symbols' attribute in var") genes = geneSeriesToStrings(var, "gene_symbols", indexFirst=True, sep=sep) elif "Accession" in var: # only seen this in the ABA Loom files logging.debug("Found 'Accession' attribute in var") genes = geneSeriesToStrings(var, "Accession", indexFirst=False, sep=sep) elif "features" in var: # the files in retina-atac had only a 'features' key logging.debug("Found 'features' attribute in var, using just those") genes = var["features"].tolist() elif "_index" in var: # mouse-striatal-dev, but only when opened in cbImportScanpy, has the "features" transformed to "_index" logging.debug("Found '_index' attribute in var") genes = geneSeriesToStrings(var, "_index", justValues=True, sep=sep) else: logging.debug("Using index of var") genes = var.index.tolist() return genes def anndataMatrixToMtx(ad, path): """ write the ad expression matrix into a sparse mtx.gz file (matrix-market format) to test: cbImportScanpy -i /home/michi/ms_python_packages/cellBrowser/sampleData/pbmc_small/anndata.h5ad -o /tmp/cbtest cbBuild -i /tmp/cbtest/cellbrowser.conf -o /tmp/cb_html/ """ import scipy.io, scipy.sparse mat = ad.X var = ad.var rowCount, colCount = mat.shape logging.info("Writing scanpy matrix (%d cells, %d genes) to %s in mtx.gz format" % (rowCount, colCount, path)) logging.info("Transposing matrix") # necessary, as scanpy has the samples on the rows
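# AnnData matrices are cells x genes, but the MatrixMarket file written below is genes x cells
# (one row per line of features.tsv, one column per line of barcodes.tsv), hence the transpose.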
mat = mat.T mtxfile = join(path, 'matrix.mtx') """ this is stupid: if mat is dense, mmwrite screws up the header: the header is supposed to contain a line with `rows cols nonzero_elements`. If mat is dense, it skips the nonzero_elements, which leads to an error reading the matrix later on. actually it stores it as "array" rather than matrix %%MatrixMarket matrix coordinate real general """ dataType = "float" if ad.X.dtype.kind=="i": dataType = "integer" if not scipy.sparse.issparse(mat): logging.debug("Convert matrix to sparse") mat = scipy.sparse.csr_matrix(mat) geneLines = geneStringsFromVar(ad, sep="\t") genes_file = join(path, 'features.tsv.gz') logging.info("Writing %s" % genes_file) with gzip.open(genes_file, 'wt') as f: f.write("\n".join(geneLines)) bc_file = join(path, 'barcodes.tsv.gz') sampleNames = [str(x) for x in ad.obs.index.tolist()] logging.info("Writing %s" % bc_file) with gzip.open(bc_file, 'wt') as f: f.write("\n".join(sampleNames)) logging.info("Writing matrix to %s, type=%s" % (mtxfile, dataType)) # scanpy has the samples on the rows scipy.io.mmwrite(mtxfile, mat, precision=7) logging.info("Compressing matrix to %s.gz" % mtxfile) # necessary, as scanpy has the samples on the rows # runGzip(mtxfile, mtxfile) # this is giving me trouble with the same filename with open(mtxfile,'rb') as mtx_in: with gzip.open(mtxfile + '.gz','wb') as mtx_gz: shutil.copyfileobj(mtx_in, mtx_gz) os.remove(mtxfile) def anndataMatrixToTsv(ad, matFname, usePandas=False): " write ad expression matrix to .tsv file and gzip it " import pandas as pd import scipy.sparse mat = ad.X var = ad.var rowCount, colCount = mat.shape logging.info("Writing scanpy matrix (%d cells, %d genes) to %s" % (rowCount, colCount, matFname)) tmpFname = matFname+".tmp" logging.info("Transposing matrix") # necessary, as scanpy has the samples on the rows mat = mat.T if scipy.sparse.issparse(mat): logging.info("Converting csc matrix to row-sparse matrix") mat = mat.tocsr() # makes writing to a file ten times faster, thanks Alex Wolf! if usePandas: # This code is currently not used anywhere by this code. It's still there # so external code that calls this function can use it, if the code below does not work. logging.info("Converting anndata to pandas dataframe") data_matrix=pd.DataFrame(mat, index=var.index.tolist(), columns=ad.obs.index.tolist()) logging.info("Writing pandas dataframe to file (slow?)") data_matrix.to_csv(tmpFname, sep='\t', index=True) else: # manual writing row-by-row should save quite a bit of memory logging.info("Writing gene-by-gene, without using pandas") ofh = open(tmpFname, "w") sampleNames = ad.obs.index.tolist() ofh.write("gene\t") ofh.write("\t".join(sampleNames)) ofh.write("\n") # when reading 10X files, read_h5 puts the geneIds into a separate field # and uses only the symbol. We prefer ENSGxxxx| as the gene ID string genes = geneStringsFromVar(ad) logging.info("Writing %d genes in total" % len(genes)) for i, geneName in enumerate(genes): if i % 2000==0: logging.info("Wrote %d genes" % i) ofh.write(geneName) ofh.write("\t") if scipy.sparse.issparse(mat): #logging.debug("Converting csr row to dense") row = mat.getrow(i).todense() else: row = mat[i,:] # Note: Faster is not possible.
savetxt() is super slow, as it is not C code # We used to have this 'np.savetxt(ofh, row, fmt="%g", delimiter="\t", newline="\t")' # But tofile() is several times faster: row.tofile(ofh, sep="\t", format="%.7g") ofh.write("\n") ofh.close() if matFname.endswith(".gz"): runGzip(tmpFname, matFname) else: renameFile(tmpFname, matFname) def makeDictDefaults(inVar, defaults): " convert inVar to dict if necessary, defaulting to our default labels " if isinstance(inVar, Mapping): return inVar d = {} for val in inVar: d[val] = defaults.get(val, val) return d def runSafeRankGenesGroups(adata, clusterField, minCells=5): " run scanpy's rank_genes_groups in a way that hopefully doesn't crash " sc = importScanpy() # see: https://github.com/scverse/scanpy/issues/747 adata.obs.index = adata.obs.index.astype(str) adata.var.index = adata.var.index.astype(str) adata.obs[clusterField] = adata.obs[clusterField].astype("category") # if not category, rank_genes will crash sc.pp.filter_genes(adata, min_cells=minCells) # rank_genes_groups crashes on zero-value genes # cell clusters with a cell count = 1 crash rank_genes, so remove their cells clusterCellCounts = list(adata.obs.groupby([clusterField]).apply(len).items()) filterOutClusters = [cluster for (cluster,count) in clusterCellCounts if count==1] if len(filterOutClusters)!=0: logging.info("Removing cells in clusters %s, as they have only a single cell, will mess up marker gene scoring" % filterOutClusters) adata = adata[~adata.obs[clusterField].isin(filterOutClusters)] logging.info("Calculating 100 marker genes for each cluster") # working around bug in scanpy 1.9.1, see https://github.com/scverse/scanpy/issues/2181 if "log1p" in adata.uns and not "base" in adata.uns['log1p']: adata.uns['log1p']["base"] = None sc.tl.rank_genes_groups(adata, groupby=clusterField) return adata def saveMarkers(adata, markerField, nb_marker, fname): " save nb_marker marker genes from adata object to fname , in a reasonable file format " symToId = None if "gene_ids" in adata.var: symToId = dict((zip(adata.var.index, adata.var["gene_ids"]))) import pandas as pd top_score=pd.DataFrame(adata.uns[markerField]['scores']).loc[:nb_marker] top_gene=pd.DataFrame(adata.uns[markerField]['names']).loc[:nb_marker] marker_df= pd.DataFrame() #for i in range(len(top_score.columns)): for clustName in top_score.columns: topScoreCol = top_score[[clustName]] topGeneCol = top_gene[[clustName]] concat=pd.concat([topScoreCol,topGeneCol],axis=1,ignore_index=True) concat['cluster_name']=clustName col=list(concat.columns) col[0],col[-2]='z_score','gene' concat.columns=col marker_df=pd.concat([marker_df,concat]) #Rearranging columns -> Cluster, gene, score cols=marker_df.columns.tolist() cols=cols[::-1] marker_df=marker_df[cols] #Export logging.info("Writing %s" % fname) #pd.DataFrame.to_csv(marker_df,fname,sep='\t',index=False) # want to keep gene IDs, so replaced pandas with manual code clusterCol = marker_df["cluster_name"] scoreCol = marker_df["z_score"] geneCol = marker_df["gene"] assert(len(clusterCol)==len(scoreCol)==len(geneCol)) sep = "\t" with open(fname, "w") as ofh: ofh.write(sep.join(["cluster_name", "gene", "z_score"])) ofh.write("\n") for cluster, score, gene in zip(clusterCol, scoreCol, geneCol): if isinstance(gene, (bytes, bytearray)): gene = gene.decode("utf8") if symToId: sym = symToId.get(gene) if sym!=None: gene = sym+"|"+gene ofh.write( sep.join( (cluster, gene, str(score)) ) ) ofh.write("\n") def exportScanpySpatial(adata, outDir, configData, coordDescs): """ Export Scanpy image to 
JPEG files in outDir and add arguments to configData """ from scanpy.plotting._tools.scatterplots import _check_spatial_data, _check_img, _empty, _check_spot_size, \ _check_scale_factor, _check_crop_coord, _check_na_color if not ("spatial" in adata.obsm and "spatial" in adata.uns): logging.debug("No 'spatial' key found in adata.obsm") return configData, coordDescs import pandas as pd coordDf=pd.DataFrame(adata.obsm["spatial"],index=adata.obs.index) libraries = adata.uns["spatial"].keys() for library_id in libraries: #Out[8]: dict_keys(['C47', 'C50', 'C56', 'IBM29', 'IBM31', 'IBM35', 'SRP1', 'SRP4']) coordsDone = False imgConfigs = [] for img_key in ["hires", "lowres"]: if not img_key in adata.uns["spatial"][library_id]["images"]: continue crop_coord = None na_color = None size = 1.0 spot_size = None scale_factor = None bw = False img = None # copied from https://github.com/scverse/scanpy/blob/034ca2823804645e0d4874c9b16ba2eb8c13ac0f/scanpy/plotting/_tools/scatterplots.py#L988 library_id, spatial_data = _check_spatial_data(adata.uns, library_id) img, img_key = _check_img(spatial_data, img, img_key, bw=bw) spot_size = _check_spot_size(spatial_data, spot_size) assert(spot_size!=None) scale_factor = _check_scale_factor( spatial_data, img_key=img_key, scale_factor=scale_factor ) crop_coord = _check_crop_coord(crop_coord, scale_factor) na_color = _check_na_color(na_color, img=img) circle_radius = size * scale_factor * spot_size * 0.5 # end copy import matplotlib imgFname = library_id+"_"+img_key+".jpg" matplotlib.image.imsave(join(outDir, imgFname), img) label = library_id+"_"+img_key imgConfigs.append({"file":imgFname, "label":label, "radius":circle_radius, \ "scale_factor":scale_factor}) if "metadata" in spatial_data: meta = dict(spatial_data["metadata"]) meta["label"] = label meta["py_spot_size"] = spot_size meta["py_radius"] = circle_radius meta["py_size"] = size meta["scalefactors"] = spatial_data["scalefactors"] if crop_coord is not None: meta["crop_coord"] = crop_coord if "spatialMeta" not in configData: configData["spatialMeta"] = [] configData["spatialMeta"].append(meta) if not coordsDone: # the 10X scale_factor indicates the relationship between pixels in the lowres/hires # images and the original TIFFs. # https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/spatial # Here, we use it to derive the min/max limits which the javascript needs to make sure # that the bitmap is really under the spots width, height = img.shape[:2] yMax = round(width / scale_factor) xMax = round(height / scale_factor) coordsDone = True #coordInfo = outConf["coords"][0] #configData["spatial"] = imgConfigs fileBase = "spatial_"+library_id+"_coords.tsv" fname = join(outDir, fileBase) logging.info("Writing spatial coords of library %s to %s" % (library_id, fname)) suffix = "-"+library_id.lower() filtCoords = coordDf[coordDf.index.str.endswith(suffix,na=False)] if filtCoords.shape[0]==0: logging.info("There are no barcodes with the right suffix for the slide %s, using all barcodes" % fileBase) filtCoords = coordDf filtCoords.to_csv(fname,sep='\t') # the origin of the image is top-left, for the spots it's bottom-left. Fixing this for now by flipping the spots on the y. coordConf = {} coordConf["images"] = imgConfigs coordConf["file"] = fileBase coordConf["shortLabel"] = library_id coordConf["flipY"] = 1 # by default, cbBuild moves the circles to the minX/minY of the coords. 
Must switch this off here to align the image with # the spots coordConf["useRaw"] = 1 # switch off: moving the minX/minY and scaling of data to uint16 coordConf["minX"] = 0 coordConf["minY"] = 0 coordConf["maxX"] = xMax coordConf["maxY"] = yMax coordDescs.append( coordConf ) return configData, coordDescs # copied from https://github.com/scverse/scanpy/blob/d7e13025b931ad4afd03b4344ef5ff4a46f78b2b/scanpy/_utils/__init__.py#L487 def check_nonnegative_integers(X): """Checks values of X to ensure it is count data""" from numbers import Integral data = X if isinstance(X, np.ndarray) else X.data # Check no negatives if np.signbit(data).any(): return False # Check all are integers elif issubclass(data.dtype.type, Integral): return True elif np.any(~np.equal(np.mod(data, 1), 0)): return False else: return True # copy end def exportScanpyOneFieldColor(fieldName, fieldValues, colors, outDir, configData): "write a single color file, for one field" outFname = join(outDir, fieldName+"_colors.tsv") logging.info("Writing colors of field %s to %s" % (fieldName, outFname)) # Debugging: print lengths and contents #print(f"Field name: {fieldName}") #print(f"Field values (length {len(fieldValues)}): {fieldValues}") #print(f"Colors (length {len(colors)}): {colors}") # Check lengths if len(fieldValues) != len(colors): logging.error("Mismatch of lengths in h5ad: %s values vs %d colors" % (len(fieldValues), len(colors))) # Handle mismatch: you can either raise an exception or handle it gracefully return ofh = open(outFname, "w") ofh.write("#val color\n") for val, color in zip(fieldValues, colors): ofh.write("%s\t%s\n" % (val, color)) ofh.close() if "colors" not in configData: configData["colors"] = {} configData["colors"][fieldName] = outFname def exportScanpyColors(adata, outDir, configData): " create one tsv with the colors per color definition in adata " for fieldName in adata.obs.keys(): colorKey = fieldName+"_colors" #if colorKey in adata.uns: #outFname = exportScanpyOneFieldColor(fieldName, adata.obs[fieldName].values.categories, adata.uns[colorKey], outDir, configData) if colorKey in adata.uns and adata.uns[colorKey] is not None and len(adata.uns[colorKey]) > 0: fieldValues = adata.obs[fieldName].values.categories colors = adata.uns[colorKey] # Check if colors is a list/array and has a non-zero length if colors is not None and len(colors) > 0: fieldValues = adata.obs[fieldName].values.categories outFname = exportScanpyOneFieldColor(fieldName, fieldValues, colors, outDir, configData) else: logging.warning("Skipping %s because colors are not available or empty." % fieldName) return configData def scanpyToCellbrowser(adata, path, datasetName, metaFields=None, clusterField=None, nb_marker=50, doDebug=False, coordFields=None, skipMatrix=False, useRaw=False, skipMarkers=False, markerField='rank_genes_groups', matrixFormat="tsv", atac=None, layer=None): """ Mostly written by Lucas Seninge, lucas.seninge@etu.unistra.fr Used by cbImportScanpy and cbScanpy Given a scanpy object, write dataset to a dataset directory under path. This function export files needed for the ucsc cells viewer from the Scanpy Anndata object :param anndata: Scanpy AnnData object where information are stored :param path : Path to folder where to save data (tsv tables) :param clusterField: name of cluster name field, used for labeling and default coloring, default is 'louvain' This field is also used to calculate markers, if no marker genes are found in the object. 
:param markerField: the ad.uns field where the marker gene calculation results are stored. This is not the meta data field used to calculate markers on. :param coordFields: list of obsm coordinates to export, default is all :param metaFields: list of metadata names (string) to export from the AnnData object (other than 'louvain' to also save (eg: batches, ...)). This can also be a dict of name -> label, if you want to have more human-readable names. :param nb_marker: number of cluster markers to store. Default: 50 :param atac: assume that the file is an ATAC seq file, changes the cellbrowser.conf output file :param layer: specify the layer to use in the anndata object """ outDir = path if doDebug is not None: setDebug(doDebug) if not isdir(outDir): makeDir(outDir) confName = join(outDir, "cellbrowser.conf") if isfile(confName): logging.warn("%s already exists. Overwriting existing files." % confName) import numpy as np import pandas as pd import anndata configData = {} # dict that can override the default arguments in the generated cellbrowser.conf file # layers are the newer way to store multiple matrices. In the old days, there only was the adata.raw matrix # in addition to ad.X. So if a layer was specified, we use that. Only when a layer was not specified do we respect # the useRaw option if layer: if layer not in adata.layers: raise ValueError("The layer '%s' does not exist. Available layers are: %s" % (layer, adata.layers.keys())) logging.info("Using layer '%s' of anndata object (not using .raw)" % layer) adata.X = adata.layers[layer] else: if useRaw: if adata.raw is None: logging.warn("The option to export raw expression data is set, but the scanpy object has no 'raw' attribute. Exporting the processed scanpy matrix. Some genes may be missing.") else: logging.info("Using raw expression matrix") adata.X = adata.raw.X adata.var = adata.raw.var mat = adata.X logging.info("Matrix has size (%d cells, %d genes)" % (mat.shape[0], mat.shape[1])) if matrixFormat=="tsv" or matrixFormat is None: matFname = join(outDir, 'exprMatrix.tsv.gz') if not skipMatrix: anndataMatrixToTsv(adata, matFname) elif matrixFormat=="mtx": if not skipMatrix: anndataMatrixToMtx(adata, outDir) configData["exprMatrix"] = "matrix.mtx.gz" else: assert(False) # invalid value for 'matrixFormat' coordDescs = [] coordFields = writeAnndataCoords(adata, coordFields, outDir, coordDescs) configData, coordDescs = exportScanpySpatial(adata, outDir, configData, coordDescs) if len(coordDescs)==0: logging.warn("No valid embeddings were found in anndata.obsm but at least one array of coordinates is usually required. Keys obsm: %s" % (coordFields)) ##Check for cluster markers if (markerField not in adata.uns or clusterField is not None) and not skipMarkers: logging.warn("Couldn't find list of cluster marker genes in the h5ad file in adata.uns with the key '%s'. " "In the future, from Python, try running sc.tl.rank_genes_groups(adata) to " "create the cluster annotation and write the h5ad file then."
% markerField) logging.info("Filtering for >5 cells then do sc.tl.rank_genes_groups for meta field '%s'" % clusterField) if "columns" in dir(adata.obs): # in older scanpy objects obs is not a pandas dataframe if clusterField not in adata.obs.columns: tryFields = ["CellType", "cell_type", "Celltypes", "Cell_type", "celltype", "annotated_cell_identity.text", "BroadCellType", "Class"] logging.info("Cluster field '%s' not in adata.obs, trying %s" % (clusterField, tryFields)) foundField = None for fieldName in tryFields: if fieldName in adata.obs.columns: logging.info("Found field '%s', using it as the cell cluster field." % fieldName) foundField = fieldName if foundField is None: logging.info("No exact match found. To find a default cluster label field, " "searching for first field that contains 'eiden', 'luste' or 'ype'") for searchTerm in ['eiden', 'luste', 'ype']: for fieldName in adata.obs.columns: if searchTerm in fieldName: logging.info("Found field %s" % fieldName) foundField = fieldName break if foundField: break if foundField is None: errAbort("Could not find field '%s' in the scanpy object. To make a cell browser, you should have a " " field like 'cluster' or " "'celltype' or 'louvain' in your object. The available fields are: %s ." "These names were tried: %s. " "Re-run the import and specify the field that contains cell-type-like annotations with the " "option --clusterField from the command line or clusterField='xxx' from Jupyter. " "If you have a use case where this field should not be required, please contact us. " % (clusterField, repr(adata.obs.columns), tryFields) ) clusterField = foundField if check_nonnegative_integers(adata.X): logging.info("Looks like expression matrix has no negatives and is all integers: taking log2 of data before running rank_groups") sc = importScanpy() sc.pp.log1p(adata) else: logging.info("Looks like expression matrix has already been log2-ed before") adata = runSafeRankGenesGroups(adata, clusterField, minCells=5) markersExported = False # the quickgenes step later needs to know if a markers.tsv file was created. if not skipMarkers: fname = join(outDir, "markers.tsv") saveMarkers(adata, markerField, nb_marker, fname) markersExported = True ##Save metadata if metaFields is None: metaFields = list(adata.obs.columns.values) else: # check that field names exist for name in metaFields: if name not in adata.obs.keys(): logging.warn('There is no annotation field with the name `%s`.' 
% name) if name not in ["percent_mito", "n_genes", "n_counts"]: # tolerate missing default fields raise ValueError() metaFields = makeDictDefaults(metaFields, metaLabels) meta_df=pd.DataFrame() for metaKey, metaLabel in iterItems(metaFields): logging.debug("getting meta field: %s -> %s" % (metaKey, metaLabel)) if metaKey not in adata.obs: logging.warn(str(metaKey) + ' field is not present in the AnnData.obs object') else: temp=adata.obs[[metaKey]] #if metaKey in metaLabels: #logging.debug("Using new name %s -> %s" % (metaKey, metaLabels[metaKey])) #temp.name = metaLabel meta_df=pd.concat([meta_df,temp],axis=1) meta_df.rename(metaFields, axis=1, inplace=True) fname = join(outDir, "meta.tsv") meta_df.to_csv(fname,sep='\t', index_label="cellId") configData = exportScanpyColors(adata, outDir, configData) if clusterField is None: clusterField = 'louvain' if clusterField: clusterFieldLabel = metaFields.get(clusterField, clusterField) configData['clusterField'] = clusterFieldLabel if markersExported: generateQuickGenes(outDir) configData['quickGenesFile'] = "quickGenes.tsv" if atac: assert("." in atac) # --atac must have a dot in it configData["atacSearch"] = atac if isfile(confName): logging.info("%s already exists, not overwriting. Remove and re-run command to recreate." % confName) else: writeCellbrowserConf(datasetName, coordDescs, confName, addMarkers=markersExported, args=configData) def writeJson(data, outFname, ignoreKeys=None): """ https://stackoverflow.com/a/37795053/233871 """ # Write JSON file tmpName = outFname+".tmp" if ignoreKeys: ignoredData = {} for key in ignoreKeys: ignoredData[key] = data del data[key] with io.open(tmpName, 'w', encoding='utf8') as outfile: #str_ = json.dumps(data, indent=2, sort_keys=True,separators=(',', ': '), ensure_ascii=False) if isPy3: str_ = json.dumps(data, indent=2, separators=(',', ': '), ensure_ascii=False) outfile.write(str_) else: str_ = json.dumps(data, indent=2, separators=(',', ': '), ensure_ascii=False, encoding="utf8") outfile.write(unicode(str_)) # pylint: disable=E0602 if ignoreKeys: data.update(ignoredData) renameFile(tmpName, outFname) logging.info("Wrote %s" % outFname) def writePyConf(idfInfo, descFname): " write python-style conf file " ofh = open(descFname, "w") for key, val in iterItems(idfInfo): ofh.write("%s = %s\n" % (key, repr(val))) ofh.close() logging.info("Wrote %s" % descFname) def writeConfig(inConf, outConf, datasetDir): " write dataset summary info to json file. Also keep a copy of the input config. 
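Concretely, two files are written: cellbrowser.json.bak, a copy of the parsed input config kept for later debugging, and dataset.json, the processed configuration that the viewer reads.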
" # keep a copy of the original config in the output directory for debugging later confName = join(datasetDir, "cellbrowser.json.bak") writeJson(inConf, confName) logging.debug("Wrote backup config %s" % confName) outConfFname = join(datasetDir, "dataset.json") writeJson(outConf, outConfFname) logging.info("Wrote main config %s" % outConfFname) def startHttpServer(outDir, port): " start an http server on localhost serving outDir on a given port " port = int(port) import RangeHTTPServer try: # py3 import http.server as SimpleHTTPServer from http.server import HTTPServer except: # py2 import SimpleHTTPServer from BaseHTTPServer import HTTPServer #server_address = ('localhost', port) # use this line to allow only access from localhost server_address = ('', port) # by default, we allow access from anywhere HandlerClass = RangeHTTPServer.RangeRequestHandler HandlerClass.protocol_version = "HTTP/1.0" httpd = HTTPServer(server_address, HandlerClass) sa = httpd.socket.getsockname() ipAddr = sa[0] if ipAddr=="0.0.0.0": ipAddr = "127.0.0.1" outDir = os.path.expanduser(outDir) os.chdir(outDir) print("Serving "+outDir+" on port %s" % str(port)) print("Point your internet browser to http://"+ipAddr+":"+str(sa[1])+" (or the IP address of this server)") if os.path.exists("/dev/null"): # can't do this on windows & Co sys.stderr = open("/dev/null", "w") # don't show http status message on console httpd.serve_forever() def cbBuild(confFnames, outDir, port=None): " stay compatible with old name " build(confFnames, outDir, port) # make sure we don't parse a conf file twice confCache = {} def findParentConfigs(inFname, dataRoot, currentName): """ return info about parents. A tuple of cellbrowser.conf filenames, the list of just the parent names and the list of (name, shortLabel) Stops at dataRoot. """ fnameList = [] pathParts = [] parentInfos = [] inFname = abspath(inFname) path = dirname(inFname) logging.debug("Looking for parents of %s (path=%s, dataRoot=%s)" % (inFname, path, dataRoot)) while path!="/" and path!=dataRoot: path = dirname(path) confFname = join(path, "cellbrowser.conf") fnameList.append(confFname) logging.debug("Found parent config at %s" % confFname) if confFname not in confCache: if not isfile(confFname): errAbort("The file %(confFname)s does not exist. It is a parent description of %(inFname)s. "\ "Since you are using dataset hierarchies, every child dataset has parent directories, "\ "and they all require a cellbrowser.conf with at least a shortLabel." 
% locals()) c = loadConfig(confFname, ignoreName=True) if path==dataRoot: c["name"] = "" confCache[confFname] = c else: c = confCache[confFname] baseName = c["name"].split("/")[-1] pathParts.append(c["name"]) parentInfo = (c["name"], c["shortLabel"]) parentInfos.append(parentInfo) logging.debug("parentInfos: %s" % parentInfos) # need to reverse now, as we went upwards pathParts = list(reversed(pathParts)) # then append the currentName baseName = currentName.split("/")[-1] pathParts.append(baseName) pathParts = [x for x in pathParts if x!=""] fullPath = "/".join(pathParts) logging.debug("parent datasets of %s are: %s, full path: %s, parts %s" % (inFname, fnameList, fullPath, pathParts)) return list(reversed(fnameList)), fullPath, list(reversed(parentInfos)) def rebuildCollections(dataRoot, webRoot, collList): " recreate the dataset.json files for a list of cellbrowser.conf files of collections " collList = list(collList) collList.sort(key = len, reverse=True) # sort by length, so the deepest levels are rebuilt first logging.debug("Need to rebuild these collections: %s" % collList) for collFname in collList: webCollDir = join(webRoot, relpath(dirname(collFname), dataRoot)) relCollDir = relpath(collFname, dataRoot) collOutFname = join(webCollDir, "dataset.json") collInfo = loadConfig(collFname, ignoreName=True) if dirname(collFname)!=dataRoot: fullPath = findParentConfigs(collFname, dataRoot, collInfo["name"])[1] else: fullPath = "" collInfo["name"] = fullPath logging.debug("Rebuilding collection %s from %s and subdirs of %s" % (collOutFname, collFname, webCollDir)) # the collections summary comes from the JSON files datasets = subdirDatasetJsonData(webCollDir) collSumm = summarizeDatasets(datasets) collInfo["datasets"] = collSumm parentFnames, fullPath, parentInfos = findParentConfigs(collFname, dataRoot, collInfo["name"]) if len(parentInfos)!=0: collInfo["parents"] = parentInfos collInfo["md5"] = md5ForList([json.dumps(collInfo)])[:MD5LEN] writeDatasetDesc(dirname(collFname), collInfo, webCollDir, coordFiles=None) writeJson(collInfo, collOutFname) def findRoot(inDir=None): """ return directory dataRoot defined in config file or CBDATAROOT environment variable. """ if 'CBDATAROOT' in os.environ: logging.debug("Using CBDATAROOT env variable") dataRoot = os.environ['CBDATAROOT'] else: dataRoot = getConfig("dataRoot") if dataRoot is None: logging.info("dataRoot is not set in ~/.cellbrowser or via $CBDATAROOT. Dataset hierarchies are not supported.") return None dataRoot = abspath(expanduser(dataRoot).rstrip("/")) if inDir is not None and not abspath(inDir).startswith(dataRoot): logging.info("input directory %s is not located under dataRoot %s. Deactivating hierarchies." % (inDir, dataRoot)) return None logging.debug("Data root is %s" % dataRoot) return dataRoot def resolveOutDir(outDir): """ user can define mapping e.g. {"alpha" : "/usr/local/apache/htdocs-cells"} in ~/.cellbrowser """ confDirs = getConfig("outDirs") if confDirs: if outDir in confDirs: outDir = confDirs[outDir] return outDir def fixupName(inConfFname, inConf): " detect hierarchical mode and construct the output path " inFullPath = abspath(inConfFname) dataRoot = findRoot(dirname(inFullPath)) if dataRoot: if "name" in inConf: logging.debug("using dataset hierarchies: 'name' in %s is ignored" % inFullPath) logging.debug("Deriving dataset name from path") inConf["name"] = basename(dirname(inFullPath)) relPath = relpath(dirname(inFullPath), dataRoot) else: if not "name" in inConf: errAbort("Not running in data hierarchy mode. 
" "The config file %s needs at least the setting 'name' to a short string, e.g. 'cortex-dev'. " "If you set a dataRoot directory in your ~/.cellbrowser file or via the CBDATAROOT variable, then the input datasets " "are assumed to be under one single directory and their subdirectory name is the datasetname. " "See https://cellbrowser.readthedocs.io/en/master/collections.html " "Otherwise, as now, they can be spread out over the entire filesystem but then the directory name cannot be used to " "derive the cell browser dataset name. Instead, a 'name' setting must be present in every cellbrowser.conf file." % inConfName) relPath = inConf["name"] dsName = inConf["name"] return dataRoot, relPath, inConf, dsName def addParents(inConfFname, dataRoot, dsName, outConf, todoConfigs): # find all parent cellbrowser.conf-files and fixup the dataset's "name" # keep a list of parent config files in todoConfigs (they will need to be updated later) if dataRoot is None: logging.info("dataRoot not set in ~/.cellbrowser, no need to rebuild hierarchy") dataRoot = None else: if not "fileVersions" in outConf: outConf = loadConfig(inConfFname, ignoreName=True) outConf["fileVersions"] = {} parentFnames, fullPath, parentInfos = findParentConfigs(inConfFname, dataRoot, dsName) todoConfigs.update(parentFnames) outConf["parents"] = parentInfos outConf["name"] = fullPath return outConf def rebuildFlatIndex(outDir): " rebuild the flat list, for legacy installs not using dataset hierarchies " logging.info("Rebuilding flat list of datasets, without hierarchies") datasets = subdirDatasetJsonData(outDir) collInfo = {} collInfo["name"] = "" collInfo["shortLabel"] = "All Datasets" collInfo["abstract"] = """This is a local Cell Browser installation, without dataset hierarchies. Please select a dataset from the list on the left.

See the documentation on dataset hierarchies. """ summInfo = summarizeDatasets(datasets) collInfo["datasets"] = summInfo outFname = join(outDir, "dataset.json") writeJson(collInfo, outFname) def checkDsCase(inConfFname, relPath, inConfig): """ relPath should not be uppercase for top-level datasets at UCSC, as we use the hostname part """ if not inConfFname.endswith(".cellbrowser.conf") and not inConfFname.endswith(".cellbrowser"): if not "/" in relPath and getConfig("onlyLower", False) and \ "name" in inConfig and inConfig["name"].isupper(): errAbort("dataset name or directory name should not contain uppercase characters, as these do not work " "if the dataset name is specified in the URL hostname itself (e.g. cortex-dev.cells.ucsc.edu)") def build(confFnames, outDir, port=None, doDebug=False, devMode=False, redo=None): " build browser from config files confFnames into directory outDir and serve on port " outDir = resolveOutDir(outDir) if outDir=="" or outDir==None: outDir = defOutDir outDir = expanduser(outDir) if not isdir(outDir): logging.warn("The directory %s does not exist. Making a new directory now." % (outDir)) setDebug(doDebug) if type(confFnames)==type(""): # it's very easy to forget that the input should be a list so we accept a single string instead logging.debug("got a string, converting to a list") confFnames = [confFnames] datasets = [] dataRoot = None todoConfigs = set() # list of filenames of parent collections that we need to update once we're done for inConfFname in confFnames: # load cellbrowser.conf if isdir(inConfFname): inConfFname = join(inConfFname, "cellbrowser.conf") logging.debug("Processing %s" % inConfFname) inConf = loadConfig(inConfFname) inDir = dirname(abspath(inConfFname)) outConf = OrderedDict() outConf["fileVersions"] = dict() dataRoot, relPath, inConf, dsName = fixupName(inConfFname, inConf) outConf = addParents(inConfFname, dataRoot, dsName, outConf, todoConfigs) checkDsCase(inConfFname, relPath, inConf) datasets.append(dsName) datasetDir = join(outDir, relPath) makeDir(datasetDir) if not "meta" in inConf: # this is a collection: just rebuild the hierarchy of us and the parents logging.info("This is a collection: just rebuilding the hierarchy") inPath = abspath(inConfFname) logging.debug("Adding %s" % inPath) todoConfigs.add(inPath) outConf["shortLabel"] = inConf["shortLabel"] else: # This is an actual dataset - the main function that does all the work is convertDataset() copyGenes(inConf, outConf, outDir) isTopLevel = ("parents" in outConf and len(outConf["parents"])==1) # important for facet checking logging.debug("Datasets parents: %s" % outConf.get("parents")) convertDataset(inDir, inConf, outConf, datasetDir, redo, isTopLevel) outConf = copyFacets(inConf, outConf) outConf["fileVersions"]["conf"] = getFileVersion(abspath(inConfFname)) outConf["md5"] = calcMd5ForDataset(outConf) writeConfig(inConf, outConf, datasetDir) if dataRoot is not None and len(todoConfigs)!=0: rebuildCollections(dataRoot, outDir, todoConfigs) else: rebuildFlatIndex(outDir) cbUpgrade(outDir, doData=False) outIndexFname = join(outDir, "js", "cellBrowser.js") if not isfile(outIndexFname): logging.info("%s does not exist: running cbUpgrade now to make sure there are static js/css files" % outIndexFname) cbUpgrade(outDir, doData=False, doCode=True) if port: print("Interrupt this process, e.g. 
with Ctrl-C, to stop the webserver") startHttpServer(outDir, int(port)) pidFname = "cellbrowser.pid" def savePid(): " save PID to file " import tempfile tempDir = tempfile.gettempdir() tempName = join(tempDir, pidFname) tempFh = open(tempName, "w") pid = os.getpid() tempFh.write("%d" % pid) tempFh.close() return pid def loadPid(): " return previously saved PID and delete the pid file " import tempfile tempDir = tempfile.gettempdir() tempName = join(tempDir, pidFname) if not isfile(tempName): return None pid = int(open(tempName).read()) os.remove(tempName) return pid def serve(outDir, port): " forking from R/ipython/jupyter is not a good idea at all. Instead, we start a shell that runs Python. " if outDir is None: raise Exception("html outDir must be set if a port is set") port = int(port) stop() cmd = [sys.executable, __file__, "cbServe", "'"+outDir+"'", str(port)] cmdStr = " ".join(cmd) + "&" logging.debug("Running command through shell: '%s'" % cmdStr) ret = os.system(cmdStr) if ret!=0: print("ERROR: Could not run command '%s'" % cmdStr) def serveDirect(outDir, port): """ run a webserver on localhost with a port, fork/detach as a daemon, and write PID into /cellbrowser.pid. NOT USED FOR NOW - Jupyther/ipython/Rstudio don't like this """ # mostly copied from # http://code.activestate.com/recipes/66012-fork-a-daemon-process-on-unix/ # do the UNIX double-fork magic, see Stevens' "Advanced # Programming in the UNIX Environment" for details (ISBN 0201563177) try: pid = os.fork() if pid > 0: # exit first parent #sys.exit(0) return except OSError as e: logging.error("fork #1 failed: %d (%s)" % (e.errno, e.strerror)) sys.exit(1) # decouple from parent environment os.chdir("/") os.setsid() os.umask(0) # do second fork try: pid = os.fork() if pid > 0: # exit from second parent #logging.error( "Daemon PID %d" % pid ) sys.exit(0) except OSError as e: logging.error("fork #2 failed: %d (%s)" % (e.errno, e.strerror) ) sys.exit(1) # start the daemon main loop pid = savePid() logging.info("Starting webserver, process ID: %d" % pid) startHttpServer(outDir, port) def stop(): " kill the process with PID in /cellbrowser.pid " import signal pid = loadPid() if pid is None: logging.debug("No saved process ID found, nothing to kill") return try: os.kill(pid, signal.SIGTERM) #or signal.SIGKILL except OSError as ex: print("Unable to kill cellbrowser process with PID %d" % pid) return print("Successfully killed cellbrowser process with PID %d" % pid) def cbBuildCli(): " command line interface for dataset converter, also copies the html/js/etc files " args, options = cbBuild_parseArgs() if len(args)!=0: errAbort("This program doesn't accept arguments. Did you forget to use the -i option?") if options.force: global doForce doForce = True confFnames = options.inConf if confFnames==None: confFnames = ["cellbrowser.conf"] if options.init: copyPkgFile("sampleConfig/cellbrowser.conf") copyPkgFile("sampleConfig/desc.conf") sys.exit(1) for fname in confFnames: if not isfile(fname): logging.error("File %s does not exist." % fname) cbBuild_parseArgs(showHelp=True) if options.outDir is None: logging.error("You have to specify at least the output directory via -o or set the env. 
variable CBOUT or set htmlDir in ~/.cellbrowser") cbBuild_parseArgs(showHelp=True) outDir = options.outDir #onlyMeta = options.onlyMeta port = options.port maxDepth = options.depth try: if options.recursive: if isPy3: confFnames = glob.glob("**/cellbrowser.conf", recursive=True) else: confFnames = glob.glob("**/cellbrowser.conf") # not the same, at least doesn't throw an error, good enough, Py2 is old filtConf = [] for cf in confFnames: if "old/" in cf or "tmp/" in cf or "not-used/" in cf or "temp/" in cf or "orig/" in cf or "ignore/" in cf or "skip/" in cf or "marker-recalc/" in cf or "marker_recalc/" in cf or "re-export/" in cf: logging.debug("Skipping %s, name suggests that it should be skipped" % cf) continue cfDepth = cf.count("/") if maxDepth and cfDepth > maxDepth: logging.info("Skipping %s, is at depth %d, but --depth %d was specified" % (cf, cfDepth, maxDepth)) continue filtConf.append(cf) confFnames = filtConf logging.debug("recursive config filenames without anything that contains old/tmp/temp/not-used/orig/ignore/skip/marker-recalc/marker_recalc/re-export/: %s" % confFnames) for cf in confFnames: logging.info("Recursive mode: processing %s" % cf) build(cf, outDir, redo=options.redo) else: build(confFnames, outDir, port, redo=options.redo) except: exc_info = sys.exc_info() print(exc_info) print(type(exc_info[0])) print(type(exc_info[1])) logging.error("Unexpected error: %s" % str(exc_info)) import traceback traceback.print_exception(*exc_info) sys.exit(1) def importLoom(inFname, reqCoords=False): " load a loom file with anndata and if reqCoords is true, fix up the obsm attributes " import pandas as pd import anndata ad = anndata.read_loom(inFname) if not reqCoords: return ad coordKeyList = (["_tSNE1", "_tSNE2"], ["_X", "_Y"], ["UMAP1","UMAP2"], \ ['Main_cluster_umap_1', 'Main_cluster_umap_2']) obsKeys = getObsKeys(ad) foundCoords = False for coordKeys in coordKeyList: if coordKeys[0] in obsKeys and coordKeys[1] in obsKeys: logging.debug("Found %s in anndata.obs, moving these fields into obsm" % repr(coordKeys)) newObj = pd.concat([ad.obs[coordKeys[0]], ad.obs[coordKeys[1]]], axis=1) ad.obsm["tsne"] = newObj del ad.obs[coordKeys[0]] del ad.obs[coordKeys[1]] foundCoords = True break if not foundCoords: logging.warn("Did not find any keys like %s in anndata.obs, cannot import coordinates" % repr(coordKeyList)) return ad def adataStringFix(adata): " some h5ad files read from 10X MTX have bytestrings in them instead of normal strings " if not type(adata.var.index[0])==str: logging.warn("Workaround: Converting byte strings in adata.var.index to normal strings") adata.var.index = adata.var.index.str.decode("utf-8") if "gene_ids" in adata.var and not type(adata.var["gene_ids"][0])==str: logging.warn("Workaround: Converting byte strings in adata.var['gene_ids'] to normal strings") adata.var["gene_ids"] = adata.var["gene_ids"].str.decode("utf-8") return adata def readMatrixAnndata(matrixFname, samplesOnRows=False, genome="hg38", reqCoords=False): """ read an expression matrix and return an adata object. Supports .mtx, .h5 and .tsv (not .tsv.gz) If reqCoords is True, will try to find and convert dim. reduc. coordinates in a file (loom). 
""" sc = importScanpy() #if matrixFname.endswith(".mtx.gz"): #errAbort("For cellranger3-style .mtx files, please specify the directory, not the .mtx.gz file name") if matrixFname.endswith(".h5ad"): logging.info("File name ends with .h5ad: Loading %s using sc.read" % matrixFname) adata = sc.read_h5ad(matrixFname) elif matrixFname.endswith(".loom"): adata = importLoom(matrixFname, reqCoords=reqCoords) elif isMtx(matrixFname): import pandas as pd logging.info("Loading expression matrix: mtx format") adata = sc.read_mtx(matrixFname).T _mtxFname, geneFname, barcodeFname = findMtxFiles(matrixFname) adata.var_names = pd.read_csv(geneFname, header=None, sep='\t')[1] adata.obs_names = pd.read_csv(barcodeFname, header=None, sep='\t')[0] elif isdir(matrixFname): logging.info("Loading expression matrix: cellranger3 mtx.gz format from %s" % matrixFname) adata = sc.read_10x_mtx(matrixFname) elif matrixFname.endswith(".h5"): import h5py ifh = h5py.File(matrixFname,'r') groups = list(ifh.keys()) ifh.close() # "matrix" indicates a cellranger3 file which does not have a clear genome list if "matrix" not in groups and genome not in groups: errAbort("The file %s does not have expression info for the genome %s. Possible genomes are: %s. " "Choose one of these and and specify it with the option -g" % (matrixFname, genome, groups)) logging.info("Loading expression matrix: 10X h5 format") adata = sc.read_10x_h5(matrixFname, genome=genome) else: logging.info("Loading expression matrix: Filename does not end with h5ad/mtx/loom, so trying sc.read for any other scanpy-supported format, like tab-separated, etc.") if matrixFname.endswith(".loom"): logging.info("This is a loom file: activating --samplesOnRows") samplesOnRows = True adata = sc.read(matrixFname, first_column_names=True) if not samplesOnRows: logging.info("Scanpy defaults to samples on lines, so transposing the expression matrix, use --samplesOnRows to change this") adata = adata.T adata = adataStringFix(adata) return adata def addMd5(d, fname, keyName="md5", isSmall=False, shortMd5=True): " add a key 'md5' to dict d with first MD5LEN letters of fname " logging.debug("Getting md5 of %s" % fname) md5 = md5ForFile(fname, isSmall=isSmall)[:MD5LEN] if shortMd5: md5 = md5[:MD5LEN] d[keyName] = md5 def calcMd5ForDataset(datasetDesc): " make a combined md5 for the dataset, based on the config file, meta, expression and the html or dataset descs " md5s = [] if "fileVersions" in datasetDesc: fileVers = datasetDesc["fileVersions"] for key, fileInfo in iterItems(fileVers): md5s.append(fileInfo["md5"]) if "descMd5s" in datasetDesc: descs = datasetDesc["descMd5s"] md5s.append (descs.get("summary", "")) md5s.append (descs.get("methods", "")) md5s.append (descs.get("downloads", "")) return md5ForList(md5s)[:MD5LEN] def subdirDatasetJsonData(searchDir, skipDir=None): " find all dataset.json files under searchDir and return their contents as a list " logging.debug("Searching directory %s for datasets" % searchDir) datasets = [] dsNames = defaultdict(list) for subDir in os.listdir(searchDir): subPath = join(searchDir, subDir) if subPath==skipDir: logging.debug("Skipping directory %s" % subPath) continue fname = join(subPath, "dataset.json") if not isfile(fname): continue logging.debug("Found %s" % fname) datasetDesc = readJson(fname, keepOrder=True) # we need at least a name, an md5 and a shortLabel if not "md5" in datasetDesc: datasetDesc["md5"] = calcMd5ForDataset(datasetDesc) if not "name" in datasetDesc: errAbort("The file dataset.json for the subdirectory %s is not 
valid: There is no 'name' attribute. Please rebuild it, or remove it, then " "come back here and retry the cbBuild command" % subDir) if not "shortLabel" in datasetDesc: errAbort("The file dataset.json for the subdirectory %s is not valid: There is no 'shortLabel' attribute. Please rebuild it, or remove it, then " "come back here and retry the cbBuild command" % subDir) dsName = datasetDesc["name"] if dsName in dsNames: errAbort("Duplicate name: %s appears in these directories: %s and %s" % \ (dsName, dsNames[dsName], subDir)) dsNames[dsName].append(subDir) datasetDesc["baseUrl"] = subDir+"/" datasets.append(datasetDesc) datasets = list(sorted(datasets, key=lambda k: k.get('priority', 10))) logging.info("Found %d datasets in subdirectories of directory %s" % (len(datasets), searchDir)) return datasets def findDatasets(outDir): """ search all subdirs of outDir for dataset.json files and return their contents as a list of dataset description is a list with three members: A label, the base URL and a longer description that can contain html. The attribute "priority" can be used to enforce an order on the datasets """ datasets = [] dsNames = defaultdict(list) for subDir in os.listdir(outDir): if not isdir(join(outDir, subDir)): continue if subDir.endswith(".skip"): continue fname = join(outDir, subDir, "dataset.json") if not isfile(fname): continue datasetDesc = readJson(fname, keepOrder=True) if "isCollection" in datasetDesc: logging.debug("Dataset %s is a collection, so not parsing now" % datasetDesc["name"]) continue # a few older datasets don't have MD5s, tolerate that for now if not "md5" in datasetDesc: datasetDesc["md5"] = calcMd5ForDataset(datasetDesc) #errAbort("dataset %s has no md5" % datasetDesc) if not "name" in datasetDesc: # every dataset has to have a name errAbort("Dataset %s must have a 'name' field." 
% datasetDesc) dsName = datasetDesc["name"] if dsName in dsNames: errAbort("Duplicate name: %s appears in these directories: %s and %s" % \ (dsName, dsNames[dsName], subDir)) dsNames[dsName].append(subDir) #assert("shortLabel" in datasetDesc) if not "shortLabel" in datasetDesc: datasetDesc["shortLabel"] = datasetDesc["name"] datasetDesc["baseUrl"] = subDir+"/" datasets.append(datasetDesc) datasets = list(sorted(datasets, key=lambda k: k.get('priority', 0))) logging.info("Found %d datasets" % len(datasets)) return datasets def copyAllFiles(fromDir, subDir, toDir, ext=None): " copy all files in fromDir/subDir to toDir/subDir " outDir = join(toDir, subDir) makeDir(outDir) logging.debug("Copying all files from %s/%s to %s" % (fromDir, subDir, toDir)) for filename in glob.glob(join(fromDir, subDir, '*')): if isdir(filename): continue if ext and not filename.endswith(ext): continue fullPath = filename dstPath = join(outDir, basename(filename)) shutil.copyfile(fullPath, dstPath) def copyAndReplace(inFname, outDir): " copy file, replacing $VERSION and $GENEFILES " try: from ._version import get_versions versionStr = get_versions()['version'] except: versionStr = "versioneerPackageNotInstalled" data = open(inFname).read() data = data.replace("$VERSION$", versionStr) outFname = join(outDir, basename(inFname)) with open(outFname, "w") as ofh: ofh.write(data) logging.debug("Wrote version string %s into file %s, source was %s" % (repr(versionStr), inFname, outFname)) def copyStatic(baseDir, outDir): " copy all js, css and img files to outDir " logging.info("Copying js, css and img files to %s" % outDir) imgDir = join(outDir, "img") copyAllFiles(baseDir, "ext/images", outDir) copyAllFiles(baseDir, "img", outDir) copyAllFiles(baseDir, "ext", outDir) copyAllFiles(baseDir, "js", outDir) copyAllFiles(baseDir, "css", outDir) copyAllFiles(baseDir, "genes", outDir, ext=".json.gz") copyAndReplace(join(baseDir, "js", "cellBrowser.js"), join(outDir, "js")) def writeVersionedLink(ofh, mask, webDir, relFname, addVersion=True): " write sprintf-formatted mask to ofh, but add ?md5 to jsFname first. Goal is to force cache reload in browser. " # hack for jquery - avoid jquery button overriding any other button function if relFname.endswith("jquery-ui.min.js"): ofh.write(""" """) filePath = join(webDir, relFname) md5 = md5WithPython(filePath) if addVersion: verFname = relFname+"?"+md5[:MD5LEN] else: verFname = relFname outLine = mask % verFname ofh.write(outLine+"\n") def writeGaScript(ofh, gaTag): " write Google analytics script to ofh " ofh.write(""" """ % (gaTag, gaTag)) def summarizeDatasets(datasets): """ keep only the most important fields of a list of datasets and return them as a list of dicts. Also create a new md5 from all the datasets. """ dsList = [] for ds in datasets: if ds.get("visibility")=="hide" or ds.get("hideDataset") in [True, "True", "true", 1, "1"]: logging.debug("Hiding dataset %s" % ds["name"]) continue # these must always be present if not "shortLabel" in ds: errAbort("Dataset '%s' has no shortLabel attribute. Internal error." 
% repr(ds["name"])) summDs = { "shortLabel" : ds["shortLabel"], "name" : ds["name"], "md5" : ds["md5"], } # these are copied if they are present #copyTags = ["body_parts"] #for t in copyTags: #if t in ds: #summDs[t] = ds[t] # these are copied and checked for the correct type # DEPRECATE in 2025: duplicated facets for optListTag in ["tags", "hasFiles", "body_parts", "diseases", "organisms", "projects", "life_stages", \ "domains", "sources", "assays"]: if optListTag in ds: if (type(ds[optListTag])==type([])): # has to be a list summDs[optListTag] = ds[optListTag] else: logging.error("Dataset: %s" % ds["name"]) logging.error("Setting '%s' must be a list of values, not a number or string." % optListTag) # the above tags are replaced with the more modern dictionary 'facets' (the old code is still there, for backwards compatibility if "facets" in ds: summDs["facets"] = dict(ds["facets"]) # these are generated if "sampleCount" in ds: summDs["sampleCount"] = ds["sampleCount"] else: summDs["isCollection"] = True if not "datasets" in ds: errAbort("The dataset %s has a dataset.json file that looks invalid. Please remove the file and rebuild that dataset. " "Then go back to the current directory and retry the same command. " % ds["name"]) children = ds["datasets"] collCount = 0 dsCount = 0 for child in children: if "datasets" in child or "isCollection" in child: collCount += 1 else: dsCount +=1 if dsCount!=0: summDs["datasetCount"] = dsCount if collCount!=0: summDs["collectionCount"] = collCount dsList.append(summDs) dsList = list(sorted(dsList, key=operator.itemgetter('shortLabel'))) return dsList def mergeJsFiles(extJsFnames, baseDir, libFname): # we are starting to have way too many .js files. Reduce all external ones to a single file jsOfh = open(libFname, "w") for fn in extJsFnames: jsOfh.write(open(join(baseDir, fn)).read()) jsOfh.write(";\n/* ---------- FILE END ------------ */\n") jsOfh.close() def makeIndexHtml(baseDir, outDir, devMode=False): " make the index.html, copy over all .js and related files and add their md5s " newFname = join(outDir, "index.html") tmpFname = newFname+".tmp" ofh = open(tmpFname, "w") ofh.write("\n") ofh.write('\n') ofh.write('\n') ofh.write('\n') ofh.write('') ofh.write('') ofh.write('') ofh.write('UCSC Cell Browser\n') cssFnames = ["ext/jquery-ui-1.12.1.css", "ext/spectrum-1.8.0.css", "ext/jquery.contextMenu.css", "ext/slick.grid.css", "ext/slick.examples.css", #"ext/jquery-ui-1.11.3.custom.css", "ext/jquery.tipsy.1.0.3.min.css", "ext/bootstrap.min.css", "ext/introjs.2.4.0.min.css", "ext/bootstrap-submenu.min.css", "ext/bootstrap-dropmenu.min.css", "ext/font-awesome.css", "ext/googleMaterialIcons.css", "ext/chosen.1.8.2.min.css", "ext/select2.4.0.4.min.css", "ext/selectize.bootstrap3.css", #"ext/selectize.0.12.4.min.css", "ext/OverlayScrollbars.min.css", # 1.6.2, from https://cdnjs.com/libraries/overlayscrollbars #"ext/theme.default.css", # tablesorter "ext/theme.bootstrap_3.css", # tablesorter "css/cellBrowser.css" ] addVersion = not devMode for cssFname in cssFnames: writeVersionedLink(ofh, '', baseDir, cssFname, addVersion=addVersion) extJsFnames = ["ext/FileSaver.1.1.20151003.min.js", "ext/jquery.3.1.1.min.js", "ext/palette.js", "ext/spectrum.min.js", "ext/jsurl2.js", "ext/chosen.jquery.min.js", "ext/mousetrap.min.js", "ext/jquery.contextMenu.js", "ext/jquery.ui.position.min.js", "ext/jquery.tipsy.min.js", "ext/intro.min.js", "ext/papaparse.min.js", "ext/bootstrap.min.js", "ext/bootstrap-submenu.js", "ext/pako_inflate.min.js", "ext/FastBitSet.js", 
"ext/hamster.js", "ext/split.js", "ext/normalizeWheel.js", #"ext/tablesort.js", "ext/tablesort.number.min.js", "ext/Chart.bundle.min.js", "ext/chartjs-chart-box-and-violin-plot.js", "ext/jquery-ui.min.js", "ext/select2.min.js", "ext/selectize.js", # 0.12.16 "ext/jquery.sparkline.min.js", "ext/jquery.overlayScrollbars.min.js", # 1.6.2 from https://cdnjs.com/libraries/overlayscrollbars "ext/jquery.event.drag-2.3.0.js", # for slickgrid 2.4.5 "ext/jquery.tablesorter.js", "ext/jquery.tablesorter.widgets.js", "ext/lz-string.js", # 1.4.4, https://raw.githubusercontent.com/pieroxy/lz-string/master/libs/lz-string.js "ext/slick.core.js", "ext/slick.cellrangedecorator.js", "ext/slick.cellrangeselector.js", "ext/slick.cellselectionmodel.js", "ext/slick.editors.js", "ext/slick.formatters.js", "ext/slick.grid.js", "ext/tiny-queue.js", "ext/science.v1.js", "ext/reorder.v1.js", # commit d51dda9ad5cfb987b9e7f2d7bd81bb9bbea82dfe "ext/scaleColorPerceptual.js", # https://github.com/politiken-journalism/scale-color-perceptual tag 1.1.2 "ext/drawImage-clipper.js", # Safari Monkeypatch https://gist.github.com/Kaiido/ca9c837382d89b9d0061e96181d1d862 ] # we are starting to have way too many .js files. Reduce all external ones to a single file libFname = join(outDir, "ext", "cbLib.js") #mergeJsFiles(extJsFnames, baseDir, libFname) #md5 = md5WithPython(join(outDir, "ext/cbLibs.js")) #ofh.write('\n' % md5[:MD5LEN]) ourJsFnames = [ "js/cellBrowser.js", "js/cbData.js", "js/maxPlot.js", "js/maxHeat.js", ] for jsFname in ourJsFnames: writeVersionedLink(ofh, '', baseDir, jsFname, addVersion=addVersion) for jsFname in extJsFnames: writeVersionedLink(ofh, '', baseDir, jsFname, addVersion=addVersion) # at UCSC, for grant reports, we need to get some idea how many people are using the cell browser ofh.write('\n') if getConfig("gaTag") is not None: gaTag = getConfig("gaTag") writeGaScript(ofh, gaTag) md5 = md5WithPython(join(outDir, "dataset.json")) ofh.write('\n') ofh.write('\n') #ofh.write('

Please wait. Cell Browser is loading...
\n') ofh.write('\n') ofh.write('\n'); ofh.write('\n') ofh.close() renameFile(tmpFname, newFname) #datasetLabels = [x["name"] for x in dsList] logging.info("Wrote %s (devMode: %s)" % (newFname, devMode)) # #def removeHiddenDatasets(datasets): # """ visibility="hidden" removes datasets from the list, e.g. during paper review """ # newDsList = [] # for ds in datasets: # if ds.get("visibility")=="hide": # logging.debug("Dataset %s is set to hide, skipping" % ds["name"]) # continue # newDsList.append(ds) # return newDsList def cbMake(outDir, devMode=False): cbUpgrade(outDir, devMode=devMode) #def makeDatasetListJsons(datasets, outDir): # " recusively write dataset.json files from datasets to outDir " # for dataset in datasets: # if "children" in dataset: # makeDatasetListJsons(dataset["children"], join(outDir, dataset["name"])) # outFname = join(outDir, "dataset.json") # writeJson(dataset, outFname, ignoreKeys=["children"]) def cbUpgrade(outDir, doData=True, doCode=False, devMode=False, port=None): """ Rebuild index.html in outDir. If doData is set: re-index all top-level datasets and recreate dataset.json If doCode is set: copy all js/css If port is set to number: start the minimal webserver. """ logging.debug("running cbUpgrade, doData=%s, doCode=%s, devMode=%s" % (doData, doCode, devMode)) baseDir = dirname(__file__) # = directory of this script webDir = join(baseDir, "cbWeb") outDir = resolveOutDir(outDir) if doData: dataRoot = findRoot() if not dataRoot: logging.info("Not using dataset hierarchies: no need to rebuild dataset list") else: topConfig = join(dataRoot, "cellbrowser.conf") rebuildCollections(dataRoot, outDir, [topConfig]) if doCode or devMode: copyStatic(webDir, outDir) makeIndexHtml(webDir, outDir, devMode=devMode) if port: print("Interrupt this process, e.g. with Ctrl-C, to stop the webserver") startHttpServer(outDir, int(port)) def cbUpgradeCli(): " command line interface for copying over the html and js files and recreate index.html " args, options = cbUpgrade_parseArgs() outDir = options.outDir if outDir is None: errAbort("You have to specify at least the output directory or set the environment variable CBOUT.") if len(args)!=0: errAbort("This command does not accept arguments without options. Did you mean: -o ? 
") cbUpgrade(outDir, doCode=options.addCode, devMode=options.devMode, port=options.port) def parseGeneLocs(db, geneType): """ return dict with geneId -> list of bedRows bedRows have (chrom, start, end, geneId, score, strand) """ if isfile(geneType): fname = geneType else: fname = getStaticFile(join("genes", db+"."+geneType+".bed.gz")) logging.info("Reading gene locations from %s" % fname) ret = defaultdict(list) for line in openFile(fname): row = line.rstrip("\r\n").split('\t') name = row[3].split(".")[0] row = row[:6] ret[name].append(row) return ret def extractMatrix(inMatrixFname, hubMatrixFname): if isfile(hubMatrixFname): logging.info("Not extracting to %s, file already exists" % hubMatrixFname) else: logging.info("Extracting matrix to %s" % hubMatrixFname) cmd = "gunzip -c %s > %s" % (inMatrixFname, hubMatrixFname) runCommand(cmd) def getSizesFname(genome): " return chrom.sizes filename for db " fname = getStaticFile(join("genomes", genome+".sizes")) assert(isfile(fname)) return fname def getAliasFname(genome): " return .chromAlias.tsv filename for db " fname = getStaticFile(join("genomes", genome+".chromAlias.tsv")) assert(isfile(fname)) # try 'chromToUcsc --get ' or email us so we can add the file return fname def pipeLog(msg): logging.info(msg) # for iphython debug command line def excepthook(type, value, traceback): from IPython import embed embed() def checkLayouts(conf): """ it's very easy to get the layout names wrong: check them and handle the special value 'all' """ if "doLayouts" not in conf: return ["tsne", "drl", "umap"] doLayouts = conf["doLayouts"] if doLayouts=="all": doLayouts = recommendedLayouts for l in doLayouts: if l not in coordLabels: errAbort("layout name %s is not valid. Valid layouts: %s" % (l, str(coordLabels))) return doLayouts class Tee(object): " file like object that writes to stdout and also to a file " def __init__(self, fname): self.terminal = sys.stdout self.log = open(fname, "a") self.fname = fname def write(self, message): self.terminal.write(message) self.log.write(message) def flush(self): #this flush method is needed for python 3 compatibility. #this handles the flush command by doing nothing. #you might want to specify some extra behavior here. pass def addMetaToAnnData(adata, fname): " parse .csv or .tsv and add the meta data as fields to adata.obs " logging.info("Adding meta data from %s to anndata object" % fname) import pandas as pd df1 = adata.obs sep = sepForFile(fname) df2 = pd.read_csv(fname, sep=sep, index_col=0) ids1 = set(df1.index) ids2 = set(df2.index) commonIds = ids1.intersection(ids2) logging.info("Meta data from %s has %d cell identifiers in common with anndata" % (fname, len(commonIds))) if len(commonIds)==0: l1 = list(sorted(ids1)) l2 = list(sorted(ids2)) errAbort("Values in first column in file %s does not seem to match the cell IDs from the expression matrix. Examples: expression matrix: %s, meta data: %s" % (fname, l1[:3], l2[:3])) try: df3 = df1.join(df2, how="inner") logging.info("%d meta data columns before setting in anndata object" % len(adata.obs.index)) adata.obs = df3 logging.debug("list of column names in merged meta data: %s"% ",".join(list(df3.columns))) logging.info("%d meta data columns before setting" % len(df3.index)) logging.info("%d meta data columns after setting in anndata object" % len(adata.obs.index)) except ValueError: logging.warn("Could not merge h5ad and meta data, skipping h5ad meta and using only provided meta. 
Most likly this happens when the field names in the h5ad and the meta file overlap") adata.obs = df2 return adata def getObsKeys(adata): "get the keys of the obs object. Anndata broke compatibility, so try to accept two ways" try: obsKeys = adata.obsKeys.dtype.names # this used to work except: obsKeys = list(adata.obs.keys()) # this seems to work with newer versions return obsKeys def getObsmKeys(adata): "get the keys of the obsm object. Has this changed in newer versions of anndata? " try: obsmKeys = adata.obsm.dtype.names # this used to work except: obsmKeys = list(adata.obsm.keys()) # this seems to work with newer versions return obsmKeys def cbScanpy(matrixFname, inMeta, inCluster, confFname, figDir, logFname, skipMarkers): """ run expr matrix through scanpy, output a cellbrowser.conf, a matrix and the meta data. Return an adata object. Optionally keeps a copy of the raw matrix in adata.raw """ sc = importScanpy() import pandas as pd import numpy as np import warnings warnings.filterwarnings("ignore") conf = maybeLoadConfig(confFname) doLayouts = checkLayouts(conf) sc.settings.set_figure_params(dpi=200) sc.settings.file_format_figs = 'png' sc.settings.plot_suffix='' sc.settings.autosave=True sc.settings.autoshow=False sc.settings.figdir=figDir #sc.settings.logfile=logFname #sc.settings.logDir=join(outDir, "cbScanpy.log") sys.stdout = Tee(logFname) # we want our log messages and also scanpy messages into one file pipeLog("cbScanpy $Id$") pipeLog("Command: %s" % " ".join(sys.argv)) pipeLog("Matrix input file: %s" % matrixFname) pipeLog("Restricting OPENBLAS to 4 threads") os.environ["OPENBLAS_NUM_THREADS"] = "4" # export OPENBLAS_NUM_THREADS=4 #printLog("Output directory: %s" % outDir) pipeLog("Start time: %s" % datetime.datetime.now()) sc.logging.print_versions() start = timeit.default_timer() adata = readMatrixAnndata(matrixFname, samplesOnRows=options.samplesOnRows, genome=options.genome) # set default values conf["useRaw"] = conf.get("useRaw", True) conf["doExp"] = conf.get("doExp", False) conf["doTrimCells"] = conf.get("doTrimCells", True) conf["minGenes"] = conf.get("minGenes", 200) conf["doTrimGenes"] = conf.get("doTrimGenes", True) conf["minCells"] = conf.get("minCells", 3) conf["doFilterMito"] = conf.get("doFilterMito", True) conf["geneIdType"] = conf.get("geneIdType", "auto") conf["mitoMax"] = conf.get("mitoMax", 0.05) conf["doFilterGenes"] = conf.get("doFilterGenes", True) conf["filterMaxGenes"] = conf.get("filterMaxGenes", 15000) conf["filterMinGenes"] = conf.get("filterMinGenes", 10) conf["doNormalize"] = conf.get("doNormalize", True) conf["countsPerCell"] = conf.get("countsPerCell", 10000) conf["doLog"] = conf.get("doLog", True) conf["doTrimVarGenes"] = conf.get("doTrimVarGenes", True) conf["varMinMean"] = conf.get("varMinMean", 0.0125) conf["varMaxMean"] = conf.get("varMaxMean", 3) conf["varMinDisp"] = conf.get("varMinDisp", 0.5) conf["doRegress"] = conf.get("doRegress", True) conf["regressMax"] = conf.get("regressMax", 10) conf["pcCount"] = conf.get("pcCount", "auto") conf["doLayouts"] = doLayouts conf["doLouvain"] = conf.get("doLouvain", True) conf["doPca"] = conf.get("doPca", True) conf["louvainNeighbors"] = int(conf.get("louvainNeighbors", 6)) conf["louvainRes"] = float(conf.get("louvainRes", 1.0)) conf["doMarkers"] = conf.get("doMarkers", True) if skipMarkers: conf["doMarkers"] = False conf["markerCount"] = int(conf.get("markerCount", 20)) conf["inMeta"] = conf.get("inMeta", inMeta) conf["inCluster"] = conf.get("inCluster", inCluster) if conf["inMeta"]: adata = 
addMetaToAnnData(adata, inMeta) sampleCount = len(adata.obs) bigDataset = False if sampleCount > 140000: bigDataset = True #useRaw = conf["useRaw"] #if useRaw and not bigDataset: #adata.raw = adata # this is doing much more than assigning, it calls implicitely a function that copies ## a few things around. See the anndata source code under basic.py #else: #bigDataset = True #logging.info("Big dataset: not keeping a .raw copy of the data to save memory during analysis") if "doExp" in conf and conf["doExp"]: pipeLog("Undoing log2 of data, by running np.expm1 on the matrix") adata.X = np.expm1(adata.X) pipeLog("Data has %d samples/observations" % len(adata.obs)) pipeLog("Data has %d genes/variables" % len(adata.var)) if conf["doTrimCells"]: minGenes = conf["minGenes"] pipeLog("Basic filtering: keep only cells with min %d genes" % (minGenes)) # see: https://github.com/scverse/scanpy/issues/747 adata.obs.index = adata.obs.index.astype(str) adata.var.index = adata.var.index.astype(str) sc.pp.filter_cells(adata, min_genes=minGenes) # adds n_genes to adata if conf["doTrimGenes"]: minCells = conf["minCells"] pipeLog("Basic filtering: keep only gene with min %d cells" % (minCells)) sc.pp.filter_genes(adata, min_cells=minCells) # adds n_counts to adata.obs? pipeLog("After filtering: Data has %d samples/observations and %d genes/variables" % (len(adata.obs), len(adata.var))) sampleCountPostFilter = len(adata.obs) if len(list(adata.obs_names))==0: errAbort("No cells left after filtering. Consider lowering the minGenes/minCells cutoffs in scanpy.conf") if len(list(adata.var_names))==0: errAbort("No genes left after filtering. Consider lowering the minGenes/minCells cutoffs in scanpy.conf") removedRatio = 1.0 - (float(sampleCountPostFilter) / float(sampleCount)) if removedRatio > 0.5: logging.warn("!! The filtering removed more than 50% of cells - are you sure this is intentional? Consider lowering minGenes/minCells in scanpy.conf") if removedRatio > 0.9: errAbort("More than 90% of cells were filtered out, this is unlikely to make sense") if not "n_counts" in list(adata.obs.columns.values): logging.debug("Adding obs.n_counts") adata.obs['n_counts'] = np.sum(adata.X, axis=1) #### PARAMETERS FOR GATING CELLS (must be changed) ##### logging.info("Making gene names unique") adata.var_names_make_unique() if conf["doFilterMito"]: geneIdType = conf["geneIdType"] if geneIdType is None or geneIdType=="auto": logging.info("'geneIdType' is not specified in config file or set to 'auto'.") geneIds = list(adata.var_names) geneIdType = guessGeneIdType(geneIds) thrsh_mito=conf["mitoMax"] pipeLog("Remove cells with more than %f percent of mitochondrial genes" % thrsh_mito) pipeLog("Computing percentage of mitochondrial genes") mito_genes = [name for name in adata.var_names if name.startswith('MT.') or name.startswith('MT-')] if len(mito_genes)>50: errAbort("Strange expression matrix - more than 50 mitochondrial genes?") if len(mito_genes)==0: gencodeMitos = readMitos(geneIdType) mito_genes = [name for name in adata.var_names if name.split('.')[0] in gencodeMitos] if(len(mito_genes)==0): # no single mitochondrial gene in the expression matrix ? 
pipeLog("WARNING - No single mitochondrial gene was found in the expression matrix.") pipeLog("Apoptotic cells cannot be removed - please check your expression matrix") doMito = False conf["mitosFound"] = False else: doMito = True adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1) / np.sum(adata.X, axis=1) obsFields = list(adata.obs.columns.values) if "n_genes" in obsFields: # n_counts is always there sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True) adata = adata[adata.obs['percent_mito'] < thrsh_mito, :] if conf["doFilterGenes"]: up_thrsh_genes=conf["filterMaxGenes"] low_thrsh_genes=conf["filterMinGenes"] pipeLog("Remove cells with less than %d and more than %d genes" % (low_thrsh_genes, up_thrsh_genes)) #Filtering out cells according to filter parameters pipeLog('Keeping only cells with < %d genes' % up_thrsh_genes) adata = adata[adata.obs['n_genes'] < up_thrsh_genes, :] pipeLog("After filtering: Data has %d samples/observations and %d genes/variables" % (len(adata.obs), len(adata.var))) pipeLog('Keeping only cells with > %d genes' % low_thrsh_genes) adata = adata[adata.obs['n_genes'] > low_thrsh_genes, :] pipeLog("After filtering: Data has %d samples/observations and %d genes/variables" % (len(adata.obs), len(adata.var))) if conf["doNormalize"]: countsPerCell = conf["countsPerCell"] pipeLog('Expression normalization, counts per cell = %d' % countsPerCell) sc.pp.normalize_per_cell(adata, counts_per_cell_after=countsPerCell) # change June 2019: move up log, use new trim function, make default True if conf["doLog"]: sc.pp.log1p(adata) pipeLog("Did log2'ing of data") if conf.get("doTrimVarGenes", True): if not conf["doLog"]: errAbort("you have set the option doLog=False but doTrimGenes is True. In >= Scanpy 1.4, this is not allowed anymore. If your dataset is already log2'ed, set doExp=True to reverse this and also doLog=True to re-apply it later, if you want to find variable genes.") minMean = conf["varMinMean"] maxMean = conf["varMaxMean"] minDisp = conf["varMinDisp"] pipeLog('Finding highly variable genes') try: sc.pp.highly_variable_genes(adata, min_mean=minMean, max_mean=maxMean, min_disp=minDisp) sc.pl.highly_variable_genes(adata) except: if conf.get("doExp")!=True: pipeLog("An error occurred when finding highly variable genes. This may be due to an input matrix file that is log'ed. 
Set doExp=True in the cbScanpy config file to undo the logging when the matrix is loaded and rerun the cbScanpy command") raise adata = adata[:, adata.var['highly_variable']] pipeLog("After high-var filtering: Data has %d samples/observations and %d genes/variables" % (len(adata.obs), len(adata.var))) #Regress out variables nUMI, percent_mito if conf["doRegress"]: if doMito: pipeLog('Regressing out percent_mito and number of UMIs') sc.pp.regress_out(adata, ['n_counts', 'percent_mito']) else: pipeLog('Regressing out number of UMIs') sc.pp.regress_out(adata, ['n_counts']) #Scaling after regression maxValue = conf["regressMax"] pipeLog('Scaling data, max_value=%d' % maxValue) sc.pp.scale(adata, max_value=maxValue) if conf["doPca"]: pcCount = conf["pcCount"] if pcCount=="auto": firstPcCount = 100 else: firstPcCount = pcCount pipeLog('Performing initial PCA, number of PCs: %d' % firstPcCount) sc.tl.pca(adata, n_comps=firstPcCount) #Multiply by -1 to compare with Seurat #adata.obsm['X_pca'] *= -1 #Plot of pca variance ratio to see if formula matches visual determination of pc_nb to use sc.pl.pca_variance_ratio(adata, log=True) #Computing number of PCs to be used in clustering if pcCount == "auto": pipeLog("Estimating number of useful PCs based on Shekar et al, Cell 2016") pipeLog("PC weight cutoff used is (sqrt(# of Genes/# of cells) + 1)^2") pipeLog("See http://www.cell.com/cell/fulltext/S0092-8674(16)31007-8, STAR methods") pc_cutoff = ( np.sqrt(float(len(adata.var))/len(adata.obs)) +1 ) **2 pc_nb = 0 for i in adata.uns['pca']['variance']: if i > pc_cutoff: pc_nb+=1 pipeLog('%d PCs will be used for tSNE and clustering' % pc_nb) if pc_nb <= 2: errAbort("Number of useful PCs is too small. The formula from Shekar et all did not work (not gene expression data?). Please set the " "number of PCs manually via the pcCount variable in scanpy.conf") else: pc_nb = int(pcCount) pipeLog("Using %d PCs as configured in config" % pcCount) if conf["doPca"] and "tsne" in doLayouts: pipeLog('Performing tSNE') sc.tl.tsne(adata, n_pcs=int(pc_nb), random_state=2, n_jobs=8) if conf["doPca"] and (not inCluster or "umap" in doLayouts or conf["doLouvain"]): neighbors = conf["louvainNeighbors"] res = conf["louvainRes"] pipeLog('Running knn, using %d PCs and %d neighbors' % (pc_nb, neighbors)) sc.pp.neighbors(adata, n_pcs=int(pc_nb), n_neighbors=neighbors) if not inCluster or (conf["doPca"] and conf["doLouvain"]): pipeLog('Performing Louvain Clustering, resolution %f' % (res)) sc.tl.louvain(adata, resolution=res) pipeLog("Found %d louvain clusters" % len(adata.obs['louvain'].unique())) clusterField = conf.get("inCluster") if clusterField is None: clusterField = "louvain" logging.info("Using cluster annotation from field: %s" % clusterField) if "tsne" in doLayouts: try: sc.pl.tsne(adata, color=clusterField) except: logging.error("Error while plotting with Scanpy, ignoring the error") if "umap" in doLayouts: pipeLog("Performing UMAP") sc.tl.umap(adata) if "phate" in doLayouts: try: import phate hasPhate = True except: hasPhate = False if not hasPhate: pipeLog("Phate is not installed, cannot run PHATE, run 'pip install phate' to install it") else: pipeLog("Performing PHATE") sc.tl.phate(adata) obsmKeys = getObsmKeys(adata) if "pagaFa" in doLayouts: pipeLog("Performing PAGA+ForceAtlas2") sc.tl.paga(adata, groups='louvain') sc.pl.paga(adata, show=True, layout="fa") sc.tl.draw_graph(adata, init_pos='paga', layout="fa") if "X_draw_graph_fa" not in obsmKeys: logging.warn("After paga, 'X_draw_graph_fa' not found in adata object! 
Older scanpy version?") else: adata.obsm["X_pagaFa"] = adata.obsm["X_draw_graph_fa"] if "pagaFr" in doLayouts: pipeLog("Performing PAGA+Fruchterman Reingold") sc.tl.paga(adata, groups='louvain') sc.pl.paga(adata, show=True, layout="fr") sc.tl.draw_graph(adata, init_pos='paga', layout="fr") if "X_draw_graph_fr" not in obsmKeys: logging.warn("After paga, 'X_draw_graph_fr' not found in adata object! Older scanpy version?") else: adata.obsm["X_pagaFr"] = adata.obsm["X_draw_graph_fr"] otherLayouts = set(doLayouts).difference(set(["tsne", "phate", "umap", "pagaFa", "pagaFr"])) for layoutCode in otherLayouts: pipeLog("Performing Layout '%s' = %s" % (layoutCode, coordLabels[layoutCode])) sc.tl.draw_graph(adata, layout=layoutCode) # Finding Top Markers, sorted by z-score if conf["doMarkers"]: nGenes = conf["markerCount"] pipeLog('Finding top markers for each cluster') adata = runSafeRankGenesGroups(adata, clusterField, minCells=5) if bigDataset: pipeLog("Not doing rank_genes plot, big dataset") else: sc.pl.rank_genes_groups(adata, n_genes=nGenes) #Stop Timer stop= timeit.default_timer() pipeLog("Running time: %f" % (stop-start)) sys.stdout = sys.stdout.terminal import scanpy conf["ScanpyVersion"] = scanpy.__version__ return adata, conf def mustBePython3(): if not isPy3: print("Unsupported Python version.") print("The cellbrowser works on almost any Python version, but Scanpy requires Python3.") print("This script has been installed and is running under this Python: %s" % sys.executable) print("Most likely it has not been installed with a Python3-pip.") print("You will have to install cellbrowser again using a pip command that is") print("using Python3 where Scanpy can be installed.") print("") print("To remove confusion below, it is easier to first remove the current cellbrowser package:") print(" pip uninstall cellbrowser") print("") print("Then use a Python3-pip to re-install the cellbrowser package.") print("On mixed Python2/3 systems, the command is sometimes called 'pip3' (see: 'apt-get install python3-pip').") print("In conda environments, you can create a Python3 environment and activate it with:") print(" conda create -n py36 python=3.6 anaconda && source activate py36") print("If using virtualenvs, you can do something like this:") print(" virtualenv -p /usr/bin/python3 ~/py3env") print(" source ~/py3env/bin/activate") print("On OSX and brew, install python3 and adapt the PATH:") print(" brew install python3 && export PATH=/usr/local/opt/python/libexec/bin:$PATH") print("After this, check that your default python is python3 and pip is using python3 with:") print(" python --version") print(" pip --version") print("") print("Then install cellbrowser using the new pip:") print(" pip install cellbrowser") print("Depending on your OS and setup you may have to remove the old version first and reinstall:") print(" pip2 uninstall cellbrowser") print(" pip3 install cellbrowser --force-reinstall --upgrade") print("and re-run this command") print("") print("As usual, if you're not root and not on conda/virtualenv, you may need to add --user for pip") print("Once this is all done, install scanpy.") sys.exit(1) def generateDataDesc(datasetName, outDir, algParams=None, other=None): " write a desc.conf to outDir " outFname = join(outDir, "desc.conf") if isfile(outFname): logging.info("Not writing %s, already exists" % outFname) return inFname = findPkgFile("sampleConfig/desc.conf") sampleData = open(inFname).read() ofh = open(outFname, "wt") ofh.write(sampleData) # always overwrite the parameters if 
algParams: ofh.write("algParams = %s\n" % repr(dict(algParams))) if other: for key, val in other.items(): ofh.write("%s = %s\n" % (key, repr(val))) ofh.close() def copyFileIfDiffSize(inFname, outFname): if not isfile(outFname): logging.info("File doesn't exist, copying %s to %s" % (inFname, outFname)) shutil.copyfile(inFname, outFname) elif getsize(inFname)!=getsize(outFname): logging.info("File size difference, copying %s to %s" % (inFname, outFname)) shutil.copyfile(inFname, outFname) else: logging.info("identical and same size, not copying %s to %s" % (inFname, outFname)) def copyTsvMatrix(matrixFname, outMatrixFname): " copy one file to another, but only if both look like valid input formats for cbBuild " if isMtx(matrixFname) or ".h5" in matrixFname: logging.error("Cannot copy %s to %s, as not a text-based file like tsv or csv. You will need to do the conversion yourself manually or with cbTool.") return copyFileIfDiffSize(matrixFname, outMatrixFname) def generateQuickGenes(outDir): " make a quickGenes.tsv in outDir from markers.tsv " outFname = join(outDir, "quickGenes.tsv") markerFname = join(outDir, "markers.tsv") logging.info("Generating %s from %s" % (outFname, markerFname)) clusters = parseMarkerTable(markerFname, None)[0] genesPerCluster = int(round(18 / len(clusters))) # guess a reasonable number of genes per cluster, ~ 18 genes in total maxGenes = None if genesPerCluster == 0: genesPerCluster = 1 maxGenes = 30 quickGenes = defaultdict(list) for clusterName, rows in iterItems(clusters): for row in rows[:genesPerCluster]: sym = row[1] quickGenes[sym].append(clusterName) if maxGenes is not None and len(quickGenes) > maxGenes: logging.info("Stopping at 30 genes to keep size of quickGenes file reasonable") break ofh = open(outFname, "w") for sym, clusterNames in iterItems(quickGenes): ofh.write("%s\t%s\n" % (sym, ", ".join(clusterNames))) ofh.close() def checkDsName(datasetName): " make sure that datasetName contains only ASCII chars and - or _, errAbort if not " msg = "The dataset name '%s' is not a valid name . " \ "Valid names can only contain letters, numbers and dashes. " \ "If this dataset was imported from a h5ad or a Seurat object, which can provide dataset names, " \ "you can use the option -n of the command line tools to override the name. " % datasetName if datasetName.startswith("-"): errAbort(msg+"Dataset name cannot start with a dash. (forgot to supply an argument for -n?)") if "/" in datasetName: errAbort(msg+"Dataset name cannot contain slashes, these are reserved for collections") match = re.match("^[a-z0-9A-Z-_]*$", datasetName) if match is None: errAbort(msg+"Dataset name can only contain lower or uppercase letters or dash or underscore") def importScanpy(): try: logging.info("Loading Scanpy libraries") import scanpy as sc except: print("Cannot run 'import scanpy' in python. 
") print("The Python package 'scanpy' or one of its dependencies is not installed ") print("in the default Python interpreter on this machine, '%s'" % sys.executable) print("Please install it following the instructions at https://scanpy.readthedocs.io/en/latest/installation.html") print("The scanpy authors recommend the miniconda-based installation with these commands") print("$ conda install seaborn scikit-learn statsmodels numba pytables") print("$ conda install -c conda-forge python-igraph leiden") print("Then re-run this command.") from traceback import format_exc print(format_exc()) sys.exit(1) return sc def getMatrixFormat(options): " return matrix format, either from options object or from config file " matrixFormat = options.matrixFormat # command line has priority if matrixFormat is None: matrixFormat = getConfig("matrixFormat", matrixFormat) return matrixFormat def cbScanpyCli(): " command line interface for cbScanpy " mustBePython3() global options args, options = cbScanpy_parseArgs() if options.init: copyPkgFile("sampleConfig/scanpy.conf") sys.exit(0) importScanpy() matrixFname = options.exprMatrix metaFname = options.meta outDir = options.outDir confFname = options.confFname inCluster = options.inCluster copyMatrix = options.copyMatrix skipMatrix = options.skipMatrix skipMarkers = options.skipMarkers datasetName=options.name if datasetName is None: datasetName = basename(dirname(abspath(outDir))) logging.info("no dataset name provided, using '%s' as dataset name" % datasetName) matrixFormat = getMatrixFormat(options) checkDsName(datasetName) if copyMatrix and not matrixFname.endswith(".gz"): errAbort("If you use the --copyMatrix option, the input matrix must be gzipped. Please run 'gzip %s' and then re-run cbScanpy" % matrixFname) if copyMatrix and matrixFname.endswith(".csv.gz"): errAbort("If you use the --copyMatrix option, the input matrix cannot be a .csv.gz file. Please convert to .tsv.gz") makeDir(outDir) figDir = join(outDir, "figs") adFname = join(outDir, "anndata.h5ad") matrixOutFname = join(outDir, "exprMatrix.tsv.gz") logFname = join(outDir, "cbScanpy.log") if isfile(logFname): os.remove(logFname) adata, params = cbScanpy(matrixFname, metaFname, inCluster, confFname, figDir, logFname, skipMarkers) logging.info("Writing final result as an anndata object to %s" % adFname) # work around scanpy bug for objects that were converted from seurat, https://github.com/theislab/scvelo/issues/255 if "_raw" in dir(adata) and "_var" in dir(adata.raw) and "_index" in list(adata.raw.var.columns): adata._raw._var.rename(columns={'_index': 'features'}, inplace=True) if params.get("doH5ad", True): adata.write(adFname) scanpyToCellbrowser(adata, outDir, datasetName=datasetName, skipMarkers=skipMarkers, clusterField=inCluster, skipMatrix=(copyMatrix or skipMatrix), matrixFormat=matrixFormat, useRaw=True, atac=False) if copyMatrix: outMatrixFname = join(outDir, "exprMatrix.tsv.gz") copyTsvMatrix(matrixFname, outMatrixFname) generateDataDesc(datasetName, outDir, other = {"supplFiles": [{"label":"Scanpy H5AD", "file":basename(matrixFname)}]}) copyFileIfDiffSize(matrixFname, join(outDir, basename(matrixFname))) def readGenesBarcodes(geneFname, barcodeFname): " read the features.tsv or barcodes.tsv file, return two lists " genes = [] sep = sepForFile(geneFname) for l in openFile(geneFname): row = l.rstrip("\r\n").split(sep) # field 3 is "Gene Expression" in cr3 if len(row)>1: geneId, sym = row[:2] # we have seen features.tsv files where geneId==sym which switched off gene ID resolution # e.g. 
def readGenesBarcodes(geneFname, barcodeFname):
    " read the features.tsv and barcodes.tsv files, return two lists: genes and barcodes "
    genes = []
    sep = sepForFile(geneFname)
    for l in openFile(geneFname):
        row = l.rstrip("\r\n").split(sep)
        # field 3 is "Gene Expression" in cr3
        if len(row)>1:
            geneId, sym = row[:2]
            # we have seen features.tsv files where geneId==sym, which switched off gene ID resolution,
            # e.g. the t004 dataset
            if geneId!=sym:
                genes.append(geneId+"|"+sym)
            else:
                genes.append(geneId)
        else:
            geneId = row[0]
            genes.append(geneId)

    barcodes = [l.strip() for l in openFile(barcodeFname) if l!="\n"]
    return genes, barcodes

def open10xMtxForRows(mtxFname, geneFname, barcodeFname):
    """ open the three files required for 10x matrices and return (mat, genes, barcodes).
    Also convert the csc matrix to a csr matrix, so that row access is fast.
    """
    import scipy.io
    genes, barcodes = readGenesBarcodes(geneFname, barcodeFname)

    logging.info("Loading expression matrix from %s..." % mtxFname)
    mat = scipy.io.mmread(mtxFname)
    logging.info("Dimensions of matrix: %d , %d" % mat.shape)

    if mat.shape[0]==len(genes)-1:
        genes = genes[1:]
        logging.info("The genes file seems to have a GEO-like header, removed the first gene, gene count is now %d" % len(genes))
    if mat.shape[1]==len(barcodes)-1:
        barcodes = barcodes[1:]
        logging.info("The barcodes file seems to have a GEO-like header, removed the first barcode, count is now %d" % len(barcodes))

    if mat.shape[0]==len(barcodes):
        logging.info("Matrix looks like it's in transposed format (genes on columns), so transposing matrix now")
        mat = mat.transpose()
        logging.info("New dimensions of matrix: %d , %d" % mat.shape)

    logging.info("Converting matrix to row-based layout...")
    mat = mat.tocsr()

    assert(mat.shape[0]==len(genes)) # matrix gene count has to match gene tsv file line count. Does the genes file have a strange header?
    assert(mat.shape[1]==len(barcodes)) # matrix cell count has to match barcodes tsv file line count. Does the barcodes file have a strange header line?
    return mat, genes, barcodes
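# Minimal usage sketch (illustrative only, file names are hypothetical): since
# open10xMtxForRows() returns a CSR matrix, single-gene rows can be extracted cheaply:
#
#   mat, genes, barcodes = open10xMtxForRows("matrix.mtx", "features.tsv", "barcodes.tsv")
#   for i, gene in enumerate(genes):
#       row = mat[i].toarray().ravel()   # expression values of one gene across all cells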
def mtxToTsvGz(mtxFname, geneFname, barcodeFname, outFname, translateIds=False):
    " convert mtx to tab-sep without scanpy. gzip if needed "
    import numpy as np
    logging.info("Reading matrix from %s, %s and %s" % (mtxFname, geneFname, barcodeFname))
    genes, barcodes = readGenesBarcodes(geneFname, barcodeFname)
    logging.info("Read %d genes and %d barcodes" % (len(genes), len(barcodes)))

    mat, genes, barcodes = open10xMtxForRows(mtxFname, geneFname, barcodeFname)

    if translateIds:
        # translate gene IDs to symbols only after open10xMtxForRows(), which re-reads
        # the gene list and would otherwise overwrite the translated symbols
        geneToSym = readGeneSymbols(None, genes)
        genes = [geneToSym[geneId] for geneId in genes]

    tmpFname = outFname+".tmp"
    # could not find a cross-python way to open ofh for np.savetxt
    # see https://github.com/maximilianh/cellBrowser/issues/73 and numpy ticket referenced therein
    logging.info("Writing %s" % tmpFname)
    if isPy3:
        ofh = open(tmpFname, "w")
    else:
        ofh = open(tmpFname, "wb")

    ofh.write("gene\t")
    ofh.write("\t".join(barcodes))
    ofh.write("\n")

    for i in range(0, len(genes)):
        if i%1000==0:
            logging.info("%d genes written..." % i)
        ofh.write(genes[i])
        ofh.write("\t")
        arr = mat[i].toarray()
        if arr.dtype==np.int64:
            np.savetxt(ofh, arr, "%d", "\t", "\n")
        else:
            np.savetxt(ofh, arr, "%g", "\t", "\n")

    ofh.close()
    moveOrGzip(tmpFname, outFname)
    logging.info("Created %s" % outFname)

def getAllFields(ifhs, sep=None):
    " given a list of file handles, get all non-gene headers and return as a list of names "
    fieldToFname = {}
    allFields = []
    colCounts = []
    for ifh in ifhs:
        if sep is None:
            if ".csv" in ifh.name:
                fileSep = ","
            else:
                fileSep = "\t"
        else:
            fileSep = sep

        fields = ifh.readline().rstrip('\r\n').split(fileSep)
        fields = [f.strip('"') for f in fields] # R sometimes adds quotes
        colCounts.append(len(fields))

        for i, field in enumerate(fields):
            # make sure we have no field name overlaps
            if field in fieldToFname and i!=0:
                raise Exception("field %s seen twice, in %s and %s" % (field, ifh.name, fieldToFname[field]))
            fieldToFname[field] = ifh.name

            # only use the first field name (the cell identifier) from the first file
            if i==0:
                if len(allFields)==0:
                    allFields.append(field)
            else:
                allFields.append(field)

    return allFields, colCounts
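# Worked example (hypothetical files, not from the original source): given two open handles
# whose header lines are "cell<TAB>cluster" and "cell<TAB>age", getAllFields() returns
# (["cell", "cluster", "age"], [2, 2]) - the cell identifier column is kept only once.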

\n" % datasetName) cFname = join(outDir, "cellbrowser.conf") conf = None if isfile(cFname): conf = loadConfig(cFname) if "unit" in conf: ofh.write("Unit of expression matrix: %s

\n" % conf.get("unit", "unknown")) ofh.write("Cell meta annotations: meta.tsv

" % datasetName) markerFname = join(outDir, "markers.csv") if not isfile(markerFname): markerFname = join(outDir, "markers.tsv") if isfile(markerFname): baseName = basename(markerFname) ofh.write("Cluster Marker Genes: %s

\n" % (datasetName, baseName, baseName)) if conf: coordDescs = conf["coords"] for coordDesc in coordDescs: coordLabel = coordDesc["shortLabel"] cleanName = sanitizeName(coordLabel.replace(" ", "_")) coordFname = cleanName+".coords.tsv.gz" ofh.write("%s coordinates: %s
\n" % (coordLabel, datasetName, coordFname, coordFname)) rdsFname = join(datasetName, "seurat.rds") if isfile(rdsFname): ofh.write("Seurat R data file: seurat.rds

\n" % rdsFname) scriptFname = join(datasetName, "runSeurat.R") if isfile(scriptFname): ofh.write("Seurat R analysis script: runSeurat.R

\n" % scriptFname) logFname = join(datasetName, "analysisLog.txt") if isfile(logFname): ofh.write("Analysis Log File: analysisLog.txt

\n" % logFname) h5adFname = join(datasetName, "anndata.h5ad") if isfile(h5adFname): ofh.write("Scanpy Anndata HDF5 file: anndata.h5ad

\n" % h5adFname) ofh.close() logging.info("Wrote %s" % ofh.name) def generateHtmls(datasetName, outDir, desc=None): " generate desc.conf in outDir, if it doesn't exist " generateDataDesc(datasetName, outDir, {}, other=desc) if __name__=="__main__": args, options = main_parseArgs() cmd = args[0] if cmd=="cbServe": outDir, port = args[1:3] savePid() startHttpServer(outDir, int(port)) else: errAbort("Unknown command %s" % cmd)