7c04a101e90c131afccf4d973d2e76da384543b8 max Fri May 16 11:11:00 2025 -0700 do not stop on invalid genes in quickgenes file. skip invalid genes in marker file diff --git src/cbPyLib/cellbrowser/cellbrowser.py src/cbPyLib/cellbrowser/cellbrowser.py index 925a311..3c77bf4 100755 --- src/cbPyLib/cellbrowser/cellbrowser.py +++ src/cbPyLib/cellbrowser/cellbrowser.py @@ -2981,61 +2981,71 @@ errAbort("Your marker file has more than 1000 clusters. Are you sure that this is correct? The input format is (clusterName, geneSymName, Score), is it possible that you have inversed the order of cluster and gene?") # determine if the score field is most likely a p-value, needed for sorting revSort = True scoreHeader = headers[scoreIdx].lower() logging.debug("score field has name '%s'" % scoreHeader) if scoreHeader in ["p_val", "p-val", "p.val", "pval", "fdr"]: logging.debug("score field name '%s' looks like a p-Value, sorting normally" % scoreHeader) revSort = False for clusterName, rows in iterItems(data): rows.sort(key=operator.itemgetter(2), reverse=revSort) return data, newHeaders -def splitMarkerTable(filename, geneToSym, outDir): +def splitMarkerTable(filename, geneToSym, matrixGeneIds, outDir): """ split .tsv on first field and create many files in outDir with columns 2-end. Returns the names of the clusters and a dict topMarkers with clusterName -> list of five top marker genes. """ topMarkerCount = 5 if filename is None: return data, newHeaders = parseMarkerTable(filename, geneToSym) logging.debug("Splitting cluster markers into directory %s" % (outDir)) fileCount = 0 sanNames = set() topMarkers = {} for clusterName, rows in iterItems(data): logging.debug("Cluster: %s" % clusterName) sanName = sanitizeName(clusterName) assert(sanName not in sanNames) # after removing special chars, cluster names must still be unique. this is most likely due to typos in your meta annotation table. If it is not, we need to change the code here. sanNames.add(sanName) outFname = join(outDir, sanName+".tsv") logging.debug("Writing %s" % outFname) ofh = open(outFname, "w") ofh.write("\t".join(newHeaders)) ofh.write("\n") + missGeneIds = set() for row in rows: row[2] = "%0.5E" % row[2] # limit score to 5 digits + geneId = row[1] + if geneId not in matrixGeneIds: + missGeneIds.add(geneId) + continue + ofh.write("\t".join(row)) ofh.write("\n") + if len(missGeneIds)!=0: + logging.error("Marker table contains these genes, they were skipped, they are not in the matrix: %s" % (",".join(missGeneIds))) + #logging.error("Use --force to accept this.") + topSyms = [row[1] for row in rows[:topMarkerCount]] topMarkers[clusterName] = topSyms ofh.close() runGzip(outFname) fileCount += 1 logging.info("Wrote %d .tsv.gz files into directory %s" % (fileCount, outDir)) return data.keys(), topMarkers def syncFiles(inFnames, outDir): " compare filesizes and copy to outDir, if needed " logging.info("Syncing %s to %s" % (inFnames, outDir)) for inFname in inFnames: @@ -3560,38 +3570,42 @@ if len(line)==0: continue hasDesc = False hasPmid = False if line.startswith("symbol"): continue row = line.split(sep) geneOrSym = row[0] # case 1: user provides both geneId and symbol. Rare. # Necessary when symbol <-> geneId is not unique if "|" in geneOrSym: geneId, sym = geneOrSym.split("|") if geneId not in matrixGeneIds: - errAbort("geneId %s in quickgenes file is not in expression matrix" % repr(geneId)) + logging.info("case 1: geneId %s in quickgenes file is not in expression matrix" % repr(geneId)) + continue geneStr = geneOrSym # case 2: matrix has only symbols and user provides symbol. This is our legacy format for old datasets. # store only the symbol. We could look up the geneId but that's data inference, # which we try not to do. The lookup could be wrong. - elif matrixSyms is not None and geneOrSym in matrixSyms and len(matrixGeneIds)==0: + elif matrixSyms is not None and geneOrSym in matrixSyms: geneStr = geneOrSym + if geneStr not in matrixGeneIds: + logging.info("case 2: geneId %s in quickgenes file is not in expression matrix" % repr(geneStr)) + continue # case 3: matrix has geneIds and user provides a geneId. add the symbol from our mapping # that's data inference, but that should be OK elif geneOrSym in matrixGeneIds: geneId = geneOrSym if not geneToSym: logging.info("quick gene %s has a geneId but we have no geneId/symbol table. You can use " "the format geneId|symbol in the quick genes file to manually assign a label" % repr(geneId)) geneStr = geneId else: if geneId in geneToSym: geneStr = geneId+"|"+geneToSym[geneId] else: geneStr = geneId @@ -3609,32 +3623,33 @@ errAbort("Gene %s in quickgenes file is neither a symbol nor a geneId" % repr(geneOrSym)) if sym in nonUniqueSyms: logging.warn("Symb %s in quick genes files is not unique. Which of these genes do you mean: %s. " "Use geneId|sym in quickgenes file to solve this." % (sym, ",".join(nonUniqueSyms[sym]))) geneStr = geneId+"|"+sym # case 5: it is an ATAC dataset and the quickgenes file has ranges elif ":" in geneOrSym and "-" in geneOrSym: #geneStr = geneOrSym.replace(":", "|").replace("-", "|") geneStr = findOverlappingRanges(matrixGeneIds, geneOrSym) if geneStr=="": logging.error("quickGene ATAC range %s does not contain any peak in the matrix, skipping it" % geneOrSym) continue else: - errAbort("Gene '%s' in quickgenes file is not in expr matrix and there is no geneId<->symbol mapping " + logging.info("Gene %s in quickgenes file is not in expr matrix and there is no geneId<->symbol mapping " "to resolve it to a geneId in the expression matrix and it is not an ATAC range" % repr(geneOrSym)) + continue # if we had no geneToSym, we'll check the symbol later if it's valid info = [geneStr] if len(row)>1: info.append(row[1]) if len(row)>2: info.append(row[2]) geneInfo.append(info) return geneInfo def readSampleNames(fname): " read only the first column of fname, strip the headers " @@ -3958,55 +3973,55 @@ notInLabels = markerClusters - labelClusters notInMarkers = labelClusters - markerClusters if len(notInMarkers)!=0: msg = ("%s: the following cluster names are in the meta file but not in the marker file: %s. "+ "Please fix one of the files, clicks onto a label will otherwise not work.") % (markerFname, notInMarkers) if doAbort: errAbort(msg) else: logging.warn(msg) if len(notInLabels)!=0: logging.warn(("%s: the following cluster names are in the marker file but not in the meta file: %s. "+ "Users may not notice the problem, but it may indicate an erroneous meta data file.") % \ (markerFname, notInLabels)) -def convertMarkers(inConf, outConf, geneToSym, clusterLabels, outDir): +def convertMarkers(inConf, outConf, geneToSym, clusterLabels, matrixGeneIds, outDir): """ split the marker tables into one file per cluster and add filenames as 'markers' in outConf also add the 'topMarkers' to outConf, the top five markers for every cluster. """ markerFnames = [] if "markers" in inConf: markerFnames = makeAbsDict(inConf, "markers") newMarkers = [] #doAbort = True # only the first marker file leads to abort, we're more tolerant for the others doAbort = False # temp hack # because of single cell cluster filtering in cbScanpy topMarkersDone = False for markerIdx, markerInfo in enumerate(markerFnames): if type(markerInfo)!=type(dict()): errAbort("The 'markers' setting in cellbrowser.conf is not a dictionary but must be a dictionary with keys 'file' and 'label'.") markerFname = markerInfo["file"] markerLabel = markerInfo["shortLabel"] clusterName = "markers_%d" % markerIdx # use sha1 of input file ? markerDir = join(outDir, "markers", clusterName) makeDir(markerDir) - clusterNames, topMarkers = splitMarkerTable(markerFname, geneToSym, markerDir) + clusterNames, topMarkers = splitMarkerTable(markerFname, geneToSym, matrixGeneIds, markerDir) # only use the top markers of the first marker file if not topMarkersDone: outConf["topMarkers"] = topMarkers topMarkersDone = True checkClusterNames(markerFname, clusterNames, clusterLabels, doAbort) doAbort = False newDict = {"name" : sanitizeName(clusterName), "shortLabel" : markerLabel} if "selectOnClick" in markerInfo: newDict["selectOnClick"] = markerInfo["selectOnClick"] newMarkers.append( newDict ) outConf["markers"] = newMarkers @@ -4048,62 +4063,53 @@ if "|" in g: parts = g.split("|") geneId = parts[0] sym = parts[1] syms.append( sym ) geneIds.append(geneId) hasBoth = True geneToSym[geneId] = sym else: symsOrGeneIds.append( g ) if hasBoth: logging.debug("Matrix has both geneIds and symbols") geneIds.extend(symsOrGeneIds) else: - #logging.debug("Matrix does not have both geneIds and symbols, it contains either geneIds only or symbols only") - #if areProbablyGeneIds(syms): - #logging("80% look like geneIds: Using only the identifiers from the matrix") - #geneIds = symsOrGeneIds - #syms = geneToSym.values() - #else: - #logging("matrix identifiers do not look like geneIds: assume they are all symbols") logging.debug("Matrix does not have both geneIds and symbols, assuming it contains only geneIds") geneIds = symsOrGeneIds syms = [] if len(geneToSym)==0: logging.info("There are no gene/symbol pairs in the matrix") geneToSym = None symSet = set(syms) #symCounts = Counter(syms).most_common() #for sym, count in symCounts: #if count > 1: #logging.warn("Symbol %d is not unique" % count) return symSet, set(geneIds), geneToSym -def readQuickGenes(inConf, geneToSym, outDir, outConf): +def readQuickGenes(inConf, geneToSym, matrixSyms, matrixGeneIds, geneToSymFromMatrix, outDir, outConf): " read quick genes file and make sure that the genes in it are in the matrix " quickGeneFname = inConf.get("quickGenesFile") if not quickGeneFname: return - matrixSyms, matrixGeneIds, geneToSymFromMatrix = readValidGenes(outDir, inConf) - # prefer the symbols from the matrix over our own symbol tables if geneToSymFromMatrix is not None: logging.info("Matrix has gene symbols, these are used for quick gene file parsing") geneToSym = geneToSymFromMatrix fname = getAbsPath(inConf, "quickGenesFile") quickGenes = parseGeneInfo(geneToSym, fname, matrixSyms, matrixGeneIds) outConf["quickGenes"] = quickGenes logging.info("Read %d quick genes from %s" % (len(quickGenes), fname)) def getFileVersion(fname): data = {} data["fname"] = fname addMd5(data, fname, shortMd5=False) @@ -4656,33 +4662,35 @@ convertExprMatrix(inConf, outMatrixFname, outConf, sampleNames, geneToSym, datasetDir, needFilterMatrix) # in case we crash after this, keep the current state of the config, as matrix ops are so slow writeConfig(inConf, outConf, datasetDir) else: logging.info("Matrix and meta sample names have not changed, not indexing matrix again") convertTraces(inConf, sampleNames, datasetDir, outConf) coordFiles, clusterLabels = convertCoords(inDir, inConf, outConf, sampleNames, outMetaFname, datasetDir) foundConf = writeDatasetDesc(inConf["inDir"], outConf, datasetDir, coordFiles, outMatrixFname) if geneToSym==-1: geneToSym = readGeneSymbols(inConf.get("geneIdType"), inMatrixFname) - convertMarkers(inConf, outConf, geneToSym, clusterLabels, datasetDir) + matrixSyms, matrixGeneIds, geneToSymFromMatrix = readValidGenes(datasetDir, inConf) + + convertMarkers(inConf, outConf, geneToSym, clusterLabels, matrixGeneIds, datasetDir) - readQuickGenes(inConf, geneToSym, datasetDir, outConf) + readQuickGenes(inConf, geneToSym, matrixSyms, matrixGeneIds, geneToSymFromMatrix, datasetDir, outConf) copyBackgroundImages(inDir, inConf, outConf, datasetDir) # a few settings are passed through to the Javascript as they are for tag in ["shortLabel", "radius", "alpha", "priority", "tags", "sampleDesc", "geneLabel", "clusterField", "defColorField", "xenaPhenoId", "xenaId", "hubUrl", "showLabels", "ucscDb", "unit", "violinField", "visibility", "coordLabel", "lineWidth", "hideDataset", "hideDownload", "metaBarWidth", "supplFiles", "defQuantPal", "defCatPal", "clusterPngDir", "wrangler", "shepherd", "binStrategy", "split", "lineAlpha", "lineWidth", "lineColor", # the following are there only for old datasets, they are now nested under "facets" # they are just here for backwards-compatibility and will eventually get removed "body_parts", "organisms", "diseases", "projects", "life_stages", "domains", "sources", "assays", # facets are taking their place now "facets", "multiModal"]: