175c26e8907957b668222471a3b33ee0a13b602c
mspeir
  Fri Jul 18 09:04:53 2025 -0700
fixing cbHub to output same number of barChartColors as there are clusters

diff --git src/cbPyLib/cellbrowser/hubmaker.py src/cbPyLib/cellbrowser/hubmaker.py
index 449789e..5c60e9c 100755
--- src/cbPyLib/cellbrowser/hubmaker.py
+++ src/cbPyLib/cellbrowser/hubmaker.py
@@ -1,1065 +1,1063 @@
 # create a track hub from an expression matrix
 
 import subprocess, colorsys, hashlib
 import logging, sys, optparse, re, unicodedata, string, glob, distutils.spawn, gzip
 from collections import defaultdict, namedtuple
 from os.path import join, basename, dirname, isfile, isdir, splitext, abspath
 import os
 
 import sys
 from .cellbrowser import setDebug, lineFileNextRow, getSizesFname, readGeneSymbols, parseGeneLocs
 from .cellbrowser import MatrixTsvReader, runCommand, copyPkgFile, loadConfig, iterItems, getStaticFile
 from .cellbrowser import errAbort, getConfig, getAliasFname, makeDir
 
 CBEMAIL = os.getenv("CBEMAIL", None)
 
 # We do not require numpy but numpy is around 30-40% faster in processing arrays
 # So use it if it's present
 numpyLoaded = True
 try:
     import numpy as np
 except:
     numpyLoaded = False
     logging.debug("Numpy could not be loaded. This is fine but matrix conversion may be 2/3 slower.")
 #numpyLoaded = False
 
 # ==== functions =====
     
def cbHub_parseArgs():
    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
    parser = optparse.OptionParser("""usage: %prog [options] - create a UCSC track hub for a single cell dataset

    Run the program with "--init" to write a sample hub.conf into the current directory.
    Adapt this file to your needs.

    Call the program like this if there is hub.conf in the current dir:
    %prog -o hub
    It will read hub.conf, create hub/hub.txt and write the job scripts.
    You can then run the job scripts as described below.

    Run the UCSC tool hubCheck on the resulting hub.txt to make sure that all jobs
    have successfully completed.

    You can override some settings from cellbrowser.conf with command line options:
    %prog -m meta.tsv -c Cluster -m exprMatrix.tsv.gz -o hub

    ONLY if 'bamDir' has been specified in cellbrowser.conf:
      | %prog will create one shell script per cell cluster, with names like 'job-xxx.sh',
      | in the current directory. To run these scripts in parallel on a big server,
      | e.g. with 10 processes use the Unix command "parallel":
      |   ls *.sh | parallel -j 10 bash
      | Each job will write a log of stdout/stderr to files named e.g. job-1.log
      | Requirements that have to be installed on this machine with binaries in PATH:
      | - samtools https://github.com/samtools/samtools
      | - wiggletools https://github.com/Ensembl/WiggleTools
      | - intronProspector https://github.com/diekhans/intronProspector
      | - UCSC tools wigToBigWig, bedToBigBed, chromToUcsc from http://hgdownload.soe.ucsc.edu/admin/exe/
      |
      | To link meta.tsv with bamDir files:
      | The cellId from meta.tsv will be used to try to find the BAM file in bamDir,
      | by stripping off .bam and/or using just the file name before the first dot.
    Otherwise:
      - %prog requires bedToBigBed in the current PATH, see http://hgdownload.soe.ucsc.edu/admin/exe/
    """)

    parser.add_option("", "--init", dest="init", action="store_true", \
            help="write a sample hub.conf to the current directory")

    parser.add_option("-i", "--inConf", dest="inConf", action="store", \
            help="a hub.conf input file to read all options from. (settings can also be specified via cellbrowser.conf in the current directory.) default %default", \
            default = "hub.conf")

    #parser.add_option("-g", "--genome", dest="genome", action="store", \
            #help="a ucsc assembly identifier, like hg19, hg38 or mm10")

    parser.add_option("-m", "--metaFname", dest="meta", action="store", \
            help="a csv or tsv matrix, one row per cell")

    parser.add_option("-e", "--exprMatrix", dest="exprMatrix", action="store", \
            help="exprMatrix is a tsv or csv expression matrix, one line per cell")

    parser.add_option("-c", "--clusterField", dest="clusterField", action="store", \
            help="field in expr matrix that contains the cluster name")

    parser.add_option("-o", "--outDir", dest="outDir", action="store", \
            help="the output directory for the hub, default is %default")

    parser.add_option("", "--stat", dest="stat", action="store", \
            help="how to summarize data values of a cluster, one of: median, mean, percentile, nonzero. default is %default", default="median")

    # --perc is only consulted when --stat percentile was chosen
    parser.add_option("", "--perc", dest="percentile", action="store", \
            help="if stat is 'percentile', which percentile to use. a number 0-100. default is %default", default=90,
            type='int')

    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    #parser.add_option("", "--fixDot", dest="fixDot", action="store_true", help="replace dots in cell meta IDs with dashes (for R)")
    #parser.add_option("-t", "--geneType", dest="geneType", help="type of gene IDs in expression matrix. values like 'symbols', or 'gencode22', 'gencode28' or 'gencode-m13'.")
    #parser.add_option("", "--bamDir", dest="bamDir", help="directory with BAM files, one per cell. Merges small BAM files into one per cell cluster.")
    #parser.add_option("", "--clusterOrder", dest="clusterOrder", help="file with cluster names in the order that they should appear in the track. default is alphabetical order.")
    parser.add_option("-s", "--skipBarchart", dest="skipBarchart", help="do not create the bar chart graph", action="store_true")
    #parser.add_option("", "--name", dest="name", help="name of track hub.")
    #parser.add_option("", "--email", dest="email", help="contact email for track hub. Default is %default, taken from the env. variable CBEMAIL", default=CBEMAIL)
    #parser.add_option("-f", "--file", dest="file", action="store", help="run on file")
    #parser.add_option("", "--test", dest="test", action="store_true", help="do something")
    (options, args) = parser.parse_args()

    # -o/--outDir is the only required option, unless --init was requested
    #if not options.exprMatrix and not isfile(options.inConf) and not options.init :
    if not options.outDir and not options.init:
        parser.print_help()
        exit(1)

    setDebug(options.debug)
    return args, options
 
 def parseClustersFromMeta(metaFname, clusterFieldName, fixDot):
     " parse cluster -> cellId assignment from meta file, return as dict clusterName -> list of cellIds "
     logging.info("Parsing and using first field as the cell ID in file %s" % metaFname)
 
     # in some scripts, meta.tsv has only two columns and no headers. Use stupid heuristics to detect this
     noHeader = False
     for row in lineFileNextRow(metaFname):
         if len(row)==2 and not row[0]=="" and not "ell" in row[0] and not "rcode" in row[0]:
             noHeader = True
         break
 
     clusterToCells = defaultdict(list)
     metaCellIds = set()
     skipCount = 0
 
     for row in lineFileNextRow(metaFname, noHeaders=noHeader):
         clusterName = row._asdict()[clusterFieldName]
         cellId = row[0]
         if fixDot:
             cellId = cellId.replace(".", "-")
 
         # skip all cells assigned to the "" cluster
         if clusterName=="":
             skipCount +=1
             continue
 
         if cellId in metaCellIds:
             errAbort("duplicated cell ID %s in meta data" % cellId)
 
         metaCellIds.add(cellId)
         clusterToCells[clusterName].append(cellId)
 
     logging.info("Got %d clusters and %d cell IDs assigned to them" % (len(clusterToCells), len(metaCellIds)))
     if skipCount!=0:
         logging.info("Skipped %d meta rows with empty cluster names" % skipCount)
     return clusterToCells
 
 # ----------- main --------------
 
 def writeHubStanza(tfh, hubName, db, email):
     "  write a single-file hub.txt "
     tfh.write("""hub scHub
 shortLabel %s
 longLabel %s
 useOneFile on
 email %s
 descriptionUrl hub.txt
 
 genome %s
 
 """ % (hubName, hubName, email, db))
 
 def writeBarChartDesc(hubDir, hubName, hubUrl, refHtmlFname):
     " write the desc. html pages for the barchart track "
     htmFname = join(hubDir, "barChart.html")
     ofh = open(htmFname, "w")
 
     hubBase = hubUrl.rsplit("/")[0]
     refHtml = ""
     if refHtmlFname is not None:
         refHtml = open(refHtmlFname).read()
 
     ofh.write("""
 <h2>Description</h2>
 <p>This track shows the median value of expression, for all cells from each cell cluster.
 
 <h2>Display Conventions and Configuration</h2>
 <p>This track is a <a href='https://genome.ucsc.edu/goldenPath/help/barChart.html'>barChart Graph</a> track.
 
 <h2>Methods</h2>
 <p>The annotation files that associate cells to cluster names were parsed. Then the expression matrix was read.
 For each gene, the expresssion values for all cells from a cluster were read and sorted.
 The median of this sorted list was written to a bigBed file.</p>
 
 <h2>Data Access</h2>
 <p>The bigBed file can be downloaded from the <a href='{hubBase}'>track hub directory</a>.</p>
 
 <h2>Credits</h2>
 <p>Thanks for the authors of the dataset for making their data available.</p>
 
 <h2>References</h2>
 <p>
 {refHtml}
 </p>""".format(**locals()))
 
     ofh.close()
 
 def writeBamDescPages(hubDir, hubName, hubUrl, refHtmlFname):
     " write the desc. html pages for the bam-related tracks "
     saneHubName = sanitizeName(hubName)
     readDesc = join(hubDir, saneHubName+".html")
     readFh = open(readDesc, "w")
 
     hubBase = hubUrl.rsplit("/")[0]
     refHtml = ""
     if refHtmlFname is not None:
         refHtml = open(refHtmlFname).read()
 
     readFh.write("""
 <h2>Description</h2>
 <p>This track shows the alignment of sequencing reads to the genome, their coverage and the splice junctions in them. This can be helpful for quality checking of cluster markers, when designing in-situ probes or when trying to determine how specific a transcript is for a given cell type.
 
 <h2>Display Conventions and Configuration</h2>
 
 <p>This track has multiple "Views", which can be configured independently. See
 the <a href="/goldenPath/help/multiView.html">documentation on Multi-View
 tracks</a>. There are three different types of view sub tracks in this track:</a>
 
 <p><b>BAM Reads:</b> Alignable regions are shown in black, unalignable regions
 between two alignable ones shown with thin lines. For configuration options,
 see <a href="https://genome.ucsc.edu/goldenpath/help/hgBamTrackHelp.html">the
 BAM tracks help page</a>. By default, these tracks are hideen, set any of them to "squish" or "pack" (=clickable) on the track configuration page to see the reads. Click reads to show the alignment and read group.</p>
 
 <p><b>Coverage:</b> bar graphs indicate the number of reads at this base pair.
 You may want to switch on auto-scaling of the y axis. For configuration
 options, see <a
 href="https://genome.ucsc.edu/goldenpath/help/hgWiggleTrackHelp.html">the graph
 tracks configuration help page</a>. These tracks are shown in "dense" by default, set any of the tracks to "full" to see the detailed coverage plot.</p>
 
 <p><b>Splice Junctions:</b> thick rectangles show exons around a splice site,
 connected by a line that indicates the intron. These gaps are shown and are
 annotated with the number of reads, in the 'score' field. You can use the
 'score' filter on the track configuration page to show only introns with a
 certain number of supporting reads. The maximum number of reads that are shown
 is 1000, even if more reads support an intron. These tracks are shown in dense by default, set this track to "pack" to see. Then click the splice junctions to see their score.</p>
 
 <h2>Methods</h2>
 <p>BAM files were provided by the data submitters, one (single end) or two files (paired end) per cell. The BAM alignments were used as submitted. They were merged with "samtools merge" into a single BAM file per cluster. The readgroup (RG) BAM tag indicates the original cell.</p>
 
 <p>From the resulting merged BAM file, coverage was obtained using "wiggletools coverage" a tool written by Daniel Zerbino and the result was converted with the UCSC tool "wigToBigWig".</p>
 
 <p>Also on the merged BAM file, the software IntronProspector was run with default settings. It retains reads with a gap longer than 70 bp and shorter than 500 kbp and merges them into annotated splice junctions.</p>
 
 <h2>Data Access</h2>
 <p>The merged BAM files, coverage bigWig files and splice junctions in bigBed format can be downloaded from the <a href='{hubBase}'>track hub directory</a>.</p>
 
 <p>Since the splice junction .bigBed files have their scores capped at 1000, the original IntronProspector .bed files can also be downloaded from the <a href='{hubBase}'>track hub directory</a>. You can also find there *.calls.tsv files with more details about each junction, e.g. the number of uniquely mapping reads.</p>
 
 <h2>Credits</h2>
 <p>WiggleTools was written by Daniel Zerbino, IntronProspector was written by Mark Diekhans, track hubs were written to a large extent by Brian Raney.</p>
 
 <h2>References</h2>
 <p>
 Zerbino DR, Johnson N, Juettemann T, Wilder SP, Flicek P.
 <a href="https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btt737"
 target="_blank">
 WiggleTools: parallel processing of large collections of genome-wide datasets for visualization and
 statistical analysis</a>.
 <em>Bioinformatics</em>. 2014 Apr 1;30(7):1008-9.
 PMID: <a href="https://www.ncbi.nlm.nih.gov/pubmed/24363377" target="_blank">24363377</a>; PMC: <a
 href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3967112/" target="_blank">PMC3967112</a>
 </p>
 
 <p>
 Mark Diekhans,
 <a href="https://github.com/diekhans/intronProspector"
 target="_blank">
 IntronProspector GitHub Repository </a>.
 <em>Github</em> 2018
 </p>
 
 <p>
 {refHtml}
 </p>""".format(**locals()))
     readFh.close()
 
 def writeParentStanzas(tfh, saneHubName, hubName, cellCount):
     " write tdb entries for the composite parent tracks "
     tfh.write("""track %s\n""" % (saneHubName))
     tfh.write("""shortLabel %s\n""" % (hubName))
     tfh.write("""longLabel %s - %d cells\n""" % (hubName, cellCount))
     tfh.write("compositeTrack on\n")
     tfh.write("subGroup1 view Views reads=Reads cov=Coverage junc=Junctions\n")
     tfh.write("type bed 3\n")
     tfh.write("visibility dense\n")
     tfh.write("\n")
 
     tfh.write("""track %sViewReads\n""" % (saneHubName))
     tfh.write("""shortLabel Reads\n""")
     tfh.write("""longLabel %s Reads - %d cells\n""" % (hubName, cellCount))
     tfh.write("parent %s\n" % saneHubName)
     tfh.write("type bam\n")
     tfh.write("visibility hide\n")
     tfh.write("view reads\n")
     tfh.write("\n")
 
     tfh.write("""track %sViewCov\n""" % (saneHubName))
     tfh.write("""shortLabel Coverage\n""")
     tfh.write("""longLabel %s Coverage - %d cells\n""" % (hubName, cellCount))
     tfh.write("parent %s\n" % saneHubName)
     tfh.write("type bigWig\n")
     tfh.write("visibility dense\n")
     tfh.write("view cov\n")
     tfh.write("\n")
 
     tfh.write("""track %sViewJunc\n""" % (saneHubName))
     tfh.write("""shortLabel Splicing\n""")
     tfh.write("""longLabel %s Splice Junctions - %d cells\n""" % (hubName, cellCount))
     tfh.write("parent %s\n" % saneHubName)
     tfh.write("spectrum on\n")
     tfh.write("type bigBed 12\n")
     tfh.write("visibility dense\n")
     tfh.write("view junc\n")
     tfh.write("\n")
 
     #tfh.write("""track %sCov\n""" % (saneHubName))
     #tfh.write("""shortLabel %s Coverage\n""" % (hubName))
     #tfh.write("""longLabel %s Read Coverage - %d cells\n""" % (hubName, cellCount))
     #tfh.write("compositeTrack on\n")
     #tfh.write("type bigWig\n")
     #tfh.write("visibility dense\n")
     #tfh.write("\n")
 
     #tfh.write("""track %sJunc\n""" % (saneHubName))
     #tfh.write("""shortLabel %s Introns\n""" % (hubName))
     #tfh.write("""longLabel %s Intron Support - %d cells\n""" % (hubName, cellCount))
     #tfh.write("compositeTrack on\n")
     #tfh.write("type bigBed 12 +\n")
     #tfh.write("visibility dense\n")
     #tfh.write("\n")
 
 def findProgram(name):
     " search PATH for binary and return full path "
     progPath = distutils.spawn.find_executable(name)
     if progPath==None:
         errAbort("Cannot find program '%s' in PATH." % name)
     return progPath
 
 def writeJobScript(jobFn, cmds):
     " write commands to a shell script with file name "
     jobFh = open(jobFn, "w")
     logFname = splitext(basename(jobFn))[0]+'.log'
 
     jobFh.write("#!/bin/bash\n")
     jobFh.write("set -e # = abort on error\n")
     jobFh.write("set -o pipefail # = even in pipes\n")
     jobFh.write("echo Job %s start, stdout/stderr go to %s\n" % (jobFn, logFname))
     jobFh.write("exec 1> %s\n" % logFname)
     jobFh.write("exec 2>&1\n");
     jobFh.write("echo PID=$$\n");
     for cmd in cmds:
         jobFh.write(cmd+"\n")
     jobFh.close()
 
     logging.info("Wrote job script %s" % jobFn)
 
 def writeTdbStanzas(tfh, clusterName, saneHubName, cellCount):
     " write the track db statements for a one cluster "
     #saneClusterName = filter(str.isalnum, clusterName) # remove non alpha chars
     saneClusterName = sanitizeName(clusterName)
 
     tfh.write("""track %sReads\n""" % saneClusterName)
     tfh.write("""shortLabel %s BAM\n""" % clusterName)
     tfh.write("""longLabel %s sequencing reads from BAM (%d cells)\n""" % (clusterName, cellCount))
     tfh.write("type bam\n")
     tfh.write("view reads\n")
     tfh.write("parent %sViewReads\n" % saneHubName)
     tfh.write("subGroups view=reads\n")
     tfh.write("bigDataUrl %s\n" % (saneClusterName+".bam"))
     tfh.write("visibility dense\n")
     tfh.write("\n")
 
     tfh.write("""track %sCov\n""" % saneClusterName)
     tfh.write("""shortLabel %s Cov\n""" % (clusterName))
     tfh.write("""longLabel %s Coverage (%d cells)\n""" % (clusterName, cellCount))
     tfh.write("type bigWig\n")
     tfh.write("view cov\n")
     tfh.write("parent %sViewCov\n" % saneHubName)
     tfh.write("subGroups view=cov\n")
     tfh.write("autoScale on\n")
     tfh.write("bigDataUrl %s\n" % (saneClusterName+".bw"))
     tfh.write("visibility dense\n")
     tfh.write("\n")
 
     tfh.write("""track %sJunc\n""" % saneClusterName)
     tfh.write("""shortLabel %s Junc\n""" % (clusterName))
     tfh.write("""longLabel %s Splice Junctions (%d cells)\n""" % (clusterName, cellCount))
     tfh.write("type bigBed 12 +\n")
     tfh.write("spectrum on\n")
     tfh.write("scoreLabel Number of supporting reads (max: 1000)\n")
     tfh.write("parent %sViewJunc\n" % saneHubName)
     tfh.write("subGroups view=junc\n")
     tfh.write("bigDataUrl %s\n" % (saneClusterName+".junctions.bb"))
     tfh.write("visibility dense\n")
     tfh.write("\n")
 
 def cellIdsForBam(bamPath):
     # we try different ways to map the bam file name to a cellID:
     # 1) remove the file ext, e.g. 123.3.bam -> cellId 123.3
     # 2) remove everything after the first dot, 123.3.bam -> cellId 123
     # 3 and 4) like 1) and 2) but with dashes replaced with dots (->R)
     basePath = basename(bamPath)
     cellId = splitext(basePath)[0]
     yield cellId.replace("_R1", "").replace("_R2", "")
 
     cellId2 = basePath.split(".")[0]
     yield cellId2.replace("_R1", "").replace("_R2", "")
 
     cellId3 = cellId.replace("-", ".")
     yield cellId3.replace("_R1", "").replace("_R2", "")
 
     cellId4 = cellId2.replace("-", ".")
     yield cellId4.replace("_R1", "").replace("_R2", "")
 
def findBams(bamDir, clusterToCells):
    """
    create dict with cluster -> (cellIds, bamFnames)
    Returns (clusterBams, metaCellIds, cellIdToBamFnames). Clusters for which
    no BAM file could be found are not present in clusterBams.
    """
    # collect all cell IDs that appear anywhere in the meta data
    metaCellIds = set()
    for cluster, cellIds in iterItems(clusterToCells):
        for cellId in cellIds:
            metaCellIds.add(cellId)

    bamMask =join(bamDir, "*.bam")
    logging.info("Getting list of BAM files matching %s" % bamMask)
    bamPaths = glob.glob(bamMask)
    bamPaths.sort()
    logging.info("Found %d BAM files for %d meta cell IDs" % (len(bamPaths), len(metaCellIds)))

    # make map cellId -> bam files
    # try the various filename -> cellId derivations (see cellIdsForBam)
    # until one matches a cell ID from the meta data
    cellIdToBamFnames = defaultdict(list)
    fileCount = 0
    for bamPath in bamPaths:
        logging.debug("BAM file name: %s" % bamPath)

        cellId = None
        for tryCellId in cellIdsForBam(bamPath):
            logging.debug("Trying cell ID %s" % tryCellId)

            if tryCellId in metaCellIds:
                cellId = tryCellId
                break

            logging.debug("%s is not in meta data" % tryCellId)

        if cellId==None:
            logging.debug("Could not resolve %s to any cellId in meta" % (bamPath))
        else:
            logging.debug("Found file %s -> cellId=%s" % (bamPath, cellId))
            fileCount += 1
            cellIdToBamFnames[cellId].append(bamPath)

    clusterBams = dict()

    #logging.info("Got %s BAM files and %s cell ids in the meta data" % (len(metaCellIds), len(cellIdToBamFnames)))

    # group the BAM files by cluster; clusters without any BAM are skipped
    emptyClusterCount = 0
    for clusterName, cellIds in iterItems(clusterToCells):

        bamFnames = []
        for cellId in cellIds:
            # NOTE(review): cellIdToBamFnames is a defaultdict, so this lookup
            # inserts an empty list for cellIds without BAMs - harmless, but it
            # grows the dict as a side effect
            cellBams = cellIdToBamFnames[cellId]
            bamFnames.extend(cellBams)

        if len(bamFnames)==0:
            logging.warn("No single BAM file found for cluster %s!" % clusterName)
            emptyClusterCount += 1
            continue
        elif len(bamFnames)==1:
            logging.warn("Only 1 BAM file for cluster %s?" % clusterName)

        logging.info("Cluster: %s, %d cell IDs in meta, found %d BAM files, example %s" % (clusterName, len(cellIds), len(bamFnames), bamFnames[0]))

        clusterBams[clusterName] = (cellIds, bamFnames)

    # now do some sanity checks
    # NOTE(review): allCellIds is computed but never used
    missMeta = set(cellIdToBamFnames) - metaCellIds
    missBam = set(metaCellIds) - set(cellIdToBamFnames)
    allCellIds = set(cellIdToBamFnames).intersection(metaCellIds)
    logging.info("%d BAM cell ids have no meta data. Examples: %s" % (len(missMeta), " ".join(list(missMeta)[:10])))
    logging.info("%d meta cell ids have no BAM file. Examples: %s" % (len(missBam), " ".join(list(missBam)[:10])))

    return clusterBams, metaCellIds, cellIdToBamFnames
 
def writeDebugReport(allMetaCellIds, cellIdToBams, clusterBams, idReportFname):
    """ write a three-column table for debugging BAM file name <-> meta differences.
    Return number of cells with both at least one BAM file and meta data as a number.
    """
    logging.info("Writing %s to help track down BAM file problems" % idReportFname)
    cellCount = 0
    ofh = open(idReportFname, "w")
    ofh.write("#cellId\thasMeta\tcluster\tbamFname1\tbamFname2OrMore\n")

    allMetaCellIds = set(allMetaCellIds)

    # union: the report covers cell IDs that have only meta data,
    # only BAM files, or both
    allCellIds = set(allMetaCellIds).union(cellIdToBams)

    # invert clusterBams into cellId -> clusterName for the 'cluster' column
    cellToCluster = {}
    for clusterName, (cellIds, bamFnames) in iterItems(clusterBams):
        for cellId in cellIds:
            cellToCluster[cellId] = clusterName

    for cellId in allCellIds:
        ofh.write(cellId+"\t")

        if cellId in allMetaCellIds:
            ofh.write("Yes\t")
        else:
            ofh.write("No\t")

        clusterName = cellToCluster.get(cellId, "!noClusterFound")
        ofh.write("%s\t" % clusterName)

        # NOTE(review): if cellIdToBams is a defaultdict (see findBams), this
        # lookup silently inserts an empty list for unknown cellIds
        bamFnames = cellIdToBams[cellId]
        if len(bamFnames)==0:
            ofh.write("!noBamFound\t!noBamFound\n")
            continue

        # only cells with meta data AND at least one BAM file are counted
        if cellId in allMetaCellIds:
            cellCount += 1

        ofh.write(bamFnames[0])
        ofh.write("\t")

        # any additional BAM files for this cell go into the last column
        if len(bamFnames)>1:
            ofh.write(",".join(bamFnames[1:]))
        ofh.write("\n")

    ofh.close()
    return cellCount
 
 
 def mergeBams(hubName, db, tfh, bamDir, clusterToCells, outDir):
     logging.info("*** Merging and analyzing BAM files")
 
     clusterBams, allMetaCellIds, cellIdToBams = findBams(bamDir, clusterToCells)
 
     idReportFname = join(outDir, "metaBamMatch.txt")
     cellCount = writeDebugReport(allMetaCellIds, cellIdToBams, clusterBams, idReportFname)
 
     jobsDir = join(outDir, "jobs")
     outDir = abspath(outDir)
 
     makeDir(join(jobsDir, "jobs"))
     jlFh = open(join(jobsDir, "jobList"), "w")
 
     for clusterName, (cellIds, bamFnames) in iterItems(clusterBams):
         uniqueCellIds = set(cellIds)
         #cellCount += len(uniqueCellIds)
 
     logging.info("Merging BAM files and writing hub")
     saneHubName = sanitizeName(hubName)
     writeParentStanzas(tfh, saneHubName, hubName, cellCount)
 
     chromSizes = getSizesFname(db)
     aliasTable = getAliasFname(db)
 
     jobNo = 0
     emptyClusterCount = 0
     for clusterName, (cellIds, bamFnames) in iterItems(clusterBams):
         saneClusterName = sanitizeName(clusterName)
         logging.info("Processing cluster %s, %d cellIds/BAM files, examples: %s" % (clusterName, len(cellIds), cellIds[0]))
         cmds = []
 
         outBam = join(outDir, saneClusterName+".bam")
         outStat = join(outDir, saneClusterName+".stats.txt")
         outCalls = join(outDir, saneClusterName+".calls.tsv")
         junctionBed = join(outDir, saneClusterName+".junctions.bed")
         intronBed = join(outDir, saneClusterName+".introns.bed")
 
         junctionBedSorted = junctionBed.replace(".bed", ".sorted.bed")
         intronBedSorted = intronBed.replace(".bed", ".sorted.bed")
 
         junctionBb = junctionBed.replace(".bed", ".bb")
         intronBb = intronBed.replace(".bed", ".bb")
 
         cmd = "echo merging %d bam files for %d cell IDs" % (len(bamFnames), len(set(cellIds)))
         cmds.append(cmd)
 
         intronProspector = findProgram("intronProspector")
         samtools = findProgram("samtools")
         bamFnames = [join("..", "..", fn) for fn in bamFnames]
 
         if len(bamFnames)==1:
             # samtools merge aborts if only one input BAM file
             cmd = "cat %s " % (bamFnames[0])
         else:
             # -r construct read groups
             # -f overwrite output files if needed
             cmd = samtools+" merge -f -r - %s " % (" ".join(bamFnames))
 
         cmd += "| tee %s | %s -p /dev/stdout -U -c %s -j %s -n %s " % \
             (outBam, intronProspector, outCalls, junctionBed, intronBed)
         cmd += "| samtools flagstat - > %s" % outStat
         cmds.append(cmd)
         
         #cmd = "echo running intronprospector on merged BAM file"
         #cmds.append(cmd)
 
         # XX could be part of the samtools merge command, in a pipe
         #cmds.append(cmd)
 
         cmd = "samtools index %s" % outBam
         cmds.append(cmd)
 
         cmd = """export LC_COLLATE=C; cat %s | awk '($5>1000) {$5=1000;} { OFS="\\t"; print}' | chromToUcsc -a %s | sort -k1,1 -k2,2n > %s""" % (junctionBed, aliasTable, junctionBedSorted)
         cmds.append(cmd)
 
         cmd = """export LC_COLLATE=C; cat %s | awk '($5>1000) {$5=1000;} { OFS="\\t"; print}' | chromToUcsc -a %s | sort -k1,1 -k2,2n > %s""" % (intronBed, aliasTable, intronBedSorted)
         cmds.append(cmd)
 
         bigBed = findProgram("bedToBigBed")
         cmd = bigBed+" %s %s %s -type=bed6 -tab" % (intronBedSorted, chromSizes, intronBb)
         cmds.append(cmd)
 
         cmd = bigBed+" %s %s %s -type=bed12 -tab" % (junctionBedSorted, chromSizes, junctionBb)
         cmds.append(cmd)
 
         # alternative, 4x slower, silently drops ERCC chroms(?)
         # ~/bin/x86_64/bamToPsl in.bam stdout -chromAlias=mm10.chromAlias.tab -nohead | pslToBed stdin stdout | bedItemOverlapCount mm10 stdin -bed12 > hub/largeintestinegobletcell.bedgraph
         cmd = "echo creating coverage for merged BAM file"
         cmds.append(cmd)
 
         wiggletools = findProgram("wiggletools")
         outWig = join(outDir, saneClusterName+".wig")
         outBw = join(outDir, saneClusterName+".bw")
         # fix up the chrom field and the chrom=xxx stepSize fields
         # For wiggle files with Ensembl chrom names, we need UCSC chrom names
         cmd = wiggletools+ """ coverage %s | chromToUcsc -a %s > %s""" % (outBam, aliasTable, outWig)
         cmds.append(cmd)
 
         wigToBigWig = findProgram("wigToBigWig")
         # -clip also means to not check the sequence names against chrom.sizes anymore
         # clip is needed so GL and KI seqs in hg38 don't mess up the conversion
         cmd = wigToBigWig+ " %s %s %s -clip" % (outWig, chromSizes, outBw)
         cmds.append(cmd)
 
         cmd = "rm -f %s %s %s" % (outWig, intronBedSorted, junctionBedSorted)
         cmds.append(cmd)
 
         if isfile(outBw):
             logging.info("Not running anything for %s, file %s already exists" % (saneClusterName, outBw))
         else:
             jobFn = join(jobsDir, "job-%d.sh" % jobNo)
             writeJobScript(jobFn, cmds)
             jlFh.write("/bin/bash %s {check out exists %s}\n" % (basename(jobFn), outBw))
         jobNo+=1
 
         writeTdbStanzas(tfh, clusterName, saneHubName, len(cellIds))
 
     if emptyClusterCount==len(clusterToCells):
         logging.error("No single BAM file could be linked to the meta data. Please check the identifiers in the meta data table against the file names in %s." % (bamDir))
         sys.exit(1)
 
     jlFh.close()
     logging.info("Wrote jobList file %s" % jlFh.name)
 
     logging.info("Wrote %s" % tfh.name)
 
 def clamp(x):
     return max(0, min(int(x), 255))
 
 def toHex(rgb):
     r = rgb[0]
     g = rgb[1]
     b = rgb[2]
     return "#{0:02x}{1:02x}{2:02x}".format(clamp(r*255), clamp(g*255), clamp(b*255))
 
 def writeBarChartTdb(tfh, bbFname, clusterNames, unitName, stat, percentile):
     " write the barChart tdb stanza "
-    stepSize = 1.0 / len(clusterNames)
     colorCodes = []
-    x = 0
-    while x < 1.0:
+    for i in range(len(clusterNames)):
+        x = i / len(clusterNames)
         colorCodes.append(colorsys.hsv_to_rgb(x, 1.0, 1.0))
-        x+=stepSize
 
     hexCodes = [toHex(x) for x in colorCodes]
 
     tfh.write('track barChart\n')
     tfh.write('type bigBarChart\n')
     tfh.write('visibility full\n')
     tfh.write('shortLabel Cluster expression\n')
     if stat=="percentile":
         metric = "%dth percentile" % percentile
     else:
         metric = stat.capitalize()
     tfh.write('longLabel %s Cluster expression\n' % metric)
 
     # only remove spaces, this is more readable than sanitizeName
     saneClusterNames = []
     for cn in clusterNames:
         cn = cn.replace(" ", "_")
         #cn = ''.join(ch for ch in newName if (ch.isalnum() or ch=="_"))
         saneClusterNames.append(cn)
 
     tfh.write('barChartBars %s\n' % " ".join(saneClusterNames))
     tfh.write('barChartColors %s\n' % " ".join(hexCodes))
     tfh.write('barChartMetric %s\n' % metric)
     tfh.write('barChartUnit %s\n' % unitName)
     tfh.write('barChartMatrixUrl exprMatrix.tsv\n')
     tfh.write('barChartSampleUrl clusters.tsv\n')
     tfh.write('defaultLabelFields name2\n')
     tfh.write('labelFields name2,name\n')
     tfh.write('bigDataUrl barChart.bb\n\n')
 
 def to_camel_case(snake_str):
     components = snake_str.split('_')
     # We capitalize the first letter of each component except the first one
     # with the 'title' method and join them together.
     return components[0] + ''.join(x.title() for x in components[1:])
 
 def sanitizeName(name):
     " remove all nonalpha chars and camel case name "
     newName = to_camel_case(name.replace(" ", "_"))
     newName = ''.join(ch for ch in newName if (ch.isalnum() or ch=="_"))
     if len(newName)>80: # genome browser is limited to 128 chars
         md5 = hashlib.md5(newName.encode('utf-8')).hexdigest()[:10]
         newName = newName[:30]+"_"+newName[-30:]+"_"+md5
 
     return newName
 
 def writeCatFile(clusterToCells, catFname):
     " write file with cellId<tab>clusterName "
     logging.info("Writing %s" % catFname)
     ofh = open(catFname, "w")
     for clusterName, cellIds in iterItems(clusterToCells):
         for cellId in cellIds:
             ofh.write("%s\t%s\n" % (cellId, clusterName))
     ofh.close()
 
 def makeClusterCellIdxArray(clusterOrder, clusterToCells, cellNames, clusterFname):
     """
     make a list of lists of cellIds, one per cluster, in the right order.
     Result is an array (one element per cluster), with each inside array the list of cell indices of this cluster.
     also write the cluster names to clusterFname
     """
     clustOfh = open(clusterFname, "w")
     allCellIds = range(0, len(cellNames))
     cellNameToId = dict(zip(cellNames, allCellIds))
 
     clusterCellIds = [] # list of tuples with cell-indexes, one per cluster
     allCellNames = [] # list for cellIds, with a matrix, meta and with bam file
     allCellIndices = [] # position of all cellIds in allCellNames
     notInMat = []
     for clusterName in clusterOrder:
         cellIdxList = []
         for cellName in clusterToCells[clusterName]:
             if cellName not in cellNameToId:
                 #logging.warn("%s is in meta but not in expression matrix." % cellName)
                 notInMat.append(cellName)
                 continue
             idx = cellNameToId[cellName]
             cellIdxList.append(idx)
             allCellNames.append(cellName)
             allCellIndices.append(idx)
             sanClusterName = clusterName.replace(" ", "_")
             clustOfh.write("%s\t%s\n" % (cellName, sanClusterName))
 
         if len(cellIdxList)==0:
             logging.warn("No cells assigned to cluster %s" % clusterName)
 
         if numpyLoaded:
             cellIds = np.array(cellIdxList)
         else:
             cellIds = tuple(cellIdxList)
 
         clusterCellIds.append(cellIdxList)
 
     clustOfh.close()
 
     if len(notInMat)!=0:
         logging.warn("%d identifiers from the meta file are not in the expression matrix. Examples: %s" % (len(notInMat), notInMat[:10]))
 
     return clusterCellIds, allCellNames, allCellIndices
 
 def makeBarGraphBigBed(genome, inMatrixFname, outMatrixFname, geneType, geneModel, clusterToCells, \
         clusterOrder, stat, percentile, clusterFname, bbFname):
     """ create a barGraph bigBed file for an expression matrix
     clusterToCells is a dict clusterName -> list of cellIDs
     clusterOrder is a list of the clusterNames in the right order
     """
     logging.info("Creating barChartGraph bigbed file: genome %s, geneType %s, geneModel %s" % (genome, geneType, geneModel))
     geneToSym = readGeneSymbols(geneType, inMatrixFname)
     if geneToSym is None:
         geneToSym = readGeneSymbols("gencode-human", inMatrixFname)
         symToGene = {v: k for k, v in geneToSym.items()} # invert the dictionary key->value to value->key
         geneToSym = None
 
     geneLocs = parseGeneLocs(genome, geneModel)
 
     matOfh = open(outMatrixFname, "w")
 
     mr = MatrixTsvReader(geneToSym)
     mr.open(inMatrixFname)
     matType, cellNames = mr.matType, mr.sampleNames
 
     clusterCellIds, allCellNames, allCellIndices = \
             makeClusterCellIdxArray(clusterOrder, clusterToCells, cellNames, clusterFname)
 
     # write header line
     matOfh.write("#gene\t")
     matOfh.write("\t".join(allCellNames))
     matOfh.write("\n")
 
     # make the barchart bed file. format:
     # chr14 95086227 95158010 ENSG00000100697.10 999 - DICER1 5 10.94,11.60,8.00,6.69,4.89 93153 26789
     #bedFname = join(outDir, "barchart.bed")
     bedFname = bbFname.replace(".bb", ".bed")
     assert(bedFname!=bbFname)
 
     bedFh = open(bedFname, "w")
 
     geneCount = 0
     skipCount = 0
     for geneId, sym, exprArr in mr.iterRows():
         logging.debug("Writing BED and matrix line for %s, symbol %s, %d expr values" % (geneId, sym, len(exprArr)))
 
         if geneType.startswith("symbol"):
             # input has symbols
             if symToGene and geneId not in symToGene:
                 logging.warn("Cannot resolve symbol %s to a gene accession" % geneId)
                 continue
             else:
                 geneId = symToGene[geneId]
                 bedRows = geneLocs[geneId]
         else:
             # input has accessions
             if geneId not in geneLocs:
                 geneId2 = geneId.replace(".", "-", 1) # does this make sense? (for R)
                 if geneId2 not in geneLocs:
                     logging.warn("Cannot place gene '%s' onto genome, dropping it" % geneId)
                     skipCount += 1
                     continue
                 else:
                     geneId = geneId2
             bedRows = geneLocs[geneId]
 
         # write the new matrix row first field
         offset = matOfh.tell()
         rowHeader = "%s\t" % (sym)
         matOfh.write(rowHeader)
 
         # write the new matrix row values
         newRow = []
         for idx in allCellIndices:
             newRow.append(str(exprArr[idx]))
 
         newLine = "\t".join(newRow)
         matOfh.write(newLine)
         matOfh.write("\n")
         lineLen = len(geneId)+len(newLine)+2 # include tab and newline
 
         bedScore = 0
 
         medianList = []
         for cellIds in clusterCellIds:
             if not numpyLoaded:
                 assert(False) # XXX at the moment, I require numpy
                 exprList = []
                 for cellId in cellIds:
                     exprList.append(exprArr[cellId])
                 n = len(cellIds)
 
                 if stat=="median":
                     if len(exprList)==0:
                         summVal = 0
                     else:
                         summVal = sorted(exprList)[n//2] # approx OK, no special case for even n's
                 elif stat=="mean":
                     summVal = float(sum(exprList))/len(exprList)
                 elif stat=="percentile":
                     summVal = sorted(exprList)[int(percentile/100.0*len(exprList))] # did not check this against numpy
                 elif stat=="nonzero":
                     summVal = float(len([x for x in exprList if x!=0])) / len(exprList)
             else:
                 clusterExprs = np.take(exprArr, cellIds)
                 if stat=="median":
                     summVal = np.median(clusterExprs)
                 elif stat=="mean":
                     summVal = np.mean(clusterExprs)
                 elif stat=="percentile":
                     summVal = np.percentile(clusterExprs, percentile)
                 elif stat=="nonzero":
                     summVal = np.count_nonzero(clusterExprs) / float(clusterExprs.size)
 
             medianList.append(str(summVal))
 
 
         geneCount += 1
 
         # one geneId may have multiple placements, e.g. Ensembl's rule for duplicate genes
         for bedRow in bedRows:
             #if geneToSym!=None:
                 #sym = geneToSym.get(geneId, geneId)
             #else:
                 #sym = geneId
             bedRow[4] = str(bedScore) # 4 = score field
             bedRow[3] = sym
             bedRow.append(geneId)
             bedRow.append(str(len(medianList)))
             bedRow.append(",".join(medianList))
             bedRow.append(str(offset))
             bedRow.append(str(lineLen))
 
             bedFh.write("\t".join(bedRow))
             bedFh.write("\n")
             bedFh.flush()
 
     bedFh.close()
 
     if skipCount != 0:
         logging.info("Could not place %d genes, these were skipped" % skipCount)
 
     if geneCount==0:
         errAbort("Could not resolve a single gene to a genome location. You will need to change the gene identifier options in hub.conf and check the expression matrix.")
 
     logging.info("Processed %d genes" % geneCount)
     bedFname2 = bedFname.replace(".bed", ".sorted.bed")
     cmd = "LC_COLLATE=C sort -k1,1 -k2,2n %s > %s" % (bedFname, bedFname2)
     runCommand(cmd)
 
     # convert to .bb using .as file
     # from https://genome.ucsc.edu/goldenpath/help/examples/barChart/barChartBed.as
     asFname = getStaticFile(join("genomes", "barChartBed.as"))
     sizesFname = getSizesFname(genome)
 
     cmd = "bedToBigBed -as=%s -type=bed6+5 -tab %s %s %s" % (asFname, bedFname2, sizesFname, bbFname)
     runCommand(cmd)
 
 def buildTrackHub(db, inMatrixFname, metaFname, clusterFieldName, clusterOrderFile, hubName, bamDir, fixDot, \
         geneType, geneModel, unitName, email, refHtmlFname, hubUrl, skipBarchart, stat, percentile, outDir):
     " main function: make bigBarChart, trackDb and possibly bigWig and bigBed files "
 
     clusterToCells = parseClustersFromMeta(metaFname, clusterFieldName, fixDot)
 
     if clusterOrderFile is None:
         clusterOrder = list(sorted(clusterToCells.keys()))
         logging.info("No cluster order specified, using clusters in alphabetical order")
     else:
         logging.info("Reading cluster order from %s" % clusterOrderFile)
         clusterOrder = open(clusterOrderFile).read().splitlines()
 
     assert(set(clusterOrder)==set(clusterToCells)) # cluster order has to match actual cluster names
 
     tdbFname = join(outDir, "hub.txt")
     logging.info("Creating %s" % tdbFname)
     tfh = open(tdbFname, "w")
 
     writeHubStanza(tfh, hubName, db, email)
 
     matrixFname = join(outDir, "exprMatrix.tsv")
 
     bbFname = join(outDir, 'barChart.bb')
     catFname = join(outDir, 'clusters.tsv')
     if skipBarchart or geneModel is None:
         logging.info("Not creating barChart file, got command line option or geneModel is None")
     else:
         writeBarChartTdb(tfh, bbFname, clusterOrder, unitName, stat, percentile)
         #if isfile(bbFname):
             #logging.info("Not creating barChart file, %s already exists" % bbFname)
         makeBarGraphBigBed(db, inMatrixFname, matrixFname, geneType, geneModel, clusterToCells, clusterOrder, stat, percentile, \
                 catFname, bbFname)
         writeBarChartDesc(outDir, hubName, hubUrl, refHtmlFname)
 
     if bamDir:
         mergeBams(hubName, db, tfh, bamDir, clusterToCells, outDir)
         writeBamDescPages(outDir, hubName, hubUrl, refHtmlFname)
 
     tfh.close()
 
 
 def cbTrackHub(options):
     " make track hub given meta file and directory with bam files "
     if options.init:
         copyPkgFile("sampleConfig/hub.conf")
         sys.exit(0)
 
     global CBEMAIL
     if CBEMAIL is None:
         CBEMAIL = getConfig("email")
 
     db, geneModel, email = None, None, None
 
     cbConfFname = join(dirname(options.inConf), "cellbrowser.conf")
     if isfile(cbConfFname):
         logging.info("Initializing configuration from %s" % cbConfFname)
         conf = loadConfig(cbConfFname)
     else:
         conf = {}
 
     if not isfile(options.inConf):
         logging.warn("Could not find a file %s. You can create one with the --init option." % options.inConf)
     else:
         conf = loadConfig(options.inConf, addTo=conf)
 
         geneModel = conf.get("geneModel")
         db = conf.get("ucscDb")
         email = conf.get("email", CBEMAIL)
 
         inMatrixFname = conf.get("exprMatrix", None)
         metaFname = conf["meta"]
         clusterFieldName = conf["clusterField"]
         clusterOrderFile = conf.get("clusterOrder")
         bamDir = conf.get("bamDir")
 
         fixDot = conf.get("fixDot", False)
 
         geneType = conf.get("geneIdType")
 
         if not "unit" in conf:
             errAbort("For track hubs, the unit of the values in the matrix is important. "
             "Please specify a value for the 'unit' setting in cellbrowser.conf or hub.conf.")
 
         unitName = conf.get("unit")
         hubUrl = conf.get("hubUrl")
         if not hubUrl:
             errAbort("For track hubs, the URL to the final hub is important. "
             "Please specify a value for the 'hubUrl' setting in cellbrowser.conf or hub.conf. "
             "If you're unsure, set it to a non-existing URL now and update it later, when you know the URL")
 
         refHtmlFname = conf.get("refHtmlFile", None)
 
         # use name, shortLabel or hubName from conf
         hubName = conf.get("shortLabel", "UCSC single cell track hub, generated by cbHub")
 
     if not db:
         errAbort("You need to specify the ucscDb setting in your hub.conf."
             "A track hub requires at least the name of the UCSC assembly, e.g. 'hg19' or 'mm10'.")
     if not geneModel:
         logging.warn("You did not specify the geneModel setting in your hub.conf. Example values are 'gencode28' for hg38."
             "Because no gene -> genome mapping is known, the barchart graph has been deactivated. ")
     if email is None:
         errAbort("A UCSC track hub should include an email address. "
             "Please specify one in hub.conf with the 'email=' setting or via the CBEMAIL environment variable or with the 'email=' setting in in ~/.cellbrowser.conf")
 
     outDir = options.outDir
 
     if options.exprMatrix:
         inMatrixFname = options.exprMatrix
     if options.meta:
         metaFname = options.meta
     if options.clusterField:
         clusterFieldName = options.clusterField
     #if options.clusterOrder:
         #clusterOrderFile = options.clusterOrder
     if options.outDir:
         outDir = options.outDir
     skipBarchart = options.skipBarchart
 
     if not isdir(outDir):
         logging.info("Making %s" % outDir)
         os.makedirs(outDir)
 
     stat = options.stat
     percentile = options.percentile
 
     buildTrackHub(db, inMatrixFname, metaFname, clusterFieldName, clusterOrderFile, hubName, bamDir, fixDot, geneType, geneModel, unitName, email, refHtmlFname, hubUrl, skipBarchart, stat, percentile, outDir)
 
 def cbHubCli():
     args, options = cbHub_parseArgs()
     cbTrackHub(options, *args)