36e79dca2dc1ba9ef65c7ea13424ff8475e2bc72
braney
  Tue Mar 10 10:43:50 2026 -0700
hubCheck: validate sequence names in 2bit files for illegal characters (refs #37174)

Add check that sequence names contain only [A-Za-z0-9._-], start with
a letter or digit, and are at most 254 characters. Warns per sequence
with the offending character and suggests chromAlias. New -noSeqNameCheck
flag disables the check.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

diff --git src/hg/utils/hubCheck/hubCheck.c src/hg/utils/hubCheck/hubCheck.c
index 977dd1c2f00..65a6ef37a56 100644
--- src/hg/utils/hubCheck/hubCheck.c
+++ src/hg/utils/hubCheck/hubCheck.c
@@ -8,101 +8,105 @@
 #include "dystring.h"
 #include "errCatch.h"
 #include "hgBam.h"
 #include "htmshell.h"
 #include "htmlPage.h"
 #include "hui.h"
 #include "net.h"
 #include "options.h"
 #include "trackDb.h"
 #include "trackHub.h"
 #include "udc.h"
 #include "vcf.h"
 #include "bedTabix.h"
 #include "knetUdc.h"
 #include "hgFind.h"
+#include "twoBit.h"
 
 #ifdef USE_HAL
 #include "halBlockViz.h"
 #endif
 
 static int cacheTime = 1;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "hubCheck - Check a track data hub for integrity.\n"
   "usage:\n"
   "   hubCheck http://yourHost/yourDir/hub.txt\n"
   "options:\n"
   "   -noTracks             - don't check remote files for tracks, just trackDb (faster)\n"
   "   -checkSettings        - check trackDb settings to spec\n"
   "   -version=[v?|url]     - version to validate settings against\n"
   "                                     (defaults to version in hub.txt, or current standard)\n"
   "   -extra=[file|url]     - accept settings in this file (or url)\n"
   "   -level=base|required  - reject settings below this support level\n"
   "   -settings             - just list settings with support level\n"
   "   -genome=genome        - only check this genome\n"
   "   -udcDir=/dir/to/cache - place to put cache for remote bigBed/bigWigs.\n"
   "                                     Will create this directory if not existing\n"
   "   -httpsCertCheck=[abort,warn,log,none] - set the ssl certificate verification mode.\n"  
   "   -httpsCertCheckDomainExceptions= - space separated list of domains to whitelist.\n"  
   "   -printMeta            - print the metadata for each track\n"
   "   -cacheTime=N          - set cache refresh time in seconds, default %d\n"
+  "   -noSeqNameCheck       - skip validation of sequence names in 2bit files\n"
   "   -allowWarnings        - return 0 exit code when only warnings are found (no errors)\n"
   "   -verbose=2            - output verbosely\n"
   , cacheTime
   );
 }
 
 static struct optionSpec options[] = {
    {"version", OPTION_STRING},
    {"level", OPTION_STRING},
    {"extra", OPTION_STRING},
    {"noTracks", OPTION_BOOLEAN},
    {"settings", OPTION_BOOLEAN},
    {"checkSettings", OPTION_BOOLEAN},
    {"genome", OPTION_STRING},
    {"test", OPTION_BOOLEAN},
    {"printMeta", OPTION_BOOLEAN},
    {"udcDir", OPTION_STRING},
    {"httpsCertCheck", OPTION_STRING},
    {"httpsCertCheckDomainExceptions", OPTION_STRING},
    {"specHost", OPTION_STRING},
    {"cacheTime", OPTION_INT},
+   {"noSeqNameCheck", OPTION_BOOLEAN},
    {"allowWarnings", OPTION_BOOLEAN},
    // intentionally undocumented option for hgHubConnect
    {"htmlOut", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 struct trackHubCheckOptions
 /* How to check track hub */
     {
     boolean checkFiles;         /* check remote files exist and are correct type */
     boolean checkSettings;      /* check trackDb settings to spec */
     boolean printMeta;          /* print out the metadata for each track */
     char *version;              /* hub spec version to check */
     char *specHost;             /* server hosting hub spec */
     char *level;                /* check hub is valid to this support level */
     char *extraFile;            /* name of extra file/url with additional settings to accept */
     char *genome;               /* only check this genome */
     /* intermediate data */
     struct hash *settings;      /* supported settings for this version */
     struct hash *extra;         /* additional trackDb settings to accept */
     struct slName *suggest;     /* list of supported settings for suggesting */
+    boolean noSeqNameCheck;     /* skip validation of sequence names in 2bit files */
     boolean allowWarnings;      /* return 0 exit code for warnings (no errors) */
     /* hgHubConnect only */
     boolean htmlOut;            /* put special formatted text into the errors dyString */
     };
 
 struct trackHubSettingSpec
 /* Setting name and support level, from trackDbHub.html (the spec) */
     {
     struct trackHubSettingSpec *next;
     char *name;                 /* setting name */
     char *level;                /* support level (required, base, full, new, deprecated) */
     };
 
 
 /* Mini English spell-check using axt sequence alignment code!  From JK
@@ -1052,51 +1056,91 @@
             }
         }
     else if (!retVal)
         {
         // add "Error" to the trackname to force uniqueness for the jstree
         dyStringPrintf(errors, "{icon: 'fa fa-plus', "
             "id:'%sError', text:'No trackDb configuration errors', parent:'%s'}", idName, idName);
         }
     dyStringPrintf(errors, "];\n");
     }
 
 return retVal;
 }
 
 
+static boolean isValidSeqNameChar(char c)
+/* Return TRUE if c is in ASCII [A-Za-z0-9._-]. The unsigned char cast keeps isalnum() defined for high-bit bytes. */
+{
+return ((unsigned char)c < 0x80 && isalnum((unsigned char)c)) || c == '.' || c == '_' || c == '-';
+}
+
+static void checkSequenceNames(char *twoBitPath, char *genomeName)
+/* Walk every sequence name in the 2bit file at twoBitPath and warn() about names that are
+ * longer than 254 characters, do not start with a letter or digit, or contain a character
+ * rejected by isValidSeqNameChar(). genomeName is only used to label the warnings. */
+{
+struct slName *seqList = twoBitSeqNames(twoBitPath);
+struct slName *seq;
+for (seq = seqList; seq != NULL; seq = seq->next)
+    {
+    char *name = seq->name;
+    int len = strlen(name);
+    if (len > 254)
+        warn("warning: sequence name '%s' in genome '%s' exceeds 254 characters (length %d)",
+            name, genomeName, len);
+    if (len > 0 && !isalnum((unsigned char)name[0]))  // cast: isalnum() is undefined for negative char values
+        warn("warning: sequence name '%s' in genome '%s' starts with '%c' - must start with a letter or digit",
+            name, genomeName, name[0]);
+    char *p;
+    for (p = name; *p != '\0'; p++)
+        {
+        if (!isValidSeqNameChar(*p))
+            {
+            warn("warning: sequence name '%s' in genome '%s' contains invalid character '%c' - "
+                "only [A-Za-z0-9._-] are allowed. Consider using chromAlias for alternative names.",
+                name, genomeName, *p);
+            break;  // report only the first offending character per name
+            }
+        }
+    }
+slFreeList(&seqList);
+}
+
 int hubCheckGenome(struct trackHub *hub, struct trackHubGenome *genome,
                 struct trackHubCheckOptions *options, struct dyString *errors)
 /* Check out genome within hub. */
 {
 struct errCatch *errCatch = errCatchNew();
 struct trackDb *tdbList = NULL;
 int genomeErrorCount = 0;
 boolean openedGenome = FALSE;
 verbose(3, "checking genome %s\n", trackHubSkipHubName(genome->name));
 
 if (errCatchStart(errCatch))
     {
     if (genome->twoBitPath != NULL)
         {
         // check that twoBitPath is a valid file, warn instead of errAbort so we can continue checking
         // the genome stanza
         char *twoBit = genome->twoBitPath;
         if (!extFileExists(twoBit))
             warn("Error: '%s' twoBitPath does not exist or is not accessible: '%s'", genome->name, twoBit);
         else
             {
+            if (!options->noSeqNameCheck)
+                checkSequenceNames(twoBit, trackHubSkipHubName(genome->name));
             // verify that the defaultPos references an actual chromosome and valid range for that chromosome
             char *inpPos = cloneString(genome->defaultPos);
             int relStart = 0, relEnd = 0;
             boolean relativeFlag = FALSE, singleBaseSpec = FALSE;
             if (!parseAndResolvePosition(&inpPos, genome->name, NULL, &relStart, &relEnd, &relativeFlag, &singleBaseSpec))
                 {
                 // again use a warn so we can continue checking other genome stanza settings that do not
                 // rely on the 2bit file
                 warn("Error: '%s' defaultPos '%s' does not reference a valid chromosome in the 2bit file: '%s'", trackHubSkipHubName(genome->name), genome->defaultPos, twoBit);
                 }
             }
 
         // groups and htmlPath are optional settings, again only warn if they are malformed
         char *groupsFile = genome->groups;
         if (groupsFile != NULL && !extFileExists(groupsFile))
@@ -1336,30 +1380,31 @@
 optionInit(&argc, argv, options);
 
 if (argc != 2 && !optionExists("settings"))
     usage();
 
 struct trackHubCheckOptions *checkOptions = NULL;
 AllocVar(checkOptions);
 
 checkOptions->specHost = (optionExists("test") ? "genome-test.soe.ucsc.edu" : "genome.ucsc.edu");
 checkOptions->specHost = optionVal("specHost", checkOptions->specHost);
 
 checkOptions->printMeta = optionExists("printMeta");
 checkOptions->checkFiles = !optionExists("noTracks");
 checkOptions->checkSettings = optionExists("checkSettings");
 checkOptions->genome = optionVal("genome", NULL);
+checkOptions->noSeqNameCheck = optionExists("noSeqNameCheck");
 checkOptions->allowWarnings = optionExists("allowWarnings");
 
 struct trackHubSettingSpec *setting = NULL;
 AllocVar(setting);
 setting->level = optionVal("level", "all");
 if (trackHubSettingLevel(setting) < 0)
     {
     fprintf(stderr, "ERROR: Unrecognized support level %s\n\n", setting->level);
     usage();
     }
 checkOptions->level = setting->level;
 
 char *version = NULL;
 if (optionExists("version"))
     version = optionVal("version", NULL);