36e79dca2dc1ba9ef65c7ea13424ff8475e2bc72 braney Tue Mar 10 10:43:50 2026 -0700
hubCheck: validate sequence names in 2bit files for illegal characters (refs #37174)

Add check that sequence names contain only [A-Za-z0-9._-], start with a letter
or digit, and are at most 254 characters. Warns per sequence with the offending
character and suggests chromAlias. New -noSeqNameCheck flag disables the check.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

diff --git src/hg/utils/hubCheck/hubCheck.c src/hg/utils/hubCheck/hubCheck.c
index 977dd1c2f00..65a6ef37a56 100644
--- src/hg/utils/hubCheck/hubCheck.c
+++ src/hg/utils/hubCheck/hubCheck.c
@@ -8,101 +8,105 @@
 #include "dystring.h"
 #include "errCatch.h"
 #include "hgBam.h"
 #include "htmshell.h"
 #include "htmlPage.h"
 #include "hui.h"
 #include "net.h"
 #include "options.h"
 #include "trackDb.h"
 #include "trackHub.h"
 #include "udc.h"
 #include "vcf.h"
 #include "bedTabix.h"
 #include "knetUdc.h"
 #include "hgFind.h"
+#include "twoBit.h"

 #ifdef USE_HAL
 #include "halBlockViz.h"
 #endif

 static int cacheTime = 1;

 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "hubCheck - Check a track data hub for integrity.\n"
   "usage:\n"
   " hubCheck http://yourHost/yourDir/hub.txt\n"
   "options:\n"
   " -noTracks - don't check remote files for tracks, just trackDb (faster)\n"
   " -checkSettings - check trackDb settings to spec\n"
   " -version=[v?|url] - version to validate settings against\n"
   " (defaults to version in hub.txt, or current standard)\n"
   " -extra=[file|url] - accept settings in this file (or url)\n"
   " -level=base|required - reject settings below this support level\n"
   " -settings - just list settings with support level\n"
   " -genome=genome - only check this genome\n"
   " -udcDir=/dir/to/cache - place to put cache for remote bigBed/bigWigs.\n"
   " Will create this directory if not existing\n"
   " -httpsCertCheck=[abort,warn,log,none] - set the ssl certificate verification mode.\n"
   " -httpsCertCheckDomainExceptions= - space separated list of domains to whitelist.\n"
   " -printMeta - print the metadata for each track\n"
   " -cacheTime=N - set cache refresh time in seconds, default %d\n"
+  " -noSeqNameCheck - skip validation of sequence names in 2bit files\n"
   " -allowWarnings - return 0 exit code when only warnings are found (no errors)\n"
   " -verbose=2 - output verbosely\n"
   , cacheTime
   );
 }

 static struct optionSpec options[] = {
    {"version", OPTION_STRING},
    {"level", OPTION_STRING},
    {"extra", OPTION_STRING},
    {"noTracks", OPTION_BOOLEAN},
    {"settings", OPTION_BOOLEAN},
    {"checkSettings", OPTION_BOOLEAN},
    {"genome", OPTION_STRING},
    {"test", OPTION_BOOLEAN},
    {"printMeta", OPTION_BOOLEAN},
    {"udcDir", OPTION_STRING},
    {"httpsCertCheck", OPTION_STRING},
    {"httpsCertCheckDomainExceptions", OPTION_STRING},
    {"specHost", OPTION_STRING},
    {"cacheTime", OPTION_INT},
+   {"noSeqNameCheck", OPTION_BOOLEAN},
    {"allowWarnings", OPTION_BOOLEAN},
    // intentionally undocumented option for hgHubConnect
    {"htmlOut", OPTION_BOOLEAN},
    {NULL, 0},
 };

 struct trackHubCheckOptions
 /* How to check track hub */
     {
     boolean checkFiles; /* check remote files exist and are correct type */
     boolean checkSettings; /* check trackDb settings to spec */
     boolean printMeta; /* print out the metadata for each track */
     char *version; /* hub spec version to check */
     char *specHost; /* server hosting hub spec */
     char *level; /* check hub is valid to this support level */
     char *extraFile; /* name of extra file/url with additional settings to accept */
     char *genome; /* only check this genome */
     /* intermediate data */
     struct hash *settings; /* supported settings for this version */
     struct hash *extra; /* additional trackDb settings to accept */
     struct slName *suggest; /* list of supported settings for suggesting */
+    boolean noSeqNameCheck; /* skip validation of sequence names in 2bit files */
     boolean allowWarnings; /* return 0 exit code for warnings (no errors) */
     /* hgHubConnect only */
     boolean htmlOut; /* put special formatted text into the errors dyString */
     };

 struct trackHubSettingSpec
 /* Setting name and support level, from trackDbHub.html (the spec) */
     {
     struct trackHubSettingSpec *next;
     char *name; /* setting name */
     char *level; /* support level (required, base, full, new, deprecated) */
     };


 /* Mini English spell-check using axt sequence alignment code! From JK
@@ -1052,51 +1056,100 @@
             }
         }
     else if (!retVal)
         {
         // add "Error" to the trackname to force uniqueness for the jstree
         dyStringPrintf(errors, "{icon: 'fa fa-plus', "
             "id:'%sError', text:'No trackDb configuration errors', parent:'%s'}",
             idName, idName);
         }
     dyStringPrintf(errors, "];\n");
     }
 return retVal;
 }


+static boolean isAsciiAlnum(char c)
+/* Return TRUE if c is an ASCII letter or digit. Explicit ranges are used instead of
+ * isalnum(), which is locale-dependent and undefined for negative char values. */
+{
+return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
+}
+
+static boolean isValidSeqNameChar(char c)
+/* Return TRUE if c is a valid character for a sequence name: [A-Za-z0-9._-] */
+{
+return isAsciiAlnum(c) || c == '.' || c == '_' || c == '-';
+}
+
+static void checkSequenceNames(char *twoBitPath, char *genomeName)
+/* Warn about each sequence name in the 2bit file twoBitPath that breaks the naming rules:
+ * only ASCII letters, digits, period, underscore and hyphen are allowed, the first
+ * character must be a letter or digit, and the length may be at most 254.
+ * genomeName is only used to label the warnings. */
+{
+struct slName *seqList = twoBitSeqNames(twoBitPath);
+struct slName *seq;
+for (seq = seqList; seq != NULL; seq = seq->next)
+    {
+    char *name = seq->name;
+    int len = strlen(name);
+    if (len > 254)
+        warn("warning: sequence name '%s' in genome '%s' exceeds 254 characters (length %d)",
+             name, genomeName, len);
+    if (len > 0 && !isAsciiAlnum(name[0]))
+        warn("warning: sequence name '%s' in genome '%s' starts with '%c' - must start with a letter or digit",
+             name, genomeName, name[0]);
+    // report only the first offending character so each name gets at most one such warning
+    char *p;
+    for (p = name; *p != '\0'; p++)
+        {
+        if (!isValidSeqNameChar(*p))
+            {
+            warn("warning: sequence name '%s' in genome '%s' contains invalid character '%c' - "
+                 "only [A-Za-z0-9._-] are allowed. Consider using chromAlias for alternative names.",
+                 name, genomeName, *p);
+            break;
+            }
+        }
+    }
+slFreeList(&seqList);
+}
+
 int hubCheckGenome(struct trackHub *hub, struct trackHubGenome *genome,
                 struct trackHubCheckOptions *options, struct dyString *errors)
 /* Check out genome within hub. */
 {
 struct errCatch *errCatch = errCatchNew();
 struct trackDb *tdbList = NULL;
 int genomeErrorCount = 0;
 boolean openedGenome = FALSE;

 verbose(3, "checking genome %s\n", trackHubSkipHubName(genome->name));
 if (errCatchStart(errCatch))
     {
     if (genome->twoBitPath != NULL)
         {
         // check that twoBitPath is a valid file, warn instead of errAbort so we can continue checking
         // the genome stanza
         char *twoBit = genome->twoBitPath;
         if (!extFileExists(twoBit))
             warn("Error: '%s' twoBitPath does not exist or is not accessible: '%s'", genome->name, twoBit);
         else
             {
+            if (!options->noSeqNameCheck)
+                checkSequenceNames(twoBit, trackHubSkipHubName(genome->name));
             // verify that the defaultPos references an actual chromosome and valid range for that chromosome
             char *inpPos = cloneString(genome->defaultPos);
             int relStart = 0, relEnd = 0;
             boolean relativeFlag = FALSE, singleBaseSpec = FALSE;
             if (!parseAndResolvePosition(&inpPos, genome->name, NULL, &relStart, &relEnd, &relativeFlag, &singleBaseSpec))
                 {
                 // again use a warn so we can continue checking other genome stanza settings that do not
                 // rely on the 2bit file
                 warn("Error: '%s' defaultPos '%s' does not reference a valid chromosome in the 2bit file: '%s'", trackHubSkipHubName(genome->name), genome->defaultPos, twoBit);
                 }
             }

         // groups and htmlPath are optional settings, again only warn if they are malformed
         char *groupsFile = genome->groups;
         if (groupsFile != NULL && !extFileExists(groupsFile))
@@ -1336,30 +1389,31 @@
 optionInit(&argc, argv, options);
 if (argc != 2 && !optionExists("settings"))
     usage();

 struct trackHubCheckOptions *checkOptions = NULL;
 AllocVar(checkOptions);

 checkOptions->specHost = (optionExists("test") ? "genome-test.soe.ucsc.edu" :
                             "genome.ucsc.edu");
 checkOptions->specHost = optionVal("specHost", checkOptions->specHost);

 checkOptions->printMeta = optionExists("printMeta");
 checkOptions->checkFiles = !optionExists("noTracks");
 checkOptions->checkSettings = optionExists("checkSettings");
 checkOptions->genome = optionVal("genome", NULL);
+checkOptions->noSeqNameCheck = optionExists("noSeqNameCheck");
 checkOptions->allowWarnings = optionExists("allowWarnings");

 struct trackHubSettingSpec *setting = NULL;
 AllocVar(setting);
 setting->level = optionVal("level", "all");
 if (trackHubSettingLevel(setting) < 0)
     {
     fprintf(stderr, "ERROR: Unrecognized support level %s\n\n", setting->level);
     usage();
     }
 checkOptions->level = setting->level;

 char *version = NULL;
 if (optionExists("version"))
     version = optionVal("version", NULL);