fadb7a07e46918ac377c685de7e8dd124417e972 angie Thu Feb 18 10:46:24 2016 -0800 Simplified cytoBand search and loosened up termRegex to support horse. I removed some old functionality: the cytoBand search code used to cache the entire table, but it is never repetitively searched -- the code is invoked only for a user's search term. Also, keywords like "cen" and "qter" were supported but have been excluded by search regexes for years and no one has missed them. Now, use a regex to help parse the search term into chrom and band name, and look for an exact match in cytoBand. diff --git src/hg/lib/hgFind.c src/hg/lib/hgFind.c index fc2a080..a6a9a0e 100644 --- src/hg/lib/hgFind.c +++ src/hg/lib/hgFind.c @@ -936,226 +936,92 @@ for (table = hgp->tableList; table != NULL; table = table->next) { for (pos = table->posList; pos != NULL; pos = pos->next) { ++posCount; if (pos->chrom != NULL) hgp->singlePos = pos; } } if (posCount != 1) hgp->singlePos = NULL; hgp->posCount = posCount; } -static char *startsWithShortHumanChromName(char *db, char *chrom) -/* Return "cannonical" name of chromosome or NULL - * if not a chromosome. This expects no 'chr' in name. */ -{ -int num; -char buf[64]; -char c = chrom[0]; - -if (c == 'x' || c == 'X' || c == 'Y' || c == 'y') - { - safef(buf, sizeof(buf), "chr%c", toupper(c)); - return hgOfficialChromName(db, buf); - } -if (!isdigit(chrom[0])) - return NULL; -num = atoi(chrom); -if (num < 1 || num > 22) - return NULL; -safef(buf, sizeof(buf), "chr%d", num); -return hgOfficialChromName(db, buf); -} - -static struct cytoBand *loadAllBands(char *db) -/* Load up all bands from database. */ +static boolean hgFindChromBand(char *db, char *chrom, char *band, int *retStart, int *retEnd) +/* Return start/end of band in chromosome. 
*/ { -struct cytoBand *list = NULL, *el; struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr = NULL; char **row; - -sr = sqlGetResult(conn, "NOSQLINJ select * from cytoBand"); -while ((row = sqlNextRow(sr)) != NULL) +struct dyString *query = sqlDyStringCreate("select chromStart, chromEnd from cytoBand " + "where chrom = '%s' and name = '%s'", + chrom, band); +sr = sqlGetResult(conn, query->string); +if ((row = sqlNextRow(sr)) != NULL) { - el = cytoBandLoad(row); - slAddHead(&list, el); + if (retStart) + *retStart = sqlUnsigned(row[0]); + if (retEnd) + *retEnd = sqlUnsigned(row[1]); + return TRUE; } sqlFreeResult(&sr); -slReverse(&list); hFreeConn(&conn); -return list; -} - -static struct cytoBand *bandList = NULL; - -void hgFindChromBand(char *db, char *chromosome, char *band, int *retStart, int *retEnd) -/* Return start/end of band in chromosome. */ -{ -struct cytoBand *chrStart = NULL, *chrEnd = NULL, *cb; -int start = 0, end = 500000000; -boolean anyMatch; -char choppedBand[64], *s, *e; - -if (bandList == NULL) - bandList = loadAllBands(db); - -/* Find first band in chromosome. */ -for (cb = bandList; cb != NULL; cb = cb->next) - { - if (sameString(cb->chrom, chromosome)) - { - chrStart = cb; - break; - } - } -if (chrStart == NULL) - hUserAbort("Couldn't find chromosome %s in band list", chromosome); - -/* Find last band in chromosome. */ -for (cb = chrStart->next; cb != NULL; cb = cb->next) - { - if (!sameString(cb->chrom, chromosome)) - break; - } -chrEnd = cb; - -if (sameWord(band, "cen")) - { - for (cb = chrStart; cb != chrEnd; cb = cb->next) - { - if (cb->name[0] == 'p') - start = cb->chromEnd - 500000; - else if (cb->name[0] == 'q') - { - end = cb->chromStart + 500000; - break; - } - } - *retStart = start; - *retEnd = end; - return; - } -else if (sameWord(band, "qter")) - { - *retStart = *retEnd = hChromSize(db, chromosome); - *retStart -= 1000000; - return; - } -/* Look first for exact match. 
*/ -for (cb = chrStart; cb != chrEnd; cb = cb->next) - { - if (sameWord(cb->name, band)) - { - *retStart = cb->chromStart; - *retEnd = cb->chromEnd; - return; - } - } - -/* See if query is less specific.... */ -strcpy(choppedBand, band); -for (;;) - { - anyMatch = FALSE; - for (cb = chrStart; cb != chrEnd; cb = cb->next) - { - if (startsWith(choppedBand, cb->name)) - { - if (!anyMatch) - { - anyMatch = TRUE; - start = cb->chromStart; - } - end = cb->chromEnd; - } - } - if (anyMatch) - { - *retStart = start; - *retEnd = end; - return; - } - s = strrchr(choppedBand, '.'); - if (s == NULL) - hUserAbort("Couldn't find anything like band '%s'", band); - else - { - e = choppedBand + strlen(choppedBand) - 1; - *e = 0; - if (e[-1] == '.') - e[-1] = 0; - warn("Band %s%s is at higher resolution than data, chopping to %s%s", - chromosome+3, band, chromosome+3, choppedBand); - } - } +dyStringFree(&query); +return FALSE; } -boolean hgIsCytoBandName(char *db, char *spec, char **retChromName, char **retBandName) +boolean hgParseCytoBandName(char *db, char *spec, char **retChromName, char **retBandName) /* Return TRUE if spec is a cytological band name including chromosome short * name. Returns chromosome chrN name and band (with chromosome stripped off) */ { -char *fullChromName, *shortChromName; -int len; -int dotCount = 0; -char *s, c; - -/* First make sure spec is in format to be a band name. */ -if ((fullChromName = startsWithShortHumanChromName(db, spec)) == NULL) - return FALSE; -shortChromName = skipChr(fullChromName); -len = strlen(shortChromName); -spec += len; -c = spec[0]; -if (c != 'p' && c != 'q') - return FALSE; -/* the mouse bands can have a letter here, A-H, searchType cytoBand - * doesn't seem to use the termRegx */ -if (!(isdigit(spec[1]) || (1 == countChars("ABCDEFGH", spec[1])))) - return FALSE; - -/* Make sure rest is digits with maybe one '.' 
*/ -s = spec+2; -while ((c = *s++) != 0) - { - if (c == '.') - ++dotCount; - else if (!isdigit(c)) - return FALSE; +regmatch_t substrArr[5]; +// See if spec looks like a "chr"-less chromosome followed by a p or q, then a number, +// and possibly a '.' and another number. +// Mouse bands may have a letter A-H before the number, and may have no number. +// Horse bands may have "pq". +if (regexMatchSubstrNoCase(spec, "^(X|Y|[0-9]+)([pq]+[A-H]?([0-9]+(\\.[0-9]+)?)?)$", + substrArr, ArraySize(substrArr))) + { + char chrSpec[PATH_LEN]; + safencpy(chrSpec, sizeof(chrSpec), "chr", 3); + safencpy(chrSpec+3, sizeof(chrSpec)-3, spec, substrArr[1].rm_eo); + char *chromName = hgOfficialChromName(db, chrSpec); + if (chromName) + { + if (retChromName) + *retChromName = chromName; + if (retBandName) + *retBandName = cloneString(spec + substrArr[2].rm_so); + return TRUE; + } } -if (dotCount > 1) return FALSE; -*retChromName = fullChromName; -*retBandName = spec; -return TRUE; } boolean hgFindCytoBand(char *db, char *spec, char **retChromName, int *retWinStart, int *retWinEnd) /* Return position associated with cytological band if spec looks to be * in that form. */ { char *bandName; -if (!hgIsCytoBandName(db, spec, retChromName, &bandName)) +if (!hgParseCytoBandName(db, spec, retChromName, &bandName)) return FALSE; -hgFindChromBand(db, *retChromName, bandName, retWinStart, retWinEnd); -return TRUE; +return hgFindChromBand(db, *retChromName, bandName, retWinStart, retWinEnd); } boolean findChromContigPos(char *db, char *name, char **retChromName, int *retWinStart, int *retWinEnd) /* Find position in genome of contig. Look in all chroms. * Don't alter return variables unless found. */ /* NOTE: could probably speed this up by using the chromInfo hashtable */ { struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr = NULL; char **row; char query[256]; boolean foundIt = FALSE; /* In case this is a scaffold-based assembly, check for unsplit table first: */