b622d147b7dbac52dbf3ba26928cd18e02d42bd8 braney Sat Feb 26 12:34:37 2022 -0800 add support for using a bigBed as the chromAlias file diff --git src/hg/lib/chromAlias.c src/hg/lib/chromAlias.c index 79cc541..a0d1027 100644 --- src/hg/lib/chromAlias.c +++ src/hg/lib/chromAlias.c @@ -1,360 +1,481 @@ /* chromAlias.c was originally generated by the autoSql program, which also * generated chromAlias.h and chromAlias.sql. This module links the database and * the RAM representation of objects. */ +#include <pthread.h> #include "common.h" #include "linefile.h" #include "dystring.h" #include "jksql.h" #include "chromAlias.h" #include "hdb.h" #include "trackHub.h" #include "fieldedTable.h" +#include "bigBed.h" +#include "bPlusTree.h" char *chromAliasCommaSepFieldNames = "alias,chrom,source"; void chromAliasStaticLoad(char **row, struct chromAlias *ret) /* Load a row from chromAlias table into ret. The contents of ret will * be replaced at the next call to this function. */ { ret->alias = row[0]; ret->chrom = row[1]; ret->source = row[2]; } struct chromAlias *chromAliasLoad(char **row) /* Load a chromAlias from row fetched with select * from chromAlias * from database. Dispose of this with chromAliasFree(). */ { struct chromAlias *ret; AllocVar(ret); ret->alias = cloneString(row[0]); ret->chrom = cloneString(row[1]); ret->source = cloneString(row[2]); return ret; } struct chromAlias *chromAliasLoadAll(char *fileName) /* Load all chromAlias from a whitespace-separated file. * Dispose of this with chromAliasFreeList(). */ { struct chromAlias *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[3]; while (lineFileRow(lf, row)) { el = chromAliasLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct chromAlias *chromAliasLoadAllByChar(char *fileName, char chopper) /* Load all chromAlias from a chopper separated file. * Dispose of this with chromAliasFreeList(). */ { struct chromAlias *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[3]; while (lineFileNextCharRow(lf, chopper, row, ArraySize(row))) { el = chromAliasLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct chromAlias *chromAliasCommaIn(char **pS, struct chromAlias *ret) /* Create a chromAlias out of a comma separated string. * This will fill in ret if non-null, otherwise will * return a new chromAlias */ { char *s = *pS; if (ret == NULL) AllocVar(ret); ret->alias = sqlStringComma(&s); ret->chrom = sqlStringComma(&s); ret->source = sqlStringComma(&s); *pS = s; return ret; } void chromAliasFree(struct chromAlias **pEl) /* Free a single dynamically allocated chromAlias such as created * with chromAliasLoad(). */ { struct chromAlias *el; if ((el = *pEl) == NULL) return; freeMem(el->alias); freeMem(el->chrom); freeMem(el->source); freez(pEl); } void chromAliasFreeList(struct chromAlias **pList) /* Free a list of dynamically allocated chromAlias's */ { struct chromAlias *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; chromAliasFree(&el); } *pList = NULL; } void chromAliasOutput(struct chromAlias *el, FILE *f, char sep, char lastSep) /* Print out chromAlias. Separate fields with sep. Follow last field with lastSep. */ { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->alias); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->chrom); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->source); if (sep == ',') fputc('"',f); fputc(lastSep,f); } void chromAliasJsonOutput(struct chromAlias *el, FILE *f) /* Print out chromAlias in JSON format. */ { fputc('{',f); fputc('"',f); fprintf(f,"alias"); fputc('"',f); fputc(':',f); fputc('"',f); fprintf(f, "%s", el->alias); fputc('"',f); fputc(',',f); fputc('"',f); fprintf(f,"chrom"); fputc('"',f); fputc(':',f); fputc('"',f); fprintf(f, "%s", el->chrom); fputc('"',f); fputc(',',f); fputc('"',f); fprintf(f,"source"); fputc('"',f); fputc(':',f); fputc('"',f); fprintf(f, "%s", el->source); fputc('"',f); fputc('}',f); } /* -------------------------------- End autoSql Generated Code -------------------------------- */ /* our "global" data */ +struct bptIndex +{ +struct bptIndex *next; +int fieldIx; +struct bptFile *bpt; +}; + static struct { boolean inited; -char *database; +boolean bptInited; +struct bptIndex *bptList; +struct bbiFile *bbi; struct hash *chromToAliasHash; struct hash *aliasToChromHash; -struct hash *forwardHash; -struct hash *reverseHash; -} chromHashes; - -static boolean checkDatabase(char *database) -/* Make sure we don't see different databases. */ -{ -if (database == NULL) - return TRUE; - -if (chromHashes.database != NULL) - { - if (!sameString(chromHashes.database, database)) - { - errAbort("chromAliasSetup: only works for one db. %s was passed in earlier, now %s.", chromHashes.database, database); - return FALSE; - } - return TRUE; - } - -chromHashes.database = cloneString(database); -return TRUE; -} +} chromAliasGlobals; static void readOldAlias(struct lineFile *lf) /* Don't assume the table is fully populated, and dummy up a value for source. */ { char *words[1024]; /* process lines, no more than 1,024 words on a line */ char *line; int size; while (lineFileNext(lf, &line, &size)) { int wordCount = chopByWhite(line, words, ArraySize(words)); if (wordCount > 1) { int i = 1; for ( ; i < wordCount; ++i ) { if (isNotEmpty(words[i])) { - struct chromAlias *ali; - AllocVar(ali); - ali->alias = cloneString(words[i]); - ali->chrom = cloneString(words[0]); - ali->source = cloneString("asmHub"); - hashAdd(chromHashes.forwardHash, ali->alias, ali); - hashAdd(chromHashes.reverseHash, ali->chrom, ali); - hashAdd(chromHashes.chromToAliasHash, ali->chrom, ali->alias); - hashAdd(chromHashes.aliasToChromHash, ali->alias, ali->chrom); - //hashAdd(aliasHash, words[0], ali); + char *alias = cloneString(words[i]); + char *chrom = cloneString(words[0]); + hashAdd(chromAliasGlobals.chromToAliasHash, chrom, alias); + hashAdd(chromAliasGlobals.aliasToChromHash, alias, chrom); } } } } } static void readFieldedTable(struct lineFile *lf) /* Use the fieldedTable library to read in fully populated chromAlias.txt file. */ { struct fieldedTable *aliasTable = fieldedTableAttach(lf, NULL, 0); - struct fieldedRow *row; for(row = aliasTable->rowList; row; row = row->next) { char *chrom = row->row[0]; unsigned field; for(field=1; field< aliasTable->fieldCount; field++) { - struct chromAlias *new; - AllocVar(new); - new->chrom = chrom; - new->alias = row->row[field]; - new->source = aliasTable->fields[field]; - - hashAdd(chromHashes.forwardHash, new->alias, new); - hashAdd(chromHashes.reverseHash, new->chrom, new); - hashAdd(chromHashes.chromToAliasHash, new->chrom, new->alias); - hashAdd(chromHashes.aliasToChromHash, new->alias, new->chrom); + char *alias = row->row[field]; + hashAdd(chromAliasGlobals.chromToAliasHash, chrom, alias); + hashAdd(chromAliasGlobals.aliasToChromHash, alias, chrom); + } } } + +static char * gbdbBbExists(char *database) +/* use a gbdb bigBed as our alias file. */ +{ +// not supported at the moment +return NULL; +} + +static void chromAliasSetupBb(char *database, char *bbFile) +/* Look for a chromAlias bigBed file and open it. */ +{ +chromAliasGlobals.bbi = bigBedFileOpen(bbFile); } static void chromAliasSetupHub(char *database) /* Look for a chromAlias text table and load the hashes with its contents. */ { +char *aliasBbFile = trackHubAliasBbFile(database); +if (aliasBbFile != NULL) + { + chromAliasSetupBb(database, aliasBbFile); + return; + } char *aliasFile = trackHubAliasFile(database); if (aliasFile == NULL) return; struct lineFile *lf = udcWrapShortLineFile(aliasFile, NULL, MAX_HUB_TRACKDB_FILE_SIZE); -chromHashes.forwardHash = hashNew(0); -chromHashes.reverseHash = hashNew(0); -chromHashes.chromToAliasHash = hashNew(0); -chromHashes.aliasToChromHash = hashNew(0); +chromAliasGlobals.chromToAliasHash = hashNew(0); +chromAliasGlobals.aliasToChromHash = hashNew(0); char *line; if (!lineFileNext(lf, &line, NULL)) errAbort("%s is empty", lf->fileName); lineFileReuse(lf); // for the moment always read the alias file in the "old" way //if (line[0] == '#') if (0) readFieldedTable(lf); else readOldAlias(lf); lineFileClose(&lf); } static void chromAliasSetupSql(char *database) /* Look for a chromAlias SQL table and load the hashes with its contents. */ { if (!hTableExists(database, "chromAlias")) return; struct sqlConnection *conn = hAllocConn(database); -chromHashes.forwardHash = hashNew(0); -chromHashes.reverseHash = hashNew(0); -chromHashes.chromToAliasHash = hashNew(0); -chromHashes.aliasToChromHash = hashNew(0); +chromAliasGlobals.chromToAliasHash = hashNew(0); +chromAliasGlobals.aliasToChromHash = hashNew(0); char query[2048]; sqlSafef(query, sizeof(query), "select * from chromAlias"); struct sqlResult *sr = sqlGetResult(conn, query); char **row; while ((row = sqlNextRow(sr)) != NULL) { struct chromAlias *new = chromAliasLoad(row); - hashAdd(chromHashes.forwardHash, new->alias, new); - hashAdd(chromHashes.reverseHash, new->chrom, new); - hashAdd(chromHashes.chromToAliasHash, new->chrom, new->alias); - hashAdd(chromHashes.aliasToChromHash, new->alias, new->chrom); + hashAdd(chromAliasGlobals.chromToAliasHash, new->chrom, new->alias); + hashAdd(chromAliasGlobals.aliasToChromHash, new->alias, new->chrom); } sqlFreeResult(&sr); hFreeConn(&conn); } +static pthread_mutex_t ourMutex = PTHREAD_MUTEX_INITIALIZER; + +static void getLock() +/* Create a mutex to make the code thread safe. */ +{ +pthread_mutex_lock( &ourMutex ); +} + +static void releaseLock() +/* Release our mutex. */ +{ +pthread_mutex_unlock( &ourMutex ); +} + void chromAliasSetup(char *database) /* Read in the chromAlias file/table for this database. */ { if (database == NULL) return; -if (!checkDatabase(database)) +getLock(); +if (chromAliasGlobals.inited) return; +chromAliasGlobals.inited = TRUE; -if (chromHashes.inited) - return; -chromHashes.inited = TRUE; - +char *gbdbFile; if (trackHubDatabase(database)) chromAliasSetupHub(database); +else if ((gbdbFile = gbdbBbExists(database)) != NULL) + chromAliasSetupBb(database, gbdbFile); else chromAliasSetupSql(database); +releaseLock(); } -struct hash *chromAliasMakeLookupTable(char *database) -/* Given a database name and a connection to that database, construct a lookup table - * that takes chromosome alias names to a matching struct chromAlias. Returns NULL - * if the given database does not have a chromAlias table. */ +char *findNativeHashes(char *alias) +/* Find a native sequence given an alias using the hash tables. */ { -return chromHashes.forwardHash; +char *chrom = (char *)hashFindVal(chromAliasGlobals.aliasToChromHash, alias); +if (isNotEmpty(chrom)) + return cloneString(chrom); +return NULL; } -struct hash *chromAliasMakeReverseLookupTable(char *database) -/* Given a database name and a connection to that database, construct a lookup table - * that takes the actual assembly chromosome names to struct chromAliases. Because a - * chromosome name may well have multiple aliases, repeated calls to hashLookupNext - * may be required to see them all. Returns NULL if the given database does not have - * a chromAlias table. */ +static struct bptIndex *getBpts(struct bbiFile *bbi) +/* Open any extra indices that this bigBed has. */ +{ +if (chromAliasGlobals.bptInited) + return chromAliasGlobals.bptList; + +if (!chromAliasGlobals.bptInited) + { + struct bptIndex *bptList = NULL; + struct slName *indexList = bigBedListExtraIndexes(bbi); + for(; indexList; indexList = indexList->next) { -return chromHashes.reverseHash; + struct bptIndex *bptIndex; + AllocVar(bptIndex); + bptIndex->bpt = bigBedOpenExtraIndex(bbi, indexList->name, &bptIndex->fieldIx); + slAddHead(&bptList, bptIndex); } + chromAliasGlobals.bptList = bptList; + chromAliasGlobals.bptInited = TRUE; + } + +return chromAliasGlobals.bptList; +} + +char *findNativeBb(struct bbiFile *bbi, char *alias) +/* Find the native seqName for a given alias given a bigBed. */ +{ +struct bptIndex *bptIndex = getBpts(bbi); -struct hash *chromAliasAliasToChromHash(char *database) -/* Get the hash that maps chrom names to their aliases. */ +for(; bptIndex; bptIndex = bptIndex->next) { -return chromHashes.aliasToChromHash; + struct lm *lm = lmInit(0); + struct bigBedInterval *bb= bigBedNameQuery(bbi, bptIndex->bpt, bptIndex->fieldIx, alias, lm); + + if (bb != NULL) + { + char chromName[1024]; + bptStringKeyAtPos(bbi->chromBpt, bb->chromId, chromName, sizeof(chromName)); + + return cloneString(chromName); + } + } + +return NULL; +} + +char *chromAliasFindNative(char *alias) +/* Find the native seqName for a given alias. */ +{ +static struct hash *cachedNative; +char *chrom; + +if (cachedNative == NULL) + cachedNative = newHash(6); + +if ((chrom = hashFindVal(cachedNative, alias)) != NULL) + return chrom; + +getLock(); +if ((chrom = hashFindVal(cachedNative, alias)) == NULL) + { + if (chromAliasGlobals.bbi) + chrom = findNativeBb(chromAliasGlobals.bbi, alias); + else if (chromAliasGlobals.aliasToChromHash) + chrom = findNativeHashes(alias); + + hashAdd(cachedNative, alias, chrom); + } +releaseLock(); + +return cloneString(chrom); } -struct hash *chromAliasChromToAliasHash(char *database) -/* Get the hash that maps chrom names to their aliases. */ +struct slName *findAliasesBb(struct bbiFile *bbi, char *seqName) +/* Find the aliases for a given seqName using the alias bigBed. */ +{ +struct lm *lm = lmInit(0); +struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, seqName, 0, 1, 0, lm); +char *bedRow[bbi->fieldCount]; +char startBuf[16], endBuf[16]; +struct slName *list = NULL; +for (bb = bbList; bb != NULL; bb = bb->next) + { + bigBedIntervalToRow(bb, seqName, startBuf, endBuf, bedRow, ArraySize(bedRow)); + int ii; + for(ii=3; ii < chromAliasGlobals.bbi->fieldCount; ii++) { -return chromHashes.chromToAliasHash; + struct slName *name = newSlName(bedRow[ii]); + slAddHead(&list, name); + } + } + +return list; +} + +struct slName *findAliasesHashes(char *seqName) +/* Find the aliases for a given seqName using the hashes. */ +{ +struct slName *slList = NULL; +struct hashEl *thisEl = hashLookup(chromAliasGlobals.chromToAliasHash, seqName); + +for (;thisEl != NULL; thisEl = hashLookupNext(thisEl)) + { + struct slName *name = newSlName((char *)thisEl->val); + slAddHead(&slList, name); + } + +return slList; +} + +struct slName *chromAliasFindAliases(char *seqName) +/* Find the aliases for a given seqName. */ +{ +static struct hash *cachedAliases; +struct slName *aliases; + +if (cachedAliases == NULL) + cachedAliases = newHash(6); + +if ((aliases = hashFindVal(cachedAliases, seqName)) != NULL) + return aliases; + +getLock(); +if ((aliases = hashFindVal(cachedAliases, seqName)) == NULL) + { + if (chromAliasGlobals.bbi) + aliases = findAliasesBb(chromAliasGlobals.bbi, seqName); + else if (chromAliasGlobals.chromToAliasHash) + aliases = findAliasesHashes(seqName); + + hashAdd(cachedAliases, seqName, aliases); + } +releaseLock(); + +return aliases; }