225c0d55992aefae478461bba278644bdfdda3c5 max Wed Jan 15 08:33:57 2014 -0800 library changes for the browser box: This changes mostly hdb and jksql, plus - to a smaller extent - various other places in the code that deal with /gbdb/ files. The overall aim is to make it possible to have the data remote at UCSC while having the CGIs on a machine far away. At up to 180 msecs distance from UCSC (Europe, Japan), each query can get slow. So I tried to reduce the number of queries sent to UCSC while still allowing some mysql tables to be kept on localhost. I changed four things: - extend Larry's table cache to include field names. The code uses "describe" very often, which is slow from remote. With a table name cache these queries can be handled locally. This is configured in hg.conf - mysql "failover" connections: a mysql connection can have a 2nd connection that is used if a query fails, configured in hg.conf (I didn't call it "remote" connections, because we use that term already in the code) - mysql lazy connects: don't connect a sqlConnection right away, but only when needed. A mysql connect takes >500 msecs from across the Atlantic. 
- move gbdb: patch various places that use absolute "/gbdb/" pathnames to go through a central function that can change the filename of gbdb files to something else, as configured in hg.conf Plus patch 1 or 2 lines for more speed + update the hgMirror script diff --git src/hg/lib/hdb.c src/hg/lib/hdb.c index 55fb7f9..2fef5b3 100644 --- src/hg/lib/hdb.c +++ src/hg/lib/hdb.c @@ -26,30 +26,31 @@ #include "liftOverChain.h" #include "grp.h" #include "twoBit.h" #include "ra.h" #include "genbank.h" #include "chromInfo.h" #ifndef GBROWSE #include "axtInfo.h" #include "ctgPos.h" #include "hubConnect.h" #include "customTrack.h" #include "hgFind.h" #endif /* GBROWSE */ #include "hui.h" #include "trackHub.h" +#include "udc.h" #ifdef LOWELAB #define DEFAULT_PROTEINS "proteins060115" #define DEFAULT_GENOME "Pyrobaculum aerophilum" #else #define DEFAULT_PROTEINS "proteins" #define DEFAULT_GENOME "Human" #endif static struct sqlConnCache *hdbCc = NULL; /* cache for primary database connection */ static struct sqlConnCache *centralCc = NULL; static char *centralDb = NULL; static struct sqlConnCache *centralArchiveCc = NULL; @@ -771,31 +772,31 @@ struct sqlConnection *conn = hAllocConn(db); int count = sqlTableSizeIfExists(conn, "chromInfo"); hFreeConn(&conn); return (count >= 0 && count <= HDB_MAX_SEQS_FOR_SPLIT); } static void tableListHashAdd(struct hash *dbTblHash, char *profile, char *db) /* Add to a hash that maps a track/table name (unsplit) to an slName list * of actual table names (possibly split) -- we can compute this once and * cache it to save a lot of querying if we will check existence of * lots of tables. 
*/ { struct sqlConnection *conn = hAllocConnProfile(profile, db); struct slName *allTables = sqlListTables(conn); -if (!sameString(CUSTOM_TRASH,db) && hCanHaveSplitTables(db)) +if (!sameString(CUSTOM_TRASH,db) && !sameString("hgFixed",db) && hCanHaveSplitTables(db)) { /* Consolidate split tables into one list per track: */ struct slName *tbl = NULL, *nextTbl = NULL; for (tbl = allTables; tbl != NULL; tbl = nextTbl) { struct hashEl *tHel = NULL; char trackName[HDB_MAX_TABLE_STRING]; char chrom[HDB_MAX_CHROM_STRING]; nextTbl = tbl->next; tbl->next = NULL; hParseTableName(db, tbl->name, trackName, chrom); tHel = hashLookup(dbTblHash, trackName); if (tHel == NULL) hashAdd(dbTblHash, trackName, tbl); else if (! sameString(tbl->name, trackName)) @@ -1009,47 +1010,56 @@ /* fetch a sequence from a 2bit, caching open of the file */ { static struct twoBitFile *tbf = NULL; // cache of open file if ((tbf == NULL) || !sameString(fileName, tbf->fileName)) { twoBitClose(&tbf); tbf = twoBitOpen(fileName); } struct dnaSeq *seq = twoBitReadSeqFrag(tbf, seqName, start, end); return seq; } struct dnaSeq *hFetchSeqMixed(char *fileName, char *seqName, int start, int end) /* Fetch mixed case sequence. */ { -if (twoBitIsFile(fileName)) - return fetchTwoBitSeq(fileName, seqName, start, end); +struct dnaSeq *seq = NULL; +char *newFileName = NULL; +newFileName = hCloneRewriteFileName(fileName); +if (twoBitIsFile(newFileName)) + seq = fetchTwoBitSeq(newFileName, seqName, start, end); else - return nibLoadPartMasked(NIB_MASK_MIXED, fileName, start, end-start); + seq = nibLoadPartMasked(NIB_MASK_MIXED, newFileName, start, end-start); +freez(&newFileName); +return seq; } struct dnaSeq *hFetchSeq(char *fileName, char *seqName, int start, int end) /* Fetch sequence from file. If it is a .2bit file then fetch the named sequence. If it is .nib then just ignore seqName. 
*/ { +fileName = hCloneRewriteFileName(fileName); +struct dnaSeq *seq = NULL; if (twoBitIsFile(fileName)) { - struct dnaSeq *seq = fetchTwoBitSeq(fileName, seqName, start, end); + seq = fetchTwoBitSeq(fileName, seqName, start, end); tolowers(seq->dna); - return seq; } -return nibLoadPart(fileName, start, end-start); +else + seq = nibLoadPart(fileName, start, end-start); +freez(&fileName); +return seq; } struct dnaSeq *hChromSeqMixed(char *db, char *chrom, int start, int end) /* Return mixed case (repeats in lower case) DNA from chromosome. */ { char fileName[HDB_MAX_PATH_STRING]; hNibForChrom(db, chrom, fileName); return hFetchSeqMixed(fileName, chrom, start, end); } struct dnaSeq *hChromSeqMixedFromPath(char *nibPath, char *db, char *chrom, int start, int end) /* Return mixed case (repeats in lower case) DNA from chromosome, given an * input nib path. */ { @@ -1243,74 +1253,113 @@ struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr; char **row; sr = sqlGetResult(conn, "NOSQLINJ select chrom from chromInfo"); while ((row = sqlNextRow(sr)) != NULL) { struct slName *el = slNameNew(row[0]); slAddHead(&list, el); } sqlFreeResult(&sr); hFreeConn(&conn); return list; } +char *hCloneRewriteFileName(char* fileName) + /* Clones and returns a gbdb filename, potentially rewriting it according to hg.conf + * If the settings gbdbLoc1 and gbdbLoc2 are found, try them in order, by + * replacing /gbdb/ with the new locations. + * We assume /gbdb/ does not appear somewhere inside a fileName. + * This function does not guarantee that the filename exists. + * */ +{ +if (fileName==NULL) + return fileName; + +char* newGbdbLoc = cfgOption("gbdbLoc1"); +char* path; + +// if no config option set or not a /gbdb filename, then just return +// otherwise replace /gbdb/ with the new prefix and return if exists. 
+if (newGbdbLoc==NULL || !startsWith("/gbdb/", fileName)) + return cloneString(fileName); + +path = replaceChars(fileName, "/gbdb/", newGbdbLoc); +if (fileExists(path)) + return path; + +// if the file did not exist, replace with gbdbLoc2 +newGbdbLoc = cfgOption("gbdbLoc2"); +if (newGbdbLoc==NULL) + return path; + +freeMem(path); +path = replaceChars(fileName, "/gbdb/", newGbdbLoc); +return path; +} + char *hTryExtFileNameC(struct sqlConnection *conn, char *extFileTable, unsigned extFileId, boolean abortOnError) /* Get external file name from table and ID. Typically * extFile table will be 'extFile' or 'gbExtFile' * If abortOnError is true, abort if the id is not in the table or if the file * fails size check, otherwise return NULL if either of those checks fail. * Please freeMem the result when you are done with it. * (requires conn passed in) */ { char query[256]; struct sqlResult *sr; char **row; long long dbSize, diskSize; char *path; sqlSafef(query, sizeof(query), "select path,size from %s where id = %u", extFileTable, extFileId); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) == NULL) { if (abortOnError) errAbort("Database inconsistency table '%s.%s' no ext file with id %u", sqlGetDatabase(conn), extFileTable, extFileId); else { sqlFreeResult(&sr); return NULL; } } -path = cloneString(row[0]); +path = hCloneRewriteFileName(row[0]); + dbSize = sqlLongLong(row[1]); -diskSize = fileSize(path); sqlFreeResult(&sr); +// for speed, only do dbSize check if file is local +if (udcIsLocal(path)) + { + diskSize = fileSize(path); if (dbSize != diskSize) { if (abortOnError) errAbort("External file %s cannot be opened or has wrong size. 
" "Old size %lld, new size %lld, error %s", path, dbSize, diskSize, strerror(errno)); else freez(&path); } + } + return path; } char *hMayExtFileNameC(struct sqlConnection *conn, char *extFileTable, unsigned extFileId) { return hTryExtFileNameC(conn, extFileTable, extFileId, FALSE); } char *hExtFileNameC(struct sqlConnection *conn, char *extFileTable, unsigned extFileId) { return hTryExtFileNameC(conn, extFileTable, extFileId, TRUE); } char *hExtFileName(char *db, char *extFileTable, unsigned extFileId) /* Get external file name from table and ID. Typically @@ -1331,31 +1380,31 @@ /* Constants for selecting seq/extFile or gbSeq/gbExtFile */ #define SEQ_TBL_SET 1 #define GBSEQ_TBL_SET 2 struct largeSeqFile /* Manages our large external sequence files. Typically there will * be around four of these. This basically caches the file handle * so don't have to keep opening and closing them. */ { struct largeSeqFile *next; /* Next in list. */ char *path; /* Path name for file. */ char *extTable; /* external file table */ char *db; /* database this is associated with */ HGID id; /* Id in extFile table. */ - int fd; /* File handle. */ + struct udcFile *fd; /* File handle. */ }; static struct largeSeqFile *largeFileList; /* List of open large files. */ static struct largeSeqFile *largeFileHandle(struct sqlConnection *conn, HGID extId, char *extTable) /* Return handle to large external file. */ { struct largeSeqFile *lsf; char *db = sqlGetDatabase(conn); /* Search for it on existing list and return it if found. */ for (lsf = largeFileList; lsf != NULL; lsf = lsf->next) { if ((lsf->id == extId) && sameString(lsf->db, db) && sameString(lsf->extTable, extTable)) @@ -1363,47 +1412,47 @@ } /* Open file and put it on list. 
*/ { char *path; path = hMayExtFileNameC(conn, extTable, extId); if (path == NULL) return NULL; struct largeSeqFile *lsf; AllocVar(lsf); lsf->path = path; lsf->extTable = cloneString(extTable); lsf->db = cloneString(db); lsf->id = extId; - if ((lsf->fd = open(lsf->path, O_RDONLY)) < 0) - errAbort("Couldn't open external file %s", lsf->path); + lsf->fd = udcFileMayOpen(lsf->path, NULL); + if (lsf->fd == NULL) + errAbort("hdb/largeFileHandle: Couldn't open external file %s", lsf->path); slAddHead(&largeFileList, lsf); return lsf; } } -static void *readOpenFileSection(int fd, off_t offset, size_t size, char *fileName, char *acc) +static void *readOpenFileSection(struct udcFile* fd, off_t offset, size_t size, char *fileName, char *acc) /* Allocate a buffer big enough to hold a section of a file, * and read that section into it. */ { void *buf; buf = needMem(size+1); -if (lseek(fd, offset, SEEK_SET) < 0) - errnoAbort("Couldn't read %s: error seeking to %lld in %s", acc, (long long)offset, fileName); -if (read(fd, buf, size) < size) - errnoAbort("Couldn't read %s: error reading %lld bytes at %lld in %s", acc, (long long)size, (long long)offset, fileName); +// no need to check success, udc will errAbort if offset is invalid +udcSeek(fd, offset); +udcRead(fd, buf, size); return buf; } static char *dbTblParse(char *defaultDb, char *tbl, char **tblRet, char *buf, int bufSize) /* Check if tbl contains db, if so, then split it and the database and table, * returning the database. If a db isn't encoded, return defaultDb and tbl unchanged, or abort if * defaultDb is NULL. buf must be big enough to hold the database. 
*/ { char *dot = strchr(tbl, '.'); if (dot == NULL) { if (defaultDb == NULL) errAbort("no default database and no database specified with table %s", tbl); *tblRet = tbl; @@ -2306,56 +2355,62 @@ if (!organism) organism = hArchiveDbDbOptionalField(database, "organism"); return organism; } char *hGenomeOrArchive(char *database) /* Return genome name associated from the regular or the archive database. */ { char *genome = hGenome(database); if (!genome) genome = hArchiveDbDbOptionalField(database, "genome"); return genome; } char *hDbDbNibPath(char *database) -/* return nibPath from dbDb for database */ +/* return nibPath from dbDb for database, has to be freed */ { -return hDbDbOptionalField(database, "nibPath"); +char *rawNibPath = hDbDbOptionalField(database, "nibPath"); +char *nibPath = hCloneRewriteFileName(rawNibPath); +freez(&rawNibPath); +return nibPath; } char *hGenome(char *database) /* Return genome associated with database. * use freeMem on this when done. */ { return hDbDbOptionalField(database, "genome"); } char *hScientificName(char *database) /* Return scientific name for organism represented by this database */ /* Return NULL if unknown database */ /* NOTE: must free returned string after use */ { return hDbDbOptionalField(database, "scientificName"); } char *hHtmlPath(char *database) /* Return /gbdb path name to html description for this database */ /* Return NULL if unknown database */ /* NOTE: must free returned string after use */ { -return hDbDbOptionalField(database, "htmlPath"); +char *htmlPath = hDbDbOptionalField(database, "htmlPath"); +char *newPath = hCloneRewriteFileName(htmlPath); +freez(&htmlPath); +return newPath; } char *hFreezeDate(char *database) /* Return freeze date of database. Use freeMem when done. */ { return hDbDbField(database, "description"); } char *hFreezeDateOpt(char *database) /* Return freeze date of database or NULL if unknown database * Use freeMem when done. 
*/ { return hDbDbOptionalField(database, "description"); } @@ -2651,67 +2706,51 @@ static boolean fitFields(struct hash *hash, char *chrom, char *start, char *end, char retChrom[HDB_MAX_FIELD_STRING], char retStart[HDB_MAX_FIELD_STRING], char retEnd[HDB_MAX_FIELD_STRING]) /* Return TRUE if chrom/start/end are in hash. * If so copy them to retChrom, retStart, retEnd. * Helper routine for findChromStartEndFields below. */ { if (!fitField(hash, chrom, retChrom)) return FALSE; if (!fitField(hash, start, retStart)) return FALSE; if (!fitField(hash, end, retEnd)) return FALSE; return TRUE; } -boolean hIsBinned(char *db, char *table) -/* Return TRUE if a table is binned. */ -{ -char query[256]; -struct sqlConnection *conn = hAllocConn(db); -struct sqlResult *sr; -char **row; -boolean binned = FALSE; - -/* Read table description into hash. */ -sqlSafef(query, sizeof(query), "describe %s", table); -sr = sqlGetResult(conn, query); -if ((row = sqlNextRow(sr)) != NULL) - { - if (sameString(row[0], "bin")) - binned = TRUE; - } -sqlFreeResult(&sr); -hFreeConn(&conn); -return binned; -} - int hFieldIndex(char *db, char *table, char *field) /* Return index of field in table or -1 if it doesn't exist. */ { struct sqlConnection *conn = hAllocConn(db); int result = sqlFieldIndex(conn, table, field); hFreeConn(&conn); return result; } boolean hHasField(char *db, char *table, char *field) /* Return TRUE if table has field */ { return hFieldIndex(db, table, field) >= 0; } +boolean hIsBinned(char *db, char *table) +/* Return TRUE if a table is binned. */ +{ +return hHasField(db, table, "bin"); +} + boolean hFieldHasIndex(char *db, char *table, char *field) /* Return TRUE if a SQL index exists for table.field. 
*/ { struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr = NULL; char **row = NULL; boolean gotIndex = FALSE; char query[512]; sqlSafef(query, sizeof(query), "show index from %s", table); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { if (sameString(row[4], field)) { @@ -2729,63 +2768,52 @@ char retStart[HDB_MAX_FIELD_STRING], char retEnd[HDB_MAX_FIELD_STRING], char retName[HDB_MAX_FIELD_STRING], char retScore[HDB_MAX_FIELD_STRING], char retStrand[HDB_MAX_FIELD_STRING], char retCdsStart[HDB_MAX_FIELD_STRING], char retCdsEnd[HDB_MAX_FIELD_STRING], char retCount[HDB_MAX_FIELD_STRING], char retStarts[HDB_MAX_FIELD_STRING], char retEndsSizes[HDB_MAX_FIELD_STRING], char retSpan[HDB_MAX_FIELD_STRING], boolean *retBinned) /* Given a table return the fields corresponding to all the bed 12 * fields, if they exist. Fields that don't exist in the given table * will be set to "". */ { -char query[256]; -struct sqlResult *sr; -char **row; -struct hash *hash = newHash(5); -boolean gotIt = TRUE, binned = FALSE; -char *db; - -db = sqlGetDatabase(conn); - /* Set field names to empty strings */ retChrom[0] = 0; retStart[0] = 0; retEnd[0] = 0; retName[0] = 0; retScore[0] = 0; retStrand[0] = 0; retCdsStart[0] = 0; retCdsEnd[0] = 0; retCount[0] = 0; retStarts[0] = 0; retEndsSizes[0] = 0; retSpan[0] = 0; +char *db; +db = sqlGetDatabase(conn); + /* Read table description into hash. */ -sqlSafef(query, sizeof(query), "describe %s", table); -sr = sqlGetResult(conn, query); -while ((row = sqlNextRow(sr)) != NULL) - { - if (sameString(row[0], "bin")) - binned = TRUE; - hashAdd(hash, row[0], NULL); - } -sqlFreeResult(&sr); +boolean gotIt = TRUE, binned = FALSE; +binned = hIsBinned(db, table); +// XX Do I need to free the slList implicitly created here? +struct hash *hash = hashFromSlNameList(sqlListFields(conn, table)); /* Look for bed-style or linkedFeatures names. 
*/ if (fitFields(hash, "chrom", "chromStart", "chromEnd", retChrom, retStart, retEnd)) { if (!fitField(hash, "name", retName)) if (!fitField(hash, "acc", retName)) if (!fitField(hash, "frag", retName)) if (!fitField(hash, "contig", retName)) fitField(hash, "sequence", retName); // so that tagAlign can masquerade as BED fitField(hash, "score", retScore); fitField(hash, "strand", retStrand); fitField(hash, "thickStart", retCdsStart); fitField(hash, "thickEnd", retCdsEnd); if (!fitField(hash, "blockCount", retCount)) fitField(hash, "lfCount", retCount); @@ -3038,32 +3066,32 @@ hashAdd(dbHash, db, hash); } if ((hti = hashFindVal(hash, rootName)) == NULL) { if ((sameString(rootName, "mrna") && sqlTableExists(conn, "all_mrna")) || (sameString(rootName, "est") && sqlTableExists(conn, "all_est"))) { safef(fullName, sizeof(fullName), "all_%s", rootName); rootName = fullName; } else { if (chrom != NULL) { // first try the non-split table name then the split table name. - // This avoids many useless chrX_table lookups - // (today, very few assemblies have split tables) + // In 2013, very few assemblies have split tables + // This avoids dozens of mostly useless chrX_table lookups isSplit = TRUE; safef(fullName, sizeof(fullName), "%s", rootName); if (sqlTableExists(conn, fullName)) isSplit = FALSE; else { safef(fullName, sizeof(fullName), "%s_%s", chrom, rootName); if (sqlTableExists(conn, fullName)) isSplit = TRUE; else return NULL; } } } AllocVar(hti); @@ -3796,30 +3824,31 @@ // 2) call creates list if not in hash // 3) static (to this file) routine gives the actual tdb list // 4) public lib routine that returns fully cloned list // 5) public lib routine that returns cloned individual tdb complete with up/down inheritance // UNFORTUNATELY, cloning the memory with prove costly in time as well, because of all the pointers // to relink. THEREFORE what should be done is to make the tdb list with const ->next pointers and // force discpline on the callers. 
Sorts should be by hdb.c routines and any new lists (such as // hgTables makes) should be via tdbRefs. // SO we are back to being STALLED because of the volume of work. // static char *existingDb = NULL; // static struct trackDb *tdbList = NULL; struct trackDb *tdbList = NULL; //if (differentStringNullOk(existingDb, db)) // { + tdbList = loadTrackDb(db, NULL); tdbList = trackDbLinkUpGenerations(tdbList); tdbList = trackDbPolishAfterLinkup(tdbList, db); // freeMem(existingDb); // existingDb = cloneString(db); // } return tdbList; } static struct trackDb *loadAndLookupTrackDb(struct sqlConnection *conn, char *where) /* Load trackDb object(s). Nothing done for composite tracks here. */ { return loadTrackDb(sqlGetDatabase(conn), where); } @@ -5195,34 +5224,37 @@ if (fileName == NULL) { if (checkSeqName) { if (startsWith("chr", seqName)) sqlSafef(query, sizeof(query), "select fileName from %s where seqName = '%s'", table, seqName+strlen("chr")); else sqlSafef(query, sizeof(query), "select fileName from %s where seqName = 'chr%s'", table, seqName); fileName = sqlQuickString(conn, query); } else errAbort("Missing fileName in %s table", table); } -return fileName; + +char *rewrittenFname = hCloneRewriteFileName(fileName); +freez(&fileName); +return rewrittenFname; } char *bbiNameFromSettingOrTableChrom(struct trackDb *tdb, struct sqlConnection *conn, char *table, char *seqName) /* Return file name from bigDataUrl or little table that might have a seqName column. * If table does have a seqName column, return NULL if there is no file for seqName. */ { -char *fileName = cloneString(trackDbSetting(tdb, "bigDataUrl")); +char *fileName = hCloneRewriteFileName(trackDbSetting(tdb, "bigDataUrl")); if (fileName == NULL) fileName = bbiNameFromTableChrom(conn, table, seqName); return fileName; } char *bbiNameFromSettingOrTable(struct trackDb *tdb, struct sqlConnection *conn, char *table) /* Return file name from bigDataUrl or little table. 
*/ { return bbiNameFromSettingOrTableChrom(tdb, conn, table, NULL); }