310ed9324362faacfdf0ca7d0cd011605ce2e750 galt Tue Nov 28 03:05:06 2023 -0800 Improving ability to restore symlinked files back into their wrangle or submit directories so they are not lost. Includes special files such as 2bit under the path "". Handles either one dataset at a time, or can with great care be used to restore all files for all sets. Great care must be used with this command, and observe many caviots as well. We have no real way to preserve our symlinks other than this. diff --git src/hg/cirm/cdw/lib/cdwLib.c src/hg/cirm/cdw/lib/cdwLib.c index 66161b2..f2e9d5e 100644 --- src/hg/cirm/cdw/lib/cdwLib.c +++ src/hg/cirm/cdw/lib/cdwLib.c @@ -1,2773 +1,2793 @@ /* cdwLib - routines shared by various cdw programs. See also cdw * module for tables and routines to access structs built on tables. */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "hash.h" #include "dystring.h" #include "jksql.h" #include "errAbort.h" #include "cheapcgi.h" #include "hex.h" #include "openssl/sha.h" #include "base64.h" #include "basicBed.h" #include "bigBed.h" #include "portable.h" #include "filePath.h" #include "genomeRangeTree.h" #include "md5.h" #include "htmshell.h" #include "obscure.h" #include "bamFile.h" #include "raToStruct.h" #include "web.h" #include "hdb.h" #include "cdwValid.h" #include "cdw.h" #include "cdwFastqFileFromRa.h" #include "cdwBamFileFromRa.h" #include "cdwQaWigSpotFromRa.h" #include "cdwVcfFileFromRa.h" #include "rql.h" #include "intValTree.h" #include "tagStorm.h" #include "cdwLib.h" #include "trashDir.h" #include "wikiLink.h" #include "hgConfig.h" /* System globals - just a few ... for now. Please seriously not too many more. */ char *cdwDatabase = "cdw"; int cdwSingleFileTimeout = 4*60*60; // How many seconds we give ourselves to fetch a single file char *cdwRootDir = "/data/cirm/cdw/"; char *eapRootDir = "/data/cirm/encodeAnalysisPipeline/"; char *cdwValDataDir = "/data/cirm/valData/"; char *cdwDaemonEmail = "cdw@cirm-01.sdsc.edu"; struct sqlConnection *cdwConnect() /* Returns a read only connection to database. */ { return sqlConnect(cdwDatabase); } struct sqlConnection *cdwConnectReadWrite() /* Returns read/write connection to database. */ { return sqlConnectProfile("cdw", cdwDatabase); } char *cdwPathForFileId(struct sqlConnection *conn, long long fileId) /* Return full path (which eventually should be freeMem'd) for fileId */ { char query[256]; char fileName[PATH_LEN]; sqlSafef(query, sizeof(query), "select cdwFileName from cdwFile where id=%lld", fileId); sqlNeedQuickQuery(conn, query, fileName, sizeof(fileName)); char path[512]; safef(path, sizeof(path), "%s%s", cdwRootDir, fileName); return cloneString(path); } char *cdwTempDir() /* Returns pointer to cdwTempDir. This is shared, so please don't modify. */ { static char path[PATH_LEN]; if (path[0] == 0) { /* Note code elsewhere depends on tmp dir being inside of cdwRootDir - also good * to have it there so move to a permanent file is quick and unlikely to fail. */ safef(path, sizeof(path), "%s%s", cdwRootDir, "tmp"); makeDirsOnPath(path); strcat(path, "/"); } return path; } char *cdwTempDirForToday(char dir[PATH_LEN]) /* Fills in dir with temp dir of the day, and returns a pointer to it. */ { char dayDir[PATH_LEN]; cdwDirForTime(cdwNow(), dayDir); safef(dir, PATH_LEN, "%s%stmp/", cdwRootDir, dayDir); /* Bracket time consuming call to makeDirsOnPath with check that we didn't just do same * thing. */ static char lastDayDir[PATH_LEN] = ""; if (!sameString(dayDir, lastDayDir)) { strcpy(lastDayDir, dayDir); int len = strlen(dir); dir[len-1] = 0; makeDirsOnPath(dir); dir[len-1] = '/'; } return dir; } long long cdwGettingFile(struct sqlConnection *conn, char *submitDir, char *submitFileName) /* See if we are in process of getting file. Return file record id if it exists even if * it's not complete. Return -1 if record does not exist. */ { /* First see if we have even got the directory. */ char query[PATH_LEN+512]; sqlSafef(query, sizeof(query), "select id from cdwSubmitDir where url='%s'", submitDir); int submitDirId = sqlQuickNum(conn, query); if (submitDirId <= 0) return -1; /* Then see if we have file that matches submitDir and submitFileName. */ sqlSafef(query, sizeof(query), "select id from cdwFile " "where submitFileName='%s' and submitDirId = %d and errorMessage = '' and deprecated=''" " and (endUploadTime >= startUploadTime or startUploadTime < %lld) " "order by submitId desc limit 1" , submitFileName, submitDirId , (long long)cdwNow() - cdwSingleFileTimeout); long long id = sqlQuickLongLong(conn, query); if (id == 0) return -1; return id; } long long cdwGotFile(struct sqlConnection *conn, char *submitDir, char *submitFileName, char *md5, long long size) /* See if we already got file. Return fileId if we do, otherwise -1. This returns * TRUE based mostly on the MD5sum. For short files (less than 100k) then we also require * the submitDir and submitFileName to match. This is to cover the case where you might * have legitimate empty files duplicated even though they were computed based on different * things. For instance coming up with no peaks is a legitimate result for many chip-seq * experiments. */ { /* For large files just rely on MD5. */ char query[PATH_LEN+512]; if (size > 100000) { sqlSafef(query, sizeof(query), "select id from cdwFile where md5='%s' order by submitId desc limit 1" , md5); long long result = sqlQuickLongLong(conn, query); if (result == 0) result = -1; return result; } /* Rest of the routine deals with smaller files, which we are less worried about * duplicating, and indeed expect a little duplication of the empty file if none * other. */ /* First see if we have even got the directory. */ sqlSafef(query, sizeof(query), "select id from cdwSubmitDir where url='%s'", submitDir); int submitDirId = sqlQuickNum(conn, query); if (submitDirId <= 0) return -1; /* The complex truth is that we may have gotten this file multiple times. * We return the most recent version where it got uploaded and passed the post-upload * MD5 sum, and thus where the MD5 field is filled in the database. */ sqlSafef(query, sizeof(query), "select md5,id from cdwFile " "where submitFileName='%s' and submitDirId = %d and md5 != '' " "order by submitId desc limit 1" , submitFileName, submitDirId); struct sqlResult *sr = sqlGetResult(conn, query); char **row; long fileId = -1; if ((row = sqlNextRow(sr)) != NULL) { char *dbMd5 = row[0]; if (sameWord(md5, dbMd5)) fileId = sqlLongLong(row[1]); } sqlFreeResult(&sr); return fileId; } long long cdwNow() /* Return current time in seconds since Epoch. */ { return time(NULL); } /* This is size of base64 encoded hash plus 1 for the terminating zero. */ #define CDW_SID_SIZE 65 static void makeShaBase64(unsigned char *inputBuf, int inputSize, char out[CDW_SID_SIZE]) /* Make zero terminated printable cryptographic hash out of in */ { unsigned char shaBuf[48]; SHA384(inputBuf, inputSize, shaBuf); char *base64 = base64Encode((char*)shaBuf, sizeof(shaBuf)); memcpy(out, base64, CDW_SID_SIZE); out[CDW_SID_SIZE-1] = 0; freeMem(base64); } void cdwMakeSid(char *user, char sid[CDW_SID_SIZE]) /* Convert users to sid */ { /* Salt it well with stuff that is reproducible but hard to guess */ unsigned char inputBuf[512]; memset(inputBuf, 0, sizeof(inputBuf)); int i; for (i=0; iemail); } int cdwFindUserIdFromEmail(struct sqlConnection *conn, char *userEmail) /* Return true id of this user */ { char query[256]; sqlSafef(query, sizeof(query), "select id from cdwUser where email = '%s'", userEmail); return sqlQuickNum(conn, query); } boolean cdwUserIsAdmin(struct sqlConnection *conn, char *userEmail) /* Return true if the user is an admin */ { char query[256]; sqlSafef(query, sizeof(query), "select isAdmin from cdwUser where email = '%s'", userEmail); int isAdmin = sqlQuickNum(conn, query); if (isAdmin == 1) return TRUE; return FALSE; } void cdwWarnUnregisteredUser(char *email) /* Put up warning message about unregistered user and tell them how to register. */ { warn("No user exists with email %s. If you need an account please contact your " "CIRM DCC data wrangler and have them create an account for you." , email); } struct cdwUser *cdwMustGetUserFromEmail(struct sqlConnection *conn, char *email) /* Return user associated with email or put up error message. */ { struct cdwUser *user = cdwUserFromEmail(conn, email); if (user == NULL) { cdwWarnUnregisteredUser(email); noWarnAbort(); } return user; } struct cdwGroup *cdwGroupFromName(struct sqlConnection *conn, char *name) /* Return cdwGroup of given name or NULL if not found. */ { char query[256]; sqlSafef(query, sizeof(query), "select * from cdwGroup where name='%s'", name); return cdwGroupLoadByQuery(conn, query); } struct cdwGroup *cdwNeedGroupFromName(struct sqlConnection *conn, char *groupName) /* Get named group or die trying */ { struct cdwGroup *group = cdwGroupFromName(conn, groupName); if (group == NULL) errAbort("Group %s doesn't exist", groupName); return group; } boolean cdwFileInGroup(struct sqlConnection *conn, unsigned int fileId, unsigned int groupId) /* Return TRUE if file is in group */ { char query[256]; sqlSafef(query, sizeof(query), "select count(*) from cdwGroupFile where fileId=%u and groupId=%u", fileId, groupId); return sqlQuickNum(conn, query) > 0; } int cdwUserFileGroupsIntersect(struct sqlConnection *conn, long long fileId, int userId) /* Return the number of groups file and user have in common, zero for no match */ { char query[512]; sqlSafef(query, sizeof(query), "select count(*) from cdwGroupUser,cdwGroupFile " " where cdwGroupUser.groupId = cdwGroupFile.groupId " " and cdwGroupUser.userId = %d and cdwGroupFile.fileId = %lld" , userId, fileId); verbose(2, "%s\n", query); return sqlQuickNum(conn, query); } struct rbTree *cdwFilesWithSharedGroup(struct sqlConnection *conn, int userId) /* Make an intVal type tree where the keys are fileIds and the val is null * This contains all files that are associated with any group that user is part of. * Can be used to do quicker version of cdwCheckAccess. */ { struct rbTree *groupedFiles = intValTreeNew(); char query[256]; sqlSafef(query, sizeof(query), "select distinct(fileId) from cdwGroupFile,cdwGroupUser " " where cdwGroupUser.groupId = cdwGroupFile.groupId " " and cdwGroupUser.userId = %d", userId); struct sqlResult *sr = sqlGetResult(conn, query); char **row; while ((row = sqlNextRow(sr)) != NULL) { long long fileId = sqlLongLong(row[0]); intValTreeAdd(groupedFiles, fileId, NULL); } sqlFreeResult(&sr); return groupedFiles; } static boolean checkAccess(struct rbTree *groupedFiles, struct sqlConnection *conn, struct cdwFile *ef, struct cdwUser *user, int accessType) /* See if user should be allowed this level of access. The accessType is one of * cdwAccessRead or cdwAccessWrite. Write access implies read access too. * This can be called with user as NULL, in which case only access to shared-with-all * files is granted. * Since the most time consuming part of the operation involved the group access * check, parts of this can be precomputed in the groupedFiles tree. */ { /* First check for public access. */ if (ef->allAccess >= accessType) return TRUE; /* Everything else requires an actual user */ if (user == NULL) return FALSE; /* Check for user individual access */ if (ef->userId == user->id && ef->userAccess >= accessType) return TRUE; /* Check admin-level access */ if (user->isAdmin) return TRUE; /* Check group access, this involves SQL query */ if (ef->groupAccess >= accessType) { if (groupedFiles != NULL) return intValTreeLookup(groupedFiles, ef->id) != NULL; else return cdwUserFileGroupsIntersect(conn, ef->id, user->id); } return FALSE; } boolean cdwCheckAccess(struct sqlConnection *conn, struct cdwFile *ef, struct cdwUser *user, int accessType) /* See if user should be allowed this level of access. The accessType is one of * cdwAccessRead or cdwAccessWrite. Write access implies read access too. * This can be called with user as NULL, in which case only access to shared-with-all * files is granted. This function takes almost a millisecond. If you are doing it * to many files consider using cdwQuickCheckAccess instead. */ { return checkAccess(NULL, conn, ef, user, accessType); } boolean cdwQuickCheckAccess(struct rbTree *groupedFiles, struct cdwFile *ef, struct cdwUser *user, int accessType) /* See if user should be allowed this level of access. The groupedFiles is * the result of a call to cdwFilesWithSharedGroup. The other parameters are as * cdwCheckAccess. If you are querying thousands of files, this function is hundreds * of times faster though. */ { return checkAccess(groupedFiles, NULL, ef, user, accessType); } long long cdwCountAccessible(struct sqlConnection *conn, struct cdwUser *user) /* Return total number of files associated user can access */ { long long count = 0; if (user == NULL) { char query[256]; sqlSafef(query, sizeof(query), "select count(*) from cdwFile,cdwValidFile " " where cdwFile.id = cdwValidFile.fileId and allAccess > 0" " and (errorMessage='' or errorMessage is null)" ); count = sqlQuickLongLong(conn, query); } else { struct rbTree *groupedFiles = cdwFilesWithSharedGroup(conn, user->id); char query[256]; sqlSafef(query, sizeof(query), "select cdwFile.* from cdwFile,cdwValidFile " " where cdwFile.id = cdwValidFile.fileId " " and (errorMessage='' or errorMessage is null)" ); struct cdwFile *ef, *efList = cdwFileLoadByQuery(conn, query); for (ef = efList; ef != NULL; ef = ef->next) { if (cdwQuickCheckAccess(groupedFiles, ef, user, cdwAccessRead)) ++count; } cdwFileFree(&efList); rbTreeFree(&groupedFiles); } return count; } struct cdwFile *cdwAccessibleFileList(struct sqlConnection *conn, struct cdwUser *user) /* Get list of all files user can access. Null user means just publicly accessible. */ { if (user == NULL) // No user, just publicly readable files then { char query[256]; sqlSafef(query, sizeof(query), "select cdwFile.* from cdwFile,cdwValidFile " " where cdwFile.id = cdwValidFile.fileId and allAccess > 0" " and (errorMessage='' or errorMessage is null)"); return cdwFileLoadByQuery(conn, query); } else // Load all valid files and check access one at a time { struct rbTree *groupedFiles = cdwFilesWithSharedGroup(conn, user->id); struct cdwFile *accessibleList = NULL, *validList = cdwFileLoadAllValid(conn); struct cdwFile *ef, *next; for (ef = validList; ef != NULL; ef = next) { next = ef->next; if (cdwQuickCheckAccess(groupedFiles, ef, user, cdwAccessRead)) { slAddHead(&accessibleList, ef); } else { cdwFileFree(&ef); } } rbTreeFree(&groupedFiles); slReverse(&accessibleList); return accessibleList; } } struct rbTree *cdwAccessTreeForUser(struct sqlConnection *conn, struct cdwUser *user, struct cdwFile *efList, struct rbTree *groupedFiles) /* Construct intVal tree of files from efList that we have access to. The * key is the fileId, the value is the cdwFile object */ { struct rbTree *accessTree = intValTreeNew(0); struct cdwFile *ef; for (ef = efList; ef != NULL; ef = ef->next) { if (cdwQuickCheckAccess(groupedFiles, ef, user, cdwAccessRead)) intValTreeAdd(accessTree, ef->id, ef); } return accessTree; } int cdwGetHost(struct sqlConnection *conn, char *hostName) /* Look up host name in table and return associated ID. If not found * make up new table entry. */ { /* If it's already in table, just return ID. */ char query[512]; sqlSafef(query, sizeof(query), "select id from cdwHost where name='%s'", hostName); int hostId = sqlQuickNum(conn, query); if (hostId > 0) return hostId; sqlSafef(query, sizeof(query), "insert cdwHost (name, firstAdded, paraFetchStreams) values('%s', %lld, 10)", hostName, cdwNow()); sqlUpdate(conn, query); return sqlLastAutoId(conn); } int cdwGetSubmitDir(struct sqlConnection *conn, int hostId, char *submitDir) /* Get submitDir from database, creating it if it doesn't already exist. */ { /* If it's already in table, just return ID. */ char query[512]; sqlSafef(query, sizeof(query), "select id from cdwSubmitDir where url='%s'", submitDir); int dirId = sqlQuickNum(conn, query); if (dirId > 0) return dirId; sqlSafef(query, sizeof(query), "insert cdwSubmitDir (url, firstAdded, hostId) values('%s', %lld, %d)", submitDir, cdwNow(), hostId); sqlUpdate(conn, query); return sqlLastAutoId(conn); } void cdwMakeLicensePlate(char *prefix, int ix, char *out, int outSize) /* Make a license-plate type string composed of prefix + funky coding of ix * and put result in out. */ { int maxIx = 10*10*10*26*26*26; if (ix < 0) errAbort("ix must be positive in cdwMakeLicensePlate"); if (ix > maxIx) errAbort("ix exceeds max in cdwMakeLicensePlate. ix %d, max %d\n", ix, maxIx); int prefixSize = strlen(prefix); int minSize = prefixSize + 6 + 1; if (outSize < minSize) errAbort("outSize (%d) not big enough in cdwMakeLicensePlate", outSize); /* Copy in prefix. */ strcpy(out, prefix); /* Generate the 123ABC part of license plate backwards. */ char *s = out+minSize; int x = ix - 1; // -1 so start with AAA not AAB *(--s) = 0; // zero tag at end; int i; for (i=0; i<3; ++i) { int remainder = x%26; *(--s) = 'A' + remainder; x /= 26; } for (i=0; i<3; ++i) { int remainder = x%10; *(--s) = '0' + remainder; x /= 10; } } void cdwDirForTime(time_t sinceEpoch, char dir[PATH_LEN]) /* Return the output directory for a given time. */ { /* Get current time parsed into struct tm */ struct tm now; gmtime_r(&sinceEpoch, &now); /* make directory string out of year/month/day/ */ safef(dir, PATH_LEN, "%d/%d/%d/", now.tm_year+1900, now.tm_mon+1, now.tm_mday); } char *lastMatchCharExcept(char *start, char *end, char match, char except) /* Return last char between start up to but not including end that is match. * However if except occurs between end and this match, return NULL instead. * Also return NULL if there is no match */ { char *e = end; while (--e >= start) { char c = *e; if (c == except) return NULL; if (c == match) return e; } return NULL; } void cdwMakeBabyName(unsigned long id, char *baseName, int baseNameSize) /* Given a numerical ID, make an easy to pronouce file name */ { char *consonants = "bdfghjklmnprstvwxyz"; // Avoid c and q because make sound ambiguous char *vowels = "aeiou"; int consonantCount = strlen(consonants); int vowelCount = strlen(vowels); assert(id >= 1); unsigned long ix = id - 1; /* We start at zero not 1 */ int basePos = 0; do { char v = vowels[ix%vowelCount]; ix /= vowelCount; char c = consonants[ix%consonantCount]; ix /= consonantCount; if (basePos + 2 >= baseNameSize) errAbort("Not enough room for %lu in %d letters in cdwMakeBabyName", id, baseNameSize); baseName[basePos] = c; baseName[basePos+1] = v; basePos += 2; } while (ix > 0); baseName[basePos] = 0; } char *cdwFindDoubleFileSuffix(char *path) /* Return pointer to second from last '.' in part of path between last / and end. * If there aren't two dots, just return pointer to normal single dot suffix. */ { int nameSize = strlen(path); char *suffix = lastMatchCharExcept(path, path + nameSize, '.', '/'); if (suffix != NULL) { if (sameString(suffix, ".gz") || sameString(suffix, ".bigBed")) { char *secondSuffix = lastMatchCharExcept(path, suffix, '.', '/'); if (secondSuffix != NULL) suffix = secondSuffix; } } else suffix = path + nameSize; return suffix; } void cdwMakeFileNameAndPath(int cdwFileId, char *submitFileName, char cdwFile[PATH_LEN], char serverPath[PATH_LEN]) /* Convert file id to local file name, and full file path. Make any directories needed * along serverPath. */ { /* Preserve suffix. Give ourselves up to two suffixes. */ char *suffix = cdwFindDoubleFileSuffix(submitFileName); /* Figure out cdw file name, starting with baseName. */ char baseName[32]; cdwMakeBabyName(cdwFileId, baseName, sizeof(baseName)); /* Figure out directory and make any components not already there. */ char cdwDir[PATH_LEN]; cdwDirForTime(cdwNow(), cdwDir); char uploadDir[PATH_LEN]; safef(uploadDir, sizeof(uploadDir), "%s%s", cdwRootDir, cdwDir); makeDirsOnPath(uploadDir); /* Figure out full file names */ safef(cdwFile, PATH_LEN, "%s%s%s", cdwDir, baseName, suffix); safef(serverPath, PATH_LEN, "%s%s", cdwRootDir, cdwFile); } char *cdwSetting(struct sqlConnection *conn, char *name) /* Return named settings value, or NULL if setting doesn't exist. FreeMem when done. */ { char query[256]; sqlSafef(query, sizeof(query), "select val from cdwSettings where name='%s'", name); return sqlQuickString(conn, query); } char *cdwRequiredSetting(struct sqlConnection *conn, char *name) /* Returns setting, abort if it isn't found. FreeMem when done. */ { char *val = cdwSetting(conn, name); if (val == NULL) errAbort("Required %s setting is not defined in cdwSettings table", name); return val; } char *cdwLicensePlateHead(struct sqlConnection *conn) /* Return license plate prefix for current database - something like TSTFF or DEVFF or ENCFF */ { static char head[32]; if (head[0] == 0) { char *prefix = cdwRequiredSetting(conn, "prefix"); safef(head, sizeof(head), "%s", prefix); } return head; } static char *localHostName = "localhost"; static char *localHostDir = ""; static int getLocalHost(struct sqlConnection *conn) /* Make up record for local host if it is not there already. */ { char query[256]; sqlSafef(query, sizeof(query), "select id from cdwHost where name = '%s'", localHostName); int hostId = sqlQuickNum(conn, query); if (hostId == 0) { sqlSafef(query, sizeof(query), "insert cdwHost(name, firstAdded) values('%s', %lld)", localHostName, cdwNow()); sqlUpdate(conn, query); hostId = sqlLastAutoId(conn); } return hostId; } static int getLocalSubmitDir(struct sqlConnection *conn) /* Get submit dir for local submissions, making it up if it does not exist. */ { int hostId = getLocalHost(conn); char query[256]; sqlSafef(query, sizeof(query), "select id from cdwSubmitDir where url='%s' and hostId=%d", localHostDir, hostId); int dirId = sqlQuickNum(conn, query); if (dirId == 0) { sqlSafef(query, sizeof(query), "insert cdwSubmitDir(url,hostId,firstAdded) values('%s',%d,%lld)", localHostDir, hostId, cdwNow()); sqlUpdate(conn, query); dirId = sqlLastAutoId(conn); } return dirId; } static int getLocalSubmit(struct sqlConnection *conn) /* Get the submission that covers all of our local additions. */ { int dirId = getLocalSubmitDir(conn); char query[256]; sqlSafef(query, sizeof(query), "select id from cdwSubmit where submitDirId='%d'", dirId); int submitId = sqlQuickNum(conn, query); if (submitId == 0) { sqlSafef(query, sizeof(query), "insert cdwSubmit (submitDirId,startUploadTime) values(%d,%lld)", dirId, cdwNow()); sqlUpdate(conn, query); submitId = sqlLastAutoId(conn); } return submitId; } char **sqlNeedNextRow(struct sqlResult *sr) /* Get next row or die trying. Since the error reporting is not good, please only * use when an error would be unusual. */ { char **row = sqlNextRow(sr); if (row == NULL) errAbort("Unexpected empty result from database."); return row; } void cdwUpdateFileTags(struct sqlConnection *conn, long long fileId, struct dyString *tags) /* Update tags field in cdwFile with given value */ { struct dyString *query = dyStringNew(0); sqlDyStringPrintf(query, "update cdwFile set tags='%s' ", tags->string); sqlDyStringPrintf(query, " where id=%lld", fileId); sqlUpdate(conn, query->string); dyStringFree(&query); } struct cdwFile *cdwGetLocalFile(struct sqlConnection *conn, char *localAbsolutePath, char *givenMd5Sum) /* Get record of local file from database, adding it if it doesn't already exist. * Can make it a symLink rather than a copy in which case pass in valid MD5 sum * for symLinkM5dSum. */ { /* First do a reality check on the local absolute path. Is there a file there? */ if (localAbsolutePath[0] != '/') errAbort("Using relative path in cdwAddLocalFile."); long long size = fileSize(localAbsolutePath); if (size == -1) errAbort("%s does not exist", localAbsolutePath); long long updateTime = fileModTime(localAbsolutePath); /* Get file if it's in database already. */ int submitDirId = getLocalSubmitDir(conn); int submitId = getLocalSubmit(conn); char query[256+PATH_LEN]; sqlSafef(query, sizeof(query), "select * from cdwFile where submitId=%d and submitFileName='%s'", submitId, localAbsolutePath); struct cdwFile *ef = cdwFileLoadByQuery(conn, query); /* If we got something in database, check update time and size, and if it's no change just * return existing database id. */ if (ef != NULL && ef->updateTime == updateTime && ef->size == size) return ef; /* If we got here, then we need to make a new file record. Start with pretty empty record * that just has file ID, submitted file name and a few things*/ sqlSafef(query, sizeof(query), "insert cdwFile (submitId,submitDirId,submitFileName,startUploadTime) " " values(%d, %d, '%s', %lld)" , submitId, submitDirId, localAbsolutePath, cdwNow()); sqlUpdate(conn, query); long long fileId = sqlLastAutoId(conn); /* Create big data warehouse file/path name. */ char cdwFile[PATH_LEN], cdwPath[PATH_LEN]; cdwMakeFileNameAndPath(fileId, localAbsolutePath, cdwFile, cdwPath); /* We're a little paranoid so md5 it */ char *md5; /* Do copy or symbolic linking of file into warehouse managed dir. */ if (givenMd5Sum) { md5 = givenMd5Sum; } else { md5 = md5HexForFile(localAbsolutePath); } copyFile(localAbsolutePath, cdwPath); touchFileFromFile(localAbsolutePath, cdwPath); chmod(cdwPath, 0444); replaceOriginalWithSymlink(localAbsolutePath, "", cdwPath); /* Update file record. */ sqlSafef(query, sizeof(query), "update cdwFile set cdwFileName='%s', endUploadTime=%lld," "updateTime=%lld, size=%lld, md5='%s' where id=%lld" , cdwFile, cdwNow(), updateTime, size, md5, fileId); sqlUpdate(conn, query); /* Now, it's a bit of a time waste, but cheap in code, to just load it back from DB. */ sqlSafef(query, sizeof(query), "select * from cdwFile where id=%lld", fileId); return cdwFileLoadByQuery(conn, query); } struct cdwFile *cdwFileLoadAllValid(struct sqlConnection *conn) /* Get list of cdwFiles that have been validated with no error */ { char query[256]; sqlSafef(query, sizeof(query), "select cdwFile.* from cdwFile,cdwValidFile " " where cdwFile.id=cdwValidFile.fileId " " and (cdwFile.errorMessage='' or cdwFile.errorMessage is null)"); return cdwFileLoadByQuery(conn, query); } struct cdwFile *cdwFileAllIntactBetween(struct sqlConnection *conn, int startId, int endId) /* Return list of all files that are intact (finished uploading and MD5 checked) * with file IDs between startId and endId - including endId */ { char query[256]; sqlSafef(query, sizeof(query), "select * from cdwFile where id>=%d and id<=%d and endUploadTime != 0 " "and updateTime != 0 and (errorMessage = '' or errorMessage is NULL) and deprecated = ''", startId, endId); return cdwFileLoadByQuery(conn, query); } long long cdwFindInSameSubmitDir(struct sqlConnection *conn, struct cdwFile *ef, char *submitFileName) /* Return fileId of most recent file of given submitFileName from submitDir * associated with file */ { char query[3*PATH_LEN]; sqlSafef(query, sizeof(query), "select cdwFile.id from cdwFile,cdwSubmitDir " "where cdwFile.submitDirId = cdwSubmitDir.id and " "cdwSubmitDir.id = %d and " "cdwFile.submitFileName = '%s' order by cdwFile.id desc" , ef->submitDirId, submitFileName); return sqlQuickLongLong(conn, query); } struct cdwFile *cdwFileFromId(struct sqlConnection *conn, long long fileId) /* Return cdwValidFile given fileId - return NULL if not found. */ { char query[128]; sqlSafef(query, sizeof(query), "select * from cdwFile where id=%lld", fileId); return cdwFileLoadByQuery(conn, query); } struct cdwFile *cdwFileFromIdOrDie(struct sqlConnection *conn, long long fileId) /* Return cdwValidFile given fileId - aborts if not found. */ { struct cdwFile *ef = cdwFileFromId(conn, fileId); if (ef == NULL) errAbort("Couldn't find file for id %lld\n", fileId); return ef; } int cdwFileIdFromPathSuffix(struct sqlConnection *conn, char *suf) /* return most recent fileId for file where submitDir.url+submitFname ends with suf. 0 if not found. */ { char query[4096]; int sufLen = strlen(suf); // This is a bit slow, on the order of 1 second. -jk sqlSafef(query, sizeof(query), "SELECT cdwFile.id FROM cdwSubmitDir, cdwFile " "WHERE cdwFile.submitDirId=cdwSubmitDir.id AND RIGHT(CONCAT_WS('/', cdwSubmitDir.url, submitFileName), %d)='%s' " "ORDER BY cdwFile.id DESC LIMIT 1;", sufLen, suf); int fileId = sqlQuickNum(conn, query); return fileId; } struct cdwValidFile *cdwValidFileFromFileId(struct sqlConnection *conn, long long fileId) /* Return cdwValidFile give fileId - returns NULL if not validated. */ { char query[128]; sqlSafef(query, sizeof(query), "select * from cdwValidFile where fileId=%lld", fileId); return cdwValidFileLoadByQuery(conn, query); } struct cdwValidFile *cdwValidFileFromLicensePlate(struct sqlConnection *conn, char *licensePlate) /* Return cdwValidFile from license plate - returns NULL if not found. */ { char query[128]; sqlSafef(query, sizeof(query), "select * from cdwValidFile where licensePlate='%s'", licensePlate); return cdwValidFileLoadByQuery(conn, query); } struct cdwExperiment *cdwExperimentFromAccession(struct sqlConnection *conn, char *acc) /* Given something like 'ENCSR123ABC' return associated experiment. */ { char query[128]; sqlSafef(query, sizeof(query), "select * from cdwExperiment where accession='%s'", acc); return cdwExperimentLoadByQuery(conn, query); } struct genomeRangeTree *cdwMakeGrtFromBed3List(struct bed3 *bedList) /* Make up a genomeRangeTree around bed file. */ { struct genomeRangeTree *grt = genomeRangeTreeNew(); struct bed3 *bed; for (bed = bedList; bed != NULL; bed = bed->next) genomeRangeTreeAdd(grt, bed->chrom, bed->chromStart, bed->chromEnd); return grt; } struct cdwAssembly *cdwAssemblyForUcscDb(struct sqlConnection *conn, char *ucscDb) /* Get assembly for given UCSC ID or die trying */ { char query[256]; sqlSafef(query, sizeof(query), "select * from cdwAssembly where ucscDb='%s'", ucscDb); struct cdwAssembly *assembly = cdwAssemblyLoadByQuery(conn, query); if (assembly == NULL) errAbort("Can't find assembly for %s", ucscDb); return assembly; } struct cdwAssembly *cdwAssemblyForId(struct sqlConnection *conn, long long id) /* Get assembly of given ID. */ { char query[128]; sqlSafef(query, sizeof(query), "select * from cdwAssembly where id=%lld", id); struct cdwAssembly *assembly = cdwAssemblyLoadByQuery(conn, query); if (assembly == NULL) errAbort("Can't find assembly for %lld", id); return assembly; } char *cdwSimpleAssemblyName(char *assembly) /* Given compound name like male.hg19 return just hg19 */ /* Given name of assembly return name where we want to do enrichment calcs. */ { /* If it ends with one of our common assembly suffix, then do enrichment calcs * in that space, rather than some subspace such as male, female, etc. */ static char *specialAsm[] = {".hg19",".hg38",".mm9",".mm10",".dm3",".ce10",".dm6"}; int i; for (i=0; inext) { struct rbTree *tree = genomeRangeTreeFindOrAddRangeTree(grt, chrom->name); struct lm *lm = lmInit(0); struct bigBedInterval *iv, *ivList = NULL; ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(tree, iv->start, iv->end); lmCleanup(&lm); } bigBedFileClose(&bbi); bbiChromInfoFreeList(&chromList); return grt; } boolean cdwIsSupportedBigBedFormat(char *format) /* Return TRUE if it's one of the bigBed formats we support. */ { int i; if (sameString(format, "bigBed")) // Generic bigBed ok return TRUE; for (i=0; istring); dyStringFree(&query); } void cdwWriteErrToStderrAndTable(struct sqlConnection *conn, char *table, int id, char *err) /* Write out error message to errorMessage field of table and through stderr. */ { warn("%s", trimSpaces(err)); cdwWriteErrToTable(conn, table, id, err); } void cdwAddJob(struct sqlConnection *conn, char *command, int submitId) /* Add job to queue to run. */ { char query[256+strlen(command)]; sqlSafef(query, sizeof(query), "insert into cdwJob (commandLine,submitId) values('%s',%d)", command, submitId); sqlUpdate(conn, query); } void cdwAddQaJob(struct sqlConnection *conn, long long fileId, int submitId) /* Create job to do QA on this and add to queue */ { char command[64]; safef(command, sizeof(command), "cdwQaAgent %lld", fileId); cdwAddJob(conn, command, submitId); } int cdwSubmitPositionInQueue(struct sqlConnection *conn, char *url, unsigned *retJobId) /* Return position of our URL in submission queue. Optionally return id in cdwSubmitJob * table of job. */ { char query[256]; sqlSafef(query, sizeof(query), "select id,commandLine from cdwSubmitJob where startTime = 0"); struct sqlResult *sr = sqlGetResult(conn, query); char **row; int aheadOfUs = -1; int pos = 0; unsigned jobId = 0; while ((row = sqlNextRow(sr)) != NULL) { jobId = sqlUnsigned(row[0]); char *line = row[1]; char *cdwSubmit = nextQuotedWord(&line); char *lineUrl = nextQuotedWord(&line); if (sameOk(cdwSubmit, "cdwSubmit") && sameOk(url, lineUrl)) { aheadOfUs = pos; break; } ++pos; } sqlFreeResult(&sr); if (retJobId != NULL) *retJobId = jobId; return aheadOfUs; } struct cdwSubmitDir *cdwSubmitDirFromId(struct sqlConnection *conn, long long id) /* Return submissionDir with given ID or NULL if no such submission. */ { char query[256]; sqlSafef(query, sizeof(query), "select * from cdwSubmitDir where id=%lld", id); return cdwSubmitDirLoadByQuery(conn, query); } struct cdwSubmit *cdwSubmitFromId(struct sqlConnection *conn, long long id) /* Return submission with given ID or NULL if no such submission. */ { char query[256]; sqlSafef(query, sizeof(query), "select * from cdwSubmit where id=%lld", id); return cdwSubmitLoadByQuery(conn, query); } struct cdwSubmit *cdwMostRecentSubmission(struct sqlConnection *conn, char *url) /* Return most recent submission, possibly in progress, from this url */ { int urlSize = strlen(url); char query[128 + 2*urlSize + 1]; sqlSafef(query, sizeof(query), "select * from cdwSubmit where url='%s' order by id desc limit 1", url); return cdwSubmitLoadByQuery(conn, query); } long long cdwSubmitMaxStartTime(struct cdwSubmit *submit, struct sqlConnection *conn) /* Figure out when we started most recent single file in the upload, or when * we started if not files started yet. */ { char query[256]; sqlSafef(query, sizeof(query), "select max(startUploadTime) from cdwFile where submitId=%u", submit->id); long long maxStartTime = sqlQuickLongLong(conn, query); if (maxStartTime == 0) maxStartTime = submit->startUploadTime; return maxStartTime; } int cdwSubmitCountNewValid(struct cdwSubmit *submit, struct sqlConnection *conn) /* Count number of new files in submission that have been validated. */ { char query[256]; sqlSafef(query, sizeof(query), "select count(*) from cdwFile e,cdwValidFile v where e.id = v.fileId and e.submitId=%u", submit->id); return sqlQuickNum(conn, query); } int cdwSubmitCountErrors(struct cdwSubmit *submit, struct sqlConnection *conn) /* Count number of errors with submitted files */ { char query[256]; sqlSafef(query, sizeof(query), "select count(*) from cdwFile where submitId=%u and errorMessage != '' and errorMessage is not null", submit->id); return sqlQuickNum(conn, query); } boolean cdwSubmitIsValidated(struct cdwSubmit *submit, struct sqlConnection *conn) /* Return TRUE if validation has run. This does not mean that they all passed validation. * It just means the validator has run and has made a decision on each file in the submission. */ { /* Is this off by one because of the validated.txt being in the submission but never validated? */ return cdwSubmitCountErrors(submit,conn) + cdwSubmitCountNewValid(submit, conn) == submit->newFiles; } void cdwAddSubmitJob(struct sqlConnection *conn, char *userEmail, char *url, boolean update) /* Add submission job to table and wake up daemon. */ { /* Create command and add it to cdwSubmitJob table. */ char command[strlen(url) + strlen(userEmail) + 256]; safef(command, sizeof(command), "cdwSubmit %s'%s' %s", (update ? "-update " : ""), url, userEmail); char query[strlen(command)+128]; sqlSafef(query, sizeof(query), "insert cdwSubmitJob (commandLine) values('%s')", command); sqlUpdate(conn, query); /* Write sync signal (any string ending with newline) to fifo to wake up daemon. */ FILE *fifo = mustOpen("../userdata/cdwSubmit.fifo", "w"); fputc('\n', fifo); carefulClose(&fifo); } struct cdwValidFile *cdwFindElderReplicates(struct sqlConnection *conn, struct cdwValidFile *vf) /* Find all replicates of same output and format type for experiment that are elder * (fileId less than your file Id). Younger replicates are responsible for taking care * of correlations with older ones. Sorry younguns, it's like social security. */ { if (sameString(vf->format, "unknown")) return NULL; char query[256]; sqlSafef(query, sizeof(query), "select * from cdwValidFile where id<%d and experiment='%s' and format='%s'" " and outputType='%s'" , vf->id, vf->experiment, vf->format, vf->outputType); return cdwValidFileLoadByQuery(conn, query); } #ifdef OLD void cdwWebHeaderWithPersona(char *title) /* Print out HTTP and HTML header through tag with persona info */ { printf("Content-Type:text/html\r\n"); printf("\r\n\r\n"); puts(""); printf("\n%s%s\n", getCspMetaHeader(), "CIRM Data Warehouse"); puts(""); // Use CIRM3 CSS for common look puts(""); puts(""); // external link icon (box with arrow) is from FontAwesome (fa-external-link) puts(""); puts(""); puts(""); puts(""); puts(""); puts(""); puts(""); /* layout with navigation bar */ puts("\n"); cdwWebNavBarStart(); } #endif /* OLD */ #ifdef OLD void cdwWebFooterWithPersona() /* Print out end tags and persona script stuff */ { cdwWebNavBarEnd(); htmlEnd(); } #endif /* OLD */ void cdwCreateNewUser(char *email) /* Create new user, checking that user does not already exist. */ { /* Now make sure user is not already in user table. */ struct sqlConnection *conn = cdwConnectReadWrite(); struct dyString *query = dyStringNew(0); sqlDyStringPrintf(query, "select count(*) from cdwUser where email = '%s'", email); if (sqlQuickNum(conn, query->string) > 0) errAbort("User %s already exists", email); /* Do database insert. */ dyStringClear(query); sqlDyStringPrintf(query, "insert into cdwUser (email) values('%s')", email); sqlUpdate(conn, query->string); sqlDisconnect(&conn); } void cdwPrintLogOutButton() /* Print log out button */ { printf(""); } struct dyString *cdwFormatDuration(long long seconds) /* Convert seconds to days/hours/minutes. Return result in a dyString you can free */ { struct dyString *dy = dyStringNew(0); int days = seconds/(3600*24); if (days > 0) dyStringPrintf(dy, "%d days, ", days); seconds -= days*3600*24; int hours = seconds/3600; if (hours > 0 || days > 0) dyStringPrintf(dy, "%d hours", hours); seconds -= hours*3600; if (days == 0) { int minutes = seconds/60; if (minutes > 0) { if (hours > 0) dyStringPrintf(dy, ", "); dyStringPrintf(dy, "%d minutes", minutes); } if (hours == 0) { if (minutes > 0) dyStringPrintf(dy, ", "); seconds -= minutes*60; dyStringPrintf(dy, "%d seconds", (int)seconds); } } return dy; } struct cdwFile *cdwFileInProgress(struct sqlConnection *conn, int submitId) /* Return file in submission in process of being uploaded if any. */ { char query[256]; sqlSafef(query, sizeof(query), "select fileIdInTransit from cdwSubmit where id=%u", submitId); long long fileId = sqlQuickLongLong(conn, query); if (fileId == 0) return NULL; sqlSafef(query, sizeof(query), "select * from cdwFile where id=%lld", (long long)fileId); return cdwFileLoadByQuery(conn, query); } static void accessDenied() /* Sleep a bit and then deny access. */ { sleep(5); errAbort("Access denied!"); } struct cdwScriptRegistry *cdwScriptRegistryFromCgi() /* Get script registery from cgi variables. Does authentication too. */ { struct sqlConnection *conn = cdwConnect(); char *user = sqlEscapeString(cgiString("user")); char *password = sqlEscapeString(cgiString("password")); char query[256]; sqlSafef(query, sizeof(query), "select * from cdwScriptRegistry where name='%s'", user); struct cdwScriptRegistry *reg = cdwScriptRegistryLoadByQuery(conn, query); if (reg == NULL) accessDenied(); char key[CDW_SID_SIZE]; cdwMakeSid(password, key); if (!sameString(reg->secretHash, key)) accessDenied(); sqlDisconnect(&conn); return reg; } void cdwValidFileUpdateDb(struct sqlConnection *conn, struct cdwValidFile *el, long long id) /* Save cdwValidFile as a row to the table specified by tableName, replacing existing record at * id. */ { struct dyString *dy = dyStringNew(512); sqlDyStringPrintf(dy, "update cdwValidFile set "); // omit id and licensePlate fields - one autoupdates and the other depends on this // also omit fileId which also really can't change. sqlDyStringPrintf(dy, " format='%s',", el->format); sqlDyStringPrintf(dy, " outputType='%s',", el->outputType); sqlDyStringPrintf(dy, " experiment='%s',", el->experiment); sqlDyStringPrintf(dy, " replicate='%s',", el->replicate); sqlDyStringPrintf(dy, " enrichedIn='%s',", el->enrichedIn); sqlDyStringPrintf(dy, " ucscDb='%s',", el->ucscDb); sqlDyStringPrintf(dy, " itemCount=%lld,", (long long)el->itemCount); sqlDyStringPrintf(dy, " basesInItems=%lld,", (long long)el->basesInItems); sqlDyStringPrintf(dy, " sampleCount=%lld,", (long long)el->sampleCount); sqlDyStringPrintf(dy, " basesInSample=%lld,", (long long)el->basesInSample); sqlDyStringPrintf(dy, " sampleBed='%s',", el->sampleBed); sqlDyStringPrintf(dy, " mapRatio=%g,", el->mapRatio); sqlDyStringPrintf(dy, " sampleCoverage=%g,", el->sampleCoverage); sqlDyStringPrintf(dy, " depth=%g,", el->depth); sqlDyStringPrintf(dy, " singleQaStatus=0,"); sqlDyStringPrintf(dy, " replicateQaStatus=0,"); sqlDyStringPrintf(dy, " part='%s',", el->part); sqlDyStringPrintf(dy, " pairedEnd='%s',", el->pairedEnd); sqlDyStringPrintf(dy, " qaVersion='%d',", el->qaVersion); sqlDyStringPrintf(dy, " uniqueMapRatio=%g,", el->uniqueMapRatio); sqlDyStringPrintf(dy, " lane='%s'", el->lane); #if (CDWVALIDFILE_NUM_COLS != 24) #error "Please update this routine with new column" #endif sqlDyStringPrintf(dy, " where id=%lld\n", (long long)id); sqlUpdate(conn, dy->string); dyStringFree(&dy); } char *cdwLookupTag(struct cgiParsedVars *list, char *tag) /* Return first occurence of tag on list, or empty string if not found */ { char *ret = ""; struct cgiParsedVars *tags; for (tags = list; tags != NULL; tags = tags->next) { char *val = hashFindVal(tags->hash, tag); if (val != NULL && !sameString(val, "n/a")) { ret = val; break; } } return ret; } void cdwValidFileFieldsFromTags(struct cdwValidFile *vf, struct cgiParsedVars *tags) /* Fill in many of vf's fields from tags. */ { // Most fields are just taken directly from tags, converted from underbar to camelCased // separation conventions. vf->format = cloneString(cdwLookupTag(tags, "format")); vf->outputType = cloneString(cdwLookupTag(tags, "output_type")); vf->replicate = cloneString(cdwLookupTag(tags, "replicate")); vf->enrichedIn = cloneString(cdwLookupTag(tags, "enriched_in")); vf->ucscDb = cloneString(cdwLookupTag(tags, "ucsc_db")); vf->part = cloneString(cdwLookupTag(tags, "file_part")); vf->pairedEnd = cloneString(cdwLookupTag(tags, "paired_end")); vf->lane = cloneString(cdwLookupTag(tags, "lane")); // Experiment field defaults to same as meta, but sometimes needs to be different. // Experiment field drives replicate comparisons. char *experiment = cdwLookupTag(tags, "experiment"); if (isEmpty(experiment)) experiment = cdwLookupTag(tags, "meta"); vf->experiment = cloneString(experiment); // Make sure we update this routine if we add fields to cdwValidFile #if (CDWVALIDFILE_NUM_COLS != 24) #error "Please update this routine with new column" #endif } void cdwRemoveQaRecords(struct sqlConnection *conn, long long fileId) /* Remove records associated with a file from all of the cdwQaXxx and cdwXxxFile * tables */ { char query[1024]; sqlSafef(query, sizeof(query), "delete from cdwFastqFile where fileId=%lld", fileId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "delete from cdwBamFile where fileId=%lld", fileId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "delete from cdwVcfFile where fileId=%lld", fileId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "delete from cdwQaPairSampleOverlap where elderFileId=%lld or youngerFileId=%lld", fileId, fileId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "delete from cdwQaPairCorrelation where elderFileId=%lld or youngerFileId=%lld", fileId, fileId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "delete from cdwQaEnrich where fileId=%lld", fileId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "delete from cdwQaContam where fileId=%lld", fileId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "delete from cdwQaRepeat where fileId=%lld", fileId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "delete from cdwQaPairedEndFastq where fileId1=%lld or fileId2=%lld", fileId, fileId); sqlUpdate(conn, query); } int findSubmitSymlinkExt(char *submitFileName, char *submitDir, char **pPath, char **pLastPath, int *pSymlinkLevels) /* Find the last symlink and real file in the chain from submitDir/submitFileName. * This is useful for when target of symlink in cdw/ gets renamed * (e.g. license plate after passes validation), or removed (e.g. cdwReallyRemove* commands). * Returns 0 for success. / * Returns -1 if path does not exist. */ { int result = 0; struct stat sb; char *lastPath = NULL; char *path = mustExpandRelativePath(submitDir, submitFileName); int symlinkLevels = 0; while (TRUE) { if (!fileExists(path)) { //path=does not exist result = -1; break; } if (lstat(path, &sb) == -1) errnoAbort("lstat failure on %s", path); if ((sb.st_mode & S_IFMT) != S_IFLNK) break; // follow the symlink ++symlinkLevels; if (symlinkLevels > 10) errAbort("Too many symlinks followed: %d symlinks. Probably a symlink loop.", symlinkLevels); // read the symlink char *symPath = mustReadSymlinkExt(path, &sb); // apply symPath to path char *newPath = mustPathRelativeToFile(path, symPath); freeMem(lastPath); lastPath = path; freeMem(symPath); path = newPath; } if (result == 0 && ((sb.st_mode & S_IFMT) != S_IFREG)) errAbort("Expecting regular file. Followed symlinks to %s but it is not a regular file.", path); *pPath = path; *pLastPath = lastPath; *pSymlinkLevels = symlinkLevels; return result; } char *testOriginalSymlink(char *submitFileName, char *submitDir) /* Follows submitted symlinks to real file. * Aborts if real file path starts with cdwRootDir * since it should not point to a file already under cdwRoot. */ { char *lastPath = NULL; char *path = NULL; int symlinkLevels = 0; int result = findSubmitSymlinkExt(submitFileName, submitDir, &path, &lastPath, &symlinkLevels); if (result == -1) // path does not exist { errAbort("path=[%s] does not exist following submitDir/submitFileName through symlinks.", path); } if (startsWith(cdwRootDir, path)) errAbort("Unexpected operation. The symlink %s points to %s. It should not point to a file already under cdwRoot %s", submitFileName, path, cdwRootDir); freeMem(lastPath); return path; } void replaceOriginalWithSymlink(char *submitFileName, char *submitDir, char *cdwPath) /* For a file that was just copied, remove original and symlink to new one instead * to save space. Follows symlinks if any to the real file and replaces it with a symlink */ { char *path = testOriginalSymlink(submitFileName, submitDir); if (unlink(path) == -1) // save space errnoAbort("unlink failure %s", path); makeSymLink(cdwPath, path); verbose(1, "%s converted to symlink to %s\n", path, cdwPath); freeMem(path); } char *findSubmitSymlink(char *submitFileName, char *submitDir, char *oldPath) /* Find the last symlink in the chain from submitDir/submitFileName. * This is useful for when target of symlink in cdw/ gets renamed * (e.g. license plate after passes validation), or removed (e.g. cdwReallyRemove* commands). */ { char *lastPath = NULL; char *path = NULL; int symlinkLevels = 0; int result = findSubmitSymlinkExt(submitFileName, submitDir, &path, &lastPath, &symlinkLevels); if (result == -1) // path does not exist { warn("path=[%s] does not exist following submitDir/submitFileName through symlinks.", path); return NULL; } if (symlinkLevels < 1) { warn("Too few symlinks followed: %d symlinks. Where is the symlink created by cdwSubmit?", symlinkLevels); return NULL; } if (!sameString(path, oldPath)) { warn("Found symlinks point to %s, expecting to find symlink pointing to old path %s", path, oldPath); return NULL; } freeMem(path); return lastPath; } +void safeCopyFile(char *source, char *dest) +/* Copy file from source to dest using temp file. Perform move at the end only when the file is complete. + * Prevents partial copies. */ +{ +char tempFileName[1024]; +safef(tempFileName, sizeof tempFileName, "%s.tempCdwCopy", dest); +copyFile(source, tempFileName); +mustRename(tempFileName, dest); +} + + void cdwReallyRemoveFile(struct sqlConnection *conn, char *submitDir, long long fileId, boolean unSymlinkOnly, boolean really) /* If unSymlinkOnly is NOT specified, removes all records of file from database and from Unix file system if * the really flag is set. Otherwise just print some info on the file. * Tries to find original submitdir and replace symlink with real file to restore it. */ { + struct cdwFile *ef = cdwFileFromId(conn, fileId); char *path = cdwPathForFileId(conn, fileId); verbose(1, "%s id=%u, submitFileName=%s, path=%s\n", unSymlinkOnly ? "unlocking" : "removing", ef->id, ef->submitFileName, path); +struct cdwSubmit *es = cdwSubmitFromId(conn, ef->submitId); if (really) { char query[1024]; - struct cdwSubmit *es = cdwSubmitFromId(conn, ef->submitId); if (!unSymlinkOnly) { cdwRemoveQaRecords(conn, fileId); sqlSafef(query, sizeof(query), "delete from cdwGroupFile where fileId=%lld", fileId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "delete from cdwValidFile where fileId=%lld", fileId); sqlUpdate(conn, query); sqlSafef(query, sizeof(query), "delete from cdwFile where id=%lld", fileId); sqlUpdate(conn, query); } + } + char *lastPath = NULL; // skip symlink check if meta or manifest which do not get validated or license plate or symlink if (!((fileId == es->manifestFileId) || (fileId == es->metaFileId))) lastPath = findSubmitSymlink(ef->submitFileName, submitDir, path); if (lastPath) { verbose(3, "lastPath=%s path=%s\n", lastPath, path); - if (unlink(lastPath) == -1) // drop about to be invalid symlink - errnoAbort("unlink failure %s", lastPath); - copyFile(path, lastPath); + } + +if (really) + { + if (lastPath) + { + // safer copy protection for the symlink across + safeCopyFile(path, lastPath); + touchFileFromFile(path, lastPath); chmod(lastPath, 0664); freeMem(lastPath); - } if (!unSymlinkOnly) mustRemove(path); } + } freez(&path); cdwFileFree(&ef); } void cdwFileResetTags(struct sqlConnection *conn, struct cdwFile *ef, char *newTags, boolean revalidate, int submitId) /* Reset tags on file, strip out old validation and QA, schedule new validation and QA. */ /* Remove existing QA records and rerun QA agent on given file. */ { long long fileId = ef->id; /* Update database to let people know format revalidation is in progress. */ char query[4*1024]; /* Update tags for file in cdwFile table. */ sqlSafef(query, sizeof(query), "update cdwFile set tags='%s' where id=%lld", newTags, fileId); sqlUpdate(conn, query); if (revalidate) { sqlSafef(query, sizeof(query), "update cdwFile set errorMessage = '%s' where id=%lld", "Revalidation in progress.", fileId); sqlUpdate(conn, query); /* Get rid of records referring to file in other validation and qa tables. */ cdwRemoveQaRecords(conn, fileId); /* schedule validator */ cdwAddQaJob(conn, ef->id, submitId); } else { /* The revalidation case relies on cdwMakeValidFile to update the cdwValidFile table. * Here we must do it ourselves. */ struct cdwValidFile *vf = cdwValidFileFromFileId(conn, ef->id); if (vf != NULL) { struct cgiParsedVars *tags = cdwMetaVarsList(conn, ef); cdwValidFileFieldsFromTags(vf, tags); cdwValidFileUpdateDb(conn, vf, vf->id); cgiParsedVarsFreeList(&tags); cdwValidFileFree(&vf); } } } static void scanSam(char *samIn, FILE *f, struct genomeRangeTree *grt, long long *retHit, long long *retMiss, long long *retTotalBasesInHits, long long *retUniqueHitCount) /* Scan through sam file doing several things:counting how many reads hit and how many * miss target during mapping phase, copying those that hit to a little bed file, and * also defining regions covered in a genomeRangeTree. */ { samfile_t *sf = samopen(samIn, "r", NULL); bam_hdr_t *bamHeader = sam_hdr_read(sf); bam1_t one; ZeroVar(&one); int err; long long hit = 0, miss = 0, unique = 0, totalBasesInHits = 0; while ((err = sam_read1(sf, bamHeader, &one)) >= 0) { int32_t tid = one.core.tid; if (tid < 0) { ++miss; continue; } ++hit; if (one.core.qual > cdwMinMapQual) ++unique; char *chrom = bamHeader->target_name[tid]; // Approximate here... can do better if parse cigar. int start = one.core.pos; int size = one.core.l_qseq; int end = start + size; totalBasesInHits += size; boolean isRc = (one.core.flag & BAM_FREVERSE); char strand = (isRc ? '-' : '+'); if (start < 0) start=0; if (f != NULL) fprintf(f, "%s\t%d\t%d\t.\t0\t%c\n", chrom, start, end, strand); genomeRangeTreeAdd(grt, chrom, start, end); } if (err < 0 && err != -1) errnoAbort("samread err %d", err); samclose(sf); *retHit = hit; *retMiss = miss; *retTotalBasesInHits = totalBasesInHits; *retUniqueHitCount = unique; } void cdwReserveTempFile(char *path) /* Call mkstemp on path. This will fill in terminal XXXXXX in path with file name * and create an empty file of that name. Generally that empty file doesn't stay empty for long. */ { int fd = mkstemp(path); if (fd == -1) errnoAbort("Couldn't create temp file %s", path); if (fchmod(fd, 0664) == -1) errnoAbort("Couldn't change permissions on temp file %s", path); mustCloseFd(&fd); } void cdwIndexPath(struct cdwAssembly *assembly, char indexPath[PATH_LEN]) /* Fill in path to BWA/Bowtie index. */ { safef(indexPath, PATH_LEN, "/dev/shm/btData/%s", assembly->ucscDb); } void cdwAsPath(char *format, char path[PATH_LEN]) /* Convert something like "narrowPeak" in format to full path involving * encValDir/as/narrowPeak.as */ { safef(path, PATH_LEN, "%sas/%s.as", cdwValDataDir, format); } boolean cdwTrimReadsForAssay(char *fastqPath, char trimmedPath[PATH_LEN], char *assay) /* Look at assay and see if it's one that needs trimming. If so make a new trimmed * file and put file name in trimmedPath. Otherwise just copy fastqPath to trimmed * path and return FALSE. */ { if (sameString(assay, "long-RNA-seq")) { char cmd[3*PATH_LEN]; // Make up temp file name for poly-A trimmed file safef(trimmedPath, PATH_LEN, "%scdwFastqPolyFilterXXXXXX", cdwTempDir()); cdwReserveTempFile(trimmedPath); // Run cdwFastqPolyFilter on the new file then pass the output into BWA. safef(cmd, sizeof(cmd), "cdwFastqPolyFilter %s %s", fastqPath, trimmedPath); mustSystem(cmd); return TRUE; } else { strcpy(trimmedPath, fastqPath); return FALSE; } } void cdwCleanupTrimReads(char *fastqPath, char trimmedPath[PATH_LEN]) /* Remove trimmed sample file. Does nothing if fastqPath and trimmedPath the same. */ { if (!sameString(fastqPath, trimmedPath)) remove(trimmedPath); } void cdwAlignFastqMakeBed(struct cdwFile *ef, struct cdwAssembly *assembly, char *fastqPath, struct cdwValidFile *vf, FILE *bedF, double *retMapRatio, double *retDepth, double *retSampleCoverage, double *retUniqueMapRatio, char *assay) /* Take a sample fastq, run the aligner on it and then convert that file to a bed. * bedF and all the ret parameters can be NULL. */ { // Figure out BWA index char genoFile[PATH_LEN]; cdwIndexPath(assembly, genoFile); // Trim reads if need be char trimmedFile[PATH_LEN]; cdwTrimReadsForAssay(fastqPath, trimmedFile, assay); char *samName; samName = cloneString(rTempName(cdwTempDir(), "ewdSample1", ".sam")); char cmd[3*PATH_LEN]; // We used to use bwa backtrack here ("bwa aln"), but this mode does not allow mmap indices // BWA mem allows preloaded mmap'ed indices, but it is very slow to start, like bowtie2 // Also BWA mem and bowtie2 are local aligners, so their stats are very different and they do not seem // to have a "require 99% identity option", which is very, very strange. // So bowtie seemed like the best of both worlds, mmap and also global alignment // Max did various comparisons with 80 sample fastq files and the count of aligned reads were very very similar // for those files that have enough reads in them. Plots were in email to Jim/Clay. // -l 40 is seed length. Makes it a lot faster. Today's reads should be longer than 40 bp // -n 1 is the number of mismatches in the seed. lower this to 1 makes it a lot faster. // -mm activates mmap for the index, which is now in ramdisk // -S is for SAM output (so no more useless .sai temp files) // these options require a decently recent bowtie version, I used 1.2.2 safef(cmd, sizeof(cmd), "bowtie -l 40 -n 1 --mm --threads 3 %s %s -S > %s", genoFile, trimmedFile, samName); mustSystem(cmd); /* Scan sam file to calculate vf->mapRatio, vf->sampleCoverage and vf->depth. * and also to produce little bed file for enrichment step. */ struct genomeRangeTree *grt = genomeRangeTreeNew(); long long hitCount=0, missCount=0, uniqueHitCount, totalBasesInHits=0; scanSam(samName, bedF, grt, &hitCount, &missCount, &totalBasesInHits, &uniqueHitCount); verbose(1, "hitCount=%lld, missCount=%lld, totalBasesInHits=%lld, grt=%p\n", hitCount, missCount, totalBasesInHits, grt); if (retMapRatio) *retMapRatio = (double)hitCount/(hitCount+missCount); if (retDepth) *retDepth = (double)totalBasesInHits/assembly->baseCount * (double)vf->itemCount/vf->sampleCount; long long basesHitBySample = genomeRangeTreeSumRanges(grt); if (retSampleCoverage) *retSampleCoverage = (double)basesHitBySample/assembly->baseCount; if (retUniqueMapRatio) *retUniqueMapRatio = (double)uniqueHitCount/(hitCount+missCount); // Clean up and go home cdwCleanupTrimReads(fastqPath, trimmedFile); genomeRangeTreeFree(&grt); remove(samName); } struct cdwFastqFile *cdwFastqFileFromFileId(struct sqlConnection *conn, long long fileId) /* Get cdwFastqFile with given fileId or NULL if none such */ { char query[256]; sqlSafef(query, sizeof(query), "select * from cdwFastqFile where fileId=%lld", fileId); return cdwFastqFileLoadByQuery(conn, query); } struct cdwVcfFile *cdwVcfFileFromFileId(struct sqlConnection *conn, long long fileId) /* Get cdwVcfFile with given fileId or NULL if none such */ { char query[256]; sqlSafef(query, sizeof(query), "select * from cdwVcfFile where fileId=%lld", fileId); return cdwVcfFileLoadByQuery(conn, query); } static int mustMkstemp(char *template) /* Call mkstemp to make a temp file with name based on template (which is altered) * by the call to be the file name. Return unix file descriptor. */ { int fd = mkstemp(template); if (fd == -1) errnoAbort("Couldn't make temp file based on %s", template); return fd; } void cdwMakeTempFastqSample(char *source, int size, char dest[PATH_LEN]) /* Copy size records from source into a new temporary dest. Fills in dest */ { /* Make temporary file to save us a unique place in file system. */ safef(dest, PATH_LEN, "%scdwSampleFastqXXXXXX", cdwTempDir()); int fd = mustMkstemp(dest); close(fd); char command[3*PATH_LEN]; safef(command, sizeof(command), "fastqStatsAndSubsample %s /dev/null %s -smallOk -sampleSize=%d", source, dest, size); verbose(2, "command: %s\n", command); mustSystem(command); } void cdwMakeFastqStatsAndSample(struct sqlConnection *conn, long long fileId) /* Run fastqStatsAndSubsample, and put results into cdwFastqFile table. */ { struct cdwFastqFile *fqf = cdwFastqFileFromFileId(conn, fileId); if (fqf == NULL) { char *path = cdwPathForFileId(conn, fileId); char statsFile[PATH_LEN], sampleFile[PATH_LEN]; char command[3*PATH_LEN]; // Cut adapt on RNA seq files. safef(statsFile, PATH_LEN, "%scdwFastqStatsXXXXXX", cdwTempDir()); cdwReserveTempFile(statsFile); char dayTempDir[PATH_LEN]; safef(sampleFile, PATH_LEN, "%scdwFastqSampleXXXXXX", cdwTempDirForToday(dayTempDir)); cdwReserveTempFile(sampleFile); // For RNA seq files run on the fastqTrimmed output, otherwise run on the unaltered CDW file. safef(command, sizeof(command), "fastqStatsAndSubsample -sampleSize=%d -smallOk %s %s %s", cdwSampleTargetSize, path, statsFile, sampleFile); mustSystem(command); safef(command, sizeof(command), "gzip -c %s > %s.fastq.gz", sampleFile, sampleFile); mustSystem(command); strcat(sampleFile, ".fastq.gz"); fqf = cdwFastqFileOneFromRa(statsFile); fqf->fileId = fileId; fqf->sampleFileName = cloneString(sampleFile); cdwFastqFileSaveToDb(conn, fqf, "cdwFastqFile", 1024); remove(statsFile); freez(&path); } cdwFastqFileFree(&fqf); } struct cdwQaWigSpot *cdwMakeWigSpot(struct sqlConnection *conn, long long wigId, long long spotId) /* Create a new cdwQaWigSpot record in database based on comparing wig file to spot file * (specified by id's in cdwFile table). */ { /* Get valid files from fileIds and check format */ struct cdwValidFile *wigVf = cdwValidFileFromFileId(conn, wigId); if (!sameString(wigVf->format, "bigWig")) errAbort("%lld is not a bigWig file, is %s instead", wigId, wigVf->format); struct cdwValidFile *spotVf = cdwValidFileFromFileId(conn, spotId); if (!sameString(spotVf->format, "narrowPeak") && !sameString(spotVf->format, "broadPeak") && !sameString(spotVf->format, "bigBed")) errAbort("%lld is not a recognized peak type format, is %s", spotId, spotVf->format); /* Remove any old record for files. */ char query[256]; sqlSafef(query, sizeof(query), "delete from cdwQaWigSpot where wigId=%lld and spotId=%lld", wigId, spotId); sqlUpdate(conn, query); /* Figure out file names */ char *wigPath = cdwPathForFileId(conn, wigId); char *spotPath = cdwPathForFileId(conn, spotId); char statsFile[PATH_LEN]; safef(statsFile, PATH_LEN, "%scdwQaWigSpotXXXXXX", cdwTempDir()); cdwReserveTempFile(statsFile); char peakFile[PATH_LEN]; safef(peakFile, PATH_LEN, "%scdwQaWigSpotXXXXXX", cdwTempDir()); cdwReserveTempFile(peakFile); /* Convert narrowPeak input into a temporary bed4 file */ char command[3*PATH_LEN]; safef(command, sizeof(command), "bigBedToBed %s stdout | cut -f 1-4 > %s", spotPath, peakFile); mustSystem(command); /* Call on bigWigAverageOverBed on peaks */ safef(command, sizeof(command), "bigWigAverageOverBed %s %s /dev/null -stats=%s", wigPath, peakFile, statsFile); mustSystem(command); remove(peakFile); /* Parse out ra file, save it to database, and remove ra file. */ struct cdwQaWigSpot *spot = cdwQaWigSpotOneFromRa(statsFile); spot->wigId = wigId; spot->spotId = spotId; cdwQaWigSpotSaveToDb(conn, spot, "cdwQaWigSpot", 1024); spot->id = sqlLastAutoId(conn); /* Clean up and go home. */ cdwQaWigSpotFree(&spot); cdwValidFileFree(&wigVf); cdwValidFileFree(&spotVf); freez(&wigPath); freez(&spotPath); return spot; } struct cdwQaWigSpot *cdwQaWigSpotFor(struct sqlConnection *conn, long long wigFileId, long long spotFileId) /* Return wigSpot relationship if any we have in database for these two files. */ { char query[256]; sqlSafef(query, sizeof(query), "select * from cdwQaWigSpot where wigId=%lld and spotId=%lld", wigFileId, spotFileId); return cdwQaWigSpotLoadByQuery(conn, query); } struct cdwBamFile *cdwBamFileFromFileId(struct sqlConnection *conn, long long fileId) /* Get cdwBamFile with given fileId or NULL if none such */ { char query[256]; sqlSafef(query, sizeof(query), "select * from cdwBamFile where fileId=%lld", fileId); return cdwBamFileLoadByQuery(conn, query); } struct cdwBamFile * cdwMakeBamStatsAndSample(struct sqlConnection *conn, long long fileId, char sampleBed[PATH_LEN]) /* Run cdwBamStats and put results into cdwBamFile table, and also a sample bed. * The sampleBed will be filled in by this routine. */ { /* Remove any old record for file. */ char query[256]; sqlSafef(query, sizeof(query), "delete from cdwBamFile where fileId=%lld", fileId); sqlUpdate(conn, query); /* Figure out file names */ char *path = cdwPathForFileId(conn, fileId); char statsFile[PATH_LEN]; safef(statsFile, PATH_LEN, "%scdwBamStatsXXXXXX", cdwTempDir()); cdwReserveTempFile(statsFile); char dayTempDir[PATH_LEN]; safef(sampleBed, PATH_LEN, "%scdwBamSampleXXXXXX", cdwTempDirForToday(dayTempDir)); cdwReserveTempFile(sampleBed); /* Make system call to make ra and bed, and then another system call to zip bed.*/ char command[3*PATH_LEN]; safef(command, sizeof(command), "edwBamStats -sampleBed=%s -sampleBedSize=%d %s %s", sampleBed, cdwSampleTargetSize, path, statsFile); mustSystem(command); safef(command, sizeof(command), "gzip %s", sampleBed); mustSystem(command); strcat(sampleBed, ".gz"); /* Parse out ra file, save it to database, and remove ra file. */ struct cdwBamFile *ebf = cdwBamFileOneFromRa(statsFile); ebf->fileId = fileId; cdwBamFileSaveToDb(conn, ebf, "cdwBamFile", 1024); remove(statsFile); /* Clean up and go home. */ freez(&path); return ebf; } struct cdwVcfFile * cdwMakeVcfStatsAndSample(struct sqlConnection *conn, long long fileId, char sampleBed[PATH_LEN]) /* Run cdwVcfStats and put results into cdwVcfFile table, and also a sample bed. * The sampleBed will be filled in by this routine. */ { /* Remove any old record for file. */ char query[256]; sqlSafef(query, sizeof(query), "delete from cdwVcfFile where fileId=%lld", fileId); sqlUpdate(conn, query); /* Figure out file names */ char *path = cdwPathForFileId(conn, fileId); char statsFile[PATH_LEN]; safef(statsFile, PATH_LEN, "%scdwVcfStatsXXXXXX", cdwTempDir()); cdwReserveTempFile(statsFile); char dayTempDir[PATH_LEN]; safef(sampleBed, PATH_LEN, "%scdwVcfSampleXXXXXX", cdwTempDirForToday(dayTempDir)); cdwReserveTempFile(sampleBed); /* Make system call to make ra and bed, and then another system call to zip bed.*/ char command[3*PATH_LEN]; safef(command, sizeof(command), "cdwVcfStats -bed=%s %s %s", sampleBed, path, statsFile); mustSystem(command); safef(command, sizeof(command), "gzip %s", sampleBed); mustSystem(command); strcat(sampleBed, ".gz"); /* Parse out ra file, save it to database, and remove ra file. */ struct cdwVcfFile *vcf = cdwVcfFileOneFromRa(statsFile); vcf->fileId = fileId; cdwVcfFileSaveToDb(conn, vcf, "cdwVcfFile", 1024); remove(statsFile); /* Clean up and go home. */ freez(&path); return vcf; } char *cdwOppositePairedEndString(char *end) /* Return "1" for "2" and vice versa */ { if (sameString(end, "1")) return "2"; else if (sameString(end, "2")) return "1"; else if (sameString(end, "cell_barcode") || sameString(end, "sample_barcode")) return NULL; else { errAbort("Expecting 1 or 2, got %s in oppositeEnd", end); return NULL; } } struct cdwValidFile *cdwOppositePairedEnd(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf) /* Given one file of a paired end set of fastqs, find the file with opposite ends. */ { char *otherEnd = cdwOppositePairedEndString(vf->pairedEnd); if (otherEnd == NULL) return NULL; char query[1024]; sqlSafef(query, sizeof(query), "select cdwValidFile.* from cdwValidFile join cdwFile on cdwValidFile.fileId=cdwFile.id" " where experiment='%s' and submitDirId=%d and outputType='%s' and replicate='%s' " " and part='%s' and pairedEnd='%s' and itemCount=%lld and deprecated=''" , vf->experiment, ef->submitDirId, vf->outputType, vf->replicate, vf->part, otherEnd , vf->itemCount); struct cdwValidFile *otherVf = cdwValidFileLoadByQuery(conn, query); if (otherVf == NULL) return NULL; if (otherVf->next != NULL) errAbort("Multiple results from pairedEnd query %s", query); return otherVf; } struct cdwQaPairedEndFastq *cdwQaPairedEndFastqFromVfs(struct sqlConnection *conn, struct cdwValidFile *vfA, struct cdwValidFile *vfB, struct cdwValidFile **retVf1, struct cdwValidFile **retVf2) /* Return pair record if any for the two fastq files. */ { /* Sort the two ends. */ struct cdwValidFile *vf1 = NULL, *vf2 = NULL; if (sameString(vfA->pairedEnd, "1")) { vf1 = vfA; vf2 = vfB; } else { vf1 = vfB; vf2 = vfA; } if (retVf1 != NULL) *retVf1 = vf1; if (retVf2 != NULL) *retVf2 = vf2; /* See if we already have a record for these two. */ /* Return record for these two. */ char query[1024]; sqlSafef(query, sizeof(query), "select * from cdwQaPairedEndFastq where fileId1=%u and fileId2=%u", vf1->fileId, vf2->fileId); return cdwQaPairedEndFastqLoadByQuery(conn, query); } FILE *cdwPopen(char *command, char *mode) /* do popen or die trying */ { /* Because of bugs with popen(...,"r") and programs that use stdin otherwise * it's probably better to use Mark's pipeline library, but it is ever so * much harder to use... */ FILE *f = popen(command, mode); if (f == NULL) errnoAbort("Can't popen(%s, %s)", command, mode); return f; } boolean cdwOneLineSystemAttempt(char *command, char *line, int maxLineSize) /* Execute system command and return one line result from it in line */ { FILE *f = popen(command, "r"); boolean ok = FALSE; if (f != NULL) { char *result = fgets(line, maxLineSize, f); if (result != NULL) ok = TRUE; pclose(f); } else { errnoWarn("failed popen %s", command); } return ok; } void cdwOneLineSystemResult(char *command, char *line, int maxLineSize) /* Execute system command and return one line result from it in line */ { if (!cdwOneLineSystemAttempt(command, line, maxLineSize) ) errAbort("Can't get line from %s", command); } void cdwMd5File(char *fileName, char md5Hex[33]) /* call md5sum utility to calculate md5 for file and put result in hex format md5Hex * This ends up being about 30% faster than library routine md5HexForFile, * however since there's popen() weird interactions with stdin involved * it's not suitable for a general purpose library. Environment inside cdw * is controlled enough it should be ok. */ { char command[PATH_LEN + 16]; safef(command, sizeof(command), "md5sum %s", fileName); char line[2*PATH_LEN]; cdwOneLineSystemResult(command, line, sizeof(line)); memcpy(md5Hex, line, 32); md5Hex[32] = 0; } void cdwPokeFifo(char *fifoName) /* Send '\n' to fifo to wake up associated daemon */ { /* Sadly we loop through places it might be since it varies. It has to live somewhere * that web CGIs can poke is the problem. */ char *places[] = {"/data/www/userdata/", "/usr/local/apache/userdata/"}; int i; for (i=0; i 0) { // set timeout to refresh page (saving/restoring scroll position via cookie) printf("", msec); puts(""); // disable autorefresh when user is changing page settings puts(""); } else if (msec == 0) puts("clearTimeout(cdwRefresh);"); // Negative msec ignored } /***/ /* Navigation bar */ void cdwWebNavBarStart() /* Layout navigation bar */ { puts("
"); puts(""); puts("
"); } void cdwWebNavBarEnd() /* Close layout after navigation bar */ { puts("
"); } void cdwWebBrowseMenuItem(boolean on) /* Toggle visibility of 'Browse submissions' link on navigation menu */ { printf("", on ? "show" : "hide"); } void cdwWebSubmitMenuItem(boolean on) /* Toggle visibility of 'Submit data' link on navigation menu */ { printf("", on ? "show" : "hide"); } char *cdwRqlLookupField(void *record, char *key) /* Lookup a field in a tagStanza. */ { struct tagStanza *stanza = record; return tagFindVal(stanza, key); } boolean cdwRqlStatementMatch(struct rqlStatement *rql, struct tagStanza *stanza, struct lm *lm) /* Return TRUE if where clause and tableList in statement evaluates true for stanza. */ { struct rqlParse *whereClause = rql->whereClause; if (whereClause == NULL) return TRUE; else { struct rqlEval res = rqlEvalOnRecord(whereClause, stanza, cdwRqlLookupField, lm); res = rqlEvalCoerceToBoolean(res); return res.val.b; } } static void rBuildStanzaRefList(struct tagStorm *tags, struct tagStanza *stanzaList, struct rqlStatement *rql, struct lm *lm, int *pMatchCount, struct slRef **pList) /* Recursively add stanzas that match query to list */ { struct tagStanza *stanza; for (stanza = stanzaList; stanza != NULL; stanza = stanza->next) { if (rql->limit < 0 || rql->limit > *pMatchCount) { if (cdwRqlStatementMatch(rql, stanza, lm)) { refAdd(pList, stanza); *pMatchCount += 1; } if (stanza->children != NULL) rBuildStanzaRefList(tags, stanza->children, rql, lm, pMatchCount, pList); } } } void cdwCheckRqlFields(struct rqlStatement *rql, struct slName *tagFieldList) /* Make sure that rql query only includes fields that exist in tags */ { struct hash *hash = hashFromSlNameList(tagFieldList); rqlCheckFieldsExist(rql, hash, "cdwFileTags table"); hashFree(&hash); } struct slRef *tagStanzasMatchingQuery(struct tagStorm *tags, char *query) /* Return list of references to stanzas that match RQL query */ { struct rqlStatement *rql = rqlStatementParseString(query); struct slName *tagFieldList = tagStormFieldList(tags); cdwCheckRqlFields(rql, tagFieldList); slFreeList(&tagFieldList); int matchCount = 0; struct slRef *list = NULL; struct lm *lm = lmInit(0); rBuildStanzaRefList(tags, tags->forest, rql, lm, &matchCount, &list); rqlStatementFree(&rql); lmCleanup(&lm); return list; } struct cgiParsedVars *cdwMetaVarsList(struct sqlConnection *conn, struct cdwFile *ef) /* Return list of cgiParsedVars dictionaries for metadata for file. Free this up * with cgiParsedVarsFreeList() */ { struct cgiParsedVars *tagsList = cgiParsedVarsNew(ef->tags); struct cgiParsedVars *parentTags = NULL; char query[256]; sqlSafef(query, sizeof(query), "select tags from cdwMetaTags where id=%u", ef->metaTagsId); char *metaCgi = sqlQuickString(conn, query); if (metaCgi != NULL) { parentTags = cgiParsedVarsNew(metaCgi); tagsList->next = parentTags; freez(&metaCgi); } return tagsList; } static int gMatchCount = 0; static boolean gDoSelect = FALSE; static boolean gFirst = TRUE; static void rMatchesToRa(struct tagStorm *tags, struct tagStanza *list, struct rqlStatement *rql, struct lm *lm) /* Recursively traverse stanzas on list outputting matching stanzas as ra. */ { struct tagStanza *stanza; for (stanza = list; stanza != NULL; stanza = stanza->next) { if (rql->limit < 0 || rql->limit > gMatchCount) { if (stanza->children) rMatchesToRa(tags, stanza->children, rql, lm); else /* Just apply query to leaves */ { if (cdwRqlStatementMatch(rql, stanza, lm)) { ++gMatchCount; if (gDoSelect) { struct slName *field; for (field = rql->fieldList; field != NULL; field = field->next) { char *val = tagFindVal(stanza, field->name); if (val != NULL) printf("%s\t%s\n", field->name, val); } printf("\n"); } } } } } } static void printQuotedTsv(char *val) /* Print out tab separated value inside of double quotes. Escape any existing quotes with quotes. */ { putchar('"'); char c; while ((c = *val++) != 0) { if (c == '"') putchar(c); putchar(c); } putchar('"'); } static void rMatchesToCsv(struct tagStorm *tags, struct tagStanza *list, struct rqlStatement *rql, struct lm *lm) /* Recursively traverse stanzas on list outputting matching stanzas as * a comma separated values file. */ { struct tagStanza *stanza; for (stanza = list; stanza != NULL; stanza = stanza->next) { if (rql->limit < 0 || rql->limit > gMatchCount) // We are inside the acceptable limit { if (stanza->children) // Recurse till we have just leaves. rMatchesToCsv(tags, stanza->children, rql, lm); else /* Just apply query to leaves */ { if (cdwRqlStatementMatch(rql, stanza, lm)) { ++gMatchCount; if (gDoSelect) { struct slName *field; if (gFirst)// For the first stanza print out a header line. { char *sep = ""; gFirst = FALSE; for (field = rql->fieldList; field != NULL; field = field->next) { printf("%s%s", sep, field->name); sep = ","; } printf("\n"); } char *sep = ""; for (field = rql->fieldList; field != NULL; field = field->next) { fputs(sep, stdout); sep = ","; char *val = emptyForNull(tagFindVal(stanza, field->name)); // Check for embedded comma or existing quotes if (strchr(val, ',') == NULL && strchr(val, '"') == NULL) fputs(val, stdout); else { printQuotedTsv(val); } } printf("\n"); } } } } } } static void rMatchesToTsv(struct tagStorm *tags, struct tagStanza *list, struct rqlStatement *rql, struct lm *lm) /* Recursively traverse stanzas on list outputting matching stanzas as a tab separated values file. */ { struct tagStanza *stanza; for (stanza = list; stanza != NULL; stanza = stanza->next) { if (rql->limit < 0 || rql->limit > gMatchCount) // We are inside the acceptable limit { if (stanza->children) // Recurse till we have just leaves. rMatchesToTsv(tags, stanza->children, rql, lm); else /* Just apply query to leaves */ { if (cdwRqlStatementMatch(rql, stanza, lm)) { ++gMatchCount; if (gDoSelect) { struct slName *field; if (gFirst)// For the first stanza print out a header line. { gFirst = FALSE; printf("#"); char *sep = ""; for (field = rql->fieldList; field != NULL; field = field->next) { printf("%s%s", sep, field->name); sep = "\t"; } printf("\n"); } char *sep = ""; for (field = rql->fieldList; field != NULL; field = field->next) { char *val = naForNull(tagFindVal(stanza, field->name)); printf("%s%s", sep, val); sep = "\t"; } printf("\n"); } } } } } } void cdwPrintMatchingStanzas(char *rqlQuery, int limit, struct tagStorm *tags, char *format) /* Show stanzas that match query */ { struct dyString *dy = dyStringCreate("%s", rqlQuery); int maxLimit = 10000; if (limit > maxLimit) limit = maxLimit; struct rqlStatement *rql = rqlStatementParseString(dy->string); /* Get list of all tag types in tree and use it to expand wildcards in the query * field list. */ struct slName *allFieldList = tagStormFieldList(tags); slSort(&allFieldList, slNameCmpCase); rql->fieldList = wildExpandList(allFieldList, rql->fieldList, TRUE); /* Traverse tag tree outputting when rql statement matches in select case, just * updateing count in count case. */ gDoSelect = sameWord(rql->command, "select"); if (gDoSelect) rql->limit = limit; struct lm *lm = lmInit(0); if (sameString(format, "ra")) rMatchesToRa(tags, tags->forest, rql, lm); else if (sameString(format, "tsv")) rMatchesToTsv(tags, tags->forest, rql, lm); else if (sameString(format, "csv")) rMatchesToCsv(tags, tags->forest, rql, lm); if (sameWord(rql->command, "count")) printf("%d\n", gMatchCount); } void cdwPrintSlRefList(struct slRef *results, struct slName *fieldNames, char *format, int limit) /* Print a linked list of results in ra, tsv, or csv format. Each result should be a list of * slPair key/values. */ { int maxLimit = 10000; if (limit > maxLimit) limit = maxLimit; int matchCount = 0; if (sameString(format, "csv")) { // Write csv header struct slName *fieldName = fieldNames; char *sep = ""; while (fieldName != NULL) { printf("%s%s", sep, fieldName->name); sep = ","; fieldName = fieldName->next; } printf("\n"); } if (sameString(format, "tsv")) { // Write tsv header struct slName *fieldName = fieldNames; char *sep = ""; printf("#"); while (fieldName != NULL) { printf("%s%s", sep, fieldName->name); sep = "\t"; fieldName = fieldName->next; } printf("\n"); } struct slRef *result = results; while (result != NULL) { if (++matchCount > limit) break; struct slPair *keyval = result->val; char *sep = ""; while (keyval != NULL) { char *val = keyval->val; if (sameString(format, "ra") && !isEmpty(val)) printf("%s\t%s\n", keyval->name, val); if (sameString(format, "csv")) { printf("%s", sep); sep = ","; val = emptyForNull(val); // Check for embedded comma or existing quotes if (strchr(val, ',') == NULL && strchr(val, '"') == NULL) printf("%s", val); else { printQuotedTsv(val); } } if (sameString(format, "tsv")) { val = naForNull(val); printf("%s%s", sep, val); sep = "\t"; } keyval = keyval->next; } printf("\n"); result = result->next; } } #ifdef NOT_CURRENTLY_USED static struct dyString *getLoginBits(struct cart *cart) /* Get a little HTML fragment that has login/logout bit of menu */ { /* Construct URL to return back to this page */ char *command = cartUsualString(cart, "cdwCommand", "home"); char *sidString = cartSidUrlString(cart); char returnUrl[PATH_LEN*2]; safef(returnUrl, sizeof(returnUrl), "../cgi-bin/cdwWebBrowse?cdwCommand=%s&%s", command, sidString ); char *encodedReturn = cgiEncode(returnUrl); /* Write a little html into loginBits */ struct dyString *loginBits = dyStringNew(0); dyStringAppend(loginBits, "Login"); } else { dyStringPrintf(loginBits, "../cgi-bin/hgLogin?hgLogin.do.displayLogout=1&returnto=%s&%s", encodedReturn, sidString); dyStringPrintf(loginBits, "\" id=\"logoutLink\">Logout %s", userName); if (loginUseBasicAuth()) wikiFixLogoutLinkWithJs(); } /* Clean up and go home */ freez(&encodedReturn); return loginBits; } #endif char *cdwHeadTagDependencies(struct cart *cart, boolean makeAbsolute) /* Return page head dependencies string. This is content that actually appears at the top * of the page, in the head tag. Optionally make links point to absolute URLs instead of relative. */ { // page header dependencies html is in a stringified .h file struct dyString *dy = dyStringNew(4*1024); dyStringPrintf(dy, #include "cdwHeadTagDependencies.h" ); char *headStr = cloneString(dy->string); if (!makeAbsolute) return headStr; char *headStr2 = replaceChars(headStr, "../", "/"); freez(&headStr); return headStr2; } char *cdwPageHeader(struct cart *cart, boolean makeAbsolute) /* Return page header string. This is content that actually appears at the top * of the page, like menu stuff. Optionally make links point to absolute URLs instead of relative. */ { // page header html is in a stringified .h file struct dyString *dy = dyStringNew(4*1024); dyStringPrintf(dy, #include "cdwPageHeader.h" ); char *menubarStr = menuBarAddUiVars(dy->string, "/cgi-bin/cdw", cartSidUrlString(cart)); if (!makeAbsolute) return menubarStr; char *menubarStr2 = replaceChars(menubarStr, "../", "/"); freez(&menubarStr); return menubarStr2; } char *cdwPageFooter(struct cart *cart, boolean makeAbsolute) /* Return page footer string. This is content that appears in the page footer, like * links to other institutions etc. Optionally make any relative URLs into absolute * URLs. */ { // page footer html is in a stringified .h file struct dyString *dy = dyStringNew(4*1024); dyStringPrintf(dy, #include "cdwPageFooter.h" ); char *menubarStr = menuBarAddUiVars(dy->string, "/cgi-bin/cdw", cartSidUrlString(cart)); if (!makeAbsolute) return menubarStr; char *menubarStr2 = replaceChars(menubarStr, "../", "/"); freez(&menubarStr); return menubarStr2; } char *cdwLocalMenuBar(struct cart *cart, boolean makeAbsolute) /* Return menu bar string. Optionally make links in menubar to point to absolute URLs, not relative. */ { // menu bar html is in a stringified .h file struct dyString *dy = dyStringNew(4*1024); dyStringPrintf(dy, #include "cdwNavBar.h" ); char *menubarStr = menuBarAddUiVars(dy->string, "/cgi-bin/cdw", cartSidUrlString(cart)); if (!makeAbsolute) return menubarStr; char *menubarStr2 = replaceChars(menubarStr, "../", "/"); freez(&menubarStr); return menubarStr2; } char *fileExtFromFormat(char *format) /* return file extension given the cdwFile format as defined in cdwValid.c. Result has to be freed */ { if (sameWord(format, "vcf")) return cloneString(".vcf.gz"); if (sameWord(format, "fasta")) return cloneString(".fa.gz"); if (sameWord(format, "fastq")) return cloneString(".fastq.gz"); if (sameWord(format, "unknown")) return cloneString(""); return catTwoStrings(".", format); }