567bb801c1502dd59b886e1ed10e76bcaf72cca4
jcasper
  Wed Dec 18 13:33:36 2019 -0800
Adding most recent validation stats to cdwCheckValidation status, per wrangler request (no ticket)

diff --git src/hg/cirm/cdw/cdwCheckValidation/cdwCheckValidation.c src/hg/cirm/cdw/cdwCheckValidation/cdwCheckValidation.c
index 56752d6..5b5cf37 100644
--- src/hg/cirm/cdw/cdwCheckValidation/cdwCheckValidation.c
+++ src/hg/cirm/cdw/cdwCheckValidation/cdwCheckValidation.c
@@ -1,378 +1,409 @@
 /* cdwCheckValidation - Check if a cdwSubmit validation step has completed. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "cdw.h"
 #include "cdwLib.h"
 #include "obscure.h"
 
 // Command-line option state, filled in by main() before any work is done.
 // gSubmitId: submission to inspect; it stays -1 unless -submitId is given, and -1 makes
 // cdwCheckValidation() look up the latest submission for the supplied email instead.
 int gSubmitId = -1; 
 // gWait: set by -wait; when TRUE cdwCheckValidation() calls waitLoop() before reporting.
 boolean gWait = FALSE; 
 // gLong: set by -long; when TRUE the status command also prints one line per job.
 boolean gLong = FALSE; 
 
 
 void usage()
 /* Explain usage and exit.  The whole help text is a single string built from adjacent
  * literals, so every literal that continues a sentence must end with a space; otherwise
  * the words on either side of the line break run together in the printed help. */
 {
 errAbort(
   "cdwCheckValidation - Check if a cdwSubmit validation step has completed and print some \n"
   "\t\t     file metrics for the submission. Returns 0 if the submission has completed \n"
   "\t\t     and -1 otherwise. \n"
   "usage:\n"
   "\tcdwCheckValidation command user@email.address outputFile\n"
   "commands are one of:\n"
   "   status - print information\n"
   "   failed - list failed files\n"
   "   retry - rerun validation on failed files\n"
   "options:\n"
   "\t-submitId - Over ride the auto selection and use this specific id, this ignores the email "
   "provided. \n"
   "\t-wait - Wait until all files have been processed by cdwQaAgent, check every 5 seconds or "
   "so.\n"
   "\t-long - Prints file-by-file status as well as overall status. \n");
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
     {"submitId", OPTION_STRING}, 
     {"wait", OPTION_BOOLEAN}, 
     {"long", OPTION_BOOLEAN}, 
     {NULL, 0},
 };
 
 
 int getSubmitId(FILE *f, char *cdwUser, char *command)
 /* Use the user email to get the userId out of cdwUser, then use the cdwUser id to key into 
  * cdwSubmit and grab the last submission */ 
 {
 struct sqlConnection *conn = sqlConnect("cdw"); 
 // Query the cdwUser table to find the userId associated with the email address. 
 char query[1024]; 
 sqlSafef(query, sizeof(query), "select * from cdwUser where email = '%s';", cdwUser);
 struct cdwUser *submitter = cdwUserLoadByQuery(conn,query); 
 if (submitter == NULL)
     uglyAbort("There are no users associated with the provided email %s",cdwUser); 
 
 // Use the userId to key into cdwSubmit and get the last submission id. 
 sqlSafef(query, sizeof(query), "select * from cdwSubmit where userId=%i order by id desc limit 1",
 	    submitter->id);  
 struct cdwSubmit *submission = cdwSubmitLoadByQuery(conn, query);
 if (submission == NULL) uglyAbort("There are no submissions associated with the provided id.");
 
 // Print out stats. 
 if (startsWith("status",command)) 
     {
     fprintf(f,"User email:\t%s\nUser id:\t%i\n",submitter->email, submitter->id); 
     fprintf(f,"Submission id:\t%i\n",submission->id); 
     }
 sqlDisconnect(&conn); 
 
 return submission->id; 
 }
 
 int secondsUsed(int submitId)
 /* Return the difference between the largest nonzero endTime and the smallest nonzero
  * startTime among the cdwJob rows of the given submission.  The caller treats the
  * result as a number of seconds. */
 {
 char sql[1024];
 struct sqlConnection *cdwConn = sqlConnect("cdw");
 
 // Earliest start; rows whose startTime is 0 are left out of the minimum.
 sqlSafef(sql, sizeof(sql),
     "select min(startTime) from cdwJob where submitId = '%i' and startTime != 0", submitId);
 int firstStart = sqlQuickNum(cdwConn, sql);
 
 // Latest end; rows whose endTime is 0 are left out of the maximum.
 sqlSafef(sql, sizeof(sql),
     "select max(endTime) from cdwJob where submitId = '%i' and endTime != 0", submitId);
 int lastEnd = sqlQuickNum(cdwConn, sql);
 
 sqlDisconnect(&cdwConn);
 
 int elapsed = lastEnd - firstStart;
 return elapsed;
 }
 
 void printEnrichmentStats(FILE *f, struct cdwValidFile *validFile)
 // Print " Enrich" followed by a comma-separated list of "name: value" enrichment figures
 // for a valid file.  The targets reported are the file's enrichedIn target plus chrX, chrY
 // and chrM for the file's assembly.  Only the " Enrich" label is printed when the file has
 // no ucscDb, the assembly is not found, or no enrichment targets are found.
 // Every exit path releases the database connection opened here.
 {
 struct sqlConnection *conn = sqlConnect("cdw");
 char query[1024];
 fprintf(f, " Enrich");
 // Nothing more to print if no ucscDb is specified.
 if (!validFile->ucscDb)
     {
     sqlDisconnect(&conn);
     return;
     }
 
 // Get the assembly id, default to hg19.
 // startsWith() takes the prefix as its first argument (see its other uses in this file),
 // so this test asks whether " " begins with the stored value.
 // NOTE(review): the argument order looks swapped -- confirm the intent before changing it.
 char ucscDb[128] = "hg19";
 if (!startsWith(validFile->ucscDb, " "))
     safef(ucscDb, sizeof(ucscDb), "%s", validFile->ucscDb);
 sqlSafef(query, sizeof(query), "select * from cdwAssembly where ucscDb = '%s'", ucscDb);
 struct cdwAssembly *assembly = cdwAssemblyLoadByQuery(conn, query);
 if (!assembly)
     {
     sqlDisconnect(&conn);
     return;
     }
 
 // Get the list of enrichment targets for this valid file, default to exon.
 // NOTE(review): same startsWith() argument order as above -- confirm.
 char enrichedIn[128] = "exon";
 if (!startsWith(validFile->enrichedIn," "))
     safef(enrichedIn, sizeof(enrichedIn), "%s", validFile->enrichedIn);
 sqlSafef(query, sizeof(query), "select * from cdwQaEnrichTarget where assemblyId = %i and name in"
         "('%s', 'chrX', 'chrY', 'chrM')", assembly->id, enrichedIn);
 struct cdwQaEnrichTarget *targetList = cdwQaEnrichTargetLoadByQuery(conn, query);
 if (!targetList)
     {
     sqlDisconnect(&conn);
     return;
     }
 
 // Loop through the list of enrichment targets and print stats, separated by commas.
 struct cdwQaEnrichTarget *target;
 for (target = targetList; target != NULL; target = target->next)
     {
     sqlSafef(query, sizeof(query), "select enrichment from cdwQaEnrich where fileId = %i and"
                 " qaEnrichTargetId = %i", validFile->fileId, target->id);
     double enrichment = sqlQuickDouble(conn, query);
     fprintf(f," %s: %0.3f", target->name, enrichment);
     if (target->next != NULL)
         fprintf(f,",");
     }
 sqlDisconnect(&conn);
 }
 
 void printTimeStats(FILE *f, struct cdwJob *jobList, int finJobs)
 // Print the time used so far and an estimate of the time remaining, each as hours,
 // minutes and seconds.  The estimate is (time so far / finished jobs) * unfinished jobs.
 // When no job has finished yet there is nothing to average over, so the remaining time
 // is reported as "unknown" instead of dividing by zero.
 // jobList must be non-empty; finJobs is the number of finished jobs in it.
 {
 // Check the list before its first element is dereferenced below.
 int totalJobs = slCount(jobList);
 assert(totalJobs > 0);
 
 int timeSoFar = secondsUsed(jobList->submitId);
 // secondsUsed() subtracts the earliest start from the latest end; do not print a
 // negative duration if that difference comes out below zero.
 if (timeSoFar < 0)
     timeSoFar = 0;
 
 // Break the elapsed seconds into hours, minutes and seconds.
 int hours = timeSoFar / 3600;
 int minutes = (timeSoFar - (hours*3600)) / 60;
 int seconds = (timeSoFar - (hours*3600) - (minutes*60));
 fprintf(f,"Time so far:\t%ih %im %is\n", hours, minutes, seconds);
 
 if (finJobs <= 0)
     {
     fprintf(f,"Time remaining:\tunknown\n");
     return;
     }
 
 // Determine average time per finished job, multiply it by the number of jobs remaining.
 double avgTimeSoFar = (double) timeSoFar / (double) finJobs;
 int jobsRemaining = totalJobs - finJobs;
 double estTimeRemaining = avgTimeSoFar * jobsRemaining;
 
 int eHours = estTimeRemaining/3600;
 int eMinutes = (estTimeRemaining - (eHours*3600))/ 60;
 int eSeconds = (estTimeRemaining - (eHours*3600) - (eMinutes*60));
 fprintf(f,"Time remaining:\t%ih %im %is\n", eHours, eMinutes, eSeconds);
 }
 		
 void printValidFileStats(FILE *f, char *fileId)
 // Print one status line for a file whose validation job passed.  fileId is the file id
 // as text; it is used to look the file up in cdwValidFile.  Three outcomes:
 //   - no cdwValidFile row: a "corrupted" line is printed
 //   - any of mapRatio, uniqueMapRatio, sampleCoverage is zero: only the format is printed
 //   - otherwise: format, the three metrics and the enrichment stats are printed
 // Enrichment stats are handled in their own function.
 {
 struct sqlConnection *conn = sqlConnect("cdw");
 char query[1024];
 sqlSafef(query, sizeof(query), "select * from cdwValidFile where fileId = %s", fileId);
 struct cdwValidFile *validFile = cdwValidFileLoadByQuery(conn, query);
 // The connection is only needed for the lookup above, so it is released exactly once here.
 sqlDisconnect(&conn);
 
 if (!validFile)
     // Usually indicates an errant cdwAddQaJob in some code some where.
     {
     fprintf(f,"File id: %s | Status: corrupted | The file passed validation yet has no entry in"
             " cdwValidFile |  \n", fileId);
     return;
     }
 
 // Files without all three mapping metrics get the short form of the line.
 if (!validFile->mapRatio || !validFile->uniqueMapRatio || !validFile->sampleCoverage)
     {
     fprintf(f,"File id: %s | Valid: Yes | Format: %s |\n", fileId, validFile->format);
     return;
     }
 
 fprintf(f,"File id: %s | Valid: Yes | Format: %s | Map ratio (mr): %f | Unique mr: %f| "
                 "Coverage: %f |", fileId, validFile->format, validFile->mapRatio,
                 validFile->uniqueMapRatio, validFile->sampleCoverage);
 printEnrichmentStats(f, validFile);
 fprintf(f," |\n");
 return;
 }
     
 void printSubmissionStatistics(FILE *f, struct cdwJob *jobList, char *command)
 /* Take a list of jobs and gather stats. Different stats are printed to file f depending on the
  * command given:
  *   status - overall counts, timing and completion state; with -long also one line per job
  *   failed - one line per failed job, then the number of failed jobs
  *   retry  - every failed job is handed to cdwAddQaJob again, then the number restarted
  * A job counts as queued when startTime is 0, as running when it has a startTime but no
  * endTime, and as finished when both are set; a finished job passed if returnCode is 0.
  * Aborts if jobList is empty. */
 {
 if (!jobList)
     uglyAbort("There are no jobs on the joblist");
 
 // Keep track of all the possible job types.
 int finJobs = 0, workingJobs = 0, queuedJobs = 0, failedJobs = 0, validFiles = 0;
 
 if (startsWith("status", command) && gLong)
     fprintf(f,"Printing file statistics...\n");
 
 // Loop through all jobs and gather stats.
+// latestRuns maps a file id string to the finished job with the greatest endTime seen
+// so far for that file, so a file that was run several times is judged by its last run.
+struct hash *latestRuns = hashNew(0);
 struct cdwJob *job;
 for (job = jobList;; job = job->next)
     {
     // The file id is whatever follows the "cdwQaAgent " prefix on the job's command line.
     char *prefix = "cdwQaAgent ";
     assert(startsWith(prefix, job->commandLine));
     char *fileIdString = job->commandLine + strlen(prefix);
     long long fileId = sqlLongLong(fileIdString);
     if (job->startTime > 0)
         {
         if (job->endTime > 0)
         // Finished jobs.
             {
             ++finJobs;
+
+            // Keep track of the most recent job found for this fileId
+            struct hashEl *fileEl = hashStore(latestRuns, fileIdString);
+            if (fileEl->val == NULL)
+                fileEl->val = job;
+            else
+                {
+                struct cdwJob *prevFileJob = (struct cdwJob*) fileEl->val;
+                if (prevFileJob->endTime < job->endTime)
+                    fileEl->val = job;
+                }
+
             if (job->returnCode == 0)
             // Jobs that passed validation.
                 {
                 ++validFiles;
                 if (startsWith("status", command) && gLong)
                     printValidFileStats(f, fileIdString);
                 }
             else
             // Jobs that failed validation.
                 {
                 ++failedJobs;
                 if (startsWith("failed", command))
                     fprintf(f,"File id: %s | Valid: No | Error: %s |\n", fileIdString, job->stderr);
                 if (startsWith("status", command) && gLong)
                    fprintf(f,"File id: %s | Valid: No | Error: %s |\n", fileIdString, job->stderr);
                 if (startsWith("retry",command))
                     {
                     fprintf(f,"Rerunning file %s \n", fileIdString);
                     struct sqlConnection *conn = sqlConnect("cdw");
                     cdwAddQaJob(conn, fileId, job->submitId);
                     sqlDisconnect(&conn);
                     }
                 }
             }
         else
         // Working jobs.
             {
             ++workingJobs;
             if (startsWith("status", command) && gLong)
                 fprintf(f,"File id: %s | Valid: running | Run time: %i | \n", fileIdString, (int)((unsigned)time(NULL) - job->startTime));
             }
         }
     else
     // Queued jobs.
         {
         ++queuedJobs;
         if (startsWith("status", command) && gLong)
             fprintf(f,"File id: %s | Valid: queued | \n", fileIdString);
         }
     // jobList is known to be non-empty, so stop after the last element has been handled.
     if (!job->next)
         break;
     }
 
 // Print out overall submission stats.
 
 if (startsWith("status", command))
     {
     fprintf(f,"Total files:\t%i\n",slCount(jobList));
     fprintf(f,"Finished validation:\t%i\n", finJobs);
     fprintf(f,"Passed validation:\t\e[1;32m%i\e[0m\n",  validFiles);
     fprintf(f,"Failed validation:\t\e[1;31m%i\e[0m\n",  failedJobs);
+
+    // Count each file once, by the outcome of its most recent finished run.
+    int passedMostRecent = 0, failedMostRecent = 0;
+    struct hashEl *file;
+    struct hashCookie cookie = hashFirst(latestRuns);
+    while ((file = hashNext(&cookie)) != NULL)
+        {
+        // Named latestJob so that it does not shadow the loop variable 'job' above.
+        struct cdwJob *latestJob = (struct cdwJob*) file->val;
+        if (latestJob->returnCode == 0)
+            passedMostRecent++;
+        else
+            failedMostRecent++;
+        }
+
+    // Passed counts are shown in green (1;32) and failed counts in red (1;31), matching
+    // the "Passed validation" / "Failed validation" lines above.
+    fprintf(f,"Unique files:\t%d\n", passedMostRecent+failedMostRecent);
+    fprintf(f,"Files that passed their most recent run:\t\e[1;32m%d\e[0m\n", passedMostRecent);
+    fprintf(f,"Files that failed their most recent run:\t\e[1;31m%d\e[0m\n", failedMostRecent);
     fprintf(f,"Jobs in progress:\t%i\n", workingJobs);
     fprintf(f,"Jobs queued:\t%i\n", queuedJobs);
     printTimeStats(f, jobList, finJobs);
     if (slCount(jobList) == finJobs)
         fprintf(f,"Status:\t\e[1;32mCompleted\e[0m\n");
     else
         fprintf(f,"Status:\tIn progress\n");
 
     }
 if (startsWith("retry", command))
     fprintf(f,"Started revalidating %i files.\n", failedJobs);
 if (startsWith("failed", command))
     fprintf(f,"%i files failed validation.\n", failedJobs);
+
+hashFree(&latestRuns);
 }
     
 
 void waitLoop(int submitId)
 /* Hang out and wait for all files in the submission to pass through cdwQaAgent.
  * Every 5 seconds the number of cdwJob rows for the submission is compared with the
  * number of those rows that have an endTime; the function returns once they are equal.
  * Aborts if the submission has no cdwJob rows at all.
  * Only counts are fetched on each poll, so nothing is allocated per iteration no matter
  * how long the wait lasts. */
 {
 for (;;)
     {
     struct sqlConnection *conn = sqlConnect("cdw");
     char query[1024];
 
     // How many cdwJob entries does the submission have?
     sqlSafef(query, sizeof(query), "select count(*) from cdwJob where submitId = '%i'", submitId);
     int totalJobs = sqlQuickNum(conn, query);
     if (totalJobs <= 0) uglyAbort("The submission has no entries in cdwJob.\nThere are several "
                     "possibilities as to why; the submission could be corrupted, submitted before\n"
                     " submitId's were implemented or the submission is still copying files and has "
                     "not started validation");
 
     // Determine how many of those cdwJob entries have completed.
     sqlSafef(query, sizeof(query), "select count(*) from cdwJob where submitId = '%i'"
                 " and endTime > 0", submitId);
     int finJobs = sqlQuickNum(conn, query);
 
     sqlDisconnect(&conn);
     if (totalJobs == finJobs)
         break;
     sleep(5);
     }
 }
 
 void cdwCheckValidation(char *command, char *cdwUser, char *outputFile)
 /* cdwCheckValidation - Check if a submission has completed validation.
  * Writes the report for command ("status", "failed" or "retry") to outputFile, then exits
  * the process with 0 if every cdwJob row of the submission has an endTime and with -1
  * otherwise; this function does not return.
  * The submission is gSubmitId when -submitId was given, otherwise the latest submission
  * of the user with email cdwUser.
  * With -wait the waiting is done BEFORE the job list is loaded, so the report describes
  * the jobs as they are after the wait rather than as they were when the program started. */
 {
 FILE *f = mustOpen(outputFile,"w");
 
 // If no submitId is given grab the last submission associated with the users email address.
 int submitId = gSubmitId;
 if (submitId == -1)
     submitId = getSubmitId(f, cdwUser, command);
 
 // waitLoop() uses its own connections and aborts if the submission has no jobs.
 if (gWait) waitLoop(submitId);
 
 // Connect only now, so that no connection is left idle for the length of the wait.
 struct sqlConnection *conn = sqlConnect("cdw");
 
 // Grab all cdwJob entries with the submitId.
 char query[1024];
 sqlSafef(query, sizeof(query), "select * from cdwJob where submitId = '%i';", submitId);
 struct cdwJob *jobList = cdwJobLoadByQuery(conn, query);
 
 if (!jobList) uglyAbort("The submission has no entries in cdwJob.\nThere are several "
                     "possibilities as to why; the submission could be corrupted, submitted before\n"
                     " submitId's were implemented or the submission is still copying files and has "
                     "not started validation");
 int totalJobs = slCount(jobList);
 assert(totalJobs > 0);
 
 // Count the jobs that have finished.
 sqlSafef(query, sizeof(query), "select count(*) from cdwJob where submitId = '%i' and endTime > 0",
             submitId);
 int finJobs = sqlQuickNum(conn, query);
 sqlDisconnect(&conn);
 
 // Go through the jobList, gather and print statistics.
 printSubmissionStatistics(f, jobList, command);
 
 // Check if the submission has finished validation.  exit() flushes and closes f.
 if (finJobs == totalJobs)
     exit(0);
 else
     exit(-1);
 }
 
 int main(int argc, char *argv[])
 /* Parse the command line: options go into the g* globals, and the three positional
  * arguments (command, email, output file) are passed on to cdwCheckValidation. */
 {
 optionInit(&argc, argv, options);
 
 // Option values; gSubmitId keeps its current value when -submitId is absent.
 gSubmitId = optionInt("submitId", gSubmitId);
 gLong = optionExists("long");
 gWait = optionExists("wait");
 
 // Program name plus exactly three positional arguments.
 const int expectedArgc = 4;
 if (argc != expectedArgc)
     usage();
 
 char *command = argv[1];
 char *email = argv[2];
 char *outFile = argv[3];
 cdwCheckValidation(command, email, outFile);
 return 0;
 }