4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/hg/encode3/validateManifest/validateManifest.c src/hg/encode3/validateManifest/validateManifest.c index 9454291..a4f6189 100644 --- src/hg/encode3/validateManifest/validateManifest.c +++ src/hg/encode3/validateManifest/validateManifest.c @@ -1,960 +1,960 @@ /* validate ENCODE3 manifest.txt creating output validated.txt */ /* Copyright (C) 2014 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "portable.h" #include "md5.h" #include "hex.h" #include "sqlNum.h" #include "encode3/encode3Valid.h" #include "gff.h" char *version = "1.9"; char *workingDir = "."; char *encValData = "encValData"; char *ucscDb = NULL; char *validateFilesPath = ""; boolean quickMd5sum = FALSE; // Just for development testing, do not use void usage() /* Explain usage and exit. */ { errAbort( "validateManifest v%s - Validates the ENCODE3 manifest.txt file.\n" " Calls validateFiles on each file in the manifest.\n" " Exits with a zero status for no errors detected and non-zero for errors.\n" " Writes Error messages to stderr\n" "usage:\n" " validateManifest\n" "\n" " -dir=workingDir, defaults to the current directory.\n" " -encValData=encValDataDir, relative to workingDir, defaults to %s.\n" "\n" " Input files in the working directory: \n" " manifest.txt - current input manifest file\n" " validated.txt - input from previous run of validateManifest\n" "\n" " Output file in the working directory: \n" " validated.txt - results of validated input\n" "\n" , version, encValData ); } static struct optionSpec options[] = { {"dir", OPTION_STRING}, {"encValData", OPTION_STRING}, {"quickMd5sum", OPTION_BOOLEAN}, // Testing option, user should not use {"-help", OPTION_BOOLEAN}, {NULL, 0}, }; struct slRecord /* List of tab-parsed records. */ { struct slRecord *next; /* Next in list. */ char *row; /* Allocated at run time to length of string. */ char **words; /* Array allocated dynamically */ }; int readManifest(char *fileName, struct slRecord **pFields, struct slRecord **pAllRecs ) /* Read in the manifest file format into memory structures */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); struct slRecord *allRecs = NULL; char *row; char **fields = NULL; char **words = NULL; int fieldCount = 0; ////verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, lf->fileName); int fieldNameRowsCount = 0; while (lineFileNext(lf, &row, NULL)) { //uglyf("%s\n", row); // DEBUG REMOVE if ( startsWith("#file_name", row) || startsWith("#file" , row) || // catch some misspellings like #filename startsWith("#ucsc_db" , row)) { if ( fieldNameRowsCount == 0) { ++fieldNameRowsCount; // grab fieldnames from metadata char *metaLine = cloneString(row); //uglyf("%s\n", metaLine); // DEBUG REMOVE ++metaLine; // skip over the leading # char fieldCount = chopByChar(metaLine, '\t', NULL, 0); AllocArray(fields,fieldCount); fieldCount = chopByChar(metaLine, '\t', fields, fieldCount); /* DEBUG for (i=0; i<fieldCount; ++i) { uglyf("field #%d = [%s]\n", i, fields[i]); // DEBUG REMOVE } */ struct slRecord *meta = NULL; AllocVar(meta); meta->row = metaLine; meta->words = fields; if (pFields) *pFields = meta; } else { errAbort("Found comment line listing field names more than once."); } } else if (startsWith("#",row)) { // ignore other comment lines? } else { char *line = cloneString(row); int n = 0; AllocArray(words,fieldCount+1); n = chopByChar(line, '\t', words, fieldCount+1); if (n != fieldCount) { errAbort("Error [file=%s, line=%d]: found %d columns, expected %d [%s]" , lf->fileName, lf->lineIx, n, fieldCount, row); } struct slRecord *rec = NULL; AllocVar(rec); rec->row = line; rec->words = words; slAddHead(&allRecs, rec); } } slReverse(&allRecs); if (pAllRecs) *pAllRecs = allRecs; if (fieldNameRowsCount == 0) errAbort("Expected 1st line to contain a comment line listing field names."); lineFileClose(&lf); return fieldCount; } struct hash *makeFileNameHash(struct slRecord *recs, int fileNameIndex) /* make a hash of all records by fileName */ { struct hash *hash = newHash(12); struct slRecord *rec = NULL; for(rec = recs; rec; rec = rec->next) { if (hashLookup(hash, rec->words[fileNameIndex])) errAbort("duplicate file_name found: %s", rec->words[fileNameIndex]); hashAdd(hash, rec->words[fileNameIndex], rec); } return hash; } char *getAs(char *asFileName) /* Get full .as path */ { char asPath[256]; safef(asPath, sizeof asPath, "%s/as/%s", encValData, asFileName); return cloneString(asPath); } char *getGenome(char *fileName) /* Get genome, e.g. hg19 */ { // TODO this could use some more development // but start with something very simple for now // such as assuming that the genome is found // as the prefix in the fileName path. // Maybe in future can pull this from the hub.txt? // ucscDb will be set to the value in the optional column "ucsc_db" char genome[256] = ""; if (ucscDb) { safef(genome, sizeof genome, "%s", ucscDb); } else { char *slash = strchr(fileName, '/'); if (!slash) errAbort("Expected to find genome in file_name prefix."); safencat(genome, sizeof genome, fileName, slash - fileName); } if ( !sameString(genome, "dm3") && !sameString(genome, "dm6") && !sameString(genome, "ce10") && !sameString(genome, "hg19") && !sameString(genome, "hg20") && !sameString(genome, "hg38") && !sameString(genome, "mm9") && !sameString(genome, "mm10") ) errAbort("unknown genome %s", genome); return cloneString(genome); } char *getChromInfo(char *fileName) /* Get path to chromInfo file for fileName */ { char *genome = getGenome(fileName); char chromInfo[256]; safef(chromInfo, sizeof chromInfo, "%s/%s/chrom.sizes", encValData, genome); return cloneString(chromInfo); } char *getTwoBit(char *fileName) /* Get path to twoBit file for fileName */ { // TODO this could use some more development // Maybe in future can download this from one of our servers? char *genome = getGenome(fileName); char twoBit[256]; safef(twoBit, sizeof twoBit, "%s/%s/%s.2bit", encValData, genome, genome); return cloneString(twoBit); } char *getBamBai(char *fileName) /* Get path to bam index for fileName */ { char bamBai[256]; safef(bamBai, sizeof bamBai, "%s.bai", fileName); return cloneString(bamBai); } boolean runCmdLine(char *cmdLine) /* Run command line */ { // TODO this should be substantially more complex // with exec with timeout, might want to just translate // some of the exec with wait code from the old ENCODE2 pipeline // Maybe the default timeout should be 8 hours. // I am sure that is more than generous enough for validating a single big file. verbose(1, "cmdLine=[%s]\n",cmdLine); int retCode = system(cmdLine); verbose(2, "retCode=%d\n", retCode); // note 0 = success, 65280 = exit(255) or exit(-1) which is usually errAbort. sleep(1); // give stupid gzip broken pipe errors a chance to happen and print out to stderr return (retCode == 0); } boolean validateBam(char *fileName) /* Validate BAM file */ { char *twoBit = getTwoBit(fileName); char *chromInfo = getChromInfo(fileName); char cmdLine[1024]; int mismatches = 7; // TODO this is totally arbitrary right now // run validator on BAM even if the twoBit is not available. boolean hasTwoBit = fileExists(twoBit); if (hasTwoBit) { safef(cmdLine, sizeof cmdLine, "%svalidateFiles -type=bam -mismatches=%d -chromInfo=%s -genome=%s %s", validateFilesPath, mismatches, chromInfo, twoBit, fileName); } else { // simple existence check for the corresponding .bam.bai since without -genome=, // vf will not even open the bam index. /* I thought this was good to check anyway since labs should be using bam with index in a hub, but Eurie seems to think not, so commenting out for now. char *bamBai = getBamBai(fileName); if (!fileExists(bamBai)) { warn("Bam Index file missing: %s. Use SAM Tools to create.", bamBai); return FALSE; } */ // QUICK-run by removing -genome and mismatches and stuff. safef(cmdLine, sizeof cmdLine, "%svalidateFiles -type=bam -chromInfo=%s %s", validateFilesPath, chromInfo, fileName); } return runCmdLine(cmdLine); } boolean validateBedNP(char *fileName, char *format, boolean plainBed, int bedN, int bedP) /* Validate bedN+P file */ { char *chromInfo = getChromInfo(fileName); char asFileName[256]; safef(asFileName, sizeof asFileName, "%s.as", format); char *asFile = getAs(asFileName); char cmdLine[1024]; char *bedType = "bigBed"; if (plainBed) bedType = "bed"; safef(cmdLine, sizeof cmdLine, "%svalidateFiles -type=%s%d+%d -as=%s -chromInfo=%s %s", validateFilesPath, bedType, bedN, bedP, asFile, chromInfo, fileName); return runCmdLine(cmdLine); } boolean validateBigBed(char *fileName) /* Validate bigBed file */ { warn("error FILENAME %s: generic format bigBed is not allowed. Please use a specific format for the kind of bed data.", fileName); return FALSE; char *asFile = getAs("modPepMap-std.as"); // TODO this wrong but how do we know what to put here? char *chromInfo = getChromInfo(fileName); char cmdLine[1024]; // TODO probably need to do more work to define what the right type= and .as is // going to be, and how to get it. // The following line is nothing but pure hack taken from the first example found in the manifest, // and probably will fail miserably on other lines of the manifest, as this approach is too simple to work still safef(cmdLine, sizeof cmdLine, "%svalidateFiles -type=bigBed12+4 -as=%s -chromInfo=%s %s", validateFilesPath, asFile, chromInfo, fileName); // TODO actually run the validator return runCmdLine(cmdLine); } boolean validateBigWig(char *fileName) /* Validate bigWig file */ { char *chromInfo = getChromInfo(fileName); char cmdLine[1024]; safef(cmdLine, sizeof cmdLine, "%svalidateFiles -type=bigWig -chromInfo=%s %s", validateFilesPath, chromInfo, fileName); return runCmdLine(cmdLine); } boolean validateFastq(char *fileName) /* Validate fastq file */ { // check if fastq fileName ends in .gz extension if (!endsWith(fileName, ".gz")) { warn("FILENAME %s must be compressed only with gzip and have the extension .gz", fileName); return FALSE; } char cmdLine[1024]; safef(cmdLine, sizeof cmdLine, "%svalidateFiles -type=fastq %s", validateFilesPath, fileName); return runCmdLine(cmdLine); } boolean validateGtf(char *fileName) /* Validate gtf file */ { uglyf("GTF: very basic checking only performed.\n"); /* Open and read file with generic GFF reader and check it is GTF */ struct gffFile *gff = gffRead(fileName); if (!gff->isGtf) { warn("file (%s) is not in GTF format - check it has gene_id and transcript_id", fileName); return FALSE; } // TODO actually run a more complete check return TRUE; } boolean validateRcc(char *fileName) /* Validate RCC file */ { char cmdLine[1024]; safef(cmdLine, sizeof cmdLine, "%svalidateFiles -type=rcc %s", validateFilesPath, fileName); return runCmdLine(cmdLine); } boolean validateIdat(char *fileName) /* Validate IDAT file */ { char cmdLine[1024]; safef(cmdLine, sizeof cmdLine, "%svalidateFiles -type=idat %s", validateFilesPath, fileName); return runCmdLine(cmdLine); } boolean validateFasta(char *fileName) /* Validate fasta file */ { char cmdLine[1024]; safef(cmdLine, sizeof cmdLine, "%svalidateFiles -type=fasta %s", validateFilesPath, fileName); return runCmdLine(cmdLine); } boolean validateUnknown(char *fileName) /* Validate Unknown type file */ { warn("File %s has type unknown", fileName); return TRUE; } boolean validateFile(char *fileName, char *format) /* call validateFiles for the file and format */ { boolean result = FALSE; if (endsWith(fileName, ".tgz")) // TODO how to handle .tgz tar'd fasta files. { // will encode3 really even need to support these at all? // and if it does, would we have vf support tar archive natively, // or have vm (this program) unpack it and call vf for each file found inside? warn(".tgz format not currently supported by validateManifest"); return FALSE; } // Handle support for plain BEDs that are not bigBeds. boolean plainBed = FALSE; if (startsWith("bed_", format)) { format += strlen("bed_"); // skip the prefix. plainBed = TRUE; } // Call the handler based on format if (sameString(format,"bam")) result = validateBam(fileName); else if (startsWith(format,"bigBed")) // Actually this generic type will be rejected. result = validateBigBed(fileName); else if (startsWith(format,"bigWig")) result = validateBigWig(fileName); else if (startsWith(format,"fastq")) result = validateFastq(fileName); else if (startsWith(format,"gtf")) result = validateGtf(fileName); else if (startsWith(format,"rcc")) result = validateRcc(fileName); else if (startsWith(format,"idat")) result = validateIdat(fileName); else if (startsWith(format,"fasta")) result = validateFasta(fileName); else if (startsWith(format,"unknown")) result = validateUnknown(fileName); else { struct encode3BedType *bedType = encode3BedTypeMayFind(format); if (bedType != NULL) { result = validateBedNP(fileName, format, plainBed, bedType->bedFields, bedType->extraFields); } else { warn("Unrecognized format: %s", format); result = FALSE; } } return result; } boolean disallowedCompressionExtension(char *fileName, char *format) /* return TRUE if fileName ends in a disallowed extension */ { if (endsWith(fileName, ".tgz")) return TRUE; if (endsWith(fileName, ".tar.gz")) return TRUE; if (endsWith(fileName, ".tar")) return TRUE; if (endsWith(fileName, ".zip")) return TRUE; if (endsWith(fileName, ".bz2")) return TRUE; if (endsWith(fileName, ".gz") && !sameString(format,"fastq")) return TRUE; return FALSE; } int validateManifest(char *workingDir) /* Validate the manifest.txt input file creating validated.txt output */ { chdir(workingDir); if (!fileExists("manifest.txt")) { warn("manifest.txt not found in workingDir %s", workingDir); usage(); } uglyf("workingDir=%s\n\n", workingDir); if (fileExists("validateFiles")) validateFilesPath = "./"; char *fakeMd5sum = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; if (quickMd5sum) uglyf("DEBUG: because md5sum calculations are slow for big files, for testing purposes big files will be assigned md5sum=%s\n", fakeMd5sum); struct slRecord *manifestFields = NULL; struct slRecord *manifestRecs = NULL; uglyf("reading manifest.txt\n\n"); int mFieldCount = readManifest("manifest.txt", &manifestFields, &manifestRecs); struct slRecord *vFields = NULL; struct slRecord *vRecs = NULL; boolean haveVal = FALSE; int vFieldCount = -1; if (fileExists("validated.txt")) // read in the old validated.txt file to save time { uglyf("reading validated.txt\n\n"); vFieldCount = readManifest("validated.txt", &vFields, &vRecs); if (vFieldCount != mFieldCount + 4) // TODO this might be allowed someday if good case exists for it. errAbort("ERROR: the number of fields in validated.txt %d does not match the number of fields %d in manifest.txt", vFieldCount, mFieldCount); haveVal = TRUE; } int mUcscDbIdx = -1; // optional field ucsc_db int mFileNameIdx = -1; int mFormatIdx = -1; int mOutputTypeIdx = -1; int mExperimentIdx = -1; int mReplicateIdx = -1; int mTechnicalReplicateIdx = -1; int mEnrichedInIdx = -1; int mPairedEndIdx = -1; int i = 0; // find field numbers needed for required fields. for (i=0; i<mFieldCount; ++i) { if (sameString(manifestFields->words[i], "ucsc_db")) mUcscDbIdx = i; if (sameString(manifestFields->words[i], "file_name")) mFileNameIdx = i; if (sameString(manifestFields->words[i], "format")) mFormatIdx = i; if (sameString(manifestFields->words[i], "output_type")) mOutputTypeIdx = i; if (sameString(manifestFields->words[i], "experiment")) mExperimentIdx = i; if (sameString(manifestFields->words[i], "replicate")) mReplicateIdx = i; if (sameString(manifestFields->words[i], "technical_replicate")) mTechnicalReplicateIdx = i; if (sameString(manifestFields->words[i], "enriched_in")) mEnrichedInIdx = i; if (sameString(manifestFields->words[i], "paired_end")) mPairedEndIdx = i; } if (mFileNameIdx == -1) errAbort("field file_name not found in manifest.txt"); if (mFormatIdx == -1) errAbort("field format not found in manifest.txt"); if (mOutputTypeIdx == -1) errAbort("field output_type not found in manifest.txt"); if (mExperimentIdx == -1) errAbort("field experiment not found in manifest.txt"); if (mReplicateIdx == -1) errAbort("field replicate not found in manifest.txt"); // technical_replicate is probably optional //if (mTechnicalReplicateIdx == -1) // errAbort("field technical_replicate not found in manifest.txt"); if (mEnrichedInIdx == -1) errAbort("field enriched_in not found in manifest.txt"); // paired_end is probably optional //if (mPairedEndIdx == -1) // errAbort("field paired_end not found in manifest.txt"); // check if the fieldnames in old validated appear in the same order in manifest.txt // although this is currently a minor limitation, it could be removed // with just a little work in future if needed. if (haveVal) for (i = 0; i < mFieldCount; ++i) { if (!sameString(manifestFields->words[i], vFields->words[i])) errAbort("field names in old validated.txt do not match those in manifest.txt"); } // get indexes for old val extra fields int vMd5SumIdx = -1; int vSizeIdx = -1; int vModifiedIdx = -1; int vValidKeyIdx = -1; if (haveVal) { for (i = mFieldCount; i < vFieldCount; ++i) { if (sameString(vFields->words[i], "md5_sum")) vMd5SumIdx = i; if (sameString(vFields->words[i], "size")) vSizeIdx = i; if (sameString(vFields->words[i], "modified")) vModifiedIdx = i; if (sameString(vFields->words[i], "valid_key")) vValidKeyIdx = i; } if ( vMd5SumIdx == -1) errAbort("field " "md5_sum not found in old validated.txt"); if ( vSizeIdx == -1) errAbort("field " "size not found in old validated.txt"); if ( vModifiedIdx == -1) errAbort("field " "modified not found in old validated.txt"); if ( vValidKeyIdx == -1) errAbort("field ""valid_key not found in old validated.txt"); } // calling for the side-effect of checking for duplicate file_names. // ignore return code (void) makeFileNameHash(manifestRecs, mFileNameIdx); // hash old validated records by file_name for quick lookup. struct hash *valHash = NULL; if (haveVal) valHash = makeFileNameHash(vRecs, mFileNameIdx); // open output // write to a different temp filename so that the old validated.txt is not lost if this program not complete FILE *f = mustOpen("validated.tmp", "w"); char *tabSep = ""; // write fieldnames to output fprintf(f,"#"); // write leading comment character # for (i = 0; i < mFieldCount; ++i) { fprintf(f, "%s%s", tabSep, manifestFields->words[i]); tabSep = "\t"; } // include additional fieldnames fprintf(f,"\tmd5_sum\tsize\tmodified\tvalid_key"); fprintf(f,"\n"); fprintf(f,"##validateManifest version %s\n", version); // write vm version as a comment // hash for output_type checking for unique format // catches problem some users were using the same output_type on the wrong format. struct hash *outputTypeHash = newHash(12); // loop through manifest recs boolean errsFound = FALSE; struct slRecord *rec = NULL; int recNo = 1; for(rec = manifestRecs; rec; rec = rec->next) { /* DEBUG uglyf("rec #%d = [", recNo); int i = 0; for (i = 0; i < mFieldCount; ++i) { uglyf("\t%s", rec->words[i]); } uglyf("]\n"); */ boolean fileIsValid = TRUE; // Default to valid until we know otherwise. // get file_name, size, datetime char *mFileName = rec->words[mFileNameIdx]; if (mUcscDbIdx != -1) ucscDb = rec->words[mUcscDbIdx]; off_t mFileSize = 0; time_t mFileTime = 0; char *mMd5Hex = "0"; char *mValidKey = "ERROR"; // check that ucsc db, if used, is not blank if (fileIsValid && mUcscDbIdx != -1 && ucscDb[0] == 0) { fileIsValid = FALSE; printf("ERROR: ucsc_db must not be blank.\n"); } // check that the file exists if (fileIsValid && !fileExists(mFileName)) { fileIsValid = FALSE; printf("ERROR: '%s' FILE NOT FOUND !!!\n", mFileName); } char *mFormat = rec->words[mFormatIdx]; // check that the format is not blank if (fileIsValid && sameString(mFormat,"")) { fileIsValid = FALSE; printf("ERROR: format must not be blank.\n"); } // check that the file extension is not disallowed if (fileIsValid && disallowedCompressionExtension(mFileName, mFormat)) { fileIsValid = FALSE; printf("ERROR: %s FILE COMPRESSION TYPE NOT ALLOWED with format %s !!!\n", mFileName, mFormat); } // check that each output_type is not blank and only used with one format char *mOutputType = rec->words[mOutputTypeIdx]; if (fileIsValid && mOutputType[0] == 0) { fileIsValid = FALSE; printf("ERROR: output_type must not be blank.\n"); } if (fileIsValid) { struct hashEl *el = hashLookup(outputTypeHash, mOutputType); if (el) { char *existingFormat = (char *) el->val; if (!sameString(mFormat, existingFormat)) { fileIsValid = FALSE; printf("ERROR: Each output_type can only be used with one format. output_type %s is being used with both format %s and %s.\n", mOutputType, mFormat, existingFormat); } } else hashAdd(outputTypeHash, mOutputType, cloneString(mFormat)); } // check experiment field char *mExperiment = rec->words[mExperimentIdx]; if (fileIsValid) { if (!startsWith("ENCSR", mExperiment)) { fileIsValid = FALSE; printf("ERROR: %s is not a valid value for the experiment field. Must start with ENCSR.\n", mExperiment); } } // check enriched_in field char *mEnrichedIn = rec->words[mEnrichedInIdx]; if (fileIsValid) { if (!encode3CheckEnrichedIn(mEnrichedIn)) { fileIsValid = FALSE; printf("ERROR: %s is not a valid value for the enriched_in field.\n", mEnrichedIn); } } // check replicate field char *mReplicate = rec->words[mReplicateIdx]; if (fileIsValid) { boolean smallNumber = FALSE; int sl = strlen(mReplicate); int sn = 0; if (countLeadingDigits(mReplicate) == sl && sl <= 2 && sl > 0) { smallNumber = TRUE; sn = atoi(mReplicate); } if (!(startsWith("pooled", mReplicate) || startsWith("n/a", mReplicate) || (smallNumber && sn >=1 && sn <=10))) { fileIsValid = FALSE; printf("ERROR: %s is not a valid value for the replicate field. " "Must be pooled or n/a or a small unsigned number 1 <= N <=10.\n", mReplicate); } } // check technical_replicate field if (fileIsValid) { if (mTechnicalReplicateIdx != -1) // The technical_replicate field is optional { char *mTechnicalReplicate = rec->words[mTechnicalReplicateIdx]; boolean smallNumber = FALSE; int sl = strlen(mTechnicalReplicate); int sn = 0; if (countLeadingDigits(mTechnicalReplicate) == sl && sl <= 2 && sl > 0) { smallNumber = TRUE; sn = atoi(mTechnicalReplicate); } if (!(startsWith("pooled", mTechnicalReplicate) || startsWith("n/a", mTechnicalReplicate) || (smallNumber && sn >=1 && sn <=10))) { fileIsValid = FALSE; printf("ERROR: %s is not a valid value for the technical_replicate field. " "Must be pooled or n/a or a small unsigned number 1 <= N <=10.\n", mTechnicalReplicate); } } } // check paired_end field if (fileIsValid) { if (mPairedEndIdx != -1) // The check paired_end field is optional { char *mPairedEnd = rec->words[mPairedEndIdx]; boolean smallNumber = FALSE; int sl = strlen(mPairedEnd); int sn = 0; if (countLeadingDigits(mPairedEnd) == sl && sl < 2 && sl > 0) { smallNumber = TRUE; sn = atoi(mPairedEnd); } if (!(startsWith("pooled", mPairedEnd) || startsWith("n/a", mPairedEnd) || (smallNumber && (sn==1 || sn ==2)))) { fileIsValid = FALSE; printf("ERROR: %s is not a valid value for the paired_end field. Must be 1 (forward), 2 (reverse) or \"n/a\".\n", mPairedEnd); } } else { if (sameString(mFormat, "fastq")) // The check paired_end field is required for fastq { fileIsValid = FALSE; printf("ERROR: For format fastq the paired_end field is required. Must be 1 (forward), 2 (reverse) or \"n/a\".\n"); } } } if (fileIsValid) { mFileSize = fileSize(mFileName); mFileTime = fileModTime(mFileName); char *vMd5Hex = NULL; char *vValidKey = NULL; boolean dataMatches = FALSE; // look for a matching record in old validated if (haveVal) { off_t vFileSize = 0; time_t vFileTime = 0; struct slRecord *vRec = (struct slRecord *) hashFindVal(valHash, rec->words[mFileNameIdx]); // check if all fields match between manifest and old validated if (vRec) { dataMatches = TRUE; // check that the fields values match for (i = 0; i < mFieldCount; ++i) { if (!sameString(rec->words[i], vRec->words[i])) dataMatches = FALSE; } // check that the record correctly matches the actual file sizes. if (dataMatches) { vFileSize = sqlLongLong(vRec->words[vSizeIdx]); // TODO maybe use my special functions from the validator if (vFileSize != mFileSize) dataMatches = FALSE; } // check that the record correctly matches the actual file timestamp. if (dataMatches) { vFileTime = sqlLongLong(vRec->words[vModifiedIdx]); // There is no sqlLong function, but there should be! if (vFileTime != mFileTime) dataMatches = FALSE; } // verify vValidKey against vMd5Hex. if (dataMatches) { vMd5Hex = vRec->words[vMd5SumIdx]; vValidKey = vRec->words[vValidKeyIdx]; char *checkValidKey = encode3CalcValidationKey(vMd5Hex, vFileSize); if (sameString(vValidKey,"ERROR")) { dataMatches = FALSE; } else if (!sameString(vValidKey,checkValidKey)) { warn("invalid key %s in old validated.txt",vValidKey); // TODO add line# or filename etc? dataMatches = FALSE; } } } } if (dataMatches) { mMd5Hex = vMd5Hex; mValidKey = vValidKey; } else { // get md5_sum if (quickMd5sum && mFileSize > 100 * 1024 * 1024) mMd5Hex = fakeMd5sum; // "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; else { verbose(1, "Running md5 checksum on %s\n", mFileName); mMd5Hex = md5HexForFile(mFileName); } mValidKey = encode3CalcValidationKey(mMd5Hex, mFileSize); fileIsValid = validateFile(mFileName, mFormat); // Call the validator on the file and format. if (!fileIsValid) mValidKey = "ERROR"; } } printf("mFileName = %s size=%lld time=%ld md5=%s validKey=%s\n", mFileName, (long long)mFileSize, (long)mFileTime, mMd5Hex, mValidKey); printf("\n"); // write to output validated.tmp tabSep = ""; for (i = 0; i < mFieldCount; ++i) { fprintf(f, "%s%s", tabSep, rec->words[i]); tabSep = "\t"; } // include additional fields fprintf(f,"\t%s\t%lld\t%ld\t%s", mMd5Hex, (long long)mFileSize, (long)mFileTime, mValidKey); fprintf(f,"\n"); fflush(f); if (sameString(mValidKey,"ERROR")) errsFound = TRUE; ++recNo; } carefulClose(&f); rename("validated.tmp", "validated.txt"); // replace the old validated file with the new one if (errsFound) return 1; // #file_name format experiment replicate output_type biosample target localization update // ucsc_db (this is optional but overrides attempts to get db from file_name path) // md5_sum size modified valid_key return 0; } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc!=1 || optionExists("-help")) usage(); workingDir = optionVal("dir", workingDir); encValData = optionVal("encValData", encValData); quickMd5sum = optionExists("quickMd5sum"); return validateManifest(workingDir); }