src/hg/encode3/validateManifest/validateManifest.c 4898794edd81be5285ea6e544acbedeaeb31bf78

4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/encode3/validateManifest/validateManifest.c src/hg/encode3/validateManifest/validateManifest.c
index 9454291..a4f6189 100644
--- src/hg/encode3/validateManifest/validateManifest.c
+++ src/hg/encode3/validateManifest/validateManifest.c
@@ -1,960 +1,960 @@
 /* validate ENCODE3 manifest.txt creating output validated.txt */
 
 /* Copyright (C) 2014 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "portable.h"
 #include "md5.h"
 #include "hex.h"
 #include "sqlNum.h"
 #include "encode3/encode3Valid.h"
 #include "gff.h"
 
 /* Program version; shown in the usage text and written as a comment line to validated.txt. */
 char *version = "1.9";
 /* Directory holding manifest.txt and validated.txt; overridden by -dir. */
 char *workingDir = ".";
 /* Directory used to build the .as, chrom.sizes and .2bit paths; overridden by -encValData. */
 char *encValData = "encValData";
 /* Value of the optional ucsc_db column for the record being processed, or NULL.
  * When set, getGenome() uses it instead of the file_name path prefix. */
 char *ucscDb = NULL;
 /* Prefix placed before "validateFiles" in command lines; set to "./" when a
  * validateFiles file exists in the working directory. */
 char *validateFilesPath = "";
 
 boolean quickMd5sum = FALSE;  // Just for development testing, do not use
 
 void usage()
 /* Report the program version, command line options and the input/output
  * files through errAbort. */
 {
 errAbort("validateManifest v%s - Validates the ENCODE3 manifest.txt file.\n"
          "                Calls validateFiles on each file in the manifest.\n"
          "                Exits with a zero status for no errors detected and non-zero for errors.\n"
          "                Writes Error messages to stderr\n"
          "usage:\n"
          "   validateManifest\n"
          "\n"
          "   -dir=workingDir, defaults to the current directory.\n"
          "   -encValData=encValDataDir, relative to workingDir, defaults to %s.\n"
          "\n"
          "   Input files in the working directory: \n"
          "     manifest.txt - current input manifest file\n"
          "     validated.txt - input from previous run of validateManifest\n"
          "\n"
          "   Output file in the working directory: \n"
          "     validated.txt - results of validated input\n"
          "\n",
          version,      /* fills the v%s in the first line */
          encValData);  /* fills the -encValData default */
 }
 
 /* Command line options handed to optionInit() in main(). */
 static struct optionSpec options[] = {
     {"dir", OPTION_STRING},              // working directory, see workingDir
     {"encValData", OPTION_STRING},       // validation data directory, see encValData
     {"quickMd5sum", OPTION_BOOLEAN},     // Testing option, user should not use
     {"-help", OPTION_BOOLEAN},           // main() prints usage when this is present
     {NULL, 0},                           // end-of-table marker
 };
 
 
 struct slRecord
 /* List of tab-parsed records.  readManifest() builds one per data line, plus
  * one for the field-name line. */
     {
     struct slRecord *next;	/* Next in list. */
     char *row;               /* Private copy of the line, made with cloneString(). */
     char **words;            /* Array allocated dynamically; filled by chopByChar() on row. */
     };
 
 
 
 int readManifest(char *fileName, 
     struct slRecord **pFields, 
     struct slRecord **pAllRecs )
 /* Read in the manifest file format into memory structures */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 
 struct slRecord *allRecs = NULL;
 
 char *row;
 char **fields = NULL;
 char **words = NULL;
 int fieldCount = 0;
 
 ////verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, lf->fileName);
 
 int fieldNameRowsCount = 0;
 
 while (lineFileNext(lf, &row, NULL))
     {
     //uglyf("%s\n", row); // DEBUG REMOVE
     if ( startsWith("#file_name", row) || 
 	 startsWith("#file"     , row) ||  // catch some misspellings like #filename
 	 startsWith("#ucsc_db"  , row))
 	{
 	if ( fieldNameRowsCount == 0)
 	    {
 	    ++fieldNameRowsCount;
 	    // grab fieldnames from metadata
 	    char *metaLine = cloneString(row);
 	    //uglyf("%s\n", metaLine); // DEBUG REMOVE
 	    ++metaLine;  // skip over the leading # char
 	    fieldCount = chopByChar(metaLine, '\t', NULL, 0);
 	    AllocArray(fields,fieldCount);
 	    fieldCount = chopByChar(metaLine, '\t', fields, fieldCount);
 	    /* DEBUG
 	    for (i=0; i<fieldCount; ++i)
 		{
 		uglyf("field #%d = [%s]\n", i, fields[i]); // DEBUG REMOVE
 		}
 	    */
 	    struct slRecord *meta = NULL;
 	    AllocVar(meta);
 	    meta->row = metaLine;
 	    meta->words = fields;
 	    if (pFields)
 		*pFields = meta;	    
 	    }
 	else
 	    {
 	    errAbort("Found comment line listing field names more than once.");
 	    }
 	}
     else if (startsWith("#",row))
 	{
 	// ignore other comment lines?
 	}
     else
 	{
 
 	char *line = cloneString(row);
 
 	int n = 0;
 	AllocArray(words,fieldCount+1);
 	n = chopByChar(line, '\t', words, fieldCount+1);
 	if (n != fieldCount)
 	    {
 	    errAbort("Error [file=%s, line=%d]: found %d columns, expected %d [%s]"
 		, lf->fileName, lf->lineIx, n, fieldCount, row);
 	    }
 
 	struct slRecord *rec = NULL;
 	AllocVar(rec);
 	rec->row = line;
 	rec->words = words;
 
 	slAddHead(&allRecs, rec);	
 	}
 
     }
 
 slReverse(&allRecs);
 if (pAllRecs)
     *pAllRecs = allRecs;	    
 
 if (fieldNameRowsCount == 0)
     errAbort("Expected 1st line to contain a comment line listing field names.");
 
 lineFileClose(&lf);
 
 return fieldCount;
 
 }
 
 
 struct hash *makeFileNameHash(struct slRecord *recs, int fileNameIndex)
 /* Return a hash keyed by words[fileNameIndex] with the record as value.
  * Aborts on the first file name that appears twice. */
 {
 struct hash *byName = newHash(12);
 struct slRecord *cur = recs;
 while (cur != NULL)
     {
     char *name = cur->words[fileNameIndex];
     if (hashLookup(byName, name) != NULL)
         errAbort("duplicate file_name found: %s", name);
     hashAdd(byName, name, cur);
     cur = cur->next;
     }
 return byName;
 }
 
 
 char *getAs(char *asFileName)
 /* Return a newly cloned "<encValData>/as/<asFileName>" path. */
 {
 char path[256];
 safef(path, sizeof(path), "%s/as/%s", encValData, asFileName);
 char *result = cloneString(path);
 return result;
 }
 
 char *getGenome(char *fileName)
 /* Get genome, e.g. hg19 */
 {  
 // TODO this could use some more development
 // but start with something very simple for now
 // such as assuming that the genome is found 
 // as the prefix in the fileName path.
 // Maybe in future can pull this from the hub.txt?
 // ucscDb will be set to the value in the optional column "ucsc_db"
 char genome[256] = "";
 if (ucscDb)
     {
     safef(genome, sizeof genome, "%s", ucscDb);
     }
 else
     {
     char *slash = strchr(fileName, '/');
     if (!slash)
 	errAbort("Expected to find genome in file_name prefix.");
     safencat(genome, sizeof genome, fileName, slash - fileName);
     }
 if (
     !sameString(genome, "dm3") &&
     !sameString(genome, "dm6") &&
     !sameString(genome, "ce10") &&
     !sameString(genome, "hg19") &&
     !sameString(genome, "hg20") &&
     !sameString(genome, "hg38") &&
     !sameString(genome, "mm9") &&
     !sameString(genome, "mm10") 
     )
     errAbort("unknown genome %s", genome);
 return cloneString(genome);
 }
 
 char *getChromInfo(char *fileName)
 /* Return a newly cloned "<encValData>/<genome>/chrom.sizes" path, with the
  * genome taken from getGenome(fileName). */
 {
 char path[256];
 char *db = getGenome(fileName);
 safef(path, sizeof(path), "%s/%s/chrom.sizes", encValData, db);
 return cloneString(path);
 }
 
 char *getTwoBit(char *fileName)
 /* Return a newly cloned "<encValData>/<genome>/<genome>.2bit" path, with the
  * genome taken from getGenome(fileName). */
 {
 // TODO this could use some more development
 // Maybe in future can download this from one of our servers?
 char path[256];
 char *db = getGenome(fileName);
 safef(path, sizeof(path), "%s/%s/%s.2bit", encValData, db, db);
 return cloneString(path);
 }
 
 char *getBamBai(char *fileName)
 /* Return a newly cloned copy of fileName with ".bai" appended. */
 {
 char indexPath[256];
 safef(indexPath, sizeof(indexPath), "%s.bai", fileName);
 char *result = cloneString(indexPath);
 return result;
 }
 
 
 boolean runCmdLine(char *cmdLine)
 /* Run cmdLine through system() and return TRUE only when it returns 0. */
 {
 // TODO this should be substantially more complex
 //   with exec with timeout, might want to just translate
 //   some of the exec with wait code from the old ENCODE2 pipeline
 //   Maybe the default timeout should be 8 hours.
 //   I am sure that is more than generous enough for validating a single big file.
 verbose(1, "cmdLine=[%s]\n",cmdLine);
 
 int status = system(cmdLine);
 // note 0 = success, 65280 = exit(255) or exit(-1) which is usually errAbort.
 verbose(2, "retCode=%d\n", status);
 
 // give gzip broken pipe errors a chance to happen and print out to stderr
 sleep(1);
 
 boolean succeeded = (status == 0);
 return succeeded;
 }
 
 boolean validateBam(char *fileName)
 /* Run validateFiles on a BAM file and return TRUE if the command succeeds.
  * When the genome's .2bit file exists the check also passes -genome and
  * -mismatches; otherwise only -chromInfo is passed. */
 {
 char *twoBit = getTwoBit(fileName);
 char *chromInfo = getChromInfo(fileName);
 char cmdLine[1024];
 int mismatches = 7;  // TODO this is totally arbitrary right now
 
 // run validator on BAM even if the twoBit is not available.
 if (!fileExists(twoBit))
     {
     // QUICK-run by removing -genome and mismatches and stuff.
     // Without -genome= validateFiles will not even open the bam index.  An
     // existence check for the .bam.bai file used to be made here but was
     // deliberately disabled, so a missing index is not reported.
     safef(cmdLine, sizeof(cmdLine), "%svalidateFiles -type=bam -chromInfo=%s %s", 
         validateFilesPath, chromInfo, fileName);
     }
 else
     {
     safef(cmdLine, sizeof(cmdLine), "%svalidateFiles -type=bam -mismatches=%d -chromInfo=%s -genome=%s %s", 
         validateFilesPath, mismatches, chromInfo, twoBit, fileName);
     }
 
 return runCmdLine(cmdLine);
 }
 
 
 boolean validateBedNP(char *fileName, char *format, boolean plainBed, int bedN, int bedP)
 /* Run validateFiles on a bedN+P file using "<format>.as" as the autoSql file.
  * The type is bed<N>+<P> when plainBed is set, otherwise bigBed<N>+<P>.
  * Returns TRUE if the command succeeds. */
 {
 char *chromInfo = getChromInfo(fileName);
 
 char asName[256];
 safef(asName, sizeof(asName), "%s.as", format);
 char *asPath = getAs(asName);
 
 char *typePrefix = plainBed ? "bed" : "bigBed";
 
 char cmdLine[1024];
 safef(cmdLine, sizeof(cmdLine), "%svalidateFiles -type=%s%d+%d -as=%s -chromInfo=%s %s",
     validateFilesPath, typePrefix, bedN, bedP, asPath, chromInfo, fileName);
 return runCmdLine(cmdLine);
 }
 
 boolean validateBigBed(char *fileName)
 /* Reject the generic bigBed format: warn and always return FALSE.
  * Submitters must use a specific bed format name, which validateFile()
  * routes to validateBedNP() instead. */
 {
 // The old experimental validateFiles call that followed this return could
 // never run, so it has been removed.
 warn("error FILENAME %s: generic format bigBed is not allowed. Please use a specific format for the kind of bed data.", fileName);
 return FALSE;
 }
 
 boolean validateBigWig(char *fileName)
 /* Run validateFiles -type=bigWig with the genome's chrom.sizes file.
  * Returns TRUE if the command succeeds. */
 {
 char command[1024];
 char *sizesPath = getChromInfo(fileName);
 safef(command, sizeof(command), "%svalidateFiles -type=bigWig -chromInfo=%s %s",
     validateFilesPath, sizesPath, fileName);
 return runCmdLine(command);
 }
 
 boolean validateFastq(char *fileName)
 /* Run validateFiles -type=fastq on a gzip-compressed fastq file.
  * Warns and returns FALSE without running anything when fileName does not
  * end in .gz. */
 {
 if (endsWith(fileName, ".gz"))
     {
     char command[1024];
     safef(command, sizeof(command), "%svalidateFiles -type=fastq %s", validateFilesPath, fileName);
     return runCmdLine(command);
     }
 
 warn("FILENAME %s must be compressed only with gzip and have the extension .gz", fileName);
 return FALSE;
 }
 
 boolean validateGtf(char *fileName)
 /* Read fileName with the generic GFF reader and return TRUE if the reader
  * flagged it as GTF; otherwise warn and return FALSE. */
 {
 uglyf("GTF: very basic checking only performed.\n");
 
 struct gffFile *parsed = gffRead(fileName);
 if (parsed->isGtf)
     return TRUE;   // TODO actually run a more complete check
 
 warn("file (%s) is not in GTF format - check it has gene_id and transcript_id", fileName);
 return FALSE;
 }
 
 boolean validateRcc(char *fileName)
 /* Run validateFiles -type=rcc on fileName; TRUE if the command succeeds. */
 {
 char command[1024];
 safef(command, sizeof(command), "%svalidateFiles -type=rcc %s",
     validateFilesPath, fileName);
 boolean ok = runCmdLine(command);
 return ok;
 }
  
 boolean validateIdat(char *fileName)
 /* Run validateFiles -type=idat on fileName; TRUE if the command succeeds. */
 {
 char command[1024];
 safef(command, sizeof(command), "%svalidateFiles -type=idat %s",
     validateFilesPath, fileName);
 boolean ok = runCmdLine(command);
 return ok;
 }
  
 boolean validateFasta(char *fileName)
 /* Run validateFiles -type=fasta on fileName; TRUE if the command succeeds. */
 {
 char command[1024];
 safef(command, sizeof(command), "%svalidateFiles -type=fasta %s",
     validateFilesPath, fileName);
 boolean ok = runCmdLine(command);
 return ok;
 }
  
 boolean validateUnknown(char *fileName)
 /* Files of format "unknown" are not checked: emit a warning naming the file
  * and report it as valid. */
 {
 boolean accepted = TRUE;
 warn("File %s has type unknown", fileName);
 return accepted;
 }
  
 
 boolean validateFile(char *fileName, char *format)
 /* Validate fileName with the handler that matches format.
  *   fileName - path of the file to check
  *   format   - manifest format value; a "bed_" prefix selects plain BED
  *              rather than bigBed for the bedN+P formats
  * Returns TRUE if the file passed validation, FALSE otherwise (including
  * .tgz files and unrecognized formats, for which a warning is issued). */
 {
 if (endsWith(fileName, ".tgz"))  // TODO how to handle .tgz tar'd fasta files.
     { // will encode3 really even need to support these at all?
       // and if it does, would we have vf support tar archive natively,
       // or have vm (this program) unpack it and call vf for each file found inside?
     warn(".tgz format not currently supported by validateManifest");
     return FALSE;
     }
 
 // Handle support for plain BEDs that are not bigBeds.
 boolean plainBed = FALSE;
 if (startsWith("bed_", format))
     {
     format += strlen("bed_");  // skip the prefix.
     plainBed = TRUE;
     }
 
 // Built-in formats are matched by exact name.  The earlier code called
 // startsWith(format, "bigBed") etc. with the arguments reversed, which made
 // any leading fragment of a name (even the empty string left after "bed_")
 // select that handler.
 if (sameString(format, "bam"))
     return validateBam(fileName);
 if (sameString(format, "bigBed"))   // Actually this generic type will be rejected.
     return validateBigBed(fileName);
 if (sameString(format, "bigWig"))
     return validateBigWig(fileName);
 if (sameString(format, "fastq"))
     return validateFastq(fileName);
 if (sameString(format, "gtf"))
     return validateGtf(fileName);
 if (sameString(format, "rcc"))
     return validateRcc(fileName);
 if (sameString(format, "idat"))
     return validateIdat(fileName);
 if (sameString(format, "fasta"))
     return validateFasta(fileName);
 if (sameString(format, "unknown"))
     return validateUnknown(fileName);
 
 // Anything else must be one of the named ENCODE3 bedN+P formats.
 struct encode3BedType *bedType = encode3BedTypeMayFind(format);
 if (bedType != NULL)
     return validateBedNP(fileName, format, plainBed,
         bedType->bedFields, bedType->extraFields);
 
 warn("Unrecognized format: %s", format);
 return FALSE;
 }
 
 
 boolean disallowedCompressionExtension(char *fileName, char *format)
 /* Return TRUE if fileName ends in an archive or compression extension that
  * is not accepted.  A .gz ending is accepted only for format fastq. */
 {
 // Endings that are refused whatever the format is.
 static char *neverAllowed[] = { ".tgz", ".tar.gz", ".tar", ".zip", ".bz2" };
 int count = sizeof(neverAllowed) / sizeof(neverAllowed[0]);
 int ix;
 for (ix = 0; ix < count; ++ix)
     {
     if (endsWith(fileName, neverAllowed[ix]))
         return TRUE;
     }
 
 boolean gzipped = endsWith(fileName, ".gz");
 if (gzipped && !sameString(format,"fastq"))
     return TRUE;
 
 return FALSE;
 }
 
 
 static boolean replicateValueOk(char *val)
 /* Return TRUE if val is acceptable for the replicate and technical_replicate
  * fields: it starts with "pooled" or "n/a", or it is a one or two digit
  * number in the range 1 to 10. */
 {
 if (startsWith("pooled", val) || startsWith("n/a", val))
     return TRUE;
 int len = strlen(val);
 if (len < 1 || len > 2 || countLeadingDigits(val) != len)
     return FALSE;
 int num = atoi(val);
 return (num >= 1 && num <= 10);
 }
 
 int validateManifest(char *workingDir)
 /* Validate the manifest.txt input file creating validated.txt output.
  *   workingDir - directory containing manifest.txt (and optionally a
  *                validated.txt from a previous run); the process changes
  *                into it.
  * Each manifest record is checked field by field and then, unless an
  * unchanged record with a good key is found in the old validated.txt, the
  * file is checksummed and passed to validateFile().  Results are written to
  * validated.tmp, which replaces validated.txt at the end.
  * Returns 0 if every record is valid, 1 if any record got valid_key ERROR.
  * Aborts on structural problems such as missing required fields. */
 {
 
 // If the chdir fails we would otherwise go on to validate whatever
 // manifest.txt happens to be in the caller's current directory.
 if (chdir(workingDir) != 0)
     errAbort("unable to change to workingDir %s", workingDir);
 if (!fileExists("manifest.txt"))
     {
     warn("manifest.txt not found in workingDir %s", workingDir);
     usage();
     }
 
 uglyf("workingDir=%s\n\n", workingDir);
 
 // Prefer a validateFiles found in the working directory.
 if (fileExists("validateFiles"))
     validateFilesPath = "./";
 
 
 char *fakeMd5sum = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
 if (quickMd5sum)
     uglyf("DEBUG: because md5sum calculations are slow for big files, for testing purposes big files will be assigned md5sum=%s\n", fakeMd5sum);
 
 
 struct slRecord *manifestFields = NULL;
 struct slRecord *manifestRecs = NULL;
 
 uglyf("reading manifest.txt\n\n");
 int mFieldCount = readManifest("manifest.txt", &manifestFields, &manifestRecs);
 
 struct slRecord *vFields = NULL;
 struct slRecord *vRecs = NULL;
 
 boolean haveVal = FALSE;
 int vFieldCount = -1;
 
 if (fileExists("validated.txt"))  // read in the old validated.txt file to save time
     {
     uglyf("reading validated.txt\n\n");
     vFieldCount = readManifest("validated.txt", &vFields, &vRecs);
     // validated.txt carries the manifest fields plus md5_sum, size, modified, valid_key.
     if (vFieldCount != mFieldCount + 4) // TODO this might be allowed someday if good case exists for it.
         errAbort("ERROR: validated.txt has %d fields but %d were expected "
             "(the %d fields of manifest.txt plus 4 added by validateManifest)",
             vFieldCount, mFieldCount + 4, mFieldCount);
     haveVal = TRUE;
     }
 
 
 
 int mUcscDbIdx = -1;    // optional field ucsc_db
 int mFileNameIdx = -1;
 int mFormatIdx = -1;
 int mOutputTypeIdx = -1;
 int mExperimentIdx = -1;
 int mReplicateIdx = -1;
 int mTechnicalReplicateIdx = -1;
 int mEnrichedInIdx = -1;
 int mPairedEndIdx = -1;
 int i = 0;
 // find field numbers needed for required fields.
 for (i=0; i<mFieldCount; ++i)
     {
     if (sameString(manifestFields->words[i], "ucsc_db"))
         mUcscDbIdx = i;
     if (sameString(manifestFields->words[i], "file_name"))
         mFileNameIdx = i;
     if (sameString(manifestFields->words[i], "format"))
         mFormatIdx = i;
     if (sameString(manifestFields->words[i], "output_type"))
         mOutputTypeIdx = i;
     if (sameString(manifestFields->words[i], "experiment"))
         mExperimentIdx = i;
     if (sameString(manifestFields->words[i], "replicate"))
         mReplicateIdx = i;
     if (sameString(manifestFields->words[i], "technical_replicate"))
         mTechnicalReplicateIdx = i;
     if (sameString(manifestFields->words[i], "enriched_in"))
         mEnrichedInIdx = i;
     if (sameString(manifestFields->words[i], "paired_end"))
         mPairedEndIdx = i;
     }
 if (mFileNameIdx == -1)
     errAbort("field file_name not found in manifest.txt");
 if (mFormatIdx == -1)
     errAbort("field format not found in manifest.txt");
 if (mOutputTypeIdx == -1)
     errAbort("field output_type not found in manifest.txt");
 if (mExperimentIdx == -1)
     errAbort("field experiment not found in manifest.txt");
 if (mReplicateIdx == -1)
     errAbort("field replicate not found in manifest.txt");
 // technical_replicate is probably optional
 if (mEnrichedInIdx == -1)
     errAbort("field enriched_in not found in manifest.txt");
 // paired_end is probably optional
 
 // check if the fieldnames in old validated appear in the same order in manifest.txt
 //  although this is currently a minor limitation, it could be removed 
 //  with just a little work in future if needed.
 if (haveVal)
     for (i = 0; i < mFieldCount; ++i)
         {
         if (!sameString(manifestFields->words[i], vFields->words[i]))
             errAbort("field names in old validated.txt do not match those in manifest.txt");
         }
 // get indexes for old val extra fields
 int vMd5SumIdx = -1;
 int vSizeIdx = -1;
 int vModifiedIdx = -1;
 int vValidKeyIdx = -1;
 if (haveVal)
     {
     for (i = mFieldCount; i < vFieldCount; ++i)
         {
         if (sameString(vFields->words[i], "md5_sum"))
             vMd5SumIdx = i;
         if (sameString(vFields->words[i], "size"))
             vSizeIdx = i;
         if (sameString(vFields->words[i], "modified"))
             vModifiedIdx = i;
         if (sameString(vFields->words[i], "valid_key"))
             vValidKeyIdx = i;
         }
     if ( vMd5SumIdx   == -1) errAbort("field "  "md5_sum not found in old validated.txt");
     if ( vSizeIdx     == -1) errAbort("field "     "size not found in old validated.txt");
     if ( vModifiedIdx == -1) errAbort("field " "modified not found in old validated.txt");
     if ( vValidKeyIdx == -1) errAbort("field ""valid_key not found in old validated.txt");
     }
 
 // calling for the side-effect of checking for duplicate file_names.
 // ignore return code
 (void) makeFileNameHash(manifestRecs, mFileNameIdx);
 
 // hash old validated records by file_name for quick lookup.
 // The manifest index is usable here because the shared field names were
 // checked above to be in the same order in both files.
 struct hash *valHash = NULL;
 if (haveVal)
     valHash = makeFileNameHash(vRecs, mFileNameIdx);
 
 
 // open output
 // write to a different temp filename so that the old validated.txt is not lost if this program not complete
 FILE *f = mustOpen("validated.tmp", "w"); 
 
 char *tabSep = "";
 // write fieldnames to output
 fprintf(f,"#");  // write leading comment character #
 for (i = 0; i < mFieldCount; ++i)
     {
     fprintf(f, "%s%s", tabSep, manifestFields->words[i]);
     tabSep = "\t";
     }
 // include additional fieldnames
 fprintf(f,"\tmd5_sum\tsize\tmodified\tvalid_key");
 fprintf(f,"\n");
 
 fprintf(f,"##validateManifest version %s\n", version);  // write vm version as a comment
 
 // hash for output_type checking for unique format 
 // catches problem some users were using the same output_type on the wrong format.
 struct hash *outputTypeHash = newHash(12);
 
 // loop through manifest recs
 boolean errsFound = FALSE;
 struct slRecord *rec = NULL;
 for(rec = manifestRecs; rec; rec = rec->next)
     {
     // Each check below runs only while the record is still considered
     // valid, so at most one field error is reported per record.
     boolean fileIsValid = TRUE; // Default to valid until we know otherwise.
 
     // get file_name, size, datetime
     char *mFileName = rec->words[mFileNameIdx];
     if (mUcscDbIdx != -1)
         ucscDb = rec->words[mUcscDbIdx];
 
     off_t mFileSize = 0;
     time_t mFileTime = 0;
     char *mMd5Hex = "0";
     char *mValidKey = "ERROR";
 
     // check that ucsc db, if used, is not blank
     if (fileIsValid && mUcscDbIdx != -1 && ucscDb[0] == 0)
         {
         fileIsValid = FALSE;
         printf("ERROR: ucsc_db must not be blank.\n");
         }
 
     // check that the file exists
     if (fileIsValid && !fileExists(mFileName))
         {
         fileIsValid = FALSE;
         printf("ERROR: '%s' FILE NOT FOUND !!!\n", mFileName);
         }
 
     char *mFormat = rec->words[mFormatIdx];
 
     // check that the format is not blank
     if (fileIsValid && sameString(mFormat,""))
         {
         fileIsValid = FALSE;
         printf("ERROR: format must not be blank.\n");
         }
 
     // check that the file extension is not disallowed
     if (fileIsValid && disallowedCompressionExtension(mFileName, mFormat))
         {
         fileIsValid = FALSE;
         printf("ERROR: %s FILE COMPRESSION TYPE NOT ALLOWED with format %s !!!\n", mFileName, mFormat);
         }
 
 
     // check that each output_type is not blank and only used with one format
     char *mOutputType = rec->words[mOutputTypeIdx];
     if (fileIsValid && mOutputType[0] == 0)
         {
         fileIsValid = FALSE;
         printf("ERROR: output_type must not be blank.\n");
         }
     if (fileIsValid)
         {
         struct hashEl *el = hashLookup(outputTypeHash, mOutputType);
         if (el)
             {
             char *existingFormat = (char *) el->val;
             if (!sameString(mFormat, existingFormat))
                 {
                 fileIsValid = FALSE;
                 printf("ERROR: Each output_type can only be used with one format.  output_type %s is being used with both format %s and %s.\n",
                     mOutputType, mFormat, existingFormat);
                 }
             }
         else
             hashAdd(outputTypeHash, mOutputType, cloneString(mFormat));
         }
 
     // check experiment field
     char *mExperiment = rec->words[mExperimentIdx];
     if (fileIsValid)
         {
         if (!startsWith("ENCSR", mExperiment))
             {
             fileIsValid = FALSE;
             printf("ERROR: %s is not a valid value for the experiment field.  Must start with ENCSR.\n", mExperiment);
             }
         }
 
     // check enriched_in field
     char *mEnrichedIn = rec->words[mEnrichedInIdx];
     if (fileIsValid)
         {
         if (!encode3CheckEnrichedIn(mEnrichedIn))
             {
             fileIsValid = FALSE;
             printf("ERROR: %s is not a valid value for the enriched_in field.\n", mEnrichedIn);
             }
         }
 
 
     // check replicate field
     char *mReplicate = rec->words[mReplicateIdx];
     if (fileIsValid && !replicateValueOk(mReplicate))
         {
         fileIsValid = FALSE;
         printf("ERROR: %s is not a valid value for the replicate field.  "
             "Must be pooled or n/a or a small unsigned number 1 <= N <=10.\n", mReplicate);
         }
 
     // check technical_replicate field
     if (fileIsValid && mTechnicalReplicateIdx != -1)  // The technical_replicate field is optional
         {
         char *mTechnicalReplicate = rec->words[mTechnicalReplicateIdx];
         if (!replicateValueOk(mTechnicalReplicate))
             {
             fileIsValid = FALSE;
             printf("ERROR: %s is not a valid value for the technical_replicate field.  "
                 "Must be pooled or n/a or a small unsigned number 1 <= N <=10.\n", mTechnicalReplicate);
             }
         }
 
     // check paired_end field
     if (fileIsValid)
         {
         if (mPairedEndIdx != -1)  // The check paired_end field is optional
             {
             char *mPairedEnd = rec->words[mPairedEndIdx];
             boolean smallNumber = FALSE;
             int sl = strlen(mPairedEnd);
             int sn = 0;
             if (countLeadingDigits(mPairedEnd) == sl && sl < 2 && sl > 0)
                 {
                 smallNumber = TRUE;
                 sn = atoi(mPairedEnd);
                 }
             if (!(startsWith("pooled", mPairedEnd) || startsWith("n/a", mPairedEnd) || (smallNumber && (sn==1 || sn ==2))))
                 {
                 fileIsValid = FALSE;
                 printf("ERROR: %s is not a valid value for the paired_end field.  Must be 1 (forward), 2 (reverse) or \"n/a\".\n", mPairedEnd);
                 }
             }
         else
             {
             if (sameString(mFormat, "fastq"))  // The check paired_end field is required for fastq
                 {
                 fileIsValid = FALSE;
                 printf("ERROR: For format fastq the paired_end field is required.  Must be 1 (forward), 2 (reverse) or \"n/a\".\n");
                 }
             }
         }
 
 
 
     if (fileIsValid)
         {
 
         mFileSize = fileSize(mFileName);
         mFileTime = fileModTime(mFileName);
 
         char *vMd5Hex = NULL;
         char *vValidKey = NULL;
 
         boolean dataMatches = FALSE;
         // look for a matching record in old validated
         if (haveVal)
             {
             off_t vFileSize = 0;
             time_t vFileTime = 0;
             struct slRecord *vRec = (struct slRecord *) hashFindVal(valHash, rec->words[mFileNameIdx]);
             // check if all fields match between manifest and old validated
             if (vRec)
                 {
                 dataMatches = TRUE;
                 // check that the fields values match
                 for (i = 0; i < mFieldCount; ++i)
                     {
                     if (!sameString(rec->words[i], vRec->words[i]))
                         dataMatches = FALSE;
                     }
                 // check that the record correctly matches the actual file sizes.
                 if (dataMatches)
                     {
                     vFileSize = sqlLongLong(vRec->words[vSizeIdx]);  // TODO maybe use my special functions from the validator
                     if (vFileSize != mFileSize) dataMatches = FALSE;
                     }
                 // check that the record correctly matches the actual file timestamp.
                 if (dataMatches)
                     {
                     vFileTime = sqlLongLong(vRec->words[vModifiedIdx]);  // There is no sqlLong function, but there should be!
                     if (vFileTime != mFileTime) dataMatches = FALSE;
                     }
                 // verify vValidKey against vMd5Hex.
                 if (dataMatches)
                     {
                     vMd5Hex   = vRec->words[vMd5SumIdx];
                     vValidKey = vRec->words[vValidKeyIdx];
                     char *checkValidKey = encode3CalcValidationKey(vMd5Hex, vFileSize);
                     if (sameString(vValidKey,"ERROR")) 
                         {
                         // a record that failed last time is always re-validated
                         dataMatches = FALSE;
                         }
                     else if (!sameString(vValidKey,checkValidKey)) 
                         {
                         warn("invalid key %s in old validated.txt",vValidKey);  // TODO add line# or filename etc?
                         dataMatches = FALSE;
                         }
                     }
 
                 }
             }
 
         if (dataMatches)
             {
             // Unchanged since the previous run: reuse the old checksum and key
             // instead of recomputing and re-validating.
             mMd5Hex = vMd5Hex;
             mValidKey = vValidKey;
             }
         else
             {
             // get md5_sum
             if (quickMd5sum && mFileSize > 100 * 1024 * 1024)
                 mMd5Hex = fakeMd5sum;  // "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
             else
                 {
                 verbose(1, "Running md5 checksum on %s\n", mFileName);
                 mMd5Hex = md5HexForFile(mFileName);
                 }
 
             mValidKey = encode3CalcValidationKey(mMd5Hex, mFileSize);
 
             fileIsValid = validateFile(mFileName, mFormat); // Call the validator on the file and format.
             if (!fileIsValid)
                 mValidKey = "ERROR";
 
             }
 
         }
 
 
     printf("mFileName = %s size=%lld time=%ld md5=%s validKey=%s\n", 
         mFileName, (long long)mFileSize, (long)mFileTime, mMd5Hex, mValidKey);
     printf("\n");
 
     // write to output validated.tmp
     tabSep = "";
     for (i = 0; i < mFieldCount; ++i)
         {
         fprintf(f, "%s%s", tabSep, rec->words[i]);
         tabSep = "\t";
         }
     // include additional fields
     fprintf(f,"\t%s\t%lld\t%ld\t%s", mMd5Hex, (long long)mFileSize, (long)mFileTime, mValidKey);
     fprintf(f,"\n");
     fflush(f);
 
     if (sameString(mValidKey,"ERROR"))
         errsFound = TRUE;
     }
 
 
 carefulClose(&f);
 // replace the old validated file with the new one; a failure here must not
 // be reported as success since validated.txt would be stale or missing.
 if (rename("validated.tmp", "validated.txt") != 0)
     errAbort("unable to rename validated.tmp to validated.txt");
 
 if (errsFound)
     return 1;
 
 // #file_name      format  experiment      replicate       output_type     biosample       target  localization    update
 // ucsc_db   (this is optional but overrides attempts to get db from file_name path)
 // md5_sum size modified valid_key
 
 return 0;
 
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 
 if (argc!=1 || optionExists("-help"))
     usage();
 
 workingDir = optionVal("dir", workingDir);
 encValData = optionVal("encValData", encValData);
 quickMd5sum = optionExists("quickMd5sum");
 
 return validateManifest(workingDir);
 
 }