96d7a1cc2f69c3989b4ca7cc2e94ef2b8df310dd
brianlee
  Tue Mar 4 11:18:54 2025 -0800
Adding tiff and avi to cdwSubmit.c and cdwMakeValidFile.c for ingest

diff --git src/hg/cirm/cdw/cdwMakeValidFile/cdwMakeValidFile.c src/hg/cirm/cdw/cdwMakeValidFile/cdwMakeValidFile.c
index c213b694216..68e9fd1ea9b 100644
--- src/hg/cirm/cdw/cdwMakeValidFile/cdwMakeValidFile.c
+++ src/hg/cirm/cdw/cdwMakeValidFile/cdwMakeValidFile.c
@@ -1,1107 +1,1115 @@
 /* cdwMakeValidFile - Add range of ids to valid file table. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "errCatch.h"
 #include "localmem.h"
 #include "errAbort.h"
 #include "sqlNum.h"
 #include "cheapcgi.h"
 #include "obscure.h"
 #include "jksql.h"
 #include "asParse.h"
 #include "twoBit.h"
 #include "genomeRangeTree.h"
 #include "basicBed.h"
 #include "bbiFile.h"
 #include "bigWig.h"
 #include "bigBed.h"
 #include "bamFile.h"
 #include "htmlPage.h"
 #include "portable.h"
 #include "gff.h"
 #include "cdw.h"
 #include "cdwLib.h"
 #include "fa.h"
 #include "filePath.h"
 #include "cdwValid.h"
 #include "vcf.h"
 #include "csv.h"
 
 int maxErrCount = 1;	/* Set from command line. */
 int errCount;		/* Set as we run. */
 boolean redo = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "cdwMakeValidFile - Add range of ids to valid file table.\n"
   "usage:\n"
   "   cdwMakeValidFile startId endId\n"
   "options:\n"
   "   maxErrCount=N - maximum errors allowed before it stops, default %d\n"
   "   -redo - redo validation even if have it already\n"
   , maxErrCount);
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"maxErrCount", OPTION_INT},
    {"redo", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 void alignFastqMakeBed(struct cdwFile *ef, struct cdwAssembly *assembly,
     char *fastqPath, struct cdwValidFile *vf, FILE *bedF, char *assay)
 /* Take a sample fastq and run bwa on it, and then convert that file to a bed. 
  * Update vf->mapRatio and related fields. */
 {
 cdwAlignFastqMakeBed(ef, assembly, fastqPath, vf, bedF, 
     &vf->mapRatio, &vf->depth, &vf->sampleCoverage, &vf->uniqueMapRatio, assay);
 }
 
 void makeValidFastq( struct sqlConnection *conn, char *path, struct cdwFile *ef, 
 	struct cdwAssembly *assembly, struct cdwValidFile *vf, char *assay)
 /* Fill out fields of vf.  Create sample subset. */
 {
 /* Make cdwFastqFile record. */
 long long fileId = ef->id;
 
 cdwMakeFastqStatsAndSample(conn, fileId);
 struct cdwFastqFile *fqf = cdwFastqFileFromFileId(conn, fileId);
 verbose(1, "Made sample fastq with %lld reads\n", fqf->sampleCount);
 
 /* Save some key pieces in vf. */
 vf->itemCount = fqf->readCount;
 vf->basesInItems = fqf->baseCount;
 vf->sampleCount = fqf->sampleCount;
 vf->basesInSample = fqf->basesInSample;
 
 /* Align fastq and turn results into bed. */
 char sampleBedName[PATH_LEN], temp[PATH_LEN];
 safef(sampleBedName, PATH_LEN, "%scdwSampleBedXXXXXX", cdwTempDirForToday(temp));
 cdwReserveTempFile(sampleBedName);
 FILE *bedF = mustOpen(sampleBedName, "w");
 alignFastqMakeBed(ef, assembly, fqf->sampleFileName, vf, bedF, assay);
 carefulClose(&bedF);
 vf->sampleBed = cloneString(sampleBedName);
 
 cdwFastqFileFree(&fqf);
 }
 
 #define TYPE_BAM  1
 #define TYPE_READ 2
 
 #ifdef OLD
 struct miniBed
 /* Almost a bed record. */
     {
     struct miniBed *next;
     uint32_t tid;   // Target ID in a bam file 
     uint32_t start; // Start position
     uint32_t size;  // Size of read
     char strand;    //  '+' or '-'
     };
 
 int miniBedCmp(const void *va, const void *vb)
 /* Compare to sort based on query start. */
 {
 const struct targetPos *a = *((struct targetPos **)va);
 const struct targetPos *b = *((struct targetPos **)vb);
 int dif;
 dif = a->tid - b->tid;
 if (dif == 0)
     dif = a->start - b->start;
 return dif;
 }
 
 void cdwMakeSampleOfBam(char *inBamName, FILE *outBed, int maxSampleSize, 
     struct cdwAssembly *assembly, struct genomeRangeTree *grt, struct cdwValidFile *vf)
 /* Sample every downStep items in inBam and write in simplified bed 5 fashion to outBed. */
 {
 samfile_t *sf = samopen(inBamName, "rb", NULL);
 bam_header_t *bamHeader = sf->header;
 struct lm *lm = lmInit(0);
 struct miniBed *mbList = NULL, *mb;
 
 bam1_t one;
 ZeroVar(&one);	// This seems to be necessary!
 
 /* Pass through collecting counts and making up miniBeds for items. */
 long long mappedCount = 0, uniqueMappedCount = 0;
 for (;;)
     {
     if (bam_read1(sf->x.bam, &one) < 0)
 	break;
     int32_t tid = one.core.tid;
     int l_qseq = one.core.l_qseq;
     if (tid > 0)
 	{
 	++mappedCount;
 	if (one.core.qual > cdwMinMapQual)
 	    {
 	    ++uniqueMappedCount;
 	    lmAllocVar(lm, mb);
 	    mb->tid = tid;
 	    mb->start = one.core.pos;
 	    mb->size = l_qseq;
 	    mb-strand = ((one.core.flat & BAM_FREVERSE) ? '-' : '+');
 	    slAddHead(&mbList, mb);
 	    }
 	}
     }
 
 /* Whittle down mini bed list to sample, and crawl through it making bed etc. */
 mbList = slListRandomSample(mbList, maxSampleSize);
 for (mb = mbList; mb != NULL; mb = mb->next)
     {
     vf->sampleCount += 1;
     vf->basesInSample += mb->size;
     char *chrom = bamHeader->target_name[tid];
        {
        if (tid > 0)
 	   {
 	   int start = one.core.pos;
 	   // Approximate here... can do better if parse cigar.
 	   int end = start + l_qseq;	
 	   boolean isRc = (one.core.flag & BAM_FREVERSE);
 	   char strand = '+';
 	   if (isRc)
 	       {
 	       strand = '-';
 	       }
 	   if (start < 0) start=0;
 	   fprintf(outBed, "%s\t%d\t%d\t.\t0\t%c\n", chrom, start, end, strand);
 	   genomeRangeTreeAdd(grt, chrom, start, end);
 	   }
        }
     }
 vf->mapRatio = (double)mappedCount/vf->itemCount;
 vf->uniqueMapRatio = (double)uniqueMappedCount/vf->itemCount;
 vf->depth = vf->basesInItems*vf->mapRatio/assembly->baseCount;
 samclose(sf);
 lmCleanup(&lm);
 }
 #endif /* OLD */
 
 static void checkBamChroms(struct sqlConnection *conn, char *bamPath, struct cdwAssembly *assembly)
 /* Check chromosomes in bam file are in agreement with those in
  * assembly.  The bar here is pretty low - there must be at least one size
  * agreement and no size disagreements. */
 {
 /* Get size hash from two bit file associated with assembly */
 char *twoBitPath = cdwPathForFileId(conn, assembly->twoBitId);
 struct hash *chromHash = twoBitChromHash(twoBitPath);
 
 /* Get bam file handle and chromosome list. */
 samfile_t *fh = bamMustOpenLocal(bamPath, "rb", NULL);
 bam_header_t *bamHeader = sam_hdr_read(fh);
 
 /* Count up chromosomes that match.  Not all need to but at least some do. */
 int i, matchCount = 0;
 for (i = 0; i < bamHeader->n_targets; i++)
     {
     /* Look up chrom size under BAM name, and if that fails under name with "chr" prefix */
     char *name = bamHeader->target_name[i];
     int size = bamHeader->target_len[i];
     int chromSize = hashIntValDefault(chromHash, name, 0);
     char chrName[256];
     if (chromSize == 0)
         {
 	safef(chrName, sizeof(chrName), "chr%s", name);
 	chromSize = hashIntValDefault(chromHash, chrName, 0);
 	}
 
     /* Compare sizes, if they don't match swawk and die, otherwise add to match count */
     if (chromSize != 0)
         {
 	if (chromSize == size)
 	    ++matchCount;
 	else
 	    errAbort("Chromosome size mismatch: %s is %d bases in %s, %d base in %s (%s)",
 		name, size, bamPath, chromSize, assembly->name, twoBitPath);
 	}
     }
 if (matchCount == 0)
     errAbort("%s didn't match any chromosomes in %s", bamPath, twoBitPath);
 
 /* Clean up and go home */
 bamClose(&fh);
 hashFree(&chromHash);
 freez(&twoBitPath);
 }
 
 static void checkBbiChroms(struct sqlConnection *conn, struct bbiFile *bbi, 
     struct cdwAssembly *assembly)
 /* Check chromosomes in bigBed or bigWig file are in agreement with those in
  * assembly.  The bar here is pretty low - there must be at least one size
  * agreement and no size disagreements.  The bbiFile may only have a single
  * chromosome for all we know. Also want to tolerate different use of haplotypes */
 {
 /* Get size hash from two bit file associated with assembly */
 char *twoBitPath = cdwPathForFileId(conn, assembly->twoBitId);
 struct hash *chromHash = twoBitChromHash(twoBitPath);
 struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);
 
 /* Count up chromosomes that match.  Not all need to but at least some do. */
 int matchCount = 0;
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     {
     int chromSize = hashIntValDefault(chromHash, chrom->name, 0);
     if (chromSize != 0)
         {
 	if (chromSize == chrom->size)
 	    ++matchCount;
 	else
 	    errAbort("Chromosome size mismatch: %s is %d bases in %s, %d base in %s (%s)",
 		chrom->name, (int)chrom->size, bbi->fileName, chromSize, 
 		assembly->name, twoBitPath);
 	}
     }
 if (matchCount == 0)
     errAbort("%s didn't match any chromosomes in %s", bbi->fileName, twoBitPath);
 
 /* Clean up and go home */
 bbiChromInfoFreeList(&chromList);
 hashFree(&chromHash);
 freez(&twoBitPath);
 }
 
 void makeValidBigBed( struct sqlConnection *conn, char *path, struct cdwFile *ef, 
 	struct cdwAssembly *assembly, char *format, struct cdwValidFile *vf)
 /* Fill in fields of vf based on bigBed. */
 {
 struct bbiFile *bbi = bigBedFileOpen(path);
 checkBbiChroms(conn, bbi, assembly);
 vf->sampleCount = vf->itemCount = bigBedItemCount(bbi);
 struct bbiSummaryElement sum = bbiTotalSummary(bbi);
 vf->basesInSample = vf->basesInItems = sum.sumData;
 vf->sampleCoverage = (double)sum.validCount/assembly->baseCount;
 vf->depth = (double)sum.sumData/assembly->baseCount;
 vf->mapRatio = 1.0;
 bigBedFileClose(&bbi);
 }
 
 void makeValidBed( struct sqlConnection *conn, char *path, struct cdwFile *ef, 
 	struct cdwAssembly *assembly, char *format, char *asRoot, struct cdwValidFile *vf)
 /* Fill in fields of vf based on bed and grind through file checking it. */
 {
 /* Get structure with info about which fields are true bed. */
 struct cdwBedType *bedType = cdwBedTypeFind(asRoot);
 int bedFieldCount = bedType->bedFields;
 
 /* Load up as file to check against */
 char asPath[PATH_LEN];
 cdwAsPath(asRoot, asPath);
 struct asObject *as = asParseFile(asPath);
 
 /* Create a row one bigger than expected (so can detect rows too big as well 
  * as too small. */
 int colCount = slCount(as->columnList);
 int colAlloc = colCount+1;
 char *row[colAlloc];
 
 /* Loop through file validating each line and collecting statistics. */
 struct lineFile *lf = lineFileOpen(path, TRUE);
 struct bed bed = {};
 char *line;
 int itemCount = 0;
 long long baseCount = 0;
 while (lineFileNextReal(lf, &line))
     {
     int wordsRead = chopByWhite(line, row, colAlloc);
     lineFileExpectWords(lf, colCount, wordsRead);
     loadAndValidateBed(row, bedFieldCount, colCount, lf, &bed, as, FALSE);
     ++itemCount;
     baseCount += bed.chromEnd - bed.chromStart;
     }
 
 asObjectFreeList(&as);
 
 /* Fill in fields of vf based on statistics */
 vf->sampleCount = vf->itemCount = itemCount;
 vf->basesInSample = vf->basesInItems = baseCount;
 vf->sampleCoverage = vf->depth = (double)baseCount/assembly->baseCount;
 vf->mapRatio = 1.0;
 }
 
 void makeValidBigWig(struct sqlConnection *conn, char *path, struct cdwFile *ef, 
 	struct cdwAssembly *assembly, struct cdwValidFile *vf)
 /* Fill in fields of vf based on bigWig. */
 {
 struct bbiFile *bbi = bigWigFileOpen(path);
 checkBbiChroms(conn, bbi, assembly);
 struct bbiSummaryElement sum = bbiTotalSummary(bbi);
 vf->sampleCount = vf->itemCount = vf->basesInSample = vf->basesInItems = sum.validCount;
 vf->sampleCoverage = (double)sum.validCount/assembly->baseCount;
 vf->depth = (double)sum.sumData/assembly->baseCount;
 vf->mapRatio = 1.0;
 bigWigFileClose(&bbi);
 }
 
 void cdwValidFileDump(struct cdwValidFile *vf, FILE *f)
 /* Write out info about vf, just for debugging really */
 {
 fprintf(f, "vf->id = %d\n", vf->id);
 fprintf(f, "vf->licensePlate = %s\n", vf->licensePlate);
 fprintf(f, "vf->fileId = %d\n", vf->fileId);
 fprintf(f, "vf->format = %s\n", vf->format);
 fprintf(f, "vf->outputType = %s\n", vf->outputType);
 fprintf(f, "vf->experiment = %s\n", vf->experiment);
 fprintf(f, "vf->replicate = %s\n", vf->replicate);
 fprintf(f, "vf->enrichedIn = %s\n", vf->enrichedIn);
 fprintf(f, "vf->ucscDb = %s\n", vf->ucscDb);
 fprintf(f, "vf->itemCount = %lld\n", vf->itemCount);
 fprintf(f, "vf->basesInItems = %lld\n", vf->basesInItems);
 fprintf(f, "vf->sampleBed = %s\n", vf->sampleBed);
 fprintf(f, "vf->sampleCount = %lld\n", vf->sampleCount);
 fprintf(f, "vf->basesInSample = %lld\n", vf->basesInSample);
 fprintf(f, "vf->sampleCoverage = %g\n", vf->sampleCoverage);
 fprintf(f, "vf->sampleCount = %g\n", vf->depth);
 }
 
 void makeValidBam( struct sqlConnection *conn, char *path, struct cdwFile *ef, 
 	struct cdwAssembly *assembly, struct cdwValidFile *vf)
 /* Fill out fields of vf based on bam.  Create sample subset as a little bed file. */
 {
 /* Check chromosome sizes to fail fast if need be on wrong genome version. */
 checkBamChroms(conn, path, assembly);
 
 /* Have cdwBamStats do most of the work. */
 char sampleFileName[PATH_LEN];
 struct cdwBamFile *ebf = cdwMakeBamStatsAndSample(conn, ef->id, sampleFileName);
 
 /* Fill in some of validFile record from bamFile record */
 vf->sampleBed = cloneString(sampleFileName);
 vf->itemCount = ebf->readCount;
 vf->basesInItems = ebf->readBaseCount;
 vf->mapRatio = (double)ebf->mappedCount/ebf->readCount;
 vf->uniqueMapRatio = (double)ebf->uniqueMappedCount/ebf->readCount;
 vf->depth = vf->basesInItems*vf->mapRatio/assembly->baseCount;
 
 /* Scan through the bam file to make up information about the sample bits */
 struct genomeRangeTree *grt = genomeRangeTreeNew();
 struct lineFile *lf = lineFileOpen(sampleFileName, TRUE);
 char *row[3];
 while (lineFileRow(lf, row))
     {
     char *chrom = row[0];
     unsigned start = sqlUnsigned(row[1]);
     unsigned end = sqlUnsigned(row[2]);
     vf->sampleCount += 1;
     vf->basesInSample += end - start;
     genomeRangeTreeAdd(grt, chrom, start, end);
     }
 lineFileClose(&lf);
 
 /* Fill in last bits that need summing from the genome range tree. */
 long long basesHitBySample = genomeRangeTreeSumRanges(grt);
 vf->sampleCoverage = (double)basesHitBySample/assembly->baseCount;
 genomeRangeTreeFree(&grt);
 cdwBamFileFree(&ebf);
 }
 
 void makeValid2Bit(struct sqlConnection *conn, char *path, struct cdwFile *ef, 
     struct cdwValidFile *vf)
 /* Fill in info about assembly */
 {
 struct twoBitFile *tbf = twoBitOpen(path);
 vf->basesInItems = vf->basesInSample = twoBitTotalSize(tbf);
 vf->itemCount = vf->sampleCount = tbf->seqCount;
 vf->mapRatio = 1.0;
 vf->sampleCoverage = 1.0;
 vf->depth = 1.0;
 twoBitClose(&tbf);
 }
 
 void makeValidFasta(struct sqlConnection *conn, char *path, struct cdwFile *ef, 
     struct cdwValidFile *vf)
 /* Fill in info about fasta file */
 {
 struct lineFile *lf = lineFileOpen(path, FALSE);
 DNA *dna;
 int size;
 char *name;
 while (faSpeedReadNext(lf, &dna, &size, &name))
     {
     vf->basesInItems += size;
     vf->itemCount += 1;
     }
 lineFileClose(&lf);
 }
 
 void genomeRangeTreeWriteAsBed3(struct genomeRangeTree *grt, char *fileName)
 /* Write as bed4 file */
 {
 FILE *f = mustOpen(fileName, "w");
 struct hashEl *chrom, *chromList = hashElListHash(grt->hash);
 slSort(&chromList, hashElCmpWithEmbeddedNumbers);
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     {
     char *chromName = chrom->name;
     struct rbTree *rangeTree = chrom->val;
     struct range *range, *rangeList = rangeTreeList(rangeTree);
     for (range = rangeList; range != NULL; range = range->next)
 	fprintf(f, "%s\t%d\t%d\n", chromName, range->start, range->end);
     }
 carefulClose(&f);
 }
 
 void makeValidVcf( struct sqlConnection *conn, char *path, struct cdwFile *ef, 
 	struct cdwAssembly *assembly, struct cdwValidFile *vf)
 /* Fill out fields of vf from a variant call format (vcf) file.  Create bed file. */
 {
 /* Have cdwVcfStats do most of the work. */
 char sampleFileName[PATH_LEN];
 struct cdwVcfFile *vcf = cdwMakeVcfStatsAndSample(conn, ef->id, sampleFileName);
 
 /* Fill in some of validFile record from bamFile record */
 vf->sampleBed = cloneString(sampleFileName);
 vf->itemCount = vcf->itemCount;
 vf->basesInItems = vcf->sumOfSizes;
 vf->mapRatio = 1;
 vf->uniqueMapRatio = 1;
 vf->depth = (double)vcf->sumOfSizes/assembly->baseCount;
 vf->sampleCount =  vcf->itemCount;
 vf->basesInSample = vcf->sumOfSizes;
 vf->sampleCoverage = (double)vcf->basesCovered/assembly->baseCount;
 cdwVcfFileFree(&vcf);
 }
 
 
 void makeValidText(struct sqlConnection *conn, char *path, struct cdwFile *ef, 
     struct cdwValidFile *vf)
 /* Fill in info about a text file. */
 {
 struct lineFile *lf = lineFileOpen(path, FALSE);
 char *line;
 int lineSize;
 while (lineFileNext(lf, &line, &lineSize))
     {
     int i;
     for (i=0; i<lineSize; ++i)
          {
 	 if (line[i] == 0)
 	     errAbort("%s is not a text file,  has binary zeroes.", ef->submitFileName);
 	 }
     vf->itemCount += 1;
     }
 lineFileClose(&lf);
 }
 
 void makeValidTabSepFile(struct sqlConnection *conn, char *path, struct cdwFile *ef, 
     struct cdwValidFile *vf, char **labels, int fieldCount)
 /* Make sure a file looks like it's tab separated with a consistent number of columns and
  * optionally first row matching labels. */
 {
 struct lineFile *lf = lineFileOpen(path, TRUE);
 char *row[fieldCount];
 boolean firstTime = TRUE;
 while (lineFileRowTab(lf, row))
     {
     if (firstTime && labels != NULL)
         {
 	int i;
 	for (i=0; i<fieldCount; ++i)
 	    {
 	    if (differentString(labels[i], row[i]))
 	       errAbort("label mismatch field %d.  '%s' vs. '%s'", i, labels[i], row[i]);
 	    }
 	firstTime = FALSE;
 	}
     else
 	vf->itemCount += 1;
     }
 
 lineFileClose(&lf);
 }
 
 void makeValidKallistoAbundance( struct sqlConnection *conn, char *path, struct cdwFile *ef, 
     struct cdwValidFile *vf)
 /* Make sure a kallisto abundance file looks all good */
 {
 char *labels[] = { "target_id",       "length",  "eff_length",      "est_counts",      "tpm", };
 makeValidTabSepFile(conn, path, ef, vf, labels, ArraySize(labels));
 }
 
 void makeValidTsv( struct sqlConnection *conn, char *path, struct cdwFile *ef, 
     struct cdwValidFile *vf)
 /* Make sure a tsv tab-separated values file looks all good */
 {
 struct lineFile *lf = lineFileOpen(path, TRUE);
 // get fieldCount from first line
 int fieldCount = 0;
 int lineSize;
 char *line;
 while (lineFileNext(lf, &line, &lineSize))
     {
     if (line[0] == '#')
         continue;
     fieldCount = chopByChar(line, '\t', NULL, 0);
     }
 lineFileClose(&lf);
 if (fieldCount == 0)
     errAbort("0 columns in tsv %s", ef->submitFileName);
 makeValidTabSepFile(conn, path, ef, vf, NULL, fieldCount);
 }
 
 void makeValidCsv( struct sqlConnection *conn, char *path, struct cdwFile *ef, 
     struct cdwValidFile *vf)
 /* Make sure a csv comma-separated values file looks all good */
 {
 struct lineFile *lf = lineFileOpen(path, TRUE);
 // get fieldCount from first line
 int fieldCount = 0;
 int lineSize;
 char *line;
 int lineNumber = 0;
 while (lineFileNext(lf, &line, &lineSize))
     {
     ++lineNumber;
     struct slName *list = csvParse(line);
     int thisCount = slCount(list);
     if (fieldCount == 0)
 	{
 	fieldCount = thisCount;
 	}
     else
 	{
 	if (thisCount != fieldCount)
 	    errAbort("Line #%d of csv %s has %d columns. Previous rows had %d columns.", 
 		lineNumber, ef->submitFileName, thisCount, fieldCount);
 	vf->itemCount += 1;
 	}
     slFreeList(list);
     }
 lineFileClose(&lf);
 if (fieldCount == 0)
     errAbort("0 columns in csv %s", ef->submitFileName);
 }
 
 void makeValidHtml(struct sqlConnection *conn, char *path, struct cdwFile *ef, 
     struct cdwValidFile *vf)
 /* Fill in info about html file */
 {
 struct htmlPage *page = htmlPageGet(path);
 htmlPageValidateOrAbort(page);
 vf->itemCount = slCount(page->tags);
 htmlPageFree(&page);
 }
 
 void makeValidGtf(struct sqlConnection *conn, char *path, struct cdwFile *ef, 
     struct cdwAssembly *assembly, struct cdwValidFile *vf)
 /* Fill in info about a gtf file. */
 {
 /* Open and read file with generic GFF reader and check it is GTF */
 struct gffFile *gff = gffRead(path);
 if (!gff->isGtf)
     errAbort("file id %lld (%s) is not in GTF format - check it has gene_id and transcript_id",
 	(long long)ef->id, ef->submitFileName);
 
 /* Convert it to a somewhat smaller less informative bed file for sampling purposes. */
 char sampleFileName[PATH_LEN], temp[PATH_LEN];
 safef(sampleFileName, PATH_LEN, "%scdwGffBedXXXXXX", cdwTempDirForToday(temp));
 cdwReserveTempFile(sampleFileName);
 FILE *f = fopen(sampleFileName, "w");
 struct genomeRangeTree *grt = genomeRangeTreeNew();
 
 /* Loop through lines writing out simple bed and adding to genome range tree. */
 struct gffLine *gffLine;
 long long itemCount = 0;
 long long totalSize = 0;
 for (gffLine = gff->lineList; gffLine != NULL; gffLine = gffLine->next)
     {
     totalSize += gffLine->end - gffLine->start;
     fprintf(f, "%s\t%ld\t%ld\n", gffLine->seq, gffLine->start, gffLine->end);
     genomeRangeTreeAdd(grt, gffLine->seq, gffLine->start, gffLine->end);
     ++itemCount;
     }
 carefulClose(&f);
 
 /* Fill out what we can of vf with info we've gathered. */
 vf->itemCount = vf->sampleCount = itemCount;
 vf->basesInItems = vf->basesInSample = totalSize;
 vf->sampleBed = cloneString(sampleFileName);
 long long basesHitBySample = genomeRangeTreeSumRanges(grt);
 genomeRangeTreeFree(&grt);
 vf->sampleCoverage = (double)basesHitBySample/assembly->baseCount;
 vf->mapRatio = 1.0;
 vf->depth = (double)totalSize/assembly->baseCount;
 gffFileFree(&gff);
 }
 
 void makeValidRcc(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf)
 /* Fill in info about a nanostring RCC file. */
 {
 cdwValidateRcc(path);
 }
 
 void makeValidIdat(struct sqlConnection *conn, char *path, 
     struct cdwFile *ef, struct cdwValidFile *vf)
 /* Fill in info about a illumina idac file. */
 {
 cdwValidateIdat(path);
 }
 
 void makeValidPdf(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf)
 /* Check it is really pdf. */
 {
 cdwValidatePdf(path);
 }
 
 void validateVcfGzTbi(struct sqlConnection *conn, char *path, 
     struct cdwFile *ef, struct cdwValidFile *vf)
 /* Given a path to a tabix on a vcf, validate it is tabix, and that the 
  * vcf it refers to exists and has correct name */
 {
 char dir[PATH_LEN], name[FILENAME_LEN], extension[FILEEXT_LEN];
 splitPath(path, dir, name, extension);
 char vcfPath[PATH_LEN];
 safef(vcfPath, sizeof(vcfPath), "%s%s%s", dir, name, extension);
 if (!fileExists(vcfPath))
     {
     /* Look for it under cdwPathName */
     if (!cdwFindInSameSubmitDir(conn, ef, vcfPath))
 	errAbort("%s, the original of %s doesn't exist", vcfPath, path);
     }
 }
 
 void makeValidCustomTrack(struct sqlConnection *conn, char *path, 
     struct cdwFile *ef, struct cdwValidFile *vf)
 /* Fill in some info about a BED file of no particular sub-format. This is allowed to have
  * browser and track lines in it, which are ignored. */
 {
 struct lineFile *lf = lineFileOpen(path, TRUE);
 char *line;
 char *row[256];
 int bedSize = 0;
 while (lineFileNextReal(lf, &line))
     {
     if (startsWithWord("browser", line) || startsWithWord("track", line))
         continue;
     int wordCount = chopLine(line, row);
     if (bedSize == 0)
 	{
         bedSize = wordCount;
 	if (bedSize < 3)
 	    {
 	    lineFileExpectAtLeast(lf, 3, bedSize);
 	    }
 	}
     else
 	{
         if (bedSize != wordCount)
 	    {
 	    errAbort("Some lines of %s have %d words, but line %d of has %d words",
 		    lf->fileName, bedSize, lf->lineIx, wordCount);
 	    }
 	}
     int start = lineFileNeedNum(lf, row, 1);
     int end = lineFileNeedNum(lf, row, 2);
     if (end < start)
          errAbort("end before start line %d of %s", lf->lineIx, lf->fileName);
     ++vf->itemCount;
     vf->basesInItems += (end - start);
     }
 lineFileClose(&lf);
 }
 
 static void needAssembly(struct cdwFile *ef, char *format, struct cdwAssembly *assembly)
 /* Require assembly tag be present. */
 {
 if (assembly == NULL)
     errAbort("file id %lld (%s) is %s format and needs an assembly tag to validate", 
 	(long long)ef->id, ef->submitFileName, format);
 }
 
 void mustMakeValidFile(struct sqlConnection *conn, struct cdwFile *ef, struct cgiParsedVars *tags,
     long long oldValidId)
 /* If possible make a cdwValidFile record for this.  Makes sure all the right tags are there,
  * and then parses file enough to determine itemCount and the like.  For some files, like fastqs,
  * it will take a subset of the file as a sample so can do QA without processing the whole thing. */
 {
 /* Make up validFile from tags and id */
 
 struct cdwValidFile *vf;
 AllocVar(vf);
 vf->fileId = ef->id;
 cdwValidFileFieldsFromTags(vf, tags);
 vf->sampleBed = "";
 
 if (oldValidId == 0) // moving up here to mitigate race condition
     {
     cdwValidFileSaveToDb(conn, vf, "cdwValidFile", 512);
     vf->id = sqlLastAutoId(conn);
     }
 else
     {
     vf->id = oldValidId;
     }
 
 if (vf->format)	// We only can validate if we have something for format 
     {
     /* Look up assembly. */
     struct cdwAssembly *assembly = NULL;
     if (!isEmpty(vf->ucscDb) && !sameString(vf->ucscDb, "unknown"))
 	{
 	char *ucscDb = vf->ucscDb;
 	char query[256];
 	sqlSafef(query, sizeof(query), "select * from cdwAssembly where ucscDb='%s'", vf->ucscDb);
 	assembly = cdwAssemblyLoadByQuery(conn, query);
 	if (assembly == NULL)
 	    errAbort("Couldn't find assembly corresponding to %s", ucscDb);
 	}
 
     /* Make path to file */
     char path[PATH_LEN];
     safef(path, sizeof(path), "%s%s", cdwRootDir, ef->cdwFileName);
 
     /* And dispatch according to format. */
     char *format = vf->format;
     char *suffix = cdwFindDoubleFileSuffix(path);
     char suffixBuf[128];
 
     char *bedPrefix = "bed_";
     int bedPrefixSize = 4;
 
     if (sameString(format, "fastq"))
 	{
 	char *assay = cdwLookupTag(tags, "assay");
 	needAssembly(ef, format, assembly);
 	makeValidFastq(conn, path, ef, assembly, vf, assay);
 	suffix = ".fastq.gz";
 	}
     else if (cdwIsSupportedBigBedFormat(format))
 	{
 	needAssembly(ef, format, assembly);
 	makeValidBigBed(conn, path, ef, assembly, format, vf);
 	if (sameString(format, "bigBed"))
 	    suffix = ".bigBed";
 	else
 	    {
 	    safef(suffixBuf, sizeof(suffixBuf), ".%s.bigBed", format);
 	    suffix = suffixBuf;
 	    }
 	}
     else if (startsWith(bedPrefix, format) && cdwIsSupportedBigBedFormat(format+bedPrefixSize))
         {
 	char *formatNoBed = format + bedPrefixSize;
 	needAssembly(ef, format, assembly);
 	makeValidBed(conn, path, ef, assembly, format, formatNoBed, vf);
 	safef(suffixBuf, sizeof(suffixBuf), ".%s.bed", format);
 	suffix = suffixBuf;
 	}
     else if (sameString(format, "bigWig"))
         {
 	needAssembly(ef, format, assembly);
 	makeValidBigWig(conn, path, ef, assembly, vf);
 	suffix = ".bigWig";
 	}
     else if (sameString(format, "bam"))
         {
 	needAssembly(ef, format, assembly);
 	makeValidBam(conn, path, ef, assembly, vf);
 	suffix = ".bam";
 	}
     else if (sameString(format, "bam.bai"))
         {
 	cdwValidateBamIndex(path);
 	suffix = ".bam.bai";
 	}
     else if (sameString(format, "vcf.idx"))
         {
 	cdwValidateTabixIndex(path);
 	suffix = ".vcf.idx";
 	}
     else if (sameString(format, "2bit"))
         {
 	makeValid2Bit(conn, path, ef, vf);
 	suffix = ".2bit";
 	}
     else if (sameString(format, "fasta"))
         {
 	makeValidFasta(conn, path, ef, vf);
 	suffix = ".fasta.gz";
 	}
     else if (sameString(format, "gtf"))
         {
 	needAssembly(ef, format, assembly);
 	makeValidGtf(conn, path, ef, assembly, vf);
 	suffix = ".gtf.gz";
 	}
     else if (sameString(format, "rcc"))
         {
 	makeValidRcc(conn, path, ef, vf);
 	suffix = ".RCC";
 	}
     else if (sameString(format, "idat"))
         {
 	makeValidIdat(conn, path, ef, vf);
 	suffix = ".idat";
 	}
     else if (sameString(format, "customTrack"))
         {
 	makeValidCustomTrack(conn, path, ef, vf);
 	assert(endsWith(ef->submitFileName, ".gz"));
 	suffix = cdwFindDoubleFileSuffix(ef->submitFileName);
 	}
     else if (sameString(format, "pdf"))
         {
 	makeValidPdf(conn, path, ef, vf);
 	suffix = ".pdf";
 	}
     else if (sameString(format, "vcf"))
         {
 	needAssembly(ef, format, assembly);
 	makeValidVcf(conn, path, ef, assembly, vf);
 	if (endsWith(ef->submitFileName, ".gz"))
 	    suffix = ".vcf.gz";
 	else
 	    suffix = ".vcf";
 	}
     else if (sameString(format, "vcf.gz.tbi"))
         {
 	validateVcfGzTbi(conn, path, ef, vf);
 	suffix = ".vcf.gz.tbi";
 	}
     else if (sameString(format, "cram"))
         {
 	cdwValidateCram(path);
 	suffix = ".cram";
 	}
     else if (sameString(format, "jpg"))
         {
 	cdwValidateJpg(path);
 	suffix = ".jpg";
 	}
     else if (sameString(format, "png"))
         {
 	cdwValidatePng(path);
 	suffix = ".png";
 	}
     else if (sameString(format, "expression_matrix"))
 	{
 	makeValidText(conn, path, ef, vf); 
 	}
     else if (sameString(format, "text"))
         {
 	makeValidText(conn, path, ef, vf);
 	}
     else if (sameString(format, "kallisto_abundance"))
 	{
         makeValidKallistoAbundance(conn, path, ef, vf);
 	}
     else if (sameString(format, "tsv"))
 	{
         makeValidTsv(conn, path, ef, vf);
 	}
     else if (sameString(format, "csv"))
 	{
         makeValidCsv(conn, path, ef, vf);
 	}
     else if (sameString(format, "html"))
         {
 	makeValidHtml(conn, path, ef, vf);
 	}
+    else if (sameString(format, "tiff"))
+        {
+	/* No specific validation needed for tiff format. */
+	}
+    else if (sameString(format, "avi"))
+        {
+        /* No specific validation needed for avi format. */
+        }
     else if (sameString(format, "raw"))
         {
 	/* No specific validation needed for raw format. */
 	}
     else if (sameString(format, "xls"))
         {
         /* No specific validation needed for xls format. */
         }
     else if (sameString(format, "unknown"))
         {
         /* No specific validation needed for unknown format. */
         }
     else
         {
 	errAbort("Unrecognized format %s for %s\n", format, ef->cdwFileName);
 	}
 
     /* Save record except for license plate to DB. */
     cdwValidFileUpdateDb(conn, vf, vf->id);
 
     /* Create license plate around our ID.  File in warehouse to use license plate
      * instead of baby-babble IDs. */
     cdwMakeLicensePlate( cdwLicensePlateHead(conn), 
 	vf->id, vf->licensePlate, cdwMaxPlateSize);
 
     /* Create swapped out version of cdwFileName in newName. */
     struct dyString *newName = dyStringNew(0);
     char *fileName = ef->cdwFileName;
     char *dirEnd = strrchr(fileName, '/');
     if (dirEnd == NULL)
 	dirEnd = fileName;
     else
 	dirEnd += 1;
     dyStringAppendN(newName, fileName, dirEnd - fileName);
     dyStringAppend(newName, vf->licensePlate);
     dyStringAppend(newName, suffix);
 
     /* Now build full path names and attempt rename in file system. */
     char oldPath[PATH_LEN], newPath[PATH_LEN];
     safef(oldPath, sizeof(oldPath), "%s%s", cdwRootDir, fileName);
     safef(newPath, sizeof(newPath), "%s%s", cdwRootDir, newName->string);
 
     char query[PATH_LEN+256];
 
     // rename symlink to it in submitDir
     sqlSafef(query, sizeof(query), "select url from cdwSubmitDir where id='%d'", ef->submitDirId);
     char *submitDir = sqlQuickString(conn, query);
     if (!submitDir)
 	errAbort("submitDir not found for id %d", ef->submitDirId);
 
 
     boolean metaException = sameString(ef->submitFileName, "meta.txt");
     // if meta.txt is being explicitly submitted, this is probably for dataset page meta links.
     // this makes it an odd exception, because unlike normally submitted files,
     // this will NOT be converted to a symlink pointing to the cdw/ dir file
     // during the original cdwSubmit.
 
     char *lastPath = NULL;
     if (!metaException)
 	{
 	lastPath = findSubmitSymlink(ef->submitFileName, submitDir, oldPath);
 	freeMem(submitDir);
 	if (!lastPath)
 	    noWarnAbort();
 
 	verbose(3, "lastPath=%s newPath=%s\n", lastPath, newPath);
 	if (unlink(lastPath) == -1)  // drop about to be invalid symlink
 	    errnoAbort("unlink failure %s", lastPath);
 	}
 
     mustRename(oldPath, newPath);
 
     if (!metaException)
 	{
 	if (symlink(newPath, lastPath) == -1)  // replace with symlink
 	    errnoAbort("symlink failure from %s to %s", lastPath, newPath);
 	freeMem(lastPath);
 	}
 
     verbose(2, "Renamed %s to %s\n", oldPath, newPath);
 
     /* Update database with new name */
     sqlSafef(query, sizeof(query), "update cdwFile set cdwFileName='%s' where id=%lld",
 	newName->string, (long long)ef->id);
     sqlUpdate(conn, query);
 
     dyStringFree(&newName);
 
     /* Update validFile record with license plate. */
     sqlSafef(query, sizeof(query), "update cdwValidFile set licensePlate='%s' where id=%lld", 
 	vf->licensePlate, (long long)vf->id);
     sqlUpdate(conn, query);
     }
 freez(&vf);
 }
 
 boolean makeValidFile(struct sqlConnection *conn, struct cdwFile *ef, struct cgiParsedVars *tags,
     long long oldValidId)
 /* Run mustMakeValidFile under an error catcher.  On failure the caught message is handed to
  * cdwWriteErrToStderrAndTable for this file's cdwFile row, the submitted file name is warned
  * about, and FALSE is returned.  On success the errorMessage column of the cdwFile row is
  * reset to empty and TRUE is returned. */
 {
 struct errCatch *catcher = errCatchNew();
 if (errCatchStart(catcher))
     mustMakeValidFile(conn, ef, tags, oldValidId);
 errCatchEnd(catcher);
 
 boolean ok = !catcher->gotError;
 if (ok)
     {
     /* Pass along whatever text the catcher collected so status output stays legible. */
     warn("%s", catcher->message->string);
 
     /* Validation went through, so wipe any error text stored for this file. */
     char clearSql[256];
     sqlSafef(clearSql, sizeof(clearSql), "update cdwFile set errorMessage='' where id=%lld",
         (long long)ef->id);
     sqlUpdate(conn, clearSql);
     }
 else
     {
     cdwWriteErrToStderrAndTable(conn, "cdwFile", ef->id, catcher->message->string);
     warn("This is from submitted file %s", ef->submitFileName);
     }
 errCatchFree(&catcher);
 return ok;
 }
 
 void cdwClearFileError(struct sqlConnection *conn, long long fileId)
 /* Set errorMessage to the empty string on the cdwFile row whose id is fileId. */
 {
 char updateSql[256];
 sqlSafef(updateSql, sizeof(updateSql),
     "update cdwFile set errorMessage='' where id=%lld", fileId);
 sqlUpdate(conn, updateSql);
 }
 
 void cdwMakeValidFile(int startId, int endId)
 /* cdwMakeValidFile - Add range of ids to valid file table.
  * Loads every cdwFile row with id in startId..endId (inclusive) that has nonzero
  * endUploadTime and updateTime.  A file is skipped when it already has a cdwValidFile
  * row, has an empty errorMessage, and -redo was not given; it is also skipped when it
  * has no tags.  Everything else goes through makeValidFile.  Aborts if no files are in
  * range, or once the running errCount reaches maxErrCount. */
 {
 /* Make list with all files in ID range  - don't want to use cdwFileAllIntactInRange because
  * we may be revalidating files that do have errors. */
 struct sqlConnection *conn = sqlConnect(cdwDatabase);
 char query[512];
 sqlSafef(query, sizeof(query),
     "select * from cdwFile where id>=%d and id<=%d and endUploadTime != 0 "
     "and updateTime != 0", 
     startId, endId);
 struct cdwFile *ef, *efList = cdwFileLoadByQuery(conn, query);
 if (efList == NULL)
     errAbort("No files in %d to %d", startId, endId);
 
 for (ef = efList; ef != NULL; ef = ef->next)
     {
     /* The file list is already loaded, so the query buffer is free to be reused here
      * rather than shadowed by a second buffer of the same name. */
     sqlSafef(query, sizeof(query), "select id from cdwValidFile where fileId=%lld",
         (long long)ef->id);
     long long vfId = sqlQuickLongLong(conn, query);
     if (vfId != 0 && isEmpty(ef->errorMessage) && !redo)
         {
         verbose(2, "already validated %s %s\n", ef->cdwFileName, ef->submitFileName);
         }
     else
         {
         verbose(1, "processing %lld %s %s\n", (long long)ef->id, ef->cdwFileName,
             ef->submitFileName);
         if (!isEmpty(ef->tags)) // All ones we care about have tags
             {
             /* Revalidating a file that already has a valid-file row: clear the old
              * error first so a stale message does not linger. */
             if (vfId != 0)
                 cdwClearFileError(conn, ef->id);
             struct cgiParsedVars *tags = cdwMetaVarsList(conn, ef);
             if (!makeValidFile(conn, ef, tags, vfId))
                 {
                 if (++errCount >= maxErrCount)
                     errAbort("Aborting after %d errors", errCount);
                 }
             cgiParsedVarsFreeList(&tags);
             }
         else
             {
             verbose(2, "no tags to validate on %s %s\n", ef->cdwFileName, ef->submitFileName);
             }
         }
     }
 
 cdwFileFreeList(&efList);
 sqlDisconnect(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Parse options, insist on exactly two positional arguments (startId endId),
  * pick up maxErrCount and redo from the command line, then run. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 
 maxErrCount = optionInt("maxErrCount", maxErrCount);
 redo = optionExists("redo");
 
 int startId = sqlUnsigned(argv[1]);
 int endId = sqlUnsigned(argv[2]);
 cdwMakeValidFile(startId, endId);
 return 0;
 }