96d7a1cc2f69c3989b4ca7cc2e94ef2b8df310dd brianlee Tue Mar 4 11:18:54 2025 -0800 Adding tiff and avi to cdwSubmit.c and cdwMakeValidFile.c for ingest diff --git src/hg/cirm/cdw/cdwMakeValidFile/cdwMakeValidFile.c src/hg/cirm/cdw/cdwMakeValidFile/cdwMakeValidFile.c index c213b694216..68e9fd1ea9b 100644 --- src/hg/cirm/cdw/cdwMakeValidFile/cdwMakeValidFile.c +++ src/hg/cirm/cdw/cdwMakeValidFile/cdwMakeValidFile.c @@ -1,1107 +1,1115 @@ /* cdwMakeValidFile - Add range of ids to valid file table. */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "errCatch.h" #include "localmem.h" #include "errAbort.h" #include "sqlNum.h" #include "cheapcgi.h" #include "obscure.h" #include "jksql.h" #include "asParse.h" #include "twoBit.h" #include "genomeRangeTree.h" #include "basicBed.h" #include "bbiFile.h" #include "bigWig.h" #include "bigBed.h" #include "bamFile.h" #include "htmlPage.h" #include "portable.h" #include "gff.h" #include "cdw.h" #include "cdwLib.h" #include "fa.h" #include "filePath.h" #include "cdwValid.h" #include "vcf.h" #include "csv.h" int maxErrCount = 1; /* Set from command line. */ int errCount; /* Set as we run. */ boolean redo = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "cdwMakeValidFile - Add range of ids to valid file table.\n" "usage:\n" " cdwMakeValidFile startId endId\n" "options:\n" " maxErrCount=N - maximum errors allowed before it stops, default %d\n" " -redo - redo validation even if have it already\n" , maxErrCount); } /* Command line validation table. */ static struct optionSpec options[] = { {"maxErrCount", OPTION_INT}, {"redo", OPTION_BOOLEAN}, {NULL, 0}, }; void alignFastqMakeBed(struct cdwFile *ef, struct cdwAssembly *assembly, char *fastqPath, struct cdwValidFile *vf, FILE *bedF, char *assay) /* Take a sample fastq and run bwa on it, and then convert that file to a bed. * Update vf->mapRatio and related fields. */ { cdwAlignFastqMakeBed(ef, assembly, fastqPath, vf, bedF, &vf->mapRatio, &vf->depth, &vf->sampleCoverage, &vf->uniqueMapRatio, assay); } void makeValidFastq( struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwAssembly *assembly, struct cdwValidFile *vf, char *assay) /* Fill out fields of vf. Create sample subset. */ { /* Make cdwFastqFile record. */ long long fileId = ef->id; cdwMakeFastqStatsAndSample(conn, fileId); struct cdwFastqFile *fqf = cdwFastqFileFromFileId(conn, fileId); verbose(1, "Made sample fastq with %lld reads\n", fqf->sampleCount); /* Save some key pieces in vf. */ vf->itemCount = fqf->readCount; vf->basesInItems = fqf->baseCount; vf->sampleCount = fqf->sampleCount; vf->basesInSample = fqf->basesInSample; /* Align fastq and turn results into bed. */ char sampleBedName[PATH_LEN], temp[PATH_LEN]; safef(sampleBedName, PATH_LEN, "%scdwSampleBedXXXXXX", cdwTempDirForToday(temp)); cdwReserveTempFile(sampleBedName); FILE *bedF = mustOpen(sampleBedName, "w"); alignFastqMakeBed(ef, assembly, fqf->sampleFileName, vf, bedF, assay); carefulClose(&bedF); vf->sampleBed = cloneString(sampleBedName); cdwFastqFileFree(&fqf); } #define TYPE_BAM 1 #define TYPE_READ 2 #ifdef OLD struct miniBed /* Almost a bed record. */ { struct miniBed *next; uint32_t tid; // Target ID in a bam file uint32_t start; // Start position uint32_t size; // Size of read char strand; // '+' or '-' }; int miniBedCmp(const void *va, const void *vb) /* Compare to sort based on query start. */ { const struct targetPos *a = *((struct targetPos **)va); const struct targetPos *b = *((struct targetPos **)vb); int dif; dif = a->tid - b->tid; if (dif == 0) dif = a->start - b->start; return dif; } void cdwMakeSampleOfBam(char *inBamName, FILE *outBed, int maxSampleSize, struct cdwAssembly *assembly, struct genomeRangeTree *grt, struct cdwValidFile *vf) /* Sample every downStep items in inBam and write in simplified bed 5 fashion to outBed. */ { samfile_t *sf = samopen(inBamName, "rb", NULL); bam_header_t *bamHeader = sf->header; struct lm *lm = lmInit(0); struct miniBed *mbList = NULL, *mb; bam1_t one; ZeroVar(&one); // This seems to be necessary! /* Pass through collecting counts and making up miniBeds for items. */ long long mappedCount = 0, uniqueMappedCount = 0; for (;;) { if (bam_read1(sf->x.bam, &one) < 0) break; int32_t tid = one.core.tid; int l_qseq = one.core.l_qseq; if (tid > 0) { ++mappedCount; if (one.core.qual > cdwMinMapQual) { ++uniqueMappedCount; lmAllocVar(lm, mb); mb->tid = tid; mb->start = one.core.pos; mb->size = l_qseq; mb-strand = ((one.core.flat & BAM_FREVERSE) ? '-' : '+'); slAddHead(&mbList, mb); } } } /* Whittle down mini bed list to sample, and crawl through it making bed etc. */ mbList = slListRandomSample(mbList, maxSampleSize); for (mb = mbList; mb != NULL; mb = mb->next) { vf->sampleCount += 1; vf->basesInSample += mb->size; char *chrom = bamHeader->target_name[tid]; { if (tid > 0) { int start = one.core.pos; // Approximate here... can do better if parse cigar. int end = start + l_qseq; boolean isRc = (one.core.flag & BAM_FREVERSE); char strand = '+'; if (isRc) { strand = '-'; } if (start < 0) start=0; fprintf(outBed, "%s\t%d\t%d\t.\t0\t%c\n", chrom, start, end, strand); genomeRangeTreeAdd(grt, chrom, start, end); } } } vf->mapRatio = (double)mappedCount/vf->itemCount; vf->uniqueMapRatio = (double)uniqueMappedCount/vf->itemCount; vf->depth = vf->basesInItems*vf->mapRatio/assembly->baseCount; samclose(sf); lmCleanup(&lm); } #endif /* OLD */ static void checkBamChroms(struct sqlConnection *conn, char *bamPath, struct cdwAssembly *assembly) /* Check chromosomes in bam file are in agreement with those in * assembly. The bar here is pretty low - there must be at least one size * agreement and no size disagreements. */ { /* Get size hash from two bit file associated with assembly */ char *twoBitPath = cdwPathForFileId(conn, assembly->twoBitId); struct hash *chromHash = twoBitChromHash(twoBitPath); /* Get bam file handle and chromosome list. */ samfile_t *fh = bamMustOpenLocal(bamPath, "rb", NULL); bam_header_t *bamHeader = sam_hdr_read(fh); /* Count up chromosomes that match. Not all need to but at least some do. */ int i, matchCount = 0; for (i = 0; i < bamHeader->n_targets; i++) { /* Look up chrom size under BAM name, and if that fails under name with "chr" prefix */ char *name = bamHeader->target_name[i]; int size = bamHeader->target_len[i]; int chromSize = hashIntValDefault(chromHash, name, 0); char chrName[256]; if (chromSize == 0) { safef(chrName, sizeof(chrName), "chr%s", name); chromSize = hashIntValDefault(chromHash, chrName, 0); } /* Compare sizes, if they don't match swawk and die, otherwise add to match count */ if (chromSize != 0) { if (chromSize == size) ++matchCount; else errAbort("Chromosome size mismatch: %s is %d bases in %s, %d base in %s (%s)", name, size, bamPath, chromSize, assembly->name, twoBitPath); } } if (matchCount == 0) errAbort("%s didn't match any chromosomes in %s", bamPath, twoBitPath); /* Clean up and go home */ bamClose(&fh); hashFree(&chromHash); freez(&twoBitPath); } static void checkBbiChroms(struct sqlConnection *conn, struct bbiFile *bbi, struct cdwAssembly *assembly) /* Check chromosomes in bigBed or bigWig file are in agreement with those in * assembly. The bar here is pretty low - there must be at least one size * agreement and no size disagreements. The bbiFile may only have a single * chromosome for all we know. Also want to tolerate different use of haplotypes */ { /* Get size hash from two bit file associated with assembly */ char *twoBitPath = cdwPathForFileId(conn, assembly->twoBitId); struct hash *chromHash = twoBitChromHash(twoBitPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* Count up chromosomes that match. Not all need to but at least some do. */ int matchCount = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { int chromSize = hashIntValDefault(chromHash, chrom->name, 0); if (chromSize != 0) { if (chromSize == chrom->size) ++matchCount; else errAbort("Chromosome size mismatch: %s is %d bases in %s, %d base in %s (%s)", chrom->name, (int)chrom->size, bbi->fileName, chromSize, assembly->name, twoBitPath); } } if (matchCount == 0) errAbort("%s didn't match any chromosomes in %s", bbi->fileName, twoBitPath); /* Clean up and go home */ bbiChromInfoFreeList(&chromList); hashFree(&chromHash); freez(&twoBitPath); } void makeValidBigBed( struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwAssembly *assembly, char *format, struct cdwValidFile *vf) /* Fill in fields of vf based on bigBed. */ { struct bbiFile *bbi = bigBedFileOpen(path); checkBbiChroms(conn, bbi, assembly); vf->sampleCount = vf->itemCount = bigBedItemCount(bbi); struct bbiSummaryElement sum = bbiTotalSummary(bbi); vf->basesInSample = vf->basesInItems = sum.sumData; vf->sampleCoverage = (double)sum.validCount/assembly->baseCount; vf->depth = (double)sum.sumData/assembly->baseCount; vf->mapRatio = 1.0; bigBedFileClose(&bbi); } void makeValidBed( struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwAssembly *assembly, char *format, char *asRoot, struct cdwValidFile *vf) /* Fill in fields of vf based on bed and grind through file checking it. */ { /* Get structure with info about which fields are true bed. */ struct cdwBedType *bedType = cdwBedTypeFind(asRoot); int bedFieldCount = bedType->bedFields; /* Load up as file to check against */ char asPath[PATH_LEN]; cdwAsPath(asRoot, asPath); struct asObject *as = asParseFile(asPath); /* Create a row one bigger than expected (so can detect rows too big as well * as too small. */ int colCount = slCount(as->columnList); int colAlloc = colCount+1; char *row[colAlloc]; /* Loop through file validating each line and collecting statistics. */ struct lineFile *lf = lineFileOpen(path, TRUE); struct bed bed = {}; char *line; int itemCount = 0; long long baseCount = 0; while (lineFileNextReal(lf, &line)) { int wordsRead = chopByWhite(line, row, colAlloc); lineFileExpectWords(lf, colCount, wordsRead); loadAndValidateBed(row, bedFieldCount, colCount, lf, &bed, as, FALSE); ++itemCount; baseCount += bed.chromEnd - bed.chromStart; } asObjectFreeList(&as); /* Fill in fields of vf based on statistics */ vf->sampleCount = vf->itemCount = itemCount; vf->basesInSample = vf->basesInItems = baseCount; vf->sampleCoverage = vf->depth = (double)baseCount/assembly->baseCount; vf->mapRatio = 1.0; } void makeValidBigWig(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwAssembly *assembly, struct cdwValidFile *vf) /* Fill in fields of vf based on bigWig. */ { struct bbiFile *bbi = bigWigFileOpen(path); checkBbiChroms(conn, bbi, assembly); struct bbiSummaryElement sum = bbiTotalSummary(bbi); vf->sampleCount = vf->itemCount = vf->basesInSample = vf->basesInItems = sum.validCount; vf->sampleCoverage = (double)sum.validCount/assembly->baseCount; vf->depth = (double)sum.sumData/assembly->baseCount; vf->mapRatio = 1.0; bigWigFileClose(&bbi); } void cdwValidFileDump(struct cdwValidFile *vf, FILE *f) /* Write out info about vf, just for debugging really */ { fprintf(f, "vf->id = %d\n", vf->id); fprintf(f, "vf->licensePlate = %s\n", vf->licensePlate); fprintf(f, "vf->fileId = %d\n", vf->fileId); fprintf(f, "vf->format = %s\n", vf->format); fprintf(f, "vf->outputType = %s\n", vf->outputType); fprintf(f, "vf->experiment = %s\n", vf->experiment); fprintf(f, "vf->replicate = %s\n", vf->replicate); fprintf(f, "vf->enrichedIn = %s\n", vf->enrichedIn); fprintf(f, "vf->ucscDb = %s\n", vf->ucscDb); fprintf(f, "vf->itemCount = %lld\n", vf->itemCount); fprintf(f, "vf->basesInItems = %lld\n", vf->basesInItems); fprintf(f, "vf->sampleBed = %s\n", vf->sampleBed); fprintf(f, "vf->sampleCount = %lld\n", vf->sampleCount); fprintf(f, "vf->basesInSample = %lld\n", vf->basesInSample); fprintf(f, "vf->sampleCoverage = %g\n", vf->sampleCoverage); fprintf(f, "vf->sampleCount = %g\n", vf->depth); } void makeValidBam( struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwAssembly *assembly, struct cdwValidFile *vf) /* Fill out fields of vf based on bam. Create sample subset as a little bed file. */ { /* Check chromosome sizes to fail fast if need be on wrong genome version. */ checkBamChroms(conn, path, assembly); /* Have cdwBamStats do most of the work. */ char sampleFileName[PATH_LEN]; struct cdwBamFile *ebf = cdwMakeBamStatsAndSample(conn, ef->id, sampleFileName); /* Fill in some of validFile record from bamFile record */ vf->sampleBed = cloneString(sampleFileName); vf->itemCount = ebf->readCount; vf->basesInItems = ebf->readBaseCount; vf->mapRatio = (double)ebf->mappedCount/ebf->readCount; vf->uniqueMapRatio = (double)ebf->uniqueMappedCount/ebf->readCount; vf->depth = vf->basesInItems*vf->mapRatio/assembly->baseCount; /* Scan through the bam file to make up information about the sample bits */ struct genomeRangeTree *grt = genomeRangeTreeNew(); struct lineFile *lf = lineFileOpen(sampleFileName, TRUE); char *row[3]; while (lineFileRow(lf, row)) { char *chrom = row[0]; unsigned start = sqlUnsigned(row[1]); unsigned end = sqlUnsigned(row[2]); vf->sampleCount += 1; vf->basesInSample += end - start; genomeRangeTreeAdd(grt, chrom, start, end); } lineFileClose(&lf); /* Fill in last bits that need summing from the genome range tree. */ long long basesHitBySample = genomeRangeTreeSumRanges(grt); vf->sampleCoverage = (double)basesHitBySample/assembly->baseCount; genomeRangeTreeFree(&grt); cdwBamFileFree(&ebf); } void makeValid2Bit(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Fill in info about assembly */ { struct twoBitFile *tbf = twoBitOpen(path); vf->basesInItems = vf->basesInSample = twoBitTotalSize(tbf); vf->itemCount = vf->sampleCount = tbf->seqCount; vf->mapRatio = 1.0; vf->sampleCoverage = 1.0; vf->depth = 1.0; twoBitClose(&tbf); } void makeValidFasta(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Fill in info about fasta file */ { struct lineFile *lf = lineFileOpen(path, FALSE); DNA *dna; int size; char *name; while (faSpeedReadNext(lf, &dna, &size, &name)) { vf->basesInItems += size; vf->itemCount += 1; } lineFileClose(&lf); } void genomeRangeTreeWriteAsBed3(struct genomeRangeTree *grt, char *fileName) /* Write as bed4 file */ { FILE *f = mustOpen(fileName, "w"); struct hashEl *chrom, *chromList = hashElListHash(grt->hash); slSort(&chromList, hashElCmpWithEmbeddedNumbers); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { char *chromName = chrom->name; struct rbTree *rangeTree = chrom->val; struct range *range, *rangeList = rangeTreeList(rangeTree); for (range = rangeList; range != NULL; range = range->next) fprintf(f, "%s\t%d\t%d\n", chromName, range->start, range->end); } carefulClose(&f); } void makeValidVcf( struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwAssembly *assembly, struct cdwValidFile *vf) /* Fill out fields of vf from a variant call format (vcf) file. Create bed file. */ { /* Have cdwVcfStats do most of the work. */ char sampleFileName[PATH_LEN]; struct cdwVcfFile *vcf = cdwMakeVcfStatsAndSample(conn, ef->id, sampleFileName); /* Fill in some of validFile record from bamFile record */ vf->sampleBed = cloneString(sampleFileName); vf->itemCount = vcf->itemCount; vf->basesInItems = vcf->sumOfSizes; vf->mapRatio = 1; vf->uniqueMapRatio = 1; vf->depth = (double)vcf->sumOfSizes/assembly->baseCount; vf->sampleCount = vcf->itemCount; vf->basesInSample = vcf->sumOfSizes; vf->sampleCoverage = (double)vcf->basesCovered/assembly->baseCount; cdwVcfFileFree(&vcf); } void makeValidText(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Fill in info about a text file. */ { struct lineFile *lf = lineFileOpen(path, FALSE); char *line; int lineSize; while (lineFileNext(lf, &line, &lineSize)) { int i; for (i=0; isubmitFileName); } vf->itemCount += 1; } lineFileClose(&lf); } void makeValidTabSepFile(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf, char **labels, int fieldCount) /* Make sure a file looks like it's tab separated with a consistent number of columns and * optionally first row matching labels. */ { struct lineFile *lf = lineFileOpen(path, TRUE); char *row[fieldCount]; boolean firstTime = TRUE; while (lineFileRowTab(lf, row)) { if (firstTime && labels != NULL) { int i; for (i=0; iitemCount += 1; } lineFileClose(&lf); } void makeValidKallistoAbundance( struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Make sure a kallisto abundance file looks all good */ { char *labels[] = { "target_id", "length", "eff_length", "est_counts", "tpm", }; makeValidTabSepFile(conn, path, ef, vf, labels, ArraySize(labels)); } void makeValidTsv( struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Make sure a tsv tab-separated values file looks all good */ { struct lineFile *lf = lineFileOpen(path, TRUE); // get fieldCount from first line int fieldCount = 0; int lineSize; char *line; while (lineFileNext(lf, &line, &lineSize)) { if (line[0] == '#') continue; fieldCount = chopByChar(line, '\t', NULL, 0); } lineFileClose(&lf); if (fieldCount == 0) errAbort("0 columns in tsv %s", ef->submitFileName); makeValidTabSepFile(conn, path, ef, vf, NULL, fieldCount); } void makeValidCsv( struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Make sure a csv comma-separated values file looks all good */ { struct lineFile *lf = lineFileOpen(path, TRUE); // get fieldCount from first line int fieldCount = 0; int lineSize; char *line; int lineNumber = 0; while (lineFileNext(lf, &line, &lineSize)) { ++lineNumber; struct slName *list = csvParse(line); int thisCount = slCount(list); if (fieldCount == 0) { fieldCount = thisCount; } else { if (thisCount != fieldCount) errAbort("Line #%d of csv %s has %d columns. Previous rows had %d columns.", lineNumber, ef->submitFileName, thisCount, fieldCount); vf->itemCount += 1; } slFreeList(list); } lineFileClose(&lf); if (fieldCount == 0) errAbort("0 columns in csv %s", ef->submitFileName); } void makeValidHtml(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Fill in info about html file */ { struct htmlPage *page = htmlPageGet(path); htmlPageValidateOrAbort(page); vf->itemCount = slCount(page->tags); htmlPageFree(&page); } void makeValidGtf(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwAssembly *assembly, struct cdwValidFile *vf) /* Fill in info about a gtf file. */ { /* Open and read file with generic GFF reader and check it is GTF */ struct gffFile *gff = gffRead(path); if (!gff->isGtf) errAbort("file id %lld (%s) is not in GTF format - check it has gene_id and transcript_id", (long long)ef->id, ef->submitFileName); /* Convert it to a somewhat smaller less informative bed file for sampling purposes. */ char sampleFileName[PATH_LEN], temp[PATH_LEN]; safef(sampleFileName, PATH_LEN, "%scdwGffBedXXXXXX", cdwTempDirForToday(temp)); cdwReserveTempFile(sampleFileName); FILE *f = fopen(sampleFileName, "w"); struct genomeRangeTree *grt = genomeRangeTreeNew(); /* Loop through lines writing out simple bed and adding to genome range tree. */ struct gffLine *gffLine; long long itemCount = 0; long long totalSize = 0; for (gffLine = gff->lineList; gffLine != NULL; gffLine = gffLine->next) { totalSize += gffLine->end - gffLine->start; fprintf(f, "%s\t%ld\t%ld\n", gffLine->seq, gffLine->start, gffLine->end); genomeRangeTreeAdd(grt, gffLine->seq, gffLine->start, gffLine->end); ++itemCount; } carefulClose(&f); /* Fill out what we can of vf with info we've gathered. */ vf->itemCount = vf->sampleCount = itemCount; vf->basesInItems = vf->basesInSample = totalSize; vf->sampleBed = cloneString(sampleFileName); long long basesHitBySample = genomeRangeTreeSumRanges(grt); genomeRangeTreeFree(&grt); vf->sampleCoverage = (double)basesHitBySample/assembly->baseCount; vf->mapRatio = 1.0; vf->depth = (double)totalSize/assembly->baseCount; gffFileFree(&gff); } void makeValidRcc(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Fill in info about a nanostring RCC file. */ { cdwValidateRcc(path); } void makeValidIdat(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Fill in info about a illumina idac file. */ { cdwValidateIdat(path); } void makeValidPdf(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Check it is really pdf. */ { cdwValidatePdf(path); } void validateVcfGzTbi(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Given a path to a tabix on a vcf, validate it is tabix, and that the * vcf it refers to exists and has correct name */ { char dir[PATH_LEN], name[FILENAME_LEN], extension[FILEEXT_LEN]; splitPath(path, dir, name, extension); char vcfPath[PATH_LEN]; safef(vcfPath, sizeof(vcfPath), "%s%s%s", dir, name, extension); if (!fileExists(vcfPath)) { /* Look for it under cdwPathName */ if (!cdwFindInSameSubmitDir(conn, ef, vcfPath)) errAbort("%s, the original of %s doesn't exist", vcfPath, path); } } void makeValidCustomTrack(struct sqlConnection *conn, char *path, struct cdwFile *ef, struct cdwValidFile *vf) /* Fill in some info about a BED file of no particular sub-format. This is allowed to have * browser and track lines in it, which are ignored. */ { struct lineFile *lf = lineFileOpen(path, TRUE); char *line; char *row[256]; int bedSize = 0; while (lineFileNextReal(lf, &line)) { if (startsWithWord("browser", line) || startsWithWord("track", line)) continue; int wordCount = chopLine(line, row); if (bedSize == 0) { bedSize = wordCount; if (bedSize < 3) { lineFileExpectAtLeast(lf, 3, bedSize); } } else { if (bedSize != wordCount) { errAbort("Some lines of %s have %d words, but line %d of has %d words", lf->fileName, bedSize, lf->lineIx, wordCount); } } int start = lineFileNeedNum(lf, row, 1); int end = lineFileNeedNum(lf, row, 2); if (end < start) errAbort("end before start line %d of %s", lf->lineIx, lf->fileName); ++vf->itemCount; vf->basesInItems += (end - start); } lineFileClose(&lf); } static void needAssembly(struct cdwFile *ef, char *format, struct cdwAssembly *assembly) /* Require assembly tag be present. */ { if (assembly == NULL) errAbort("file id %lld (%s) is %s format and needs an assembly tag to validate", (long long)ef->id, ef->submitFileName, format); } void mustMakeValidFile(struct sqlConnection *conn, struct cdwFile *ef, struct cgiParsedVars *tags, long long oldValidId) /* If possible make a cdwValidFile record for this. Makes sure all the right tags are there, * and then parses file enough to determine itemCount and the like. For some files, like fastqs, * it will take a subset of the file as a sample so can do QA without processing the whole thing. */ { /* Make up validFile from tags and id */ struct cdwValidFile *vf; AllocVar(vf); vf->fileId = ef->id; cdwValidFileFieldsFromTags(vf, tags); vf->sampleBed = ""; if (oldValidId == 0) // moving up here to mitigate race condition { cdwValidFileSaveToDb(conn, vf, "cdwValidFile", 512); vf->id = sqlLastAutoId(conn); } else { vf->id = oldValidId; } if (vf->format) // We only can validate if we have something for format { /* Look up assembly. */ struct cdwAssembly *assembly = NULL; if (!isEmpty(vf->ucscDb) && !sameString(vf->ucscDb, "unknown")) { char *ucscDb = vf->ucscDb; char query[256]; sqlSafef(query, sizeof(query), "select * from cdwAssembly where ucscDb='%s'", vf->ucscDb); assembly = cdwAssemblyLoadByQuery(conn, query); if (assembly == NULL) errAbort("Couldn't find assembly corresponding to %s", ucscDb); } /* Make path to file */ char path[PATH_LEN]; safef(path, sizeof(path), "%s%s", cdwRootDir, ef->cdwFileName); /* And dispatch according to format. */ char *format = vf->format; char *suffix = cdwFindDoubleFileSuffix(path); char suffixBuf[128]; char *bedPrefix = "bed_"; int bedPrefixSize = 4; if (sameString(format, "fastq")) { char *assay = cdwLookupTag(tags, "assay"); needAssembly(ef, format, assembly); makeValidFastq(conn, path, ef, assembly, vf, assay); suffix = ".fastq.gz"; } else if (cdwIsSupportedBigBedFormat(format)) { needAssembly(ef, format, assembly); makeValidBigBed(conn, path, ef, assembly, format, vf); if (sameString(format, "bigBed")) suffix = ".bigBed"; else { safef(suffixBuf, sizeof(suffixBuf), ".%s.bigBed", format); suffix = suffixBuf; } } else if (startsWith(bedPrefix, format) && cdwIsSupportedBigBedFormat(format+bedPrefixSize)) { char *formatNoBed = format + bedPrefixSize; needAssembly(ef, format, assembly); makeValidBed(conn, path, ef, assembly, format, formatNoBed, vf); safef(suffixBuf, sizeof(suffixBuf), ".%s.bed", format); suffix = suffixBuf; } else if (sameString(format, "bigWig")) { needAssembly(ef, format, assembly); makeValidBigWig(conn, path, ef, assembly, vf); suffix = ".bigWig"; } else if (sameString(format, "bam")) { needAssembly(ef, format, assembly); makeValidBam(conn, path, ef, assembly, vf); suffix = ".bam"; } else if (sameString(format, "bam.bai")) { cdwValidateBamIndex(path); suffix = ".bam.bai"; } else if (sameString(format, "vcf.idx")) { cdwValidateTabixIndex(path); suffix = ".vcf.idx"; } else if (sameString(format, "2bit")) { makeValid2Bit(conn, path, ef, vf); suffix = ".2bit"; } else if (sameString(format, "fasta")) { makeValidFasta(conn, path, ef, vf); suffix = ".fasta.gz"; } else if (sameString(format, "gtf")) { needAssembly(ef, format, assembly); makeValidGtf(conn, path, ef, assembly, vf); suffix = ".gtf.gz"; } else if (sameString(format, "rcc")) { makeValidRcc(conn, path, ef, vf); suffix = ".RCC"; } else if (sameString(format, "idat")) { makeValidIdat(conn, path, ef, vf); suffix = ".idat"; } else if (sameString(format, "customTrack")) { makeValidCustomTrack(conn, path, ef, vf); assert(endsWith(ef->submitFileName, ".gz")); suffix = cdwFindDoubleFileSuffix(ef->submitFileName); } else if (sameString(format, "pdf")) { makeValidPdf(conn, path, ef, vf); suffix = ".pdf"; } else if (sameString(format, "vcf")) { needAssembly(ef, format, assembly); makeValidVcf(conn, path, ef, assembly, vf); if (endsWith(ef->submitFileName, ".gz")) suffix = ".vcf.gz"; else suffix = ".vcf"; } else if (sameString(format, "vcf.gz.tbi")) { validateVcfGzTbi(conn, path, ef, vf); suffix = ".vcf.gz.tbi"; } else if (sameString(format, "cram")) { cdwValidateCram(path); suffix = ".cram"; } else if (sameString(format, "jpg")) { cdwValidateJpg(path); suffix = ".jpg"; } else if (sameString(format, "png")) { cdwValidatePng(path); suffix = ".png"; } else if (sameString(format, "expression_matrix")) { makeValidText(conn, path, ef, vf); } else if (sameString(format, "text")) { makeValidText(conn, path, ef, vf); } else if (sameString(format, "kallisto_abundance")) { makeValidKallistoAbundance(conn, path, ef, vf); } else if (sameString(format, "tsv")) { makeValidTsv(conn, path, ef, vf); } else if (sameString(format, "csv")) { makeValidCsv(conn, path, ef, vf); } else if (sameString(format, "html")) { makeValidHtml(conn, path, ef, vf); } + else if (sameString(format, "tiff")) + { + /* No specific validation needed for raw format. */ + } + else if (sameString(format, "avi")) + { + /* No specific validation needed for xls format. */ + } else if (sameString(format, "raw")) { /* No specific validation needed for raw format. */ } else if (sameString(format, "xls")) { /* No specific validation needed for xls format. */ } else if (sameString(format, "unknown")) { /* No specific validation needed for unknown format. */ } else { errAbort("Unrecognized format %s for %s\n", format, ef->cdwFileName); } /* Save record except for license plate to DB. */ cdwValidFileUpdateDb(conn, vf, vf->id); /* Create license plate around our ID. File in warehouse to use license plate * instead of baby-babble IDs. */ cdwMakeLicensePlate( cdwLicensePlateHead(conn), vf->id, vf->licensePlate, cdwMaxPlateSize); /* Create swapped out version of cdwFileName in newName. */ struct dyString *newName = dyStringNew(0); char *fileName = ef->cdwFileName; char *dirEnd = strrchr(fileName, '/'); if (dirEnd == NULL) dirEnd = fileName; else dirEnd += 1; dyStringAppendN(newName, fileName, dirEnd - fileName); dyStringAppend(newName, vf->licensePlate); dyStringAppend(newName, suffix); /* Now build full path names and attempt rename in file system. */ char oldPath[PATH_LEN], newPath[PATH_LEN]; safef(oldPath, sizeof(oldPath), "%s%s", cdwRootDir, fileName); safef(newPath, sizeof(newPath), "%s%s", cdwRootDir, newName->string); char query[PATH_LEN+256]; // rename symlink to it in submitDir sqlSafef(query, sizeof(query), "select url from cdwSubmitDir where id='%d'", ef->submitDirId); char *submitDir = sqlQuickString(conn, query); if (!submitDir) errAbort("submitDir not found for id %d", ef->submitDirId); boolean metaException = sameString(ef->submitFileName, "meta.txt"); // if meta.txt is being expliictly submitted, this is probably for dataset page meta links. // this makes it an odd exception, because unlike normally submitted files, // this will NOT be converted to a symlink pointing to the cdw/ dir file // during the original cdwSubmit. char *lastPath = NULL; if (!metaException) { lastPath = findSubmitSymlink(ef->submitFileName, submitDir, oldPath); freeMem(submitDir); if (!lastPath) noWarnAbort(); verbose(3, "lastPath=%s newPath=%s\n", lastPath, newPath); if (unlink(lastPath) == -1) // drop about to be invalid symlink errnoAbort("unlink failure %s", lastPath); } mustRename(oldPath, newPath); if (!metaException) { if (symlink(newPath, lastPath) == -1) // replace with symlink errnoAbort("symlink failure from %s to %s", lastPath, newPath); freeMem(lastPath); } verbose(2, "Renamed %s to %s\n", oldPath, newPath); /* Update database with new name */ sqlSafef(query, sizeof(query), "update cdwFile set cdwFileName='%s' where id=%lld", newName->string, (long long)ef->id); sqlUpdate(conn, query); dyStringFree(&newName); /* Update validFile record with license plate. */ sqlSafef(query, sizeof(query), "update cdwValidFile set licensePlate='%s' where id=%lld", vf->licensePlate, (long long)vf->id); sqlUpdate(conn, query); } freez(&vf); } boolean makeValidFile(struct sqlConnection *conn, struct cdwFile *ef, struct cgiParsedVars *tags, long long oldValidId) /* Attempt to make validation. If it fails catch error and attach it to ef->errorMessage as well * as sending it to stderr, and return FALSE. Otherwise return TRUE. */ { struct errCatch *errCatch = errCatchNew(); boolean success = TRUE; if (errCatchStart(errCatch)) { mustMakeValidFile(conn, ef, tags, oldValidId); } errCatchEnd(errCatch); if (errCatch->gotError) { cdwWriteErrToStderrAndTable(conn, "cdwFile", ef->id, errCatch->message->string); warn("This is from submitted file %s", ef->submitFileName); success = FALSE; } else { warn("%s", errCatch->message->string); // Make status output legible char query[256]; sqlSafef(query, sizeof(query), "update cdwFile set errorMessage='' where id=%lld", (long long)ef->id); sqlUpdate(conn, query); } errCatchFree(&errCatch); return success; } void cdwClearFileError(struct sqlConnection *conn, long long fileId) /* Clear file error message */ { char query[256]; sqlSafef(query, sizeof(query), "update cdwFile set errorMessage='' where id=%lld", fileId); sqlUpdate(conn, query); } void cdwMakeValidFile(int startId, int endId) /* cdwMakeValidFile - Add range of ids to valid file table.. */ { /* Make list with all files in ID range - don't want to use cdwFileAllIntactInRange because * we may be revalidating files that do have errors. */ struct sqlConnection *conn = sqlConnect(cdwDatabase); char query[512]; sqlSafef(query, sizeof(query), "select * from cdwFile where id>=%d and id<=%d and endUploadTime != 0 " "and updateTime != 0", startId, endId); struct cdwFile *ef, *efList = cdwFileLoadByQuery(conn, query); if (efList == NULL) errAbort("No files in %d to %d", startId, endId); for (ef = efList; ef != NULL; ef = ef->next) { char query[256]; sqlSafef(query, sizeof(query), "select id from cdwValidFile where fileId=%lld", (long long)ef->id); long long vfId = sqlQuickLongLong(conn, query); if (vfId != 0 && isEmpty(ef->errorMessage) && !redo) { verbose(2, "already validated %s %s\n", ef->cdwFileName, ef->submitFileName); } else { verbose(1, "processing %lld %s %s\n", (long long)ef->id, ef->cdwFileName, ef->submitFileName); char path[PATH_LEN]; safef(path, sizeof(path), "%s%s", cdwRootDir, ef->cdwFileName); if (!isEmpty(ef->tags)) // All ones we care about have tags { if (vfId != 0) cdwClearFileError(conn, ef->id); struct cgiParsedVars *tags = cdwMetaVarsList(conn, ef); if (!makeValidFile(conn, ef, tags, vfId)) { if (++errCount >= maxErrCount) errAbort("Aborting after %d errors", errCount); } cgiParsedVarsFreeList(&tags); } else { verbose(2, "no tags to validate on %s %s\n", ef->cdwFileName, ef->submitFileName); } } } sqlDisconnect(&conn); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); maxErrCount = optionInt("maxErrCount", maxErrCount); redo = optionExists("redo"); cdwMakeValidFile(sqlUnsigned(argv[1]), sqlUnsigned(argv[2])); return 0; }