4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/hg/cirm/cdw/cdwMakeEnrichments/cdwMakeEnrichments.c src/hg/cirm/cdw/cdwMakeEnrichments/cdwMakeEnrichments.c index 6c2a48c..ac0e57b 100644 --- src/hg/cirm/cdw/cdwMakeEnrichments/cdwMakeEnrichments.c +++ src/hg/cirm/cdw/cdwMakeEnrichments/cdwMakeEnrichments.c @@ -1,556 +1,556 @@ /* cdwMakeEnrichments - Scan through database and make a makefile to calc. enrichments and * store in database. Note to compare with featureBits enrichments use -countGaps * in featureBits */ /* Copyright (C) 2014 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "localmem.h" #include "hash.h" #include "basicBed.h" #include "options.h" #include "jksql.h" #include "cheapcgi.h" #include "bits.h" #include "portable.h" #include "localmem.h" #include "bbiFile.h" #include "bigBed.h" #include "bigWig.h" #include "genomeRangeTree.h" #include "cdw.h" #include "cdwLib.h" void usage() /* Explain usage and exit. */ { errAbort( "cdwMakeEnrichments - Scan through database and make a makefile to calc. enrichments and \n" "store in database. Enrichments similar to 'featureBits -countGaps' enrichments.\n" "usage:\n" " cdwMakeEnrichments startFileId endFileId\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; struct target /* Information about a target */ { struct target *next; struct cdwQaEnrichTarget *target; /* The database target structure */ struct genomeRangeTree *grt; /* Random access intersection structure for target */ double overlapBases; /* Sum of all overlaps with target. */ long long uniqOverlapBases; /* Sum of unique overlaps with target. */ boolean skip; /* If true skip calcs on this one. */ }; struct target *targetNew(struct cdwQaEnrichTarget *et, struct genomeRangeTree *grt) /* Make a little local target object */ { struct target *target; AllocVar(target); target->target = et; target->grt = grt; return target; } struct cdwQaEnrich *enrichFromOverlaps(struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *target, long long overlapBases, long long uniqOverlapBases) /* Build up enrichment structure based on overlaps between file and target */ { struct cdwQaEnrich *enrich; AllocVar(enrich); long long targetSize = target->target->targetSize; enrich->fileId = ef->id; enrich->qaEnrichTargetId = target->target->id; enrich->targetBaseHits = overlapBases; enrich->targetUniqHits = uniqOverlapBases; enrich->coverage = (double)uniqOverlapBases/targetSize; if (vf->sampleCount > 0) { double sampleSizeFactor = (double)vf->itemCount /vf->sampleCount; double sampleTargetDepth = (double)overlapBases/targetSize; if (vf->depth > 0) { enrich->enrichment = sampleSizeFactor * sampleTargetDepth / vf->depth; if (vf->sampleCoverage > 0) enrich->uniqEnrich = enrich->coverage / vf->sampleCoverage; } verbose(2, "%s size %lld (%0.3f%%), %s (%0.3f%%), overlap %lld (%0.3f%%)\n", target->target->name, targetSize, 100.0*targetSize/assembly->baseCount, ef->cdwFileName, 100.0*vf->sampleCoverage, uniqOverlapBases, 100.0*uniqOverlapBases/assembly->baseCount); } return enrich; } boolean enrichmentExists(struct sqlConnection *conn, struct cdwFile *ef, struct cdwQaEnrichTarget *target) /* Return TRUE if already is a an enrichment in database for this combination. */ { char query[256]; sqlSafef(query, sizeof(query), "select count(*) from cdwQaEnrich" " where fileId=%lld and qaEnrichTargetId=%d", (long long)ef->id, target->id); return sqlQuickNum(conn, query) != 0; } void doEnrichmentsFromBed3Sample(struct bed3 *sampleList, struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Given a bed3 list, calculate enrichments for targets */ { struct genomeRangeTree *sampleGrt = cdwMakeGrtFromBed3List(sampleList); struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash); /* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; long long uniqOverlapBases = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { struct rbTree *sampleTree = chrom->val; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(sampleTree); for (range = rangeList; range != NULL; range = range->next) { /* Do unique base overlap counts (since using range trees both sides) */ int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end); uniqOverlapBases += overlap; } } } /* Figure out how much we overlap allowing same bases in genome * to part of more than one overlap. */ long long overlapBases = 0; struct bed3 *sample; for (sample = sampleList; sample != NULL; sample = sample->next) { int overlap = genomeRangeTreeOverlapSize(grt, sample->chrom, sample->chromStart, sample->chromEnd); overlapBases += overlap; } /* Save to database. */ struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, overlapBases, uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } genomeRangeTreeFree(&sampleGrt); hashElFreeList(&chromList); } void doEnrichmentsFromSampleBed(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from sample bed file. */ { char *sampleBed = vf->sampleBed; if (isEmpty(sampleBed)) { warn("No sample bed for %s", ef->cdwFileName); return; } /* Load sample bed, make a range tree to track unique coverage, and get list of all chroms .*/ struct bed3 *sampleList = bed3LoadAll(sampleBed); if (sampleList == NULL) { warn("Sample bed is empty for %s", ef->cdwFileName); return; } doEnrichmentsFromBed3Sample(sampleList, conn, ef, vf, assembly, targetList); bed3FreeList(&sampleList); } void doEnrichmentsFromBed(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bed file. */ { char *bedPath = cdwPathForFileId(conn, ef->id); struct bed3 *sampleList = bed3LoadAll(bedPath); doEnrichmentsFromBed3Sample(sampleList, conn, ef, vf, assembly, targetList); bed3FreeList(&sampleList); freez(&bedPath); } void doEnrichmentsFromBigBed(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigBedPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigBedFileOpen(bigBedPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigBed data in memory. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); struct bigBedInterval *iv; struct rbTree *bbTree = rangeTreeNew(); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(bbTree, iv->start, iv->end); struct range *bbRange, *bbRangeList = rangeTreeList(bbTree); /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct bigBedInterval *iv; for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->overlapBases += overlap; } for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next) { int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end); target->uniqOverlapBases += overlap; } } } rangeTreeFree(&bbTree); lmCleanup(&lm); } /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigBedFileClose(&bbi); freez(&bigBedPath); } #ifdef OLDWAY /* This old way is ~3 times as slow */ void doEnrichmentsFromBigWig(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigWigPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigWigFileOpen(bigWigPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* This takes a while, so let's figure out what parts take the time. */ long totalBigQueryTime = 0; long totalOverlapTime = 0; /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigWig data in memory. Also just for performance we do a lookup of target range tree to * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigWig for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); long startBigQueryTime = clock1000(); struct bbiInterval *ivList = bigWigIntervalQuery(bbi, chrom->name, 0, chrom->size, lm); long endBigQueryTime = clock1000(); totalBigQueryTime += endBigQueryTime - startBigQueryTime; struct bbiInterval *iv; /* Loop through all targets adding overlaps from ivList */ long startOverlapTime = clock1000(); struct target *target; for (target = targetList; target != NULL; target = target->next) { struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->uniqOverlapBases += overlap; target->overlapBases += overlap * iv->val; } } } long endOverlapTime = clock1000(); totalOverlapTime += endOverlapTime - startOverlapTime; lmCleanup(&lm); } verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime); /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigWigFileClose(&bbi); freez(&bigWigPath); } #endif /* OLDWAY */ void doEnrichmentsFromBigWig(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigWigPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigWigFileOpen(bigWigPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); struct bigWigValsOnChrom *valsOnChrom = bigWigValsOnChromNew(); /* This takes a while, so let's figure out what parts take the time. */ long totalBigQueryTime = 0; long totalOverlapTime = 0; /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigWig data in memory. Also just for performance we do a lookup of target range tree to * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { long startBigQueryTime = clock1000(); boolean gotData = bigWigValsOnChromFetchData(valsOnChrom, chrom->name, bbi); long endBigQueryTime = clock1000(); totalBigQueryTime += endBigQueryTime - startBigQueryTime; if (gotData) { double *valBuf = valsOnChrom->valBuf; Bits *covBuf = valsOnChrom->covBuf; /* Loop through all targets adding overlaps from ivList */ long startOverlapTime = clock1000(); struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(targetTree); for (range = rangeList; range != NULL; range = range->next) { int s = range->start, e = range->end, i; for (i=s; i<=e; ++i) { if (bitReadOne(covBuf, i)) { double x = valBuf[i]; target->uniqOverlapBases += 1; target->overlapBases += x; } } } } } long endOverlapTime = clock1000(); totalOverlapTime += endOverlapTime - startOverlapTime; } } verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime); /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bigWigValsOnChromFree(&valsOnChrom); bbiChromInfoFreeList(&chromList); bigWigFileClose(&bbi); freez(&bigWigPath); } struct target *targetsForAssembly(struct sqlConnection *conn, struct cdwAssembly *assembly) /* Get list of enrichment targets for given assembly */ { char query[128]; sqlSafef(query, sizeof(query), "select * from cdwQaEnrichTarget where assemblyId=%d", assembly->id); struct cdwQaEnrichTarget *et, *etList = cdwQaEnrichTargetLoadByQuery(conn, query); /* Wrap a new structure around the enrichment targets where we'll store summary info. */ struct target *target, *targetList = NULL, **targetTail = &targetList; for (et = etList; et != NULL; et = et->next) { char *targetBed = cdwPathForFileId(conn, et->fileId); struct genomeRangeTree *grt = cdwGrtFromBigBed(targetBed); target = targetNew(et, grt); *targetTail = target; targetTail = &target->next; freez(&targetBed); } return targetList; } char *ignoredFormats[] = { "idat", "customTrack", "rcc", "kallisto_abundance", "expression_matrix", "bam.bai", "vcf.gz.tbi", "text", "html", "csv", "tsv", "pdf", "png", "unknown", }; void doEnrichments(struct sqlConnection *conn, struct cdwFile *ef, char *path, struct hash *assemblyToTarget) /* Calculate enrichments on for all targets file. The targetList and the * grtList are in the same order. */ { /* Get validFile from database. */ struct cdwValidFile *vf = cdwValidFileFromFileId(conn, ef->id); if (vf == NULL) return; /* We can only work if have validFile table entry */ if (!isEmpty(vf->enrichedIn) && !sameWord(vf->ucscDb, "unknown") && !isEmpty(vf->ucscDb) && !sameWord(vf->format, "unknown")) { /* Get our assembly */ char *format = vf->format; char *ucscDb = vf->ucscDb; char *targetName = cdwSimpleAssemblyName(ucscDb); struct cdwAssembly *assembly = cdwAssemblyForUcscDb(conn, targetName); struct target *targetList = hashFindVal(assemblyToTarget, assembly->name); if (targetList == NULL) { targetList = targetsForAssembly(conn, assembly); if (targetList == NULL) errAbort("No targets for assembly %s", assembly->name); hashAdd(assemblyToTarget, assembly->name, targetList); } /* Loop through targetList zeroing out existing ovelaps. */ struct target *target; boolean allSkip = TRUE; for (target = targetList; target != NULL; target = target->next) { target->overlapBases = target->uniqOverlapBases = 0; target->skip = enrichmentExists(conn, ef, target->target); if (!target->skip) allSkip = FALSE; } /* Do a big dispatch based on format. * Current formats; fastq, bigWig, bed, bigBed, gtf, gff, bam, vcf, idat, curstomTrack, rcc. * formats cnt; kallisto_abundance, bam.bai, vcf.gz.tbi, expression_matrix, unknown. */ if (!allSkip) { if (sameString(format, "fastq")) doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList); else if (sameString(format, "bigWig")) doEnrichmentsFromBigWig(conn, ef, vf, assembly, targetList); else if (startsWith("bed_", format)) doEnrichmentsFromBed(conn, ef, vf, assembly, targetList); else if (cdwIsSupportedBigBedFormat(format) || sameString(format, "bigBed")) doEnrichmentsFromBigBed(conn, ef, vf, assembly, targetList); else if (sameString(format, "gtf")) doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList); else if (sameString(format, "gff")) doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList); else if (sameString(format, "bam")) doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList); else if (sameString(format, "vcf")) doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList); else if (stringIx(format, ignoredFormats) >= 0) verbose(2, "Ignoring %s %s in doEnrichments, that's ok\n", format, ef->cdwFileName); else errAbort("Unrecognized format %s in doEnrichments(%s)", format, path); } /* Clean up and go home. */ cdwAssemblyFree(&assembly); } cdwValidFileFree(&vf); } void cdwMakeEnrichments(int startFileId, int endFileId) /* cdwMakeEnrichments - Scan through database and make a makefile to calc. enrichments and store * in database. */ { /* Make list with all files in ID range */ struct sqlConnection *conn = sqlConnect(cdwDatabase); struct cdwFile *ef, *efList = cdwFileAllIntactBetween(conn, startFileId, endFileId); /* Make up a hash for targets keyed by assembly name. */ struct hash *assemblyToTarget = hashNew(0); for (ef = efList; ef != NULL; ef = ef->next) { char path[PATH_LEN]; safef(path, sizeof(path), "%s%s", cdwRootDir, ef->cdwFileName); verbose(1, "%lld processing %s aka %s\n", (long long)ef->id, ef->submitFileName, path); if (ef->tags) // All ones we care about have tags doEnrichments(conn, ef, path, assemblyToTarget); } sqlDisconnect(&conn); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); cdwMakeEnrichments(sqlUnsigned(argv[1]), sqlUnsigned(argv[2])); return 0; }