f14c003e98375586810203f2b6250aae02c064ff
galt
  Wed Aug 4 15:22:47 2021 -0700
Adding png to the cdwEnrichments ignore list as requested.

diff --git src/hg/cirm/cdw/cdwMakeEnrichments/cdwMakeEnrichments.c src/hg/cirm/cdw/cdwMakeEnrichments/cdwMakeEnrichments.c
index 3e4d11c..6c2a48c 100644
--- src/hg/cirm/cdw/cdwMakeEnrichments/cdwMakeEnrichments.c
+++ src/hg/cirm/cdw/cdwMakeEnrichments/cdwMakeEnrichments.c
@@ -1,555 +1,556 @@
 /* cdwMakeEnrichments - Scan through database and make a makefile to calc. enrichments and 
  * store in database.  Note to compare with featureBits enrichments use -countGaps 
  * in featureBits */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
 #include "linefile.h"
 #include "localmem.h"
 #include "hash.h"
 #include "basicBed.h"
 #include "options.h"
 #include "jksql.h"
 #include "cheapcgi.h"
 #include "bits.h"
 #include "portable.h"
 #include "localmem.h"
 #include "bbiFile.h"
 #include "bigBed.h"
 #include "bigWig.h"
 #include "genomeRangeTree.h"
 #include "cdw.h"
 #include "cdwLib.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "cdwMakeEnrichments - Scan through database and make a makefile to calc. enrichments and \n"
   "store in database. Enrichments similar to 'featureBits -countGaps' enrichments.\n"
   "usage:\n"
   "   cdwMakeEnrichments startFileId endFileId\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 struct target
 /* Information about a target */
     {
     struct target *next;
     struct cdwQaEnrichTarget *target;  /* The database target structure */
     struct genomeRangeTree *grt;       /* Random access intersection structure for target */
     double overlapBases;	       /* Sum of all overlaps with target. */
     long long uniqOverlapBases;	       /* Sum of unique overlaps with target. */
     boolean skip;		       /* If true skip calcs on this one. */
     };
 
 struct target *targetNew(struct cdwQaEnrichTarget *et, struct genomeRangeTree *grt)
 /* Make a little local target object */
 {
 struct target *target;
 AllocVar(target);
 target->target = et;
 target->grt = grt;
 return target;
 }
 
 struct cdwQaEnrich *enrichFromOverlaps(struct cdwFile *ef, struct cdwValidFile *vf,
 	struct cdwAssembly *assembly, struct target *target, 
 	long long overlapBases, long long uniqOverlapBases)
 /* Build up enrichment structure based on overlaps between file and target */
 {
 struct cdwQaEnrich *enrich;
 AllocVar(enrich);
 long long targetSize = target->target->targetSize;
 enrich->fileId = ef->id;
 enrich->qaEnrichTargetId = target->target->id;
 enrich->targetBaseHits = overlapBases;
 enrich->targetUniqHits = uniqOverlapBases;
 enrich->coverage = (double)uniqOverlapBases/targetSize;
 if (vf->sampleCount > 0)
     {
     double sampleSizeFactor = (double)vf->itemCount /vf->sampleCount;
     double sampleTargetDepth = (double)overlapBases/targetSize;
     if (vf->depth > 0)
 	{
 	enrich->enrichment = sampleSizeFactor * sampleTargetDepth / vf->depth;
 	if (vf->sampleCoverage > 0)
 	    enrich->uniqEnrich = enrich->coverage / vf->sampleCoverage;
 	}
     verbose(2, "%s size %lld (%0.3f%%), %s (%0.3f%%), overlap %lld (%0.3f%%)\n", 
 	target->target->name, targetSize, 100.0*targetSize/assembly->baseCount, 
 	ef->cdwFileName, 100.0*vf->sampleCoverage, 
 	uniqOverlapBases, 100.0*uniqOverlapBases/assembly->baseCount);
     }
 return enrich;
 }
 
 boolean enrichmentExists(struct sqlConnection *conn, struct cdwFile *ef, 
     struct cdwQaEnrichTarget *target)
 /* Return TRUE if already is a an enrichment in database for this combination. */
 {
 char query[256];
 sqlSafef(query, sizeof(query), 
     "select count(*) from cdwQaEnrich"
     " where fileId=%lld and qaEnrichTargetId=%d", (long long)ef->id, target->id);
 return sqlQuickNum(conn, query) != 0;
 }
 
 void doEnrichmentsFromBed3Sample(struct bed3 *sampleList,
     struct sqlConnection *conn,
     struct cdwFile *ef, struct cdwValidFile *vf, 
     struct cdwAssembly *assembly, struct target *targetList)
 /* Given a bed3 list,  calculate enrichments for targets */
 {
 struct genomeRangeTree *sampleGrt = cdwMakeGrtFromBed3List(sampleList);
 struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash);
 
 /* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */
 struct target *target;
 for (target = targetList; target != NULL; target = target->next)
     {
     if (target->skip)
         continue;
     struct genomeRangeTree *grt = target->grt;
     long long uniqOverlapBases = 0;
     for (chrom = chromList; chrom != NULL; chrom = chrom->next)
         {
 	struct rbTree *sampleTree = chrom->val;
 	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
 	if (targetTree != NULL)
 	    {
 	    struct range *range, *rangeList = rangeTreeList(sampleTree);
 	    for (range = rangeList; range != NULL; range = range->next)
 		{
 		/* Do unique base overlap counts (since using range trees both sides) */
 		int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end);
 		uniqOverlapBases += overlap;
 		}
 	    }
 	}
 
     /* Figure out how much we overlap allowing same bases in genome
      * to part of more than one overlap. */ 
     long long overlapBases = 0;
     struct bed3 *sample;
     for (sample = sampleList; sample != NULL; sample = sample->next)
         {
 	int overlap = genomeRangeTreeOverlapSize(grt, 
 	    sample->chrom, sample->chromStart, sample->chromEnd);
 	overlapBases += overlap;
 	}
 
     /* Save to database. */
     struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly,
 	target, overlapBases, uniqOverlapBases);
     cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
     cdwQaEnrichFree(&enrich);
     }
 genomeRangeTreeFree(&sampleGrt);
 hashElFreeList(&chromList);
 }
 
 void doEnrichmentsFromSampleBed(struct sqlConnection *conn, 
     struct cdwFile *ef, struct cdwValidFile *vf, 
     struct cdwAssembly *assembly, struct target *targetList)
 /* Figure out enrichments from sample bed file. */
 {
 char *sampleBed = vf->sampleBed;
 if (isEmpty(sampleBed))
     {
     warn("No sample bed for %s", ef->cdwFileName);
     return;
     }
 /* Load sample bed, make a range tree to track unique coverage, and get list of all chroms .*/
 struct bed3 *sampleList = bed3LoadAll(sampleBed);
 if (sampleList == NULL)
     {
     warn("Sample bed is empty for %s", ef->cdwFileName);
     return;
     }
 doEnrichmentsFromBed3Sample(sampleList, conn, ef, vf, assembly, targetList);
 bed3FreeList(&sampleList);
 }
 
 void doEnrichmentsFromBed(struct sqlConnection *conn,
     struct cdwFile *ef, struct cdwValidFile *vf, 
     struct cdwAssembly *assembly, struct target *targetList)
 /* Figure out enrichments from a bed file. */
 {
 char *bedPath = cdwPathForFileId(conn, ef->id);
 struct bed3 *sampleList = bed3LoadAll(bedPath);
 doEnrichmentsFromBed3Sample(sampleList, conn, ef, vf, assembly, targetList);
 bed3FreeList(&sampleList);
 freez(&bedPath);
 }
 
 void doEnrichmentsFromBigBed(struct sqlConnection *conn, 
     struct cdwFile *ef, struct cdwValidFile *vf, 
     struct cdwAssembly *assembly, struct target *targetList)
 /* Figure out enrichments from a bigBed file. */
 {
 /* Get path to bigBed, open it, and read all chromosomes. */
 char *bigBedPath = cdwPathForFileId(conn, ef->id);
 struct bbiFile *bbi = bigBedFileOpen(bigBedPath);
 struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);
 
 /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
  * for all targets.  This is complicated by just wanting to keep one chromosome worth of
  * bigBed data in memory. */
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     {
     /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */
     struct lm *lm = lmInit(0);
     struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm);
     struct bigBedInterval *iv;
     struct rbTree *bbTree = rangeTreeNew();
     for (iv = ivList; iv != NULL; iv = iv->next)
 	 rangeTreeAdd(bbTree, iv->start, iv->end);
     struct range *bbRange, *bbRangeList = rangeTreeList(bbTree);
 
     /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */
     struct target *target;
     for (target = targetList; target != NULL; target = target->next)
         {
 	if (target->skip)
 	    continue;
 	struct genomeRangeTree *grt = target->grt;
 	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
 	if (targetTree != NULL)
 	    {
 	    struct bigBedInterval *iv;
 	    for (iv = ivList; iv != NULL; iv = iv->next)
 		{
 		int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end);
 		target->overlapBases += overlap;
 		}
 	    for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next)
 		{
 		int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end);
 		target->uniqOverlapBases += overlap;
 		}
 	    }
 	}
     rangeTreeFree(&bbTree);
     lmCleanup(&lm);
     }
 
 /* Now loop through targets and save enrichment info to database */
 struct target *target;
 for (target = targetList; target != NULL; target = target->next)
     {
     if (target->skip)
 	continue;
     struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
 	target->overlapBases, target->uniqOverlapBases);
     cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
     cdwQaEnrichFree(&enrich);
     }
 
 bbiChromInfoFreeList(&chromList);
 bigBedFileClose(&bbi);
 freez(&bigBedPath);
 }
 
 #ifdef OLDWAY
 /* This old way is ~3 times as slow */
 void doEnrichmentsFromBigWig(struct sqlConnection *conn, 
     struct cdwFile *ef, struct cdwValidFile *vf, 
     struct cdwAssembly *assembly, struct target *targetList)
 /* Figure out enrichments from a bigBed file. */
 {
 /* Get path to bigBed, open it, and read all chromosomes. */
 char *bigWigPath = cdwPathForFileId(conn, ef->id);
 struct bbiFile *bbi = bigWigFileOpen(bigWigPath);
 struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);
 
 /* This takes a while, so let's figure out what parts take the time. */
 long totalBigQueryTime = 0;
 long totalOverlapTime = 0;
 
 /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
  * for all targets.  This is complicated by just wanting to keep one chromosome worth of
  * bigWig data in memory. Also just for performance we do a lookup of target range tree to
  * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     {
     /* Get list of intervals in bigWig for this chromosome, and feed it to a rangeTree. */
     struct lm *lm = lmInit(0);
     long startBigQueryTime = clock1000();
     struct bbiInterval *ivList = bigWigIntervalQuery(bbi, chrom->name, 0, chrom->size, lm);
     long endBigQueryTime = clock1000();
     totalBigQueryTime += endBigQueryTime - startBigQueryTime;
     struct bbiInterval *iv;
 
     /* Loop through all targets adding overlaps from ivList */
     long startOverlapTime = clock1000();
     struct target *target;
     for (target = targetList; target != NULL; target = target->next)
         {
 	struct genomeRangeTree *grt = target->grt;
 	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
 	if (targetTree != NULL)
 	    {
 	    for (iv = ivList; iv != NULL; iv = iv->next)
 		{
 		int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end);
 		target->uniqOverlapBases += overlap;
 		target->overlapBases += overlap * iv->val;
 		}
 	    }
 	}
     long endOverlapTime = clock1000();
     totalOverlapTime += endOverlapTime - startOverlapTime;
     lmCleanup(&lm);
     }
 
 verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime);
 
 /* Now loop through targets and save enrichment info to database */
 struct target *target;
 for (target = targetList; target != NULL; target = target->next)
     {
     struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
 	target->overlapBases, target->uniqOverlapBases);
     cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
     cdwQaEnrichFree(&enrich);
     }
 
 bbiChromInfoFreeList(&chromList);
 bigWigFileClose(&bbi);
 freez(&bigWigPath);
 }
 #endif /* OLDWAY */
 
 
 void doEnrichmentsFromBigWig(struct sqlConnection *conn, 
     struct cdwFile *ef, struct cdwValidFile *vf, 
     struct cdwAssembly *assembly, struct target *targetList)
 /* Figure out enrichments from a bigBed file. */
 {
 /* Get path to bigBed, open it, and read all chromosomes. */
 char *bigWigPath = cdwPathForFileId(conn, ef->id);
 struct bbiFile *bbi = bigWigFileOpen(bigWigPath);
 struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);
 struct bigWigValsOnChrom *valsOnChrom = bigWigValsOnChromNew();
 
 /* This takes a while, so let's figure out what parts take the time. */
 long totalBigQueryTime = 0;
 long totalOverlapTime = 0;
 
 /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
  * for all targets.  This is complicated by just wanting to keep one chromosome worth of
  * bigWig data in memory. Also just for performance we do a lookup of target range tree to
  * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     {
     long startBigQueryTime = clock1000();
     boolean gotData = bigWigValsOnChromFetchData(valsOnChrom, chrom->name, bbi);
     long endBigQueryTime = clock1000();
     totalBigQueryTime += endBigQueryTime - startBigQueryTime;
     if (gotData)
 	{
 	double *valBuf = valsOnChrom->valBuf;
 	Bits *covBuf = valsOnChrom->covBuf;
 
 	/* Loop through all targets adding overlaps from ivList */
 	long startOverlapTime = clock1000();
 	struct target *target;
 	for (target = targetList; target != NULL; target = target->next)
 	    {
 	    if (target->skip)
 		continue;
 	    struct genomeRangeTree *grt = target->grt;
 	    struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
 	    if (targetTree != NULL)
 		{
 		struct range *range, *rangeList = rangeTreeList(targetTree);
 		for (range = rangeList; range != NULL; range = range->next)
 		    {
 		    int s = range->start, e = range->end, i;
 		    for (i=s; i<=e; ++i)
 		        {
 			if (bitReadOne(covBuf, i))
 			    {
 			    double x = valBuf[i];
 			    target->uniqOverlapBases += 1;
 			    target->overlapBases += x;
 			    }
 			}
 		    }
 		}
 	    }
 	long endOverlapTime = clock1000();
 	totalOverlapTime += endOverlapTime - startOverlapTime;
 	}
     }
 
 verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime);
 
 /* Now loop through targets and save enrichment info to database */
 struct target *target;
 for (target = targetList; target != NULL; target = target->next)
     {
     if (target->skip)
 	continue;
     struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
 	target->overlapBases, target->uniqOverlapBases);
     cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
     cdwQaEnrichFree(&enrich);
     }
 
 bigWigValsOnChromFree(&valsOnChrom);
 bbiChromInfoFreeList(&chromList);
 bigWigFileClose(&bbi);
 freez(&bigWigPath);
 }
 
 struct target *targetsForAssembly(struct sqlConnection *conn, struct cdwAssembly *assembly)
 /* Get list of enrichment targets for given assembly */
 {
 char query[128];
 sqlSafef(query, sizeof(query), "select * from cdwQaEnrichTarget where assemblyId=%d", assembly->id);
 struct cdwQaEnrichTarget *et, *etList = cdwQaEnrichTargetLoadByQuery(conn, query);
 
 /* Wrap a new structure around the enrichment targets where we'll store summary info. */
 struct target *target, *targetList = NULL, **targetTail = &targetList;
 for (et = etList; et != NULL; et = et->next)
     {
     char *targetBed = cdwPathForFileId(conn, et->fileId);
     struct genomeRangeTree *grt = cdwGrtFromBigBed(targetBed);
     target = targetNew(et, grt);
     *targetTail = target;
     targetTail = &target->next;
     freez(&targetBed);
     }
 return targetList;
 }
 
 char *ignoredFormats[] = {
 	"idat",
 	"customTrack",
 	"rcc",
 	"kallisto_abundance",
 	"expression_matrix",
 	"bam.bai",
 	"vcf.gz.tbi",
 	"text",
 	"html",
 	"csv",
 	"tsv",
 	"pdf",
+	"png",
 	"unknown",
 	};
 
 void doEnrichments(struct sqlConnection *conn, struct cdwFile *ef, char *path, 
     struct hash *assemblyToTarget)
 /* Calculate enrichments on for all targets file. The targetList and the
  * grtList are in the same order. */
 {
 /* Get validFile from database. */
 struct cdwValidFile *vf = cdwValidFileFromFileId(conn, ef->id);
 if (vf == NULL)
     return;	/* We can only work if have validFile table entry */
 
 if (!isEmpty(vf->enrichedIn) && !sameWord(vf->ucscDb, "unknown") && !isEmpty(vf->ucscDb)
     && !sameWord(vf->format, "unknown"))
     {
     /* Get our assembly */
     char *format = vf->format;
     char *ucscDb = vf->ucscDb;
     char *targetName = cdwSimpleAssemblyName(ucscDb);
     struct cdwAssembly *assembly = cdwAssemblyForUcscDb(conn, targetName);
 
     struct target *targetList = hashFindVal(assemblyToTarget, assembly->name);
     if (targetList == NULL)
 	{
 	targetList = targetsForAssembly(conn, assembly);
 	if (targetList == NULL)
 	    errAbort("No targets for assembly %s", assembly->name);
 	hashAdd(assemblyToTarget, assembly->name, targetList);
 	}
 
     /* Loop through targetList zeroing out existing ovelaps. */
     struct target *target;
     boolean allSkip = TRUE;
     for (target = targetList; target != NULL; target = target->next)
 	{
 	target->overlapBases = target->uniqOverlapBases = 0;
 	target->skip = enrichmentExists(conn, ef, target->target);
 	if (!target->skip)
 	    allSkip = FALSE;
 	}
 
     /* Do a big dispatch based on format. 
      * Current formats; fastq, bigWig, bed, bigBed, gtf, gff, bam, vcf, idat, curstomTrack, rcc. 
      * formats cnt; kallisto_abundance, bam.bai, vcf.gz.tbi, expression_matrix, unknown.  */
     if (!allSkip)
 	{
 	if (sameString(format, "fastq"))
 	    doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList);
 	else if (sameString(format, "bigWig"))
 	    doEnrichmentsFromBigWig(conn, ef, vf, assembly, targetList);
 	else if (startsWith("bed_", format))
 	    doEnrichmentsFromBed(conn, ef, vf, assembly, targetList);
 	else if (cdwIsSupportedBigBedFormat(format) || sameString(format, "bigBed"))
 	    doEnrichmentsFromBigBed(conn, ef, vf, assembly, targetList);
 	else if (sameString(format, "gtf"))
 	    doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList);
 	else if (sameString(format, "gff"))
 	    doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList);
 	else if (sameString(format, "bam"))
 	    doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList);
 	else if (sameString(format, "vcf"))
 	    doEnrichmentsFromSampleBed(conn, ef, vf, assembly, targetList);
 	else if (stringIx(format, ignoredFormats) >= 0)
 	    verbose(2, "Ignoring %s %s in doEnrichments, that's ok\n", format, ef->cdwFileName);
 	else
 	    errAbort("Unrecognized format %s in doEnrichments(%s)", format, path);
 	}
 
     /* Clean up and go home. */
     cdwAssemblyFree(&assembly);
     }
 cdwValidFileFree(&vf);
 }
 
 
 void cdwMakeEnrichments(int startFileId, int endFileId)
 /* cdwMakeEnrichments - Scan through database and make a makefile to calc. enrichments and store 
  * in database. */
 {
 /* Make list with all files in ID range */
 struct sqlConnection *conn = sqlConnect(cdwDatabase);
 struct cdwFile *ef, *efList = cdwFileAllIntactBetween(conn, startFileId, endFileId);
 
 /* Make up a hash for targets keyed by assembly name. */
 struct hash *assemblyToTarget = hashNew(0);
 
 for (ef = efList; ef != NULL; ef = ef->next)
     {
     char path[PATH_LEN];
     safef(path, sizeof(path), "%s%s", cdwRootDir, ef->cdwFileName);
     verbose(1, "%lld processing %s aka %s\n", (long long)ef->id, ef->submitFileName, path);
 
     if (ef->tags) // All ones we care about have tags
 	doEnrichments(conn, ef, path, assemblyToTarget);
     }
 sqlDisconnect(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 cdwMakeEnrichments(sqlUnsigned(argv[1]), sqlUnsigned(argv[2]));
 return 0;
 }