4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/encode3/encodeDataWarehouse/edwQaEvaluate/edwQaEvaluate.c src/hg/encode3/encodeDataWarehouse/edwQaEvaluate/edwQaEvaluate.c
index d61bed7..a082dfb 100644
--- src/hg/encode3/encodeDataWarehouse/edwQaEvaluate/edwQaEvaluate.c
+++ src/hg/encode3/encodeDataWarehouse/edwQaEvaluate/edwQaEvaluate.c
@@ -1,593 +1,593 @@
 /* edwQaEvaluate - Consider available evidence and set edwValidFile.*QaStatus. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "errAbort.h"
 #include "encodeDataWarehouse.h"
 #include "edwLib.h"
 
 /* Globals */
 int version = 5;
 
 /* Version history 
  *  1 - initial release
  *  2 - Relaxed most thresholds on RAMPAGE data type, which is not expected to map well now.
  *      Relaxed enrichment threshold in ChIP-seq and DNase-seq down to 2 since 'open is not 
  *      very specific.
  *      Split RNA-seq into:
  *      'Long RNA-seq' - reads 200 bases or longer or polyadenylated mRNA
  *      'RNA-seq' - unspecified, treated same as RNA-seq
  *      'Short RNA-seq' - reads 200 bases or shorter
  *      'miRNA-seq' - micro RNA sequencing
  *  3 - Relaxed DNAse threshold for enrichment in 'open chromatin' to 1.6.
  *  4 - Fixed bug that was putting reverse strand reads in BAM and FASTQ files in the wrong
  *      place.  Did not change thresholds, but getting much better (almost 2x better) 
  *      enrichments and maybe 50% better cross-enrichments as a result.  Also made it so
  *      that files had to be from two different replicates (not technical replicates) to
  *      get credit for a good replicateQa.
  *  5 - Relaxing thresholds for ENCODE2 experiments to be half of those for ENCODE3. 
  *      Relaxing threshold for ChIP-seq to cross-enrichment of 3 (for broad peaks)
  *      Relaxing threshold for ChIP-seq mapping from 80% to 65%
  */
 
 boolean solexaFastqOk = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "edwQaEvaluate - Consider available evidence and set edwValidFile.*QaStatus\n"
   "version: %d\n"
   "usage:\n"
   "   edwQaEvaluate startId endId\n"
   "options:\n"
   "   -solexaFastqOk - if set will pass in spite of fastq being in solexa format\n"
   , version
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"solexaFastqOk", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 struct qaThresholds
 /* Qa thresholds for a data type */
     {
     double fastqMapRatio;	// Minimum mapping ratio for fastq
     double bamMapRatio;		// Minimum mapping ratio for bam
     double fastqQual;	        // Minimum average quality for fastq
     double fastqPairConcordance;  // Minimum concordance between read pairs
     double repeatContent;       // Maximum total repeat content
     double ribosomeContent;	// Maximum ribosomal RNA content
     double closeContamination;	// Maximum contamination from close species (mammal vs mammal)
     double farContamination;	// Maximum contamintation from distant species
     double enrichment;		// Minimum enrichment
     double crossEnrichment;     // Minimum cross-enrichment
     double pearsonClipped;	// Minimum pearson clipped correlation
     };
 
 struct qaThresholds dnaseThresholds = 
 /* Thresholds for DNAse - pretty high since no introns like RNA-seq or base-changing
  * as in methy-seq. */
     {
     .fastqMapRatio = 0.50,
     .bamMapRatio = 0.50,
     .fastqQual = 20,
     .fastqPairConcordance = 0.8,
     .repeatContent = 0.1,
     .ribosomeContent = 0.05,
     .closeContamination = 0.06,
     .farContamination = 0.02,
     .enrichment = 1.6,
     .crossEnrichment = 15,
     .pearsonClipped = 0.10,
     };
 
 struct qaThresholds longRnaSeqThresholds = 
 /* Thresholds for RNA-seq - lower from introns and other issues. */
     {
     .fastqMapRatio = 0.20,
     .bamMapRatio = 0.50,
     .fastqQual = 20,
     .fastqPairConcordance = 0.6,  // introns
     .repeatContent = 0.5,
     .ribosomeContent = 0.15,
     .closeContamination = 0.5,
     .farContamination = 0.03,
     .enrichment = 4,
     .crossEnrichment = 5,
     .pearsonClipped = 0.05,
     };
 
 struct qaThresholds shortRnaSeqThresholds = 
 /* Thresholds for RNA-seq - lower from introns and other issues. */
     {
     .fastqMapRatio = 0.20,
     .bamMapRatio = 0.50,
     .fastqQual = 20,
     .fastqPairConcordance = 0.7,
     .repeatContent = 0.5,
     .ribosomeContent = 0.15,
     .closeContamination = 0.5,
     .farContamination = 0.03,
     .enrichment = 1.5,
     .crossEnrichment = 5,
     .pearsonClipped = 0.05,
     };
 
 struct qaThresholds miRnaSeqThresholds = 
 /* Thresholds for miRNA-seq - lower from introns and other issues. */
     {
     .fastqMapRatio = 0.0005,	// Low expectations for this to align with BWA - 22bp reads
     .bamMapRatio = 0.05,
     .fastqQual = 20,
     .fastqPairConcordance = 0.7,
     .repeatContent = 0.5,
     .ribosomeContent = 0.15,
     .closeContamination = 0.5,
     .farContamination = 0.03,
     .enrichment = 4,
     .crossEnrichment = 5,
     .pearsonClipped = 0.05,
     };
 
 struct qaThresholds chipSeqThresholds = 
 /* Chip-seq thresholds - pretty high since is all straight DNA */
     {
     .fastqMapRatio = 0.65,
     .bamMapRatio = 0.65,
     .fastqQual = 20,
     .fastqPairConcordance = 0.8,
     .repeatContent = 0.1,
     .ribosomeContent = 0.05,
     .closeContamination = 0.06,
     .farContamination = 0.02,
     .enrichment = 2,
     .crossEnrichment = 3,
     .pearsonClipped = 0.03,
     };
 
 struct qaThresholds shotgunBisulfiteSeqThresholds = 
 /* Chip-seq thresholds - pretty high since is all straight DNA */
     {
     .fastqMapRatio = 0.04,  // Don't expect much mapping with BWA
     .bamMapRatio = 0.3,
     .fastqQual = 20,
     .fastqPairConcordance = 0.7,
     .repeatContent = 0.1,
     .ribosomeContent = 0.05,
     .closeContamination = 0.06,
     .farContamination = 0.02,
     .enrichment = 2.5,
     .crossEnrichment = 5,
     .pearsonClipped = 0.05,
     };
 
 struct qaThresholds rampageThresholds = 
 /* Rampage is a way of sequencing selectively near the transcription start site.
  * It requires a special aligner, so don't expect too much from generic BWA here. */
     {
     .fastqMapRatio = 0.001,
     .bamMapRatio = 0.10,
     .fastqQual = 20,
     .fastqPairConcordance = 0.3,
     .repeatContent = 0.1,
     .ribosomeContent = 0.05,
     .closeContamination = 0.06,
     .farContamination = 0.02,
     .enrichment = 2.0,
     .crossEnrichment = 3,
     .pearsonClipped = 0.05,
     };
 
 struct qaThresholds chiaPetThresholds = 
 /* Chia pet links together DNA brought together from a long distance using an antibody. */
     {
     .fastqMapRatio = 0.001,
     .bamMapRatio = 0.40,
     .fastqQual = 25,
     .fastqPairConcordance = 0.001,
     .repeatContent = 0.1,
     .ribosomeContent = 0.05,
     .closeContamination = 0.06,
     .farContamination = 0.02,
     .enrichment = 1.5,
     .crossEnrichment = 1.1,
     .pearsonClipped = 0.0001,
     };
 
 int failQa(struct sqlConnection *conn, struct edwFile *ef, char *whyFormat, ...)
 #ifdef __GNUC__
 __attribute__((format(printf, 3, 4)))
 #endif
    ;
    /* Explain why this failed QA */
 
 int failQa(struct sqlConnection *conn, struct edwFile *ef, char *whyFormat, ...)
 /* Explain why this failed QA  - always returns -1*/
 {
 /* First just do the warn */
 warn("Failing QA on fileId %u", ef->id);
 va_list args;
 va_start(args, whyFormat);
 char reason[512];
 vasafef(reason, sizeof(reason), whyFormat, args);
 warn("%s", reason);
 
 /* See if already have a failure with this file and version. */
 char query[512];
 sqlSafef(query, sizeof(query), 
     "select count(*) from edwQaFail where fileId=%u and qaVersion=%d", ef->id, version);
 int prevFailCount = sqlQuickNum(conn, query);
 
 /* If we are the first need to save it to database. */
 if (prevFailCount == 0)
     {
     struct edwQaFail fail = {.id=0, .fileId=ef->id, .qaVersion=version, .reason=reason};
     edwQaFailSaveToDb(conn, &fail, "edwQaFail", 0);
     }
 return -1;
 }
 
 struct edwQaRepeat *edwQaRepeatMatching(struct sqlConnection *conn, 
 	long long fileId, char *repClass)
 /* Return record associated with repeat of given class and fileId */
 {
 char query[256];
 sqlSafef(query, sizeof(query),
     "select * from edwQaRepeat where fileId=%lld and repeatClass='%s'"
     , fileId, repClass);
 return edwQaRepeatLoadByQuery(conn, query);
 }
 
 boolean isInArray(int *array, int size, int x)
 /* Return TRUE if x is in array of given size. */
 {
 int i;
 for (i=0; i<size; ++i)
     if (array[i] == x)
         return TRUE;
 return FALSE;
 }
 
 int mammalianTaxons[] = {9606, 10090, 10116};
 char *mammalianTaxonsString = "9606,10090,10116";
 
 double maxContam(struct sqlConnection *conn, struct edwValidFile *vf, boolean isClose)
 /* Get list of contamination targets that are close or far */
 {
 char query[512];
 sqlSafef(query, sizeof(query), "select taxon from edwAssembly where ucscDb='%s'", vf->ucscDb);
 int taxon = sqlQuickNum(conn, query);
 assert(taxon != 0);
 
 boolean isMammal = isInArray(mammalianTaxons, ArraySize(mammalianTaxons), taxon);
 if (!isMammal)
     {
     /* Non-mammalian case is relatively easy - there are no close species, everything is distant */
     if (isClose)
         return 0.0;
     else
         {
 	sqlSafef(query, sizeof(query), "select max(mapRatio) from edwQaContam where fileId=%u",
 	    vf->fileId);
 	}
     }
 else
     {
     sqlSafef(query, sizeof(query), 
 	"select max(mapRatio) from edwQaContam c,edwQaContamTarget t, edwAssembly a "
 	"where c.fileId=%u and c.qaContamTargetId=t.id and t.assemblyId = a.id "
 	"and a.taxon %s in (%s)", 
 	vf->fileId, (isClose ? "" : "not"), mammalianTaxonsString);
     }
 return sqlQuickDouble(conn, query);
 }
 
 boolean passContamination(struct sqlConnection *conn,
     struct edwFile *ef, struct edwValidFile *vf, struct edwExperiment *exp,
     struct qaThresholds *threshold)
 /* Return TRUE if pass QA threshold. */
 {
 double closeContamination = maxContam(conn, vf, TRUE);
 double farContamination = maxContam(conn, vf, FALSE);
 if (closeContamination > threshold->closeContamination)
     {
     failQa(conn, ef, "closeContamination = %g, threshold for %s is %g",
 	closeContamination, exp->dataType, threshold->closeContamination);
     return FALSE;
     }
 else if (farContamination > threshold->farContamination)
     {
     failQa(conn, ef, "farContamination = %g, threshold for %s is %g", 
 	farContamination, exp->dataType, threshold->farContamination);
     return FALSE;
     }
 return TRUE;
 }
 
 boolean edwBestPairedSomething(struct sqlConnection *conn, char *query, double *retBest)
 /* Finish up some sort of max query */
 {
 struct sqlResult *sr = sqlGetResult(conn, query);
 char **row = sqlNextRow(sr);
 boolean gotResult = row[0] != NULL;
 if (gotResult)
     *retBest = sqlDouble(row[0]);
 sqlFreeResult(&sr);
 return gotResult;
 }
 
 boolean edwBestCrossEnrichment(struct sqlConnection *conn, long long fileId, double *retBest)
 /* If there are biological replicates for same experiment and output type,  return
  * TRUE and calculate the best cross enrichment. */
 {
 char query[256];
 sqlSafef(query, sizeof(query),
     "select max(sampleSampleEnrichment) from edwQaPairSampleOverlap p,edwValidFile e,edwValidFile y "
     "where elderFileId=e.fileId and youngerFileId=y.fileId and e.replicate != y.replicate "
     "and (elderFileId = %lld or youngerFileId = %lld) "
     , fileId, fileId);
 return edwBestPairedSomething(conn, query, retBest);
 }
 
 boolean edwBestPearsonClipped(struct sqlConnection *conn, long long fileId, double *retBest)
 /* If there are biological replicates for same experiment and output type,  return
  * TRUE and calculate the best cross enrichment. */
 {
 char query[256];
 sqlSafef(query, sizeof(query),
     "select max(pearsonClipped) from edwQaPairCorrelation p,edwValidFile e,edwValidFile y "
     "where elderFileId=e.fileId and youngerFileId=y.fileId and e.replicate != y.replicate "
     "and (elderFileId = %lld or youngerFileId = %lld) "
     , fileId, fileId);
 return edwBestPairedSomething(conn, query, retBest);
 }
 
 int respectManualOverride(int newVal, int oldVal)
 /* If oldVal shows manual override, return it, else newVal. */
 {
 if (oldVal > 1 || oldVal < -1)
     return oldVal;
 return newVal;
 }
 
 void checkThresholds(struct sqlConnection *conn,
     struct edwFile *ef, struct edwValidFile *vf, struct edwExperiment *exp, 
     struct qaThresholds *threshold)
 /* Check that files meet thresholds if possible. */
 {
 char query[512];
 char *dataType = exp->dataType;
 boolean isEncode2 = sameWord(exp->rfa, "ENCODE2");
 
 /* Make local copy of threshold with oldFactor weighed in if need be. */
 struct qaThresholds lt = *threshold;
 threshold = &lt;
 if (isEncode2)
     {
     double oldFactor = 0.5;
     lt.fastqMapRatio *= oldFactor;
     lt.bamMapRatio *= oldFactor;
     lt.fastqQual *= oldFactor;
     lt.fastqPairConcordance *= oldFactor;
     lt.repeatContent /= oldFactor;
     lt.ribosomeContent /= oldFactor;
     lt.closeContamination /= oldFactor;
     lt.farContamination /= oldFactor;
     lt.enrichment *= oldFactor;
     lt.crossEnrichment *= oldFactor;
     lt.pearsonClipped *= oldFactor;
     }
 
 /* First check individual file thresholds. */
 int passStat = 1;
 int pairPassStat = 0;
 if (sameString(vf->format, "fastq"))
     {
     /* Check a bunch of things either in edwValidFile record or edwFastqFile record. */
     struct edwFastqFile *fq = edwFastqFileFromFileId(conn, ef->id);
     if (fq == NULL)
        errAbort("No edwFastqFile for file ID %u", ef->id);
     if (vf->mapRatio < threshold->fastqMapRatio)
         {
 	passStat = failQa(conn, ef, "Map ratio is %g, threshold for %s is %g", 
 	    vf->mapRatio, dataType, threshold->fastqMapRatio);
 	}
     else if (!sameWord(fq->qualType, "sanger") && !solexaFastqOk && !isEncode2)
         {
 	passStat = failQa(conn, ef, "Quality scores type %s not sanger", fq->qualType);
 	}
     else if (fq->qualMean < threshold->fastqQual)
         {
 	passStat = failQa(conn, ef, "Mean quality score is %g, threshold for %s is %g",
 	    fq->qualMean, dataType, threshold->fastqQual);
 	}
     else if (!isEmpty(vf->pairedEnd))
         {
 	struct edwValidFile *otherVf = edwOppositePairedEnd(conn, vf);
 	if (otherVf == NULL)
 	    passStat = 0;   // Need to wait for other end.
 	else
 	    {
 	    struct edwQaPairedEndFastq *pair = edwQaPairedEndFastqFromVfs(conn, vf, otherVf,
 			    NULL, NULL);
 	    if (pair == NULL)
 		errAbort("edwQaPairedEndFastq not found for %u %u", vf->fileId, otherVf->fileId);
 	    if (pair->concordance < threshold->fastqPairConcordance)
 	       failQa(conn, ef, "Pair concordance is %g, threshold for %s is %g",
 			pair->concordance, dataType, threshold->fastqPairConcordance);
 	    edwQaPairedEndFastqFree(&pair);
 	    }
 	}
 
     /* If still passing check repeatContent */
     if (passStat == 1)
         {
 	struct edwQaRepeat *rep = edwQaRepeatMatching(conn, ef->id, "total");
 	if (rep == NULL)
 	    errAbort("No total repeat record for file id %u", ef->id);
 	if (rep->mapRatio > threshold->repeatContent)
 	    passStat = failQa(conn, ef, "Repeat content is %g, threshold for %s is %g",
 		rep->mapRatio, dataType, threshold->repeatContent);
 	edwQaRepeatFree(&rep);
 	}
 
     /* If still passing check ribosomeContent */
     if (passStat == 1)
         {
 	struct edwQaRepeat *rep = edwQaRepeatMatching(conn, ef->id, "rRNA");
 	if (rep != NULL && rep->mapRatio > threshold->ribosomeContent)
 	    {
 	    passStat = failQa(conn, ef, "Ribosomal content is %g, threshold for %s is %g",
 	       rep->mapRatio, dataType, threshold->ribosomeContent);
 	    }
 	edwQaRepeatFree(&rep);
 	}
 
     /* If still passing check for contamination with other organisms. */
     if (passStat == 1)
 	if (!passContamination(conn, ef, vf, exp, threshold))
 	    passStat = -1;
 
     edwFastqFileFree(&fq);
     }
 else if (sameString(vf->format, "bam"))
     {
     if (vf->mapRatio < threshold->bamMapRatio)
         {
 	passStat = failQa(conn, ef, "Map ratio is %g, threshold for %s is %g", 
 	    vf->mapRatio, dataType, threshold->fastqMapRatio);
 	}
     }
 if (sameString(vf->format, "bam") || sameString(vf->format, "fastq") 
    || sameString(vf->format, "bigBed") || sameString(vf->format, "broadPeak")
    || sameString(vf->format, "narrowPeak") || sameString(vf->format, "gtf")
    || sameString(vf->format, "bigWig"))
     {
     /* If still passing check for enrichment */
     char *enrichedIn = vf->enrichedIn;
     if (passStat == 1 && !isEmpty(enrichedIn) && !sameWord(enrichedIn, "unknown"))
 	{
 	sqlSafef(query, sizeof(query),
 	   "select e.enrichment from edwQaEnrich e,edwQaEnrichTarget t "
 	   "where e.qaEnrichTargetId = t.id "
 	   "and e.fileId = %u "
 	   "and t.name = '%s' "
 	   , ef->id, enrichedIn);
 	double enrichment = sqlQuickDouble(conn, query);
 	if (enrichment < threshold->enrichment)
 	    {
 	    passStat = failQa(conn, ef, "Enrichment in %s is %g, threshold for %s is %g",
 		enrichedIn, enrichment, dataType, threshold->enrichment);
 	    }
 	}
 
     /* Finally check cross enrichment */
     if (passStat == 1)
 	{
 	if (sameString(vf->format, "bigWig"))
 	    {
 	    double r = 0;
 	    if (edwBestPearsonClipped(conn, ef->id, &r))
 		{
 		if (r < threshold->pearsonClipped)
 		    pairPassStat = failQa(conn, ef, "ClippedR is %g, threshold for %s is %g",
 			r, dataType, threshold->pearsonClipped);
 		else
 		    pairPassStat = 1;
 		}
 	    }
 	else
 	    {
 	    double crossEnrichment = 0;
 	    if (edwBestCrossEnrichment(conn, ef->id, &crossEnrichment))
 		{
 		if (crossEnrichment < threshold->crossEnrichment)
 		    pairPassStat = failQa(conn, ef, "CrossEnrichment is %g, threshold for %s is %g",
 			crossEnrichment, dataType, threshold->crossEnrichment);
 		else
 		    pairPassStat = 1;
 		}
 	    }
 	}
     }
 else
     passStat = 0;
 passStat = respectManualOverride(passStat, vf->singleQaStatus);
 pairPassStat = respectManualOverride(pairPassStat, vf->replicateQaStatus);
 sqlSafef(query, sizeof(query),
     "update edwValidFile set singleQaStatus=%d, replicateQaStatus=%d, qaVersion=%d "
     "where id=%u",  passStat, pairPassStat, version, vf->id);
 sqlUpdate(conn, query);
 }
 
 void edwQaEvaluate(int startFileId, int endFileId)
 /* edwQaEvaluate - Consider available evidence and set edwValidFile.*QaStatus. */
 {
 struct sqlConnection *conn = edwConnectReadWrite();
 struct edwFile *ef, *efList = edwFileAllIntactBetween(conn, startFileId, endFileId);
 for (ef = efList; ef != NULL; ef = ef->next)
     {
     verbose(2, "Looking at %u\n", ef->id);
     struct edwValidFile *vf = edwValidFileFromFileId(conn, ef->id);
     if (vf != NULL)
 	{
 	verbose(2, " aka %s\n", vf->licensePlate);
 	if (!isEmpty(vf->experiment))
 	    {
 	    struct edwExperiment *exp = edwExperimentFromAccession(conn, vf->experiment);
 	    if (exp != NULL)
 		{
 		char *dataType = exp->dataType;
 		struct qaThresholds *thresholds = NULL;
 		if (sameWord("DNase-seq", dataType))
 		    thresholds = &dnaseThresholds;
 		else if (sameWord("RNA-seq", dataType) || sameWord("Long RNA-seq", dataType))
 		    thresholds = &longRnaSeqThresholds;   
 		else if (sameWord("short RNA-seq", dataType))
 		    thresholds = &shortRnaSeqThresholds;   
 		else if (sameWord("miRNA-seq", dataType))
 		    thresholds = &miRnaSeqThresholds;   
 		else if (sameWord("ChIP-seq", dataType))
 		    thresholds = &chipSeqThresholds;
 		else if (sameWord("Shotgun Bisulfite-seq", dataType))
 		    thresholds = &shotgunBisulfiteSeqThresholds;
 		else if (sameWord("RAMPAGE", dataType))
 		    thresholds = &rampageThresholds;
 		else if (sameWord("ChiaPet", dataType))
 		    thresholds = &chiaPetThresholds;
 		else if (sameWord("", dataType))
 		    ;
 		else
 		    verbose(2, "No thresholds for data type %s", dataType);
 		verbose(2, "  dataType %s, thresholds %p\n", dataType, thresholds);
 		if (thresholds != NULL)
 		    checkThresholds(conn, ef, vf, exp, thresholds);
 		}
 	    else
 		warn("Can't find experiment for '%s'", vf->experiment);
 	    }
 	}
     }
 sqlDisconnect(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 solexaFastqOk = optionExists("solexaFastqOk");
 edwQaEvaluate(sqlUnsigned(argv[1]), sqlUnsigned(argv[2]));
 return 0;
 }