src/hg/encode/encodeMergeReplicates/encodeMergeReplicates.c 4898794edd81be5285ea6e544acbedeaeb31bf78

4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/encode/encodeMergeReplicates/encodeMergeReplicates.c src/hg/encode/encodeMergeReplicates/encodeMergeReplicates.c
index 6bd9890..385a0b4 100644
--- src/hg/encode/encodeMergeReplicates/encodeMergeReplicates.c
+++ src/hg/encode/encodeMergeReplicates/encodeMergeReplicates.c
@@ -1,337 +1,337 @@
 /* encodeMergeReplicates - Merge together replicates for a pooled output.  Only works on 
  * narrowPeak and broadPeak files currently. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "localmem.h"
 #include "options.h"
 #include "encode/encodePeak.h"
 #include "peakCluster.h"
 #include "rangeTree.h"
 
 
 int clAgree = 1;
 boolean clAdd = FALSE;
 double clThreshold = 0.0;
 double clGotThreshold = FALSE;
 boolean clUniqueName = FALSE;
 
 #define SCORE_COL_IX 6
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "encodeMergeReplicates - Merge together replicates for a pooled output.  \n"
   "Only works on narrowPeak and broadPeak files currently. Ignores strand.\n"
   "usage:\n"
   "   encodeMergeReplicates in1 in2 in3 ... output\n"
   "options:\n"
   "   -agree=N - only output where have agreement between N replicates, default is 1\n"
   "   -add - add together signals rather than averaging them\n"
   "   -threshold=N.N - only output where signal is over threshold\n"
   "   -maxMin - set threshold to be the maximum of the minima of two replicates\n"
   "   -addMin - set threshold to be the sum of the minima of the two replicates\n"
   "   -uniqueName - make output names unique instead of name of best in cluster\n"
   );
 }
 
 static struct optionSpec options[] = {
    {"agree", OPTION_INT},
    {"add", OPTION_BOOLEAN},
    {"threshold", OPTION_DOUBLE},
    {"maxMin", OPTION_BOOLEAN},
    {"addMin", OPTION_BOOLEAN},
    {"uniqueName", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 struct peakItem *bestItemInRefList(struct slRef *refList)
 /* Return highest scoring item in list */
 {
 struct peakItem *bestItem = refList->val;
 struct slRef *ref;
 for (ref = refList->next; ref != NULL; ref = ref->next)
     {
     struct peakItem *item = ref->val;
     if (item->score > bestItem->score)
         bestItem = item;
     }
 return bestItem;
 }
 
 static int peakClusterSourceCount(struct peakCluster *cluster)
 /* Return number of sources in cluster. */
 {
 /* Use simple algorithm here since # of sources likely to just be 1-3 */
 struct slRef *uniqList = NULL, *ref;
 for (ref = cluster->itemRefList; ref != NULL; ref = ref->next)
      {
      struct peakItem *item = ref->val;
      refAddUnique(&uniqList, item->source);
      }
 int count = slCount(uniqList);
 slFreeList(&uniqList);
 return count;
 }
 
 static int countNonnumericPrefix(char *s)
 /* Return number of characters until get first digit. */
 {
 int count = 0;
 char c;
 while ((c = *s++) != 0)
     {
     if (isdigit(c))
         break;
     else
         ++count;
     }
 return count;
 }
 
 static void copyNonnumericPrefix(char *source, char *dest, int destSize)
 /* Copy non-numerical prefix if any of source to dest. */
 {
 int size = countNonnumericPrefix(source);
 if (size >= destSize)
     errAbort("Nonnumerical prefix of %s longer than %d chars.", source, destSize);
 memcpy(dest, source, size);
 dest[size] = 0;
 }
 
 void outputClusterNarrowPeak(struct peakCluster *cluster, FILE *f, boolean add,
 	boolean gotThreshold, double threshold, boolean forceUniqueName)
 /* Output cluster of overlapping narrowPeaks - doing average of items. */
 {
 /* Some variables to keep statistics on all items in cluster. */
 double sumP = 0.0, sumQ = 0.0, sumSignal = 0.0;
 long long sumStart = 0, sumEnd = 0;
 long long sumPeak = 0, sumScore = 0;
 
 /* To figure out what to name the cluster, we look for the name of the best 
  * (highest scoring) item. */
 struct slRef *ref, *refList=cluster->itemRefList;
 struct peakItem *bestItem = bestItemInRefList(refList);
 int itemCount = 0;
 char *bestLine = bestItem->asciiLine;
 char *bestName = NULL;
 
 /* Figure out # of words to output based on best item (oh my) */
 int wordCount = chopByWhite(bestLine, NULL, 0);
 boolean gotP = FALSE, gotQ = FALSE;
 for (ref = refList; ref != NULL; ref = ref->next)
     {
     struct peakItem *item = ref->val;
     char *words[wordCount];
     chopByWhite(item->asciiLine, words, wordCount);
     int chromStart = sqlUnsigned(words[1]);
     sumStart += chromStart;
     sumEnd += sqlUnsigned(words[2]);
     if (bestItem == item)
         bestName = words[3];
     sumScore += sqlSigned(words[4]);
     sumSignal += sqlDouble(words[6]);
     if (!sameString(words[7], "-1"))
         {
 	gotP = TRUE;
 	sumP += sqlDouble(words[7]);
 	}
     if (!sameString(words[8], "-1"))
 	{
 	gotQ = TRUE;
 	sumQ += sqlDouble(words[8]);
 	}
     int peak = -1;
     if (wordCount > 9)
         peak = sqlSigned(words[9]) + chromStart;
     sumPeak += peak;
     itemCount += 1;
     }
 
 /* Handle much of the averaging by multiplying by scale factor rather than 
  * dividing by itemCount. */
 double scaleFactor = 1.0;
 if (!add)
     scaleFactor = 1.0/itemCount;
 double signalValue = sumSignal * scaleFactor;
 
 /* Apply threshold if have one. */
 if (!gotThreshold || signalValue >= threshold)
     {
     /* Output chrom/start/end fields. Start and end are averaged from all items regardless 
      * of add flag*/
     fprintf(f, "%s\t", cluster->chrom);		// chrom
     int chromStart = sumStart/itemCount;
     fprintf(f, "%d\t", chromStart);			// chromStart
     fprintf(f, "%d\t", (int)(sumEnd/itemCount));	// chromEnd
 
     /* Figure out what to call it,  same name as best item usually, but can force a
      * unique numerical suffix.  This will replace any existing numerical suffix. */
     char *name = bestName;
     char uniqPrefix[64];
     char uniqName[100];
     if (forceUniqueName)
         {
 	static int uniqIx = 0;
 	copyNonnumericPrefix(name, uniqPrefix, sizeof(uniqPrefix));
 	safef(uniqName, sizeof(uniqName), "%s%d", uniqPrefix, ++uniqIx);
 	name = uniqName;
 	}
     fprintf(f, "%s\t", name);			// name
 
     /* Output score - possibly averaged. */
     int score = sumScore*scaleFactor;
     if (score > 1000) score = 1000;
     fprintf(f, "%d\t", score);			// score 0-1000
 
     /* Some day need to come back to strand... */
     fprintf(f, ".\t");				// strand
 
     /* Output signal, P and Q, possibly doing average, handling not. */
     fprintf(f, "%g\t", sumSignal*scaleFactor);	// signalValue
     if (gotP)
 	fprintf(f, "%g\t", sumP*scaleFactor);		// pValue
     else
 	fprintf(f, "-1\t");
     if (gotQ)
 	fprintf(f, "%g", sumQ*scaleFactor);		// qValue
     else
 	fprintf(f, "-1");
 
     /* Here we need to do something else if there's a mix of narrow and broad in input, or
      * some but not all are -1. */
     if (wordCount > 9)
 	fprintf(f, "\t%d", (int)(sumPeak/itemCount - chromStart));
     fprintf(f, "\n");
     }
 }
 
 static double minOfCol(char *fileName, int colIx)
 /* Return minimum value seen in given column of file. */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 int minColCount = colIx+1;
 char *row[minColCount];
 boolean gotAny = FALSE;
 double minVal = 0;
 while (lineFileNextRow(lf, row, minColCount))
     {
     double val = lineFileNeedDouble(lf, row, colIx);
     if (!gotAny || val < minVal)
 	{
 	gotAny = TRUE;
         minVal = val;
 	}
     }
 lineFileClose(&lf);
 if (!gotAny)
     errAbort("No data in %s", fileName);
 return minVal;
 }
 
 static double maxMinOfReplicates(int inCount, char *inNames[], int scoreColIx)
 /* Return max of mins-within-a-file. */
 {
 if (inCount <= 0)
     errAbort("Need positive inCount in maxMinOfReplicates.  Got %d", inCount);
 double maxVal = minOfCol(inNames[0], scoreColIx);
 int i;
 for (i=1; i<inCount; ++i)
     {
     double val = minOfCol(inNames[i], scoreColIx);
     if (val > maxVal)
         maxVal = val;
     }
 return maxVal;
 }
 
 static double addMinOfReplicates(int inCount, char *inNames[], int scoreColIx)
 /* Return sum of all mins-within-a-file. */
 {
 double sum = 0;
 int i;
 for (i=0; i<inCount; ++i)
     sum += minOfCol(inNames[i], scoreColIx);
 return sum;
 }
 
 void encodeMergeReplicates(int inCount, char *inNames[], char *outName)
 /* encodeMergeReplicates - Merge together replicates for a pooled output.  
  * Only works on broadPeak and narrowPeak files currently. */
 {
 /* Make list of sources out of input files. */
 struct peakSource *source, *sourceList = NULL;
 int i;
 for (i=0; i<inCount; ++i)
     {
     AllocVar(source);
     source->dataSource = inNames[i];
     source->chromColIx = 0;
     source->startColIx = 1;
     source->endColIx = 2;
     source->scoreColIx = SCORE_COL_IX;
     source->normFactor = 1.0;
     source->minColCount = SCORE_COL_IX+1;
     slAddTail(&sourceList, source);
     }
 
 /* Load in from all sources. */
 struct peakClusterMaker *maker = peakClusterMakerNew();
 for (source = sourceList; source != NULL; source = source->next)
     peakClusterMakerAddFromSource(maker, source);
 
 /* Cluster each chromosome. */
 FILE *f = mustOpen(outName, "w");
 struct hashEl *chrom, *chromList = peakClusterMakerChromList(maker);
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     {
     struct rbTree *tree = chrom->val;
     struct range *range, *rangeList = rangeTreeList(tree);
     struct lm *lm = lmInit(0);
     for (range = rangeList; range != NULL; range = range->next)
          {
 	 struct peakCluster *cluster, *clusterList = peakClusterItems(lm, range->val,
 		 BIGNUM, 0.0);
 	 for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
 	     {
 	     if (clAgree < 2 || peakClusterSourceCount(cluster) >= clAgree)
 		 outputClusterNarrowPeak(cluster, f, clAdd, clGotThreshold, clThreshold,
 		 	clUniqueName);
 	     }
 	 }
     lmCleanup(&lm);
     }
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc < 4)
     usage();
 clAgree = optionInt("agree", clAgree);
 clAdd = optionExists("add");
 clGotThreshold = optionExists("threshold");
 clThreshold = optionDouble("threshold", clThreshold);
 if (optionExists("maxMin"))
    {
    clGotThreshold = TRUE;
    clThreshold = maxMinOfReplicates(argc-2, argv+1, SCORE_COL_IX);
    }
 if (optionExists("addMin"))
    {
    clGotThreshold = TRUE;
    clThreshold = addMinOfReplicates(argc-2, argv+1, SCORE_COL_IX);
    }
 if (clGotThreshold)
    verbose(2, "Threshold %g\n", clThreshold);
 clUniqueName = optionExists("uniqueName");
 encodeMergeReplicates(argc-2, argv+1, argv[argc-1]);
 return 0;
 }