e70152e44cc66cc599ff6b699eb8adc07f3e656a kent Sat May 24 21:09:34 2014 -0700 Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment. diff --git src/utils/convolve/convolve.c src/utils/convolve/convolve.c index ba608dc..6c4e1e9 100644 --- src/utils/convolve/convolve.c +++ src/utils/convolve/convolve.c @@ -1,409 +1,412 @@ /* convolve - read in a set of probabilities from a histogram * calculate convolutions of those probabilities for higher order * histograms */ + +/* Copyright (C) 2011 The Regents of the University of California + * See README in this or parent directory for licensing information. */ #include "common.h" #include "hash.h" #include "options.h" #include "linefile.h" #include <math.h> /* command line option specifications */ static struct optionSpec optionSpecs[] = { {"count", OPTION_INT}, {"html", OPTION_BOOLEAN}, {"logs", OPTION_BOOLEAN}, {NULL, 0} }; static int convolve_count = 4; /* # of times to convolve */ static boolean html = FALSE; /* need neat html output */ static boolean logs = FALSE; /* input data is in logs, not probs */ static void usage() { errAbort( "convolve - perform convolution of probabilities\n" "usage: convolve [-count=N] [-logs] \\\n\t[-html] <file with initial set of probabilities>\n" "\t-count=N - number of times to run convolution\n" "\t-logs - input data is in log base 2 format, not probabilities\n" "\t-html - output in html table row format" ); } /* the histograms will be kept on a hash list. * Definition of one element of that list */ struct histoGram { int bin; double prob; double log_2; }; #ifndef log2 /* log base two is [regardless of the base of log() function] */ #define log2(x) (log(x)/log(2.0)) #endif /* Working in log space for probabilities and need to add * two probabilities together. This case is for logs base * two, but the formula works for any base. 
The trick is that if
 * one of the probabilities is much smaller than the other it is
 * negligible to the sum.
 *
 * have: log0 = log(p0) and log1 = log(p1)
 * want: l = log(p0 + p1) = log(2^log0 + 2^log1)
 * but the problem with this is that the 2^log0 and 2^log1
 * will lose a lot of the precision that is in log0 and log1
 *
 * To solve:
 *
 * let m = max(log0, log1)
 *     d = max(log0, log1) - min(log0, log1)
 * then l = log(2^m + 2^(m-d))      (if m is log0, (m-d) = log1, etc...)
 *        = log(2^m + 2^m*2^(-d))   (x^(a+b) = x^a*x^b)
 *        = log(2^m * (1 + 2^(-d))) (factor out 2^m)
 *        = m + log(1 + 2^(-d))     (log(a*b) = log(a) + log(b))
 */
static double addLogProbabilities(double log0, double log1)
/* Return log2(2^log0 + 2^log1) without leaving log space,
 * preserving precision when the two logs differ greatly. */
{
double larger;	/* the greater of the two input logs (m above) */
double diff;	/* non-negative gap between the logs (d above) */

if (log0 > log1)
    {
    larger = log0;
    diff = log0 - log1;
    }
else
    {
    larger = log1;
    diff = log1 - log0;
    }
return larger + log2(1.0 + pow(2.0, -diff));
}	/* addLogProbabilities() */

/* the printHistogram needs to order the entries from the hash
 * by bin number since the hash is not necessarily ordered.
 * Thus, we first count them, then allocate an array for all,
 * read them into the array, and then traversing the array they
 * are ordered.  bin numbers are assumed to run from 0 to N-1
 * for N bins.
*/ static void printHistogram(struct hash * histo, int medianBin) { int i; /* for loop counting */ int elCount = 0; /* to count the hash elements */ double *probabilities; /* will be an array of probabilities */ double *log_2; /* will be an array of the log_2 */ double *inverseProbabilities; /* inverse of probabilities */ double *inverseLog_2; /* inverse of log_2 */ struct hashEl *el, *elList; /* to traverse the hash */ double cumulativeProbability; /* to show CPD */ double cumulativeLog_2; /* to show CPD */ elList = hashElListHash(histo); /* fetch the hash as a list */ for (el = elList; el != NULL; el = el->next) { ++elCount; /* count hash elements */ } /* Allocate the arrays */ probabilities = (double *) needMem((size_t)(sizeof(double) * elCount)); log_2 = (double *) needMem((size_t) (sizeof(double) * elCount)); inverseProbabilities = (double *) needMem((size_t)(sizeof(double) * elCount)); inverseLog_2 = (double *) needMem((size_t) (sizeof(double) * elCount)); /* Traverse the list again, this time placing all values in the * arrays */ for (el = elList; el != NULL; el = el->next) { struct histoGram *hg; /* histogram hash element */ hg = el->val; probabilities[hg->bin] = hg->prob; log_2[hg->bin] = hg->log_2; } hashElFreeList(&elList); cumulativeProbability = 0.0; cumulativeLog_2 = -500.0; /* arbitrarily small number */ /* compute the inverse P(V<=v) */ for (i = elCount-1; i >= 0; --i) { if (i == (elCount-1)) { cumulativeLog_2 = log_2[i]; } else { cumulativeLog_2 = addLogProbabilities(cumulativeLog_2, log_2[i]); } if (cumulativeLog_2 > 0.0) cumulativeLog_2 = 0.0; inverseLog_2[i] = cumulativeLog_2; inverseProbabilities[i] = pow(2.0,cumulativeLog_2); } printf("Histogram with %d bins:\n", elCount); /* Now the array is an ordered list */ for (i = 0; i < elCount; ++i) { if (i == 0) { cumulativeLog_2 = log_2[i]; } else { cumulativeLog_2 = addLogProbabilities(cumulativeLog_2, log_2[i]); } if (cumulativeLog_2 > 0.0) cumulativeLog_2 = 0.0; cumulativeProbability = 
pow(2.0,cumulativeLog_2); if (html) { if (medianBin == i) printf("<TR><TH> %d <BR> (median) </TH>\n", i); printf("\t<TD ALIGN=RIGHT> %% %.2f </TD><TD ALIGN=RIGHT> %.4g </TD>\n\t<TD ALIGN=RIGHT> %% %.2f </TD><TD ALIGN=RIGHT> %.4f </TD>\n\t<TD ALIGN=RIGHT> %% %.2f </TD><TD ALIGN=RIGHT> %.4f </TD></TR>\n", 100.0 * probabilities[i], log_2[i], 100.0 * cumulativeProbability, cumulativeLog_2, 100.0 * inverseProbabilities[i], inverseLog_2[i]); } else { printf("bin %d: %% %.2f %0.6g\t%% %.2f\t%.6g\t%% %.2f\t%.6g", i, 100.0 * probabilities[i], log_2[i], 100.0 * cumulativeProbability, cumulativeLog_2, 100.0 * inverseProbabilities[i], inverseLog_2[i]); if (medianBin == i) printf(" - median\n"); else printf("\n"); } } } /* printHistogram() */ /* one iteration of convolution * * input is a probability histogram with N bins * bins are numbered 0 through N-1 * output is a probability histogram with (N*2)-1 bins * * The value to calculate in an output bin is a sum * of the probabilities from the input bins that contribute to that * output bin. Specifically: * * output[0] = input[0] * input[0] * output[1] = (input[0] * input[1]) + * (input[1] * input[0]) * output[2] = (input[0] * input[2]) + * (input[1] * input[1]) + * (input[2] * input[0]) * output[3] = (input[0] * input[3]) + * (input[1] * input[2]) + * (input[2] * input[1]) + * (input[3] * input[0]) * output[4] = (input[0] * input[4]) + * (input[1] * input[3]) + * (input[2] * input[2]) + * (input[3] * input[1]) + * (input[4] * input[0]) * the pattern here is that the bin numbers from the input are * summed together to get the bin number of the output. The * probabilities from those bins used from the input in each * individual sum are multiplied together, and all those resulting * multiplies are added together for the resulting output. 
* * A double for() loop would do the above calculation: * for (i = 0; i < N; ++i) { * for (j = 0; j < N; ++j) { * output[i+j] += input[i] * input[j]; * } * } */ static int iteration(struct hash *input, struct hash *output) { struct hashEl *el, *elList; int median = 0; double maxLog = -1000000.0; /* This outside loop is like (i = 0; i < N; ++i) */ elList = hashElListHash(input); for (el = elList; el != NULL; el = el->next) { struct histoGram *hg0; /* a hash element */ struct hashEl *elInput; int bin0; /* This inside loop is like (j = 0; j < N; ++j) */ hg0 = el->val; bin0 = hg0->bin; for (elInput = elList; elInput != NULL; elInput = elInput->next) { struct histoGram *hg1; /* a hash element */ int bin1; char binName[128]; struct hashEl *el; struct histoGram *hgNew; hg1 = elInput->val; bin1 = hg1->bin; /* output bin name is (i+j) == (bin0+bin1) */ snprintf(binName, sizeof(binName), "%d", bin0+bin1); el = hashLookup(output, binName); /* start new output bin if it does not yet exist */ if (el == NULL) { AllocVar(hgNew); hgNew->bin = bin0+bin1; hgNew->prob = hg0->prob * hg1->prob; hgNew->log_2 = hg0->log_2 + hg1->log_2; hashAdd(output, binName, hgNew); } else { /* Add to existing output bin the new inputs */ hgNew = el->val; hgNew->log_2 = addLogProbabilities(hgNew->log_2, (hg0->log_2+hg1->log_2)); hgNew->prob = pow(2.0,hgNew->log_2); } /* Keep track of the resulting output median */ if (hgNew->log_2 > maxLog) { maxLog = hgNew->log_2; median = hgNew->bin; } } } hashElFreeList(&elList); return median; } /* iteration() */ /* convolve() - perform the task on the input data * I would like to rearrange this business here, and instead of * reading in the data and leaving it in the hash for all other * routines to work with, it would be best to get it immediately * into an array. That makes the work of the other routines much * easier. 
*/ static void convolve(int argc, char *argv[]) { int i; struct lineFile *lf; /* for line file utilities */ for (i = 1; i < argc; ++i) { int lineCount = 0; /* counting input lines */ char *line = (char *)NULL; /* to receive data input line */ char *words[128]; /* to split data input line */ int wordCount = 0; /* result of split */ struct hash *histo0; /* first histogram */ struct hash *histo1; /* second histogram */ int medianBin0 = 0; /* bin at median for histo0 */ double medianLog_2 = -500.0; /* log at median */ int bin = 0; /* 0 to N-1 for N bins */ int convolutions = 0; /* loop counter for # of convolutions */ histo0 = newHash(0); lf = lineFileOpen(argv[i], TRUE); /* input file */ verbose(1, "Processing %s\n", argv[1]); while (lineFileNext(lf, &line, NULL)) { int j; /* loop counter over words */ int inputValuesCount = 0; struct histoGram *hg; /* an allocated hash element */ ++lineCount; chopPrefixAt(line, '#'); /* ignore any comments starting with # */ if (strlen(line) < 3) /* anything left on this line ? 
*/ continue; /* no, go to next line */ wordCount = chopByWhite(line, words, 128); if (wordCount < 1) warn("Expecting at least a word at line %d, file: %s, found %d words", lineCount, argv[i], wordCount); if (wordCount == 128) warn("May have more than 128 values at line %d, file: %s", lineCount, argv[i]); verbose(2, "Input data read from file: %s\n", argv[i]); for (j = 0; j < wordCount; ++j) { char binName[128]; double dataValue; double probInput; double log_2; dataValue = strtod(words[j], NULL); ++inputValuesCount; if (logs) { log_2 = dataValue; probInput = pow(2.0,log_2); } else { if (dataValue > 0.0) { log_2 = log2(dataValue); probInput = dataValue; } else { log_2 = -500.0; /* arbitrary limit */ probInput = pow(2.0,log_2); } } if (log_2 > medianLog_2) { medianLog_2 = log_2; medianBin0 = bin; } verbose(2, "bin %d: %g %0.5g\n", inputValuesCount-1, probInput, log_2); AllocVar(hg); /* the histogram element */ hg->bin = bin; hg->prob = probInput; hg->log_2 = log_2; snprintf(binName, sizeof(binName), "%d", hg->bin); hashAdd(histo0, binName, hg); ++bin; } /* for each word on an input line */ } /* for each line in a file */ /* file read complete, echo input */ if (verboseLevel() >= 2) printHistogram(histo0, medianBin0); /* perform convolutions to specified count * the iteration does histo0 with itself to produce histo1 * Then histo0 is freed and histo1 copied to it for the * next loop. 
*/ for (convolutions = 0; convolutions < convolve_count; ++convolutions) { int medianBin; histo1 = newHash(0); medianBin = iteration(histo0, histo1); if (verboseLevel() >= 2) printHistogram(histo1, medianBin); freeHashAndVals(&histo0); histo0 = histo1; } } /* for each input file */ } /* convolve() */ int main(int argc, char *argv[]) { optionInit(&argc, argv, optionSpecs); if(argc < 2) usage(); convolve_count = optionInt("count", 4); logs = optionExists("logs"); html = optionExists("html"); if (verboseLevel() >= 2) { printf("options: -verbose, input file(s):\n"); printf("-count=%d\n", convolve_count); printf("data input is in %s format\n", logs ? "log" : "probability"); if (html) printf ("output in html format\n"); } convolve(argc, argv); return(0); }