src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c 1.2

1.2 2010/05/05 00:50:37 kent
Doing another pass at the regulatory clustering - this time whole genome rather than just chromosome 22.
Index: src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c	8 Mar 2010 23:35:07 -0000	1.1
+++ src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c	5 May 2010 00:50:37 -0000	1.2
@@ -1,154 +1,241 @@
 /* regClusterBedExpCfg - Create config file for hgBedsToBedExps from list of files.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "obscure.h"
 #include "sqlNum.h"
-#include "hmmStats.h"
+#include "hmmstats.h"
 
 static char const rcsid[] = "$Id$";
 
+boolean encodeList = FALSE;
+
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "regClusterBedExpCfg - Create config file for hgBedsToBedExps from list of files.\n"
   "usage:\n"
   "   regClusterBedExpCfg inputFileList output.cfg\n"
   "options:\n"
-  "   -xxx=XXX\n"
+  "   -encodeList - the inputFileList is of format you might get from cut and paste of\n"
+  "        encode downloads page - tab separated with following columns:\n"
+  "             <relDate> <fileName> <fileSize> <submitDate> <metadata>\n"
+  "        where the metadata component is in the format:\n"
+  "              this=that; that=two words; that=whatever\n"
+  "        and the antibody and factor tags in the metadata are used\n"
   );
 }
 
 static struct optionSpec options[] = {
+   {"encodeList", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 int commonPrefixSize(struct slName *list)
 /* Return length of common prefix */
 {
 if (list == NULL)
     return 0;
 int commonSize = strlen(list->name);
 struct slName *el, *lastEl = list;
 for (el = list->next; el != NULL; el = el->next)
     {
     int sameSize = countSame(el->name, lastEl->name);
     commonSize = min(sameSize, commonSize);
     lastEl = el;
     }
 return commonSize;
 }
 
 int countSameAtEnd(char *a, char *b)
 /* Count number of characters at end of strings that are same in each string. */
 {
 int count = 0;
 char *aEnd = a + strlen(a);
 char *bEnd = b + strlen(b);
 while  (--aEnd >= a && --bEnd >= b)
     {
     if (*aEnd != *bEnd)
         break;
     ++count;
     }
 return count;
 }
 
 int commonSuffixSize(struct slName *list)
 /* Return length of common suffix */
 {
 if (list == NULL)
     return 0;
 int commonSize = strlen(list->name);
 struct slName *el, *lastEl = list;
 for (el = list->next; el != NULL; el = el->next)
     {
     int sameSize = countSameAtEnd(el->name, lastEl->name);
     commonSize = min(sameSize, commonSize);
     lastEl = el;
     }
 return commonSize;
 }
 
 void camelParseTwo(char *in, char **retA, char **retB)
 /* Parse out CamelCased in into a and b.  */
 {
 char *s = in;
 char *aStart = s;
 char *bStart = NULL;
 char c;
 while ((c = *(++s)) != 0)
     {
     if (isupper(c))
         {
 	bStart = s;
 	break;
         }
     }
 if (bStart == NULL)
    errAbort("Couldn't find start of second word in %s", in);
 *retA = cloneStringZ(aStart, bStart - aStart);
 *retB = cloneString(bStart);
 }
 
 double calcNormScoreFactor(char *fileName, int scoreCol)
 /* Figure out what to multiply things by to get a nice browser score (0-1000) */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[scoreCol+1];
 double sum = 0, sumSquares = 0;
 int n = 0;
 double minVal=0, maxVal=0;
 while (lineFileRow(lf, row))
     {
     double x = sqlDouble(row[scoreCol]);
     if (n == 0)
         minVal = maxVal = x;
     if (x < minVal) minVal = x;
     if (x > maxVal) maxVal = x;
     sum += x;
     sumSquares += x*x;
     n += 1;
     }
 
 double std = calcStdFromSums(sum, sumSquares, n);
 double mean = sum/n;
 double highEnd = mean + std;
 if (highEnd > maxVal) highEnd = maxVal;
 return 1000.0/highEnd;
 }
 
-void regClusterBedExpCfg(char *input, char *output)
-/* regClusterBedExpCfg - Create config file for hgBedsToBedExps from list of files.. */
+void makeConfigFromFileList(char *input, char *output)
+/* makeConfigFromFileList - Create config file for hgBedsToBedExps from list of files.. */
 {
 FILE *f = mustOpen(output, "w");
 struct slName *in, *inList = readAllLines(input);
 int commonPrefix = commonPrefixSize(inList);
 int commonSuffix = commonSuffixSize(inList);
 for (in = inList; in != NULL; in = in->next)
     {
     char *s = in->name;
     int len = strlen(s);
     char *midString = cloneStringZ(s+commonPrefix, len - commonPrefix - commonSuffix);
     char *factor, *cell;
     camelParseTwo(midString, &cell, &factor);
     fprintf(f, "%s\t%s\t", factor, cell);
     fprintf(f, "%c\t", cell[0]);
     fprintf(f, "file\t7\t");
     fprintf(f, "%g\t", calcNormScoreFactor(in->name, 7));
     fprintf(f, "%s\n", in->name);
     }
 carefulClose(&f);
 }
 
+void makeConfigFromEncodeList(char *input, char *output)
+/* create config file for hgBedsToBedExps from tab-separated file of format
+ *         <relDate> <fileName> <fileSize> <submitDate> <metadata> */
+{
+FILE *f = mustOpen(output, "w");
+struct lineFile *lf = lineFileOpen(input, TRUE);
+char *line;
+
+while (lineFileNextReal(lf, &line))
+    {
+    /* Parse out line into major components. */
+    char *releaseDate = nextWord(&line);
+    char *fileName = nextWord(&line);
+    char *fileSize = nextWord(&line);
+    char *submitDate = nextWord(&line);
+    char *metadata = trimSpaces(line);
+    if (isEmpty(metadata))
+        errAbort("line %d of %s is truncated", lf->lineIx, lf->fileName);
+
+    verbose(2, "releaseDate=%s; fileName=%s; fileSize=%s; submitDate=%s; %s\n", 
+    	releaseDate, fileName, fileSize, submitDate, metadata);
+
+
+    /* Loop through metadata looking for cell and antibody.  Metadata
+     * is in format this=that; that=two words; that=whatever */
+    char *cell = NULL, *antibody = NULL;
+    for (;;)
+        {
+	/* Find terminating semicolon if any replace it with zero, and
+	 * note position for next time around loop. */
+	metadata = skipLeadingSpaces(metadata);
+	if (isEmpty(metadata))
+	    break;
+	char *semi = strchr(metadata, ';');
+	if (semi != NULL)
+	   *semi++ = 0;
+
+	/* Parse out name/value pair. */
+	char *name = metadata;
+	char *value = strchr(metadata, '=');
+	if (value == NULL)
+	   errAbort("Missing '=' in metadata after tag %s in line %d of %s", 
+	   	name, lf->lineIx, lf->fileName);
+	*value++ = 0;
+	name = trimSpaces(name);
+	value = trimSpaces(value);
+
+	/* Look for our tags. */
+	if (sameString(name, "cell"))
+	    cell = value;
+	else if (sameString(name, "antibody"))
+	    antibody = value;
+
+	metadata = semi;
+	}
+    if (cell == NULL) 
+        errAbort("No cell in metadata line %d of %s", lf->lineIx, lf->fileName);
+    if (antibody == NULL) 
+        errAbort("No antibody in metadata line %d of %s", lf->lineIx, lf->fileName);
+
+    fprintf(f, "%s\t%s\t", antibody, cell);
+    fprintf(f, "%c\t", cell[0]);
+    fprintf(f, "file\t6\t");
+    fprintf(f, "%g\t", calcNormScoreFactor(fileName, 6));
+    fprintf(f, "%s\n", fileName);
+    }
+carefulClose(&f);
+}
+
+void regClusterBedExpCfg(char *input, char *output)
+/* regClusterBedExpCfg - Create config file for hgBedsToBedExps from list of files.. */
+{
+if (encodeList)
+    makeConfigFromEncodeList(input, output);
+else
+    makeConfigFromFileList(input, output);
+}
+ 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
+encodeList = optionExists("encodeList");
 regClusterBedExpCfg(argv[1], argv[2]);
 return 0;
 }