src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c 1.2
1.2 2010/05/05 00:50:37 kent
Doing another pass at the regulatory clustering - this time whole genome rather than just chromosome 22.
Index: src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 4 -r1.1 -r1.2
--- src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c 8 Mar 2010 23:35:07 -0000 1.1
+++ src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c 5 May 2010 00:50:37 -0000 1.2
@@ -4,25 +4,33 @@
#include "hash.h"
#include "options.h"
#include "obscure.h"
#include "sqlNum.h"
-#include "hmmStats.h"
+#include "hmmstats.h"
static char const rcsid[] = "$Id$";
+boolean encodeList = FALSE;	/* Set from the -encodeList option: input is an ENCODE downloads-page listing. */
+
void usage()
/* Explain usage and exit. */
{
errAbort(
"regClusterBedExpCfg - Create config file for hgBedsToBedExps from list of files.\n"
"usage:\n"
" regClusterBedExpCfg inputFileList output.cfg\n"
"options:\n"
- " -xxx=XXX\n"
+ " -encodeList - the inputFileList is of format you might get from cut and paste of\n"
+ " encode downloads page - tab separated with following columns:\n"
+ " <relDate> <fileName> <fileSize> <submitDate> <metadata>\n"
+ " where the metadata component is in the format:\n"
+ " this=that; that=two words; that=whatever\n"
+ " and the cell and antibody tags in the metadata are used\n"
);
}
static struct optionSpec options[] = {
+ {"encodeList", OPTION_BOOLEAN},	/* input list is in ENCODE downloads-page format */
{NULL, 0},
};
int commonPrefixSize(struct slName *list)
@@ -119,10 +127,10 @@
if (highEnd > maxVal) highEnd = maxVal;
return 1000.0/highEnd;
}
-void regClusterBedExpCfg(char *input, char *output)
-/* regClusterBedExpCfg - Create config file for hgBedsToBedExps from list of files.. */
+void makeConfigFromFileList(char *input, char *output)
+/* makeConfigFromFileList - Create config file for hgBedsToBedExps from list of files. */
{
FILE *f = mustOpen(output, "w");
struct slName *in, *inList = readAllLines(input);
int commonPrefix = commonPrefixSize(inList);
@@ -142,13 +150,92 @@
}
carefulClose(&f);
}
+void makeConfigFromEncodeList(char *input, char *output)
+/* Create config file for hgBedsToBedExps from tab-separated file of format
+ * <relDate> <fileName> <fileSize> <submitDate> <metadata>.  Aborts on a bad line. */
+{
+FILE *f = mustOpen(output, "w");
+struct lineFile *lf = lineFileOpen(input, TRUE);
+char *line;
+
+while (lineFileNextReal(lf, &line))
+    {
+    /* Parse out line into major components. */
+    char *releaseDate = nextWord(&line);
+    char *fileName = nextWord(&line);
+    char *fileSize = nextWord(&line);
+    char *submitDate = nextWord(&line);
+    char *metadata = trimSpaces(line);
+    if (isEmpty(metadata))
+        errAbort("line %d of %s is truncated", lf->lineIx, lf->fileName);
+
+    verbose(2, "releaseDate=%s; fileName=%s; fileSize=%s; submitDate=%s; %s\n",
+        releaseDate, fileName, fileSize, submitDate, metadata);
+
+    /* Loop through metadata looking for cell and antibody.  Metadata
+     * is in format this=that; that=two words; that=whatever */
+    char *cell = NULL, *antibody = NULL;
+    for (;;)
+        {
+        /* Find terminating semicolon if any, replace it with zero, and
+         * note position for next time around loop. */
+        metadata = skipLeadingSpaces(metadata);
+        if (isEmpty(metadata))
+            break;
+        char *semi = strchr(metadata, ';');
+        if (semi != NULL)
+            *semi++ = 0;
+
+        /* Parse out name/value pair. */
+        char *name = metadata;
+        char *value = strchr(metadata, '=');
+        if (value == NULL)
+            errAbort("Missing '=' in metadata after tag %s in line %d of %s",
+                name, lf->lineIx, lf->fileName);
+        *value++ = 0;
+        name = trimSpaces(name);
+        value = trimSpaces(value);
+
+        /* Look for our tags. */
+        if (sameString(name, "cell"))
+            cell = value;
+        else if (sameString(name, "antibody"))
+            antibody = value;
+
+        metadata = semi;
+        }
+    if (isEmpty(cell))	/* empty counts as missing: cell[0] is written below */
+        errAbort("No cell in metadata line %d of %s", lf->lineIx, lf->fileName);
+    if (isEmpty(antibody))	/* likewise, do not emit an empty antibody column */
+        errAbort("No antibody in metadata line %d of %s", lf->lineIx, lf->fileName);
+    /* Emit one tab-separated config line for this file. */
+    fprintf(f, "%s\t%s\t", antibody, cell);
+    fprintf(f, "%c\t", cell[0]);
+    fprintf(f, "file\t6\t");
+    fprintf(f, "%g\t", calcNormScoreFactor(fileName, 6));
+    fprintf(f, "%s\n", fileName);
+    }
+lineFileClose(&lf);
+carefulClose(&f);
+}
+
+void regClusterBedExpCfg(char *input, char *output)
+/* Write config for hgBedsToBedExps to output, parsing input as selected by encodeList. */
+{
+if (!encodeList)
+    makeConfigFromFileList(input, output);
+else
+    makeConfigFromEncodeList(input, output);
+}
+
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 3)
usage();
+encodeList = optionExists("encodeList");	/* picks the input parser used by regClusterBedExpCfg */
regClusterBedExpCfg(argv[1], argv[2]);
return 0;
}