4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c index a6dd8d9..df22053 100644 --- src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c +++ src/hg/regulate/regClusterBedExpCfg/regClusterBedExpCfg.c @@ -1,356 +1,356 @@ /* regClusterBedExpCfg - Create config file for hgBedsToBedExps from list of files.. */ /* Copyright (C) 2013 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "obscure.h" #include "sqlNum.h" #include "hmmstats.h" #include "errAbort.h" boolean encodeList = FALSE; boolean tabList = FALSE; boolean useTarget = FALSE; char *cellLetter = NULL; int scoreCol = 7; boolean noLetter = FALSE; boolean noLetterOk = FALSE; boolean noNormalize = FALSE; struct hash *cellLetterHash; void usage() /* Explain usage and exit. */ { errAbort( "regClusterBedExpCfg - Create config file for hgBedsToBedExps from list of files.\n" "usage:\n" " regClusterBedExpCfg inputFileList output.cfg\n" "options:\n" " -cellLetter=file.tab - two column file of form <letter> <name> for cell lines\n" " If not present the first letter of the cell name will be used\n" " -tabList - the inputFileList is in three or four column format of form:\n" " <fileName> <cell> <factor-antibody> [factor-target]\n" " -useTarget - cluster on target (must have 4 column tabList)\n" " -encodeList - the inputFileList is of format you might get from cut and paste of\n" " encode downloads page - tab separated with following columns:\n" " <relDate> <fileName> <fileSize> <submitDate> <metadata>\n" " where the metadata component is in the format:\n" " this=that; that=two words; that=whatever\n" " and the antibody and cell tags in the metadata are used\n" " -scoreCol=N - The column (starting with 1) with score. 5 for bed, 7 for narrowPeak\n" " default %d\n" " -noLetter - just list cell types found in the inputFileList that lack a code in the cellLetter file\n" " -noLetterOk - use first letter of cell name (lower-cased) if not found in cell letter file. Strips trailing + qualifiers in cell name before looking up in cell letter file.\n" " -noNormalize - skip normalization, set norm value to 1 and scoreCol to 4 (score) in cfg file\n" , scoreCol ); } static struct optionSpec options[] = { {"encodeList", OPTION_BOOLEAN}, {"tabList", OPTION_BOOLEAN}, {"useTarget", OPTION_BOOLEAN}, {"cellLetter", OPTION_STRING}, {"scoreCol", OPTION_INT}, {"noLetter", OPTION_BOOLEAN}, {"noLetterOk", OPTION_BOOLEAN}, {"noNormalize", OPTION_BOOLEAN}, {NULL, 0}, }; char *cellAbbrevDefault(char *cell, boolean toLower) /* Return default abbreviation of cell-name */ { static char buf[2]; buf[0] = (toLower ? tolower(cell[0]) : cell[0]); buf[1] = 0; return buf; } char *cellAbbreviation(char *cell) /* Return abbreviated version of cell-name */ { if (cellLetterHash == NULL) return cellAbbrevDefault(cell, FALSE); if (noLetterOk) { // strip qualifiers (follow the '+' char) char *plus = stringIn("+", cell); if (plus) *plus = 0; } char *val = hashFindVal(cellLetterHash, cell); if (val != NULL) return val; if (noLetterOk) return cellAbbrevDefault(cell, TRUE); if (noLetter) uglyf("cell %s isn't in %s\n", cell, cellLetter); else errAbort("cell %s isn't in %s\n", cell, cellLetter); return NULL; } int commonPrefixSize(struct slName *list) /* Return length of common prefix */ { if (list == NULL) return 0; int commonSize = strlen(list->name); struct slName *el, *lastEl = list; for (el = list->next; el != NULL; el = el->next) { int sameSize = countSame(el->name, lastEl->name); commonSize = min(sameSize, commonSize); lastEl = el; } return commonSize; } int countSameAtEnd(char *a, char *b) /* Count number of characters at end of strings that are same in each string. */ { int count = 0; char *aEnd = a + strlen(a); char *bEnd = b + strlen(b); while (--aEnd >= a && --bEnd >= b) { if (*aEnd != *bEnd) break; ++count; } return count; } int commonSuffixSize(struct slName *list) /* Return length of common suffix */ { if (list == NULL) return 0; int commonSize = strlen(list->name); struct slName *el, *lastEl = list; for (el = list->next; el != NULL; el = el->next) { int sameSize = countSameAtEnd(el->name, lastEl->name); commonSize = min(sameSize, commonSize); lastEl = el; } return commonSize; } void camelParseTwo(char *in, char **retA, char **retB) /* Parse out CamelCased in into a and b. */ { char *s = in; char *aStart = s; char *bStart = NULL; char c; while ((c = *(++s)) != 0) { if (isupper(c)) { bStart = s; break; } } if (bStart == NULL) errAbort("Couldn't find start of second word in %s", in); *retA = cloneStringZ(aStart, bStart - aStart); *retB = cloneString(bStart); } double calcNormScoreFactor(char *fileName, int scoreCol) /* Figure out what to multiply things by to get a nice browser score (0-1000) */ { if (noNormalize) return 1.0; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[scoreCol+1]; double sum = 0, sumSquares = 0; int n = 0; double minVal=0, maxVal=0; while (lineFileRow(lf, row)) { double x = sqlDouble(row[scoreCol]); if (n == 0) minVal = maxVal = x; if (x < minVal) minVal = x; if (x > maxVal) maxVal = x; sum += x; sumSquares += x*x; n += 1; } lineFileClose(&lf); double std = calcStdFromSums(sum, sumSquares, n); double mean = sum/n; double highEnd = mean + std; if (highEnd > maxVal) highEnd = maxVal; return 1000.0/highEnd; } void makeConfigFromFileList(char *input, char *output) /* makeConfigFromFileList - Create config file for hgBedsToBedExps from list of files.. */ { FILE *f = mustOpen(output, "w"); struct slName *in, *inList = readAllLines(input); int commonPrefix = commonPrefixSize(inList); int commonSuffix = commonSuffixSize(inList); for (in = inList; in != NULL; in = in->next) { char *s = in->name; int len = strlen(s); char *midString = cloneStringZ(s+commonPrefix, len - commonPrefix - commonSuffix); char *factor, *cell; camelParseTwo(midString, &cell, &factor); fprintf(f, "%s\t%s\t", factor, cell); fprintf(f, "%s\t", cellAbbreviation(cell)); fprintf(f, "file\t%d\t", scoreCol-1); fprintf(f, "%g\t", calcNormScoreFactor(in->name, scoreCol-1)); fprintf(f, "%s\n", in->name); } carefulClose(&f); } void makeConfigFromTabList(char *input, char *output, boolean useTarget) /* makeConfigFromFileList - Create config file for hgBedsToBedExps from list of file/cell/ab or file/cell/ab/target. */ { struct lineFile *lf = lineFileOpen(input, TRUE); char *row[4]; FILE *f = mustOpen(output, "w"); while (lineFileRow(lf, row)) { char *fileName = row[0]; char *cell = row[1]; char *factor = row[2]; verbose(3, "%s\n", fileName); if (useTarget) // 4 column input file -- output target cell+treatment+factor fprintf(f, "%s\t%s+%s\t", row[3], cell, factor); else // antibody cell+treatment fprintf(f, "%s\t%s", factor, cell); fprintf(f, "\t%s\t", cellAbbreviation(cell)); fprintf(f, "file\t%d\t", scoreCol-1); fprintf(f, "%g\t", calcNormScoreFactor(fileName, scoreCol-1)); fprintf(f, "%s\n", fileName); } lineFileClose(&lf); carefulClose(&f); } void makeConfigFromEncodeList(char *input, char *output) /* create config file for hgBedsToBedExps from tab-separated file of format * <relDate> <fileName> <fileSize> <submitDate> <metadata> */ { FILE *f = mustOpen(output, "w"); struct lineFile *lf = lineFileOpen(input, TRUE); char *line; while (lineFileNextReal(lf, &line)) { /* Parse out line into major components. */ char *releaseDate = nextWord(&line); char *fileName = nextWord(&line); char *fileSize = nextWord(&line); char *submitDate = nextWord(&line); char *metadata = trimSpaces(line); if (isEmpty(metadata)) errAbort("line %d of %s is truncated", lf->lineIx, lf->fileName); verbose(2, "releaseDate=%s; fileName=%s; fileSize=%s; submitDate=%s; %s\n", releaseDate, fileName, fileSize, submitDate, metadata); /* Loop through metadata looking for cell and antibody. Metadata * is in format this=that; that=two words; that=whatever */ char *cell = NULL, *antibody = NULL; for (;;) { /* Find terminating semicolon if any replace it with zero, and * note position for next time around loop. */ metadata = skipLeadingSpaces(metadata); if (isEmpty(metadata)) break; char *semi = strchr(metadata, ';'); if (semi != NULL) *semi++ = 0; /* Parse out name/value pair. */ char *name = metadata; char *value = strchr(metadata, '='); if (value == NULL) errAbort("Missing '=' in metadata after tag %s in line %d of %s", name, lf->lineIx, lf->fileName); *value++ = 0; name = trimSpaces(name); value = trimSpaces(value); /* Look for our tags. */ if (sameString(name, "cell")) cell = value; else if (sameString(name, "antibody")) antibody = value; metadata = semi; } if (cell == NULL) errAbort("No cell in metadata line %d of %s", lf->lineIx, lf->fileName); if (antibody == NULL) errAbort("No antibody in metadata line %d of %s", lf->lineIx, lf->fileName); fprintf(f, "%s\t%s\t", antibody, cell); fprintf(f, "%s\t", cellAbbreviation(cell)); fprintf(f, "file\t%d\t", scoreCol-1); fprintf(f, "%g", calcNormScoreFactor(fileName, scoreCol-1)); fprintf(f, "\t%s\n", fileName); } carefulClose(&f); } void regClusterBedExpCfg(char *input, char *output) /* regClusterBedExpCfg - Create config file for hgBedsToBedExps from list of files.. */ { if (cellLetter) cellLetterHash = hashTwoColumnFile(cellLetter); if (encodeList) makeConfigFromEncodeList(input, output); else if (tabList) makeConfigFromTabList(input, output, useTarget); else makeConfigFromFileList(input, output); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); encodeList = optionExists("encodeList"); tabList = optionExists("tabList"); useTarget = optionExists("useTarget"); cellLetter = optionVal("cellLetter", cellLetter); scoreCol = optionInt("scoreCol", scoreCol); noLetter = optionExists("noLetter"); noLetterOk = optionExists("noLetterOk"); noNormalize = optionExists("noNormalize"); if (noNormalize) // standard score column index added to config file scoreCol = 5; else { verbose(2, "Normalizing score using column %d\n", scoreCol); } regClusterBedExpCfg(argv[1], argv[2]); return 0; }