e70152e44cc66cc599ff6b699eb8adc07f3e656a
kent
  Sat May 24 21:09:34 2014 -0700
Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment.
diff --git src/utils/splitFileByColumn/splitFileByColumn.c src/utils/splitFileByColumn/splitFileByColumn.c
index cc12bb7..9fc4f50 100644
--- src/utils/splitFileByColumn/splitFileByColumn.c
+++ src/utils/splitFileByColumn/splitFileByColumn.c
@@ -1,253 +1,256 @@
 /* splitFileByColumn - Split text input into files named by column value. */
+
+/* Copyright (C) 2014 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "dystring.h"
 #include "options.h"
 #include "obscure.h"
 
 /* Option variables: */
 /* -col: 1-based index of the column whose value names the output file. */
 int col = 1;
 /* Contents of the -head file; written at the start of each new output file. */
 char *headerText = NULL;
 /* Contents of the -tail file; appended to each output file before closing. */
 char *tailerText = NULL;
 /* -chromDirs: place each output file in a subdirectory of outDirName. */
 boolean chromDirs = FALSE;
 /* Suffix appended to every output file name; from -ending or, if that is
  * not given, taken from the source file name in splitFileByColumn(). */
 char *ending = NULL;
 /* -tab: split input lines with chopTabs instead of chopLine. */
 boolean tab = FALSE;
 
 /* Make this global since we will need it when traversing a hash: */
 /* Output directory, set from the second positional argument in main(). */
 char *outDirName = NULL;
 
 /* Largest -col value accepted by main(); splitFileByColumn() sizes its
  * per-line word array as MAX_COL_OFFSET+1. */
 #define MAX_COL_OFFSET 32
 
 void usage()
 /* Explain usage and exit.  The whole help text is handed to errAbort in a
  * single call. */
 {
 errAbort(
 "splitFileByColumn - Split text input into files named by column value\n"
 "usage:\n"
 "   splitFileByColumn source outDir\n"
 "options:\n"
 "   -col=N      - Use the Nth column value (default: N=1, first column)\n"
 "   -head=file  - Put head in front of each output\n"
 "   -tail=file  - Put tail at end of each output\n"
 "   -chromDirs  - Split into subdirs of outDir that are distilled from chrom\n"
 "                 names, e.g. chr3_random -> outDir/3/chr3_random.XXX .\n"
 "   -ending=XXX - Use XXX as the dot-suffix of split files (default: taken\n"
 "                 from source).\n"
 "   -tab        - Split by tab characters instead of whitespace.\n"
 "Split source into multiple files in outDir, with each filename determined\n"
 "by values from a column of whitespace-separated input in source.\n"
 /* "tail -n +N" is the portable spelling; the older "tail +N" form is taken
  * as a file name by current tail implementations. */
 "If source begins with a header, you should pipe \"tail -n +N source\" to this\n"
 "program where N is number of header lines plus 1, or use some similar\n"
 "method to strip the header from the input.\n"
 );
 }
 
 /* Command line options recognized by this program and the kind of value each
  * one takes.  Passed to optionInit() in main().  The closing {NULL, 0} entry
  * presumably marks the end of the table. */
 static struct optionSpec options[] = {
     {"col", OPTION_INT},            /* -col=N */
     {"head", OPTION_STRING},        /* -head=file */
     {"tail", OPTION_STRING},        /* -tail=file */
     {"chromDirs", OPTION_BOOLEAN},  /* -chromDirs */
     {"ending", OPTION_STRING},      /* -ending=XXX */
     {"tab", OPTION_BOOLEAN},        /* -tab */
     {NULL, 0},
 };
 
 
 char *getFileName(char *baseName)
 /* Return a complete path given dir and basename (add chromDir if applicable 
  * and ending).  The result has the form outDirName/[chromDir/]baseName+ending.
  * With -chromDirs, chromDir is baseName cut at the first "_random" and with a
  * leading "chr" removed, and that subdirectory is created as a side effect.
  * The returned string is newly allocated; callers in this file release it
  * with freez. */
 {
 struct dyString *dy = dyStringNew(0);
 dyStringPrintf(dy, "%s/", outDirName);
 if (chromDirs)
     {
     char *trunc = cloneString(baseName);
     char *dirName = trunc;
     char *ptr = strstr(trunc, "_random");
     if (ptr != NULL)
         *ptr = '\0';
     /* Step past the "chr" prefix with a pointer instead of shifting the
      * string in place: strcpy() between overlapping buffers is undefined. */
     if (startsWith("chr", trunc))
         dirName += strlen("chr");
     dyStringPrintf(dy, "%s/", dirName);
     freeMem(trunc);
     makeDirs(dy->string);
     }
 dyStringPrintf(dy, "%s%s", baseName, ending);
 return dyStringCannibalize(&dy);
 }
 
 
 FILE *getFp(struct hash *chromFpHash, char *baseName)
 /* Return an open file pointer for file baseName. */
 {
 /* Assume that the input will usually contain contiguous chunks of rows 
  * destined for the same output file, so we won't waste too much time if
  * we close and open a file each time baseName changes.  Having only one 
  * open file pointer, instead of keeping all files open, prevents us from 
  * running into the operating system limit on open file pointers. */
 static char *prevBaseName = NULL;
 static int fileCount = 0;
 FILE *f = NULL;
 struct hashEl *hel = hashLookup(chromFpHash, baseName);
 struct hashEl *prevHel = NULL;
 
 if (hel == NULL)
     {
     /* Brand new baseName - close prevFile, open new file and store in hash. */
     char *outFileName = getFileName(baseName);
     if (prevBaseName != NULL)
 	{
 	prevHel = hashLookup(chromFpHash, prevBaseName);
 	if (prevHel != NULL)
 	    carefulClose((FILE **)&(prevHel->val));
 	}
     verbose(1, "Creating %s\n", outFileName);
     if (fileCount == 1001)
 	warn("Warning: greater than 1000 files have been created in %s",
 	     outDirName);
     f = mustOpen(outFileName, "w");
     if (headerText != NULL)
 	fprintf(f, "%s", headerText);
     hashAdd(chromFpHash, baseName, f);
     freez(&outFileName);
     fileCount++;
     }
 else if (hel->val == NULL)
     {
     /* File has been opened before but is closed -- close prevFile, append. */
     char *outFileName = getFileName(baseName);
     if (prevBaseName != NULL)
 	{
 	prevHel = hashLookup(chromFpHash, prevBaseName);
 	if (prevHel != NULL)
 	    carefulClose((FILE **)&(prevHel->val));
 	}
     hel->val = f = mustOpen(outFileName, "a");
     freez(&outFileName);
     }
 else if (!sameString(baseName, prevBaseName))
     {
     errAbort("program error: a cached file pointer is open but baseName (%s)"
 	     " != prevBaseName (%s)", baseName, prevBaseName);
     }
 else
     {
     /* Write to open file pointer. */
     f = hel->val;
     }
 
 if (prevBaseName == NULL || !sameString(baseName, prevBaseName))
     {
     freez(&prevBaseName);
     prevBaseName = cloneString(baseName);
     }
 
 return f;
 }
 
 
 void addTailAndClose(struct hashEl *hel)
 /* Given an element of chromFpHash, add tailerText if specified (which may
  * require reopening the file) and close the file pointer.  hel->name is the
  * base name the output file was created under; hel->val is its FILE pointer,
  * or NULL if the file has already been closed. */
 {
 if (tailerText != NULL)
     {
     FILE *f = (FILE *)(hel->val);
     if (f == NULL)
         {
         /* The file was closed earlier; reopen it in append mode so the
          * tail lands after the rows already written. */
         char *outFileName = getFileName(hel->name);
         hel->val = f = mustOpen(outFileName, "a");
         /* The path is only needed for the open call. */
         freez(&outFileName);
         }
     fprintf(f, "%s", tailerText);
     }
 carefulClose((FILE **)&(hel->val));
 }
 
 void closeFiles(struct hash *chromFpHash)
 /* Apply addTailAndClose to all items in chromFpHash, so that every output
  * file gets tailerText (when one was given) and ends up closed. */
 {
 hashTraverseEls(chromFpHash, addTailAndClose);
 }
 
 
 void splitFileByColumn(char *inFileName)
 /* splitFileByColumn - Split text input into files named by column value. 
  * Reads inFileName line by line, splits each line into words (on tabs with
  * -tab, otherwise with chopLine) and writes the unmodified line to the
  * output file chosen by the value in column col.  Aborts on any line that
  * has fewer than col columns.  Sets the global ending if it needs a default
  * or a leading dot, and creates outDirName before writing. */
 {
 struct lineFile *lf = lineFileOpen(inFileName, TRUE);
 struct hash *chromFpHash = hashNew(10);
 char *line = NULL;
 
 if (ending == NULL)
     {
     /* Default suffix: everything from the first '.' in the file-name part
      * of the path.  Starting the search at the last '/' keeps dots in
      * directory names from being picked up. */
     char *lastSlash = strrchr(inFileName, '/');
     char *dot = strchr((lastSlash != NULL) ? lastSlash : inFileName, '.');
     ending = (dot != NULL) ? dot : "";
     }
 else if (ending[0] != '.')
     {
     /* If -ending is given without initial ., prepend .: 
      * Formatting into a dyString means no buffer is sized by hand. */
     struct dyString *dotEnding = dyStringNew(0);
     dyStringPrintf(dotEnding, ".%s", ending);
     ending = dyStringCannibalize(&dotEnding);
     }
 
 makeDirs(outDirName);
 while (lineFileNext(lf, &line, NULL))
     {
     /* Chop a copy so that the original line can be written out intact. */
     char *lineClone = cloneString(line);
     char *words[MAX_COL_OFFSET+1];
     int wordCount = 0;
     char *colVal = NULL;
     FILE *f = NULL;
     
     if (tab)
         wordCount = chopTabs(lineClone, words);
     else
         wordCount = chopLine(lineClone, words);
     if (wordCount < col)
         errAbort("Too few columns line %d of %s (%d, need at least %d)",
                  lf->lineIx, lf->fileName, wordCount, col);
 
     colVal = words[col-1];
     f = getFp(chromFpHash, colVal);
     fprintf(f, "%s\n", line);
     freeMem(lineClone);
     }
 lineFileClose(&lf);
 closeFiles(chromFpHash);
 }
 
 int main(int argc, char *argv[])
 /* Process command line.  Expects exactly two positional arguments after
  * option parsing: the source file and the output directory.  Returns 0 when
  * the split completes. */
 {
 char *headFileName = NULL, *tailFileName = NULL;
 optionInit(&argc, argv, options);
 /* Check the argument count before touching any files, so that a bad
  * invocation shows the usage message rather than a file error. */
 if (argc != 3)
     usage();
 col = optionInt("col", col);
 /* col is used as the 1-based index words[col-1], so anything below 1 would
  * index in front of the array. */
 if (col < 1)
     errAbort("-col must be at least 1 (got %d)", col);
 if (col > MAX_COL_OFFSET)
     errAbort("Sorry, max -col offset currently supported is %d",
              MAX_COL_OFFSET);
 headFileName = optionVal("head", headFileName);
 tailFileName = optionVal("tail", tailFileName);
 chromDirs = optionExists("chromDirs");
 ending = optionVal("ending", ending);
 tab = optionExists("tab");
 if (headFileName != NULL)
     {
     readInGulp(headFileName, &headerText, NULL);
     }
 if (tailFileName != NULL)
     {
     readInGulp(tailFileName, &tailerText, NULL);
     }
 outDirName = argv[2];
 splitFileByColumn(argv[1]);
 return 0;
 }