src/hg/makeDb/hgLoadPsl/hgLoadPsl.c 1.34

1.34 2009/03/12 17:40:33 angie
Sort output by target coords unless -noSort is specified. Thanks MarkD for the idea to pipe output to sort!
Index: src/hg/makeDb/hgLoadPsl/hgLoadPsl.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/hgLoadPsl/hgLoadPsl.c,v
retrieving revision 1.33
retrieving revision 1.34
diff -b -B -U 4 -r1.33 -r1.34
--- src/hg/makeDb/hgLoadPsl/hgLoadPsl.c	3 Sep 2008 19:19:44 -0000	1.33
+++ src/hg/makeDb/hgLoadPsl/hgLoadPsl.c	12 Mar 2009 17:40:33 -0000	1.34
@@ -6,31 +6,35 @@
 #include "psl.h"
 #include "xAli.h"
 #include "hdb.h"
 #include "hgRelate.h"
+#include "pipeline.h"
 
 static char const rcsid[] = "$Id$";
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"noTNameIx", OPTION_BOOLEAN},
     {"tNameIx", OPTION_BOOLEAN},  /* default now, but allow for backw.compat.*/
     {"nobin", OPTION_BOOLEAN},
+    {"noBin", OPTION_BOOLEAN},
     {"xa", OPTION_BOOLEAN},
     {"fastLoad", OPTION_BOOLEAN},
     {"onServer", OPTION_BOOLEAN},   /* this is now the default, leave in for compat */
     {"clientLoad", OPTION_BOOLEAN},
     {"append", OPTION_BOOLEAN},
     {"keep", OPTION_BOOLEAN},
     {"table", OPTION_STRING},
+    {"noSort", OPTION_BOOLEAN},
     {NULL, 0}
 };
 
 unsigned pslCreateOpts = 0;
 unsigned pslLoadOpts = 0;
 boolean append = FALSE;
 boolean keep = FALSE;
 char *clTableName = NULL;
+boolean noSort = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
@@ -53,8 +57,9 @@
   "               is slower.  This can also load remotely\n"
   "   -append  Append data, don't drop tables before loading\n"
   "   -nobin Repress binning\n"
   "   -keep  Don't remove intermediate tab file/s after load\n"
+  "   -noSort  don't sort (you better be sorting before this)\n"
 );
 }
 
 boolean hasBinColumn(struct sqlConnection *conn, char* table)
@@ -105,42 +110,83 @@
     sqlRemakeTable(conn, table, sqlCmd);
 freez(&sqlCmd);
 }
 
+/* Sort output by target name, then numerically by target start.  The sort
+ * key fields shift by one when a leading bin column is included in the output. */
+static char *sortNoBinCmd[] = {"sort", "-k14,14", "-k16n,16n", NULL};
+static char **outPipeNoBin[] = {sortNoBinCmd, NULL};
+static char *sortBinCmd[] = {"sort", "-k15,15", "-k17n,17n", NULL};
+static char **outPipeBin[] = {sortBinCmd, NULL};
+
 void copyPslToTab(char *pslFile, char *tabFile)
 /* copy a single PSL to the tab file */
 {
 struct psl *psl;
 struct lineFile *lf = pslFileOpen(pslFile);
-FILE *tabFh = mustOpen(tabFile, "w");
+struct pipeline *pl = NULL;
+FILE *tabFh = NULL;
+if (noSort)
+    tabFh = mustOpen(tabFile, "w");
+else
+    {
+    if (pslCreateOpts & PSL_WITH_BIN)
+	pl = pipelineOpen(outPipeBin, pipelineWrite, tabFile, NULL);
+    else
+	pl = pipelineOpen(outPipeNoBin, pipelineWrite, tabFile, NULL);
+    tabFh = pipelineFile(pl);
+    }
 while ((psl = pslNext(lf)) != NULL)
     {
     if (pslCreateOpts & PSL_WITH_BIN)
         fprintf(tabFh, "%u\t", hFindBin(psl->tStart, psl->tEnd));
     pslTabOut(psl, tabFh);
     pslFree(&psl);
     }
 lineFileClose(&lf);
-carefulClose(&tabFh);
+if (noSort)
+    carefulClose(&tabFh);
+else
+    {
+    pipelineWait(pl);
+    pipelineFree(&pl);
+    }
 }
 
 void copyPslXaToTab(char *pslFile, char *tabFile)
 /* copy a single PSL XA to the tab file */
 {
 struct xAli *xa;
 char *row[23];
 struct lineFile *lf = lineFileOpen(pslFile, TRUE);
-FILE *tabFh = mustOpen(tabFile, "w");
+struct pipeline *pl = NULL;
+FILE *tabFh = NULL;
+if (noSort)
+    tabFh = mustOpen(tabFile, "w");
+else
+    {
+    if (pslCreateOpts & PSL_WITH_BIN)
+	pl = pipelineOpen(outPipeBin, pipelineWrite, tabFile, NULL);
+    else
+	pl = pipelineOpen(outPipeNoBin, pipelineWrite, tabFile, NULL);
+    tabFh = pipelineFile(pl);
+    }
 while (lineFileRow(lf, row))
     {
     xa = xAliLoad(row);
     if (pslCreateOpts & PSL_WITH_BIN)
         fprintf(tabFh, "%u\t", hFindBin(xa->tStart, xa->tEnd));
     xAliTabOut(xa, tabFh);
     xAliFree(&xa);
     }
-carefulClose(&tabFh);
 lineFileClose(&lf);
+if (noSort)
+    carefulClose(&tabFh);
+else
+    {
+    pipelineWait(pl);
+    pipelineFree(&pl);
+    }
 }
 
 void loadPslTable(char *database, struct sqlConnection *conn, char *pslFile)
 /* load one psl table */
@@ -171,9 +217,9 @@
 setupTable(database, conn, table);
 
 /* if a bin column is being added or if the input file is
  * compressed, we must copy to an intermediate tab file */
-indirectLoad = ((pslCreateOpts & PSL_WITH_BIN) != 0) || endsWith(pslFile, ".gz");
+indirectLoad = ((pslCreateOpts & PSL_WITH_BIN) != 0) || endsWith(pslFile, ".gz") || !noSort;
 
 if (indirectLoad)
     {
     tabFile = "psl.tab";
@@ -213,9 +259,9 @@
 {
 optionInit(&argc, argv, optionSpecs);
 if (! optionExists("noTNameIx"))
     pslCreateOpts |= PSL_TNAMEIX;
-if (!optionExists("nobin"))
+if (! (optionExists("nobin") || optionExists("noBin")))
     pslCreateOpts |= PSL_WITH_BIN;
 if (optionExists("xa"))
     pslCreateOpts |= PSL_XA_FORMAT;
 if (!optionExists("fastLoad"))
@@ -226,8 +272,9 @@
     pslLoadOpts |= SQL_TAB_FILE_ON_SERVER;
 clTableName = optionVal("table", NULL);
 append = optionExists("append");
 keep = optionExists("keep");
+noSort = optionExists("noSort");
 if (argc < 3)
     usage();
 if ((clTableName != NULL) && (argc > 3))
     errAbort("may only specify one psl file with -tableName");