src/hg/ratStuff/mafSplit/mafSplit.c 1.7

1.7 2010/05/21 01:24:24 galt
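Add the -useFullSequenceName option to mafSplit (for use with -byTarget) so that it can simply split by chrom/scaffold, using the target sequence name, rather than an auto-incremented number, in each output filename.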
Index: src/hg/ratStuff/mafSplit/mafSplit.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/ratStuff/mafSplit/mafSplit.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 4 -r1.6 -r1.7
--- src/hg/ratStuff/mafSplit/mafSplit.c	17 Mar 2010 00:42:39 -0000	1.6
+++ src/hg/ratStuff/mafSplit/mafSplit.c	21 May 2010 01:24:24 -0000	1.7
@@ -30,8 +30,12 @@
   "                     Instead of auto-incrementing an integer to determine\n"
   "                     output filename, expect each target sequence name to\n"
   "                     end with a unique number and use that number as the\n"
   "                     integer to tack onto outRoot.\n"
+  "   -useFullSequenceName  For use only with -byTarget.\n"
+  "                     Instead of auto-incrementing an integer to determine\n"
+  "                     output filename, use the target sequence name\n"
+  "                     to tack onto outRoot.\n"
   "   -useHashedName=N  For use only with -byTarget.\n"
   "                     Instead of auto-incrementing an integer or requiring\n"
   "                     a unique number in the sequence name, use a hash\n"
   "                     function on the sequence name to compute an N-bit\n"
@@ -49,16 +53,18 @@
 static struct optionSpec options[] = {
    {"byTarget", OPTION_BOOLEAN},
    {"outDirDepth", OPTION_INT},
    {"useSequenceName", OPTION_BOOLEAN},
+   {"useFullSequenceName", OPTION_BOOLEAN},
    {"useHashedName", OPTION_INT},
    {NULL, 0},
 };
 
 /* Option variables */
 static boolean byTarget = FALSE;
 static int outDirDepth = 0;
 static boolean useSequenceName = FALSE;
+static boolean useFullSequenceName = FALSE;
 static int hashedNameBits = 0;
 
 
 static int numberFromName(char *name)
@@ -79,9 +85,9 @@
 unsigned mask = (1 << hashedNameBits) - 1;
 return hashedName & mask;
 }
 
-static char *mkOutPath(char *outRootDir, char *outRootFile, int seqNum)
+static char *mkOutPath(char *outRootDir, char *outRootFile, int seqNum, char *target)
 /* formulate pathname, using seqNum if outDirDepth > 0 */
 {
 struct dyString *dy = dyStringNew(1024);
 if (outDirDepth > 0)
@@ -99,12 +105,18 @@
 		     seqNum, seqNumDownshifted, seqNumMasked);
 	dyStringPrintf(dy, "%d/", seqNumMasked);
 	makeDir(dy->string);
 	}
+    if (target)
+	dyStringPrintf(dy, "%s%s.maf", outRootFile, target);
+    else
     dyStringPrintf(dy, "%s%05d.maf", outRootFile, seqNum);
     }
 else
     {
+    if (target)
+	dyStringPrintf(dy, "%s/%s%s.maf", outRootDir, outRootFile, target);
+    else
     dyStringPrintf(dy, "%s/%s%03d.maf", outRootDir, outRootFile, seqNum);
     }
 return dyStringCannibalize(&dy);
 }
@@ -137,9 +149,9 @@
 	    {
 	    /* Hash paths by themselves instead of by target, because 
 	     * we may end up reusing a path for several targets. */
 	    int tHashed = numberFromHashedName(targetName, hashedNameBits);
-	    path = mkOutPath(outRootDir, outRootFile, tHashed);
+	    path = mkOutPath(outRootDir, outRootFile, tHashed, NULL);
 	    path = (char *)hashFindVal(pathHash, path);
 	    }
 	else
 	    path = (char *)hashFindVal(pathHash, targetName);
@@ -156,9 +168,20 @@
 	    if (useSequenceName)
 		seqNum = numberFromName(targetName);
 	    else if (hashedNameBits > 0)
 		seqNum = numberFromHashedName(targetName, hashedNameBits);
-	    path = mkOutPath(outRootDir, outRootFile, seqNum);
+	    if (useFullSequenceName)
+		{
+		/* skip over db. prefix if any */
+		char *target = strchr(targetName,'.');
+		if (target)
+		    ++target;
+		else
+		    target = targetName;
+		path = mkOutPath(outRootDir, outRootFile, seqNum, target);
+		}
+	    else
+		path = mkOutPath(outRootDir, outRootFile, seqNum, NULL);
 	    verbose(3, "Opening path %s for writing and adding it to hash "
 		    "for %s\n", path, targetName);
 	    f = mustOpen(path, "w");
 	    fprintf(f, "##maf version=1 scoring=blastz\n");
@@ -329,15 +352,19 @@
 optionInit(&argc, argv, options);
 byTarget = optionExists("byTarget");
 outDirDepth = optionInt("outDirDepth", outDirDepth);
 useSequenceName = optionExists("useSequenceName");
+useFullSequenceName = optionExists("useFullSequenceName");
 hashedNameBits = optionInt("useHashedName", hashedNameBits);
 if (outDirDepth > 0 && !byTarget)
     errAbort("-outDirDepth=N can be specified only when -byTarget is "
 	     "specified.");
 if (useSequenceName && !byTarget)
     errAbort("-useSequenceName can be specified only when -byTarget is "
 	     "specified.");
+if (useFullSequenceName && !byTarget)
+    errAbort("-useFullSequenceName can be specified only when -byTarget is "
+	     "specified.");
 if (hashedNameBits > 0 && !byTarget)
     errAbort("-useHashedName can be specified only when -byTarget is "
 	     "specified.");
 if (hashedNameBits > 0 && useSequenceName)