src/hg/ratStuff/mafSplit/mafSplit.c 1.7
1.7 2010/05/21 01:24:24 galt
Adding the -useFullSequenceName option to mafSplit so that, with -byTarget, it can simply split by chrom/scaffold and name each output file after the target sequence name.
Index: src/hg/ratStuff/mafSplit/mafSplit.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/ratStuff/mafSplit/mafSplit.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 4 -r1.6 -r1.7
--- src/hg/ratStuff/mafSplit/mafSplit.c 17 Mar 2010 00:42:39 -0000 1.6
+++ src/hg/ratStuff/mafSplit/mafSplit.c 21 May 2010 01:24:24 -0000 1.7
@@ -30,8 +30,12 @@
" Instead of auto-incrementing an integer to determine\n"
" output filename, expect each target sequence name to\n"
" end with a unique number and use that number as the\n"
" integer to tack onto outRoot.\n"
+ " -useFullSequenceName For use only with -byTarget.\n"
+ " Instead of auto-incrementing an integer to determine\n"
+ " output filename, use the target sequence name\n"
+ " to tack onto outRoot.\n"
" -useHashedName=N For use only with -byTarget.\n"
" Instead of auto-incrementing an integer or requiring\n"
" a unique number in the sequence name, use a hash\n"
" function on the sequence name to compute an N-bit\n"
@@ -49,16 +53,18 @@
static struct optionSpec options[] = {
{"byTarget", OPTION_BOOLEAN},
{"outDirDepth", OPTION_INT},
{"useSequenceName", OPTION_BOOLEAN},
+ {"useFullSequenceName", OPTION_BOOLEAN},
{"useHashedName", OPTION_INT},
{NULL, 0},
};
/* Option variables */
static boolean byTarget = FALSE;
static int outDirDepth = 0;
static boolean useSequenceName = FALSE;
+static boolean useFullSequenceName = FALSE;
static int hashedNameBits = 0;
static int numberFromName(char *name)
@@ -79,9 +85,9 @@
unsigned mask = (1 << hashedNameBits) - 1;
return hashedName & mask;
}
-static char *mkOutPath(char *outRootDir, char *outRootFile, int seqNum)
+static char *mkOutPath(char *outRootDir, char *outRootFile, int seqNum, char *target)
/* formulate pathname, using seqNum if outDirDepth > 0 */
{
struct dyString *dy = dyStringNew(1024);
if (outDirDepth > 0)
@@ -99,12 +105,18 @@
seqNum, seqNumDownshifted, seqNumMasked);
dyStringPrintf(dy, "%d/", seqNumMasked);
makeDir(dy->string);
}
+ if (target)
+ dyStringPrintf(dy, "%s%s.maf", outRootFile, target);
+ else
dyStringPrintf(dy, "%s%05d.maf", outRootFile, seqNum);
}
else
{
+ if (target)
+ dyStringPrintf(dy, "%s/%s%s.maf", outRootDir, outRootFile, target);
+ else
dyStringPrintf(dy, "%s/%s%03d.maf", outRootDir, outRootFile, seqNum);
}
return dyStringCannibalize(&dy);
}
@@ -137,9 +149,9 @@
{
/* Hash paths by themselves instead of by target, because
* we may end up reusing a path for several targets. */
int tHashed = numberFromHashedName(targetName, hashedNameBits);
- path = mkOutPath(outRootDir, outRootFile, tHashed);
+ path = mkOutPath(outRootDir, outRootFile, tHashed, NULL);
path = (char *)hashFindVal(pathHash, path);
}
else
path = (char *)hashFindVal(pathHash, targetName);
@@ -156,9 +168,20 @@
if (useSequenceName)
seqNum = numberFromName(targetName);
else if (hashedNameBits > 0)
seqNum = numberFromHashedName(targetName, hashedNameBits);
- path = mkOutPath(outRootDir, outRootFile, seqNum);
+ if (useFullSequenceName)
+ {
+ /* skip over db. prefix if any */
+ char *target = strchr(targetName,'.');
+ if (target)
+ ++target;
+ else
+ target = targetName;
+ path = mkOutPath(outRootDir, outRootFile, seqNum, target);
+ }
+ else
+ path = mkOutPath(outRootDir, outRootFile, seqNum, NULL);
verbose(3, "Opening path %s for writing and adding it to hash "
"for %s\n", path, targetName);
f = mustOpen(path, "w");
fprintf(f, "##maf version=1 scoring=blastz\n");
@@ -329,15 +352,19 @@
optionInit(&argc, argv, options);
byTarget = optionExists("byTarget");
outDirDepth = optionInt("outDirDepth", outDirDepth);
useSequenceName = optionExists("useSequenceName");
+useFullSequenceName = optionExists("useFullSequenceName");
hashedNameBits = optionInt("useHashedName", hashedNameBits);
if (outDirDepth > 0 && !byTarget)
errAbort("-outDirDepth=N can be specified only when -byTarget is "
"specified.");
if (useSequenceName && !byTarget)
errAbort("-useSequenceName can be specified only when -byTarget is "
"specified.");
+if (useFullSequenceName && !byTarget)
+ errAbort("-useFullSequenceName can be specified only when -byTarget is "
+ "specified.");
if (hashedNameBits > 0 && !byTarget)
errAbort("-useHashedName can be specified only when -byTarget is "
"specified.");
if (hashedNameBits > 0 && useSequenceName)