src/hg/ratStuff/mafsInRegion/mafsInRegion.c 1.7
1.7 2009/07/25 08:32:16 markd
added option to keep initial/training gaps in alignment'
Index: src/hg/ratStuff/mafsInRegion/mafsInRegion.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/ratStuff/mafsInRegion/mafsInRegion.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 1000000 -r1.6 -r1.7
--- src/hg/ratStuff/mafsInRegion/mafsInRegion.c 22 Jul 2009 20:23:27 -0000 1.6
+++ src/hg/ratStuff/mafsInRegion/mafsInRegion.c 25 Jul 2009 08:32:16 -0000 1.7
@@ -1,202 +1,206 @@
/* Split MAF file based on positions in a bed file */
#include "common.h"
#include "portable.h"
#include "hash.h"
#include "options.h"
#include "sqlNum.h"
#include "maf.h"
#include "bed.h"
static char const rcsid[] = "$Id$";
void usage()
/* Explain usage and exit. */
{
errAbort(
"mafsInRegion - Extract MAFS in a genomic region\n"
"usage:\n"
" mafsInRegion regions.bed out.maf|outDir in.maf(s)\n"
"options:\n"
" -outDir - output separate files named by bed name field to outDir\n"
+ " -keepInitialGaps - keep alignment columns at the beginning and of a block that are gapped in all species\n"
);
}
static struct optionSpec options[] = {
{"outDir", OPTION_BOOLEAN},
+ {"keepInitialGaps", OPTION_BOOLEAN},
{NULL, 0},
};
boolean outDir = FALSE;
+boolean keepInitialGaps = FALSE;
char *dir = NULL;
char *scoring = NULL;
struct hash *loadRegions(char *file)
/* load regions into a hash of lists by chrom */
{
struct bed *bed = NULL, *bedList = NULL, *nextBed = NULL, *temp = NULL;
struct hash *regionHash = newHash(6);
struct bed *regions;
regions = bedLoadNAll(file, outDir ? 4 : 3);
/* order by chrom, start */
slSort(®ions, bedCmp);
verbose(2, "found %d regions\n", slCount(regions));
bedList = regions;
for (bed = regions; bed != NULL; bed = nextBed)
{
verbose(3, "region %s:%d-%d\n", bed->chrom, bed->chromStart+1, bed->chromEnd);
nextBed = bed->next;
if ((bed->next == NULL) || (differentString(bed->chrom,bed->next->chrom)))
{
temp = bed->next;
bed->next = NULL;
hashAdd(regionHash, bed->chrom, bedList);
verbose(2, "just added %d regions on %s\n", slCount(bedList), bed->chrom);
bedList = temp;
}
}
return regionHash;
}
char *chromFromSrc(char *src)
/* get chrom name from <db>.<chrom> */
{
char *p;
if ((p = strchr(src, '.')) == NULL)
errAbort("Can't find chrom in MAF component src: %s\n", src);
return ++p;
}
FILE *startOutFile(char *outFile)
{
static FILE *f = NULL;
f = mustOpen(outFile, "w");
verbose(3, "creating %s\n", outFile);
mafWriteStart(f, scoring);
return f;
}
void endOutFile(FILE *f)
{
mafWriteEnd(f);
carefulClose(&f);
}
void extractMafs(char *file, FILE *f, struct hash *regionHash)
/* extract MAFs in a file from regions specified in hash */
{
char *chrom = NULL;
struct bed *bed = NULL;
struct mafFile *mf = mafOpen(file);
struct mafAli *maf = NULL;
struct mafComp *mc;
char path[256];
verbose(1, "extracting from %s\n", file);
maf = mafNext(mf);
while (maf)
{
mc = maf->components;
if (!chrom || differentString(chrom, chromFromSrc(mc->src)))
chrom = cloneString(chromFromSrc(mc->src)); /* new chrom */
bed = (struct bed *)hashFindVal(regionHash, chrom);
if (!bed)
{
/* no regions on this chrom -- skip to next chrom */
do
mafAliFree(&maf);
while (((maf = mafNext(mf)) != NULL) && sameString(chromFromSrc(maf->components->src), chrom));
continue; // start over with this maf
}
verbose(2, "region: %s:%d-%d\n",
bed->chrom, bed->chromStart+1, bed->chromEnd);
if (outDir)
{
if (f)
endOutFile(f);
safef(path, sizeof (path), "%s/%s.maf", dir, bed->name);
f = startOutFile(path);
}
/* skip mafs before region, stopping if chrom changes */
while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) &&
(mc->start + mc->size) <= bed->chromStart)
{
mafAliFree(&maf);
maf = mafNext(mf);
}
/* extract all mafs and pieces of mafs in region */
while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) &&
(bed->chromStart < mc->start + mc->size && bed->chromEnd > mc->start))
{
int mafStart = mc->start;
int mafEnd = mc->start + mc->size;
struct mafAli *full = maf;
if (mafStart < bed->chromStart || mafEnd > bed->chromEnd)
{
full = maf;
- maf = mafSubset(full, mc->src, bed->chromStart, bed->chromEnd);
+ maf = mafSubsetE(full, mc->src, bed->chromStart, bed->chromEnd, keepInitialGaps);
mc = maf->components;
}
verbose(2, " %s:%d-%d\n", chrom, mc->start+1, mc->start + mc->size);
mafWrite(f, maf);
struct mafAli *nextMaf = (mafEnd > bed->chromEnd+1)
? mafSubset(full, mc->src, bed->chromEnd+1, mafEnd) : mafNext(mf);
if (maf != full)
mafAliFree(&maf);
mafAliFree(&full);
maf = nextMaf;
}
/* get next region */
hashRemove(regionHash, bed->chrom);
if (bed->next)
hashAdd(regionHash, bed->chrom, bed->next);
}
mafFileFree(&mf);
}
void mafsInRegion(char *regionFile, char *out, int mafCount, char *mafFiles[])
/* Extract MAFs in regions listed in regin file */
{
int i = 0;
struct hash *bedHash = NULL;
FILE *f = NULL;
struct mafFile *mf = NULL;
verbose(1, "Extracting from %d files to %s\n", mafCount, out);
bedHash = loadRegions(regionFile);
/* get scoring scheme */
mf = mafOpen(mafFiles[0]);
if (!mf)
errAbort("can't open MAF file: %s\n", mafFiles[0]);
scoring = cloneString(mf->scoring);
mafFileFree(&mf);
/* set up output dir */
if (outDir)
{
dir = out;
makeDir(dir);
}
else
f = startOutFile(out);
for (i = 0; i < mafCount; i++)
extractMafs(mafFiles[i], f, bedHash);
if (!outDir)
endOutFile(f);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
outDir = optionExists("outDir");
+keepInitialGaps = optionExists("keepInitialGaps");
if (argc < 4)
usage();
mafsInRegion(argv[1], argv[2], argc-3, &argv[3]);
return 0;
}