src/hg/ratStuff/mafsInRegion/mafsInRegion.c 1.7

1.7 2009/07/25 08:32:16 markd
added option to keep initial/training gaps in alignment'
Index: src/hg/ratStuff/mafsInRegion/mafsInRegion.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/ratStuff/mafsInRegion/mafsInRegion.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 1000000 -r1.6 -r1.7
--- src/hg/ratStuff/mafsInRegion/mafsInRegion.c	22 Jul 2009 20:23:27 -0000	1.6
+++ src/hg/ratStuff/mafsInRegion/mafsInRegion.c	25 Jul 2009 08:32:16 -0000	1.7
@@ -1,202 +1,206 @@
 /* Split MAF file based on positions in a bed file */
 
 #include "common.h"
 #include "portable.h"
 #include "hash.h"
 #include "options.h"
 #include "sqlNum.h"
 #include "maf.h"
 #include "bed.h"
 
 static char const rcsid[] = "$Id$";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "mafsInRegion - Extract MAFS in a genomic region\n"
   "usage:\n"
   "    mafsInRegion regions.bed out.maf|outDir in.maf(s)\n"
   "options:\n"
   "    -outDir - output separate files named by bed name field to outDir\n"
+  "    -keepInitialGaps - keep alignment columns at the beginning and of a block that are gapped in all species\n"
   );
 }
 
 static struct optionSpec options[] = {
     {"outDir", OPTION_BOOLEAN},
+    {"keepInitialGaps", OPTION_BOOLEAN},
     {NULL, 0},
 };
 
 boolean outDir = FALSE;
+boolean keepInitialGaps = FALSE;
 char *dir = NULL;
 char *scoring = NULL;
 
 struct hash *loadRegions(char *file)
 /* load regions into a hash of lists by chrom */
 {
 struct bed *bed = NULL, *bedList = NULL, *nextBed = NULL, *temp = NULL;
 struct hash *regionHash = newHash(6);
 struct bed *regions;
 
 regions = bedLoadNAll(file, outDir ? 4 : 3);
 /* order by chrom, start */
 slSort(&regions, bedCmp);
 verbose(2, "found %d regions\n", slCount(regions));
 bedList = regions;
 for (bed = regions; bed != NULL; bed = nextBed)
     {
     verbose(3, "region %s:%d-%d\n", bed->chrom, bed->chromStart+1, bed->chromEnd);
     nextBed = bed->next;
     if ((bed->next == NULL) || (differentString(bed->chrom,bed->next->chrom)))
 	{
 	temp = bed->next;
 	bed->next = NULL;
 	hashAdd(regionHash, bed->chrom, bedList);
 	verbose(2, "just added %d regions on %s\n", slCount(bedList), bed->chrom);
 	bedList = temp;
 	}
     }
 return regionHash;
 }
 
 char *chromFromSrc(char *src)
 /* get chrom name from <db>.<chrom> */
 {
 char *p;
 if ((p = strchr(src, '.')) == NULL)
     errAbort("Can't find chrom in MAF component src: %s\n", src);
 return ++p;
 }
 
 FILE *startOutFile(char *outFile)
 {
 static FILE *f = NULL;
 
 f = mustOpen(outFile, "w");
 verbose(3, "creating %s\n", outFile);
 mafWriteStart(f, scoring);
 return f;
 }
 
 void endOutFile(FILE *f)
 {
 mafWriteEnd(f);
 carefulClose(&f);
 }
 
 void extractMafs(char *file, FILE *f, struct hash *regionHash)
 /* extract MAFs in a file from regions specified in hash */
 {
 char *chrom = NULL;
 struct bed *bed = NULL;
 struct mafFile *mf = mafOpen(file);
 struct mafAli *maf = NULL;
 struct mafComp *mc;
 char path[256];
 
 verbose(1, "extracting from %s\n", file);
 maf = mafNext(mf);
 while (maf)
     {
     mc = maf->components;
     if (!chrom || differentString(chrom, chromFromSrc(mc->src)))
         chrom = cloneString(chromFromSrc(mc->src));         /* new chrom */
     bed = (struct bed *)hashFindVal(regionHash, chrom);
     if (!bed)
         {
         /* no regions on this chrom -- skip to next chrom */
         do
             mafAliFree(&maf);
         while (((maf = mafNext(mf)) != NULL) && sameString(chromFromSrc(maf->components->src), chrom));
         continue;  // start over with this maf
         }
     verbose(2, "region: %s:%d-%d\n", 
             bed->chrom, bed->chromStart+1, bed->chromEnd);
     if (outDir)
         {
         if (f)
             endOutFile(f);
         safef(path, sizeof (path), "%s/%s.maf", dir, bed->name);
         f = startOutFile(path);
         }
 
     /* skip mafs before region, stopping if chrom changes */
     while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) &&
         (mc->start + mc->size) <= bed->chromStart)
         {
         mafAliFree(&maf);
         maf = mafNext(mf);
         }
 
     /* extract all mafs and pieces of mafs in region */
     while (maf && (mc = maf->components) && sameString(chrom, chromFromSrc(mc->src)) &&
         (bed->chromStart < mc->start + mc->size && bed->chromEnd > mc->start))
         {
         int mafStart = mc->start;
         int mafEnd = mc->start + mc->size;
         struct mafAli *full = maf;
         if (mafStart < bed->chromStart || mafEnd > bed->chromEnd)
             {
             full = maf;
-            maf = mafSubset(full, mc->src, bed->chromStart, bed->chromEnd);
+            maf = mafSubsetE(full, mc->src, bed->chromStart, bed->chromEnd, keepInitialGaps);
             mc = maf->components;
             }
         verbose(2, "   %s:%d-%d\n", chrom, mc->start+1, mc->start + mc->size);
         mafWrite(f, maf);
         struct mafAli *nextMaf = (mafEnd > bed->chromEnd+1)
             ? mafSubset(full, mc->src, bed->chromEnd+1, mafEnd) : mafNext(mf);
         if (maf != full)
             mafAliFree(&maf);
         mafAliFree(&full);
         maf = nextMaf;
         }
     /* get next region */
     hashRemove(regionHash, bed->chrom);
     if (bed->next)
         hashAdd(regionHash, bed->chrom, bed->next);
     }
 mafFileFree(&mf);
 }
 
 void mafsInRegion(char *regionFile, char *out, int mafCount, char *mafFiles[])
 /* Extract MAFs in regions listed in regin file */
 {
 int i = 0;
 struct hash *bedHash = NULL;
 FILE *f = NULL;
 struct mafFile *mf = NULL;
 
 verbose(1, "Extracting from %d files to %s\n", mafCount, out);
 bedHash = loadRegions(regionFile);
 
 /* get scoring scheme */
 mf = mafOpen(mafFiles[0]);
 if (!mf)
     errAbort("can't open MAF file: %s\n", mafFiles[0]);
 scoring = cloneString(mf->scoring);
 mafFileFree(&mf);
 
 /* set up output dir */
 if (outDir)
     {
     dir = out;
     makeDir(dir);
     }
 else
     f = startOutFile(out);
 for (i = 0; i < mafCount; i++)
     extractMafs(mafFiles[i], f, bedHash);
 if (!outDir)
     endOutFile(f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 outDir = optionExists("outDir");
+keepInitialGaps = optionExists("keepInitialGaps");
 if (argc < 4)
     usage();
 mafsInRegion(argv[1], argv[2], argc-3, &argv[3]);
 return 0;
 }