src/utils/twoBitToFa/twoBitToFa.c 1.13

1.13 2009/08/26 03:09:14 kent
Added -bed option
Index: src/utils/twoBitToFa/twoBitToFa.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/twoBitToFa/twoBitToFa.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -b -B -U 4 -r1.12 -r1.13
--- src/utils/twoBitToFa/twoBitToFa.c	9 Jan 2009 10:14:33 -0000	1.12
+++ src/utils/twoBitToFa/twoBitToFa.c	26 Aug 2009 03:09:14 -0000	1.13
@@ -6,8 +6,9 @@
 #include "dnaseq.h"
 #include "fa.h"
 #include "twoBit.h"
 #include "bPlusTree.h"
+#include "basicBed.h"
 
 static char const rcsid[] = "$Id$";
 
 void usage()
@@ -25,8 +26,9 @@
   "                    in the format seqSpec[:start-end], e.g. chr1 or chr1:0-189\n"
   "                    where coordinates are half-open zero-based, i.e. [start,end)\n"
   "   -noMask - convert sequence to all upper case\n"
   "   -bpt=index.bpt - use bpt index instead of built in one\n"
+  "   -bed=input.bed - grab sequences specified by input.bed. Will exclude introns\n"
   "\n"
   "Sequence and range may also be specified as part of the input\n"
   "file name using the syntax:\n"
   "      /path/input.2bit:name\n"
@@ -42,16 +44,18 @@
 int clEnd = 0;		/* End from command line. */
 char *clSeqList = NULL; /* file containing list of seq names */
 bool noMask = FALSE;  /* convert seq to upper case */
 char *clBpt = NULL;	/* External index file. */
+char *clBed = NULL;	/* Bed file that specifies bounds of sequences. */
 
 static struct optionSpec options[] = {
    {"seq", OPTION_STRING},
    {"seqList", OPTION_STRING},
    {"start", OPTION_INT},
    {"end", OPTION_INT},
    {"noMask", OPTION_BOOLEAN},
    {"bpt", OPTION_STRING},
+   {"bed", OPTION_STRING},
    {NULL, 0},
 };
 
 void outputOne(struct twoBitFile *tbf, char *seqSpec, FILE *f, 
@@ -81,8 +85,54 @@
 for (s = tbss; s != NULL; s = s->next)
     outputOne(tbf, s->name, outFile, s->start, s->end);
 }
 
+struct dnaSeq *twoBitAndBedToSeq(struct twoBitFile *tbf, struct bed *bed)
+/* Get sequence defined by bed.  Exclude introns. */
+{
+struct dnaSeq *seq;
+if (bed->blockCount <= 1)
+    {
+    seq = twoBitReadSeqFrag(tbf, bed->chrom, bed->chromStart, bed->chromEnd);
+    freeMem(seq->name);
+    seq->name = cloneString(bed->name);
+    }
+else
+    {
+    int totalBlockSize = bedTotalBlockSize(bed);
+    AllocVar(seq);
+    seq->name = cloneString(bed->name);
+    seq->dna = needMem(totalBlockSize+1);
+    seq->size = totalBlockSize;
+    int i;
+    int seqOffset = 0;
+    for (i=0; i<bed->blockCount; ++i)
+        {
+	int exonSize = bed->blockSizes[i];
+	int exonStart = bed->chromStart + bed->chromStarts[i];
+	struct dnaSeq *exon = twoBitReadSeqFrag(tbf, bed->chrom, exonStart, exonStart+exonSize);
+	memcpy(seq->dna + seqOffset, exon->dna, exonSize);
+	seqOffset += exonSize;
+	dnaSeqFree(&exon);
+	}
+    }
+if (bed->strand[0] == '-')
+    reverseComplement(seq->dna, seq->size);
+return seq;
+}
+
+static void processSeqsFromBed(struct twoBitFile *tbf, char *bedFileName, FILE *outFile)
+/* Get sequences defined by beds.  Exclude introns. */
+{
+struct bed *bed, *bedList = bedLoadAll(bedFileName);
+for (bed = bedList; bed != NULL; bed = bed->next)
+    {
+    struct dnaSeq *seq = twoBitAndBedToSeq(tbf, bed);
+    faWriteNext(outFile, seq->name, seq->dna, seq->size);
+    dnaSeqFree(&seq);
+    }
+}
+
 void twoBitToFa(char *inName, char *outName)
 /* twoBitToFa - Convert all or part of twoBit file to fasta. */
 {
 struct twoBitFile *tbf;
@@ -105,12 +155,19 @@
 if (tbs->seqs != NULL && clBpt != NULL)
     tbf = twoBitOpenExternalBptIndex(tbs->fileName, clBpt);
 else
     tbf = twoBitOpen(tbs->fileName);
-if (tbs->seqs == NULL)
-    processAllSeqs(tbf, outFile);
+if (clBed != NULL)
+    {
+    processSeqsFromBed(tbf, clBed, outFile);
+    }
 else
+    {
+    if (tbs->seqs == NULL)
+	processAllSeqs(tbf, outFile);
+    else
     processSeqSpecs(tbf, tbs->seqs, outFile);
+    }
 twoBitSpecFree(&tbs);
 carefulClose(&outFile);
 twoBitClose(&tbf);
 }
@@ -125,8 +182,16 @@
 clStart = optionInt("start", clStart);
 clEnd = optionInt("end", clEnd);
 clSeqList = optionVal("seqList", clSeqList);
 clBpt = optionVal("bpt", clBpt);
+clBed = optionVal("bed", clBed);
+if (clBed != NULL)
+    {
+    if (clSeqList != NULL)
+	errAbort("Can only have seqList or bed options, not both.");
+    if (clSeq != NULL)
+        errAbort("Can only have seq or bed options, not both.");
+    }
 if ((clStart > clEnd) && (clSeq == NULL))
     errAbort("must specify -seq with -start and -end");
 if ((clSeq != NULL) && (clSeqList != NULL))
     errAbort("can't specify both -seq and -seqList");