src/utils/twoBitToFa/twoBitToFa.c 1.13

1.13 2009/08/26 03:09:14 kent
Added -bed option
Index: src/utils/twoBitToFa/twoBitToFa.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/twoBitToFa/twoBitToFa.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -b -B -U 1000000 -r1.12 -r1.13
--- src/utils/twoBitToFa/twoBitToFa.c	9 Jan 2009 10:14:33 -0000	1.12
+++ src/utils/twoBitToFa/twoBitToFa.c	26 Aug 2009 03:09:14 -0000	1.13
@@ -1,137 +1,202 @@
 /* twoBitToFa - Convert all or part of twoBit file to fasta. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dnaseq.h"
 #include "fa.h"
 #include "twoBit.h"
 #include "bPlusTree.h"
+#include "basicBed.h"
 
 static char const rcsid[] = "$Id$";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "twoBitToFa - Convert all or part of .2bit file to fasta\n"
   "usage:\n"
   "   twoBitToFa input.2bit output.fa\n"
   "options:\n"
   "   -seq=name - restrict this to just one sequence\n"
   "   -start=X  - start at given position in sequence (zero-based)\n"
   "   -end=X - end at given position in sequence (non-inclusive)\n"
   "   -seqList=file - file containing list of the desired sequence names \n"
   "                    in the format seqSpec[:start-end], e.g. chr1 or chr1:0-189\n"
   "                    where coordinates are half-open zero-based, i.e. [start,end)\n"
   "   -noMask - convert sequence to all upper case\n"
   "   -bpt=index.bpt - use bpt index instead of built in one\n"
+  "   -bed=input.bed - grab sequences specified by input.bed. Will exclude introns\n"
   "\n"
   "Sequence and range may also be specified as part of the input\n"
   "file name using the syntax:\n"
   "      /path/input.2bit:name\n"
   "   or\n"
   "      /path/input.2bit:name\n"
   "   or\n"
   "      /path/input.2bit:name:start-end\n"
   );
 }
 
 char *clSeq = NULL;	/* Command line sequence. */
 int clStart = 0;	/* Start from command line. */
 int clEnd = 0;		/* End from command line. */
 char *clSeqList = NULL; /* file containing list of seq names */
 bool noMask = FALSE;  /* convert seq to upper case */
 char *clBpt = NULL;	/* External index file. */
+char *clBed = NULL;	/* Bed file that specifies bounds of sequences. */
 
 static struct optionSpec options[] = {
    {"seq", OPTION_STRING},
    {"seqList", OPTION_STRING},
    {"start", OPTION_INT},
    {"end", OPTION_INT},
    {"noMask", OPTION_BOOLEAN},
    {"bpt", OPTION_STRING},
+   {"bed", OPTION_STRING},
    {NULL, 0},
 };
 
 void outputOne(struct twoBitFile *tbf, char *seqSpec, FILE *f, 
 	int start, int end)
 /* Output sequence. */
 {
 struct dnaSeq *seq = twoBitReadSeqFrag(tbf, seqSpec, start, end);
 if (noMask)
     toUpperN(seq->dna, seq->size);
 faWriteNext(f, seq->name, seq->dna, seq->size);
 dnaSeqFree(&seq);
 }
 
 static void processAllSeqs(struct twoBitFile *tbf, FILE *outFile)
 /* get all sequences in a file */
 {
 struct twoBitIndex *index;
 for (index = tbf->indexList; index != NULL; index = index->next)
     outputOne(tbf, index->name, outFile, 0, 0);
 }
 
 static void processSeqSpecs(struct twoBitFile *tbf, struct twoBitSeqSpec *tbss,
                             FILE *outFile)
 /* process list of twoBitSeqSpec objects */
 {
 struct twoBitSeqSpec *s;
 for (s = tbss; s != NULL; s = s->next)
     outputOne(tbf, s->name, outFile, s->start, s->end);
 }
 
+struct dnaSeq *twoBitAndBedToSeq(struct twoBitFile *tbf, struct bed *bed)
+/* Get sequence defined by bed.  Exclude introns. */
+{
+struct dnaSeq *seq;
+if (bed->blockCount <= 1)
+    {
+    seq = twoBitReadSeqFrag(tbf, bed->chrom, bed->chromStart, bed->chromEnd);
+    freeMem(seq->name);
+    seq->name = cloneString(bed->name);
+    }
+else
+    {
+    int totalBlockSize = bedTotalBlockSize(bed);
+    AllocVar(seq);
+    seq->name = cloneString(bed->name);
+    seq->dna = needMem(totalBlockSize+1);
+    seq->size = totalBlockSize;
+    int i;
+    int seqOffset = 0;
+    for (i=0; i<bed->blockCount; ++i)
+        {
+	int exonSize = bed->blockSizes[i];
+	int exonStart = bed->chromStart + bed->chromStarts[i];
+	struct dnaSeq *exon = twoBitReadSeqFrag(tbf, bed->chrom, exonStart, exonStart+exonSize);
+	memcpy(seq->dna + seqOffset, exon->dna, exonSize);
+	seqOffset += exonSize;
+	dnaSeqFree(&exon);
+	}
+    }
+if (bed->strand[0] == '-')
+    reverseComplement(seq->dna, seq->size);
+return seq;
+}
+
+static void processSeqsFromBed(struct twoBitFile *tbf, char *bedFileName, FILE *outFile)
+/* Get sequences defined by beds.  Exclude introns. */
+{
+struct bed *bed, *bedList = bedLoadAll(bedFileName);
+for (bed = bedList; bed != NULL; bed = bed->next)
+    {
+    struct dnaSeq *seq = twoBitAndBedToSeq(tbf, bed);
+    faWriteNext(outFile, seq->name, seq->dna, seq->size);
+    dnaSeqFree(&seq);
+    }
+}
+
 void twoBitToFa(char *inName, char *outName)
 /* twoBitToFa - Convert all or part of twoBit file to fasta. */
 {
 struct twoBitFile *tbf;
 FILE *outFile = mustOpen(outName, "w");
 struct twoBitSpec *tbs;
 
 if (clSeq != NULL)
     {
     char seqSpec[2*PATH_LEN];
     if (clEnd > clStart)
         safef(seqSpec, sizeof(seqSpec), "%s:%s:%d-%d", inName, clSeq, clStart, clEnd);
     else
         safef(seqSpec, sizeof(seqSpec), "%s:%s", inName, clSeq);
     tbs = twoBitSpecNew(seqSpec);
     }
 else if (clSeqList != NULL)
     tbs = twoBitSpecNewFile(inName, clSeqList);
 else
     tbs = twoBitSpecNew(inName);
 if (tbs->seqs != NULL && clBpt != NULL)
     tbf = twoBitOpenExternalBptIndex(tbs->fileName, clBpt);
 else
     tbf = twoBitOpen(tbs->fileName);
-if (tbs->seqs == NULL)
-    processAllSeqs(tbf, outFile);
+if (clBed != NULL)
+    {
+    processSeqsFromBed(tbf, clBed, outFile);
+    }
 else
+    {
+    if (tbs->seqs == NULL)
+	processAllSeqs(tbf, outFile);
+    else
     processSeqSpecs(tbf, tbs->seqs, outFile);
+    }
 twoBitSpecFree(&tbs);
 carefulClose(&outFile);
 twoBitClose(&tbf);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 clSeq = optionVal("seq", clSeq);
 clStart = optionInt("start", clStart);
 clEnd = optionInt("end", clEnd);
 clSeqList = optionVal("seqList", clSeqList);
 clBpt = optionVal("bpt", clBpt);
+clBed = optionVal("bed", clBed);
+if (clBed != NULL)
+    {
+    if (clSeqList != NULL)
+	errAbort("Can only have seqList or bed options, not both.");
+    if (clSeq != NULL)
+        errAbort("Can only have seq or bed options, not both.");
+    }
 if ((clStart > clEnd) && (clSeq == NULL))
     errAbort("must specify -seq with -start and -end");
 if ((clSeq != NULL) && (clSeqList != NULL))
     errAbort("can't specify both -seq and -seqList");
 noMask = optionExists("noMask");
 dnaUtilOpen();
 twoBitToFa(argv[1], argv[2]);
 return 0;
 }