917964e6471bfb6f8c021d6a38b36254855c4ce9 max Sun Sep 16 21:45:32 2012 -0700 added -bedPos option and a test for it diff --git src/utils/twoBitToFa/twoBitToFa.c src/utils/twoBitToFa/twoBitToFa.c index b9785ab..daf7925 100644 --- src/utils/twoBitToFa/twoBitToFa.c +++ src/utils/twoBitToFa/twoBitToFa.c @@ -1,201 +1,218 @@ /* twoBitToFa - Convert all or part of twoBit file to fasta. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dnaseq.h" #include "fa.h" #include "twoBit.h" #include "bPlusTree.h" #include "basicBed.h" void usage() /* Explain usage and exit. */ { errAbort( "twoBitToFa - Convert all or part of .2bit file to fasta\n" "usage:\n" " twoBitToFa input.2bit output.fa\n" "options:\n" " -seq=name - restrict this to just one sequence\n" " -start=X - start at given position in sequence (zero-based)\n" " -end=X - end at given position in sequence (non-inclusive)\n" " -seqList=file - file containing list of the desired sequence names \n" " in the format seqSpec[:start-end], e.g. chr1 or chr1:0-189\n" " where coordinates are half-open zero-based, i.e. [start,end)\n" " -noMask - convert sequence to all upper case\n" " -bpt=index.bpt - use bpt index instead of built in one\n" " -bed=input.bed - grab sequences specified by input.bed. Will exclude introns\n" + " -bedPos - use with -bed to put chrom:start-end as the fasta ID\n" "\n" "Sequence and range may also be specified as part of the input\n" "file name using the syntax:\n" " /path/input.2bit:name\n" " or\n" " /path/input.2bit:name\n" " or\n" " /path/input.2bit:name:start-end\n" ); } char *clSeq = NULL; /* Command line sequence. */ int clStart = 0; /* Start from command line. */ int clEnd = 0; /* End from command line. */ char *clSeqList = NULL; /* file containing list of seq names */ bool noMask = FALSE; /* convert seq to upper case */ char *clBpt = NULL; /* External index file. */ char *clBed = NULL; /* Bed file that specifies bounds of sequences. */ +bool clBedPos = FALSE; static struct optionSpec options[] = { {"seq", OPTION_STRING}, {"seqList", OPTION_STRING}, {"start", OPTION_INT}, {"end", OPTION_INT}, {"noMask", OPTION_BOOLEAN}, {"bpt", OPTION_STRING}, {"bed", OPTION_STRING}, + {"bedPos", OPTION_BOOLEAN}, {NULL, 0}, }; void outputOne(struct twoBitFile *tbf, char *seqSpec, FILE *f, int start, int end) /* Output sequence. */ { struct dnaSeq *seq = twoBitReadSeqFrag(tbf, seqSpec, start, end); if (noMask) toUpperN(seq->dna, seq->size); faWriteNext(f, seq->name, seq->dna, seq->size); dnaSeqFree(&seq); } static void processAllSeqs(struct twoBitFile *tbf, FILE *outFile) /* get all sequences in a file */ { struct twoBitIndex *index; for (index = tbf->indexList; index != NULL; index = index->next) outputOne(tbf, index->name, outFile, 0, 0); } static void processSeqSpecs(struct twoBitFile *tbf, struct twoBitSeqSpec *tbss, FILE *outFile) /* process list of twoBitSeqSpec objects */ { struct twoBitSeqSpec *s; for (s = tbss; s != NULL; s = s->next) outputOne(tbf, s->name, outFile, s->start, s->end); } struct dnaSeq *twoBitAndBedToSeq(struct twoBitFile *tbf, struct bed *bed) /* Get sequence defined by bed. Exclude introns. */ { struct dnaSeq *seq; if (bed->blockCount <= 1) { seq = twoBitReadSeqFrag(tbf, bed->chrom, bed->chromStart, bed->chromEnd); freeMem(seq->name); seq->name = cloneString(bed->name); } else { int totalBlockSize = bedTotalBlockSize(bed); AllocVar(seq); seq->name = cloneString(bed->name); seq->dna = needMem(totalBlockSize+1); seq->size = totalBlockSize; int i; int seqOffset = 0; for (i=0; iblockCount; ++i) { int exonSize = bed->blockSizes[i]; int exonStart = bed->chromStart + bed->chromStarts[i]; struct dnaSeq *exon = twoBitReadSeqFrag(tbf, bed->chrom, exonStart, exonStart+exonSize); memcpy(seq->dna + seqOffset, exon->dna, exonSize); seqOffset += exonSize; dnaSeqFree(&exon); } } if (bed->strand[0] == '-') reverseComplement(seq->dna, seq->size); return seq; } static void processSeqsFromBed(struct twoBitFile *tbf, char *bedFileName, FILE *outFile) /* Get sequences defined by beds. Exclude introns. */ { struct bed *bed, *bedList = bedLoadAll(bedFileName); for (bed = bedList; bed != NULL; bed = bed->next) { struct dnaSeq *seq = twoBitAndBedToSeq(tbf, bed); - faWriteNext(outFile, seq->name, seq->dna, seq->size); + char* seqName = NULL; + if (clBedPos) + { + char buf[1024]; + safef(buf, 1024, "%s:%d-%d", bed->chrom, bed->chromStart, bed->chromEnd); + seqName = buf; + } + else + seqName = seq->name; + faWriteNext(outFile, seqName, seq->dna, seq->size); dnaSeqFree(&seq); } } void twoBitToFa(char *inName, char *outName) /* twoBitToFa - Convert all or part of twoBit file to fasta. */ { struct twoBitFile *tbf; FILE *outFile = mustOpen(outName, "w"); struct twoBitSpec *tbs; if (clSeq != NULL) { char seqSpec[2*PATH_LEN]; if (clEnd > clStart) safef(seqSpec, sizeof(seqSpec), "%s:%s:%d-%d", inName, clSeq, clStart, clEnd); else safef(seqSpec, sizeof(seqSpec), "%s:%s", inName, clSeq); tbs = twoBitSpecNew(seqSpec); } else if (clSeqList != NULL) tbs = twoBitSpecNewFile(inName, clSeqList); else tbs = twoBitSpecNew(inName); if (tbs->seqs != NULL && clBpt != NULL) tbf = twoBitOpenExternalBptIndex(tbs->fileName, clBpt); else tbf = twoBitOpen(tbs->fileName); if (clBed != NULL) { processSeqsFromBed(tbf, clBed, outFile); } else { if (tbs->seqs == NULL) processAllSeqs(tbf, outFile); else processSeqSpecs(tbf, tbs->seqs, outFile); } twoBitSpecFree(&tbs); carefulClose(&outFile); twoBitClose(&tbf); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); clSeq = optionVal("seq", clSeq); clStart = optionInt("start", clStart); clEnd = optionInt("end", clEnd); clSeqList = optionVal("seqList", clSeqList); clBpt = optionVal("bpt", clBpt); clBed = optionVal("bed", clBed); +clBedPos = optionExists("bedPos"); +noMask = optionExists("noMask"); + +if (clBedPos && !clBed) + errAbort("the -bedPos option requires the -bed option"); if (clBed != NULL) { if (clSeqList != NULL) errAbort("Can only have seqList or bed options, not both."); if (clSeq != NULL) errAbort("Can only have seq or bed options, not both."); } if ((clStart > clEnd) && (clSeq == NULL)) errAbort("must specify -seq with -start and -end"); if ((clSeq != NULL) && (clSeqList != NULL)) errAbort("can't specify both -seq and -seqList"); -noMask = optionExists("noMask"); + dnaUtilOpen(); twoBitToFa(argv[1], argv[2]); return 0; }