c866d0e97f8b9a2a76593f56ebe5825c9714d7e3
max
  Wed Nov 22 07:15:31 2023 -0800
documenting twoBitToFa a little better, no redmine, ML ticket with Jacques van Helden

diff --git src/utils/twoBitToFa/twoBitToFa.c src/utils/twoBitToFa/twoBitToFa.c
index d66e74f..ec150b4 100644
--- src/utils/twoBitToFa/twoBitToFa.c
+++ src/utils/twoBitToFa/twoBitToFa.c
@@ -1,229 +1,235 @@
 /* twoBitToFa - Convert all or part of twoBit file to fasta. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dnaseq.h"
 #include "fa.h"
 #include "twoBit.h"
 #include "bPlusTree.h"
 #include "basicBed.h"
 #include "udc.h"
 
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "twoBitToFa - Convert all or part of .2bit file to fasta\n"
   "usage:\n"
   "   twoBitToFa input.2bit output.fa\n"
   "options:\n"
   "   -seq=name       Restrict this to just one sequence.\n"
   "   -start=X        Start at given position in sequence (zero-based).\n"
   "   -end=X          End at given position in sequence (non-inclusive).\n"
   "   -seqList=file   File containing list of the desired sequence names \n"
   "                   in the format seqSpec[:start-end], e.g. chr1 or chr1:0-189\n"
   "                   where coordinates are half-open zero-based, i.e. [start,end).\n"
   "   -noMask         Convert sequence to all upper case.\n"
   "   -bpt=index.bpt  Use bpt index instead of built-in one.\n"
   "   -bed=input.bed  Grab sequences specified by input.bed. Will exclude introns.\n"
   "   -bedPos         With -bed, use chrom:start-end as the fasta ID in output.fa.\n"
   "   -udcDir=/dir/to/cache  Place to put cache for remote bigBed/bigWigs.\n"
   "\n"
+  "Input file can be a URL\n"
   "Sequence and range may also be specified as part of the input\n"
   "file name using the syntax:\n"
   "      /path/input.2bit:name\n"
   "   or\n"
   "      /path/input.2bit:name:start-end\n"
+  "examples:\n"
+  "  wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.2bit\n"
+  "  twoBitToFa hg38.2bit -seq=chr1 -start=1000000 -end=20000000 out.fa\n"
+  "  echo 'chr1 1 1000 testRegion' > test.bed\n"
+  "  twoBitToFa https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.2bit -bed=test.bed out.fa\n"
   );
 }
 
 char *clSeq = NULL;	/* Command line sequence. */
 int clStart = 0;	/* Start from command line. */
 int clEnd = 0;		/* End from command line. */
 char *clSeqList = NULL; /* file containing list of seq names */
 bool noMask = FALSE;  /* convert seq to upper case */
 char *clBpt = NULL;	/* External index file. */
 char *clBed = NULL;	/* Bed file that specifies bounds of sequences. */
 bool clBedPos = FALSE;
 
 static struct optionSpec options[] = {
    {"seq", OPTION_STRING},
    {"seqList", OPTION_STRING},
    {"start", OPTION_INT},
    {"end", OPTION_INT},
    {"noMask", OPTION_BOOLEAN},
    {"bpt", OPTION_STRING},
    {"bed", OPTION_STRING},
    {"bedPos", OPTION_BOOLEAN},
    {"udcDir", OPTION_STRING},
    {NULL, 0},
 };
 
 void outputOne(struct twoBitFile *tbf, char *seqSpec, FILE *f, 
 	int start, int end)
 /* Output sequence. */
 {
 struct dnaSeq *seq = twoBitReadSeqFrag(tbf, seqSpec, start, end);
 if (noMask)
     toUpperN(seq->dna, seq->size);
 faWriteNext(f, seq->name, seq->dna, seq->size);
 dnaSeqFree(&seq);
 }
 
 static void processAllSeqs(struct twoBitFile *tbf, FILE *outFile)
 /* get all sequences in a file */
 {
 struct twoBitIndex *index;
 for (index = tbf->indexList; index != NULL; index = index->next)
     outputOne(tbf, index->name, outFile, 0, 0);
 }
 
 static void processSeqSpecs(struct twoBitFile *tbf, struct twoBitSeqSpec *tbss,
                             FILE *outFile)
 /* process list of twoBitSeqSpec objects */
 {
 struct twoBitSeqSpec *s;
 for (s = tbss; s != NULL; s = s->next)
     outputOne(tbf, s->name, outFile, s->start, s->end);
 }
 
 struct dnaSeq *twoBitAndBedToSeq(struct twoBitFile *tbf, struct bed *bed)
 /* Get sequence defined by bed.  Exclude introns. */
 {
 struct dnaSeq *seq;
 if (bed->blockCount <= 1)
     {
     seq = twoBitReadSeqFrag(tbf, bed->chrom, bed->chromStart, bed->chromEnd);
     freeMem(seq->name);
     seq->name = cloneString(bed->name);
     }
 else
     {
     int totalBlockSize = bedTotalBlockSize(bed);
     AllocVar(seq);
     seq->name = cloneString(bed->name);
     seq->dna = needMem(totalBlockSize+1);
     seq->size = totalBlockSize;
     int i;
     int seqOffset = 0;
     for (i=0; i<bed->blockCount; ++i)
         {
 	int exonSize = bed->blockSizes[i];
 	int exonStart = bed->chromStart + bed->chromStarts[i];
 	struct dnaSeq *exon = twoBitReadSeqFrag(tbf, bed->chrom, exonStart, exonStart+exonSize);
 	memcpy(seq->dna + seqOffset, exon->dna, exonSize);
 	seqOffset += exonSize;
 	dnaSeqFree(&exon);
 	}
     }
 if (bed->strand[0] == '-')
     reverseComplement(seq->dna, seq->size);
 return seq;
 }
 
 static void processSeqsFromBed(struct twoBitFile *tbf, char *bedFileName, FILE *outFile)
 /* Get sequences defined by beds.  Exclude introns. */
 {
 struct bed *bed, *bedList = bedLoadAll(bedFileName);
 for (bed = bedList; bed != NULL; bed = bed->next)
     {
     struct dnaSeq *seq = twoBitAndBedToSeq(tbf, bed);
     char* seqName = NULL;
     if (clBedPos) 
         {
         char buf[1024];
         safef(buf, 1024, "%s:%d-%d", bed->chrom, bed->chromStart, bed->chromEnd);
         seqName = buf;
         }
     else
         seqName = seq->name;
     if (noMask)
         toUpperN(seq->dna, seq->size);
     faWriteNext(outFile, seqName, seq->dna, seq->size);
     dnaSeqFree(&seq);
     }
 }
 
 void twoBitToFa(char *inName, char *outName)
 /* twoBitToFa - Convert all or part of twoBit file to fasta. */
 {
 struct twoBitFile *tbf;
 FILE *outFile = mustOpen(outName, "w");
 struct twoBitSpec *tbs;
 
 if (clSeq != NULL)
     {
     char seqSpec[2*PATH_LEN];
     if (clEnd > clStart)
         safef(seqSpec, sizeof(seqSpec), "%s:%s:%d-%d", inName, clSeq, clStart, clEnd);
     else
         safef(seqSpec, sizeof(seqSpec), "%s:%s", inName, clSeq);
     tbs = twoBitSpecNew(seqSpec);
     }
 else if (clSeqList != NULL)
     tbs = twoBitSpecNewFile(inName, clSeqList);
 else
     tbs = twoBitSpecNew(inName);
 
 if (tbs == NULL)
     errAbort("%s is not a twoBit file", inName);
 
 if (tbs->seqs != NULL && clBpt != NULL)
     tbf = twoBitOpenExternalBptIndex(tbs->fileName, clBpt);
 else
     tbf = twoBitOpen(tbs->fileName);
 if (clBed != NULL)
     {
     processSeqsFromBed(tbf, clBed, outFile);
     }
 else
     {
     if (tbs->seqs == NULL)
 	processAllSeqs(tbf, outFile);
     else
 	processSeqSpecs(tbf, tbs->seqs, outFile);
     }
 twoBitSpecFree(&tbs);
 carefulClose(&outFile);
 twoBitClose(&tbf);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 clSeq = optionVal("seq", clSeq);
 clStart = optionInt("start", clStart);
 clEnd = optionInt("end", clEnd);
 clSeqList = optionVal("seqList", clSeqList);
 clBpt = optionVal("bpt", clBpt);
 clBed = optionVal("bed", clBed);
 clBedPos = optionExists("bedPos");
 noMask = optionExists("noMask");
 udcSetDefaultDir(optionVal("udcDir", udcDefaultDir()));
 
 if (clBedPos && !clBed) 
     errAbort("the -bedPos option requires the -bed option");
 if (clBed != NULL)
     {
     if (clSeqList != NULL)
 	errAbort("Can only have seqList or bed options, not both.");
     if (clSeq != NULL)
         errAbort("Can only have seq or bed options, not both.");
     }
 if ((clStart > clEnd) && (clSeq == NULL))
     errAbort("must specify -seq with -start and -end");
 if ((clSeq != NULL) && (clSeqList != NULL))
     errAbort("can't specify both -seq and -seqList");
 
 dnaUtilOpen();
 twoBitToFa(argv[1], argv[2]);
 return 0;
 }