src/shortReads/itsaMake/itsaMake.c 1.5

1.5 2009/11/24 15:51:11 kent
Improving comment.
Index: src/shortReads/itsaMake/itsaMake.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/shortReads/itsaMake/itsaMake.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/shortReads/itsaMake/itsaMake.c	14 Nov 2008 21:56:41 -0000	1.4
+++ src/shortReads/itsaMake/itsaMake.c	24 Nov 2009 15:51:11 -0000	1.5
@@ -1,514 +1,515 @@
 /* itsaMake - Create indexed traversable suffix array for fast searches of genome.. */
 /* Copyright Jim Kent 2008 all rights reserved. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "sqlNum.h"
 #include "dnaLoad.h"
 #include "dnaseq.h"
 #include "verbose.h"
 #include "itsa.h"
 
 
 static char const rcsid[] = "$Id$";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "itsaMake - Create indexed traversable suffix array for fast searches of genome.\n"
   "usage:\n"
   "   itsaMake input1 input2 ... inputN output.itsa\n"
   "where each input is either a fasta file, a nib file, a 2bit file, or a text file\n"
   "containing the names of the above file types one per line and output.itsa is the\n"
   "output file containing the suffix array in a binary format.\n"
   "options:\n"
   "   verbose=N - Control verbosity. Default 1 is normal.  0 is silent."
   );
 }
 
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 struct chromInfo
 /* Basic info on a chromosome (or contig). */
     {
     struct chromInfo *next;
     char *name;		/* Chromosome/contig name. */
     bits32 size;	/* Chromosome size. */
     bits32 offset;	/* Chromosome offset in total DNA */
     };
 
 
 static bits64 roundUpTo4(bits64 x)
 /* Round x up to next multiple of 4 */
 {
 return (x+3) & (~(3));
 }
 
 static void zeroPad(FILE *f, int count)
 /* Write zeroes to file. */
 {
 while (--count >= 0)
     fputc(0, f);
 }
 
 static void indexChromPass1(struct chromInfo *chrom, DNA *allDna,  
 	bits32 *offsetArray, bits32 *listArray, bits32 *index13)
-/* Create a itsaOneBaseListy for each base in seq, and hang it in appropriate slot
- * in listyIndex. */
+/* Do crude sorting of suffexes in allDna into lists formed by listArray 
+ * (for next) pointers, and offsetArray (for data - positions in genome).
+ * Each list will be in a bucket in index13. */
 {
 bits32 baseIx;
 bits32 seqSize = chrom->size;
 bits32 chromOffset = chrom->offset;
 DNA *dna = allDna + chromOffset;
 bits32 maskTil = 0;
 
 verbose(2, "   start short initial loop\n");
 /* Preload the 13mer with the first 12 bases. */
 bits32 thirteen = 0;
 for (baseIx=0; baseIx<12; ++baseIx)
     {
     int baseLetter = dna[baseIx];
     if (baseLetter == 'N')
         maskTil = baseIx + 13;
     thirteen <<= 2;
     thirteen += itsaBaseToVal[baseLetter];
     }
 
 verbose(2, "   start main loop\n");
 /* Do the middle part of the sequence where there are no end conditions to consider. */
 bits32 freePos = chromOffset;
 for (baseIx = 12; baseIx < seqSize; ++baseIx)
     {
     int baseLetter = dna[baseIx];
     if (baseLetter == 'N')
         maskTil = baseIx + 13;
     thirteen = (thirteen << 2) + itsaBaseToVal[baseLetter];
     thirteen &= 0x3FFFFFF;
     if (baseIx >= maskTil)
 	{
 	offsetArray[freePos] = baseIx - 12 + chromOffset;
 	listArray[freePos] = index13[thirteen];
 	index13[thirteen] = freePos;
 	++freePos;
 	}
     }
 verbose(2, "   end main loop\n");
 }
 
 char *globalAllDna;	/* Copy of all DNA for sort function. */
 
 static int cmpAfter16(const void *va, const void *vb)
 /* Compare to sort, assuming first 16 bases already match. */
 {
 bits32 a = *((bits32 *)va);
 bits32 b = *((bits32 *)vb);
 return strcmp(globalAllDna + a + 16, globalAllDna + b + 16);
 }
 
 static int binary4(DNA *dna)
 /* Return next 4 bases in binary format.  Return -1 if there is an N. */
 {
 bits32 packedDna = 0;
 int i;
 for (i=0; i<4; ++i)
     {
     int baseLetter = dna[i];
     if (baseLetter == 'N')
         return -1;
     packedDna <<= 2;
     packedDna += itsaBaseToVal[baseLetter];
     }
 return packedDna;
 }
 
 void sortAndWriteOffsets(bits32 firstIx, bits32 *offsetArray, bits32 *listArray, 
 	DNA *allDna, FILE *f)
 /* Set up and do a qsort on list that starts at firstIx.  Write result to file. */
 {
 /* Count up length of list. */
 int listSize = 0;
 bits32 listIx;
 for (listIx = firstIx; listIx != 0; listIx = listArray[listIx])
     ++listSize;
 
 /* Get an array to hold all offsets in list, hopefully a small one on stack,
  * but if need be a big one on heap */
 bits32 smallArray[32], *bigArray = NULL, *sortArray;
 if (listSize <= ArraySize(smallArray))
     sortArray = smallArray;
 else
     AllocArray(sortArray, listSize);
 
 /* Copy list to array for sorting. */
 int sortIx;
 for (sortIx=0, listIx=firstIx; listIx != 0; ++sortIx, listIx=listArray[listIx])
     sortArray[sortIx] = offsetArray[listIx];
 
 /* Do the qsort.  I hope I got the cmp function right! */
 qsort(sortArray, listSize, sizeof(sortArray[0]), cmpAfter16);
 
 /* Write out sorted result. */
 mustWrite(f, sortArray, listSize * sizeof(bits32));
 
 /* Clean up. */
 if (bigArray)
     freeMem(bigArray);
 }
 
 
 bits64 finishAndWriteOneSlot(bits32 *offsetArray, bits32 *listArray, 
 	bits32 slotFirstIx, DNA *allDna, FILE *f)
 /* Do additional sorting and write results to file.  Return amount actually written.  */
 {
 bits64 basesIndexed = 0;
 bits32 elIx, nextElIx;
 if (slotFirstIx != 0)
     {
     /* Do in affect a secondary bucket sort on the 14-17th bases. */
     bits32 buckets[256];
     bits32 bucketIx;
     for (bucketIx = 0; bucketIx < ArraySize(buckets); bucketIx += 1)
         buckets[bucketIx] = 0;
     for (elIx = slotFirstIx; elIx != 0; elIx = nextElIx)
         {
 	nextElIx = listArray[elIx];
 	int bucketIx = binary4(allDna + offsetArray[elIx] + 13U);
 	if (bucketIx >= 0)
 	    {
 	    listArray[elIx] = buckets[bucketIx];
 	    buckets[bucketIx] = elIx;
 	    ++basesIndexed;
 	    }
 	}
 
     /* Do final sorting within buckets. */
     for (bucketIx = 0; bucketIx < ArraySize(buckets); ++bucketIx )
         {
 	bits32 firstIx = buckets[bucketIx];
 	if (firstIx != 0)
 	    {
 	    bits32 secondIx = listArray[firstIx];
 	    if (secondIx == 0)
 	        {
 		/* Special case for size one list, there are lots of these! */
 	        writeOne(f, offsetArray[firstIx]);
 		}
 	    else
 	        {
 		if (listArray[secondIx] == 0)
 		    {
 		    bits32 firstOffset = offsetArray[firstIx];
 		    bits32 secondOffset = offsetArray[secondIx];
 		    /* Special case for size two list.  There are still quite a few of these. */
 		    if (strcmp(allDna+firstOffset, allDna+secondOffset) < 0)
 		        {
 			writeOne(f, secondOffset);
 			writeOne(f, firstOffset);
 			}
 		    else
 		        {
 			writeOne(f, firstOffset);
 			writeOne(f, secondOffset);
 			}
 		    }
 		else
 		    {
 		    /* Three or more - do it the hard way... */
 		    sortAndWriteOffsets(firstIx, offsetArray, listArray, allDna, f);
 		    }
 		}
 	    }
 	}
     }
 return basesIndexed;
 }
 
 static void itsaFillInTraverseArray(char *dna, bits32 *suffixArray, bits32 arraySize, 
 	bits32 *traverseArray, UBYTE *cursorArray)
 /* Fill in the bits that will help us traverse the array as if it were a tree. */
 {
 int depth = 0;
 int stackSize = 4*1024;
 int *stack;
 AllocArray(stack, stackSize);
 bits32 i;
 for (i=0; i<arraySize; ++i)
     {
     char *curDna = dna + suffixArray[i];
     int d;
     for (d = 0; d<depth; ++d)
         {
 	bits32 prevIx = stack[d];
 	char *prevDna = dna + suffixArray[prevIx];
 	if (curDna[d] != prevDna[d])
 	    {
 	    int stackIx;
 	    for (stackIx=d; stackIx<depth; ++stackIx)
 	        {
 		prevIx = stack[stackIx];
 		traverseArray[prevIx] = i - prevIx;
 		}
 	    depth = d;
 	    break;
 	    }
 	}
     if (depth >= stackSize)
         errAbort("Stack overflow, depth >= %d", stackSize);
     stack[depth] = i;
     cursorArray[i] = depth;  // May overflow. That's ok. We just care about indexed ones which are < 13.
     depth += 1;
     if ((i&0xFFFFF)==0xFFFFF)
         {
 	verboseDot();
 	if ((i&0x3FFFFFF)==0)
 	    verbose(1, "traversed %lld%%\n", 100LL*i/arraySize);
 	}
     }
 verbose(1, "finished traversal\n");
 
 /* Do final clear out of stack */
 int stackIx;
 for (stackIx=0; stackIx < depth; ++stackIx)
     {
     bits32 prevIx = stack[stackIx];
     traverseArray[prevIx] = arraySize - prevIx;
     }
 }
 
 static void itsaWriteMerged(struct chromInfo *chromList, DNA *allDna,
 	bits32 *offsetArray, bits32 *listArray, bits32 *index13, char *output)
 /* Write out a file that contains a single splix that is the merger of
  * all of the individual splixes in list.   As a side effect will replace
  * offsetArray with suffix array and listArray with traverse array */
 {
 FILE *f = mustOpen(output, "w+");
 
 /** Allocate header and fill out easy constant fields. */
 struct itsaFileHeader *header;
 AllocVar(header);
 header->majorVersion = ITSA_MAJOR_VERSION;
 header->minorVersion = ITSA_MINOR_VERSION;
 
 /* Figure out sizes of names and sequence for each chromosome. */
 struct chromInfo *chrom;
 bits32 chromNamesSize = 0;
 bits64 dnaDiskSize = 1;	/* For initial zero. */
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    chromNamesSize += strlen(chrom->name) + 1;
    dnaDiskSize += chrom->size + 1;  /* Include separating zeroes. */
    }
 
 /* Fill in  most of rest of header fields */
 header->chromCount = slCount(chromList);
 header->chromNamesSize = roundUpTo4(chromNamesSize);
 header->dnaDiskSize = roundUpTo4(dnaDiskSize);
 bits32 chromSizesSize = header->chromCount*sizeof(bits32);
 
 /* Write header. */
 mustWrite(f, header, sizeof(*header));
 
 /* Write chromosome names. */
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     mustWrite(f, chrom->name, strlen(chrom->name)+1);
 zeroPad(f, header->chromNamesSize - chromNamesSize);
 
 /* Write chromosome sizes. */
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     mustWrite(f, &chrom->size, sizeof(chrom->size));
 int chromSizesSizePad = chromSizesSize - header->chromCount * sizeof(bits32);
 zeroPad(f, chromSizesSizePad);
 
 /* Write out chromosome DNA and zeros before, between, and after. */
 mustWrite(f, allDna, dnaDiskSize);
 zeroPad(f, header->dnaDiskSize - dnaDiskSize);
 verboseTime(1, "Wrote %lld bases of DNA including zero padding", header->dnaDiskSize);
 
 /* Calculate and write suffix array. Convert index13 to index of array as opposed to index
  * of sequence. */
 bits64 arraySize = 0;
 off_t suffixArrayFileOffset = ftello(f);
 int slotCount = itsaSlotCount;
 int slotIx;
 for (slotIx=0; slotIx < slotCount; ++slotIx)
     {
     int slotSize = finishAndWriteOneSlot(offsetArray, listArray, index13[slotIx], allDna, f);
     /* Convert index13 to hold the position in the suffix array where the first thing matching
      * the corresponding 13-base prefix is found. */
     if (slotSize != 0)
         index13[slotIx] = arraySize+1;	/* The +1 is so we can keep 0 for not found. */
     else
         index13[slotIx] = 0;
     arraySize += slotSize;
     if ((slotIx % 200000 == 0) && slotIx != 0)
 	{
 	verboseDot();
 	if (slotIx % 10000000 == 0)
 	    verbose(1, "fine sort bucket %d of %d\n", slotIx, slotCount);
 	}
     }
 verbose(1, "fine sort bucket %d of %d\n", slotCount, slotCount);
 verboseTime(1, "Wrote %lld suffix array positions", arraySize);
 
 /* Now we're done with the offsetArray and listArray buffers, so use them for the
  * next phase. */
 bits32 *suffixArray = offsetArray;
 offsetArray = NULL;	/* Help make some errors more obvious */
 bits32 *traverseArray = listArray;
 listArray = NULL;	/* Help make some errors more obvious */
 
 /* Read the suffix array back from the file. */
 fseeko(f, suffixArrayFileOffset, SEEK_SET);
 mustRead(f, suffixArray, arraySize*sizeof(bits32));
 verboseTime(1, "Read suffix array back in");
 
 /* Calculate traverse array and cursor arrays */
 memset(traverseArray, 0, arraySize*sizeof(bits32));
 UBYTE *cursorArray = needHugeMem(arraySize);
 itsaFillInTraverseArray(allDna, suffixArray, arraySize, traverseArray, cursorArray);
 verboseTime(1, "Filled in traverseArray");
 
 /* Write out traverse array. */
 mustWrite(f, traverseArray, arraySize*sizeof(bits32));
 verboseTime(1, "Wrote out traverseArray");
 
 /* Write out 13-mer index. */
 mustWrite(f, index13, itsaSlotCount*sizeof(bits32));
 verboseTime(1, "Wrote out index13");
 
 /* Write out bits of cursor array corresponding to index. */
 for (slotIx=0; slotIx<itsaSlotCount; ++slotIx)
     {
     bits32 indexPos = index13[slotIx];
     if (indexPos == 0)
        fputc(0, f);
     else
        fputc(cursorArray[indexPos-1], f);
     }
 verboseTime(1, "Wrote out cursors13");
 
 /* Update a few fields in header, and go back and write it out again with
  * the correct magic number to indicate it's complete. */
 header->magic = ITSA_MAGIC;
 header->arraySize = arraySize;
 header->size = sizeof(*header) 			// header
 	+ header->chromNamesSize + 		// chromosome names
 	+ header->chromCount * sizeof(bits32)	// chromosome sizes
 	+ header->dnaDiskSize 			// dna sequence
 	+ sizeof(bits32) * arraySize	 	// suffix array
 	+ sizeof(bits32) * arraySize   		// traverse array
 	+ sizeof(bits32) * itsaSlotCount 	// index13
 	+ sizeof(UBYTE) * itsaSlotCount;	// cursors13
 
 rewind(f);
 mustWrite(f, header, sizeof(*header));
 carefulClose(&f);
 verbose(1, "Completed %s is %lld bytes\n", output, header->size);
 }
 
 void itsaMake(int inCount, char *inputs[], char *output)
 /* itsaMake - Make a suffix array file out of input DNA sequences.. */
 {
 verboseTime(1, NULL);
 bits64 maxGenomeSize = 1024LL*1024*1024*4;
 
 itsaBaseToValInit();
 
 /* Load all DNA, make sure names are unique, and alphabetize by name. */
 struct dnaSeq *seqList = NULL, *seq;
 struct hash *uniqSeqHash = hashNew(0);
 bits64 totalDnaSize = 1;	/* FOr space between. */
 int inputIx;
 for (inputIx=0; inputIx<inCount; ++inputIx)
     {
     char * input = inputs[inputIx];
     struct dnaLoad *dl = dnaLoadOpen(input);
     while ((seq = dnaLoadNext(dl)) != NULL)
 	{
 	verbose(1, "read %s with %d bases\n", seq->name, seq->size);
 	if (hashLookup(uniqSeqHash, seq->name))
 	    errAbort("Input sequence name %s repeated, all must be unique.", seq->name);
 	totalDnaSize +=  seq->size + 1;
 	if (totalDnaSize > maxGenomeSize)
 	    errAbort("Too much DNA. Can only handle up to %lld bases", maxGenomeSize);
 	slAddHead(&seqList, seq);
 	}
     dnaLoadClose(&dl);
     }
 slSort(&seqList, dnaSeqCmpName);
 verboseTime(1, "Loaded %lld bases in %d sequences", totalDnaSize, slCount(seqList));
 
 /* Allocate big buffer for all DNA. */
 DNA *allDna = globalAllDna = needHugeMem(totalDnaSize);
 allDna[0] = 0;
 bits64 chromOffset = 1;	/* Have zeroes between each chrom, and before and after. */
 
 /* Copy DNA to a single big buffer, and create chromInfo on each sequence. */
 struct chromInfo *chrom, *chromList = NULL;
 for (seq = seqList; seq != NULL; seq = seq->next)
     {
     AllocVar(chrom);
     chrom->name = cloneString(seq->name);
     chrom->size = seq->size;
     chrom->offset = chromOffset;
     slAddHead(&chromList, chrom);
     toUpperN(seq->dna, seq->size);
     memcpy(allDna + chromOffset, seq->dna, seq->size + 1);
     chromOffset += seq->size + 1;
     }
 slReverse(&chromList);
 
 /* Free up separate dna sequences because we're going to need a lot of RAM soon. */
 
 
 /* Allocate index array, and offset and list arrays. */
 dnaSeqFreeList(&seqList);
 bits32 *index13;
 AllocArray(index13, itsaSlotCount);
 bits32 *offsetArray = needHugeMem(totalDnaSize * sizeof(bits32));
 bits32 *listArray = needHugeZeroedMem(totalDnaSize * sizeof(bits32));
 verboseTime(1, "Allocated buffers %lld bytes total", 
 	(long long)(9LL*totalDnaSize + itsaSlotCount*sizeof(bits32)));
 
 /* Where normally we'd keep some sort of structure with a next element to form a list
  * of matching positions in each slot of our index,  to conserve memory we'll do this
  * with two parallel arrays.  Because we're such cheapskates in terms of memory we'll
  * (and still using 9*genomeSize bytes of RAM) we'll use these arrays for two different
  * purposes.   
  *     In the first phase they will together be used to form linked lists of
  * offsets, and the 13mer index will point to the first item in each list.  In this
  * phase the offsetArray contains offsets into the allDna structure, and the listArray
  * contains the next pointers for the list.  After the first phase we write out the
  * suffix array to disk.
  *     In the second phase we read the suffix array back into the offsetArray, and
  * use the listArray for the traverseArray.  We write out the traverse array to finish
  * things up. */
 
 
 /* Load up all DNA buffer. */
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     {
     verbose(2, "  About to do first pass index\n");
     indexChromPass1(chrom, allDna, offsetArray, listArray, index13);
     verbose(2, "  Done first pass index\n");
     }
 verboseTime(1, "Done big bucket sort");
 slReverse(&chromList);
 itsaWriteMerged(chromList, allDna, offsetArray, listArray, index13, output);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc < 3)
     usage();
 dnaUtilOpen();
 itsaMake(argc-2, argv+1, argv[argc-1]);
 return 0;
 }