372f1fef9075019ab06e62e178f7d2df989d0cbd hiram Fri Jun 30 17:19:06 2017 -0700 working up brute force crispr scoring scheme refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c new file mode 100644 index 0000000..f8f06ff --- /dev/null +++ src/utils/crisprKmers/crisprKmers.c @@ -0,0 +1,235 @@ +/* crisprKmers - annotate crispr sequences. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "dnautil.h" +#include "dnaseq.h" +#include "dnaLoad.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "crisprKmers - annotate crispr sequences\n" + "usage:\n" + " crisprKmers \n" + "options:\n" + " where is a .2bit file or .fa fasta sequence\n" + " -xxx=XXX\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +struct crispr +/* one chromosome set of crisprs */ + { + struct crispr *next; /* Next in list. */ + unsigned long long sequence; /* sequence value in 2bit format */ + unsigned long long start; /* chromosome start 0-relative */ + char strand; /* strand: + or - */ + }; + +struct crisprList +/* all chromosome sets of crisprs */ + { + struct crisprList *next; /* Next in list. */ + char *chrom; /* chrom name */ + int size; /* chrom size */ + struct crispr *chromCrisprs; /* all the crisprs on this chrom */ + }; + +static char bases[4]; /* for binary to ascii conversion */ +static char revBases[4]; /* reverse complement for binary to ascii conversion */ + +static int slCmpStart(const void *va, const void *vb) +/* Compare slPairs on start value */ +{ +const struct crispr *a = *((struct crispr **)va); +const struct crispr *b = *((struct crispr **)vb); +unsigned long long aVal = a->start; +unsigned long long bVal = b->start; +if (aVal < bVal) + return -1; +else if (aVal > bVal) + return 1; +else + return 0; +} + +static char *kmerValToString(long long val, boolean revComp) +{ +static char kmerString[32]; +long long twoBitMask = 0x300000000000; +int shiftCount = 44; +int baseCount = 0; + +kmerString[baseCount] = 0; +if (revComp) + { + twoBitMask = 0x3; + shiftCount = 0; + while (shiftCount < 46) + { + int base = (val & twoBitMask) >> shiftCount; + kmerString[baseCount++] = revBases[base]; + twoBitMask <<= 2; + shiftCount += 2; + } + } +else + { + while (twoBitMask) + { + int base = (val & twoBitMask) >> shiftCount;; + kmerString[baseCount++] = bases[base]; + twoBitMask >>= 2; + shiftCount -= 2; + } + } +kmerString[baseCount] = 0; +return kmerString; +} + +static struct crisprList *scanSeq(struct dnaSeq *seq) +{ +struct crispr *crisprSet = NULL; +struct crisprList *oneChromList = NULL; +AllocVar(oneChromList); +oneChromList->chrom = cloneString(seq->name); +oneChromList->size = seq->size; +int i; +DNA *dna; +unsigned long long chromPosition = 0; +unsigned long long startGap = 0; +unsigned long long gapCount = 0; +int kmerLength = 0; +long long kmerVal = 0; +long long fortySixBits = 0x3fffffffffff; +long long endsGG = (G_BASE_VAL << 2) | G_BASE_VAL; +long long beginsCC = (long long)((C_BASE_VAL << 2) | C_BASE_VAL) << 42; +long long reverseMask = (long long)0xf << 42; +verbose(4, "# endsGG: %032llx\n", endsGG); +verbose(4, "# beginsCC: %032llx\n", beginsCC); +verbose(4, "# 46 bits: %032llx\n", fortySixBits); + +dna=seq->dna; +for (i=0; i < seq->size; ++i) + { + int val; + val = ntVal[(int)dna[i]]; + switch (val) + { + case T_BASE_VAL: // 0 + case C_BASE_VAL: // 1 + case A_BASE_VAL: // 2 + case G_BASE_VAL: // 3 + kmerVal = fortySixBits & ((kmerVal << 2) | val); + ++kmerLength; +// boolean testA = endsGG == ((long long)kmerVal & endsGG); + if (kmerLength > 22) + { + if (endsGG == (kmerVal & 0xf)) + { + struct crispr *oneCrispr = NULL; + AllocVar(oneCrispr); + oneCrispr->start = chromPosition - 22; + oneCrispr->strand = '+'; + oneCrispr->sequence = kmerVal; + slAddHead(&crisprSet, oneCrispr); + } + if (beginsCC == (kmerVal & reverseMask)) + { + struct crispr *oneCrispr = NULL; + AllocVar(oneCrispr); + oneCrispr->start = chromPosition - 22; + oneCrispr->strand = '-'; + oneCrispr->sequence = kmerVal; + slAddHead(&crisprSet, oneCrispr); + } + } + break; + default: + ++gapCount; + startGap = chromPosition; + /* skip all N's == any value not = 0,1,2,3 */ + while ( ((i+1) < seq->size) && (0xfffffffd & ntVal[(int)dna[i+1]])) + { + ++chromPosition; + ++i; + } + if (startGap != chromPosition) + verbose(4, "#GAP %s\t%llu\t%llu\t%llu\t%llu\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+"); + else + verbose(4, "#GAP %s\t%llu\t%llu\t%llu\t%llu\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+"); + kmerLength = 0; /* reset, start over */ + kmerVal = 0; + } + ++chromPosition; + } +oneChromList->chromCrisprs = crisprSet; +return oneChromList; +} + +void crisprKmers(char *sequence) +/* crisprKmers - annotate crispr sequences. */ +{ +verbose(1, "# scanning sequence: %s\n", sequence); +dnaUtilOpen(); +struct dnaLoad *dl = dnaLoadOpen(sequence); +struct dnaSeq *seq; +struct crisprList *allCrisprs = NULL; + +while ((seq = dnaLoadNext(dl)) != NULL) + { + verbose(2, "#\tprocessing: %s, size: %d\n", seq->name, seq->size); + struct crisprList *oneList = scanSeq(seq); + slAddHead(&allCrisprs, oneList); + } +if (verboseLevel() > 2) + { + int chrCount = slCount(allCrisprs); +verbose(1, "# printing crispr list, chrCount: %d\n", chrCount); + struct crisprList *listPtr; + for (listPtr = allCrisprs; listPtr; listPtr = listPtr->next) + { + struct crispr *crisprPtr; + int crisprCount = slCount(listPtr->chromCrisprs); + slSort(&listPtr->chromCrisprs, slCmpStart); +verbose(1, "# printing crispr list, crisprCount: %d on %s\n", crisprCount, listPtr->chrom); + for (crisprPtr = listPtr->chromCrisprs; crisprPtr; crisprPtr = crisprPtr->next) + { + if ('+' == crisprPtr->strand) + { + verbose (3, "%s\t%s\t%llu\t+\n", kmerValToString(crisprPtr->sequence, FALSE), listPtr->chrom, crisprPtr->start); + } + else + { + verbose (3, "%s\t%s\t%llu\t-\n", kmerValToString(crisprPtr->sequence, TRUE), listPtr->chrom, crisprPtr->start); + } + } + } + } +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 2) + usage(); +bases[0] = 'T'; +bases[1] = 'C'; +bases[2] = 'A'; +bases[3] = 'G'; +revBases[0] = 'A'; +revBases[1] = 'G'; +revBases[2] = 'T'; +revBases[3] = 'C'; +crisprKmers(argv[1]); +return 0; +}