4b8a6c52d6c082137831ab42d67505c8d7c78a93
hiram
  Thu Mar 7 14:08:47 2019 -0800
expand findMotif to 32 character motifs and allow count of mis matches no redmine

diff --git src/utils/findMotif/findMotif.c src/utils/findMotif/findMotif.c
index 520d576..a23cf2b 100644
--- src/utils/findMotif/findMotif.c
+++ src/utils/findMotif/findMotif.c
@@ -1,315 +1,490 @@
-/* findMotif - find specified motif in nib files. */
+/* findMotif - find specified motif in fa/nib/2bit files. */
 
 /* Copyright (C) 2012 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
+
+#include <popcntintrin.h>
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dnautil.h"
 #include "dnaseq.h"
 #include "dnaLoad.h"
 #include "portable.h"
 #include "memalloc.h"
 #include "obscure.h"
 
 
 char *chr = (char *)NULL;	/*	process the one chromosome listed */
 char *motif = (char *)NULL;	/*	specified motif string */
 unsigned motifLen = 0;		/*	length of motif	*/
+unsigned misMatch = 0;		/*	command line arg	*/
 unsigned long long motifVal;	/*	motif converted to a number	*/
 unsigned long long complementVal;	/*	- strand complement	*/
 boolean bedOutputOpt = TRUE;	/*	output bed file (default) */
 boolean wigOutput = FALSE;	/*  output wiggle format instead of bed file */
-char *strand = (char *)NULL;
-boolean doPlusStrand = TRUE;	/*	output bed file instead of wiggle */
-boolean doMinusStrand = TRUE;	/*	output bed file instead of wiggle */
+char *strand = (char *)NULL;	/*	command line argument + or -	*/
+boolean doPlusStrand = TRUE;	/*	strand argument can turn this off */
+boolean doMinusStrand = TRUE;	/*	strand argument can turn this off */
+static int lowerLimitMotifLength = 4;	/* must be at least 4 characters */
+
+#define bitsPerLongLong	(8 * sizeof(unsigned long long))
+#define perfectScore 1000	/*	mis-matches will lower the score */
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "findMotif - find specified motif in sequence\n"
   "usage:\n"
   "   findMotif [options] -motif=<acgt...> sequence\n"
   "where:\n"
   "   sequence is a .fa , .nib or .2bit file or a file which is a list of sequence files.\n"
   "options:\n"
   "   -motif=<acgt...> - search for this specified motif (case ignored, [acgt] only)\n"
+  "   NOTE: motif must be at least %d characters, less than %d\n"
   "   -chr=<chrN> - process only this one chrN from the sequence\n"
   "   -strand=<+|-> - limit to only one strand.  Default is both.\n"
   "   -bedOutput - output bed format (this is the default)\n"
   "   -wigOutput - output wiggle data format instead of bed file\n"
+  "   -misMatch=N - allow N mismatches (0 default == perfect match)\n"
   "   -verbose=N - set information level [1-4]\n"
-  "   NOTE: motif must be longer than 4 characters, less than 17\n"
-  "   -verbose=4 - will display gaps as bed file data lines to stderr"
+  "   -verbose=4 - will display gaps as bed file data lines to stderr",
+lowerLimitMotifLength, (int)(bitsPerLongLong / 2)
   );
 }
 
 static struct optionSpec options[] = {
    {"chr", OPTION_STRING},
    {"strand", OPTION_STRING},
    {"motif", OPTION_STRING},
+   {"misMatch", OPTION_INT},
    {"bedOutput", OPTION_BOOLEAN},
    {"wigOutput", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
+/* Numerical values (and redefinitions here) for bases from danutil.h:
+ *	#define T_BASE_VAL 0	3
+ *	#define U_BASE_VAL 0	3
+ *	#define C_BASE_VAL 1	1
+ *	#define A_BASE_VAL 2	0
+ *	#define G_BASE_VAL 3	2
+ *  These do not work for the XOR comparison bit count,
+ *      using the ones below instead
+*/
+
+static int dnaUtilBases[4];	/* translation values from dnautil.h */
+
+#define A_BASE  0
+#define C_BASE  1
+#define G_BASE  2
+#define T_BASE  3
+#define U_BASE  3
+
+static char bases[4];  /* for two-bits to ascii conversion */
+
+static void initBases()
+/* set up the translation matrix for two-bits to ascii */
+{
+bases[A_BASE] = 'A';
+bases[C_BASE] = 'C';
+bases[G_BASE] = 'G';
+bases[T_BASE] = 'T';
+dnaUtilBases[A_BASE_VAL] = A_BASE;
+dnaUtilBases[C_BASE_VAL] = C_BASE;
+dnaUtilBases[G_BASE_VAL] = G_BASE;
+dnaUtilBases[T_BASE_VAL] = T_BASE;
+}
+
+static void misMatchString(char *stringReturn, unsigned long long misMatch,
+    unsigned stringLength)
+/* return ascii string ...*.....*.*.....*.. to indicate mis matches */
+{
+int i = 0;
+unsigned long long twoBitMask = 0xc000000000000000;
+int startShift = (bitsPerLongLong - (2 * stringLength));
+twoBitMask >>= startShift;
+while (twoBitMask)
+    {
+    stringReturn[i] = '.';
+    if (twoBitMask & misMatch)
+        stringReturn[i] = '*';
+    ++i;
+    twoBitMask >>= 2;
+    }
+stringReturn[i] = 0;
+}       // static void misMatchString()
+
+static void valToString(char *stringReturn, unsigned long long val,
+    unsigned stringLength)
+/* return ASCII string for binary sequence value */
+{
+unsigned long long twoBitMask = 0xc000000000000000;
+int startShift = (bitsPerLongLong - (2 * stringLength));
+int baseCount = 0;
+
+/* shorter string than 32 characters needs to get the mask into
+ * initial position.
+ */
+twoBitMask >>= startShift;
+
+while (twoBitMask)
+    {
+    int base = (val & twoBitMask) >> ((62-startShift) - (baseCount * 2));
+    stringReturn[baseCount++] = bases[base];
+    twoBitMask >>= 2;
+    }
+stringReturn[baseCount] = 0;
+}       //      static void valToString()
+
 static void scanSeq(struct dnaSeq *seq)
 {
 int i, val;
 FILE *nf = NULL;
 DNA *dna;
 unsigned long long mask;
 unsigned long long chromPosition = 0;
 register unsigned long long incomingVal = 0;
 unsigned long long incomingLength = 0;
 unsigned long long posFound = 0;
 unsigned long long negFound = 0;
 unsigned long long posPreviousPosition = 0;
 unsigned long long negPreviousPosition = 0;
 register unsigned long long posNeedle = motifVal;
 register unsigned long long negNeedle = complementVal;
 unsigned long long enterGap = 1;
 boolean inGap = FALSE;
 unsigned long long gapCount = 0;
 
 mask = 3;
 for (i=1; i < motifLen; ++i )
 	mask = (mask << 2) | 3;
-verbose(2, "#\tsequence: %s size: %d, motifMask: %#llx\n", seq->name, seq->size, mask);
-verbose(2, "#\tmotif numerical value: %llu (%#llx)\n", posNeedle, posNeedle);
+verbose(3, "#\tsequence: %s size: %d, motifMask: %#llx\n", seq->name, seq->size, mask);
+verbose(3, "#\tmotif numerical value: %llu (%#llx)\n", posNeedle, posNeedle);
 
 /* Need "chrom" */
 
 dna = seq->dna;
 for (i=0; i < seq->size; ++i)
     {
     ++chromPosition;
-    val = ntVal[(int)dna[i]];
+    val = dnaUtilBases[ntVal[(int)dna[i]]];
     switch (val)
 	{
-	case T_BASE_VAL:
-	case C_BASE_VAL:
-	case A_BASE_VAL:
-	case G_BASE_VAL:
+	case T_BASE:
+	case C_BASE:
+	case A_BASE:
+	case G_BASE:
 	    incomingVal = mask & ((incomingVal << 2) | val);
 	    if (! incomingLength)
 		{
 		if (inGap &&
 		 (((long long int)chromPosition - (long long int)enterGap) > 0))
 		    {
 		    ++gapCount;
 		    verbose(3,
 			    "#\treturn from gap at %llu, gap length: %llu\n",
 			    chromPosition, chromPosition - enterGap);
 		    verbose(4, "#GAP %s\t%llu\t%llu\t%llu\t%llu\t%s\n",
 			    seq->name, enterGap-1, chromPosition-1, gapCount,
 			    chromPosition - enterGap, "+");
 		    /* assume no more gaps, this is safe to a chrom size
 		     *	of 1 Tb */
 		    enterGap = (unsigned long long)BIGNUM << 10;
 		    }
 		}
 	    inGap = FALSE;
 	    ++incomingLength;
 
-	    if (doPlusStrand && (incomingLength >= motifLen)
-		&& (incomingVal == posNeedle))
+	    char misMatchPrint[33];
+	    int score = perfectScore;
+	    if (doPlusStrand && (incomingLength >= motifLen))
+		{
+		unsigned long long bitsXor = incomingVal ^ posNeedle;
+		boolean matchOK = FALSE;
+		if (bitsXor)
 		    {
+		    if (misMatch)
+			{
+			/* possible bitsXor bit values: 01 10 11
+			 *  turn those three values into just: 01
+			 */
+			bitsXor=(bitsXor|(bitsXor >> 1)) & 0x5555555555555555;
+			/* now count those 1 bits on */
+			int bitsOn = _mm_popcnt_u64(bitsXor);
+			if (bitsOn <= misMatch)
+			    {
+			    misMatchString(misMatchPrint, bitsXor, motifLen);
+verbose(2, "# misMatch count: %d position: %llu\n", bitsOn, chromPosition-motifLen);
+			    matchOK = TRUE;
+			    score -= bitsOn;
+			    }
+			}
+		    }
+		else
+		    {
+		    matchOK = TRUE;
+		    }
+		if (matchOK)
+		    {
+                    char needleString[33];
+                    char incomingString[33];
+                    valToString(needleString, posNeedle, motifLen);
+                    valToString(incomingString, incomingVal, motifLen);
+		    if (score < perfectScore)
+			{
+			verbose(2, "#\tmotif: %s\n", needleString);
+			verbose(2, "#\t       %s\n", misMatchPrint);
+			verbose(2, "#\tmatch: %s\n", incomingString);
+			}
 		++posFound;
 		if (wigOutput)
 		    printf("%llu 1 %#llx == %#llx\n", chromPosition-motifLen+1, incomingVal&mask,posNeedle);
 		else
-		    printf("%s\t%llu\t%llu\t%llu\t%d\t%s\n", seq->name, chromPosition-motifLen, chromPosition, posFound+negFound, 1000, "+");
+		    printf("%s\t%llu\t%llu\t%llu\t%d\t%s\n", seq->name, chromPosition-motifLen, chromPosition, posFound+negFound, score, "+");
 
 		if ((posPreviousPosition + motifLen) > chromPosition)
 		    verbose(2, "#\toverlapping + at: %s:%llu-%llu\n", seq->name, posPreviousPosition, chromPosition);
 		posPreviousPosition = chromPosition;
 		    }
+		}
 
 	    if (doMinusStrand && (incomingLength >= motifLen)
 		&& (incomingVal == negNeedle))
 		{
+		score = perfectScore;
+		unsigned long long bitsXor = incomingVal ^ negNeedle;
+		boolean matchOK = FALSE;
+		if (bitsXor)
+		    {
+		    if (misMatch)
+			{
+			/* possible bitsXor bit values: 01 10 11
+			 *  turn those three values into just: 01
+			 */
+			bitsXor=(bitsXor|(bitsXor >> 1)) & 0x5555555555555555;
+			/* now count those 1 bits on */
+			int bitsOn = _mm_popcnt_u64(bitsXor);
+			if (bitsOn <= misMatch)
+			    {
+			    misMatchString(misMatchPrint, bitsXor, motifLen);
+verbose(2, "# misMatch count: %d position: %llu\n", bitsOn, chromPosition-motifLen);
+			    matchOK = TRUE;
+			    score -= bitsOn;
+			    }
+			}
+		    }
+		else
+		    {
+		    matchOK = TRUE;
+		    }
+		if (matchOK)
+		    {
+                    char needleString[33];
+                    char incomingString[33];
+                    valToString(needleString, negNeedle, motifLen);
+                    valToString(incomingString, incomingVal, motifLen);
+		    if (score < perfectScore)
+			{
+			verbose(2, "#\tmotif: %s\n", needleString);
+			verbose(2, "#\t       %s\n", misMatchPrint);
+			verbose(2, "#\tmatch: %s\n", incomingString);
+			}
 		++negFound;
 		if (wigOutput)
 		    printf("%llu -1 %#llx == %#llx\n", chromPosition-motifLen+1, incomingVal&mask,negNeedle);
 		else
 		    printf("%s\t%llu\t%llu\t%llu\t%d\t%s\n", seq->name, chromPosition-motifLen, chromPosition, posFound+negFound, 1000, "-");
 
 		if ((negPreviousPosition + motifLen) > chromPosition)
 		    verbose(2, "#\toverlapping - at: %s:%llu-%llu\n", seq->name, negPreviousPosition, chromPosition);
 		negPreviousPosition = chromPosition;
 		    }
+		}
 	    break;
 
 	default:
 	    if (incomingLength)
 		{
 		verbose(3, "#\tenter gap at %llu\n", chromPosition);
 		enterGap = chromPosition;
 		}
 	    inGap = TRUE;
 	    incomingVal = 0;
 	    incomingLength = 0;
 	    break;
 	}
     }
 if (inGap && (((long long int)chromPosition - (long long int)enterGap) > 0))
     {
     ++gapCount;
     verbose(3,
 	    "#\treturn from gap at %llu, gap length: %llu\n",
 	    chromPosition+1, chromPosition - enterGap + 1);
     verbose(4, "#GAP %s\t%llu\t%llu\t%llu\t%llu\t%s\n",
 	    seq->name, enterGap-1, chromPosition, gapCount,
 	    chromPosition - enterGap + 1, "+");
     }
 
 verbose(2, "#\tfound: %llu times + strand, %llu times - strand\n",
     posFound, negFound );
 verbose(2, "#\t%% of chromosome: %g %% + strand %g %% - strand\n",
     (double)(posFound*motifLen)/(double)chromPosition,
     (double)(negFound*motifLen)/(double)chromPosition);
 
 carefulClose(&nf);
 }
 
 static void findMotif(char *input)
 /* findMotif - find specified motif in sequence file. */
 {
 struct dnaLoad *dl = dnaLoadOpen(input);
 struct dnaSeq *seq;
 
 while ((seq = dnaLoadNext(dl)) != NULL)
     {
     verbose(2, "#\tprocessing: %s\n", seq->name);
     scanSeq(seq);
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 int i;
 char *cp;
 unsigned long long reversed;
 size_t maxAlloc;
 char asciiAlloc[32];
 
 optionInit(&argc, argv, options);
 
 if (argc < 2)
     usage();
 
+initBases();
+
 maxAlloc = 2100000000 *
 	 (((sizeof(size_t)/4)*(sizeof(size_t)/4)*(sizeof(size_t)/4)));
 sprintLongWithCommas(asciiAlloc, (long long) maxAlloc);
 verbose(4, "#\tmaxAlloc: %s\n", asciiAlloc);
 setMaxAlloc(maxAlloc);
 /* produces: size_t is 4 == 2100000000 ~= 2^31 = 2Gb
  *      size_t is 8 = 16800000000 ~= 2^34 = 16 Gb
  */
 
 dnaUtilOpen();
 
 motif = optionVal("motif", NULL);
 chr = optionVal("chr", NULL);
 strand = optionVal("strand", NULL);
+misMatch = optionInt("misMatch", misMatch);
 bedOutputOpt = optionExists("bedOutput");
 wigOutput = optionExists("wigOutput");
 
 if (wigOutput)
     bedOutputOpt = FALSE;
 else
     bedOutputOpt = TRUE;
 
+motifLen = strlen(motif);
+
 if (chr)
     verbose(2, "#\tprocessing chr: %s\n", chr);
 if (strand)
     verbose(2, "#\tprocessing strand: '%s'\n", strand);
 if (motif)
-    verbose(2, "#\tsearching for motif: %s\n", motif);
-else {
+    verbose(2, "#\tsearching for motif: %s, length: %d\n", motif, motifLen);
+else
+    {
     warn("ERROR: -motif string empty, please specify a motif\n");
     usage();
     }
+if (misMatch)
+    verbose(2, "#\tallow mismatch %d bases\n", misMatch);
 verbose(2, "#\ttype output: %s\n", wigOutput ? "wiggle data" : "bed format");
 verbose(2, "#\tspecified sequence: %s\n", argv[1]);
 verbose(2, "#\tsizeof(motifVal): %d\n", (int)sizeof(motifVal));
 if (strand)
     {
     if (! (sameString(strand,"+") | sameString(strand,"-")))
 	{
 	warn("ERROR: -strand specified ('%s') is not + or - ?\n", strand);
 	usage();
 	}
     /*	They are both on by default, turn off the one not specified */
     if (sameString(strand,"-"))
 	doPlusStrand = FALSE;
     if (sameString(strand,"+"))
 	doMinusStrand = FALSE;
     }
-motifLen = strlen(motif);
 /*	at two bits per character, size limit of motif is
- *	number of bits in motifVal / 2
+ *	(number of bits in motifVal) / 2
  */
-if (motifLen > (4*sizeof(motifVal))/2 )
+if (motifLen > (bitsPerLongLong/2) )
+    {
+    warn("ERROR: motif string '%s'\n\ttoo long at %d characters, limit %d\n", motif, motifLen, (int)(bitsPerLongLong/2));
+    usage();
+    }
+if (motifLen < lowerLimitMotifLength)
     {
-    warn("ERROR: motif string too long, limit %d\n", (4*(int)sizeof(motifVal))/2 );
+    warn("ERROR: motif string must be at least %d characters\n", lowerLimitMotifLength);
     usage();
     }
+/* convert the motif string into 2 bits per base into the motifVal */
 cp = motif;
 motifVal = 0;
 complementVal = 0;
 for (i = 0; i < motifLen; ++i)
     {
 	switch (*cp)
 	{
 	case 'a':
 	case 'A':
-	    motifVal = (motifVal << 2) | A_BASE_VAL;
-	    complementVal = (complementVal << 2) | T_BASE_VAL;
+	    motifVal = (motifVal << 2) | A_BASE;
+	    complementVal = (complementVal << 2) | T_BASE;
 	    break;
 	case 'c':
 	case 'C':
-	    motifVal = (motifVal << 2) | C_BASE_VAL;
-	    complementVal = (complementVal << 2) | G_BASE_VAL;
+	    motifVal = (motifVal << 2) | C_BASE;
+	    complementVal = (complementVal << 2) | G_BASE;
 	    break;
 	case 'g':
 	case 'G':
-	    motifVal = (motifVal << 2) | G_BASE_VAL;
-	    complementVal = (complementVal << 2) | C_BASE_VAL;
+	    motifVal = (motifVal << 2) | G_BASE;
+	    complementVal = (complementVal << 2) | C_BASE;
 	    break;
 	case 't':
 	case 'T':
-	    motifVal = (motifVal << 2) | T_BASE_VAL;
-	    complementVal = (complementVal << 2) | A_BASE_VAL;
+	    motifVal = (motifVal << 2) | T_BASE;
+	    complementVal = (complementVal << 2) | A_BASE;
 	    break;
 	default:
-	    warn(
-		"ERROR: character in motif: '%c' is not one of ACGT\n", *cp);
+	    warn("ERROR: character in motif: '%c' is not one of ACGT\n", *cp);
 	    usage();
 	}
 	++cp;
     }
+char needleString[33];
+valToString(needleString, motifVal, motifLen);
+verbose(2, "#\tmotif: '%s' length %d\n", needleString, motifLen);
+/* and swap the complement value to get it reversed */
 reversed = 0;
 for (i = 0; i < motifLen; ++i)
     {
     int base;
     base = complementVal & 3;
     reversed = (reversed << 2) | base;
     complementVal >>= 2;
     }
 complementVal = reversed;
-verbose(2, "#\tmotif numerical value: %llu (%#llx)\n", motifVal, motifVal);
-verbose(2, "#\tcomplement numerical value: %llu (%#llx)\n", complementVal, complementVal);
-if (motifLen < 5)
-    {
-    warn("ERROR: motif string must be more than 4 characters\n");
-    usage();
-    }
+valToString(needleString, complementVal, motifLen);
+verbose(2, "#\trever: '%s' length %d\n", needleString, motifLen);
+verbose(3, "#\t     motif numerical value: %llu (%#llx)\n", motifVal, motifVal);
+verbose(3, "#\tcomplement numerical value: %llu (%#llx)\n", complementVal, complementVal);
 
 findMotif(argv[1]);
 return 0;
 }