src/lib/iupac.c b50a4586bd9b4cf56a85705bd1b00b415803b5ab

b50a4586bd9b4cf56a85705bd1b00b415803b5ab
kent
  Fri May 11 07:21:49 2012 -0700
Adding support for IUPAC nucleotide ambiguity codes to 'short match' track.  Made iupac.c library module.
diff --git src/lib/iupac.c src/lib/iupac.c
new file mode 100644
index 0000000..6dd4402
--- /dev/null
+++ src/lib/iupac.c
@@ -0,0 +1,213 @@
+/* iupac - routines to help cope with IUPAC ambiguity codes in DNA sequence. */
+
+#include "common.h"
+#include "dnautil.h"
+#include "iupac.h"
+
+boolean iupacMatchLower(char iupac, char dna)
+/* See if iupac ambiguity code matches dna character where
+ * both are lower case */
+{
+switch (iupac)
+    {
+    case 'a':
+        return dna == 'a';
+    case 'c':
+        return dna == 'c';
+    case 'g':
+        return dna == 'g';
+    case 't':
+    case 'u':
+        return dna == 't';
+    case 'r':
+        return dna == 'a' || dna == 'g';
+    case 'y':
+        return dna == 'c' || dna == 't';
+    case 's':
+        return dna == 'g' || dna == 'c';
+    case 'w':
+        return dna == 'a' || dna == 't';
+    case 'k':
+        return dna == 'g' || dna == 't';
+    case 'm':
+        return dna == 'a' || dna == 'c';
+    case 'b':
+        return dna == 'c' || dna == 'g' || dna == 't';
+    case 'd':
+        return dna == 'a' || dna == 'g' || dna == 't';
+    case 'h':
+        return dna == 'a' || dna == 'c' || dna == 't';
+    case 'v':
+        return dna == 'a' || dna == 'c' || dna == 'g';
+    case 'n':
+        return TRUE;
+    default:
+        errAbort("Unrecognized IUPAC code '%c'", iupac);
+	return FALSE;   // Not actually used but prevent compiler complaints
+    }
+}
+
+boolean iupacMatch(char iupac, char dna)
+/* See if iupac ambiguity code matches dna character */
+{
+return iupacMatchLower(tolower(iupac), tolower(dna));
+}
+
+boolean isIupacLower(char c)
+/* See if iupac c is a legal (lower case) iupac char */
+{
+switch (c)
+    {
+    case 'a':
+    case 'c':
+    case 'g':
+    case 't':
+    case 'u':
+    case 'r':
+    case 'y':
+    case 's':
+    case 'w':
+    case 'k':
+    case 'm':
+    case 'b':
+    case 'd':
+    case 'h':
+    case 'v':
+    case 'n':
+        return TRUE;
+    default:
+	return FALSE;
+    }
+}
+
+boolean isIupac(char c)
+/* See if iupac c is a legal iupac char */
+{
+return isIupacLower(tolower(c));
+}
+
+void iupacFilter(char *in, char *out)
+/* Filter out non-DNA non-UIPAC ambiguity code characters and change to lower case. */
+{
+char c;
+while ((c = *in++) != 0)
+    {
+    c = tolower(c);
+    if (isIupacLower(c))
+       *out++ = c;
+    }
+*out++ = 0;
+}
+
+boolean anyIupac(char *s)
+/* Return TRUE if there are any IUPAC ambiguity codes in s */
+{
+dnaUtilOpen();
+int c;
+while ((c = *s++) != 0)
+    {
+    switch (c)
+	{
+	case 'r':
+	case 'y':
+	case 's':
+	case 'w':
+	case 'k':
+	case 'm':
+	case 'b':
+	case 'd':
+	case 'h':
+	case 'v':
+	case 'n':
+	    return TRUE;
+	}
+    }
+return FALSE;
+}
+
+char iupacComplementBaseLower(char iupac)
+/* Return IUPAC complement for a single base */
+{
+switch (iupac)
+    {
+    case 'a':
+        return 't';
+    case 'c':
+        return 'g';
+    case 'g':
+        return 'c';
+    case 't':
+    case 'u':
+        return 'a';
+    case 'r':
+	return 'y';
+    case 'y':
+	return 'r';
+    case 's':
+	return 's';
+    case 'w':
+	return 'w';
+    case 'k':
+	return 'm';
+    case 'm':
+	return 'k';
+    case 'b':
+	return 'v';
+    case 'd':
+	return 'h';
+    case 'h':
+	return 'd';
+    case 'v':
+	return 'b';
+    case 'n':
+	return 'n';
+    default:
+        errAbort("Unrecognized IUPAC code '%c'", iupac);
+	return 0;   // Just to keep compiler from complaining, control won't reach here.
+    }
+}
+
+void iupacComplementLower(char *iupac, int iuSize)
+/* Return IUPAC complement many bases. Assumes iupac is lower case. */
+{
+int i;
+for (i=0; i<iuSize; ++i)
+    iupac[i] = iupacComplementBaseLower(iupac[i]);
+}
+
+void iupacReverseComplement(char *iu, int iuSize)
+/* Reverse complement a string containing DNA and IUPAC codes. Result will be always
+ * lower case. */
+{
+toLowerN(iu, iuSize);
+reverseBytes(iu, iuSize);
+iupacComplementLower(iu, iuSize);
+}
+
+boolean iupacMatchStart(char *iupacPrefix, char *dnaString)
+/* Return TRUE if start of DNA is compatible with iupac */
+{
+char iupac;
+while ((iupac = *iupacPrefix++) != 0)
+    {
+    if (!iupacMatch(iupac, *dnaString++))
+        return FALSE;
+    }
+return TRUE;
+}
+
+char *iupacIn(char *needle, char *haystack)
+/* Return first place in haystack (DNA) that matches needle that may contain IUPAC codes. */
+{
+int needleSize = strlen(needle);
+int haySize = strlen(haystack);
+char *endOfHay = haystack + haySize - needleSize;
+char *h;
+for (h = haystack; h<=endOfHay; ++h)
+    {
+    if (iupacMatchStart(needle, h))
+        return h;
+    }
+return NULL;
+}
+