94234c062dc26dd2ee4fe2b381ca11486848c601
mmaddren
  Wed May 21 16:09:32 2014 -0700
more or less final methylateGenome, and added a new tool analyzeCpG which currently clusters CpG regions in a bed file
diff --git src/utils/methylateGenome/methylateGenome.c src/utils/methylateGenome/methylateGenome.c
index 22cddda..49876db 100644
--- src/utils/methylateGenome/methylateGenome.c
+++ src/utils/methylateGenome/methylateGenome.c
@@ -2,78 +2,187 @@
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dnautil.h"
 #include "dnaLoad.h"
 #include "dnaseq.h"
 #include "fa.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "methylateGenome - Creates a methylated version of a genome, in which any occurance of CG becomes TG\n"
   "usage:\n"
-  "   methylateGenome input output.fa\n"
+  "   methylateGenome input.[fa|2bit] outputPrefix\n"
   "options:\n"
   "   -xxx=XXX\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
-void methylateGenome(char *fileName, char *outputName)
-/* methylateGenome - Creates a methylated version of an input genome, in which any occurance of CG becomes TG. */
+void old(struct dnaSeq *seq)
+{
+int end = seq->size - 1;
+int i;
+for (i = 0; i < end; ++i)
+    {
+    char b1 = seq->dna[i];
+    int v1 = ntVal[(unsigned)b1];
+    int v2 = ntVal[(unsigned)seq->dna[i+1]];
+    
+    if (v1 == C_BASE_VAL && v2 == G_BASE_VAL)
+        {
+        if (b1 == 'C')
+            seq->dna[i] = 'T';
+        else
+            seq->dna[i] = 't';
+        }
+    }
+}
+
+void forwardStrandNoMethylation(struct dnaSeq *seq)
+{
+int end = seq->size - 1;
+int i;
+for (i = 0; i < end; ++i)
+    {
+    if (seq->dna[i] == 'C')
+        seq->dna[i] = 'T';
+    else if (seq->dna[i] == 'c')
+        seq->dna[i] = 't';
+    }
+}
+
+void reverseStrandNoMethylation(struct dnaSeq *seq)
+{
+int end = seq->size - 1;
+int i;
+for (i = 0; i < end; ++i)
+    {
+    if (seq->dna[i] == 'G')
+        seq->dna[i] = 'A';
+    else if (seq->dna[i] == 'g')
+        seq->dna[i] = 'a';
+    }
+}
+
+void forwardStrandWithMethylation(struct dnaSeq *seq)
+{
+int end = seq->size - 1;
+int i;
+for (i = 0; i < end; ++i)
+    {
+    char b1 = seq->dna[i];
+    int v1 = ntVal[(unsigned)b1];
+    int v2 = ntVal[(unsigned)seq->dna[i+1]];
+    
+    if (v1 == C_BASE_VAL && v2 != G_BASE_VAL)
+        {
+            if (b1 == 'C')
+                seq->dna[i] = 'T';
+            else if (b1 == 'c')
+                seq->dna[i] = 't';
+        }
+    }
+}
+
+void reverseStrandWithMethylation(struct dnaSeq *seq)
+{
+int end = seq->size - 1;
+int i;
+for (i = 0; i < end; ++i)
+    {
+    char b1 = seq->dna[i];
+    int v1 = ntVal[(unsigned)b1];
+    int v2 = ntVal[(unsigned)seq->dna[i+1]];
+    
+    if (v1 == G_BASE_VAL && v2 != C_BASE_VAL)
+        {
+            if (b1 == 'G')
+                seq->dna[i] = 'A';
+            else if (b1 == 'g')
+                seq->dna[i] = 'a';
+        }
+    }
+}
+
+void performConversion(char *fileName, char *outputName, void (*replacementPolicy)(struct dnaSeq *))
 {
 // Open input and output files
 struct dnaLoad *dl = dnaLoadOpen(fileName);
 FILE *f = mustOpen(outputName, "w");
 
 // Loop over every line in the input file...
 struct dnaSeq *seq = NULL;
 for (;;)
     {
     // Take every line in the file, break when we're done
     seq = dnaLoadNext(dl);
     if (seq == NULL)
         break;
       
+    replacementPolicy(seq);
+      
     // Replace all 'CG' (or 'cg') with 'TG' (or 'tg')
-    int end = seq->size - 1;
+    /*int end = seq->size - 1;
     int i;
     for (i = 0; i < end; ++i)
         {
         char b1 = seq->dna[i];
         int v1 = ntVal[(unsigned)b1];
         int v2 = ntVal[(unsigned)seq->dna[i+1]];
         
         if (v1 == C_BASE_VAL && v2 == G_BASE_VAL)
             {
             if (b1 == 'C')
                 seq->dna[i] = 'T';
             else
                 seq->dna[i] = 't';
             }
-        }
+        }*/
     
     // Write out the modified line
     faWriteNext(f, seq->name, seq->dna, seq->size);
     }
     
 if (fclose(f) != 0)
     errnoAbort("fclose failed");
 }
 
+void methylateGenome(char *fileName, char *outputPrefix)
+/* methylateGenome - Creates a methylated version of an input genome, in which any occurance of CG becomes TG. */
+{
+int nameSize = 64;
+
+char forwardNoMethylName[nameSize];
+safef(forwardNoMethylName, nameSize, "%s_forwardNoMethyl.fa", outputPrefix);
+performConversion(fileName, forwardNoMethylName, &forwardStrandNoMethylation);
+
+char reverseNoMethylName[nameSize];
+safef(reverseNoMethylName, nameSize, "%s_reverseNoMethyl.fa", outputPrefix);
+performConversion(fileName, reverseNoMethylName, &reverseStrandNoMethylation);
+
+char forwardMethylName[nameSize];
+safef(forwardMethylName, nameSize, "%s_forwardMethyl.fa", outputPrefix);
+performConversion(fileName, forwardMethylName, &forwardStrandWithMethylation);
+
+char reverseMethylName[nameSize];
+safef(reverseMethylName, nameSize, "%s_reverseMethyl.fa", outputPrefix);
+performConversion(fileName, reverseMethylName, &reverseStrandWithMethylation);
+}
+
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 dnaUtilOpen();
 methylateGenome(argv[1], argv[2]);
 return 0;
 }