94234c062dc26dd2ee4fe2b381ca11486848c601 mmaddren Wed May 21 16:09:32 2014 -0700 more or less final methylateGenome, and added a new tool analyzeCpG which currently clusters CpG regions in a bed file diff --git src/utils/methylateGenome/methylateGenome.c src/utils/methylateGenome/methylateGenome.c index 22cddda..49876db 100644 --- src/utils/methylateGenome/methylateGenome.c +++ src/utils/methylateGenome/methylateGenome.c @@ -1,79 +1,188 @@ /* methylateGenome - Creates a methylated version of an input genome, in which any occurance of CG becomes TG. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dnautil.h" #include "dnaLoad.h" #include "dnaseq.h" #include "fa.h" void usage() /* Explain usage and exit. */ { errAbort( "methylateGenome - Creates a methylated version of a genome, in which any occurance of CG becomes TG\n" "usage:\n" - " methylateGenome input output.fa\n" + " methylateGenome input.[fa|2bit] outputPrefix\n" "options:\n" " -xxx=XXX\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; -void methylateGenome(char *fileName, char *outputName) -/* methylateGenome - Creates a methylated version of an input genome, in which any occurance of CG becomes TG. */ +void old(struct dnaSeq *seq) +{ +int end = seq->size - 1; +int i; +for (i = 0; i < end; ++i) + { + char b1 = seq->dna[i]; + int v1 = ntVal[(unsigned)b1]; + int v2 = ntVal[(unsigned)seq->dna[i+1]]; + + if (v1 == C_BASE_VAL && v2 == G_BASE_VAL) + { + if (b1 == 'C') + seq->dna[i] = 'T'; + else + seq->dna[i] = 't'; + } + } +} + +void forwardStrandNoMethylation(struct dnaSeq *seq) +{ +int end = seq->size - 1; +int i; +for (i = 0; i < end; ++i) + { + if (seq->dna[i] == 'C') + seq->dna[i] = 'T'; + else if (seq->dna[i] == 'c') + seq->dna[i] = 't'; + } +} + +void reverseStrandNoMethylation(struct dnaSeq *seq) +{ +int end = seq->size - 1; +int i; +for (i = 0; i < end; ++i) + { + if (seq->dna[i] == 'G') + seq->dna[i] = 'A'; + else if (seq->dna[i] == 'g') + seq->dna[i] = 'a'; + } +} + +void forwardStrandWithMethylation(struct dnaSeq *seq) +{ +int end = seq->size - 1; +int i; +for (i = 0; i < end; ++i) + { + char b1 = seq->dna[i]; + int v1 = ntVal[(unsigned)b1]; + int v2 = ntVal[(unsigned)seq->dna[i+1]]; + + if (v1 == C_BASE_VAL && v2 != G_BASE_VAL) + { + if (b1 == 'C') + seq->dna[i] = 'T'; + else if (b1 == 'c') + seq->dna[i] = 't'; + } + } +} + +void reverseStrandWithMethylation(struct dnaSeq *seq) +{ +int end = seq->size - 1; +int i; +for (i = 0; i < end; ++i) + { + char b1 = seq->dna[i]; + int v1 = ntVal[(unsigned)b1]; + int v2 = ntVal[(unsigned)seq->dna[i+1]]; + + if (v1 == G_BASE_VAL && v2 != C_BASE_VAL) + { + if (b1 == 'G') + seq->dna[i] = 'A'; + else if (b1 == 'g') + seq->dna[i] = 'a'; + } + } +} + +void performConversion(char *fileName, char *outputName, void (*replacementPolicy)(struct dnaSeq *)) { // Open input and output files struct dnaLoad *dl = dnaLoadOpen(fileName); FILE *f = mustOpen(outputName, "w"); // Loop over every line in the input file... struct dnaSeq *seq = NULL; for (;;) { // Take every line in the file, break when we're done seq = dnaLoadNext(dl); if (seq == NULL) break; + replacementPolicy(seq); + // Replace all 'CG' (or 'cg') with 'TG' (or 'tg') - int end = seq->size - 1; + /*int end = seq->size - 1; int i; for (i = 0; i < end; ++i) { char b1 = seq->dna[i]; int v1 = ntVal[(unsigned)b1]; int v2 = ntVal[(unsigned)seq->dna[i+1]]; if (v1 == C_BASE_VAL && v2 == G_BASE_VAL) { if (b1 == 'C') seq->dna[i] = 'T'; else seq->dna[i] = 't'; } - } + }*/ // Write out the modified line faWriteNext(f, seq->name, seq->dna, seq->size); } if (fclose(f) != 0) errnoAbort("fclose failed"); } +void methylateGenome(char *fileName, char *outputPrefix) +/* methylateGenome - Creates a methylated version of an input genome, in which any occurance of CG becomes TG. */ +{ +int nameSize = 64; + +char forwardNoMethylName[nameSize]; +safef(forwardNoMethylName, nameSize, "%s_forwardNoMethyl.fa", outputPrefix); +performConversion(fileName, forwardNoMethylName, &forwardStrandNoMethylation); + +char reverseNoMethylName[nameSize]; +safef(reverseNoMethylName, nameSize, "%s_reverseNoMethyl.fa", outputPrefix); +performConversion(fileName, reverseNoMethylName, &reverseStrandNoMethylation); + +char forwardMethylName[nameSize]; +safef(forwardMethylName, nameSize, "%s_forwardMethyl.fa", outputPrefix); +performConversion(fileName, forwardMethylName, &forwardStrandWithMethylation); + +char reverseMethylName[nameSize]; +safef(reverseMethylName, nameSize, "%s_reverseMethyl.fa", outputPrefix); +performConversion(fileName, reverseMethylName, &reverseStrandWithMethylation); +} + int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); dnaUtilOpen(); methylateGenome(argv[1], argv[2]); return 0; }