src/hg/utils/mutationClassifier/mutationClassifier.c 1.4

1.4 2010/02/07 19:53:31 larrym
code cleanup per jim's feedback
Index: src/hg/utils/mutationClassifier/mutationClassifier.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/mutationClassifier/mutationClassifier.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 4 -r1.3 -r1.4
--- src/hg/utils/mutationClassifier/mutationClassifier.c	2 Feb 2010 09:08:36 -0000	1.3
+++ src/hg/utils/mutationClassifier/mutationClassifier.c	7 Feb 2010 19:53:31 -0000	1.4
@@ -10,9 +10,8 @@
 #include "assert.h"
 #include "hash.h"
 
 static int debug = 0;
-static char *database;
 static struct hash *geneHash = NULL;
 
 #define SPLICE_SITE "spliceSite"
 #define MISSENSE "missense"
@@ -165,25 +164,25 @@
 struct genePred *curB = b;
 struct genePred *savedB = NULL;
 struct genePredStub *lastAddedB = NULL;
 
-for(curA = a; curA != NULL && curB != NULL;)
+for (curA = a; curA != NULL && curB != NULL;)
     {
-    if(debug)
+    if (debug)
         {
         fprintf(stderr, "A: %s:%d-%d\n", curA->chrom, curA->chromStart, curA->chromEnd);
         fprintf(stderr, "B: %s:%d-%d\n", curB->chrom, curB->cdsStart, curB->cdsEnd);
         }
-    if(bedItemsOverlap(curA, curB))
+    if (bedItemsOverlap(curA, curB))
         {
-        if(debug)
+        if (debug)
             {
             fprintf(stderr, "%s:%d-%d", curA->chrom, curA->chromStart, curA->chromEnd);
             }
-        if(aCommon != NULL)
+        if (aCommon != NULL)
             {
             struct bed *tmpA = shallowBedCopy(curA);
-            if(*aCommon == NULL)
+            if (*aCommon == NULL)
                 {
                 *aCommon = tmpA;
                 }
             else
@@ -192,12 +191,12 @@
                 }
             // We put newly allocated bed items at the end of the returned list so they match order in original list
             lastAddedA = tmpA;
             }
-        if(bCommon != NULL)
+        if (bCommon != NULL)
             {
             struct genePredStub *tmpB = genePredStubCopy(curB);
-            if(*bCommon == NULL)
+            if (*bCommon == NULL)
                 {
                 *bCommon = tmpB;
                 }
             else
@@ -205,24 +204,24 @@
                 lastAddedB->next = tmpB;
                 }
             lastAddedB = tmpB;
             }
-        if(bCommon == NULL)
+        if (bCommon == NULL)
             {
             // see note at beginning of function
             curA = curA->next;
             }
         else
             {
-            if(savedB == NULL)
+            if (savedB == NULL)
                 savedB = curB;
             curB = curB->next;
             }
         count++;
         }
     else
         {
-        if(savedB)
+        if (savedB)
             {
             // curA has matched at least one entry in b; now rewind curB to look for potentially multiple matches 
             // within b in the next entry from a (see notes in hw1_analyze.h)
             curA = curA->next;
@@ -233,9 +232,9 @@
             {
             int diff = strcmp(curA->chrom, curB->chrom);
             if (!diff)
                 diff = curA->chromStart - curB->cdsStart;
-            if(diff < 0)
+            if (diff < 0)
                 {
                 curA = curA->next;
                 }
             else
@@ -268,14 +267,14 @@
     bed->chrom = cloneString(chrom);
     bed->start = start;
     bed->end = end;
     bed->name = cloneString(row[3]);
-    if(wordCount >= 5)
+    if (wordCount >= 5)
         bed->score = lineFileNeedNum(lf, row, 4);
-    if(wordCount >= 6)
+    if (wordCount >= 6)
         {
         bed->strand = row[5][0];
-        if(bed->strand == '-')
+        if (bed->strand == '-')
             // we support this so we can process dbSnp data (which has reverse strand SNPs).
             complement(bed->name, strlen(bed->name)); 
         }
     slAddHead(&retVal, bed);
@@ -283,9 +282,9 @@
 lineFileClose(&lf);
 return retVal;
 }
 
-static void clipGenPred(struct genePred *gp)
+static void clipGenPred(char *database, struct genePred *gp)
 {
 // Clip exonStarts/exonEnds to cdsStart/cdsEnd and then read in the whole DNA for this gene in preparation for a SNP check.
 // After this call, exonStarts/exonEnds contain only the exons used for CDS (i.e. some may be removed).
 // DNA is put in name2 field; whole dna is read to make it easier to deal with AA's that cross exon junctions.
@@ -294,60 +293,60 @@
     unsigned *newStarts = needMem(gp->exonCount * sizeof(unsigned));
     unsigned *newEnds = needMem(gp->exonCount * sizeof(unsigned));
     int newCount = 0;
     gp->name2 = cloneString("");
-    for(i=0;i<gp->exonCount;i++)
+    for (i=0;i<gp->exonCount;i++)
         {
-        if(gp->exonEnds[i] >= gp->cdsStart && gp->exonStarts[i] <= gp->cdsEnd)
+        if (gp->exonEnds[i] >= gp->cdsStart && gp->exonStarts[i] <= gp->cdsEnd)
             {
             char retNibName[HDB_MAX_PATH_STRING];
             newStarts[newCount] = max(gp->exonStarts[i], gp->cdsStart);
             newEnds[newCount] = min(gp->exonEnds[i], gp->cdsEnd);
             hNibForChrom(database, gp->chrom, retNibName);
             struct dnaSeq *dna = hFetchSeqMixed(retNibName, gp->chrom, newStarts[newCount], newEnds[newCount]);
             char *newName = needMem(strlen(gp->name2) + strlen(dna->dna) + 1);
             sprintf(newName, "%s%s", gp->name2, dna->dna);
-            free(gp->name2);
+            freeMem(gp->name2);
             gp->name2 = newName;
             newCount++;
             }
         }
     gp->exonCount = newCount;
-    free(gp->exonStarts);
-    free(gp->exonEnds);
+    freeMem(gp->exonStarts);
+    freeMem(gp->exonEnds);
     gp->exonStarts = newStarts;
     gp->exonEnds = newEnds;
-    if(gp->strand[0] == '-')
+    if (gp->strand[0] == '-')
         {
         reverseComplement(gp->name2, strlen(gp->name2));
         }
     gp->score = strlen(gp->name2);
-    if(debug)
+    if (debug)
         printf("%s - %d: %s\n", gp->name2, (int) strlen(gp->name2), gp->name2);
 }
 
-static int transformPos(struct genePred *gp, unsigned pos, boolean *lastExon)
+static int transformPos(char *database, struct genePred *gp, unsigned pos, boolean *lastExon)
 {
 // transformPos chrom:chromStart coordinates to relative CDS coordinates
 // returns -1 if pos is NOT within the CDS
 
 int i, delta = 0;
 boolean reverse = gp->strand[0] == '-';
 
-if(gp->name2 == NULL)
+if (gp->name2 == NULL)
     {
-    clipGenPred(gp);
+    clipGenPred(database, gp);
     }
-for(i=0;i<gp->exonCount;i++)
+for (i=0;i<gp->exonCount;i++)
     {
-    if(pos <= gp->exonStarts[i])
+    if (pos <= gp->exonStarts[i])
         {
         return -1;
         }
-    else if(pos < gp->exonEnds[i])
+    else if (pos < gp->exonEnds[i])
         {
         pos = delta + pos - gp->exonStarts[i];
-        if(gp->strand[0] == '-')
+        if (gp->strand[0] == '-')
             pos = gp->score - pos - 1;
         // assert(pos >= 0 && pos < strlen(gp->name2));
         *lastExon = reverse ? i == 0 : (i + 1) == gp->exonCount;
         return pos;
@@ -356,9 +355,9 @@
     }
 return -1;
 }
 
-struct genePred *readGenes()
+struct genePred *readGenes(char *database)
 {
 struct genePred *retVal = NULL;
 char query[256];
 struct sqlResult *sr;
@@ -387,51 +386,48 @@
 sqlDisconnect(&conn);
 return(retVal);
 }
 
-int main(int argc, char** argv)
+static void mutationClassifier(char *database, char *inputFile)
 {
 struct bed *overlapA = NULL;
 struct genePredStub *overlapB = NULL;
-optionInit(&argc, argv, optionSpecs);
-if(argc < 3)
-    usage();
-database = argv[1];
-struct bed6 *snps = readBedFile(argv[2]);
-verbose(2, "Hello: %d SNPs\n", slCount(snps));
-struct genePred *genes = readGenes();
+struct bed6 *snps = readBedFile(inputFile);
+
+verbose(2, " read %d mutations\n", slCount(snps));
+struct genePred *genes = readGenes(database);
 verbose(2, "Hello: %d canonical known genes\n", slCount(genes));
 int count = intersectBeds((struct bed *) snps, genes, &overlapA, &overlapB);
 verbose(2, "number of intersects: %d\n", count);
 // reading unmasked file is much faster - why?
 // sprintf(retNibName, "/hive/data/genomes/hg19/hg19.2bit");
-for(;overlapA != NULL; overlapA = overlapA->next, overlapB = overlapB->next)
+for (;overlapA != NULL; overlapA = overlapA->next, overlapB = overlapB->next)
     {
     struct genePred *gp = overlapB->genePred;
     char *code = NULL;
     char additional[256];
     additional[0] = 0;
     // boolean reverse = !strcmp(overlapA->strand, "-");
     boolean lastExon;
 
-    int pos = transformPos(gp, overlapA->chromStart, &lastExon);
-    if(pos >= 0)
+    int pos = transformPos(database, gp, overlapA->chromStart, &lastExon);
+    if (pos >= 0)
         {
         int len = strlen(overlapA->name);
-        if(len > 1 || (overlapA->chromEnd - overlapA->chromStart))
+        if (len > 1 || (overlapA->chromEnd - overlapA->chromStart))
             {
             int delta = len - (overlapA->chromEnd - overlapA->chromStart);
-            if(delta % 3)
+            if (delta % 3)
                 code = FRAME_SHIFT_INS;
             else
                 code = IN_FRAME_INS;
             }
         else
             {
             unsigned codonStart;
-            if((pos % 3) == 0)
+            if ((pos % 3) == 0)
                 codonStart = pos;
-            else if((pos % 3) == 1)
+            else if ((pos % 3) == 1)
                 codonStart = pos - 1;
             else
                 codonStart = pos - 2;
             char original[4];
@@ -439,74 +435,82 @@
             strncpy(original, gp->name2 + codonStart, 3);
             strncpy(new, gp->name2 + codonStart, 3);
             original[3] = new[3] = 0;
             new[pos % 3] = overlapA->name[0];
-            if(gp->strand[0] == '-')
+            if (gp->strand[0] == '-')
                 complement(new + (pos % 3), 1);
             AA originalAA = lookupCodon(original);
             AA newAA = lookupCodon(new);
-            if(!originalAA)
+            if (!originalAA)
                 originalAA = '*';
-            if(newAA)
+            if (newAA)
                 {
                 code = originalAA == newAA ? SYNONYMOUS : originalAA == '*' ? READ_THROUGH : MISSENSE;
                 }
             else
                 {
                 newAA = '*';
                 code = lastExon ? NONSENSE_LAST_EXON : NONSENSE;
                 }
-            if(debug)
+            if (debug)
                 fprintf(stderr, "original: %s:%c; new: %s:%c\n", original, originalAA, new, newAA);
             safef(additional, sizeof(additional), "%c>%c", originalAA, newAA);
-            if(debug)
+            if (debug)
                 fprintf(stderr, "mismatch at %s:%d; %d; %c => %c\n", overlapA->chrom, overlapA->chromStart, pos, originalAA, newAA);
             }
         }
     else
         {
         boolean reverse = gp->strand[0] == '-';
-        if(overlapA->chromStart < gp->cdsStart)
+        if (overlapA->chromStart < gp->cdsStart)
             {
             code = reverse ? THREE_PRIME_UTR : FIVE_PRIME_UTR;
             }
-        else if(overlapA->chromStart >= gp->cdsEnd)
+        else if (overlapA->chromStart >= gp->cdsEnd)
             {
             code = reverse ? FIVE_PRIME_UTR : THREE_PRIME_UTR;
             }
         else
             {
             // In intro, so check for interuption of splice junction (special case first and last exon).
             int i;
-            for(i=0;i<gp->exonCount;i++)
+            for (i=0;i<gp->exonCount;i++)
                 {
                 int start = gp->exonStarts[i] - overlapA->chromStart;
                 int end   = overlapA->chromEnd - gp->exonEnds[i];
                 // XXXX I still think this isn't quite right (not sure if we s/d use chromEnd or chromStart).
                 if (i == 0)
                     {
-                    if(end == 1 || end == 2)
+                    if (end == 1 || end == 2)
                         code = SPLICE_SITE;
                     }
                 else if (i == (gp->exonCount - 1))
                     {
-                    if(start == 1 || start == 2)
+                    if (start == 1 || start == 2)
                         code = SPLICE_SITE;
                     }
                 else if ((start == 1 || start == 2) || (end == 1 || end == 2))
                     code = SPLICE_SITE;
                 }
             }
         }
-    if(code)
+    if (code)
         {
         char *geneSymbol = gp->name;
         struct hashEl *el;
-        if((el = hashLookup(geneHash, geneSymbol)))
+        if ((el = hashLookup(geneHash, geneSymbol)))
             {
             geneSymbol = (char *) el->val;
             }
         printf("%s\t%d\t%d\t%s\t%s\t%s\n", overlapA->chrom, overlapA->chromStart, overlapA->chromEnd, geneSymbol, code, additional);
         }
     }
+}
+
+int main(int argc, char** argv)
+{
+optionInit(&argc, argv, optionSpecs);
+if (argc < 3)
+    usage();
+mutationClassifier(argv[1], argv[2]);
 return 0;
 }