src/hg/utils/mutationClassifier/mutationClassifier.c 1.4
1.4 2010/02/07 19:53:31 larrym
code cleanup per jim's feedback
Index: src/hg/utils/mutationClassifier/mutationClassifier.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/mutationClassifier/mutationClassifier.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 4 -r1.3 -r1.4
--- src/hg/utils/mutationClassifier/mutationClassifier.c 2 Feb 2010 09:08:36 -0000 1.3
+++ src/hg/utils/mutationClassifier/mutationClassifier.c 7 Feb 2010 19:53:31 -0000 1.4
@@ -10,9 +10,8 @@
#include "assert.h"
#include "hash.h"
static int debug = 0;
-static char *database;
static struct hash *geneHash = NULL;
#define SPLICE_SITE "spliceSite"
#define MISSENSE "missense"
@@ -165,25 +164,25 @@
struct genePred *curB = b;
struct genePred *savedB = NULL;
struct genePredStub *lastAddedB = NULL;
-for(curA = a; curA != NULL && curB != NULL;)
+for (curA = a; curA != NULL && curB != NULL;)
{
- if(debug)
+ if (debug)
{
fprintf(stderr, "A: %s:%d-%d\n", curA->chrom, curA->chromStart, curA->chromEnd);
fprintf(stderr, "B: %s:%d-%d\n", curB->chrom, curB->cdsStart, curB->cdsEnd);
}
- if(bedItemsOverlap(curA, curB))
+ if (bedItemsOverlap(curA, curB))
{
- if(debug)
+ if (debug)
{
fprintf(stderr, "%s:%d-%d", curA->chrom, curA->chromStart, curA->chromEnd);
}
- if(aCommon != NULL)
+ if (aCommon != NULL)
{
struct bed *tmpA = shallowBedCopy(curA);
- if(*aCommon == NULL)
+ if (*aCommon == NULL)
{
*aCommon = tmpA;
}
else
@@ -192,12 +191,12 @@
}
// We put newly allocated bed items at the end of the returned list so they match order in original list
lastAddedA = tmpA;
}
- if(bCommon != NULL)
+ if (bCommon != NULL)
{
struct genePredStub *tmpB = genePredStubCopy(curB);
- if(*bCommon == NULL)
+ if (*bCommon == NULL)
{
*bCommon = tmpB;
}
else
@@ -205,24 +204,24 @@
lastAddedB->next = tmpB;
}
lastAddedB = tmpB;
}
- if(bCommon == NULL)
+ if (bCommon == NULL)
{
// see note at beginning of function
curA = curA->next;
}
else
{
- if(savedB == NULL)
+ if (savedB == NULL)
savedB = curB;
curB = curB->next;
}
count++;
}
else
{
- if(savedB)
+ if (savedB)
{
// curA has matched at least one entry in b; now rewind curB to look for potentially multiple matches
// within b in the next entry from a (see notes in hw1_analyze.h)
curA = curA->next;
@@ -233,9 +232,9 @@
{
int diff = strcmp(curA->chrom, curB->chrom);
if (!diff)
diff = curA->chromStart - curB->cdsStart;
- if(diff < 0)
+ if (diff < 0)
{
curA = curA->next;
}
else
@@ -268,14 +267,14 @@
bed->chrom = cloneString(chrom);
bed->start = start;
bed->end = end;
bed->name = cloneString(row[3]);
- if(wordCount >= 5)
+ if (wordCount >= 5)
bed->score = lineFileNeedNum(lf, row, 4);
- if(wordCount >= 6)
+ if (wordCount >= 6)
{
bed->strand = row[5][0];
- if(bed->strand == '-')
+ if (bed->strand == '-')
// we support this so we can process dbSnp data (which has reverse strand SNPs).
complement(bed->name, strlen(bed->name));
}
slAddHead(&retVal, bed);
@@ -283,9 +282,9 @@
lineFileClose(&lf);
return retVal;
}
-static void clipGenPred(struct genePred *gp)
+static void clipGenPred(char *database, struct genePred *gp)
{
// Clip exonStarts/exonEnds to cdsStart/cdsEnd and then read in the whole DNA for this gene in preparation for a SNP check.
// After this call, exonStarts/exonEnds contain only the exons used for CDS (i.e. some may be removed).
// DNA is put in name2 field; whole dna is read to make it easier to deal with AA's that cross exon junctions.
@@ -294,60 +293,60 @@
unsigned *newStarts = needMem(gp->exonCount * sizeof(unsigned));
unsigned *newEnds = needMem(gp->exonCount * sizeof(unsigned));
int newCount = 0;
gp->name2 = cloneString("");
- for(i=0;i<gp->exonCount;i++)
+ for (i=0;i<gp->exonCount;i++)
{
- if(gp->exonEnds[i] >= gp->cdsStart && gp->exonStarts[i] <= gp->cdsEnd)
+ if (gp->exonEnds[i] >= gp->cdsStart && gp->exonStarts[i] <= gp->cdsEnd)
{
char retNibName[HDB_MAX_PATH_STRING];
newStarts[newCount] = max(gp->exonStarts[i], gp->cdsStart);
newEnds[newCount] = min(gp->exonEnds[i], gp->cdsEnd);
hNibForChrom(database, gp->chrom, retNibName);
struct dnaSeq *dna = hFetchSeqMixed(retNibName, gp->chrom, newStarts[newCount], newEnds[newCount]);
char *newName = needMem(strlen(gp->name2) + strlen(dna->dna) + 1);
sprintf(newName, "%s%s", gp->name2, dna->dna);
- free(gp->name2);
+ freeMem(gp->name2);
gp->name2 = newName;
newCount++;
}
}
gp->exonCount = newCount;
- free(gp->exonStarts);
- free(gp->exonEnds);
+ freeMem(gp->exonStarts);
+ freeMem(gp->exonEnds);
gp->exonStarts = newStarts;
gp->exonEnds = newEnds;
- if(gp->strand[0] == '-')
+ if (gp->strand[0] == '-')
{
reverseComplement(gp->name2, strlen(gp->name2));
}
gp->score = strlen(gp->name2);
- if(debug)
+ if (debug)
printf("%s - %d: %s\n", gp->name2, (int) strlen(gp->name2), gp->name2);
}
-static int transformPos(struct genePred *gp, unsigned pos, boolean *lastExon)
+static int transformPos(char *database, struct genePred *gp, unsigned pos, boolean *lastExon)
{
// transformPos chrom:chromStart coordinates to relative CDS coordinates
// returns -1 if pos is NOT within the CDS
int i, delta = 0;
boolean reverse = gp->strand[0] == '-';
-if(gp->name2 == NULL)
+if (gp->name2 == NULL)
{
- clipGenPred(gp);
+ clipGenPred(database, gp);
}
-for(i=0;i<gp->exonCount;i++)
+for (i=0;i<gp->exonCount;i++)
{
- if(pos <= gp->exonStarts[i])
+ if (pos <= gp->exonStarts[i])
{
return -1;
}
- else if(pos < gp->exonEnds[i])
+ else if (pos < gp->exonEnds[i])
{
pos = delta + pos - gp->exonStarts[i];
- if(gp->strand[0] == '-')
+ if (gp->strand[0] == '-')
pos = gp->score - pos - 1;
// assert(pos >= 0 && pos < strlen(gp->name2));
*lastExon = reverse ? i == 0 : (i + 1) == gp->exonCount;
return pos;
@@ -356,9 +355,9 @@
}
return -1;
}
-struct genePred *readGenes()
+struct genePred *readGenes(char *database)
{
struct genePred *retVal = NULL;
char query[256];
struct sqlResult *sr;
@@ -387,51 +386,48 @@
sqlDisconnect(&conn);
return(retVal);
}
-int main(int argc, char** argv)
+static void mutationClassifier(char *database, char *inputFile)
{
struct bed *overlapA = NULL;
struct genePredStub *overlapB = NULL;
-optionInit(&argc, argv, optionSpecs);
-if(argc < 3)
- usage();
-database = argv[1];
-struct bed6 *snps = readBedFile(argv[2]);
-verbose(2, "Hello: %d SNPs\n", slCount(snps));
-struct genePred *genes = readGenes();
+struct bed6 *snps = readBedFile(inputFile);
+
+verbose(2, " read %d mutations\n", slCount(snps));
+struct genePred *genes = readGenes(database);
verbose(2, "Hello: %d canonical known genes\n", slCount(genes));
int count = intersectBeds((struct bed *) snps, genes, &overlapA, &overlapB);
verbose(2, "number of intersects: %d\n", count);
// reading unmasked file is much faster - why?
// sprintf(retNibName, "/hive/data/genomes/hg19/hg19.2bit");
-for(;overlapA != NULL; overlapA = overlapA->next, overlapB = overlapB->next)
+for (;overlapA != NULL; overlapA = overlapA->next, overlapB = overlapB->next)
{
struct genePred *gp = overlapB->genePred;
char *code = NULL;
char additional[256];
additional[0] = 0;
// boolean reverse = !strcmp(overlapA->strand, "-");
boolean lastExon;
- int pos = transformPos(gp, overlapA->chromStart, &lastExon);
- if(pos >= 0)
+ int pos = transformPos(database, gp, overlapA->chromStart, &lastExon);
+ if (pos >= 0)
{
int len = strlen(overlapA->name);
- if(len > 1 || (overlapA->chromEnd - overlapA->chromStart))
+ if (len > 1 || (overlapA->chromEnd - overlapA->chromStart))
{
int delta = len - (overlapA->chromEnd - overlapA->chromStart);
- if(delta % 3)
+ if (delta % 3)
code = FRAME_SHIFT_INS;
else
code = IN_FRAME_INS;
}
else
{
unsigned codonStart;
- if((pos % 3) == 0)
+ if ((pos % 3) == 0)
codonStart = pos;
- else if((pos % 3) == 1)
+ else if ((pos % 3) == 1)
codonStart = pos - 1;
else
codonStart = pos - 2;
char original[4];
@@ -439,74 +435,82 @@
strncpy(original, gp->name2 + codonStart, 3);
strncpy(new, gp->name2 + codonStart, 3);
original[3] = new[3] = 0;
new[pos % 3] = overlapA->name[0];
- if(gp->strand[0] == '-')
+ if (gp->strand[0] == '-')
complement(new + (pos % 3), 1);
AA originalAA = lookupCodon(original);
AA newAA = lookupCodon(new);
- if(!originalAA)
+ if (!originalAA)
originalAA = '*';
- if(newAA)
+ if (newAA)
{
code = originalAA == newAA ? SYNONYMOUS : originalAA == '*' ? READ_THROUGH : MISSENSE;
}
else
{
newAA = '*';
code = lastExon ? NONSENSE_LAST_EXON : NONSENSE;
}
- if(debug)
+ if (debug)
fprintf(stderr, "original: %s:%c; new: %s:%c\n", original, originalAA, new, newAA);
safef(additional, sizeof(additional), "%c>%c", originalAA, newAA);
- if(debug)
+ if (debug)
fprintf(stderr, "mismatch at %s:%d; %d; %c => %c\n", overlapA->chrom, overlapA->chromStart, pos, originalAA, newAA);
}
}
else
{
boolean reverse = gp->strand[0] == '-';
- if(overlapA->chromStart < gp->cdsStart)
+ if (overlapA->chromStart < gp->cdsStart)
{
code = reverse ? THREE_PRIME_UTR : FIVE_PRIME_UTR;
}
- else if(overlapA->chromStart >= gp->cdsEnd)
+ else if (overlapA->chromStart >= gp->cdsEnd)
{
code = reverse ? FIVE_PRIME_UTR : THREE_PRIME_UTR;
}
else
{
// In intro, so check for interuption of splice junction (special case first and last exon).
int i;
- for(i=0;i<gp->exonCount;i++)
+ for (i=0;i<gp->exonCount;i++)
{
int start = gp->exonStarts[i] - overlapA->chromStart;
int end = overlapA->chromEnd - gp->exonEnds[i];
// XXXX I still think this isn't quite right (not sure if we s/d use chromEnd or chromStart).
if (i == 0)
{
- if(end == 1 || end == 2)
+ if (end == 1 || end == 2)
code = SPLICE_SITE;
}
else if (i == (gp->exonCount - 1))
{
- if(start == 1 || start == 2)
+ if (start == 1 || start == 2)
code = SPLICE_SITE;
}
else if ((start == 1 || start == 2) || (end == 1 || end == 2))
code = SPLICE_SITE;
}
}
}
- if(code)
+ if (code)
{
char *geneSymbol = gp->name;
struct hashEl *el;
- if((el = hashLookup(geneHash, geneSymbol)))
+ if ((el = hashLookup(geneHash, geneSymbol)))
{
geneSymbol = (char *) el->val;
}
printf("%s\t%d\t%d\t%s\t%s\t%s\n", overlapA->chrom, overlapA->chromStart, overlapA->chromEnd, geneSymbol, code, additional);
}
}
+}
+
+int main(int argc, char** argv)
+{
+optionInit(&argc, argv, optionSpecs);
+if (argc < 3)
+ usage();
+mutationClassifier(argv[1], argv[2]);
return 0;
}