src/hg/utils/mutationClassifier/mutationClassifier.c 1.1

1.1 2010/01/30 20:44:09 larrym
first pass; currently only handles coding SNPs; still working on how to output mutation classifications
Index: src/hg/utils/mutationClassifier/mutationClassifier.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/mutationClassifier/mutationClassifier.c,v
retrieving revision 1.3
retrieving revision 1.1
diff -b -B -U 4 -r1.3 -r1.1
--- src/hg/utils/mutationClassifier/mutationClassifier.c	2 Feb 2010 09:08:36 -0000	1.3
+++ src/hg/utils/mutationClassifier/mutationClassifier.c	30 Jan 2010 20:44:09 -0000	1.1
@@ -13,21 +13,8 @@
 static int debug = 0;
 static char *database;
 static struct hash *geneHash = NULL;
 
-#define SPLICE_SITE "spliceSite"
-#define MISSENSE "missense"
-#define READ_THROUGH "readThrough"
-#define NONSENSE "nonsense"
-#define NONSENSE_LAST_EXON "nonsenseLastExon"
-#define SYNONYMOUS "synonymous"
-#define IN_FRAME_DEL "inFrameDel"
-#define IN_FRAME_INS "inFrameIns"
-#define FRAME_SHIFT_DEL "frameShiftDel"
-#define FRAME_SHIFT_INS "frameShiftIns"
-#define THREE_PRIME_UTR "threePrimeUtr"
-#define FIVE_PRIME_UTR "fivePrimeUtr"
-
 static struct optionSpec optionSpecs[] = {
     {NULL, 0}
 };
 
@@ -35,10 +22,10 @@
 /* A five field bed. */
     {
     struct bed6 *next;
     char *chrom;	/* Allocated in hash. */
-    unsigned start;	/* Start (0 based) */
-    unsigned end;	/* End (non-inclusive) */
+    int start;		/* Start (0 based) */
+    int end;		/* End (non-inclusive) */
     char *name;	/* Name of item */
     int score; /* Score - 0-1000 */
     char strand;
     };
@@ -60,34 +47,25 @@
 errAbort(
   "mutationClassifier - classify mutations \n"
   "usage:\n"
   "   mutationClassifier database snp.bed\n"
-  "Classifies SNPs and indels which are in coding regions of UCSC\n"
+  "Classifies SNPs which are in coding regions of UCSC\n"
   "canononical genes as synonymous or non-synonymous.\n"
   "Prints bed4 for identified SNPs; name field contains the codon transformation.\n"
   "Standard single character amino acid codes are used; '*' == stop codon.\n"
   "output is bed4+ with classification code in the name field, and additonal\n"
   "annotations in subsequent fields.\n\n"
   "Mutations are classified with the following codes:\n"
-  "%s\n"
-  "%s\n"
-  "%s\n"
-  "%s\n"
-  "%s\n"
-  "%s\n"
-  "%s\n"
-  "%s\n"
-  "%s\n"
-  "%s\n"
-  "\n"
+  "J:   mutation in the splice donor/acceptor sites (2 bases at beginning and ending of introns)\n"
+  "F:   frame shift in CDS\n"
+  "N:   non-synonymous SNP (nonsense or missense)\n"
+  "S:   synonymous SNP\n"
+  "?:   non-frame shift indel\n\n"
   "snp.bed should be bed4 (with SNP base in the name field).\n"
   "SNPs are assumed to be on positive strand, unless snp.bed is bed6 with\n"
-  "explicit strand field (score field is ignored). SNPs should be\n"
-  "zero-length bed entries; all entries are assumed to be indels.\n"
-  "\n"
+  "explicit strand field (score field is ignored).\n\n"
   "example:\n"
-  "     mutationClassifier hg19 snp.bed\n",
-  SPLICE_SITE, MISSENSE, READ_THROUGH, NONSENSE, NONSENSE_LAST_EXON, SYNONYMOUS, IN_FRAME_DEL, IN_FRAME_INS, FRAME_SHIFT_DEL, FRAME_SHIFT_INS
+  "     mutationClassifier hg19 snp.bed\n"
   );
 }
 
 static struct bed *shallowBedCopy(struct bed *bed)
@@ -138,10 +116,10 @@
 
 static int bedItemsOverlap(struct bed *a, struct genePred *b)
 {
      return (!strcmp(a->chrom, b->chrom) &&
-	     a->chromEnd > b->txStart &&
-	     a->chromStart <= b->txEnd);
+	     a->chromEnd > b->cdsStart &&
+	     a->chromStart <= b->cdsEnd);
 }
 
 static int intersectBeds (struct bed *a, struct genePred *b,
                           struct bed **aCommon, struct genePredStub **bCommon)
@@ -324,16 +302,14 @@
     if(debug)
         printf("%s - %d: %s\n", gp->name2, (int) strlen(gp->name2), gp->name2);
 }
 
-static int transformPos(struct genePred *gp, unsigned pos, boolean *lastExon)
+static int transformPos(struct genePred *gp, unsigned pos)
 {
 // transformPos chrom:chromStart coordinates to relative CDS coordinates
 // returns -1 if pos is NOT within the CDS
 
 int i, delta = 0;
-boolean reverse = gp->strand[0] == '-';
-
 if(gp->name2 == NULL)
     {
     clipGenPred(gp);
     }
@@ -348,9 +324,8 @@
         pos = delta + pos - gp->exonStarts[i];
         if(gp->strand[0] == '-')
             pos = gp->score - pos - 1;
         // assert(pos >= 0 && pos < strlen(gp->name2));
-        *lastExon = reverse ? i == 0 : (i + 1) == gp->exonCount;
         return pos;
         }
     delta += gp->exonEnds[i] - gp->exonStarts[i];
     }
@@ -405,26 +380,32 @@
 // reading unmasked file is much faster - why?
 // sprintf(retNibName, "/hive/data/genomes/hg19/hg19.2bit");
 for(;overlapA != NULL; overlapA = overlapA->next, overlapB = overlapB->next)
     {
-    struct genePred *gp = overlapB->genePred;
+    struct genePred *genePred = overlapB->genePred;
     char *code = NULL;
     char additional[256];
     additional[0] = 0;
-    // boolean reverse = !strcmp(overlapA->strand, "-");
-    boolean lastExon;
-
-    int pos = transformPos(gp, overlapA->chromStart, &lastExon);
+    boolean reverse = !strcmp(overlapA->name, "-");
+    int pos = transformPos(genePred, overlapA->chromStart);
     if(pos >= 0)
         {
-        int len = strlen(overlapA->name);
-        if(len > 1 || (overlapA->chromEnd - overlapA->chromStart))
+        if(reverse)
+            {
+            // deletion
+            if((overlapA->chromEnd - overlapA->chromStart) % 3)
+                code = "F";
+            else
+                code = "?";
+            }
+        else if(strlen(overlapA->name) > 1)
             {
-            int delta = len - (overlapA->chromEnd - overlapA->chromStart);
+            // insertion
+            int delta = strlen(overlapA->name) - (overlapA->chromEnd - overlapA->chromStart);
             if(delta % 3)
-                code = FRAME_SHIFT_INS;
+                code = "F";
             else
-                code = IN_FRAME_INS;
+                code = "?";
             }
         else
             {
             unsigned codonStart;
@@ -435,78 +416,60 @@
             else
                 codonStart = pos - 2;
             char original[4];
             char new[4];
-            strncpy(original, gp->name2 + codonStart, 3);
-            strncpy(new, gp->name2 + codonStart, 3);
+            strncpy(original, genePred->name2 + codonStart, 3);
+            strncpy(new, genePred->name2 + codonStart, 3);
             original[3] = new[3] = 0;
             new[pos % 3] = overlapA->name[0];
-            if(gp->strand[0] == '-')
+            if(genePred->strand[0] == '-')
                 complement(new + (pos % 3), 1);
             AA originalAA = lookupCodon(original);
             AA newAA = lookupCodon(new);
             if(!originalAA)
                 originalAA = '*';
-            if(newAA)
-                {
-                code = originalAA == newAA ? SYNONYMOUS : originalAA == '*' ? READ_THROUGH : MISSENSE;
-                }
-            else
-                {
+            if(!newAA)
                 newAA = '*';
-                code = lastExon ? NONSENSE_LAST_EXON : NONSENSE;
-                }
             if(debug)
                 fprintf(stderr, "original: %s:%c; new: %s:%c\n", original, originalAA, new, newAA);
+            // add some synonymous vs. non-synonymous code?
+            code = originalAA == newAA ? "S" : "N";
             safef(additional, sizeof(additional), "%c>%c", originalAA, newAA);
             if(debug)
                 fprintf(stderr, "mismatch at %s:%d; %d; %c => %c\n", overlapA->chrom, overlapA->chromStart, pos, originalAA, newAA);
             }
         }
     else
         {
-        boolean reverse = gp->strand[0] == '-';
-        if(overlapA->chromStart < gp->cdsStart)
-            {
-            code = reverse ? THREE_PRIME_UTR : FIVE_PRIME_UTR;
-            }
-        else if(overlapA->chromStart >= gp->cdsEnd)
-            {
-            code = reverse ? FIVE_PRIME_UTR : THREE_PRIME_UTR;
-            }
-        else
-            {
             // In intro, so check for interuption of splice junction (special case first and last exon).
             int i;
-            for(i=0;i<gp->exonCount;i++)
-                {
-                int start = gp->exonStarts[i] - overlapA->chromStart;
-                int end   = overlapA->chromEnd - gp->exonEnds[i];
-                // XXXX I still think this isn't quite right (not sure if we s/d use chromEnd or chromStart).
-                if (i == 0)
-                    {
-                    if(end == 1 || end == 2)
-                        code = SPLICE_SITE;
-                    }
-                else if (i == (gp->exonCount - 1))
+        for(i=0;i<genePred->exonCount;i++)
                     {
-                    if(start == 1 || start == 2)
-                        code = SPLICE_SITE;
-                    }
-                else if ((start == 1 || start == 2) || (end == 1 || end == 2))
-                    code = SPLICE_SITE;
-                }
+            int start = genePred->exonStarts[i] - overlapA->chromStart;
+            int end   = overlapA->chromEnd - genePred->exonEnds[i];
+            if (i == 0 && (end > 0 && end <= 2))
+                code = "J";
+            else if ((i == (genePred->exonCount - 1)) && (start > 0 && start <= 2))
+                code = "J";
+            else if ((start > 0 && start <= 2) || (end > 0 && end <= 2))
+                code = "J";
+            if(i > 0 && (overlapA->chromEnd + 1) >= genePred->exonStarts[i])
+                // mutation at end of intron
+                code = "J";
+            else if(i < (genePred->exonCount - 1) && (overlapA->chromStart - 1) <= genePred->exonEnds[i])
+                // mutation at beginning of intron
+                code = "J";
             }
         }
     if(code)
         {
-        char *geneSymbol = gp->name;
+        char *geneSymbol = genePred->name;
         struct hashEl *el;
         if((el = hashLookup(geneHash, geneSymbol)))
             {
             geneSymbol = (char *) el->val;
             }
-        printf("%s\t%d\t%d\t%s\t%s\t%s\n", overlapA->chrom, overlapA->chromStart, overlapA->chromEnd, geneSymbol, code, additional);
+        printf("%s\t%d\t%d\t%s\t%s\t%s\n", overlapA->chrom, overlapA->chromStart, overlapA->chromStart + 1, code, additional, geneSymbol);
         }
     }
 return 0;
 }