50367dd2101774592c5367431d32a7b9ef5c17ab
hiram
  Fri Nov 5 14:53:51 2021 -0700
fixup integer overflow error which eliminated CpG detection in high GC content regions refs #26707

diff --git src/utils/cpgIslandExt/cpg_lh.c src/utils/cpgIslandExt/cpg_lh.c
index 5baa608..1714b77 100644
--- src/utils/cpgIslandExt/cpg_lh.c
+++ src/utils/cpgIslandExt/cpg_lh.c
@@ -1,228 +1,228 @@
 /* Copyright (C) 2012 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 
 /*  Last edited: Jun 23 19:33 1995 (gos) */
 /*  date unknown (LaDeana Hillier) */
 /*  3/23/04 (Angie Hinrichs): use obs/exp from G-G & F '87*/
 /* 
   cpg.c: CpG island finder. Larsen et al Genomics 13 1992 p1095-1107
   "usually defined as >200bp with %GC > 50% and obs/exp CpG >
   0.6". Here use running sum rather than window to score: if not CpG
   at postion i, then decrement runSum counter, but if CpG then runSum
   += CPGSCORE.     Spans > threshold are searched
   for recursively.
   
   filename: fasta format concatenated sequence file: 
 */
 
 /*********************************************************
 
 
 put in options: -p debug to switch on commented out print statements
                 -l length threshold
                 -t score threshold
                 -s to specify score
 		-d score (-ve) if not found : default = -1.
 		
 Really want it to take pattern file:
      Pattern  score_if_found  score_if_not_found
 
 How best to give #pattern hits, %GC in span, o/e ?
 Cleanest:
 Provide using separate scripts called using output.
 
 Add -g or -m option for "max gap" as in Qpatch.c - score not increased
 if reaches this threshold.  This means that patches which are separated
 by more than the threshold are never joined.
 
 *********************************************************/
 
 #include "stdio.h"
 #include "math.h"
 #include "readseq.h"
 #include <stdlib.h>
 
 /********************* CONTROLS ********************/
 #define CPGSCORE  17   /* so that can compare with old cpg - this
 			  had CPGSCORE 27, but allowed score to reach
 			  0 once without reporting */
 #define MALLOCBLOCK 200000
 #define MAXNAMELEN 512  /*"length 0, title > MAXNAMELEN or starts with bad character." if too small */
 
 /*------------------------------------------------------*/
 int readSequence (FILE *fil, int *conv,
                          char **seq, char **id, char **desc, int *length) ;
 
 void findspans ( int start, int end, char *seq, char *seqname ) ;
 
 void getstats ( int start, int end, char *seq, char *seqname, int *ncpg, int *ngpc, int *ng, int *nc ) ;
 
 void usage (void)
 { 
   fprintf(stderr, "cpg_lh - calculate CpG Island data for cpgIslandExt tracks\n");
   fprintf(stderr, "usage:\n    cpg_lh <sequence.fa>\n") ;
   fprintf(stderr, "where <sequence.fa> is fasta sequence, must be more than\n");
   fprintf(stderr, "   200 bases of legitimate sequence, not all N's\n");
   fprintf(stderr, "\nTo process the output into the UCSC bed file format:\n\n"
 "cpglh fastaInput.fa \\\n"
 " | awk '{$2 = $2 - 1; width = $3 - $2;\n"
 "   printf(\"%%s\\t%%d\\t%%s\\t%%s %%s\\t%%s\\t%%s\\t%%0.0f\\t%%0.1f\\t%%s\\t%%s\\n\",\n"
 "    $1, $2, $3, $5, $6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);}' \\\n" \
 "     | sort -k1,1 -k2,2n > output.bed\n");
 
   fprintf(stderr, "\nThe original cpg.c was written by Gos Miklem from the Sanger Center.\n"
 "LaDeana Hillier added some modifications --> cpg_lh.c, and UCSC hass\n"
 "added some further modifications to cpg_lh.c, so that its expected\n"
 "number of CpGs in an island is calculated as described in\n"
 "  Gardiner-Garden, M. and M. Frommer, 1987\n"
 "  CpG islands in vertebrate genomes. J. Mol. Biol. 196:261-282\n"
 "\n"
 "    Expected = (Number of C's * Number of G's) / Length\n"
 "\n"
 "Instead of a sliding-window search for CpG islands, this cpg program\n"
 "uses a running-sum score where a 'C' followed by a 'G' increases the\n"
 "score by 17 and anything else decreases the score by 1.  When the\n"
 "score transitions from positive to 0 (and at the end of the sequence),\n"
 "the sequence in the current span is evaluated to see if it qualifies\n"
 "as a CpG island (>200 bp length, >50%% GC, >0.6 ratio of observed CpG\n"
 "to expected).  Then the search recurses on the span from the position\n"
 "with the max running score up to the current position.\n");
 
   exit (-1);
 }
 /*------------------------------------------------------*/
 int main (int argc, char **argv)
 { 
   
   char *seq, *seqname, *desc ;
   
   int length ;
   int i ;
   static FILE *fil ;
 
   /*------------------------------------------------------*/  
   switch ( argc )
     {
     default: if (argc != 2)
       usage () ;
     }
      if (!(seqname = malloc (MAXNAMELEN+1)))
      { fprintf (stderr, "Couldn't malloc %d bytes", MAXNAMELEN) ;
      exit (-1) ;
      }
      
      if (!(seq = malloc (MALLOCBLOCK+1)))
        { fprintf (stderr, "Couldn't malloc %d bytes", MALLOCBLOCK) ;
 	 exit (-1) ;
        }
 
   if (!(fil = fopen ( argv[1], "r" ))) 
     usage ();
   
   while ( readSequence(fil, dna2textConv, &seq, &seqname, &desc, &length) ) 
     /* once through per sequence */
     { 
       i = 0 ;
       while ( seqname[i] != ' ' && seqname[i] != '\0' && i < 256 )
 	++i ;
       seqname[i] = '\0' ;
 
       findspans ( 0, length, seq, seqname ) ;
     }
 
   exit (0);
 }
 
 void findspans ( int start, int end, char *seq, char *seqname )
 {
   int i ;
   int sc = 0 ;
   int lsc = 0 ; 
   int imn = -1 ;  /* else sequences which start with pattern reported badly */
   int imx = 0 ;
   int mx = 0 ;
   int winlen = 0;
   float expect, obsToExp;
 
   int ncpg, ngpc, ngc, ng, nc;  
 
   i = start ;
   while ( i < end )  
     {
       lsc = sc ;
       sc += ( end-1-i && seq[i]=='C' && seq[i+1]=='G' ) ? CPGSCORE : -1 ;
       sc = sc < 0 ? 0 : sc ;
 /*      printf("%d \t %d \t%d \t %d \t%d \t%d\n", i, sc, lsc, imn, imx, mx) ; */
       if ( sc == 0 && lsc )  /* should threshold here */
 	{
 	  /* imn+1 is the start of the match. 
 	     imx+1 is the end of the match, given pattern-length=2.
 	     fetchdb using reported co-ordinates will return a sequence
 	     which both starts and ends with a complete pattern.
 	     Further +1 adjusts so that start of sequence = 1 
 	  */
 
 	  getstats ( imn+1, imx+2, seq, seqname, &ncpg, &ngpc, &ng, &nc ) ;
 	  ngc = ng + nc;
       if (((imx+2)-(imn+2))>199 && (ngc*100.0/(imx+1-imn))>50.0 ) {
 	/* old gos estimate	  printf("%s\t %d\t %d\t %d\t CpG: %d\t %.1f\t %.1f\n", seqname, imn+2, imx+2, mx, ncpg, ngc*100.0/(imx+1-imn), 1.0*ncpg/ngpc) ; */
 	winlen=imx+1-imn;
 	/* ASH 3/23/04: expected val from Gardiner-Garden & Frommer '87: */
-	expect = (float)(nc * ng) / (float)winlen;
+	expect = ((float)nc * (float)ng) / (float)winlen;
 	obsToExp = (float)ncpg / expect;
 	if ( obsToExp > 0.60 )
            printf("%s\t %d\t %d\t %d\t CpG: %d\t %.1f\t %.2f\t %.2f\n",
 		  seqname, imn+2, imx+2, mx, ncpg, ngc*100.0/(imx+1-imn),
 		  1.0*ncpg/ngpc, obsToExp) ; 
       }
 /* 	  printf("%s \t %d\t %d\t %d \n", seqname, imn+2, imx+2, mx ) ; 
   */
 	  /* Recursive call searches from one position after the end of the 
 	     last match to the current position */
 	  findspans( imx+2, i, seq, seqname ) ;
 	  sc = lsc = imn = imx =  mx = 0 ;
 	}
       imx = sc > mx ? i : imx ;
       mx = sc > mx ? sc : mx ;
       imn = sc == 0 ? i : imn ;
       ++i ;
     }
   if ( sc != 0 )  /* should threshold here */
     {
 /*      printf("%d \t %d \t%d \t %d \t%d \t%d\n", i, sc, lsc, imn, imx, mx) ;  */
       
       /* ASH 3/23/04: Make this test & output consistent w/above. */
       getstats ( imn+1, imx+2, seq, seqname, &ncpg, &ngpc, &ng, &nc ) ;
       ngc = nc + ng;
       if (((imx+2)-(imn+2))>199 && (ngc*100.0/(imx+1-imn))>50.0 ) {
 	winlen=imx+1-imn;
-	expect = (float)(nc * ng) / (float)winlen;
+	expect = ((float)nc * (float)ng) / (float)winlen;
 	obsToExp = (float)ncpg / expect;
 	if ( obsToExp > 0.60 )
            printf("%s\t %d\t %d\t %d\t CpG: %d\t %.1f\t %.2f\t %.2f\n",
 		  seqname, imn+2, imx+2, mx, ncpg, ngc*100.0/(imx+1-imn),
 		  1.0*ncpg/ngpc, obsToExp) ; 
       }
       
 /*      printf("%s \t %d\t %d\t %d \n", seqname, imn+2, imx+2, mx ) ; 
    */
       findspans( imx+2, end, seq, seqname ) ;
     }
 }
 
 /* ASH 3/23/04: Return separate counts of G's and C's for expected val calc. */
 void getstats ( int start, int end, char *seq, char *seqname, int *ncpg, int *ngpc, int *ng, int *nc )
 {
 
 int i ;
 *ncpg = *ngpc = *ng = *nc = 0 ;
 
 for ( i = start ; i < end ; ++i ) 
   {
    if ( end-1-i && seq[i]=='C' && seq[i+1]=='G' ) ++*ncpg ; 
    if ( end-1-i && seq[i]=='G' && seq[i+1]=='C' ) ++*ngpc ; 
    if ( seq[i]=='G' ) ++*ng ; 
    if ( seq[i]=='C' ) ++*nc ; 
   }
 }