d43ec6011a8adc1ba82e3b2c2473e2043d300c03
hiram
  Thu Apr 19 14:10:07 2012 -0700
UCSC source from CVS source tree 2005 version
diff --git src/utils/cpgIslandExt/cpg.c src/utils/cpgIslandExt/cpg.c
new file mode 100644
index 0000000..0afb018
--- /dev/null
+++ src/utils/cpgIslandExt/cpg.c
@@ -0,0 +1,174 @@
+
+/*  Last edited: Jun 23 19:33 1995 (gos) */
+/* 
+  cpg.c: CpG island finder. Larsen et al Genomics 13 1992 p1095-1107
+  "usually defined as >200bp with %GC > 50% and obs/exp CpG >
+  0.6". Here use running sum rather than window to score: if not CpG
+  at postion i, then decrement runSum counter, but if CpG then runSum
+  += CPGSCORE.     Spans > threshold are searched
+  for recursively.
+  
+  filename: fasta format concatenated sequence file: 
+*/
+
+/*********************************************************
+
+
+put in options: -p debug to switch on commented out print statements
+                -l length threshold
+                -t score threshold
+                -s to specify score
+		-d score (-ve) if not found : default = -1.
+		
+Really want it to take pattern file:
+     Pattern  score_if_found  score_if_not_found
+
+How best to give #pattern hits, %GC in span, o/e ?
+Cleanest:
+Provide using separate scripts called using output.
+
+Add -g or -m option for "max gap" as in Qpatch.c - score not increased
+if reaches this threshold.  This means that patches which are separated
+by more than the threshold are never joined.
+
+*********************************************************/
+
+#include "stdio.h"
+#include "math.h"
+#include "readseq.h"
+
+/********************* CONTROLS ********************/
+#define CPGSCORE  17   /* so that can compare with old cpg - this
+			  had CPGSCORE 27, but allowed score to reach
+			  0 once without reporting */
+#define MALLOCBLOCK 200000
+#define MAXNAMELEN 512  /*"length 0, title > MAXNAMELEN or starts with bad character." if too small */
+
+/*------------------------------------------------------*/
+int readSequence (FILE *fil, int *conv,
+                         char **seq, char **id, char **desc, int *length) ;
+
+int findspans ( int start, int end, char *seq, char *seqname ) ;
+
+void getstats ( int start, int end, char *seq, char *seqname, int *ncpg, int *ngpc, int *ngc ) ;
+
+void usage (void)
+{ 
+  fprintf (stderr, "Usage: cpg seqfilename\n") ;
+  exit (-1) ;
+}
+/*------------------------------------------------------*/
+void main (int argc, char **argv)
+{ 
+  
+  char *seq, *seqname, *desc ;
+  int conv[] =   { 0, 1, 2, 3, 4 } ;
+  
+  int length ;
+  int i ;
+  static FILE *fil ;
+
+  char c, *cp ;
+  extern char* malloc() ;
+
+  /*------------------------------------------------------*/  
+  switch ( argc )
+    {
+    default: if (argc != 2)
+      usage () ;
+    }
+     if (!(seqname = malloc (MAXNAMELEN+1)))
+     { fprintf (stderr, "Couldn't malloc %d bytes", MAXNAMELEN) ;
+     exit (-1) ;
+     }
+     
+     if (!(seq = malloc (MALLOCBLOCK+1)))
+       { fprintf (stderr, "Couldn't malloc %d bytes", MALLOCBLOCK) ;
+	 exit (-1) ;
+       }
+
+  if (!(fil = fopen ( argv[1], "r" ))) 
+    usage ();
+  
+  while ( readSequence(fil, dna2textConv, &seq, &seqname, &desc, &length) ) 
+    /* once through per sequence */
+    { 
+      i = 0 ;
+      while ( seqname[i] != ' ' && seqname[i] != '\0' && i < 256 )
+	++i ;
+      seqname[i] = '\0' ;
+
+      findspans ( 0, length, seq, seqname ) ;
+    }
+
+  exit (0);
+}
+
+int findspans ( int start, int end, char *seq, char *seqname )
+{
+  int i ;
+  int sc = 0 ;
+  int lsc = 0 ; 
+  int imn = -1 ;  /* else sequences which start with pattern reported badly */
+  int imx = 0 ;
+  int mx = 0 ;
+
+  int ncpg, ngpc, ngc ;  
+
+  i = start ;
+  while ( i < end )  
+    {
+      lsc = sc ;
+      sc = ( end-1-i && seq[i]=='C' && seq[i+1]=='G' ) ? sc += CPGSCORE : --sc ;
+      sc = sc < 0 ? 0 : sc ;
+/*      printf("%d \t %d \t%d \t %d \t%d \t%d\n", i, sc, lsc, imn, imx, mx) ; */
+      if ( sc == 0 && lsc )  /* should threshold here */
+	{
+	  /* imn+1 is the start of the match. 
+	     imx+1 is the end of the match, given pattern-length=2.
+	     fetchdb using reported co-ordinates will return a sequence
+	     which both starts and ends with a complete pattern.
+	     Further +1 adjusts so that start of sequence = 1 
+	  */
+
+	  getstats ( imn+1, imx+2, seq, seqname, &ncpg, &ngpc, &ngc ) ;
+	  printf("%s\t %d\t %d\t %d\t CpG: %d\t %.1f\t %.1f\n", seqname, imn+2, imx+2, mx, ncpg, ngc*100.0/(imx+1-imn), 1.0*ncpg/ngpc) ; 
+	  
+/* 	  printf("%s \t %d\t %d\t %d \n", seqname, imn+2, imx+2, mx ) ; 
+  */
+	  /* Recursive call searches from one position after the end of the 
+	     last match to the current position */
+	  findspans( imx+2, i, seq, seqname ) ;
+	  sc = lsc = imn = imx =  mx = 0 ;
+	}
+      imx = sc > mx ? i : imx ;
+      mx = sc > mx ? sc : mx ;
+      imn = sc == 0 ? i : imn ;
+      ++i ;
+    }
+  if ( sc != 0 )  /* should threshold here */
+    {
+/*      printf("%d \t %d \t%d \t %d \t%d \t%d\n", i, sc, lsc, imn, imx, mx) ;  */
+      
+      getstats ( imn+1, imx+2, seq, seqname, &ncpg, &ngpc, &ngc ) ;
+      printf("%s\t %d\t %d\t %d\t CpG: %d\t %.1f\t %.2f\n", seqname, imn+2, imx+2, mx, ncpg, ngc*100.0/(imx+1-imn), 1.0*ncpg/ngpc) ; 
+      
+/*      printf("%s \t %d\t %d\t %d \n", seqname, imn+2, imx+2, mx ) ; 
+   */
+      findspans( imx+2, end, seq, seqname ) ;
+    }
+}
+
+void getstats ( int start, int end, char *seq, char *seqname, int *ncpg, int *ngpc, int *ngc )
+{
+
+int i ;
+*ncpg = *ngpc = *ngc = 0 ;
+
+for ( i = start ; i < end ; ++i ) 
+  {
+   if ( end-1-i && seq[i]=='C' && seq[i+1]=='G' ) ++*ncpg ; 
+   if ( end-1-i && seq[i]=='G' && seq[i+1]=='C' ) ++*ngpc ; 
+   if ( seq[i]=='C' || seq[i]=='G' ) ++*ngc ; 
+  }
+}