src/hg/instinct/flatfileToBED15/flatfileToBED15.c 1.5

1.5 2010/05/28 20:10:13 cszeto
Added writing microarrayGroups file
Index: src/hg/instinct/flatfileToBED15/flatfileToBED15.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/flatfileToBED15/flatfileToBED15.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/instinct/flatfileToBED15/flatfileToBED15.c	18 Oct 2009 22:11:30 -0000	1.4
+++ src/hg/instinct/flatfileToBED15/flatfileToBED15.c	28 May 2010 20:10:13 -0000	1.5
@@ -1,150 +1,216 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 #include <math.h>
 
 #include "common.h"
 #include "options.h"
 #include "bed.h"
 
 #define MAX_LINE 100000
 #define MAX_TOKEN 50
+#define MAX_FILENAME 200 /* size of the buffer that holds the generated microarrayGroups .ra file name */
+
 
 static struct optionSpec optionSpecs[] =
 {
+	{"prefix", OPTION_STRING}, /* base name used to build the .bed15 and _microarrayGroups.ra output file names */
     {"flatFile", OPTION_STRING},
     {"bedFile", OPTION_STRING},
     {NULL, 0}
 };
 
 /*
 ** Prototypes
 */
 
 void usage();
 
+int printMicroarrayGroups(char *prefix, char line[]){
+	/* Write <prefix>_microarrayGroups.ra from the flatfile header line: tab-delimited,
+	** first column is the naming column, one sample per later column. strtok() mutates
+	** line. Returns the sample count; exits on an unusable file name or I/O failure. */
+	char outfile[MAX_FILENAME], *tok;
+	int i = 0, expCount = 0;
+	int len = snprintf(outfile, sizeof(outfile), "%s_microarrayGroups.ra", prefix);
+	if(len < 0 || (size_t)len >= sizeof(outfile)){
+		fprintf(stderr, "ERROR: prefix '%s' is too long for an output file name.\n", prefix);
+		exit(1);
+	}
+	FILE * fp = fopen(outfile, "w");
+	if(fp == NULL){
+		fprintf(stderr, "ERROR: Couldn't open %s for writing.\n", outfile);
+		exit(1);
+	}
+	/* every tab on the header line introduces one more sample column */
+	for(i = 0; i < MAX_LINE; i++){
+		if(line[i]=='\0' || line[i] == '\n')
+			break;
+		if(line[i] == '\t')
+			expCount++;
+	}
+	fprintf(fp, "name %sGroups\n", prefix);
+	fprintf(fp, "type groupings\n");
+	fprintf(fp, "all %sAll\n\n", prefix);
+	fprintf(fp, "name %sAll\n", prefix);
+	fprintf(fp, "type all\n");
+	fprintf(fp, "description All Arrays\n");
+	fprintf(fp, "expIds ");
+	for(i = 0; i < expCount; i++)
+		fprintf(fp, "%d,", i);
+	fprintf(fp, "\ngroupSizes ");
+	for(i = 0; i < expCount; i++)
+		fprintf(fp, "1,");
+	fprintf(fp, "\nnames ");
+	tok = strtok(line, "\t\n"); /* the naming-column header is not a sample */
+	for(tok = tok ? strtok(NULL, "\t\n") : NULL; tok != NULL; tok = strtok(NULL, "\t\n"))
+		fprintf(fp,"%s,", tok);
+	fprintf(fp, "\n");
+	if(fclose(fp) != 0){
+		fprintf(stderr, "ERROR: Couldn't finish writing %s.\n", outfile);
+		exit(1);
+	}
+	return expCount;
+}
+
 int main(int argc, char *argv[])
 {
     int i=0, j=0, k=0; 
-    char *bedFileName, *flatFileName;
-    FILE *flatFile = NULL;
+    char *prefix, *bedFileName, *flatFileName, outfile[MAX_FILENAME]; /* outfile holds "<prefix>.bed15" */
+    FILE *flatFile, *fp, *error; /* input flatfile, bed15 output, error.log */
 
     optionInit(&argc, argv, optionSpecs);
+    prefix = optionVal("prefix", NULL); /* base name shared by both output files */
+    if(!prefix)
+        {
+        fprintf(stderr, "ERROR: missing prefix.\n");
+        usage();
+        }
     bedFileName = optionVal("bedFile", NULL);
     if(!bedFileName)
         {
         fprintf(stderr, "ERROR: missing bedFile name.\n");
         usage();
         }
-
     flatFileName = optionVal("flatFile", NULL);
     if(!flatFileName)
         {
         fprintf(stderr, "ERROR: missing flatFile name.\n");
         usage();
         }
 
     flatFile = fopen(flatFileName, "r");
-
     if(!flatFile)
         {        
         fprintf(stderr, "ERROR: can't open flatFile '%s'\n", flatFileName);
         usage();
         }
 
-    FILE * error = fopen("error.log", "w");
+    error = fopen("error.log", "w");
     if(!error){
         printf("Couldn't open the error log file. Exiting...\n");
         exit(1);
     }
 
+    if((size_t)snprintf(outfile, sizeof(outfile), "%s.bed15", prefix) >= sizeof(outfile)){ /* truncated or failed */
+        printf("Prefix '%s' is too long for an output file name. Exiting...\n", prefix);
+        exit(1);
+    }
+    if(!(fp = fopen(outfile, "w"))){ /* check fp itself, not the error-log stream */
+        printf("Couldn't open the output file %s. Exiting...\n", outfile);
+        exit(1);
+    }
     char line[MAX_LINE], *tok;
 
     struct bed *b;
     b = bedLoadAll(bedFileName);
     struct hash *probeMap = newHash(0);
     for(;b;b=b->next) {
     	hashAdd(probeMap, b->name, b);
     }
 
     //grab the info out for each probe
     char probeName[MAX_TOKEN], expIds[MAX_LINE], expScores[MAX_LINE], dir[MAX_TOKEN], tmp[MAX_TOKEN];
-    int expCount=0, lastExpCount=0;
+    int lineCount=0, expCount=0, lastExpCount=0; /* lineCount: flatfile lines consumed so far */
     
     while(fgets(line, MAX_LINE, flatFile)){
         //init
         expIds[0]= expScores[0] = tmp[0] = dir[0] = probeName[0] = '\0';
         i = j = k = expCount = 0;
-    
         //copy the probe name from the line
         while(line[j] != '\t' && line[j] != '\0' && line[j] != '\n')
             {
             probeName[j] = line[j];
             j++;
             }
         probeName[j] = '\0';
 
         struct hashEl *el = hashLookup(probeMap, probeName);
-        if(el == NULL) {
+        if(lineCount == 0){ /* the first line supplies the sample names, not probe data */
+            lastExpCount=printMicroarrayGroups(prefix,line);
+        }else if(el == NULL) {
             fprintf(error, "%s not in bedFile\n", probeName);
         } else{
             // print info to outfile
             struct bed *b = (struct bed *) el->val;
 
             //copy the expScores out of the flatFile line, and count expCount
             tok=strtok(line, "\t\n");
             tok=strtok(NULL, "\t\n");//get rid of the probename one
             i=0;
             while(tok != NULL){
                 if(strcmp(tok,"NaN") != 0) strcat(expScores, tok);
                 strcat(expScores, ",");
                 tok=strtok(NULL, "\t\n");
                 expCount++;
             }         
     
             //check the expCount is ok, or if it's not set yet set it. 
             if(expCount != lastExpCount && lastExpCount!= 0){
                 fprintf(stderr,"ERROR: %s line has fewer expScores than the last line (%d vs. %d). Unparsable file! Exiting...\n", line, expCount, lastExpCount);
                 exit(1);
             }else if(lastExpCount == 0) lastExpCount = expCount;
 
             //make a string for the expIds based on expCount
             for(i=0; i < expCount; i++){
                 sprintf(tmp,"%d,",i);
                 strcat(expIds, tmp);
             }
     
             if(b->chromStart < b->chromEnd) strcpy(dir, "+");
             else strcpy(dir, "-");    
         
-            printf("%s\t", b->chrom);
-            printf("%u\t", b->chromStart);
-            printf("%u\t", b->chromEnd);
-            printf("%s\t", probeName);
-            printf("0\t%s\t0\t0\t0\t1\t0\t0\t",dir);
-            printf("%d\t", expCount);
-            printf("%s\t", expIds);
-            printf("%s\n",expScores);
+            fprintf(fp, "%s\t", b->chrom);
+            fprintf(fp, "%u\t", b->chromStart);
+            fprintf(fp, "%u\t", b->chromEnd);
+            fprintf(fp, "%s\t", probeName);
+            fprintf(fp, "0\t%s\t0\t0\t0\t1\t0\t0\t",dir);
+            fprintf(fp, "%d\t", expCount);
+            fprintf(fp, "%s\t", expIds);
+            fprintf(fp, "%s\n",expScores);
         }
+        lineCount++;
     }
     fclose(flatFile);
     fclose(error);
+    if(fclose(fp) != 0){ fprintf(stderr, "ERROR: failed to finish writing %s\n", outfile); exit(1); } /* fclose flushes buffered bed15 rows */
     return 0;
 }
 
 /*
 ** Grunt Functions:
 */
 
 void usage()
 /* Explain usage and exit. */
 {
-    fprintf(stderr, "Usage: ./flatfileToBED15 -flatFile=filename -bedFile=filename > [outputBED15_filename]\n");
+    fprintf(stderr, "Usage: ./flatfileToBED15 -prefix=name -flatFile=filename -bedFile=filename\n");
+    fprintf(stderr, "\t-prefix should be the name you want on the results files (prefix_microarrayGroups.ra and prefix.bed15)\n");
     fprintf(stderr, "\t-flatFile should be tab delimited, one probe per line, one sample per column, beginning with a naming column\n");
     fprintf(stderr, "\t-bedFile should be a BED4 file containing coordinate information for each probe expected for the platform.\n");
-    fprintf(stderr, "In addition to the bed15 file produced in stdout, this program will create an 'error.log'\n");
+    fprintf(stderr, "In addition to the bed15 file and microarrayGroups.ra file, this program will create an 'error.log'\n");
     fprintf(stderr, "file for all probes it coulnd't find coordinates for in the provided BED4.\n");
     fprintf(stderr, "NOTE: NaN vals MUST be marked 'NaN' or the program will put them into your bed file as is, rather than as empty vals.\n");
     exit(1);
 }