85bd10da37f403d155c4434d90edcd146e682948 braney Mon May 15 13:03:32 2017 -0700 add sorting to composite wiggles. diff --git src/optimalLeaf/readFile.cc src/optimalLeaf/readFile.cc new file mode 100644 index 0000000..dc68de6 --- /dev/null +++ src/optimalLeaf/readFile.cc @@ -0,0 +1,199 @@ +//---------------------------------------------------- +// readFile.hh +// reads the input file. +// ----------------------------------------------------- +#include <stdlib.h> +#include <iostream> +#include <fstream> +#include <string.h> +#include <stdio.h> +#include <ctype.h> +#include <math.h> +#include "readFile.hh" + +using namespace std; +const int MAXSIZE = 200000; +const float missing = 0.75; + +// reads the file and genrates the expression matrix and +// retrieves gene names and experiment names. +readFile::readFile(char *fileName,int orfL, int descL) { + FILE *from=fopen(fileName,"r"); + char findName[5]; + int isName=0; + int lineLen = orfL+descL+MAXSIZE; + char *buffer = new char[lineLen]; + char geneVal[MAXSIZE]; + char **lines,**temp; + int i, j, k,num, len,numT,place; + char *tok, *newtok; + int curLines = 100; // initial guess for number of genes + int nlines=0; // actual number of lines + int numMiss,removeGenes = 0; + int redI = 0; + lines = new char*[curLines]; + for(i=0;i<curLines;i++) { + lines[i] = new char[lineLen]; + } + while(fgets(buffer,lineLen,from)) { + strcpy(lines[nlines],buffer); + nlines++; + if(nlines == curLines) { // overflow, add new lines + curLines = curLines * 2; + temp = new char*[nlines]; + for(i=0;i<nlines;i++) { + temp[i] = new char[lineLen]; + strcpy(temp[i],lines[i]); + delete []lines[i]; + } + delete []lines; + lines = new char*[curLines]; + for(i=0;i<curLines;i++) { + lines[i] = new char[lineLen]; + if(i < nlines) { + strcpy(lines[i],temp[i]); // copy all existing lines + delete []temp[i]; + } + } + delete []temp; + } + } + /* add '\n' at the end of last line if needed */ + if(!strchr(lines[nlines-1],'\n')) { + len=strlen(lines[nlines-1]); + lines[nlines-1][len]='\n'; + lines[nlines-1][len+1]=0; + } + + // compute the number of values in the first line, and find if 'name' was + // used + strcpy(findName,"name"); + + len=strlen(lines[0]); + j = 0; + numT = 0; + while(j < len && numT == 0) { + if(lines[0][j]=='\t') + numT++; + j++; + } + i = 0; + // check if the 'name' column is present in the input file + while(j < len && numT == 1) { + buffer[i] = lines[0][j]; + if(lines[0][j]=='\t') + numT++; + j++; + i++; + } + buffer[i-1] = '\0'; + if(strcmp(buffer,findName) == 0) + isName = 1; + numT = 0; + j=0; + while(lines[0][j]!='\n') { + if(lines[0][j]=='\t') + numT++; + j++; + } + numT++; // end of line + numGenes = nlines - 1; + expNum = numT - 1- isName; + geneNames = new char*[numGenes]; + geneDesc = new char*[numGenes]; + vals = new float*[numGenes]; + expNames = new char*[expNum]; + for(i=0;i<numGenes;i++) { + geneNames[i] = new char[orfL]; + geneDesc[i] = new char[descL]; + vals[i] = new float[expNum]; + } + for(i=0;i<expNum;i++) { + expNames[i] = new char[descL]; + } + j = 0; + numT = 0; + while(j<len && numT < (1+isName)) { + if(isspace(lines[0][j])) { + numT++; + } + j++; + } + place = 0; + // find experiment names + for(i=0;i<expNum;i++) { + while(isspace(lines[0][j]) == 0) { + expNames[i][place] = lines[0][j]; + j++; + place++; + } + j++; + expNames[i][place] = '\0'; + place = 0; + } + // get the gene values + for(i=0;i<numGenes;i++) { + j=0; + place = 0; + while(isspace(lines[i+1][j]) == 0) { + geneNames[i-redI][place] = lines[i+1][j]; + place++; + j++; + } + geneNames[i-redI][place] = '\0'; + place = 0; + j++; + if(isName) { + while(isspace(lines[i+1][j]) == 0) { + geneDesc[i-redI][place] = lines[i+1][j]; + place++; + j++; + } + geneDesc[i-redI][place] = '\0'; + } + else { + strcpy(geneDesc[i-redI],geneNames[i-redI]); + } + place = 0; + j++; + numMiss = 0; // the number of missing values + for(k=0;k<expNum;k++) { + place = 0; + while(isspace(lines[i+1][j]) == 0) { + geneVal[place] = lines[i+1][j]; + place++; + j++; + } + geneVal[place] = '\0'; + if(geneVal[0] == '\0') { + vals[i-redI][k] = 110; // missing values are replaced by a number + // greater than 100, however when writing the + // output file, they are restored. + numMiss++; + } + else + vals[i-redI][k] = atof(geneVal); + j++; + } + if(numMiss > (int)ceil(missing*expNum) || (numMiss + 1) == expNum) { // too many missing values + removeGenes++; + redI++; + } + } + for(i=0;i<curLines;i++) { + delete []lines[i]; + } + delete []lines; + numGenes = numGenes - removeGenes; + cerr<<"removed "<<removeGenes<<" genes due to missing values"<<'\n'; +} + +readFile::readFile( int inumGenes, int iexpNum, float **ivals, char **igeneNames,char **igeneDesc,char **iexpNames) +{ +numGenes = inumGenes; +expNum = iexpNum; +vals = ivals; +geneNames = igeneNames; +geneDesc = igeneDesc; +expNames = expNames; +}