85bd10da37f403d155c4434d90edcd146e682948 braney Mon May 15 13:03:32 2017 -0700 add sorting to composite wiggles. diff --git src/optimalLeaf/SimMat.cc src/optimalLeaf/SimMat.cc new file mode 100644 index 0000000..3b2e347 --- /dev/null +++ src/optimalLeaf/SimMat.cc @@ -0,0 +1,219 @@ +// -------------------------------------------------------- +// SimMat.cc +// Computes the similarity matrix, and writes ordered trees +// to a .CDT files. +// -------------------------------------------------------- +#include <stdlib.h> +#include <iostream> +#include <fstream> +#include <string.h> +#include <math.h> +#include "SimMat.hh" +#include "readFile.hh" + +using namespace std; +// read the input file +SimMat::SimMat(char *filename,int orfL,int descL,int mZ) { + + meanZ = mZ; + readFile myRead(filename,orfL,descL); + genes = myRead.getNumGenes(); + exp = myRead.getNumExp(); + in = myRead.getVals(); + geneNames = myRead.getNames(); + geneDesc = myRead.getDesc(); + expNames = myRead.getExp(); + generateMat(); +} + +SimMat::SimMat( int igenes, int iexp, int imeanZ, float **iin, + char **igeneNames, char **igeneDesc, char **iexpNames) { + + genes = igenes; + exp = iexp; + meanZ = imeanZ; + in = iin; + geneNames = igeneNames; + geneDesc = igeneDesc; + expNames = iexpNames; + + generateMat(); +} + +// compute the siimlarity matrix +void SimMat::generateMat() { + + int i,j; + mat = new float*[genes+1]; + means = compMean(genes,exp,1); + std = compStd(genes,exp,1); + for(i=1;i<genes+1;i++) { + mat[i] = new float[genes+1]; + mat[i][i] = 0; + } + for(i=1;i<genes;i++) { + for(j=i+1;j<genes+1;j++) { + mat[i][j] = -1 * compSim(i-1,j-1,exp,1); + mat[j][i] = mat[i][j]; + } + } +} + +// compute the columns siimlarity matrix +void SimMat::generateCols(char *name) { + + int i,j; + delete []means; + delete []std; + float simVal; + ofstream toSim(name); + cMat = new float*[exp+1]; + means = compMean(exp,genes,0); + std = compStd(exp,genes,0); + for(i=1;i<exp+1;i++) { + cMat[i] = new float[exp+1]; + cMat[i][i] = 0; + } + for(i=1;i<exp;i++) { + for(j=i+1;j<exp+1;j++) { + simVal = compSim(i-1,j-1,genes,0); + cMat[i][j] = -1 * simVal; + cMat[j][i] = cMat[i][j]; + } + } +} +// computes average expressions of genes +float *SimMat::compMean(int tot,int c,int isGene) { + int i,j; + float *res = new float[tot]; + float sum; + int num; + for(i=0;i<tot;i++) { + sum = 0; + num = 0; + for(j=0;j<c;j++) { + if(isGene == 1) { + if(in[i][j] < 100) { + sum += in[i][j]; + num++; + } + } + else { + if(in[j][i] < 100) { + sum += in[j][i]; + num++; + } + } + } + res[i] = sum/(float)num; + if(meanZ) + res[i] = 0; + } + return res; +} +// computes standard devaition of genes +float *SimMat::compStd(int tot,int c, int isGene) { + + int i,j; + float *res = new float[tot]; + float sum; + int num; + for(i=0;i<tot;i++) { + sum = 0; + num = 0; + for(j=0;j<c;j++) { + if(isGene == 1) { + if(in[i][j] < 100) { + sum += pow((in[i][j]-means[i]),2); + num++; + } + } + else { + if(in[j][i] < 100) { + sum += pow((in[j][i]-means[i]),2); + num++; + } + } + + } + sum = sum/(float)num; + if(sum == 0) { + cerr<<"Error! no variance for gene "<<i<<'\n'; + cerr<<"The variance for gene "<<i<<" has been set to 1"<<'\n'; + sum = 1; + } + res[i] = sqrt(sum); + } + return res; +} + +float SimMat::compSim(int g1,int g2,int c,int isGene) { + int i; + float sum = 0; + int num = 0; + for(i=0;i<c;i++) { + if(isGene == 1) { + if(in[g1][i] < 100 && in[g2][i] < 100) { + sum += ((in[g1][i]-means[g1])/std[g1])*((in[g2][i]-means[g2])/std[g2]); + num++; + } + } + else { + if(in[i][g1] < 100 && in[i][g2] < 100) { + sum += ((in[i][g1]-means[g1])/std[g1])*((in[i][g2]-means[g2])/std[g2]); + num++; + } + } + } + if(num == 0) + sum = -1; + else + sum = sum/(float)num; + return sum; +} + +// gets an array representing genes, and writes it to a .CDT file +void SimMat::printOrder(int *arr,int *arrC,char *name) { + + ofstream to(name); + int i,j; + to<<"GID"<<'\t'<<"gene"<<'\t'<<"NAME"<<'\t'<<"GWEIGHT"; + for(i=0;i<exp;i++) { + to<<'\t'<<expNames[arrC[i]-1]<<"-"<<arrC[i]; + } + to<<'\n'; + to<<"AID"<<'\t'<<'\t'<<'\t'; + for(i=0;i<exp;i++) { + to<<'\t'<<"ARRY"<<arrC[i]<<"X"; + } + to<<'\n'; + to<<"EWEIGHT"<<'\t'<<'\t'<<'\t'; + for(i=1;i<exp+1;i++) { + to<<'\t'<<"1"; + } + to<<'\n'; + for(i=0;i<genes;i++) { + to<<"GENE"<<arr[i]<<"X"<<'\t'<<geneNames[arr[i]-1]<<'\t'<<geneDesc[arr[i]-1]<<'\t'<<"1"; + for(j=0;j<exp;j++) { + if(in[arr[i]-1][arrC[j]-1] < 100) + to<<'\t'<<in[arr[i]-1][arrC[j]-1]; + else + to<<'\t'; + } + to<<'\n'; + } +} + + + + + + + + + + + + + +