85bd10da37f403d155c4434d90edcd146e682948 braney Mon May 15 13:03:32 2017 -0700 add sorting to composite wiggles. diff --git src/optimalLeaf/bestArr.cc src/optimalLeaf/bestArr.cc new file mode 100644 index 0000000..8110031 --- /dev/null +++ src/optimalLeaf/bestArr.cc @@ -0,0 +1,230 @@ +//----------------------------------------------------------- +// Bestarrange.cc +// Builds an hierarchical clustering tree, and finds its optimal +// leaf ordering. The program gets as an input an expression +// file (see README) and generates four files as output. +// The first two (for an input file named foo.txt, these are called fooIn.GTR +// and fooIn.CDT) are the hierarchical clustering reuslts. The other two +// files (foo.CDT and foo.GTR) are the optima leaf ordering results. +// Both these results can be viewed using Michael Eisen's TreeView +// software. +//---------------------------------------------------------- + +#include <stdlib.h> +#include <iostream> +#include <fstream> +using namespace std; +#include <string.h> +#include <stdio.h> +#include "general.hh" +#include "Tree.hh" +#include "SimMat.hh" + +extern "C" +{ +#include "optimalLeaf.h" +} + +const int BUFSIZE = 100; +Tree *createTree(int num,float **m,char *str); +char *nameFile(char *name,char *e,char *m); +void copyFile(char *from, char *to); + +#ifdef NOTNOW +main(int argc,char** argv) { + + int i,len = strlen(argv[1]); + char *fullName = new char[len+6]; + strcpy(fullName,argv[1]); + strcat(fullName,".txt"); + fullName[len+5] = '\0'; + int newD = descLen; // initial maximum length of description + int newO = orfLen; // initial maximum length of orf name + int meanZ = 0; + int mult = 0; + if(argc>2) { + for(i=3;i<=argc;i++) { + if(argv[i-1][0] == '-' && argv[i-1][1] == 'd') + newD = atoi(argv[i-1]+2); + if(argv[i-1][0] == '-' && argv[i-1][1] == 'o') + newO = atoi(argv[i-1]+2); + if(argv[i-1][0] == '-' && argv[i-1][1] == 'c') + meanZ = 1; + if(argv[i-1][0] == '-' && argv[i-1][1] == 'm') + mult = 1; + } + } + SimMat inputData(fullName,newO,newD,meanZ); + // output files names + char *cdtFile = nameFile(argv[1],".CDT",0); + char *gtrFile = nameFile(argv[1],".GTR",0); + char *inCdtFile = nameFile(argv[1],".CDT","In"); + char *inGtrFile = nameFile(argv[1],".GTR","In"); + char *atrFile = nameFile(argv[1],".ATR",0); + float **mat = inputData.giveMat(); // similarity matrix + int num = inputData.giveGenes(); + Tree *t = createTree(num,mat,gtrFile,"GENE"); // hierarchical clustering tree + copyFile(gtrFile,inGtrFile); + float init = -1*t->curDist(mat); // initial ordering score + cerr<<"Initial sum of similarities "<<init<<'\n'; + int *order = t->initOrder(); + float min; + int *arr = t->returnOrder(&min,num); // the optimal leaf algorithm call + min = -1*min; + cerr<<"Optimal sum of similarities: "<<min<<'\n'; + cerr<<"Improvement "<<(min-init)/(init)*100<<"%"<<'\n'; + int numC = inputData.giveExp(); + int *arrC = new int[numC+1]; + for(i=0;i<numC;i++) + arrC[i] = i+1; + inputData.printOrder(order,arrC,inCdtFile); // writes initial order to fooIn.CDT + float **cMat; + if(mult == 1) { + inputData.generateCols(argv[2]); + cMat = inputData.giveCols(); // similarity matrix + Tree *tc = createTree(numC,cMat,atrFile,"ARRY"); // hierarchical clustering tree + float initC = -1*tc->curDist(cMat); // initial ordering score + int *orderC = tc->initOrder(); + float minC; + delete []arrC; + arrC = tc->returnOrder(&minC,numC); // the optimal leaf algorithm call + minC = -1*minC; + cerr<<"Initial column similarity: "<<initC<<'\n'; + cerr<<"Optimal column similarity: "<<minC<<'\n'; + cerr<<"Improvement "<<(minC-initC)/(initC)*100<<"%"<<'\n'; + } + inputData.printOrder(arr,arrC,cdtFile); // write optimal order to foo.CDT +} +#endif + + +// Performs the hierarchical clustering, and generates the tree +// corresponding to this clustering +Tree *createTree(int num,float **m) { + int i,j,k; + float **newM = new float*[num+1]; + //ofstream to(name); // .GTR file + Tree **allTrees = new Tree*[num+1]; + // copy similarity matrix + for(i=1;i<num+1;i++) { + newM[i] = new float[num+1]; + for(j=1;j<num+1;j++) { + newM[i][j] = m[i][j]; + } + } + // initially each gene is assigned to its own cluster + for(i=1;i<num+1;i++) { + allTrees[i] = new Tree(i,m); + } + float max; + int r,l; + float lSize,rSize; + Tree *temp; + for(i=1;i<num;i++) { + max = minInf; + // find minimum entry in (converted) similarity matrix + for(k=1;k<num;k++) { + if(allTrees[k] != 0) { + for(j=k+1;j<num+1;j++) { + if(allTrees[j] != 0) { + if(max < (-1* newM[j][k])) { + max = (-1 * newM[j][k]); + l = k; + r = j; + } + } + } + } + } + rSize = allTrees[r]->giveNumLeafs(); + lSize = allTrees[l]->giveNumLeafs(); +#ifdef NOTNOW + to<<"NODE"<<i<<"X"<<'\t'; + if(allTrees[l]->isLeaf()) + to<<str<<allTrees[l]->giveIndex()<<"X"<<'\t'; + else + to<<"NODE"<<allTrees[l]->giveIndex()<<"X"<<'\t'; + if(allTrees[r]->isLeaf()) + to<<str<<allTrees[r]->giveIndex()<<"X"<<'\t'; + else + to<<"NODE"<<allTrees[r]->giveIndex()<<"X"<<'\t'; + to<<max<<'\n'; +#endif + + temp = allTrees[l]; + allTrees[l] = 0; + // combine the two clusters + allTrees[l] = new Tree(temp,allTrees[r],i); + allTrees[r] = 0; + // update similarity matrix + for(j=1;j<num+1;j++) { + if(allTrees[j] != 0 && j != l) { + newM[j][l] = (lSize*newM[j][l] + rSize*newM[j][r])/(lSize+rSize); + newM[l][j] = newM[j][l]; + } + } + } + Tree *res; + // find root of the tree + for(i=1;i<num+1;i++) { + if(allTrees[i] != 0) { + res = allTrees[i]; + } + } + for(i=1;i<num+1;i++) { + delete []newM[i]; + } + delete []newM; + return res; +} + +// genearte file names with appropriate extension +char *nameFile(char *name,char *e,char *m) { + + int len = strlen(name) + strlen(e); + if(m != 0) + len = len + strlen(m); + char *res = new char[len + 4]; + strcpy(res,name); + if(m != 0) + strcat(res,m); + strcat(res,e); + return res; +} + +// copy one file to the other +void copyFile(char *from, char *to) { + + FILE *readFrom=fopen(from,"r"); + FILE *writeTo=fopen(to,"w"); + int size; + char buffer[BUFSIZE]; + while((size = fread(buffer,sizeof(char),BUFSIZE,readFrom)) == BUFSIZE) + fwrite(buffer,sizeof(char),BUFSIZE,writeTo); + fwrite(buffer,sizeof(char),size,writeTo); + fclose(readFrom); + fclose(writeTo); +} + +extern "C" +{ +int *optimalLeafOrder(int inumGenes, int iexpNum, int imeanZ, float **ivals, char **igeneNames,char **igeneDesc,char **iexpNames) +{ +//int foo; +//SimMat inputData( inumGenes, iexpNum, ivals, igeneNames,igeneDesc,iexpNames); + //char *fullName; + //int newO, newD, meanZ, num; +//int inumGenes; int iexpNum; float **ivals; char **igeneNames;char **igeneDesc;char **iexpNames; + //SimMat inputData(fullName,newO,newD,meanZ); + SimMat inputData(inumGenes, iexpNum, imeanZ, ivals, igeneNames,igeneDesc,iexpNames); + float **mat = inputData.giveMat(); // similarity matrix + int num = inputData.giveGenes(); + Tree *t = createTree(num,mat); // hierarchical clustering tree + int *orderC = t->initOrder(); + float min; + int *arr = t->returnOrder(&min,num); // the optimal leaf algorithm call + +return arr; +} + +}