dab89c1682e091dfa3780ce95359ebea9f6b7d52 braney Thu May 18 09:49:17 2017 -0700 add licensing info to optimalLeaf library diff --git src/optimalLeaf/bestArr.cc src/optimalLeaf/bestArr.cc index 8110031..86fa4bb 100644 --- src/optimalLeaf/bestArr.cc +++ src/optimalLeaf/bestArr.cc @@ -1,230 +1,236 @@ +/* This code provided by Ziv Bar-Joseph with the explicit understanding that + * no licensing is required for it to be used in the UCSC Genome Browser + * as long as reference is made to this paper: + * https://www.ncbi.nlm.nih.gov/pubmed/12801867 + */ + //----------------------------------------------------------- // Bestarrange.cc // Builds an hierarchical clustering tree, and finds its optimal // leaf ordering. The program gets as an input an expression // file (see README) and generates four files as output. // The first two (for an input file named foo.txt, these are called fooIn.GTR // and fooIn.CDT) are the hierarchical clustering reuslts. The other two // files (foo.CDT and foo.GTR) are the optima leaf ordering results. // Both these results can be viewed using Michael Eisen's TreeView // software. //---------------------------------------------------------- #include <stdlib.h> #include <iostream> #include <fstream> using namespace std; #include <string.h> #include <stdio.h> #include "general.hh" #include "Tree.hh" #include "SimMat.hh" extern "C" { #include "optimalLeaf.h" } const int BUFSIZE = 100; Tree *createTree(int num,float **m,char *str); char *nameFile(char *name,char *e,char *m); void copyFile(char *from, char *to); #ifdef NOTNOW main(int argc,char** argv) { int i,len = strlen(argv[1]); char *fullName = new char[len+6]; strcpy(fullName,argv[1]); strcat(fullName,".txt"); fullName[len+5] = '\0'; int newD = descLen; // initial maximum length of description int newO = orfLen; // initial maximum length of orf name int meanZ = 0; int mult = 0; if(argc>2) { for(i=3;i<=argc;i++) { if(argv[i-1][0] == '-' && argv[i-1][1] == 'd') newD = atoi(argv[i-1]+2); if(argv[i-1][0] == '-' && argv[i-1][1] == 'o') newO = atoi(argv[i-1]+2); if(argv[i-1][0] == '-' && argv[i-1][1] == 'c') meanZ = 1; if(argv[i-1][0] == '-' && argv[i-1][1] == 'm') mult = 1; } } SimMat inputData(fullName,newO,newD,meanZ); // output files names char *cdtFile = nameFile(argv[1],".CDT",0); char *gtrFile = nameFile(argv[1],".GTR",0); char *inCdtFile = nameFile(argv[1],".CDT","In"); char *inGtrFile = nameFile(argv[1],".GTR","In"); char *atrFile = nameFile(argv[1],".ATR",0); float **mat = inputData.giveMat(); // similarity matrix int num = inputData.giveGenes(); Tree *t = createTree(num,mat,gtrFile,"GENE"); // hierarchical clustering tree copyFile(gtrFile,inGtrFile); float init = -1*t->curDist(mat); // initial ordering score cerr<<"Initial sum of similarities "<<init<<'\n'; int *order = t->initOrder(); float min; int *arr = t->returnOrder(&min,num); // the optimal leaf algorithm call min = -1*min; cerr<<"Optimal sum of similarities: "<<min<<'\n'; cerr<<"Improvement "<<(min-init)/(init)*100<<"%"<<'\n'; int numC = inputData.giveExp(); int *arrC = new int[numC+1]; for(i=0;i<numC;i++) arrC[i] = i+1; inputData.printOrder(order,arrC,inCdtFile); // writes initial order to fooIn.CDT float **cMat; if(mult == 1) { inputData.generateCols(argv[2]); cMat = inputData.giveCols(); // similarity matrix Tree *tc = createTree(numC,cMat,atrFile,"ARRY"); // hierarchical clustering tree float initC = -1*tc->curDist(cMat); // initial ordering score int *orderC = tc->initOrder(); float minC; delete []arrC; arrC = tc->returnOrder(&minC,numC); // the optimal leaf algorithm call minC = -1*minC; cerr<<"Initial column similarity: "<<initC<<'\n'; cerr<<"Optimal column similarity: "<<minC<<'\n'; cerr<<"Improvement "<<(minC-initC)/(initC)*100<<"%"<<'\n'; } inputData.printOrder(arr,arrC,cdtFile); // write optimal order to foo.CDT } #endif // Performs the hierarchical clustering, and generates the tree // corresponding to this clustering Tree *createTree(int num,float **m) { int i,j,k; float **newM = new float*[num+1]; //ofstream to(name); // .GTR file Tree **allTrees = new Tree*[num+1]; // copy similarity matrix for(i=1;i<num+1;i++) { newM[i] = new float[num+1]; for(j=1;j<num+1;j++) { newM[i][j] = m[i][j]; } } // initially each gene is assigned to its own cluster for(i=1;i<num+1;i++) { allTrees[i] = new Tree(i,m); } float max; int r,l; float lSize,rSize; Tree *temp; for(i=1;i<num;i++) { max = minInf; // find minimum entry in (converted) similarity matrix for(k=1;k<num;k++) { if(allTrees[k] != 0) { for(j=k+1;j<num+1;j++) { if(allTrees[j] != 0) { if(max < (-1* newM[j][k])) { max = (-1 * newM[j][k]); l = k; r = j; } } } } } rSize = allTrees[r]->giveNumLeafs(); lSize = allTrees[l]->giveNumLeafs(); #ifdef NOTNOW to<<"NODE"<<i<<"X"<<'\t'; if(allTrees[l]->isLeaf()) to<<str<<allTrees[l]->giveIndex()<<"X"<<'\t'; else to<<"NODE"<<allTrees[l]->giveIndex()<<"X"<<'\t'; if(allTrees[r]->isLeaf()) to<<str<<allTrees[r]->giveIndex()<<"X"<<'\t'; else to<<"NODE"<<allTrees[r]->giveIndex()<<"X"<<'\t'; to<<max<<'\n'; #endif temp = allTrees[l]; allTrees[l] = 0; // combine the two clusters allTrees[l] = new Tree(temp,allTrees[r],i); allTrees[r] = 0; // update similarity matrix for(j=1;j<num+1;j++) { if(allTrees[j] != 0 && j != l) { newM[j][l] = (lSize*newM[j][l] + rSize*newM[j][r])/(lSize+rSize); newM[l][j] = newM[j][l]; } } } Tree *res; // find root of the tree for(i=1;i<num+1;i++) { if(allTrees[i] != 0) { res = allTrees[i]; } } for(i=1;i<num+1;i++) { delete []newM[i]; } delete []newM; return res; } // genearte file names with appropriate extension char *nameFile(char *name,char *e,char *m) { int len = strlen(name) + strlen(e); if(m != 0) len = len + strlen(m); char *res = new char[len + 4]; strcpy(res,name); if(m != 0) strcat(res,m); strcat(res,e); return res; } // copy one file to the other void copyFile(char *from, char *to) { FILE *readFrom=fopen(from,"r"); FILE *writeTo=fopen(to,"w"); int size; char buffer[BUFSIZE]; while((size = fread(buffer,sizeof(char),BUFSIZE,readFrom)) == BUFSIZE) fwrite(buffer,sizeof(char),BUFSIZE,writeTo); fwrite(buffer,sizeof(char),size,writeTo); fclose(readFrom); fclose(writeTo); } extern "C" { int *optimalLeafOrder(int inumGenes, int iexpNum, int imeanZ, float **ivals, char **igeneNames,char **igeneDesc,char **iexpNames) { //int foo; //SimMat inputData( inumGenes, iexpNum, ivals, igeneNames,igeneDesc,iexpNames); //char *fullName; //int newO, newD, meanZ, num; //int inumGenes; int iexpNum; float **ivals; char **igeneNames;char **igeneDesc;char **iexpNames; //SimMat inputData(fullName,newO,newD,meanZ); SimMat inputData(inumGenes, iexpNum, imeanZ, ivals, igeneNames,igeneDesc,iexpNames); float **mat = inputData.giveMat(); // similarity matrix int num = inputData.giveGenes(); Tree *t = createTree(num,mat); // hierarchical clustering tree int *orderC = t->initOrder(); float min; int *arr = t->returnOrder(&min,num); // the optimal leaf algorithm call return arr; } }