src/hg/protein/kgBestMrna/kgResultBestMrna.c 1.4

1.4 2009/03/17 19:54:57 markd
minor speed up of rendering of mRNA/ESTs by checking gbSeq table before seq table, avoiding two sql requests. Also simplified code.
Index: src/hg/protein/kgBestMrna/kgResultBestMrna.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/protein/kgBestMrna/kgResultBestMrna.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/protein/kgBestMrna/kgResultBestMrna.c	23 Jan 2008 21:49:40 -0000	1.3
+++ src/hg/protein/kgBestMrna/kgResultBestMrna.c	17 Mar 2009 19:54:57 -0000	1.4
@@ -1,342 +1,340 @@
 /* kgBestMrna - driver program to call blat to select best mRNA for each protein */
 
 #include	<sys/param.h>
 #include "common.h"
 #include "hCommon.h"
 #include "hdb.h"
 
-extern int hRnaSeqAndIdx(char *acc, struct dnaSeq **retSeq, HGID *retId, char *gbdate, struct sqlConnection *conn);
-
 char proteinName[20], mrnaName[20];
 char mrnaNames[500][20];
 int  mrnaScore[500];
 char proteinNameOld[20] = {""};
 
 struct dnaSeq *seq;
 HGID id;
 
 bioSeq *mSeq, qSeq, *pSeq;
 extern int answer_for_kg;
 
 struct dnaSeq *untransList;
 
 char line[2000];
 char line2[2000];
 
 int mrnaCount;
 int proteinCount;
 
 char mrnaNames[500][20];
 char mrnaDates[500][20];
 int  mrnaScore[500];
 int  diffIdent[500];
 
 FILE *o3, *o7;
 
 char *proteinDataDate;
 char *genomeRelease;
 char *genomeReadOnly;
 char *genomeDBname;
 char proteinsDB[100];
 char spDB[100];
 char gbTempDB[100];
 
 /* Explain usage and exit. */
 void usage()
     {
     errAbort(
   	"usage:\tkgResultBestMrna YYMMDD db ro_db> BestResult.out\n"
   	"\tYYMMDD is the release date of SWISS-PROT data, eg: 031117\n"
   	"\tdb is the genome under construction, eg: kgDB\n"
   	"\tro_db is the actual target genome, e.g.: mm4\n"
   	"kgResultBestMrna - after the cluster run is done, this reads the\n"
 	"\tresults and produces a best.lis listing.\n"
 	"\tExpects to find ./clusterRun and ./out directories in the\n"
 	"\tcurrent working directory.");
     }
 
 int cal_months(char *date)
     {
     int year, month, day;
     int months;
 	
     sscanf(date, "%d-%d-%d", &year, &month, &day);
     months = (year - 1970)*12 + month - 1;
     return(months);
     }
 
 void calScores(char *proteinID, int mrnaCount)
 {
 int  ixm, maxixm;	// index for mRNA
 int  maxScore;
 int  i, ii;
 
 char proteinName[20], mrnaName[20];
 int  diffs[500];
 int  monthss[500];
 int  mrnalens[500];
 char line[2000];
 char mrnaDate[20];
 int  months;
 int  diff;
 int  mrnalen;
 
 char *temp_str;
 
 struct dnaSeq *seq;
 struct sqlConnection *connR;
     
 connR = hAllocConn();
 
 ixm = 0;
 maxScore = 0;
 
 strcpy(proteinName, proteinID);
 maxixm = -1;
 for (ii=0; ii<mrnaCount; ii++)
     {
     strcpy(mrnaName, mrnaNames[ii]);
     if (hRnaSeqAndIdx(mrnaNames[ii], &mSeq, &id, mrnaDate, connR)== -1) 
 	{
 	printf("%s could not be found!!!\n", mrnaNames[ii]);fflush(stdout);
 	exit(1);
 	}
     strcpy(mrnaDates[ii], mrnaDate);
     mrnalen = strlen(mSeq->dna);
     months = cal_months(mrnaDate);
 		
     diffs[ii] 	   = diffIdent[ii];
     mrnalens[ii]   = mrnalen;
     monthss[ii]    = months;
     mrnaScore[ii]  = mrnalen + months*2 - diffs[ii]*50;
 
     if (mrnaScore[ii] > maxScore)
 	{
 	maxScore = mrnaScore[ii];
 	maxixm 	 = ii;
 	}	
     }
 
 for (i=0; i<mrnaCount; i++)
     {
     if (diffIdent[i] != -1)
 	{
 	fprintf(o3, "%10s %10s %8s %5d %5d %5d %5d",
         	proteinName, mrnaNames[i], mrnaDates[i], mrnaScore[i],
                 mrnalens[i], monthss[i], diffs[i]);
         if (i == maxixm)
             {
 	    fprintf(o3, " *\n");
 	    fprintf(o7, "%s\t%s\n", proteinName, mrnaNames[i]); fflush(o7);
 	    }
         else
 	    {
             fprintf(o3, "\n");
 	    }
 	fflush(o3);
 	}
     }
 fprintf(o3, "\n");
 fflush(o3);
 
 hFreeConn(&connR);
 }
 
 int main(int argc, char *argv[])
 {
 int  ixm, maxixm;       // index for mRNA
 int  maxScore;
 char proteinID[20], mrnaName[20];
 
 int  monthss[500];
 int  mrnalens[500];
 FILE *inf, *inf2, *outf;
 
 int  newMrna;
 int  ii= -1;
 char line[2000];
 char mrnaDate[20];
 int  months;
 int  diff;
 int  mrnalen;
 int  imrna;
 char dirName[PATH_MAX];
 char mrnaFile[PATH_MAX];
 char outName[PATH_MAX];
 char outDir[PATH_MAX];
 char blatCmd[PATH_MAX];
 char cwd[PATH_MAX];
 
 struct dnaSeq *seq;
 HGID id;
 bioSeq *mSeq, qSeq, *pSeq;
 
 struct sqlConnection *conn, *conn2, *conn3;
 char query[256], query2[256], query3[256];
 struct sqlResult *sr, *sr2, *sr3;
 char **row, **row2, **row3;
 
 char *accession;
 char *displayID;
 char *division;
 char *extDB;
 char *extAC;
   
 char *chp0, *chp;
 
 int totalCount, matchCount;
 
 char *name, *chrom, *strand, *txStart, *txEnd, *cdsStart, *cdsEnd, *exonCount, *exonStarts, *exonEnds;
 
 int  i;
 char *mrnaID;
 
 FILE *aaOut, *mrnaOut;
 char *aaSeq, *mrnaSeq;
 char cond_str[200];
 
 if (argc != 4) usage();
     
 proteinDataDate = argv[1];
 genomeRelease = argv[2];
 genomeReadOnly = argv[3];
 
 // make sure db connection goes to correct genome
 // hRnaSeqAndIdx() needs this.
 hSetDb(genomeReadOnly);
 
 sprintf(spDB, "sp%s", proteinDataDate);
 sprintf(proteinsDB, "proteins%s", proteinDataDate);
 sprintf(gbTempDB, "%sTemp", genomeRelease);
   
 inf = fopen("protein.lis", "r"); 
 if ((FILE *) NULL == inf)
     errAbort("ERROR: Can not open input file: protein.lis");
 o3  = fopen("kgBestMrna.out",   "w");
 if ((FILE *) NULL == o3)
     errAbort("ERROR: Can not open output file: kgBestMrna.out");
 o7  = fopen("best.lis",    "w");
 if ((FILE *) NULL == o7)
     errAbort("ERROR: Can not open output file: best.lis");
 
 conn2= hAllocConn();
 conn3= hAllocConn();
    
 proteinCount = 0; 
 snprintf(dirName, (size_t) sizeof(dirName), "%s", "./clusterRun" );
 
 while (fgets(line, 1000, inf) != NULL)
     {
     sscanf(line, "%s", proteinID);
     printf(">%s\n", proteinID);
     fflush(stdout);
 
     sprintf(cond_str, "val='%s'", proteinID);
     accession = sqlGetField(conn3, spDB, "displayId","acc", cond_str);
     sprintf(cond_str, "acc='%s'", accession);
     aaSeq = sqlGetField(conn3, spDB, "protein","val", cond_str);
 	
     if (aaSeq == NULL)
 	{
 	printf("no seq found for %s\n", proteinID);
 	fflush(stdout);
 	exit(1);
 	}	
 	
     if ( 0 == (proteinCount % 2000) )
 	{
 	snprintf(dirName, (size_t) sizeof(dirName), "prot%05d", proteinCount );
 	snprintf(outDir, (size_t) sizeof(outDir), "prot%05d", proteinCount );
 	}
 
     sprintf(query2,"select mrnaID from %sTemp.spMrna where spID='%s';",genomeRelease, proteinID);
 	
     sr2 = sqlMustGetResult(conn2, query2);
     row2 = sqlNextRow(sr2);
     imrna = 0;
     while (row2 != NULL)
 	{
  	mrnaID 	= row2[0];
 	strcpy(mrnaNames[imrna], mrnaID);
 
 	sprintf(cond_str, "name='%s'", mrnaID);
     	mrnaSeq = sqlGetField(conn3,gbTempDB,"refMrna","seq", cond_str);
 	row2 = sqlNextRow(sr2);
 	imrna++;
 	}
     mrnaCount = imrna;
     sqlFreeResult(&sr2);
     
     if ( ((char *) NULL) == getcwd(cwd, (size_t) sizeof(cwd)) )
 	errAbort("ERROR: Can not get current working directory");
     snprintf(outName, (size_t) sizeof(outName),"%s/out/%s/b%05d.out",
 	cwd, outDir, proteinCount);
     
     inf2 = fopen(outName, "r");
     if ((FILE *) NULL == inf2)
 	errAbort("ERROR: Can not open result file: %s, errno: %d", outName, errno);
     for (i=0; i<mrnaCount; i++) diffIdent[i] = -1;
     imrna = 0;
     newMrna = 0;
     while (fgets(line2, 1000, inf2) != NULL)
     	{
 	if (line2[0] == '>') 
 	    {
 	    newMrna = 1;
 	    line2[strlen(line2)-1] = '\0';
 	    chp = line2 + 1;
 	    ii=-1;
 	    for (i=0; i<mrnaCount; i++)
 		{
 		if (strcmp(chp, mrnaNames[i]) == 0)
 		    {
 		    ii = i;
 		    }
 		}
 	    if (ii == -1)
 		{
 		errAbort("ERROR: no mrna registered found", outName, errno);
 		}
 	    }
 	chp0 = strstr(line2, "Identities = ");
 	if (chp0 == NULL) continue;
 
 	if (!newMrna) continue;
 	else newMrna = 0;
 
 	chp0 = chp0 + strlen("Identities = ");
 
 	chp = strstr(chp0, "/");
 	*chp = '\0';
 	sscanf(chp0, "%d", &matchCount);
 	
 	chp++;
 	chp0  = strstr(chp, " ");
 	*chp0 = '\0';
 	sscanf(chp, "%d", &totalCount);
 
 	// some time mulitiple alignments for a single mRNA-protein pair
 	// the first one usually is the best alignment with highest score
 	if (diffIdent[ii] == -1) 
 	    {
 	    diffIdent[ii] = totalCount - matchCount;
  	    imrna++;
 	    }
 	}
     fclose(inf2);
    
     if (imrna != mrnaCount)
 	{
 	fprintf(stderr, "total mrna count %d %d is different for %s\n", imrna, mrnaCount, proteinID);
 	}
     calScores(proteinID, mrnaCount);
     proteinCount++;
     }    
 
 hFreeConn(&conn2);
 hFreeConn(&conn3);
 fclose(o3);
 fclose(o7);
 return(0);
 }