src/hg/protein/pbCalPi/pbCalPi.c 1.8
1.8 2009/09/25 18:37:59 kent
Simplified code. Reduced run time from 5 hours to 20 minutes.
Index: src/hg/protein/pbCalPi/pbCalPi.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/protein/pbCalPi/pbCalPi.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 1000000 -r1.7 -r1.8
--- src/hg/protein/pbCalPi/pbCalPi.c 3 Sep 2008 19:20:59 -0000 1.7
+++ src/hg/protein/pbCalPi/pbCalPi.c 25 Sep 2009 18:37:59 -0000 1.8
@@ -1,130 +1,109 @@
/* pbCalPi - Calculate pI values from a list of protein IDs */
#include "pbCalPi.h"
#include "math.h"
void usage()
/* Explain usage and exit. */
{
errAbort(
"pbCalPi - Calculate pI values from a list of protein IDs \n"
"usage:\n"
- " pbCalPi listFile spDb outFile\n"
- " listFile is the input file name of list of protein accession numbers\n"
+ " pbCalPi spDb outFile\n"
" spDb is the swissprot database name\n"
" outFile is the output file name of tab delineated file of protein accession number and pI value\n"
"example: pbCalPi prot.lis sp040115 pepPi.tab\n");
}
float myExp10(float x)
{
return(powf(10.0, x));
}
-float calPi(char *sequence)
+double calPi(char *sequence)
/* the calPi() is based on the parameters and algorithm obtained */
/* from SWISS-PROT. The calculation results should be identical to that */
/* from the Web page: http://us.expasy.org/tools/pi_tool.html */
{
int i, sequenceLength;
int comp[ALPHABET_LEN];
int nTermResidue, cTermResidue;
-float charge, phMin, phMid = 0.0, phMax;
-float carg, cter, nter, ctyr, chis, clys, casp, cglu, ccys;
+double charge, phMin, phMid = 0.0, phMax;
+double carg, cter, nter, ctyr, chis, clys, casp, cglu, ccys;
int R, H, K, D, E, C, Y;
R = (int)('R' - 'A');
H = (int)('H' - 'A');
K = (int)('K' - 'A');
D = (int)('D' - 'A');
E = (int)('E' - 'A');
C = (int)('C' - 'A');
Y = (int)('Y' - 'A');
sequenceLength = strlen(sequence);
for (i=0; i<ALPHABET_LEN; i++) comp[i] = 0.0;
/* Compute the amino-acid composition. */
for (i = 0; i < sequenceLength; i++)
comp[sequence[i] - 'A']++;
/* Look up N-terminal and C-terminal residue. */
nTermResidue = sequence[0] - 'A';
cTermResidue = sequence[sequenceLength - 1] - 'A';
phMin = PH_MIN;
phMax = PH_MAX;
for (i = 0, charge = 1.0; i < MAXLOOP && (phMax - phMin) > EPSI; i++)
{
phMid = phMin + (phMax - phMin) / 2.0;
cter = myExp10(-cPk[cTermResidue][0]) /
(myExp10(-cPk[cTermResidue][0]) + myExp10(-phMid));
nter = myExp10(-phMid) /
(myExp10(-cPk[nTermResidue][1]) + myExp10(-phMid));
carg = comp[R] * myExp10(-phMid) /
(myExp10(-cPk[R][2]) + myExp10(-phMid));
chis = comp[H] * myExp10(-phMid) /
(myExp10(-cPk[H][2]) + myExp10(-phMid));
clys = comp[K] * myExp10(-phMid) /
(myExp10(-cPk[K][2]) + myExp10(-phMid));
casp = comp[D] * myExp10(-cPk[D][2]) /
(myExp10(-cPk[D][2]) + myExp10(-phMid));
cglu = comp[E] * myExp10(-cPk[E][2]) /
(myExp10(-cPk[E][2]) + myExp10(-phMid));
ccys = comp[C] * myExp10(-cPk[C][2]) /
(myExp10(-cPk[C][2]) + myExp10(-phMid));
ctyr = comp[Y] * myExp10(-cPk[Y][2]) /
(myExp10(-cPk[Y][2]) + myExp10(-phMid));
charge = carg + clys + chis + nter -
(casp + cglu + ctyr + ccys + cter);
if (charge > 0.0)
phMin = phMid;
else
phMax = phMid;
}
return(phMid);
}
int main(int argc, char *argv[])
{
-char *infName, *spDb, *outfName;
-float pI;
-char *acc;
-char *seq;
-char cond_str[255];
-struct lineFile *inf;
-FILE *outf1;
-char *row[1];
-
-if (argc != 4)
- {
+if (argc != 3)
usage();
- }
-
-infName = argv[1];
-spDb = argv[2];
-outfName = argv[3];
-
-inf = lineFileOpen(infName, TRUE); /* input file */
-outf1 = mustOpen(outfName, "w");
-while (lineFileRow(inf, row))
+char *spDb = argv[1];
+char *outfName = argv[2];
+FILE *f = mustOpen(outfName, "w");
+
+struct sqlConnection *conn = sqlConnect(spDb);
+struct sqlResult *sr = sqlGetResult(conn, "select acc,val from protein");
+char **row;
+while ((row = sqlNextRow(sr)) != NULL)
{
- acc = row[0];
-
- safef(cond_str, sizeof(cond_str), "acc='%s'", acc);
- seq = sqlGetField(spDb, "protein", "val", cond_str);
-
- if (seq != NULL)
- {
- pI = calPi(seq);
- fprintf(outf1, "%s\t%.2f\n", acc, pI);
- }
+ fprintf(f, "%s\t%.2f\n", row[0], calPi(row[1]));
}
-fclose(outf1);
+carefulClose(&f);
return(0);
}