src/hg/protein/pbCalDist/pbCalDist.c 1.14
1.14 2009/03/06 21:11:51 fanhsu
Removed an unused sr variable.
Index: src/hg/protein/pbCalDist/pbCalDist.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/protein/pbCalDist/pbCalDist.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -b -B -U 1000000 -r1.13 -r1.14
--- src/hg/protein/pbCalDist/pbCalDist.c 3 Sep 2008 19:20:59 -0000 1.13
+++ src/hg/protein/pbCalDist/pbCalDist.c 6 Mar 2009 21:11:51 -0000 1.14
@@ -1,329 +1,328 @@
/* pbCalDist - Create tab delimited data files to be used by Proteome Browser stamps */
#include "common.h"
#include "hCommon.h"
#include "hdb.h"
#include "spDb.h"
void usage()
/* Explain usage and exit. */
{
errAbort(
"pbCalDist- Create tab delimited data files to be used by Proteome Browser stamps.\n"
"usage:\n"
" pbCalDist spDb protsDb taxn gnDb\n"
" spDb is the name of SWISS-PROT database\n"
" protsDb is the name of proteinsXXXXXX database\n"
" taxn is the taxnomy number of the Taxonomy ID\n"
" gnDb is the genome database name\n"
"Example: pbCalDist sp031112 proteins031112 9606 hg16\n");
}
int calDist(double *measure, int nInput, int nDist, double xMin, double xDelta, char *oFileName)
/* calculate histogram distribution of a double array of nInput elements */
{
int distCnt[1000];
double xDist[1000];
FILE *o3;
int i,j;
int highestCnt, totalCnt;
int lowCnt, hiCnt;
assert(nDist < ArraySize(distCnt));
o3 = mustOpen(oFileName, "w");
for (j=0; j<=(nDist+1); j++)
{
distCnt[j] = 0;
xDist[j] = xMin + xDelta * (double)j;
}
lowCnt = 0;
hiCnt = 0;
for (i=0; i<nInput; i++)
{
/* count values below xmin */
if (measure[i] < xDist[0])
{
lowCnt++;
}
for (j=0; j<nDist; j++)
{
if ((measure[i] >= xDist[j]) && (measure[i] < xDist[j+1]))
{
distCnt[j]++;
}
}
/* count values above xmax */
if (measure[i] >= xDist[nDist])
{
hiCnt++;
}
}
highestCnt = 0;
totalCnt = 0;
for (j=0; j<nDist; j++)
{
if (distCnt[j] > highestCnt) highestCnt = distCnt[j];
totalCnt = totalCnt + distCnt[j];
}
totalCnt = totalCnt + hiCnt + lowCnt;
if (totalCnt != nInput)
errAbort("nInput %d is not equal totalCnt %d, aborting ...\n", nInput, totalCnt);
for (j=0; j<nDist; j++)
{
fprintf(o3, "%f\t%d\n", xDist[j], distCnt[j]);
}
carefulClose(&o3);
return(highestCnt);
}
int main(int argc, char *argv[])
{
struct sqlConnection *conn, *conn2;
char query2[256];
-struct sqlResult *sr, *sr2;
+struct sqlResult *sr2;
char **row2;
char cond_str[255];
char *proteinDatabaseName; /* example: sp031112 */
char *protDbName; /* example: proteins031112 */
char emptyStr[1] = {""};
FILE *o2;
char *accession;
char *aaSeq;
char *chp;
int i, j, len;
int cCnt;
char *answer, *answer2;
double hydroSum;
char *protDisplayId;
int aaResCnt[30];
double aaResCntDouble[30];
char aaAlphabet[30];
int aaResFound;
int totalResCnt;
int molWtCnt;
double molWt[100000];
int pIcnt;
double pI[100000];
double aa_hydro[256];
int icnt, jExon, pcnt, ipcnt = 0;
double aaLenDouble[100000];
double avgHydro[100000];
double cCountDouble[100000];
double exonCountDouble[100000];
double interProCountDouble[100000];
char *taxon;
char *database;
char *exonCnt;
int interProCount;
char *kgId;
if (argc != 5) usage();
strcpy(aaAlphabet, "WCMHYNFIDQKRTVPGEASLXZB");
/* Ala: 1.800 Arg: -4.500 Asn: -3.500 Asp: -3.500 Cys: 2.500 Gln: -3.500 */
aa_hydro['A'] = 1.800;
aa_hydro['R'] = -4.500;
aa_hydro['N'] = -3.500;
aa_hydro['D'] = -3.500;
aa_hydro['C'] = 2.500;
aa_hydro['Q'] = -3.500;
/* Glu: -3.500 Gly: -0.400 His: -3.200 Ile: 4.500 Leu: 3.800 Lys: -3.900 */
aa_hydro['E'] = -3.500;
aa_hydro['G'] = -0.400;
aa_hydro['H'] = -3.200;
aa_hydro['I'] = 4.500;
aa_hydro['L'] = 3.800;
aa_hydro['K'] = -3.900;
/* Met: 1.900 Phe: 2.800 Pro: -1.600 Ser: -0.800 Thr: -0.700 Trp: -0.900 */
aa_hydro['M'] = 1.900;
aa_hydro['F'] = 2.800;
aa_hydro['P'] = -1.600;
aa_hydro['S'] = -0.800;
aa_hydro['T'] = -0.700;
aa_hydro['W'] = -0.900;
/* Tyr: -1.300 Val: 4.200 Asx: -3.500 Glx: -3.500 Xaa: -0.490 */
aa_hydro['Y'] = -1.300;
aa_hydro['V'] = 4.200;
proteinDatabaseName = argv[1];
protDbName = argv[2];
taxon = argv[3];
database = argv[4];
o2 = mustOpen("pepResDist.tab", "w");
conn = hAllocConn(database);
conn2 = hAllocConn(database);
for (j=0; j<23; j++)
{
aaResCnt[j] = 0;
}
icnt = jExon = pcnt = 0;
pIcnt = 0;
molWtCnt = 0;
safef(query2, sizeof(query2), "select acc from %s.accToTaxon where taxon=%s;", proteinDatabaseName, taxon);
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
while (row2 != NULL)
{
accession = row2[0];
safef(cond_str, sizeof(cond_str), "acc='%s'", accession);
protDisplayId = sqlGetField(proteinDatabaseName, "displayId", "val", cond_str);
safef(cond_str, sizeof(cond_str), "proteinID='%s'", protDisplayId);
answer = sqlGetField(database, "knownGene", "name", cond_str);
/* count InterPro domains */
if (answer != NULL)
{
safef(cond_str, sizeof(cond_str), "accession='%s'", accession);
answer2 = sqlGetField(protDbName, "swInterPro", "count(*)", cond_str);
if (answer2 != NULL)
{
interProCount = interProCount + atoi(answer2);
interProCountDouble[ipcnt] = (double)(atoi(answer2));
ipcnt++;
}
else
{
printf("%s is not in InterPro DB.\n", accession);fflush(stdout);
}
}
/* count exons, using coding exons from kgProtMap2 (KG-III) table */
safef(cond_str, sizeof(cond_str), "spID='%s'", accession);
kgId = sqlGetField(database, "kgXref", "kgID", cond_str);
safef(cond_str, sizeof(cond_str), "qName='%s'", kgId);
answer2 = sqlGetField(database, "kgProtMap2", "blockCount", cond_str);
if (answer2 != NULL)
{
exonCnt = strdup(answer2);
if (atoi(exonCnt) == 0)
{
errAbort("%s %s has 0 block count\n", accession, protDisplayId);
}
exonCountDouble[jExon] = (double)(atoi(exonCnt));
jExon++;
}
else
{
exonCnt = emptyStr;
}
/* process Mol Wt */
safef(cond_str, sizeof(cond_str), "accession='%s'", accession);
answer2 = sqlGetField(database, "pepMwAa", "molWeight", cond_str);
if (answer2 != NULL)
{
molWt[molWtCnt] = (double)(atof(answer2));
molWtCnt++;
}
/* process pI */
safef(cond_str, sizeof(cond_str), "accession='%s'", accession);
answer2 = sqlGetField(database, "pepPi", "pI", cond_str);
if (answer2 != NULL)
{
pI[pIcnt] = (double)(atof(answer2));
pIcnt++;
}
safef(cond_str, sizeof(cond_str), "acc='%s'", accession);
aaSeq = sqlGetField(proteinDatabaseName, "protein", "val", cond_str);
if (aaSeq == NULL)
{
errAbort("%s does not have protein sequence data in %s, aborting ...\n", accession,
proteinDatabaseName);
}
len = strlen(aaSeq);
chp = aaSeq;
for (i=0; i<len; i++)
{
aaResFound = 0;
for (j=0; j<23; j++)
{
if (*chp == aaAlphabet[j])
{
aaResFound = 1;
aaResCnt[j] ++;
}
}
if (!aaResFound)
{
warn("%c %d not a valid AA residue in %s:\n%s", *chp, *chp, accession, aaSeq);
}
chp++;
}
/* calculate hydrophobicity */
chp = aaSeq;
cCnt = 0;
hydroSum = 0;
for (i=0; i<len; i++)
{
hydroSum = hydroSum + aa_hydro[(int)(*chp)];
/* count Cysteines */
if ((*chp == 'C') || (*chp == 'c'))
{
cCnt ++;
}
chp++;
}
aaLenDouble[icnt] = len;
cCountDouble[icnt] = (double)cCnt;
avgHydro[icnt] = hydroSum/(double)len;
icnt++;
- sqlFreeResult(&sr);
row2 = sqlNextRow(sr2);
}
totalResCnt = 0;
for (i=0; i<23; i++)
{
totalResCnt = totalResCnt + aaResCnt[i];
}
/* write out residue count distribution */
for (i=0; i<20; i++)
{
aaResCntDouble[i] = ((double)aaResCnt[i])/((double)totalResCnt);
fprintf(o2, "%d\t%f\n", i+1, (float)aaResCntDouble[i]);
}
fprintf(o2, "%d\t%f\n", i+1, 0.0);
carefulClose(&o2);
/* calculate and write out various distributions */
calDist(molWt, molWtCnt, 21, 0.0, 10000.0,"pepMolWtDist.tab");
calDist(pI, pIcnt, 61, 3.0, 0.2, "pepPiDist.tab");
calDist(avgHydro, icnt, 41, -2.0, 0.1, "pepHydroDist.tab");
calDist(cCountDouble, icnt, 51, 0.0, 1.0, "pepCCntDist.tab");
calDist(exonCountDouble, jExon, 31, 0.0, 1.0, "pepExonCntDist.tab");
calDist(interProCountDouble, ipcnt, 16, 0.0, 1.0, "pepIPCntDist.tab");
sqlFreeResult(&sr2);
hFreeConn(&conn);
hFreeConn(&conn2);
return(0);
}