src/hg/protein/pfamXref/pfamXref.c 1.10
1.10 2009/09/25 08:52:45 kent
Sped up 10 or 20x by using a hash table instead of repeated sql queries.
Index: src/hg/protein/pfamXref/pfamXref.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/protein/pfamXref/pfamXref.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/hg/protein/pfamXref/pfamXref.c 23 Sep 2009 18:42:25 -0000 1.9
+++ src/hg/protein/pfamXref/pfamXref.c 25 Sep 2009 08:52:45 -0000 1.10
@@ -1,159 +1,168 @@
/* pfamXref read and parse the Pfam-A data file to generate
the .tab files to enable xref between Pfam and SWISS-PROT entries */
#include "common.h"
#include "hCommon.h"
#include "hdb.h"
void usage()
/* Explain usage and exit. */
{
errAbort(
"pfamXref - create pfam xref .tab file "
"usage:\n"
" pfamXref pn pfamInput pfamOutput pfamXref\n"
" pn is protein database name\n"
" pfamInput is input file name of Pfam data.\n"
" pfamOutput is output file name of output file of Pfam AC and description.\n"
" pfamXref is output file name of xref file of Pfam AC and SWISS-PROT ID and KG ID.\n"
"Example: pfamXref rn3 proteins07030 /cluster/store5/proteins/pfam/071803/Pfam-A.full pfamA.tab pfamAXref.tab\n");
}
char line[2000];
int main(int argc, char *argv[])
{
FILE *inf;
FILE *o1, *o2;
char cond_str[256];
char *proteinDB;
char *proteinFileName;
char *answer;
char *outputFileName, *outputFileName2;
char *desc;
char *chp1, *chp=NULL;
char *pfamID, *pfamAC;
char *swissAC, *swissDisplayID;
char emptyString[10] = {""};
int done, gsDone, gsFound, idFound;
char *chp2 = NULL;
if (argc != 5) usage();
proteinDB = cloneString(argv[1]);
proteinFileName = cloneString(argv[2]);
outputFileName = cloneString(argv[3]);
outputFileName2 = cloneString(argv[4]);
o1 = mustOpen(outputFileName, "w");
o2 = mustOpen("jj.dat", "w");
+/* Build up hash for quick access to displayIds. */
+struct hash *displayIdHash = hashNew(20);
+struct sqlConnection *conn = sqlConnect(proteinDB);
+struct sqlResult *sr = sqlGetResult(conn, "select accession,displayID from spXref3");
+char **row;
+while ((row = sqlNextRow(sr)) != NULL)
+ hashAdd(displayIdHash, row[0], cloneString(row[1]));
+sqlFreeResult(&sr);
+sqlDisconnect(&conn);
+
if ((inf = mustOpen(proteinFileName, "r")) == NULL)
{
fprintf(stderr, "Can't open file %s.\n", proteinFileName);
exit(8);
}
done = 0;
while (!done)
{
/* get to the beginning of a Pfam record */
idFound = 0;
while (fgets(line, sizeof(line), inf) != NULL)
{
chp = strstr(line, "GF ID");
if (chp != NULL)
{
idFound = 1;
break;
}
}
if (!idFound) break;
chp = chp + 8;
*(chp + strlen(chp) - 1) = '\0'; /* remove LF */
pfamID = strdup(chp);
/* Get Pfam AC */
mustGetLine(inf, line, sizeof(line));
chp = strstr(line, "GF AC ");
chp = chp + 8;
*(chp + strlen(chp) - 1) = '\0'; // remove LF
chp1 = strstr(chp, ".");
if (chp1 != NULL) *chp1 = '\0';
pfamAC = strdup(chp);
/* Get Pfam description. Please note, Pfam-B does not have this field.*/
mustGetLine(inf, line, sizeof(line));
chp = strstr(line, "GF DE ");
chp = chp + 8;
*(chp + strlen(chp) - 1) = '\0'; // remove LF
desc = chp;
fprintf(o1, "%s\t%s\t%s\n", pfamAC, pfamID, desc);
/* work on "#=GS ... AC ... " lines to get SWISS-PROT accession number */
gsDone = 0;
gsFound = 0;
while (!gsDone)
{
mustGetLine(inf, line, sizeof(line));
chp = strstr(line, "#=GS ");
if (chp != NULL)
{
chp = strstr(chp, " AC ");
if (chp != NULL)
{
gsFound = 1;
chp = chp + 4;
chp2 = strstr(chp, ".");
*(chp + strlen(chp) - 1) = '\0'; // remove LF
if (chp2 != NULL) *chp2='\0';
swissAC = chp;
// get display ID from AC
- sprintf(cond_str, "accession = '%s'", swissAC);
- answer = sqlGetField(proteinDB, "spXref3", "displayID", cond_str);
+ answer = hashFindVal(displayIdHash, swissAC);
if (answer != NULL)
{
swissDisplayID = answer;
}
else
{
// ACs missing from spXref3 might be found from spSecondardy table
sprintf(cond_str, "accession2 = '%s'", swissAC);
answer = sqlGetField(proteinDB, "spSecondaryID", "displayID", cond_str);
if (answer != NULL)
{
swissDisplayID = answer;
}
else
{
printf("%s not found in %s.spXref3\n", swissAC, proteinDB);fflush(stdout);
swissDisplayID = emptyString;
}
}
fprintf(o2, "%s\t%s\t%s\n", pfamAC, swissAC, swissDisplayID);
}
}
else
{
if (gsFound)
{
gsDone = 1;
}
}
}
}
carefulClose(&o1);
carefulClose(&o2);
sprintf(cond_str, "cat jj.dat | sort | uniq >%s",outputFileName2);
mustSystem(cond_str);
mustSystem("rm jj.dat");
return(0);
}