src/hg/protein/spDbAddVarSplice/spDbAddVarSplice.c 1.5

1.5 2009/08/23 19:51:26 kent
Coping with addition of sp| at start of fasta protein name in varSplice section of SwissProt.
Index: src/hg/protein/spDbAddVarSplice/spDbAddVarSplice.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/protein/spDbAddVarSplice/spDbAddVarSplice.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/protein/spDbAddVarSplice/spDbAddVarSplice.c	17 Mar 2007 18:12:41 -0000	1.4
+++ src/hg/protein/spDbAddVarSplice/spDbAddVarSplice.c	23 Aug 2009 19:51:26 -0000	1.5
@@ -1,130 +1,132 @@
 /* spDbAddVarSplice - This adds information on the variant splices to the sp/uniProt database. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "portable.h"
 #include "dnautil.h"
 #include "dnaseq.h"
 #include "fa.h"
 #include "jksql.h"
 #include "spDb.h"
 
 static char const rcsid[] = "$Id$";
 
 void usage()
 /* Explain usage and exit.
  * The whole help text is handed to errAbort as its format string; it contains
  * no % conversions, so no further arguments are needed.
  * NOTE(review): the text names four output files (varProtein.txt,
  * varDisplayId.txt, varAcc.txt, varAccToTaxon.txt), but spDbAddVarSplice in
  * this file also writes varDescription.txt, varGene.txt and varGeneLogic.txt. */
 {
 errAbort(
   "spDbAddVarSplice - This adds information on the varient splices to the sp/uniProt database\n"
   "usage:\n"
   "   spDbAddVarSplice database varsplice.fasta outDir\n"
   "Where:\n"
   "   database is the database to add to\n"
   "   varSplice.fasta is a fasta file with the splice varients\n"
   "   outDir is where various output tab-generated files live.\n"
   "After this program is done, add the varProtein.txt file into both protein\n"
   "and varProtein tables, add varDisplayId.txt to displayId table, and varAcc.txt\n"
   "into varAcc table, and the varAccToTaxon.txt into the accToTaxon table.\n"
   );
 }
 
 /* Command line option table handed to optionInit in main.  It holds only the
  * {NULL, 0} entry, so this program declares no named options. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 FILE *openToWrite(char *dir, char *file)
 /* Return dir/file open for writing.
  *   dir  - directory part of the path
  *   file - file name inside dir
  * The path is formatted as "dir/file" into a PATH_LEN sized buffer and opened
  * with mustOpen in "w" mode.  The caller owns the returned FILE; the callers
  * in this file release it with carefulClose. */
 {
 char path[PATH_LEN];
 safef(path, sizeof(path), "%s/%s", dir, file);
 return mustOpen(path, "w");
 }
 
 void spDbAddVarSplice(char *database, char *inFile, char *outDir)
 /* spDbAddVarSplice - Write tab-separated files describing the variant splices
  * found in a fasta file, using the sp/uniProt database for lookups.
  *   database - name passed to sqlConnect; consulted for the taxon, description,
  *              geneLogic value and genes of each base accession
  *   inFile   - fasta file of splice variants, one record per variant
  *   outDir   - directory (made with makeDir) that receives the output files
  * Each record name must parse as accession-N|DISP_ID, optionally preceded by
  * "sp|".  A record that fails any check below is reported with errAbort.
  * The output files are meant to be loaded into tables afterwards (see the
  * usage message). */
 {
 struct sqlConnection *conn = sqlConnect(database);
 char query[256];
 makeDir(outDir);
 /* One output file per kind of information; all rows are keyed by acc-version. */
 FILE *varProtein = openToWrite(outDir, "varProtein.txt");
 FILE *varAcc = openToWrite(outDir, "varAcc.txt");
 FILE *varDisplayId = openToWrite(outDir, "varDisplayId.txt");
 FILE *varAccToTaxon = openToWrite(outDir, "varAccToTaxon.txt");
 FILE *varDescription = openToWrite(outDir, "varDescription.txt");
 FILE *varGene = openToWrite(outDir, "varGene.txt");
 FILE *varGeneLogic = openToWrite(outDir, "varGeneLogic.txt");
 struct lineFile *lf = lineFileOpen(inFile, TRUE);
 aaSeq seq;
 ZeroVar(&seq);
 /* The reader fills seq.dna, seq.size and seq.name for each fasta record. */
 while (faPepSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
     {
     /* The name is split using "-|" as separators; a well formed name yields
      * exactly three fields: accession, N and DISP_ID.  Presumably row has a
      * fourth slot so that a surplus field shows up as rowSize 4 - TODO confirm
      * against chopString. */
     char *row[4];
-    int rowSize = chopString(seq.name, "-|", row, ArraySize(row));
+    char *name = seq.name;
+    if (startsWith("sp|", name))	// Skip over sp| introduced Aug 2009
+        name += 3;
+    int rowSize = chopString(name, "-|", row, ArraySize(row));
     if (rowSize != 3)
-        errAbort("Expecting name to be in format accession-N|DISP_ID, got %s\n", seq.name);
-    chopString(seq.name, "-|", row, ArraySize(row));
+        errAbort("Expecting name to be in format accession-N|DISP_ID, got %s\n", name);
     char *acc = row[0];
     char *version = row[1];
     char *displayId = row[2];
     int accLen = strlen(acc);
     int verLen = strlen(version);
     int displayIdLen = strlen(displayId);
 
     /* Do some tests. */
     /* Accession: 6 to 8 characters, not starting with a digit, ending with one. */
     if (accLen < 6 || accLen > 8 || isdigit(acc[0]) || !isdigit(acc[accLen-1]))
         errAbort("wierd accession %s before line %d of %s", acc, lf->lineIx, lf->fileName);
     /* Version: starts with a digit and is at most 4 characters long. */
     if (!isdigit(version[0]) || verLen > 4)
         errAbort("wierd version %s before line %d of %s", version, lf->lineIx, lf->fileName);
     /* Display id: exactly one '_' and 6 to 16 characters long. */
     if (countChars(displayId, '_') != 1 || displayIdLen < 6 || displayIdLen > 16)
         errAbort("wierd displayId %s before line %d of %s", displayId, lf->lineIx, lf->fileName);
     /* The combined "acc-version" length must stay below sizeof(SpAcc). */
     if (accLen + 1 + verLen >= sizeof(SpAcc))
         errAbort("Need to increase size of SpAcc in spDb.h because of %s-%s", acc, version);
 
     /* Print out parsed results. */
     fprintf(varAcc, "%s-%s\t%s\t%s\n", acc, version, acc, version);
     fprintf(varProtein, "%s-%s\t%s\n", acc, version, seq.dna);
     /* NOTE(review): both columns written here are acc-version; the displayId
      * parsed from the name is only validated above and never written.
      * Confirm that this is intended. */
     fprintf(varDisplayId, "%s-%s\t%s-%s\n", acc, version, acc, version);
 
     /* Look up taxon of base protein and use it to write to varAccToTaxon table. */
     int taxon = spTaxon(conn, acc);
     fprintf(varAccToTaxon, "%s-%s\t%d\n", acc, version, taxon);
 
     /*Transfer description. */
     char *description = spDescription(conn, acc);
     fprintf(varDescription, "%s-%s\t%s\n", acc, version, description);
     freez(&description);
 
     /* Transfer gene logic. */
     /* NOTE(review): acc is placed into the SQL text directly; the tests above
      * constrain only its length and its first and last characters. */
     safef(query, sizeof(query), "select val from geneLogic where acc = '%s'", acc);
     char *geneLogic = sqlQuickString(conn, query);
     /* A row is written only when the lookup returned a value. */
     if (geneLogic != NULL)
         fprintf(varGeneLogic, "%s-%s\t%s\n", acc, version, geneLogic);
     freez(&geneLogic);
 
     /* Transfer genes. */
     /* One output row per gene name returned for the base accession. */
     struct slName *gene, *geneList = spGenes(conn, acc);
     for (gene = geneList; gene != NULL; gene = gene->next)
         fprintf(varGene, "%s-%s\t%s\n", acc, version, gene->name);
     slFreeList(&geneList);
 
     }
 /* Close every output file and drop the database connection.
  * NOTE(review): lf is not closed anywhere in this function. */
 carefulClose(&varAcc);
 carefulClose(&varProtein);
 carefulClose(&varDisplayId);
 carefulClose(&varAccToTaxon);
 carefulClose(&varDescription);
 carefulClose(&varGene);
 carefulClose(&varGeneLogic);
 sqlDisconnect(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Process command line.
  * After optionInit has processed argv against the (empty) options table,
  * exactly three positional arguments must remain: database, fasta file and
  * output directory, in that order.  Any other count calls usage().
  * NOTE(review): there is no return after usage(), so this relies on usage()
  * not returning. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 spDbAddVarSplice(argv[1], argv[2], argv[3]);
 return 0;
 }