src/hg/protein/spToDb/spToDb.c 4898794edd81be5285ea6e544acbedeaeb31bf78

4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/protein/spToDb/spToDb.c src/hg/protein/spToDb/spToDb.c
index cd5edc1..58b5a04 100644
--- src/hg/protein/spToDb/spToDb.c
+++ src/hg/protein/spToDb/spToDb.c
@@ -1,1260 +1,1260 @@
 /* spToDb - Create a relational database out of SwissProt/trEMBL flat files. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "localmem.h"
 #include "dystring.h"
 #include "portable.h"
 #include "sqlNum.h"
 #include "obscure.h"
 #include "intValTree.h"
 
 
 void usage()
 /* Print the command line synopsis through errAbort, which ends the program. */
 {
 static char *synopsis =
   "spToDb - Create a relational database out of SwissProt/trEMBL flat files\n"
   "usage:\n"
   "   spToDb swissProt.dat outDir\n"
   "options:\n"
   "   -dupeOk - Allow duplicate accessions in input, and just use first one\n";
 errAbort("%s", synopsis);
 }
 
 /* Table of recognized command line options.  The entry with a NULL name
  * marks the end of the table. */
 static struct optionSpec options[] = {
    {"dupeOk", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 /* When TRUE a record whose primary accession was already seen is skipped
  * instead of aborting the run.  Presumably set from the -dupeOk option by
  * code outside this section - confirm in main(). */
 boolean dupeOk;
 
 /* Number of characters groupLine() skips at the start of every continuation
  * line before appending the rest of it. */
 const int lineHeadSize = 5;
 
 void groupLine(struct lineFile *lf, char *type, 
 	char *firstLine, struct dyString *val)
 /* Gather a field that may span several lines into val.  val is reset to
  * firstLine, then each following line whose first two characters equal the
  * first two characters of type is appended, minus its lineHeadSize-character
  * header and preceded by one space.  The first line that does not match is
  * handed back to lf for the caller to read again. */
 {
 /* Callers pass pointers into the current line, so remember the two code
  * letters before asking lf for more input. */
 char code0 = type[0], code1 = type[1];
 char *next;
 dyStringClear(val);
 dyStringAppend(val, firstLine);
 for (;;)
     {
     if (!lineFileNext(lf, &next, NULL))
         break;
     if (next[0] != code0 || next[1] != code1)
         {
         lineFileReuse(lf);
         break;
         }
     dyStringAppendC(val, ' ');
     dyStringAppend(val, next + lineHeadSize);
     }
 }
 
 void stripLastPeriod(char *s)
 /* Remove last period if any.  s is modified in place; a string that is
  * empty or does not end in '.' is left untouched. */
 {
 size_t len = strlen(s);
 /* Guard the empty string: without it the index below would be -1 and the
  * byte in front of the buffer would be examined (and possibly cleared). */
 if (len > 0 && s[len-1] == '.')
     s[len-1] = 0;
 }
 
 void stripLastChar(char *s)
 /* Remove last character from string.  s is modified in place; an empty
  * string is left untouched. */
 {
 size_t len = strlen(s);
 /* Guard the empty string: without it the write below would land on the
  * byte in front of the buffer. */
 if (len > 0)
     s[len-1] = 0;
 }
 
 struct spRecord
 /* A swissProt record, as filled in by spRecordNext.  Strings and lists are
  * allocated from the local memory pool passed to spRecordNext. */
     {
     char *id;	/* Display ID (second word of the ID line). */
     bool isCurated;	/* Is curated (SwissProt vs. trEMBL) */
     int aaSize;		/* Amino acid size, from the ID line. */
     struct slName *accList;	/* Accession list, first is primary. */
     char *createDate;   /* Creation date, as written on the DT line. */
     char *seqDate;	/* Sequence last update, as written on the DT line. */
     char *annDate;	/* Annotation last update, as written on the DT line. */
     char *description;  /* Description - may be a couple of lines. */
     char *genes;	/* Raw text of the GN line(s). */
     struct hashEl *gnList; /* Gene name list: name=value items parsed from GN. */
     char *orgSciName;	/* Organism scientific name. */
     struct slName *commonNames;	/* Common name(s), taken from parentheses on OS line. */
     char *taxonToGenus;	/* Taxonomy up to genus. */
     struct spTaxon *taxonList;	/* NCBI Taxonomy ID(s) from the OX line. */
     struct spTaxon *hostList;	/* NCBI ID of host organisms if any */
     char *organelle;	/* Organelle. */
     struct spLitRef *literatureList;	/* List of literature references. */
     struct spComment *commentList;	/* List of comments. */
     struct spDbRef *dbRefList;	/* List of database cross references. */
     struct slName *keyWordList;	/* Key word list. */
     struct spFeature *featureList;	/* List of features. */
     int molWeight;	/* Molecular weight, from the SQ line. */
     struct spProtEv *spProtEvList;   /* Protein evidence list. */
     char *seq;	/* Sequence, one letter per amino acid. */
     };
 
 struct spTaxon
 /* List of taxon IDs. */
     {
     struct spTaxon *next;
     int id;
     };
 
 struct spComment 
 /* A swissProt structured comment, one per '-!-' item in the CC lines. */
     {
     struct spComment *next;	/* Next in list. */
     char *type;	/* Type of comment: the text between '-!- ' and ':'. */
     char *text;	/* Text of comment, continuation lines joined by spaces. */
     };
 
 struct spDbRef 
 /* A reference to another database from swissProt, one per DR line. */
     {
     struct spDbRef *next;	/* Next in list. */
     char *db;			/* Name of other database (first word of DR line). */
     struct slName *idList;	/* ID's in other database, in line order. */
     };
 
 struct spFeature
 /* A feature - describes a section of a protein. */
     {
     struct spFeature *next;	/* Next in list. */
     char *class;		/* Class of feature. */
     int start;			/* Zero based. */
     int end;			/* Non-inclusive. */
     char *type;			/* Type of feature, more specific than class. 
                                  * May be NULL. */
     char *FTId;			/* FT Id, may be NULL. */
     char softEndBits;/* 1 for start <, 2 for start ?, 4 for end >, 8 for end ? */
     };
 /* Bit values that are or-ed together in spFeature.softEndBits. */
 #define spFeatureStartLess 1
 #define spFeatureStartFuzzy 2
 #define spFeatureEndGreater 4
 #define spFeatureEndFuzzy 8
 
 struct spLitRef 
 /* A swissProt literature reference, built from the R* lines after an RN line. */
     {
     struct spLitRef *next;	/* Next in list. */
     char *title;	/* Title of article. */
     char *cite;		/* Journal/book/patent citation. */
     struct slName *authorList;	/* Author names in lastName, F.M. format. */
     char *rp;		/* Somewhat complex 'Reference Position' line.
 			 * Text of RG lines is stored here as well. */
     struct hashEl *rxList; /* Cross-references. */
     struct hashEl *rcList; /* TISSUE=XXX X; STRAIN=YYY; parsed out. */
     char *pubMedId;	/* pubMed ID, may be NULL. */
     char *medlineId;	/* Medline ID, may be NULL. */
     char *doiId;	/* DOI ID, may be NULL. */
     };
 
 struct spProtEv
 /* Protein evidence type from PE line: one 'ID: description;' pair. */
     {
     struct spProtEv *next;	/* Next in list. */
     int id;			/* Numerical ID from SwissProt. */
     char *name;			/* Name: the description following the ID. */
     };
 
 int spProtEvCmpId(const void *va, const void *vb)
 /* Compare spProtEv by id. */
 {
 const struct spProtEv *a = *((struct spProtEv **)va);
 const struct spProtEv *b = *((struct spProtEv **)vb);
 return a->id - b->id;
 }
 
 
 static void spCommentClose(struct spComment **pCom, struct lm *lm, 
 	struct dyString *dy)
 /* If a comment is currently open, store the text gathered in dy as its
  * text and mark it closed by clearing *pCom. */
 {
 if (*pCom != NULL)
     {
     (*pCom)->text = lmCloneString(lm, dy->string);
     *pCom = NULL;
     }
 }
 
 static void spParseComment(struct lineFile *lf, char *line, 
 	struct lm *lm, struct dyString *dy, struct spRecord *spr)
 /* Parse a run of CC lines into spComment records added to the head of
  * spr->commentList.  line is the text of the first CC line past its
  * header.  A '-!-' line opens a new comment whose type is the text up to
  * the ':', a '---' line closes the open comment without being stored, and
  * any other line is appended to the open comment's text.  The first line
  * that does not start with CC is handed back to lf. */
 {
 struct spComment *open = NULL;
 for (;;)
     {
     if (startsWith("-!-", line))
         {
         char *topic, *colon;
         spCommentClose(&open, lm, dy);
         topic = line + 4;	/* Skip '-!- '*/
         colon = strchr(topic, ':');
         if (colon == NULL)
             errAbort("expecting ':' line %d of %s", lf->lineIx, lf->fileName);
         *colon = 0;
         lmAllocVar(lm, open);
         open->type = lmCloneString(lm, topic);
         slAddHead(&spr->commentList, open);
         dyStringClear(dy);
         dyStringAppend(dy, skipLeadingSpaces(colon + 1));
         }
     else if (startsWith("---", line))
         {
         /* Separator (probably around the copyright notice): it ends the
          * open comment and its own text is not kept. */
         spCommentClose(&open, lm, dy);
         }
     else if (open != NULL && strlen(line) > 3)
         {
         /* Continuation of the open comment, past its 3 character indent. */
         dyStringAppendC(dy, ' ');
         dyStringAppend(dy, line + 3);
         }
     /* Move on to the next line; stop at end of input or at a non-CC line. */
     if (!lineFileNext(lf, &line, NULL))
         break;
     if (!startsWith("CC", line))
         {
         lineFileReuse(lf);
         break;
         }
     line += 5;
     }
 spCommentClose(&open, lm, dy);
 }
 
 static void parseNameVals(struct lineFile *lf, struct lm *lm, 
 	char *line, struct hashEl **pList)
 /* Parse things of form 'xxx=yyyy zzz; aaa=bbb ccc;' into *pList.
  * Each name=value item becomes an lm-allocated hashEl added to the head of
  * *pList, so the list is built in reverse order of appearance.  line is
  * cut up in place.  Parsing stops quietly at the first item without an
  * '='; unless the line mentions "DOI" the whole line is then reported at
  * verbose level 2.  lf is only used for the line number and file name in
  * that report. */
 {
 char *name, *val, *e;
 struct hashEl *hel;
 /* Keep an unmodified copy of the line for the report below.  The copy is
  * made with cloneString so that it pairs with the freeMem at the end. */
 char *inLine = cloneString(line);
 while (line != NULL && line[0] != 0)
     {
     name = skipLeadingSpaces(line);
     val = strchr(name, '=');
     if (val == NULL)
         {
         /* This used to be a hard error, but was relaxed to accept a newer
          * Swiss-Prot file format.  Still seemed to be necessary Feb 2, 2006.
          * I contacted UniProt about it.  -jk. */
         if (!strstr(inLine, "DOI"))
             {
             verbose(2, "%d of %s:%s\n", lf->lineIx, lf->fileName, inLine);
             }
         break;
         }
     *val++ = 0;
     e = strchr(val, ';');
     if (e != NULL)
         *e++ = 0;
     lmAllocVar(lm, hel);
     hel->name = lmCloneString(lm, name);
     hel->val = lmCloneString(lm, val);
     slAddHead(pList, hel);
     line = e;	/* NULL when there was no closing ';', which ends the loop. */
     }
 freeMem(inLine);
 }
 
 static void spParseAuthors(struct lm *lm, char *s, struct slName **pList)
 /* Split s at commas (or at a ';' once no comma is left) and add each piece,
  * with leading spaces skipped, to the head of *pList.  s is cut up in place. */
 {
 for (;;)
     {
     char *e;
     struct slName *author;
     s = skipLeadingSpaces(s);
     if (s == NULL || s[0] == 0)
         break;
     e = strchr(s, ',');
     if (e == NULL)
         e = strchr(s, ';');
     if (e != NULL)
         *e++ = 0;
     author = lmSlName(lm, s);
     slAddHead(pList, author);
     s = e;
     }
 }
 
 static void spParseReference(struct lineFile *lf, char *line, 
 	struct lm *lm, struct dyString *dy, struct spRecord *spr)
 /* Parse the lines following an RN line into a new spLitRef and add it to
  * the head of spr->literatureList.  The incoming line is not examined; the
  * reference number is implicit in the order of the list.  Lines starting
  * with RP, RG, RC, RX, RA, RT or RL are consumed; the first other line is
  * handed back to lf. */
 {
 struct spLitRef *ref;
 lmAllocVar(lm, ref);
 for (;;)
     {
     if (!lineFileNext(lf, &line, NULL))
         break;
     if (startsWith("RP", line) || startsWith("RG", line))
         {
         /* Both kinds are stored in rp, so when a reference has both the
          * one read last replaces the other. */
         groupLine(lf, line, line+5, dy);
         ref->rp = lmCloneString(lm, dy->string);
         }
     else if (startsWith("RC", line))
         {
         groupLine(lf, line, line+5, dy);
         parseNameVals(lf, lm, dy->string, &ref->rcList);
         }
     else if (startsWith("RX", line))
         {
         groupLine(lf, line, line+5, dy);
         parseNameVals(lf, lm, dy->string, &ref->rxList);
         }
     else if (startsWith("RA", line))
         {
         groupLine(lf, line, line+5, dy);
         spParseAuthors(lm, dy->string, &ref->authorList);
         }
     else if (startsWith("RT", line))
         {
         char *title;
         groupLine(lf, line, line+5, dy);
         title = dy->string;
         stripLastChar(title);
         if (title[0] == '"')
             parseQuotedString(title, title, NULL);
         ref->title = lmCloneString(lm, title);
         }
     else if (startsWith("RL", line))
         {
         groupLine(lf, line, line+5, dy);
         ref->cite = lmCloneString(lm, dy->string);
         }
     else 
         {
         lineFileReuse(lf);
         break;
         }
     }
 /* The lists were built by adding to the head; put them in file order. */
 slReverse(&ref->authorList);
 slReverse(&ref->rcList);
 slReverse(&ref->rxList);
 
 /* Look up medline/pubmed/DOI IDs among the cross-references. */
 ref->pubMedId = hashElFindVal(ref->rxList, "PubMed");
 ref->medlineId = hashElFindVal(ref->rxList, "MEDLINE");
 ref->doiId     = hashElFindVal(ref->rxList, "DOI");
 slAddHead(&spr->literatureList, ref);
 }
 
 static void spReadSeq(struct lineFile *lf, struct lm *lm, 
 	struct dyString *dy, struct spRecord *spr)
 /* Read the sequence lines that follow an SQ line and store the result in
  * spr->seq.  Sequence lines are the ones starting with a space; all spaces
  * are removed from them before they are joined.  The first line that does
  * not start with a space is handed back to lf. */
 {
 char *row;
 dyStringClear(dy);
 for (;;)
     {
     if (!lineFileNext(lf, &row, NULL))
         break;
     if (row[0] != ' ')
         {
         lineFileReuse(lf);
         break;
         }
     stripChar(row, ' ');
     dyStringAppend(dy, row);
     }
 spr->seq = lmCloneString(lm, dy->string);
 }
 
 static char *spCloneFtId(struct lm *lm, char *line)
 /* If line contains "/FTId=" return an lm-allocated copy of the text after
  * it, with a final period removed.  Return NULL if it is not there. */
 {
 char *tag = strstr(line, "/FTId=");
 if (tag == NULL)
     return NULL;
 tag = lmCloneString(lm, tag+6);
 stripLastPeriod(tag);
 return tag;
 }
 
 static void spParseFeature(struct lineFile *lf, char *line, 
 	struct lm *lm, struct dyString *dy, struct spRecord *spr)
 /* Parse feature into record and hang it on spr.
  * line is the text of an FT line past its header: class, start, end and an
  * optional description (the feature type).  A leading '<' or '?' on start
  * and '>' or '?' on end are recorded in softEndBits.  The start is stored
  * minus one, the end as written.  Continuation lines (FT followed by extra
  * spaces) are either a "/FTId=" line, which sets FTId, or more of the
  * description.  The new feature is added to the head of spr->featureList.
  * Aborts if the line has fewer than three words. */
 {
 struct spFeature *feat;
 char *class = nextWord(&line);
 char *start = nextWord(&line);
 char *end = nextWord(&line);
 char *type = skipLeadingSpaces(line);
 char *FTId;
 char c;
 if (end == NULL || end[0] == 0)
     errAbort("Short FT line %d of %s", lf->lineIx, lf->fileName);
 lmAllocVar(lm, feat);
 feat->class = lmCloneString(lm, class);
 c = *start;
 if (c == '<')
     {
     feat->softEndBits |= spFeatureStartLess;
     start += 1;
     }
 else if (c == '?')
     {
     feat->softEndBits |= spFeatureStartFuzzy;
     start += 1;
     }
 c = *end;
 if (c == '>')
     {
     feat->softEndBits |= spFeatureEndGreater;
     end += 1;
     }
 else if (c == '?')
     {
     feat->softEndBits |= spFeatureEndFuzzy;
     end += 1;
     }
 feat->start = atoi(start)-1;
 feat->end = atoi(end);
 
 if (type != NULL)
     {
     /* There is a description, possibly continued on following lines. */
     dyStringClear(dy);
     dyStringAppend(dy, type);
     dyStringAppend(dy, " ");  /* space between lines */
     while (lineFileNext(lf, &line, NULL))
         {
         char *sig = "FT    ";	/* Extra space after FT */
         if (!startsWith(sig, line))
             {
             lineFileReuse(lf);
             break;
             }
         line = skipLeadingSpaces(line+strlen(sig));
         FTId = spCloneFtId(lm, line);
         if (FTId != NULL)
             feat->FTId = FTId;
         else
             {
             dyStringAppend(dy, line);
             dyStringAppend(dy, " "); /* space between lines */
             }
         }
     eraseTrailingSpaces(dy->string); /* remove last space */
     stripLastPeriod(dy->string);
     feat->type = lmCloneString(lm, dy->string);
     }
 else
     {
     /* No description.  Peek at the next line for an FTId.  The read result
      * is checked so that at end of input nothing is searched or handed
      * back; the caller then reports the truncated record. */
     if (lineFileNext(lf, &line, NULL))
         {
         FTId = spCloneFtId(lm, line);
         if (FTId != NULL)
             feat->FTId = FTId;
         else
             lineFileReuse(lf);
         }
     }
 slAddHead(&spr->featureList, feat);
 }
 
 static void parseProteinEvidence(struct lineFile *lf, struct lm *lm, struct dyString *dy, 
 	struct spRecord *spr)
 /* Parse protein evidence line[s] assuming these are already grouped into dy.  At this
  * stage they will be of form:
  *     ID: evidence description; ID: evidence description;
  * where ID is a number.   There should be a 1-1 correspondence between ID's and
  * evidence descriptions, but we check that later, not here, since we're just parsing
  * a single one here.  Each pair is added to the head of spr->spProtEvList; dy's
  * string is cut up in place.  Aborts on a missing ':' or ';' or on an ID that does
  * not start with a digit.  lf is only used for error messages. */
 {
 char *s = dy->string;
 for (;;)
     {
     struct spProtEv *spProtEv;
     char *evidenceId, *evidenceName;
     s = skipLeadingSpaces(s);
     if (s == NULL || s[0] == 0)
         break;
     evidenceId = s;
     s = strchr(s, ':');
     if (s == NULL)
        errAbort("Expecting colon after number in PE line %d of %s",
             lf->lineIx, lf->fileName);
     *s++ = 0;
     /* The cast keeps isdigit defined for bytes that are negative as plain char. */
     if (!isdigit((unsigned char)evidenceId[0]))
         errAbort("Non-numerical evidence ID line %d of %s", 
             lf->lineIx, lf->fileName);
     s = skipLeadingSpaces(s);
     evidenceName = s;
     s = strchr(s, ';');
     if (s == NULL)
         errAbort("expecting semicolon line %d of %s", 
             lf->lineIx, lf->fileName);
     *s++ = 0;
     lmAllocVar(lm, spProtEv);
     spProtEv->id = sqlUnsigned(evidenceId);
     spProtEv->name = lmCloneString(lm, evidenceName);
     slAddHead(&spr->spProtEvList, spProtEv);
     }
 }
 
 struct spRecord *spRecordNext(struct lineFile *lf, 
 	struct lm *lm, 	/* Local memory pool for this structure. */
 	struct dyString *dy)	/* Scratch string to use. */
 /* Read next record from file and parse it into spRecord structure
  * that is allocated in memory.
  * Returns NULL when no words can be read for the ID line (presumably end
  * of file).  Aborts on malformed input and when the file ends before the
  * terminating '//' line.  All lists in the returned record are in file
  * order. */
 {
 char *line, *word, *type;
 struct spRecord *spr;
 struct slName *n;
 char *taxonSig = "NCBI_TaxID=";
 int taxonSigLen = strlen(taxonSig);
 
 /* Parse ID line.  These can be in two forms with either 5 or 6 words
  * that I know about anyway.  Both forms start with ID and the accession,
  * and end with the size and 'AA.'  In the middle we look for signs of
  * curation depending on how many words there are. */
     {
     char *row[16];
     int rowSize = lineFileChopNext(lf, row, ArraySize(row));
     if (rowSize == 0)
         return NULL;
     if (rowSize < 5)
         errAbort("Short first line of record line %d of %s", 
 		lf->lineIx, lf->fileName);
     if (rowSize > 6)
         errAbort("Long first line of record line %d of %s", 
 		lf->lineIx, lf->fileName);
     lmAllocVar(lm, spr);
 
     /* Fetch ID. */
     if (!sameString(row[0], "ID"))
 	errAbort("Expecting ID line %d of %s", lf->lineIx, lf->fileName);
     spr->id = lmCloneString(lm, row[1]);
 
     /* Fetch size: the last word must be 'AA.' and the one before it a number. */
     if (!sameString(row[rowSize-1], "AA."))
         errAbort("Expecting 'AA.' at end of line %d of %s",
 		lf->lineIx, lf->fileName);
     spr->aaSize = lineFileNeedNum(lf, row, rowSize-2);
 
     /* Figure out if it's curated. */
     char *tag = row[2];
     if (sameString(tag, "STANDARD;"))
         spr->isCurated = TRUE;
     else if (sameString(tag, "Reviewed;"))
         spr->isCurated = TRUE;
     else if (sameString(tag, "PRELIMINARY;"))
         spr->isCurated = FALSE;
     else if (sameString(tag, "Unreviewed;"))
         spr->isCurated = FALSE;
     else
         errAbort("Unrecognized field 3 of ID line %d of %s",
 		lf->lineIx, lf->fileName);
     }
 
 /* Loop around parsing until get '//' */
 for (;;)
     {
     if (!lineFileNextReal(lf, &line))
         errAbort("%s ends in middle of a record", lf->fileName);
     if (startsWith("//", line))
         break;
     /* type keeps pointing at the two letter line code while line moves
      * past the 5 character header to the content. */
     type = line;
     line += 5;
     if (type[2] != ' ' || line[-1] != ' ' || line[0] == ' ')
 	{
         errAbort("Looks like SwissProt changed white space after type, line %d of %s", 
 		lf->lineIx, lf->fileName);
 	}
     if (startsWith("FT", type))
         {
 	spParseFeature(lf, line, lm, dy, spr);
 	}
     else if (startsWith("DR", type))
         {
 	/* Database cross reference: the database name followed by IDs.  The
 	 * last character of every word is dropped. */
 	struct spDbRef *dr;
 	word = nextWord(&line);
 	if (word == NULL)
 	    errAbort("Short DR line %d of %s", lf->lineIx, lf->fileName);
 	stripLastChar(word);
 	lmAllocVar(lm, dr);
 	dr->db = lmCloneString(lm, word);
 	while ((word = nextWord(&line)) != NULL)
 	    {
 	    stripLastChar(word);
 	    n = lmSlName(lm, word);
 	    slAddHead(&dr->idList, n);
 	    }
 	slReverse(&dr->idList);
 	slAddHead(&spr->dbRefList, dr);
 	}
     else if (startsWith("DT", type))
         {
 	/* Date line: the first word is the date and the rest of the line
 	 * tells which of the three dates it is.  Both the older
 	 * parenthesized wording and the newer sentence wording are accepted. */
 	char *date;
 	date = nextWord(&line);
 	if (date == NULL)
 	    errAbort("Short DT line %d of %s", lf->lineIx, lf->fileName);
 	if (endsWith(line, "Created)") 
 		|| endsWith(line, "integrated into UniProtKB/Swiss-Prot.") 
 		|| endsWith(line, "integrated into UniProtKB/TrEMBL."))
 	    spr->createDate = lmCloneString(lm, date);
 	else if (endsWith(line, "Last sequence update)") 
 		|| stringIn("sequence version", line))
 	    spr->seqDate = lmCloneString(lm, date);
 	else if (endsWith(line, "Last annotation update)") 
 		|| stringIn("entry version", line))
 	    spr->annDate = lmCloneString(lm, date);
 	else
 	    {
 	    errAbort("Unrecognized date type '%s' line %d of %s", 
 	    	line, lf->lineIx, lf->fileName);
 	    }
 	}
     else if (startsWith("AC", type))
         {
 	/* Accessions.  Each AC line is handled on its own, so accessions
 	 * from several AC lines accumulate in one list. */
 	while ((word = nextWord(&line)) != NULL)
 	    {
 	    stripLastChar(word);	/* Cut off '.' or ';' */
 	    n = lmSlName(lm, word);
 	    slAddHead(&spr->accList, n);
 	    }
 	}
     else if (startsWith("DE", type))
         {
 	groupLine(lf, type, line, dy);
 	spr->description = lmCloneString(lm, dy->string);
 	}
     else if (startsWith("OS", type))
         {
 	/* Organism: the scientific name, followed by common names in
 	 * parentheses.  Each '(' ends the text before it and each ')' ends
 	 * a common name. */
 	char *common, *s;
 	char *latin;
 	groupLine(lf, type, line, dy);
 	latin = dy->string;
 	stripLastPeriod(latin);
 	s = latin;
 	while ((common = strchr(s, '(')) != NULL)
 	    {
 	    char *end = strchr(common, ')');
 	    *common++ = 0;
 	    if (end != NULL)
 	        *end++ = 0;
 	    else
 	        break;
 	    n = lmSlName(lm, common);
 	    slAddHead(&spr->commonNames, n);
 	    s = end;
 	    }
 	eraseTrailingSpaces(latin);
 	spr->orgSciName = lmCloneString(lm, latin);
 	}
     else if (startsWith("OC", type))
         {
 	groupLine(lf, type, line, dy);
 	subChar(dy->string, '\n', ' ');
 	spr->taxonToGenus = lmCloneString(lm, dy->string);
 	}
     else if (startsWith("OG", type))
         {
 	groupLine(lf, type, line, dy);
 	spr->organelle = lmCloneString(lm, dy->string);
 	}
     else if (startsWith("OX", type))
         {
 	/* Taxon(s) of species with this protein. */
 	char *s;
 	struct spTaxon *taxon;
 	groupLine(lf, type, line, dy);
 	s = dy->string;
 	if (!startsWith(taxonSig, s))
 	    errAbort("Don't understand OX line %d of %s. Expecting %s", 
 	    	lf->lineIx, lf->fileName, taxonSig);
 	s += taxonSigLen;
 	/* Every remaining word is converted with atoi and stored as a taxon. */
 	while ((word = nextWord(&s)) != NULL)
 	    {
 	    lmAllocVar(lm, taxon);
 	    taxon->id = atoi(word);
 	    slAddHead(&spr->taxonList, taxon);
 	    }
 	}
     else if (startsWith("OH", type))
         {
 	char *s;
 	struct spTaxon *host;
 	/* Pathogen host relationship. */
 	groupLine(lf, type, line, dy);
 	s = dy->string;
 	if (!startsWith(taxonSig, s))
 	    errAbort("Don't understand OH line %d of %s. Expecting %s", 
 	    	lf->lineIx, lf->fileName, taxonSig);
 	/* One host per occurrence of the signature in the grouped text. */
 	while ((s = stringIn(taxonSig, s)) != NULL)
 	     {
 	     s += taxonSigLen;
 	     lmAllocVar(lm, host);
 	     host->id = atoi(s);
 	     slAddHead(&spr->hostList, host);
 	     }
         }
     else if (startsWith("RN", type))
         {
 	spParseReference(lf, line, lm, dy, spr);
 	}
     else if (startsWith("CC", type))
         {
 	spParseComment(lf, line, lm, dy, spr);
 	}
     else if (startsWith("KW", type))
         {
 	/* Keywords separated by ';'.  The final character of the line is
 	 * dropped first.  Each KW line is handled on its own. */
 	char *end;
 	stripLastChar(line);
 	while (line != NULL)
 	    {
 	    end = strchr(line, ';');
 	    if (end != NULL)
 	       *end++ = 0;
 	    line = trimSpaces(line);
 	    n = lmSlName(lm, line);
 	    slAddHead(&spr->keyWordList, n);
 	    line = end;
 	    }
 	}
     else if (startsWith("SQ", type))
         {
 	/* Sequence header.  The fifth word must be 'MW;' and the word before
 	 * it is taken as the molecular weight.  The sequence lines follow. */
 	char *row[5];
 	int wordCount;
 	wordCount = chopLine(line, row);
 	if (wordCount < 5)
 	    errAbort("Short SQ line %d of %s", lf->lineIx, lf->fileName);
 	if (!sameString(row[4], "MW;"))
 	    errAbort("Expecting MW; field 6 line %d of %s, got %s", 
 	    	lf->lineIx, lf->fileName, row[4]);
 	spr->molWeight = atoi(row[3]);
 	spReadSeq(lf, lm, dy, spr);
 	}
     else if (startsWith("GN", type))
         {
 	/* Gene names are kept both as raw text and as parsed name=value
 	 * items.  The raw text is cloned before dy's string is cut up. */
 	groupLine(lf, type, line, dy);
 	spr->genes = lmCloneString(lm, dy->string);
 	parseNameVals(lf, lm, dy->string, &spr->gnList);
 	}
     else if (startsWith("PE", type))
         {
 	groupLine(lf, type, line, dy);
 	parseProteinEvidence(lf, lm, dy, spr);
 	}
     else
         {
 	errAbort("Unrecognized line %d of %s:\n%s",
 		lf->lineIx, lf->fileName, type);
 	}
     }
 /* Everything was added at the head of its list; restore file order. */
 slReverse(&spr->accList);
 slReverse(&spr->gnList);
 slReverse(&spr->taxonList);
 slReverse(&spr->hostList);
 slReverse(&spr->commonNames);
 slReverse(&spr->literatureList);
 slReverse(&spr->commentList);
 slReverse(&spr->dbRefList);
 slReverse(&spr->keyWordList);
 slReverse(&spr->featureList);
 slReverse(&spr->spProtEvList);
 return spr;
 }
 
 /* ------------- Start main program ------------------  */
 
 struct uniquer
 /* Help manage a table that is simply unique: every distinct name gets one
  * numeric id and one output row. */
     {
     struct uniquer *next;	/* Next in list. */
     struct hash *hash;		/* Names seen so far; the value encodes the id. */
     int curId;			/* Last id handed out. */
     FILE *f;			/* Where "id<tab>name" rows are written. */
     };
 
 struct uniquer *uniquerNew(FILE *f, int hashSize)
 /* Allocate a uniquer whose rows go to f and whose name hash is created
  * with newHash(hashSize). */
 {
 struct uniquer *fresh;
 AllocVar(fresh);
 fresh->hash = newHash(hashSize);
 fresh->f = f;
 return fresh;
 }
    
 /* Base pointer used to keep integer ids in hash values as pointer offsets. */
 static char *nullPt = NULL;
 
 int uniqueStore(struct uniquer *uni, char *name)
 /* Return the id that belongs to name in uni's table.  A name seen before
  * gets the id it was given then.  A new name gets the next id, is added to
  * the hash and is written to uni->f as "id<tab>name".  A NULL name always
  * gives 0 and stores nothing. */
 {
 struct hashEl *found;
 if (name == NULL)
     return 0;
 found = hashLookup(uni->hash, name);
 if (found != NULL)
     return (char *)(found->val) - nullPt;
 uni->curId += 1;
 hashAdd(uni->hash, name, nullPt + uni->curId);
 fprintf(uni->f, "%u\t%s\n", uni->curId, name);
 return uni->curId;
 }
 
 void toSqlDate(FILE *f, char *spDate)
 /* Write a SwissProt date to f in MySQL format.
  * SwissProt:  01-NOV-1990
  * MySQL:      1990-11-01
  * Nothing is written when spDate is NULL.  Aborts when the month is not
  * one of the twelve upper case three letter abbreviations. */
 {
 static char *monthNames[] = { "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                               "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" };
 char buf[13];
 int monthIx;
 if (spDate == NULL)
     return;
 strncpy(buf, spDate, sizeof(buf));
 /* Cut DD-MMM-YYYY into three strings in place: the day at offset 0, the
  * month at offset 3 and the year at offset 7. */
 buf[2] = 0;
 buf[6] = 0;
 buf[11] = 0;
 monthIx = stringIx(buf+3, monthNames);
 if (monthIx < 0)
    errAbort("Strange month %s", buf+3);
 fprintf(f, "%s-%02d-%s", buf+7, monthIx+1, buf);
 }
 
 char *blankForNull(char *s)
 /* Give back s unchanged, except that NULL is replaced by the empty string. */
 {
 return (s != NULL) ? s : "";
 }
 
 char *nextGeneWord(char **pS)
 /* Return the next "," separated gene name in *pS, with leading spaces
  * skipped, or NULL when *pS is NULL or holds nothing but spaces.  The comma
  * is overwritten with a terminator and *pS is moved past it; *pS becomes
  * NULL when the returned word was the last one.  *pS is left alone when
  * NULL is returned. */
 {
 char *word = skipLeadingSpaces(*pS);
 char *comma;
 if (word == NULL || word[0] == 0)
     return NULL;
 comma = stringIn(",", word);
 if (comma == NULL)
     *pS = NULL;
 else
     {
     *comma = 0;
     *pS = comma + 1;
     }
 return word;
 }
 
 FILE *createAt(char *dir, char *file)
 /* Open dir/file.txt for writing and return the stream. */
 {
 char fileName[PATH_LEN];
 FILE *out;
 safef(fileName, sizeof(fileName), "%s/%s.txt", dir, file);
 out = mustOpen(fileName, "w");
 return out;
 }
 
 void spToDb(char *datFile, char *tabDir)
 /* spToDb - Create a relational database out of SwissProt/trEMBL flat files. */
 {
 struct lineFile *lf = lineFileOpen(datFile, TRUE);
 struct spRecord *spr;
 struct dyString *dy = newDyString(4096);
 
 /* We have 25 tables to make this fully relational and not
  * lose any info. Better start opening files. */
 makeDir(tabDir);
 FILE *displayId = createAt(tabDir, "displayId");
 FILE *otherAcc = createAt(tabDir, "otherAcc");
 FILE *organelle = createAt(tabDir, "organelle");
 FILE *info = createAt(tabDir, "info");
 FILE *description = createAt(tabDir, "description");
 FILE *geneLogic = createAt(tabDir, "geneLogic");
 FILE *gene = createAt(tabDir, "gene");
 FILE *taxon = createAt(tabDir, "taxon");
 FILE *accToTaxon = createAt(tabDir, "accToTaxon");
 FILE *commonName = createAt(tabDir, "commonName");
 FILE *keyword = createAt(tabDir, "keyword");
 FILE *accToKeyword = createAt(tabDir, "accToKeyword");
 FILE *commentType = createAt(tabDir, "commentType");
 FILE *commentVal = createAt(tabDir, "commentVal");
 FILE *comment = createAt(tabDir, "comment");
 FILE *protein = createAt(tabDir, "protein");
 FILE *extDb = createAt(tabDir, "extDb");
 FILE *extDbRef = createAt(tabDir, "extDbRef");
 FILE *featureClass = createAt(tabDir, "featureClass");
 FILE *featureType = createAt(tabDir, "featureType");
 FILE *featureId = createAt(tabDir, "featureId");
 FILE *feature = createAt(tabDir, "feature");
 FILE *author = createAt(tabDir, "author");
 FILE *reference = createAt(tabDir, "reference");
 FILE *referenceAuthors = createAt(tabDir, "referenceAuthors");
 FILE *citationRp = createAt(tabDir, "citationRp");
 FILE *citation = createAt(tabDir, "citation");
 FILE *rcType = createAt(tabDir, "rcType");
 FILE *rcVal = createAt(tabDir, "rcVal");
 FILE *citationRc = createAt(tabDir, "citationRc");
 FILE *pathogenHost = createAt(tabDir, "pathogenHost");
 FILE *proteinEvidenceType = createAt(tabDir, "proteinEvidenceType");
 FILE *proteinEvidence = createAt(tabDir, "proteinEvidence");
 
 /* Some of the tables require unique IDs */
 struct uniquer *organelleUni = uniquerNew(organelle, 14);
 struct uniquer *keywordUni = uniquerNew(keyword, 14);
 struct uniquer *commentTypeUni = uniquerNew(commentType, 10);
 struct uniquer *commentValUni = uniquerNew(commentVal, 18);
 struct uniquer *extDbUni = uniquerNew(extDb, 8);
 struct uniquer *featureClassUni = uniquerNew(featureClass, 10);
 struct uniquer *featureTypeUni = uniquerNew(featureType, 14);
 struct uniquer *featureIdUni = uniquerNew(featureId, 14);
 struct uniquer *authorUni = uniquerNew(author, 18);
 struct uniquer *referenceUni = uniquerNew(reference, 18);
 struct uniquer *citationRpUni = uniquerNew(citationRp, 18);
 struct uniquer *rcTypeUni = uniquerNew(rcType, 14);
 struct uniquer *rcValUni = uniquerNew(rcVal, 18);
 
 /* Other unique helpers. */
 struct hash *taxonHash = newHash(18);
 struct hash *taxonIdHash = newHash(18);
 struct hash *pathogenHostHash = newHash(16);
 struct hash *accHash = newHash(18);
 int citationId = 0;
 
 /* A little stuff to process proteinEvidenceType IDs, so that we can share the same
  * numerical ID with SwissProt. */
 struct rbTree *proteinEvidenceTree = intValTreeNew();
 struct spProtEv *proteinEvidenceList = NULL;
 
 for (;;)
     {
     struct lm *lm = lmInit(8*1024);
     char *acc;
     struct slName *n;
     int organelleId;
 
     spr = spRecordNext(lf, lm, dy);
     if (spr == NULL)
         break;
     acc = spr->accList->name;
 
     if (hashLookup(accHash, acc))
         {
 	if (dupeOk)
 	    continue;
 	else
 	    errAbort("Duplicate accession %s in record ending line %d of %s", 
 	        acc, lf->lineIx, lf->fileName);
 	}
     else
         hashAdd(accHash, acc, NULL);
 
     /* displayId */
     fprintf(displayId, "%s\t%s\n", acc, spr->id);
 
     /* otherAcc */
     for (n = spr->accList->next; n != NULL; n = n->next)
         fprintf(otherAcc, "%s\t%s\n", acc, n->name);
 
     /* organelle */
     organelleId = uniqueStore(organelleUni, spr->organelle);
 
     /* info */
     fprintf(info, "%s\t%d\t%d\t%d\t", 
     	acc, spr->isCurated, spr->aaSize, spr->molWeight);
     toSqlDate(info, spr->createDate);
     fputc('\t', info);
     toSqlDate(info, spr->seqDate);
     fputc('\t', info);
     toSqlDate(info, spr->annDate);
     fputc('\t', info);
     fprintf(info, "%d\n", organelleId);
 
     /* description */
     if (spr->description != NULL)
 	{
 	subChar(spr->description, '\t', ' ');
 	fprintf(description, "%s\t%s\n", acc, spr->description);
 	}
 	
     /* gene logic and gene */
     if (spr->gnList != NULL)
 	{
 	struct hashEl *hel;
 	char *s = spr->genes, *word;
 	char *gn;
 	stripLastPeriod(s);
 	fprintf(geneLogic, "%s\t%s\n", acc, s);
         int isPrimary = 1; /* first is primary gene name, other synonyms */
 	for (hel = spr->gnList; hel != NULL; hel = hel->next)
 	    {
 	    gn = (char *) hel->val;
 	    /* separte gene names if there are multiple gene names */ 
 	    for (;;)
 	    	{
 	    	word = nextGeneWord(&gn);
 	    	if (word == NULL)
 	            break;
 	    	fprintf(gene, "%s\t%s\t%d\n", acc, word, isPrimary);
                 isPrimary = 0;
 	    	}
 	    }
  	}
     
     /* taxon, commonName, accToTaxon */
     if (spr->taxonList != NULL)
         {
 	struct spTaxon *tax;
 	if (slCount(spr->taxonList) == 1)
 	    {
 	    /* Swiss prot only has full info on the first taxa when it
 	     * contains multiple taxons, so we have to rely on NCBI here... */
 	    int ncbiId = spr->taxonList->id;
 	    char ncbiIdX[16];
 	    safef(ncbiIdX, sizeof(ncbiIdX), "%x", ncbiId);
 	    if (!hashLookup(taxonHash, spr->orgSciName) 
 	    	|| !hashLookup(taxonIdHash, ncbiIdX))
 	        {
 		hashAdd(taxonHash, spr->orgSciName, NULL);
 		hashAdd(taxonIdHash, ncbiIdX, NULL);
 		fprintf(taxon, "%d\t%s\t%s\n",
 			ncbiId, spr->orgSciName, spr->taxonToGenus);
 		for (n = spr->commonNames; n != NULL; n = n->next)
 		    fprintf(commonName, "%d\t%s\n", ncbiId, n->name);
 		}
 	    }
 	for (tax = spr->taxonList; tax != NULL; tax = tax->next)
 	    fprintf(accToTaxon, "%s\t%d\n", acc, tax->id);
 	}
 
     /* host/pathogen relationship */
     if (spr->hostList != NULL)
         {
 	struct spTaxon *patho, *host;
 	if (spr->taxonList == NULL)
 	    errAbort("OH field without OX field in record ending line %d of %s",
 	        lf->lineIx, lf->fileName);
 	for (patho = spr->taxonList; patho != NULL; patho = patho->next)
 	    {
 	    int pathoId = patho->id;
 	    for (host = spr->hostList; host != NULL; host = host->next)
 		{
 		int hostId = host->id;
 		char hashKey[24];
 		safef(hashKey, sizeof(hashKey), "%X_%X", hostId, pathoId);
 		if (!hashLookup(pathogenHostHash, hashKey))
 		    {
 		    hashAdd(pathogenHostHash, hashKey, NULL);
 		    fprintf(pathogenHost, "%d\t%d\n", pathoId, hostId);
 		    }
 		}
 	    }
 	}
 
     /* keyword and accToKeyword */
     for (n = spr->keyWordList; n != NULL; n = n->next)
         {
 	int id = uniqueStore(keywordUni, n->name);
 	fprintf(accToKeyword, "%s\t%d\n", acc, id);
 	}
     
     /* commentType, commenVal, and comment. */
         {
 	struct spComment *spCom;
 	for (spCom = spr->commentList; spCom != NULL; spCom = spCom->next)
 	    {
 	    int commentType = uniqueStore(commentTypeUni, spCom->type);
 	    int commentVal = uniqueStore(commentValUni, spCom->text);
 	    fprintf(comment, "%s\t%d\t%d\n", acc, commentType, commentVal);
 	    }
 	}
 
     /* protein */
     fprintf(protein, "%s\t%s\n", acc, spr->seq);
 
     /* extDb and extDbRef */
         {
 	struct spDbRef *ref;
 	for (ref = spr->dbRefList; ref != NULL; ref = ref->next)
 	    {
 	    int extDb = uniqueStore(extDbUni, ref->db);
 	    int i = 0;
 	    char *extAccs[3];
 	    for (i=0; i<ArraySize(extAccs); ++i)
 	        extAccs[i] = "";
 	    for (n = ref->idList, i=0; n != NULL && i<3; n = n->next, ++i)
 		extAccs[i] = n->name;
 	    fprintf(extDbRef, "%s\t%d\t%s\t%s\t%s\n", acc, extDb, 
 	        extAccs[0], extAccs[1], extAccs[2]);
 	    }
 	}
 
     /* featureClass, featureType, and feature */
         {
 	struct spFeature *feat;
 	for (feat = spr->featureList; feat != NULL; feat = feat->next)
 	    {
 	    int class = uniqueStore(featureClassUni, feat->class);
 	    int type = uniqueStore(featureTypeUni, feat->type);
 	    int Id   = uniqueStore(featureIdUni, feat->FTId);
 	    fprintf(feature, "%s\t%d\t%d\t%d\t%d\t%d\t%d\n",
 	    	acc, feat->start, feat->end, class, type, feat->softEndBits, Id);
 	    }
 	}
 
     /* proteinEvidence. */
         {
 	if (spr->spProtEvList == NULL)
 	    errAbort("Missing required PE field in record ending line %d of %s",
 	        lf->lineIx, lf->fileName);
 	struct spProtEv *ev;
 	for (ev = spr->spProtEvList; ev != NULL; ev = ev->next)
 	    {
 	    struct spProtEv *savedEv = intValTreeFind(proteinEvidenceTree, ev->id);
 	    if (savedEv == NULL)
 	        {
 		AllocVar(savedEv);
 		savedEv->id = ev->id;
 		savedEv->name = cloneString(ev->name);
 		slAddHead(&proteinEvidenceList, savedEv);
 		intValTreeAdd(proteinEvidenceTree, savedEv->id, savedEv);
 		}
 	    else
 	        {
 		if (!sameString(ev->name, savedEv->name))
 		   errAbort("Disagreement on PE id/name pairing on record ending line %d of %s\n"
 		            "'%s' vs. '%s'", lf->lineIx, lf->fileName, ev->name, savedEv->name);
 		}
 	    fprintf(proteinEvidence, "%s\t%d\n", acc, ev->id);
 	    }
 	}
 
     /* citation, reference, author, and related tables. */
         {
 	struct spLitRef *ref;
 	struct hashEl *hel;
 	int refId;
 	for (ref = spr->literatureList; ref != NULL; ref = ref->next)
 	    {
 	    /* Do get reference ID and if necessary output new reference and other
 	     * tables. */
 	    if ((hel = hashLookup(referenceUni->hash, ref->cite)) != NULL)
 		{
 		refId = (char *)hel->val - nullPt;
 		}
 	    else
 	        {
 		referenceUni->curId += 1;
 		refId = referenceUni->curId;
 		hashAddInt(referenceUni->hash, ref->cite, refId);
 		if (ref->title)
 		    subChar(ref->title, '\t', ' ');
 		fprintf(reference, "%d\t%s\t%s\t%s\t%s\t%s\n", 
 		    refId, blankForNull(ref->title), ref->cite, 
 		    blankForNull(ref->pubMedId), blankForNull(ref->medlineId), blankForNull(ref->doiId));
 		for (n = ref->authorList; n != NULL; n = n->next)
 		    {
 		    int authorId = uniqueStore(authorUni, n->name);
 		    fprintf(referenceAuthors, "%d\t%d\n", refId, authorId);
 		    }
 		}
 
             /* Do citation and related tables. */
 	        {
 		int rpId = uniqueStore(citationRpUni, ref->rp);
 		++citationId;
 		fprintf(citation, "%d\t%s\t%d\t%d\n",
 			citationId, acc, refId, rpId);
 
 		for (hel = ref->rcList; hel != NULL; hel = hel->next)
 		    {
 		    int rcTypeId = uniqueStore(rcTypeUni, hel->name);
 		    int rcValId = uniqueStore(rcValUni, hel->val);
 		    fprintf(citationRc, "%d\t%d\t%d\n", 
 		    	citationId, rcTypeId, rcValId);
 		    }
 		}
 	    }
 	}
 
     lmCleanup(&lm);
     }
 dyStringFree(&dy);
 
 /* Sort and output protein evidence types. */
     {
     slSort(&proteinEvidenceList, spProtEvCmpId);
     struct spProtEv *ev;
     for (ev = proteinEvidenceList; ev != NULL; ev = ev->next)
         fprintf(proteinEvidenceType, "%d\t%s\n", ev->id, ev->name);
     }
 
 /* Close the output files. */
 carefulClose(&displayId);
 carefulClose(&otherAcc);
 carefulClose(&organelle);
 carefulClose(&info);
 carefulClose(&description);
 carefulClose(&geneLogic);
 carefulClose(&gene);
 carefulClose(&taxon);
 carefulClose(&accToTaxon);
 carefulClose(&commonName);
 carefulClose(&keyword);
 carefulClose(&accToKeyword);
 carefulClose(&commentType);
 carefulClose(&commentVal);
 carefulClose(&comment);
 carefulClose(&protein);
 carefulClose(&extDb);
 carefulClose(&extDbRef);
 carefulClose(&featureClass);
 carefulClose(&featureType);
 carefulClose(&featureId);
 carefulClose(&feature);
 carefulClose(&author);
 carefulClose(&reference);
 carefulClose(&referenceAuthors);
 carefulClose(&citationRp);
 carefulClose(&citation);
 carefulClose(&rcType);
 carefulClose(&rcVal);
 carefulClose(&citationRc);
 carefulClose(&pathogenHost);
 carefulClose(&proteinEvidenceType);
 carefulClose(&proteinEvidence);
 }
 
 int main(int argc, char *argv[])
 /* Program entry point.  Hands the command line to the option parser,
  * records the dupeOk flag, checks the positional argument count, and
  * passes the two positional arguments to spToDb.  Returns 0. */
 {
 enum { expectedArgCount = 3 };	/* program name plus two positional args */
 char *inFile, *outDir;

 /* optionInit is given &argc, so argc is only inspected after it returns. */
 optionInit(&argc, argv, options);
 dupeOk = optionExists("dupeOk");

 /* Wrong number of arguments: show the usage message. */
 if (argc != expectedArgCount)
     usage();

 inFile = argv[1];
 outDir = argv[2];
 spToDb(inFile, outDir);
 return 0;
 }