f2cc86e3506c2d5fefe00dbe85e7f05f0f33f43f
jcasper
  Wed Mar 6 11:33:33 2024 -0800
Updates for new uniProt import, refs #30476

diff --git src/hg/protein/spToDb/spDb.sql src/hg/protein/spToDb/spDb.sql
index 75ef84b..ace3922 100644
--- src/hg/protein/spToDb/spDb.sql
+++ src/hg/protein/spToDb/spDb.sql
@@ -1,225 +1,225 @@
 # spDb.sql was originally generated by the autoSql program, which also 
 # generated spDb.c and spDb.h.  This creates the database representation of
 # an object which can be loaded and saved from RAM in a fairly 
 # automatic way.
 
 #Relate ID and primary accession. A good table to use just get handle on all records.
 CREATE TABLE displayId (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     val char(24) not null,	# SwissProt display ID
               #Indices
     PRIMARY KEY(acc),
     UNIQUE(val)
 );
 
 #Relate ID and other accessions
 CREATE TABLE otherAcc (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     val char(12) not null,	# Secondary accession
               #Indices
     INDEX(val),
     INDEX(acc)
 );
 
 #A part of a cell that has it's own genome
 CREATE TABLE organelle (
     id int not null,	# Organelle ID - we create this
     val longblob not null,	# Text description
               #Indices
     PRIMARY KEY(id)
 );
 
 #Small stuff with at most one copy associated with each SwissProt record
 CREATE TABLE info (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     isCurated tinyint not null,	# True if curated (SwissProt rather than trEMBL)
     aaSize int not null,	# Size in amino acids
     molWeight int not null,	# Molecular weight
     createDate date not null,	# Creation date
     seqDate date not null,	# Sequence last update date
     annDate date not null,	# Annotation last update date
     organelle int not null,	# Pointer into organelle table
               #Indices
     PRIMARY KEY(acc)
 );
 
 #Description lines
 CREATE TABLE description (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     val longtext not null,	# SwissProt DE lines
               #Indices
     PRIMARY KEY(acc)
 );
 
 #Gene including and/or logic if multiple
 CREATE TABLE geneLogic (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     val longtext not null,	# Gene(s) and logic to relate them.
               #Indices
     PRIMARY KEY(acc)
 );
 
 #Gene/accession relationship. Both sides can be multiply valued.
 CREATE TABLE gene (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     val varchar(255) not null,	# Single gene name
     isPrimary tinyint not null,	# True if primary gene name, false if a synonym
               #Indices
     INDEX(acc),
     INDEX(val(12))
 );
 
 #An NCBI taxon
 CREATE TABLE taxon (
     id int not null,	# Taxon NCBI ID
     binomial varchar(255) not null,	# Binomial format name
     toGenus longtext not null,	# Taxonomy - superkingdom to genus
               #Indices
     INDEX(id),	# NCBI may have updated 1/2 way through SwissProt it seems.
     INDEX(binomial(12))
 );
 
 #Common name for a taxon
 CREATE TABLE commonName (
     taxon int not null,	# Taxon table ID
     val varchar(255) not null,	# Common name
               #Indices
     INDEX(taxon),
     INDEX(val(12))
 );
 
 #accession/taxon relationship
 CREATE TABLE accToTaxon (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     taxon int not null,	# ID in taxon table
               #Indices
     INDEX(acc),
     INDEX(taxon)
 );
 
 #Host of pathogenic organism
 CREATE TABLE pathogenHost (
     pathogen int not null,	# Pathogen taxon
     host int not null,	# Host taxon
               #Indices
     INDEX(pathogen),
     INDEX(host)
 );
 
 #A keyword
 CREATE TABLE keyword (
     id int not null,	# Keyword ID - we create this
     val varchar(255) not null,	# Keyword itself
               #Indices
     PRIMARY KEY(id),
     INDEX(val(12))
 );
 
 #Relate keywords and accessions
 CREATE TABLE accToKeyword (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     keyword int not null,	# ID in keyword table
               #Indices
     INDEX(acc),
     INDEX(keyword)
 );
 
 #A type of comment
 CREATE TABLE commentType (
     id int not null,	# Comment type ID, we create this
     val varchar(255) not null,	# Name of comment type
               #Indices
     PRIMARY KEY(id),
     INDEX(val(12))
 );
 
 #Text of a comment
 CREATE TABLE commentVal (
     id int not null,	# Comment value ID - we create this
     val longtext not null,	# Amino acids
               #Indices
     PRIMARY KEY(id)
 );
 
 #A structured comment
 CREATE TABLE comment (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     commentType int not null,	# ID in commentType table
     commentVal int not null,	# ID in commentVal table
               #Indices
     INDEX(acc)
 );
 
 #Amino acid sequence
 CREATE TABLE protein (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession (maybe with variant extension)
     val longblob not null,	# Amino acids
               #Indices
     PRIMARY KEY(acc)
 );
 
 #Name of another database
 CREATE TABLE extDb (
     id int not null,	# Database id - we make this up
     val varchar(255) not null,	# Name of database
               #Indices
     PRIMARY KEY(id),
     INDEX(val(12))
 );
 
 #A reference to another database
 CREATE TABLE extDbRef (
-    acc char(12) not null,	# Primary SwissProt accession
+    acc char(20) not null,	# Primary SwissProt accession
     extDb int not null,	# ID in extDb table
     extAcc1 varchar(255) not null,	# External accession
     extAcc2 varchar(255) not null,	# External accession
     extAcc3 varchar(255) not null,	# External accession
               #Indices
     INDEX(acc),
-    INDEX(extAcc1(10))
+    INDEX(extAcc1(20))
 );
 
 #A class of feature
 CREATE TABLE featureClass (
     id int not null,	# Database id - we make this up
     val varchar(255) not null,	# Name of class
               #Indices
     PRIMARY KEY(id),
     INDEX(val(12))
 );
 
 #A type of feature
 CREATE TABLE featureType (
     id int not null,	# Database id - we make this up
     val longtext not null,	# Name of type
               #Indices
     PRIMARY KEY(id),
     INDEX(val(12))
 );
 
 #FeatureId table
 CREATE TABLE featureId (
   id int(11) NOT NULL default '0',	# Database id -we make this up
   val varchar(40) NOT NULL default '',  # UniProt feature Id
   PRIMARY KEY  (id),
   KEY val (val(14))
 ) ENGINE=MyISAM;
 
 #A description of part of a protein
 CREATE TABLE feature (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     start int not null,	# Start coordinate (zero based)
     end int not null,	# End coordinate (non-inclusive)
     featureClass int not null,	# ID of featureClass
     featureType int not null,	# ID of featureType
     softEndBits tinyint not null,  # 1 for start <, 2 for start ?, 4 for end >, 8 for end ?
     featureId int(11) NOT NULL default '0', # feature Id
               #Indices
     INDEX(acc)
 );
 
 #A single author
 CREATE TABLE author (
     id int not null,	# ID of this author
     val varchar(255) not null,	# Name of author
               #Indices
@@ -248,31 +248,31 @@
     INDEX(reference),
     INDEX(author)
 );
 
 #SwissProt RP (Reference Position) line.  Often includes reason for citing.
 CREATE TABLE citationRp (
     id int not null,	# ID of this citationRp
     val longtext not null,	# Reason for citing/position in sequence of cite.
               #Indices
     PRIMARY KEY(id)
 );
 
 #A SwissProt citation of a reference
 CREATE TABLE citation (
     id int not null,	# ID of this citation
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     reference int not null,	# ID in reference table
     rp int not null,	# ID in rp table
               #Indices
     PRIMARY KEY(id),
     INDEX(acc),
     INDEX(reference)
 );
 
 #Types found in a swissProt reference RC (reference comment) line
 CREATE TABLE rcType (
     id int not null,	# ID of this one
     val varchar(255) not null,	# name of this
               #Indices
     PRIMARY KEY(id)
 );
@@ -284,45 +284,45 @@
               #Indices
     PRIMARY KEY(id)
 );
 
 #Reference comments associated with citation
 CREATE TABLE citationRc (
     citation int not null,	# ID in citation table
     rcType int not null,	# ID in rcType table
     rcVal int not null,	# ID in rcVal table
               #Indices
     INDEX(citation)
 );
 
 #Contains just the variably spliced proteins
 CREATE TABLE varProtein (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     val longblob not null,	# Amino acids
               #Indices
     PRIMARY KEY(acc)
 );
 
 #Accessions and other info on the variably-spliced proteins
 CREATE TABLE varAcc (
-    varAcc char(12) not null,	# Accession of variant
-    parAcc char(12) not null,	# Accession of parent (non-variant)
-    variant char(4) not null,	# Variant part of accession
+    varAcc char(20) not null,	# Accession of variant
+    parAcc char(20) not null,	# Accession of parent (non-variant)
+    variant char(5) not null,	# Variant part of accession
               #Indices
     PRIMARY KEY(varAcc)
 );
 
 #Type of evidence that this is actually a protein
 CREATE TABLE proteinEvidenceType (
     id int not null,	# ID of type (shared with UniProt)
     val longblob not null,	# Description of protein evidence
               #Indices
     PRIMARY KEY(id)
 );
 
 #Associate protein with evidence for its existence
 CREATE TABLE proteinEvidence (
-    acc char(12) not null,	# Primary accession
+    acc char(20) not null,	# Primary accession
     proteinEvidenceType int not null,	# ID in proteinEvidenceType table
               #Indices
     INDEX(acc)
 );