f2cc86e3506c2d5fefe00dbe85e7f05f0f33f43f
jcasper
  Wed Mar 6 11:33:33 2024 -0800
Updates for new uniProt import, refs #30476

diff --git src/hg/makeDb/doc/uniProt/sp240202.txt src/hg/makeDb/doc/uniProt/sp240202.txt
new file mode 100644
index 0000000..bf56e7c
--- /dev/null
+++ src/hg/makeDb/doc/uniProt/sp240202.txt
@@ -0,0 +1,111 @@
+#!/bin/bash -ex
+# hgwdev.
+# bash, not tcsh: the steps below use export and for/do/done. tcsh's -f is dropped because bash -f would turn off the *.gz and *.txt globs.
+export DBDATE=240202
+export DB=sp$DBDATE
+export OLDDBDATE=180404
+
+# Set up working directory
+mkdir -p /hive/data/outside/uniProt/$DBDATE/build
+
+# Download uniProt. Budget about 12 hours, though the three transfers recorded below took only about 3 hours in total.
+cd /hive/data/outside/uniProt/$DBDATE/build
+wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
+# 658,371,534  330KB/s   in 36m 58s
+# Following Max's lead, switching to the SIB server in Switzerland (20MB/s vs 330KB/s)
+wget https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.dat.gz
+# 182,682,062,273 19.9MB/s   in 2h 29m
+wget ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz
+# 8,495,311   5.11MB/s   in 1.6s
+
+# Records now have notes and evidence sometimes split across multiple lines.
+# Wrote a short script (joinLines.pl, invoked below from the build directory) to join those FT lines into one.
+
+# Turn flat file into relational tab-separated files:
+# join the split FT lines, strip out evidence tags, then run spToDb with ../tabFiles as its output argument.
+time (zcat *.dat.gz | perl joinLines.pl | /cluster/home/jcasper/bin/x86_64/stripEvidence stdin stdout |  /cluster/home/jcasper/bin/x86_64/spToDb stdin ../tabFiles)
+# Timing reported for the pipeline above:
+# real    259m33.935s
+# user    615m51.206s
+# sys     53m19.369s
+
+cd ../tabFiles
+wc -l *.txt | awk '{print $2,$1}' | sort > counts
+cd ../..
+# Compare per-file line counts with the previous release (ratio = new/old) and print files that more than doubled or halved.
+#join 170510/tabFiles/counts 180404/tabFiles/counts | awk '{print $1, $3/$2}' | awk '{if (($2 > 2) || ($2 < 0.5)) print}'
+join $OLDDBDATE/tabFiles/counts $DBDATE/tabFiles/counts | awk '{print $1, $3/$2}' | awk '{if (($2 > 2) || ($2 < 0.5)) print}'
+# Output of the comparison:
+# accToKeyword.txt 13.6915
+# accToTaxon.txt 17.2785
+# citation.txt 8.23396
+# citationRc.txt 8.42574
+# comment.txt 12.3309
+# commonName.txt 0.18579
+# description.txt 8.65138
+# displayId.txt 8.65138
+# extDbRef.txt 11.1583
+# feature.txt 41.5864
+# featureId.txt 13.3898
+# gene.txt 10.8138
+# geneLogic.txt 10.2501
+# info.txt 8.65138
+# organelle.txt 2.8554
+# otherAcc.txt 4.54279
+# protein.txt 8.65138
+# proteinEvidence.txt 8.65138
+# rcVal.txt 3.0725
+# taxon.txt 0.037334
+# total 11.6354
+
+# Wait, so commonName.txt and taxon.txt actually shrank?  Looks like maybe they dropped
+# some viruses ....  What else, if anything?
+
+# Create the database (sp$DBDATE, the same name as $DB set at the top).
+hgsql mm10 -e "create database sp$DBDATE"
+
+# Load it up with table definitions from source directory
+hgsql sp$DBDATE < ~/kent/src/hg/protein/spToDb/spDb.sql
+
+# Load up the data from tab files.  Nominally about an hour; this run took 23006 seconds (about 6.4 hours), see below.
+# The old csh loop is kept below, commented out; the bash rewrite follows it.
+#set s=`date +%s`
+#cd /hive/data/outside/uniProt/$DBDATE/tabFiles
+#foreach i (*.txt)
+#  hgsqlimport --local sp$DBDATE $i
+#end
+s=`date +%s`
+cd /hive/data/outside/uniProt/$DBDATE/tabFiles
+for i in *.txt
+  do hgsqlimport --local sp$DBDATE $i
+done
+
+#set e=`date +%s`
+e=`date +%s`
+expr $e - $s
+
+# Elapsed seconds printed by the expr above:
+# 23006
+
+# Add varsplice info.  Note that varProtein.txt is appended to both varProtein and protein.
+zcat ../build/uniprot_sprot_varsplic.fasta.gz | spDbAddVarSplice sp$DBDATE stdin .
+hgLoadSqlTab sp$DBDATE -notOnServer -append varProtein /dev/null varProtein.txt
+hgLoadSqlTab sp$DBDATE -notOnServer -append protein /dev/null varProtein.txt
+hgLoadSqlTab sp$DBDATE -notOnServer -append varAcc /dev/null varAcc.txt
+
+# for i in displayId.txt accToTaxon.txt geneLogic.txt gene.txt description.txt; do hgsqlimport --local sp$DBDATE $i; done
+# Append the remaining var* files; varGene.txt first gets one extra trailing column set to 0 (the tawk step).
+hgLoadSqlTab sp$DBDATE -notOnServer -append displayId /dev/null varDisplayId.txt
+hgLoadSqlTab sp$DBDATE -notOnServer -append accToTaxon /dev/null varAccToTaxon.txt
+hgLoadSqlTab sp$DBDATE -notOnServer -append geneLogic /dev/null varGeneLogic.txt
+tawk '{$(NF+1) = "0"; print}' varGene.txt | hgLoadSqlTab sp$DBDATE -notOnServer -append gene /dev/null stdin
+hgLoadSqlTab sp$DBDATE -notOnServer -append description /dev/null varDescription.txt
+
+# Add table descriptions
+makeTableDescriptions sp$DBDATE ~/kent/src/hg/protein/spToDb/spDbTables.as
+
+# Zip up tab files for people who prefer them to the database.
+gzip *.txt
+
+# Don't forget to ask the admins to update the database softlink - "uniProt"
+# should go to this newest version of the sp* database.