ef33c128a8aee1ba566ffa9f8e088888c07001c4
max
  Mon Feb 26 03:31:08 2024 -0800
fixing bug in uniprot otto job: the md5 of the fasta files was not computed correctly. Some fasta entries have headers like ">uniprotId isRef" while other sequences don't have the "isRef" word. For the reference sequences, only the word "isRef" went into the md5 instead of the sequence, so the otto job missed a change in a genome if only a single reference sequence changed. This will lead to a recompute of absolutely every mapping file at the next otto run and will use up the cluster for a few days. Oops.
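
To see the effect, here is a minimal sketch on a made-up two-record fasta (accessions and sequences are hypothetical, not real UniProt data):

    printf '>P12345 isRef\nMKV\n>Q67890\nMAA\n' > test.fa

    # old command: with RS=">", $2 of a ">id isRef" record is the word "isRef",
    # so the sequence of reference entries never reached md5sum
    # (tail -n +2 only hides the empty record awk emits for the text before the first ">")
    awk 'BEGIN{RS=">"}{print $1"\t"$2;}' test.fa | tail -n +2
    # P12345  isRef
    # Q67890  MAA

    # fixed command: the extra awk '{print $1}' strips everything after the first word
    # of every line, so $2 is always the sequence
    awk '{print $1}' test.fa | awk 'BEGIN{RS=">"}{print $1"\t"$2;}' | tail -n +2
    # P12345  MKV
    # Q67890  MAA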

diff --git src/hg/utils/otto/uniprot/doUniprot src/hg/utils/otto/uniprot/doUniprot
index ba88eef..0469804 100755
--- src/hg/utils/otto/uniprot/doUniprot
+++ src/hg/utils/otto/uniprot/doUniprot
@@ -36,30 +36,32 @@
 manualTaxIdDbs = {
     9606   : ["hg19", "hg38"], # human - keep hg19 mapping for a long time. UniProt is important for medical genetics.
     10090  : ["mm10"], # mouse
     10116  : ["rn6"], # rat
     8364   : ["xenTro9"], # xenopus
     6239   : ["ce11", "ce10"], # c elegans
     7227   : ["dm6"], # fruitfly
     7955   : ["danRer7", "danRer10"], # zebrafish
     559292   : ["sacCer3"], # yeast
     9940   : ["oviAri3"], # sheep
     9823   : ["susScr3"], # pig
     2697049: ["wuhCor1"]}
 
 # see also covidCheck.sh
 notAutoDbs = [
+# got errors due to short contigs in galGal6, just skipping
+"galGal6",
 # do not run on wuhCor1 - wuhCor1 has a special daily otto job in /hive/data/genomes/wuhCor1/bed/uniprot
 # because UniProt releases SARS-CoV-2 much more often
 "wuhCor1",
 # no monkeypox yet
 "mpxvRivers",
 # do not run on hs1, has no good gene model and BLAT protein takes forever
 "hs1",
 # sea urchin doesn't have any normal gene model track, not even Augustus
 "strPur2"
 ]
 
 # the script will stop if the difference between the last uniprot version and the current version is greater than <maxDiffVersions>
 # but this check is only done for some important assemblies. For fringe assemblies, the numbers can easily
 # change 10x, as trembl predictions are the majority of annotations
 maxDiffVersions = 5.0
@@ -893,44 +895,49 @@
 
     if serverTime > localTime:
         logging.debug("URL %s is newer than %s" % (url, localFname))
         return True
     else:
         logging.debug("URL %s is not newer than %s" % (url, localFname))
         return False
 
 def run(cmd, ignoreErr=False):
     " run command, stop on error "
     if type(cmd)==type([]):
         cmd = " ".join(cmd)
     logging.debug(cmd)
     cmd = "set -o pipefail; "+cmd
     ret = os.system(cmd)
-    logging.debug("Running: %s" % (cmd))
+    logging.info("Running: %s" % (cmd))
     if ret!=0 and not ignoreErr:
         logging.error("command returned error: %s" % cmd)
-        sys.exit(1)
+        sys.exit(254)
+    else:
+        logging.debug("command exit code: %d" % ret)
     return ret
 
 def fastaMd5(fname):
     " sort gzipped fasta by id and return its md5 "
     logging.debug("Getting md5 of sorted seqs in %s" % fname)
     if fname.endswith(".gz"):
         catCmd = "zcat"
     else:
         catCmd = "cat"
-    cmd = """%s %s | awk 'BEGIN{RS=">"}{print $1"\t"$2;}' | sort -k1,1 | md5sum""" % (catCmd, fname)
+    # first awk strips off any words after the sequence ID
+    # second awk converts fasta to tab sep with first column seqId, second column sequence
+    # (can't believe that former Max came up with this himself)
+    cmd = """%s %s | awk '{print $1}' | awk 'BEGIN{RS=">"}{print $1"\t"$2;}' | sort -k1,1 | md5sum""" % (catCmd, fname)
     proc = subprocess.Popen(cmd, stdout=PIPE, shell=True, encoding="latin1")
     md5 = proc.stdout.readline().split()[0]
     logging.debug("MD5 of %s is %s" % (fname, md5))
     return md5
 
 def manyTabMd5(fnames):
     md5s = []
     for fn in fnames:
         md5s.append(tabMd5(fn))
     return listMd5(md5s)
 
 def tabMd5(fname):
     " sort gzipped tab and return its md5 "
     logging.debug("Getting md5 of sorted tab-sep file %s" % fname)
     if fname.endswith(".gz"):
@@ -1397,31 +1404,31 @@
 
     if doTrembl:
         fnames.append( (join(tabDir,"trembl.%d.tab" % taxId)) )
 
     ret = {}
     for fname in fnames:
         for row in iterTsvRows(open(fname)):
             for acc in getAllIsoAccs(row):
                 ret[acc] = row
     return ret
 
 def blatProteinsKeepBest(fullFaFname, db, mapFname, stats):
     " blat the proteins directly, only good for small genomes where we don't have any gene models "
     myDir = dirname(__file__)
     cmd = "blat -q=prot -t=dnax /gbdb/{db}/{db}.2bit {inf} stdout -noHead | sort -k10 | pslReps stdin stdout /dev/null " \
-        "-minAli=0.95 -nohead | pslProtCnv > {out}".format(db=db, inf=fullFaFname, out=mapFname, myDir=myDir)
+        "-minAli=0.95 -nohead | {myDir}/pslProtCnv > {out}".format(db=db, inf=fullFaFname, out=mapFname, myDir=myDir)
     stats["minAli"]=0.95
     #nuclFname = mapFname+".nucl"
     # originally used "| {myDir}/pslProtCnv " but tried to get rid of the markd-script dependency 
     #pslToProtPsl(nuclFname, mapFname)
     # then later figured that pslToProtPsl() is not the same as pslProtCnv and went back the MarkD way
     #os.remove(nuclFname)
     run(cmd)
 
 def getTransIds(db, geneTable, transcriptFa):
     """ return all possible transcript IDs given a gene table. The reason that we need is that Uniprot often
       refers to transcripts that don't exist and we are using a select file to map only to those. In these
       cases, we want to make sure that our select file contains only transcript that we actually have,
       so we can log how many transcript can possibly mapped (and which ones). This is important for debugging.
     """
     # no idea how to get the version suffix out of our mysql tables - thanks Obama!
@@ -1900,34 +1907,38 @@
 
 def downloadUniprot(uprotDir):
     " if necessary, download UniProt using LFTP in an atomic way, update release info file "
     localFname = join(uprotDir, "uniprot_sprot.xml.gz")
     if not isNewer(upUrl, localFname):
         logging.info("files at %s are not newer than file in %s, nothing to do. Specify -l to skip this check." % (upUrl, localFname))
         sys.exit(0)
 
     # use lftp to update uniprot and download only changed files
     # in late 2020, pget started to trigger errors, so removed -P 3 and --use-pget-n=4
     tmpDir = join(uprotDir, "download.tmp")
     if not isdir(tmpDir):
         os.makedirs(tmpDir)
 
     # the uniprot FTP server is too slow and too unreliable. Switching to EBI's which has rsync !!
-    #cmd = 'lftp ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/ -e "lcd %s && mirror . --exclude-glob *.dat.gz && exit"' % tmpDir
     count = 0
     while True:
-        cmd = "rsync -av --partial rsync://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/ %s/" % tmpDir
+        #cmd = "rsync -av --partial rsync://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/ %s/" % tmpDir
+        # ftp.uniprot.org (USA, PIR)
+        # ftp.ebi.ac.uk (UK, EBI)
+        # ftp.expasy.org (Switzerland, SIB)
+
+        cmd = 'lftp ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/ -e "lcd %s && mirror . -P 3 --use-pget-n=4 --exclude-glob *.dat.gz && exit"' % tmpDir
         ret = run(cmd, ignoreErr=True)
         if ret==0:
             break
 
         logging.info("Problem with rsync, waiting for 5 hours, then retrying")
         time.sleep(5*60*60)
         count += 1
         if count > 5:
             errAbort("Cannot run rsync cmd: %s" % cmd)
 
     # make sure that download interruptions don't mess up the files in the target dir
     logging.info("Moving files from %s to %s" % (tmpDir, uprotDir))
     cmd = "rsync -a --remove-source-files %s/* %s" % (tmpDir, uprotDir)
     run(cmd)
     shutil.rmtree(tmpDir)
@@ -2044,31 +2055,32 @@
                 continue
 
             logging.debug("Annotating assembly %s" % db)
             chromSizesFname = "/hive/data/genomes/%s/chrom.sizes" % db
 
             accToMeta = parseRecordMeta(tabDir, taxId, doTrembl)
 
             dbMapDir = makeSubDir(mapDir, db)
             mapDescFname = join(dbMapDir, "liftInfo.json")
             mapFname, geneTable, mapMd5, accToMapSource, accToTrans, isNewMap = \
                 makeProteinGenomePsl(taxId, fullFaFname, tabFnames, db, cluster, options.force, dbMapDir, mapDescFname)
 
             dbBigBedDir = makeSubDir(bigBedDir, db)
 
             convMapToBigPsl(fullFaFname, mapFname, dbBigBedDir, accToDb, accToMeta, accToMapSource, accToTrans, chromSizesFname, doTrembl)
-            shutil.copy(mapDescFname, mapDir)
+
+            #shutil.copy(mapDescFname, mapDir)
 
             if options.onlyMap:
                 continue
 
             annotFnames = ["tab/swissprot.%d.annots.tab" % taxId]
             if doTrembl:
                 annotFnames.append( "tab/trembl.%d.annots.tab" % taxId )
 
             uniprotLift(fullFaFname, annotFnames, chromSizesFname, mapFname, dbBigBedDir, accToDb, accToMeta, options)
 
             shutil.copyfile(mapDescFname, join(dbBigBedDir, "liftInfo.json"))
 
 def makeSymlink(target, linkName):
     assert(".new" not in linkName) # make sure that this bug never happens again
 
@@ -2452,16 +2464,16 @@
 
     relStr, shortVersion = createVersionFiles(options.tabDir, options.bigBedDir, onlyDbs, taxIdDbs)
     copyToArchive(options.bigBedDir, options.archiveDir, shortVersion, onlyDbs)
 
     makeHubs(options.archiveDir, onlyDbs)
 
     if isfile(flagFname):
         os.remove(flagFname)
 
     logging.info("Uniprot pipeline completed.")
 
 try:
     main()
 except Exception as e:
     logging.error(e, exc_info=True)
-    sys.exit(1)
+    sys.exit(254)