src/hg/utils/otto/uniprot/doUniprot d9ce74c9310f90a8d631e048499434cc6b954f6d

d9ce74c9310f90a8d631e048499434cc6b954f6d
max
  Wed Mar 1 06:07:32 2023 -0800
finally fixing various old uniprot bugs that stopped the update in January and October: now stops if an error occurs, working around a symlink permissions problem, making the release message easier to read, ML ticket, refs #30701

diff --git src/hg/utils/otto/uniprot/doUniprot src/hg/utils/otto/uniprot/doUniprot
index 4f3351f..ba88eef 100755
--- src/hg/utils/otto/uniprot/doUniprot
+++ src/hg/utils/otto/uniprot/doUniprot
@@ -29,40 +29,44 @@
 # proteins have to match the transcript as at least at this percent ID
 # of these matches, only the top ones within 1% are used, see makeUniProtPsl.sh
 MINALI=0.93
 
 # manual mapping from uniProt taxon IDs to UCSC databases, the default mapping is taken from dbDb
 # these are added in addition, I obtained them in 2018 using some Apache log mining
 # the list could probably be shorter now
 manualTaxIdDbs = {
     9606   : ["hg19", "hg38"], # human - keep hg19 mapping for a long time. UniProt is important for medical genetics.
     10090  : ["mm10"], # mouse
     10116  : ["rn6"], # rat
     8364   : ["xenTro9"], # xenopus
     6239   : ["ce11", "ce10"], # c elegans
     7227   : ["dm6"], # fruitfly
     7955   : ["danRer7", "danRer10"], # zebrafish
-    4932   : ["sacCer3"], # yeast
+    559292   : ["sacCer3"], # yeast
     9940   : ["oviAri3"], # sheep
     9823   : ["susScr3"], # pig
     2697049: ["wuhCor1"]}
 
 # see also covidCheck.sh
 notAutoDbs = [
 # do not run on wuhCor1 - wuhCor1 has a special daily otto job in /hive/data/genomes/wuhCor1/bed/uniprot
 # because UniProt releases SARS-CoV-2 much more often
 "wuhCor1",
+# no monkeypox yet
+"mpxvRivers",
+# do not run on hs1, has no good gene model and BLAT protein takes forever
+"hs1",
 # sea urchin doesn't have any normal gene model track, not even Augustus
 "strPur2"
 ]
 
 # the script will stop if the difference between the last uniprot version and the current version is > than <maxDiffVersions>
 # but this check is only done for some important assemblies. For fringe assemblies, the numbers can easily
 # change 10x, as trembl predictions are the majority of annotations
 maxDiffVersions = 5.0
 checkDbs = ["hg19", "hg38", "mm10", "rn6", "danRer10", "danRer7", "ce11"]
 
 # when we rename the parasol cluster, no need to change this file
 # probably overkill in retrospect but seemed like a good idea back then
 clusterFname = "/cluster/bin/scripts/cluster.txt"
 
 TREMBLCOLOR="0,150,250" # light blue
@@ -95,30 +99,35 @@
     "colon cancer" : "ColC",
     "colorectal cancer" : "ColrectC",
     "hepatoblastoma" : "HepatoBl",
     "a sporadic cancer": "sporadic",
     "sporadic cancers": "sporadic",
     "non-small cell lung cancer cell lines": "NSCLC-CL",
     "a colon cancer cell line":"ColC-CL"
 }
 
 # directory for the NCBI flatfiles
 NCBIDIR = "ncbi"
 
 # global variable needed for atexit callback function
 flagFname = None
 
+def errAbort(msg):
+    " stop "
+    logging.error(msg)
+    assert(False) # generate stacktrace
+
 def getTaxIdDbs(onlyDbs):
     """ return a dict with taxonId -> list of most recent dbs (e.g. for human, it's hg19 and hg38) 
     if onlyDbs is a set, keep only these dbs.
     If onlyDbs is None, remove all nonAutoDbs.
     """
     query = "select taxId, name from dbDb where active=1 order by orderKey;"
     rows = runQuery("hgcentral", query, usePublic=True)
     taxIdToDbs = defaultdict(list)
     # take only the first database for every taxonId
     for taxId, dbCode in rows:
 
         if onlyDbs:
             if dbCode not in onlyDbs:
                 continue
         else:
@@ -132,37 +141,48 @@
     for taxId, manNames in manualTaxIdDbs.items():
         if manNames is None:
             del taxIdToDbs[taxId]
         else:
             for dbCode in manNames:
                 if onlyDbs:
                     if dbCode not in onlyDbs:
                         continue
                 else:
                     if dbCode in notAutoDbs:
                         continue
 
                 if dbCode not in taxIdToDbs[taxId]:
                     taxIdToDbs[taxId].append(dbCode)
 
+    # Check that every UCSC db has only a single taxon ID, otherwise all hell will break lose, as this script will run twice
+    # and the second run will overwrite all results from the previous run. We do not support "merging" annotations from two uniprot organisms.
+    # During testing this only happened for yeast strains, but it may come up again, so make sure it never happens again.
+    allDbs = set()
+    for taxId, dbs in taxIdToDbs.items():
+        for db in dbs:
+            if not db in allDbs:
+                allDbs.add(db)
+            else:
+                assert(False) # This should not happen: a UCSC assembly is assigned to two taxon IDs
+
     return taxIdToDbs
 
 def parseArgs():
     parser = optparse.OptionParser("""usage: %prog [options] [run] - run uniprot update: download UniProt, parse the annotations to .tab files, build liftOver .psls and lift the annotations to bigBed files, one set per genome db
 
     Assumptions:
-        - proteins that are in UniProt but have no annotations are not shown. see SKIPPROT in code.
+        - proteins that are in UniProt but have no annotations are not shown in the alignments.
         - on hg38/hg19, the mapping uses ncbiRefSeq/refGene. see findBestGeneTable()
             """)
     parser.add_option("-d", "--debug", dest="debug", action="store_true", \
             help="show debug messages")
     parser.add_option("-l", "--skipDownload", dest="skipDownload", action="store_true", \
             help="Skip the download step (debugging)")
     parser.add_option("", "--skipTrembl", dest="skipTrembl", action="store_true", \
             help="Skip processing Trembl")
     parser.add_option("-p", "--skipParse", dest="skipParse", action="store_true", \
             help="Skip the download and parsing step (debugging)")
     parser.add_option("", "--dbs", dest="onlyDbs", action="store", \
             help="only run on certain databases, comma-sep list of UCSC database names, e.g. 'hg19,mm10' (debugging). Implies -l and -p.")
     parser.add_option("", "--onlyMap", dest="onlyMap", action="store_true", \
             help="only create the pslMap files (debugging)")
     parser.add_option("-u", "--uniprotDir", dest="uniprotDir", action="store", \
@@ -398,30 +418,32 @@
     acc = firstAnnot.acc
 
     # set the bed name field to a disease, to the mutation or something else
     if isMut and firstAnnot.origAa=="":
         disCodes = shortenDisCode(firstAnnot.disCode)
         disName = ",".join(disCodes)
         if len(disName)>30:
             disName = disName[:30]+"..."
 
         bed[3] = "%s-%sdel" % (firstAnnot.begin, firstAnnot.end)
         if len(disCodes)>0 and not "strain" in disName:
             bed[3] += " in %s" % disName
 
     else:
         bed[3] = "%s%s%s" % (firstAnnot.origAa, firstAnnot.begin, firstAnnot.mutAa)
+        if len(bed[3]) > 20:
+            bed[3] = "%daa to %daa" % (len(firstAnnot.origAa), len(firstAnnot.mutAa))
 
     color = None
     if firstAnnot.featType=="sequence variant":
         annoType = "Naturally occurring sequence variant"
         color = "100,0,0"
     elif firstAnnot.featType=="mutagenesis site":
         annoType = "Experimental mutation of amino acids"
         color = "0,0,100"
     else:
         bed[3] = firstAnnot.shortFeatType
         if firstAnnot.featType in useComment:
             comment = firstAnnot.comment
             # trembl doesn't have comments for chains so make one up
             if comment=="":
                 comment = "%s:%s-%s" % (acc, str(firstAnnot.begin), str(firstAnnot.end))
@@ -825,31 +847,42 @@
     bbFnames = convertToBigBed(bedFiles, genomeChromSizesFname, bigBedDir)
     moveBigBedToOutDir(bbFnames, outDir)
 
     cmd = "rm -rf %s" % tempDir
     run(cmd)
 
 def isNewer(url, localFname):
     " return true if url is more recent than localFname "
     # little helper class to download only the http HEAD
     class HeadRequest(urllib.request.Request):
         def get_method(self):
             return "HEAD"
 
     # get the mtime of the file on the server
     if url.startswith("http"):
+        count = 0
+        while True:
+            try:
                 response = urllib.request.urlopen(HeadRequest(url))
+                break
+            except urllib.error.URLError:
+                logging.error("URLError on %s, waiting for one hour, retrying" % url)
+                time.sleep(3600)
+                count +=1
+                if count > 5:
+                    errAbort("Cannot open %s" % url)
+
         timeStr = response.headers.get("Last-Modified")
         serverTime = datetime.datetime.fromtimestamp(time.mktime(time.strptime(timeStr, "%a, %d %b %Y %H:%M:%S GMT")))
     elif url.startswith("ftp://"):
         # https://stackoverflow.com/questions/25724497/getting-servers-date-using-ftp
         u = urlparse(url)
         from ftputil import FTPHost # not found? run 'pip install ftputil'
         with FTPHost(u.netloc, "anonymous", "genome-www@soe.ucsc.edu") as host:
             mod_time = host.path.getmtime(u.path)
             logging.debug("mod time %s" % mod_time)
             serverTime = datetime.datetime.fromtimestamp(mod_time)
 
     # get the mtime of the flag file
     localTime = datetime.datetime.fromtimestamp(0)
     if os.path.isfile(localFname):
         t = os.path.getmtime(localFname)
@@ -863,31 +896,32 @@
         return True
     else:
         logging.debug("URL %s is not newer than %s" % (url, localFname))
         return False
 
 def run(cmd, ignoreErr=False):
     " run command, stop on error "
     if type(cmd)==type([]):
         cmd = " ".join(cmd)
     logging.debug(cmd)
     cmd = "set -o pipefail; "+cmd
     ret = os.system(cmd)
     logging.debug("Running: %s" % (cmd))
     if ret!=0 and not ignoreErr:
         logging.error("command returned error: %s" % cmd)
-        sys.exit(ret)
+        sys.exit(1)
+    return ret
 
 def fastaMd5(fname):
     " sort gzipped fasta by id and return its md5 "
     logging.debug("Getting md5 of sorted seqs in %s" % fname)
     if fname.endswith(".gz"):
         catCmd = "zcat"
     else:
         catCmd = "cat"
     cmd = """%s %s | awk 'BEGIN{RS=">"}{print $1"\t"$2;}' | sort -k1,1 | md5sum""" % (catCmd, fname)
     proc = subprocess.Popen(cmd, stdout=PIPE, shell=True, encoding="latin1")
     md5 = proc.stdout.readline().split()[0]
     logging.debug("MD5 of %s is %s" % (fname, md5))
     return md5
 
 def manyTabMd5(fnames):
@@ -1040,89 +1074,89 @@
     accToDb = {}
     ofh = open(outFname, "wt")
     for dbName, fname in fnames:
         if not isfile(fname):
             raise Exception("%s not found" % fname)
         ifh = gzip.open(fname, "rt")
         try:
             for line in ifh:
                 ofh.write(line)
 
                 if line.startswith(">"):
                     seqId = line.lstrip(">").rstrip("\n").split()[0]
                     accToDb[seqId] = dbName
         except:
             logging.error("Error reading %s" % fname)
-            sys.exit(1)
+            assert(False)
 
     return accToDb
 
-def pslToProtPsl(inFname, outFname):
-    " convert psl to prot coordinates "
-    ofh = open(outFname, "w")
-
-    for line in open(inFname):
-        row = line.rstrip("\n").split("\t")
-        matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert, tNumInsert, \
-            tBaseInsert, strand, qName, qSize, qStart, qEnd, tName, tSize, tStart, \
-            tEnd, blockCount, blockSizes, qStarts, tStarts = row
-        # python3 now returns floats: 9/3 = 3.0 so need to cast to ints everywhere
-        qSize = int(float(qSize)/3)
-        tStarts = ",".join([str(int(int(x)/3)) for x in tStarts.rstrip(",").split(",")])
-        qStarts = ",".join([str(int(int(x)/3)) for x in qStarts.rstrip(",").split(",")])
-        blockSizes = ",".join([str(int(int(x)/3)) for x in blockSizes.rstrip(",").split(",")])
-        qStart = int(float(qStart)/3)
-        qEnd = int(float(qEnd)/3)
-        row = [matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert, tNumInsert, \
-            tBaseInsert, strand, qName, qSize, qStart, qEnd, \
-            tName, tSize, tStart, tEnd, \
-            blockCount, blockSizes, qStarts, tStarts]
-        row = [str(x) for x in row]
-        ofh.write("\t".join(row))
-        ofh.write("\n")
-    ofh.close()
+# not used anymore, using Mark's pslProtCnv
+#def pslToProtPsl(inFname, outFname):
+#    " convert psl to prot coordinates "
+#    ofh = open(outFname, "w")
+#
+#    for line in open(inFname):
+#        row = line.rstrip("\n").split("\t")
+#        matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert, tNumInsert, \
+#            tBaseInsert, strand, qName, qSize, qStart, qEnd, tName, tSize, tStart, \
+#            tEnd, blockCount, blockSizes, qStarts, tStarts = row
+#        # python3 now returns floats: 9/3 = 3.0 so need to cast to ints everywhere
+#        qSize = int(float(qSize)/3)
+#        tStarts = ",".join([str(int(int(x)/3)) for x in tStarts.rstrip(",").split(",")])
+#        qStarts = ",".join([str(int(int(x)/3)) for x in qStarts.rstrip(",").split(",")])
+#        blockSizes = ",".join([str(int(int(x)/3)) for x in blockSizes.rstrip(",").split(",")])
+#        qStart = int(float(qStart)/3)
+#        qEnd = int(float(qEnd)/3)
+#        row = [matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert, tNumInsert, \
+#            tBaseInsert, strand, qName, qSize, qStart, qEnd, \
+#            tName, tSize, tStart, tEnd, \
+#            blockCount, blockSizes, qStarts, tStarts]
+#        row = [str(x) for x in row]
+#        ofh.write("\t".join(row))
+#        ofh.write("\n")
+#    ofh.close()
 
 def getStatusFromDb(db):
     if db=="trembl":
         dbDesc = "Unreviewed (TrEMBL)"
     else:
         dbDesc = "Manually reviewed (Swiss-Prot)"
     return dbDesc
 
 def addExtraGeneInfoFields(row, dbName, recAnnot):
     " add the 20 extra fields for gene-like tracks to a BED row "
     # change color and score - in the old bigBed days, there were no filters,
     # so the score can still be used to filter for trembl/swissprot
     acc = row[3]
 
     if dbName=="swissprot":
         color = SWISSPCOLOR
         score = "1000"
     elif dbName=="trembl":
         color = TREMBLCOLOR
         score = "0"
     else:
         assert(False)
 
     row[8] = color
     row[4] = score
 
     row.append( recAnnot.acc ) # uniprot accessions, field 26 /  1
     row.append( recAnnot.name ) # uniprot curated name, field 27 / 2
 
     dbDesc = getStatusFromDb(dbName)
-
     row.append( dbDesc ) # db source, field 28 / 3
 
     row.append( recAnnot.accList.replace("|", ", ")) # uniprot alternative IDs, #29 / 4
     row.append( recAnnot.isoIds.replace("|", ", ")) # isoform IDs, #30 / 5
 
     fullNames = recAnnot.protFullNames
     row.append( fullNames ) # uniprot full names, #31 / 6
 
     # merge all these fields into one list of "gene synonyms"
     #14  geneSynonyms
     #15  isoNames
     #16  geneOrdLocus
     #17  geneOrf           
     syns = recAnnot.geneSynonyms.split("|")
     syns.extend(recAnnot.isoNames.split("|"))
@@ -1182,391 +1216,494 @@
             mappedTNameI = headers.index("mappedTName")
             mappedTStartI = headers.index("mappedTStart")
             mappedTEndI = headers.index("mappedTEnd")
             continue
 
         acc = row[srcQNameI]
         transId = row[mappingQNameI]
         chrom = row[mappedTNameI]
         start = row[mappedTStartI]
         end = row[mappedTEndI]
 
         mapInfo[ (acc, chrom, start, end) ].add (transId)
 
     return mapInfo
 
-def pslToBigPsl(mapFname, protSeqs, accToDb, accToMeta, accToMapSource, chromSizesFname, bigPslSwissFname, bigPslTremblFname):
+def pslToBigPsl(mapFname, protSeqs, accToDb, accToMeta, accToMapSource, accToTrans, chromSizesFname, bigPslSwissFname, bigPslTremblFname, infoFname):
     " rewrite the mapping psl to a bigPsl with extra fields, change colors and score, add seqs and CDS info "
     logging.debug("Converting %s to bigPsl" % mapFname)
     bpInputFname = mapFname.replace(".psl", ".pslInput")
 
-    # convert to bigPslInput
+    # convert to bigPslInput = BED format
     cmd = "pslToBigPsl %(mapFname)s stdout | sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > %(bpInputFname)s" % locals()
     run(cmd)
 
     mapInfoFname = mapFname+".mapInfo"
     mapInfo = parseMapInfo(mapInfoFname)
 
     # rewrite the bigPslInput file:
     # - change colors and score
     # - add extra fields, sequence and CDS info, transcripts used from mapInfo
     # - split into swissprot and trembl files
     bpInputSwissFname = mapFname.replace(".psl", ".swissprot.pslInput")
     bpInputTremblFname = mapFname.replace(".psl", ".trembl.pslInput")
 
     ofhSwiss = open(bpInputSwissFname, "w")
 
     ofhTrembl = None
     if bigPslTremblFname:
         ofhTrembl = open(bpInputTremblFname, "w")
 
+    protToLocs = defaultdict(list)
     for line in open(bpInputFname):
         # chr1    11994   12144   B4E2Z4  1000    +       11994   12144   0       1       150,    0,      186     336     +       336     186,          249250621        150     0       0       0       0
         row = line.rstrip("\n").split("\t")
 
+        row[-1] = "2" # seqType field: 0=empty, 1=nucleotide, 2=amino_acid
+
         acc = row[3]
         recAnnot = accToMeta.get(acc, None)
 
         if recAnnot is None:
             logging.error("Accession: %s - no record info?" % acc)
             assert(False)
 
-        dbName = accToDb[acc]
-
         # add the transcripts that were used for mapping this protein to the genome
         chrom, start, end = row[0], row[1], row[2]
 
         if mapInfo:
             transListStr = ", ".join(sorted(list(mapInfo[(acc, chrom, start, end)])))
-            mapSource = accToMapSource[acc]
+            mapSource = accToMapSource.get(acc)
+            if mapSource is None:
+                mapSource = accToMapSource["default"]
+
+            protToLocs[acc].append( ("%s:%s-%s" % (chrom, str(start), str(end)), mapSource, transListStr.replace(" ", "")) )
+
             if mapSource == "best":
-                mapInfo = "best match(es) when aligning protein against all transcripts"
+                mapDesc = "Best match(es) when aligning protein against all transcripts"
+            elif mapSource == "direct":
+                mapDesc = "Best BLAT match when aligning protein against entire genome"
             elif mapSource == "uniprot":
-                mapInfo = "alignment of protein to transcript(s) annotated by UniProt"
+                mapDesc = "Alignment of protein to transcript(s) annotated by UniProt"
             elif mapSource == "entrez":
-                mapInfo = "best match(es) when aligning protein against RefSeq transcripts of NCBI gene annotated by UniProt"
+                mapDesc = "Best match(es) when aligning protein against RefSeq transcripts of NCBI gene annotated by UniProt"
             else:
-                assert(False) # invalid mapSource ?
-            mapInfoStr = transListStr+" (%s)" % mapInfo
+                assert(False) # bug
+
+            mapInfoStr = "%s: %s" % (mapDesc, transListStr)
         else:
-            mapInfoStr = "(mapped with direct BLAT)"
+            mapInfoStr = "no transcript: direct BLAT to genome"
 
         row.append(mapInfoStr)
 
-        row = addExtraGeneInfoFields(row, dbName, recAnnot)
+        protDb = accToDb[acc]
+
+        # there are two types of sequences in UniProt: the "main" ones, that
+        # are annotated, and the others ("isoforms"), which are not annotated.  I now color
+        # the alternative isoforms like trembl sequences
+        isMain = (acc==recAnnot.mainIsoAcc)
+        isMainStr = "alternative isoform"
+        if isMain:
+            isMainStr = "primary sequence"
+        if protDb=="swissprot":
+            if isMain:
+                color = "2,12,120"
+            else:
+                color= "0,150,250" # no need to distinguish, we have two different tracks for trembl and swissprot now
+        else:
+            color = "0,150,250" # trembl doesn't have isoforms, at least I've never seen one
+        row[8] = color
+
+        # treat swissprot isoforms as if they were trembl sequences. This makes some sense from a user perspective.
+        # as the isoforms are almost as useless as the trembl annotations. :-) It makes the filtering UI simpler.
+        featDbName = protDb
+        if not isMain and protDb=="swissprot":
+            featDbName = "trembl"
+        row = addExtraGeneInfoFields(row, featDbName, recAnnot)
+
+        # allow user to filter on main/not-main attribute
+        row.append(isMainStr)
 
         # add the sequence (for the alignment, oSequence)
         seq = protSeqs[acc]
         row[17] = seq
         # add the CDS field oCDS
-        row[18] = "1.."+str(3*len(seq))
+        #row[18] = "1.."+str(3*len(seq))
+        row[18] = "" # email from Braney: Put nothing in the CDS field
 
-        if dbName=="swissprot":
+        if protDb=="swissprot":
             ofh   = ofhSwiss
-        elif dbName=="trembl":
+        elif protDb=="trembl":
             ofh   = ofhTrembl
         else:
             assert(False)
 
         ofh.write("\t".join(row))
         ofh.write("\n")
 
     ofhSwiss.close()
     if ofhTrembl:
         ofhTrembl.close()
 
     asFname = join(asDir, "bigPslUniprot.as")
     cmd = "bedToBigBed -extraIndex=acc,name -type=bed12+ -as=%s -tab %s %s %s" % \
             (asFname, bpInputSwissFname, chromSizesFname, bigPslSwissFname)
     run(cmd)
 
     if ofhTrembl:
         cmd = "bedToBigBed -extraIndex=acc -type=bed12+ -as=%s -tab %s %s %s" % \
                 (asFname, bpInputTremblFname, chromSizesFname, bigPslTremblFname)
         run(cmd)
 
+    # output basic mapping information for debugging later
+    mapInfoFh = open(infoFname, "w")
+    mapInfoFh.write("#acc\tdb\tsyms\tentrez\tmapMethod\ttargetTrans\talignedTrans\tchromLocs\n")
+    for acc, db in accToDb.items():
+        # write debugging information to the mapInfo file
+        allLocs = []
+        allTrans = []
+        if acc in protToLocs:
+            for locInfo in protToLocs[acc]:
+                locs, mapSource, transList = locInfo
+                allLocs.append(locs)
+                allTrans.append(transList)
+            locStr = ",".join(allLocs)
+            transStr = "|".join(allTrans)
+        else:
+            locStr = "NA"
+            transStr = "NA"
+            mapSource = "NA"
+
+        syms = accToMeta[acc].hgncSym.replace("|", ",")
+        genes = accToMeta[acc].entrezGene.replace("|", ",")
+        if accToTrans:
+            filterTransList = ",".join(accToTrans[acc])
+        else:
+            filterTransList = "NA"
+        row = (acc, db, syms, genes, mapSource, filterTransList, transStr, locStr)
+        mapInfoFh.write("\t".join(row))
+        mapInfoFh.write("\n")
+
+    mapInfoFh.close()
+    logging.info("Wrote %s" % mapFname)
+
+
 def parseRecordMeta(tabDir, taxId, doTrembl):
     """ parse the uniprot record-level annotations, like xrefs etc. return dict acc -> namedtuple
     links from ALL isoforms accessions of a record.
     """
     fnames = [
             (join(tabDir,"swissprot.%d.tab" % taxId))
     ]
 
     if doTrembl:
         fnames.append( (join(tabDir,"trembl.%d.tab" % taxId)) )
 
     ret = {}
     for fname in fnames:
         for row in iterTsvRows(open(fname)):
             for acc in getAllIsoAccs(row):
                 ret[acc] = row
     return ret
 
 def blatProteinsKeepBest(fullFaFname, db, mapFname, stats):
     " blat the proteins directly, only good for small genomes where we don't have any gene models "
     myDir = dirname(__file__)
-    #nuclFname = mapFname+".nucl"
     cmd = "blat -q=prot -t=dnax /gbdb/{db}/{db}.2bit {inf} stdout -noHead | sort -k10 | pslReps stdin stdout /dev/null " \
         "-minAli=0.95 -nohead | pslProtCnv > {out}".format(db=db, inf=fullFaFname, out=mapFname, myDir=myDir)
     stats["minAli"]=0.95
-    # originally used "| {myDir}/pslProtCnv " but got rid of the markd-script dependency now
+    #nuclFname = mapFname+".nucl"
+    # originally used "| {myDir}/pslProtCnv " but tried to get rid of the markd-script dependency 
     #pslToProtPsl(nuclFname, mapFname)
-    # figured that pslToProtPsl() is not the same as pslProtCnv... argll...
+    # then later figured that pslToProtPsl() is not the same as pslProtCnv and went back the MarkD way
     #os.remove(nuclFname)
     run(cmd)
 
 def getTransIds(db, geneTable, transcriptFa):
     """ return all possible transcript IDs given a gene table. The reason that we need is that Uniprot often
       refers to transcripts that don't exist and we are using a select file to map only to those. In these
       cases, we want to make sure that our select file contains only transcript that we actually have,
       so we can log how many transcript can possibly mapped (and which ones). This is important for debugging.
     """
     # no idea how to get the version suffix out of our mysql tables - thanks Obama!
     if geneTable=="refGene":
         logging.debug("Gene table is refGene, reading IDs from %s" % transcriptFa)
         allTrans = set(parseFasta(open(transcriptFa)).keys())
         return allTrans
 
     logging.info("Getting transcript IDs for db=%s, table=%s, field=name" % (db, geneTable))
     sql = "SELECT DISTINCT name from "+geneTable
     cmd = ["hgsql", db, "-NBe", sql]
     proc = subprocess.Popen(cmd, encoding="latin1", stdout=PIPE)
     allTrans = set()
     for line in proc.stdout:
         idStr = line.rstrip()
         allTrans.add(idStr)
     logging.info("Got %d distinct transcript IDs from MySQL table" % len(allTrans))
     return allTrans
 
-def writeTransToSelect(upSeqId, transIds, shortToLong, notFound, outTransIds, commonTrans, ofh, verDiffCount, pairCount):
-    " given a list of transcript IDs, write them to select file, if their base is found in shortToLong. Return count written "
+def writeTransToSelect(upSeqIds, transIds, shortToLong, notFound, outTransIds, commonTrans, \
+        ofh, verDiffCount, pairCount):
+    """ given a list of transcript IDs, write them to select file, if their base (before dot)
+    is found in shortToLong. Return count written """
     foundTransCount = 0
     for transId in transIds:
         # remove the version for the comparison, but keep the version in the pslSelect file
         shortId = transId.split(".")[0]
         if shortId not in shortToLong:
             # if we do not have this transcript, we don't write anything into the pair select file
             # This means that pslSelect falls back to the defaults, passes through everything and then
             # pslCdnaFilter picks the best ones
             notFound.append(transId)
             continue
 
         commonTrans.add(transId)
         if transId!=shortToLong[shortId]:
             verDiffCount+=1
         pairCount += 1
 
         outTransIds.add(transId)
         # here we're fixing up the select file: we replace the acc.version that UniProt has 
         # with the acc.version that we have in our transcript file. This saves a few hundred
         # transcripts, sometimes thousands, from getting removed
+        for upSeqId in upSeqIds:
             ofh.write("%s\t%s\n" % (upSeqId, shortToLong[shortId]))
         foundTransCount+=1
     return foundTransCount, verDiffCount, pairCount
 
 def buildSelectFile(tabFnames, mapDir, taxId, db, geneTable, transcriptFa, geneToRefSeqs, uniprotMd5, transMd5, stats):
-    """ make a two-column file uniprotId <-> transcript ID for filtering the PSL file. Helps force the alignment to the correct
-    transcript. see makeUniProtPsl.sh """
+    """ make a two-column file uniprotId <-> transcript ID for filtering the
+    PSL file. The tool for this is called pslSelect. This helps force the alignment
+    to the correct transcript. It turned out to be absolutely necessary. see makeUniProtPsl.sh """
     # get the UniProt field that holds the list of annotated transcript IDs
     if "encode" in geneTable or "ensGene" in geneTable:
         transField = "ensemblTrans"
     elif "refGene" in geneTable or "ncbiRefSeq" in geneTable:
         transField = "refSeq"
     else:
         logging.info("No supported gene track found. The protein/transcript alignments will not be filtered using pslSelect.")
         stats["noGeneTrack"] = True
-        return None, stats
+        # anoGam1 for example, has only augustus genes as a track
+        return None, stats, {}
 
     allTransIds = getTransIds(db, geneTable, transcriptFa)
 
     # create a mapping from no-version (short) to with-version accession (long), so we can fix them up later 
     # to the accessions that we actually have in our fasta file
     shortToLong = {}
     for acc in allTransIds:
         shortToLong[acc.split(".")[0]] = acc
 
     logging.debug("Example transcript IDs: %s" % list(allTransIds)[:10])
 
     uniprotMd5 = uniprotMd5[:10]
     transMd5 = transMd5[:10]
 
     selectFname = join(mapDir, "%(geneTable)s_%(uniprotMd5)s_%(transMd5)s.upToTrans.tab" % locals())
     logging.info("Building pair file %s" % selectFname)
     ofh = open(selectFname, "w")
 
     notFound = []
 
-    # the following loop is FULL of statistics. It took me a long time to get my head around 
+    # the following loop is FULL of stat tracking. It took me a long time to get my head around 
     # all the possible ways that the mapping can go astray and I wanted to keep them all
-    # This makes the code hard to read
+    # This makes the code harder to read but simplifies debugging a lot when ML questions come in.
+
+    # these are used to collect data for the statistics
     upRecCount = 0
     outUpIds = set()
     inTransIds = set()
     outTransIds = set()
     commonTrans = set()
     notFoundAtAll = set()
     pairCount = 0
     verDiffCount = 0
     verDiffCount2 = 0
-    protMapSource = {}
     sourceCounts = defaultdict(int)
 
+    # these are data that are important for other functions and are returned
+    protMapSource = {} # sequence accession -> "uniprot", "entrez" or "best"
+    mainSeqIds = set() # all the sequence accessions that are the "main" isoform, selected and annotated by UniProt. The others are not annotated.
+
     for fname in tabFnames:
         fieldIdx = None
         entrezIdx = None
+        isoIdx = None
         for line in open(fname):
             row = line.rstrip("\n").split('\t')
             if fieldIdx is None:
                 # header line
                 fieldIdx = row.index(transField)
                 entrezIdx = row.index("entrezGene")
+                isoIdx = row.index("isoIds")
                 continue
             upRecCount += 1
 
-            upSeqId = row[2] # not row[1], row[2] is the name of the sequence
+            upAcc = row[1] # uniProt record ID (NOT sequence ID, the sequences are often -1, -2, -3, etc)
+
+            # make a list of all possible _sequence_ accessions for this record in Uniprot, so main + isoforms
+            mainSeqId = row[2] # not row[1], row[2] is the name of the main sequence ID
+            mainSeqIds.add(mainSeqId)
+
             #logging.debug("UniProt ID: %s" % upSeqId)
+            allSeqIds = [mainSeqId]
+            isoSeqStr = row[isoIdx]
+            if (isoSeqStr!=""):
+                isoSeqIds = isoSeqStr.split("|")
+                allSeqIds.extend(isoSeqIds)
 
             transIdStr = row[fieldIdx]
             entrezGenes = row[entrezIdx]
 
             if transIdStr=="":
-                #logging.debug("Empty transcript ID")
-                continue
+                #logging.debug("%s: Empty transcript ID" % mainSeqId)
+                transIds = []
+            else:
                 transIds = transIdStr.split("|")
                 #logging.debug("Transcript IDs: %s" % transIds)
 
-            outUpIds.add(upSeqId)
+            outUpIds.update(allSeqIds)
             inTransIds.update(transIds)
 
-            # stage 1: first try to use the RefSeq IDs as they were annotated by UniProt
-            foundTransCount, verDiffCount, pairCount = writeTransToSelect(upSeqId, transIds, shortToLong, notFound, outTransIds, commonTrans, ofh, verDiffCount2, pairCount)
+            # stage 1: first try to use the transcript (RefSeq) IDs as they were annotated by UniProt
+            foundTransCount, verDiffCount, pairCount = writeTransToSelect(allSeqIds, transIds, \
+                    shortToLong, notFound, outTransIds, commonTrans, ofh, verDiffCount2, pairCount)
             mapSource = None
 
             if foundTransCount != 0:
                 mapSource = "uniprot"
             else:
+                # optional stage2: try to go through entrez gene - not as precise, but better than nothing
                 if entrezGenes!="" and geneToRefSeqs:
-                    # stage2: try to go through entrez gene - not as precise, but better than nothing
 
                     # there can be several entrez genes for one uniprot ID
                     entrezGenes = entrezGenes.split('|')
                     transIds = set()
                     for geneId in entrezGenes:
                         if geneId in geneToRefSeqs:
                             # and of course many refseq for one geneId
                             refseqs = geneToRefSeqs[geneId]
                             transIds.update(refseqs)
 
                     foundTransCount, verDiffCount2, pairCount = \
-                            writeTransToSelect(upSeqId, transIds, shortToLong, notFound, outTransIds, commonTrans, ofh, verDiffCount2, pairCount)
-                    if foundTransCount==0:
-                        mapSource = "best"
-                    else:
+                            writeTransToSelect(allSeqIds, transIds, shortToLong, notFound, \
+                                    outTransIds, commonTrans, ofh, verDiffCount2, pairCount)
+
+                    if foundTransCount!=0:
                         mapSource = "entrez"
                     else:
+                        # if stage 2 also didn't work, simply use the best matching transcript
+                        mapSource = "best"
+                else:
                     mapSource = "best"
 
-            protMapSource[upSeqId] = mapSource
+            # now that we know how we map these proteins to the genome, keep this for later and keep the stats
+            for seqId in allSeqIds:
+                protMapSource[seqId] = mapSource
             sourceCounts[mapSource] += 1
             if mapSource=="best":
-                notFoundAtAll.add(upSeqId)
+                notFoundAtAll.add(upAcc)
 
     sourceCounts = list(dict(sourceCounts).items())
 
     stats["notFoundAtAll"] = len(notFoundAtAll)
     stats["notFoundAtAll_Examples"] = list(notFoundAtAll)[:10]
     stats["verDiffCount"] = verDiffCount
     stats["verDiffCount2"] = verDiffCount2
     stats["sourceCounts"] = sourceCounts
 
     # for debugging
     if len(notFoundAtAll)!=0:
         notFoundFname = join(mapDir, "lastRun_noAnnotatedTranscriptIDs.txt")
         notFoundFh = open(notFoundFname, "w")
         #notFoundFh.write(str(notFoundAtAll)+"\n")
         for acc in notFoundAtAll:
             notFoundFh.write(acc+"\n")
         notFoundFh.close()
         logging.info("Wrote UniProt IDs where no transcript could be found to %s" % notFoundFname)
 
     if pairCount==0:
         logging.warn("No single uniprot/transcript pair is left in select file. Not doing pair file filtering at all.")
         stats["outPairCount"]=0
-        return None, stats
+        return None, stats, protMapSource
 
     # make a little table with NM_ -> count, XM_ -> count, etc
     prefixCounts = defaultdict(int)
     for nfId in notFound:
         prefixCounts[nfId[:3]]+=1
 
     ofh.close()
     logging.info("Processed %d UniProt records, trying to match with the transcript IDs from '%s'" % (upRecCount, geneTable))
     logging.info("Found %d transcript IDs in UniProt" % len(inTransIds))
     logging.info("Found %d transcript IDs in the genePred table %s" % (len(allTransIds), geneTable))
     logging.info("%d transcript IDs found in both UniProt and transcript table" % len(commonTrans))
     if len(notFound)!=0:
         logging.info("%d transcript IDs found in UniProt records were not found in '%s'. These mappings were removed. Example: %s" %
                 (len(notFound), geneTable, notFound[0]))
     logging.info("The IDs from UniProt not found in the genePred fall into these classes: %s" % dict(prefixCounts))
     logging.info("%d proteins could not be mapped by any method, so they fall back to best-alignment" % len(notFoundAtAll))
     logging.info("The UniProt proteins were mapped using these methods (type:count): %s" % sourceCounts)
     logging.info("The transcript version suffix was corrected for %d transcripts at stage 1, and %d transcript at stage 2" % (verDiffCount, verDiffCount2))
     logging.info("The pslSelect file contains %d pairs: %d uniprot IDs are associated to %d transcript IDs" % (pairCount, len(outUpIds), len(outTransIds)))
 
+    stats["uniprotRecCount"] = len(mainSeqIds)
     stats["protSeqCount"] = upRecCount
     stats["uniprotTransCount"] = len(inTransIds)
     stats["browserTransCount"] = len(allTransIds)
     stats["commonTransCount"] = len(commonTrans)
     stats["notFoundTransCount"] = len(notFound)
     if len(notFound)!=0:
         stats["notFound_examples"] = list(notFound[:10])
         stats["notFoundClasses"] = dict(prefixCounts)
     stats["outPairCount"] = pairCount
     stats["outPairUniprot"] = len(outUpIds)
     stats["outPairTrans"] = len(outTransIds)
 
     return selectFname, stats, protMapSource
 
 def makeTranscriptFiles(db, geneTable, taxId, dbDir):
     """ given a db and a gene table, create two files,
     one for the transcript fasta sequences and one for the transcript -> genome PSL. Return tuple of the file names
     ALWAYS REMOVES _ALT/_FIX/_HAP results from the PSL!
     """
     #dbDir = join(mapDir, db+"_"+geneTable)
     #if not isdir(dbDir):
         #os.makedirs(dbDir)
 
     faName = join(dbDir, "transcripts.fa")
     pslName = join(dbDir, "transcripts.psl")
 
     pslFields = "matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert,tNumInsert," \
         "tBaseInsert,strand,qName,qSize,qStart,qEnd,tName,tSize,tStart,tEnd,blockCount,blockSizes,qStarts,tStarts"
 
-    if geneTable in ["ncbiRefSeq", "refGene"]:
     if geneTable=="ncbiRefSeq":
         origFa = "/gbdb/%s/ncbiRefSeq/seqNcbiRefSeq.rna.fa" % db
         shutil.copyfile(origFa, faName)
         pslTable = "ncbiRefSeqPsl"
+
+        query = "SELECT "+pslFields+" from "+pslTable+" WHERE tName NOT LIKE '%_hap%' AND tName not like '%_alt%' AND tNAME NOT LIKE '%_fix%'"
+        runQueryToFile(db, query, pslName)
+
     elif geneTable=="refGene":
         # ChrisL told me where these files can be found
         #cmd = "curl https://hgdownload.cse.ucsc.edu/goldenpath/%s/bigZips/refMrna.fa.gz | zcat > %s" % (db, faName)
         cmd = "zcat /hive/data/outside/genbank/data/ftp/%s/bigZips/refMrna.fa.gz | tr ' ' '.' > %s" % (db, faName)
         run(cmd)
-            pslTable = "refSeqAli"
+        pslFields = "matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert,tNumInsert," \
+            "tBaseInsert,strand,concat(qName, '.', gbSeq.version),qSize,qStart,qEnd,tName,tSize,tStart,tEnd,blockCount,blockSizes,qStarts,tStarts"
 
-        query = "SELECT "+pslFields+" from "+pslTable+" WHERE tName NOT LIKE '%_hap%' AND tName not like '%_alt%' AND tNAME NOT LIKE '%_fix%'"
+        query = "SELECT "+pslFields+" from refSeqAli, hgFixed.gbSeq WHERE refSeqAli.qname=gbSeq.acc and tName NOT LIKE '%_hap%' AND tName not like '%_alt%' AND tNAME NOT LIKE '%_fix%'"
         runQueryToFile(db, query, pslName)
     else:
         query = "SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds from %s where cdsStart < cdsEnd" % (geneTable)
         gpName = join(dbDir, "transcripts.gp")
         cdsName = join(dbDir, "transcripts.cds")
         runQueryToFile(db, query, gpName)
         cmd = ["genePredToFakePsl", db, gpName, pslName, cdsName]
         run(cmd)
         cmd = ["getRnaPred", db, gpName, "all", faName]
         run(cmd)
 
     logging.info("Created %s and %s for %s/%s" % (faName, pslName, db, geneTable))
     return faName, pslName
 
 def writeMapDesc(stats, db, geneTable, mapDescFname):
@@ -1596,217 +1733,286 @@
     m = hashlib.md5()
     for a in arr:
         m.update(a.encode("utf8"))
     return m.hexdigest()
 
 def readGeneToRefSeq(taxId):
     " return entrezId -> list of Refseq from tsv file "
     g2refseq = {}
     fname = join(NCBIDIR, str(taxId)+".tsv")
     for line in open(fname):
         geneId, transList = line.rstrip("\n").split('\t')
         transList = transList.split(',')
         g2refseq[geneId] = transList
     return g2refseq
 
+def parseKeyVal(fname):
+    " given a file with key<tab>val return a dict with key -> list of values "
+    if fname is None:
+        return None
+    vals = defaultdict(list)
+    for line in open(fname):
+        key, val = line.rstrip("\n").split("\t")
+        vals[key].append(val)
+    return vals
+
 def makeProteinGenomePsl(taxId, protFa, tabFnames, db, cluster, doForce, mapDir, mapDescFname):
     """ map proteins of a taxon stored in protFa to db via geneTable and create a PSL file for the mapping
-    Will try to reuse the old mapping file if possible, as it is expensive to build.
-    BLAST has no option to restrict the alignment, verified with an email to NCBI BLAST support team. """
+    Will try to reuse the old PSL mapping file if possible, as it is expensive to build.
+    Need to BLAST the entire file: BLAST has no option to subset the target (unlike blast or lastz), verified
+    with an email to NCBI BLAST support team.  A central point of this function
+    is buildSelectFile(). The psl select pair files have a huge influence on the result,
+    so it is rebuilt every time, as any change in UniProt will change it. It's also quick to create.
+    Its MD5 is part of the PSL mapping file. Any change to the pslSelect pair file will trigger
+    a realignment/rebuild of the pslMap PSL file.
+    """
     # use MD5s of the input files to determine if the protein -> genome map has to be rebuilt
     global MINALI
     protMd5 = fastaMd5(protFa)
 
     stats = OrderedDict()
     stats["user"] = os.getlogin()
     stats["taxId"] = taxId
     stats["db"] = db
 
     geneTable = findBestGeneTable(db)
 
     metaMd5 = manyTabMd5(tabFnames)
 
     if geneTable!="blat":
         transcriptFa, transcriptPsl = makeTranscriptFiles(db, geneTable, taxId, mapDir)
         transMd5 = fastaMd5(transcriptFa)
         pslMd5 = tabMd5(transcriptPsl)
     else:
+        # this usually only happens on viral genomes or other weird cases that don't have a single gene model track
+        # here we blat the proteins directly onto the genome
         transMd5, pslMd5 = "directBlat-noTranscripts", "directBlat-noPsl"
         transcriptFa, transcriptPsl = None, None
 
     geneToRefSeqs = None
+    selectFname = None
+    protToTrans = None
+
+    if geneTable in ["blat"]:
+        protMapSource = {"default":"direct"}
+    elif geneTable in ["augustusGene"]:
+        protMapSource = {"default":"best"}
+    else:
         if geneTable in ["refGene", "ncbiRefSeq"]:
             geneToRefSeqs = readGeneToRefSeq(taxId)
-
-    selectFname, stats, protMapSource = buildSelectFile(tabFnames, mapDir, taxId, db, geneTable, transcriptFa, geneToRefSeqs, metaMd5, transMd5, stats)
+        selectFname, stats, protMapSource = buildSelectFile(tabFnames, mapDir, taxId, db, \
+                geneTable, transcriptFa, geneToRefSeqs, metaMd5, transMd5, stats)
+        protToTrans = parseKeyVal(selectFname)
 
     allMd5s = [protMd5, transMd5, pslMd5]
     if selectFname:
         pairMd5 = tabMd5(selectFname)
         allMd5s.append(pairMd5)
         stats["pairMd5"] = pairMd5[:10]
+        stats["pairFname"] = selectFname
 
-    # if either the protein sequences, the transcripts, their alignment or the protein/transcript mapping changes
-    # -> create a new PSL mapping file
+    # if either the protein sequences, the transcripts, their alignment or the
+    # protein/transcript mapping changes -> create a new PSL mapping file
     fullMd5 = listMd5(allMd5s)[:10]
 
     mapFname = join(mapDir, "%(geneTable)s_%(fullMd5)s.psl" % locals())
 
     if isfile(mapFname) and not doForce:
         logging.info("%s already exists, not rebuilding the protein -> genome mapping PSL" % mapFname)
         assert(os.path.getsize(mapFname)!=0)
-        return mapFname, geneTable, fullMd5, protMapSource, False
+        # careful: if you modify this, also modify the other return statement below
+        return mapFname, geneTable, fullMd5, protMapSource, protToTrans, False
 
     stats["protMd5"] = protMd5[:10]
     stats["metaMd5"] = metaMd5[:10]
     stats["transMd5"] = transMd5[:10]
     stats["fullMd5"] = fullMd5[:10]
     stats["mapFname"] = basename(mapFname)
 
     logging.debug("%s does not exist" % mapFname)
     if geneTable=="blat":
         logging.error("Could not find any gene table for %s, using BLAT to map proteins" % db)
         blatProteinsKeepBest(protFa, db, mapFname, stats)
     else:
         if db in ["ci3"]: # wow: ciona's genome is from a different organism than the refseq transcripts.
             MINALI = 0.85
         stats["minAli"] = MINALI
         workDir = "clusterRun-map-%s-%s-%s.tmp" % (db, geneTable, fullMd5)
 
         scriptArgs = [protFa, transcriptFa, transcriptPsl, str(MINALI), cluster, workDir, mapFname]
         if selectFname is not None:
             scriptArgs.append(selectFname)
 
         # This is where the BLAST alignment-meat happens
         cmd = ["time", "./makeUniProtPsl.sh"]
         cmd.extend(scriptArgs)
         logging.info("Running %s" % cmd)
         run(cmd)
 
     writeMapDesc(stats, db, geneTable, mapDescFname)
 
     assert(os.path.getsize(mapFname)!=0)
-    return mapFname, geneTable, fullMd5, protMapSource, True
+    # if you modify this line, also modify the other return statement above
+    return mapFname, geneTable, fullMd5, protMapSource, protToTrans, True
 
-def convMapToBigPsl(fastaFname, mapFname, bigBedDir, db, accToDb, accToMeta, accToMapSource, chromSizesFname, doTrembl):
+def convMapToBigPsl(fastaFname, mapFname, bigPslDir, accToDb, accToMeta, accToMapSource, \
+        accToTrans, chromSizesFname, doTrembl):
     " convert the .psl mapping file to bigPsl annotated with a few basic prot infos "
-    bigPslDir = makeSubDir(bigBedDir, db)
     bigPslSwissFname = join(bigPslDir, "unipAliSwissprot.new.bb" % locals())
 
     if doTrembl:
-        bigPslTremblFname = join(bigBedDir, db, "unipAliTrembl.new.bb" % locals())
+        bigPslTremblFname = join(bigPslDir, "unipAliTrembl.new.bb" % locals())
     else:
         bigPslTremblFname = None
 
     protSeqs = parseFasta(open(fastaFname))
-    pslToBigPsl(mapFname, protSeqs, accToDb, accToMeta, accToMapSource, chromSizesFname, bigPslSwissFname, bigPslTremblFname)
+    infoFname = join(bigPslDir, "protMapInfo.tsv")
+    pslToBigPsl(mapFname, protSeqs, accToDb, accToMeta, accToMapSource, accToTrans, chromSizesFname, bigPslSwissFname, bigPslTremblFname, infoFname)
 
     # create a raw lift file in case someone wants to run pslMap one day
-    liftFname = join(bigBedDir, db, "unipToGenomeLift.psl")
+    liftFname = join(bigPslDir, "unipToGenomeLift.psl")
     shutil.copyfile(mapFname, liftFname)
     cmd = ["gzip","-f",liftFname]
     run(cmd)
     liftFname += ".gz"
     logging.info("Created %s" % liftFname)
 
-    chainFname = join(bigBedDir, db, "unipToGenome.over.chain")
-    cmd = ["pslToChain",liftFname, chainFname]
+    mapSwapFname = join(bigPslDir, "unipAlign.swap.psl")
+    cmd = ["pslSwap", mapFname, mapSwapFname]
+    run(cmd)
+
+    chainFname = join(bigPslDir, "unipToGenome.over.chain")
+    cmd = ["pslToChain",mapSwapFname, chainFname]
     run(cmd)
     cmd = ["gzip","-f",chainFname]
     run(cmd)
     logging.info("Created %s" % liftFname)
     chainFname += ".gz"
 
+    os.remove(mapSwapFname)
+
 def makeSubDir(parent, child):
     " mkdir child under parent and return "
     newDir = join(parent, child)
     if not isdir(newDir):
         logging.info("Making dir: %s" % newDir)
         os.makedirs(newDir)
     return newDir
 
-def updateUniprot(args, onlyDbs, taxIdDbs, options):
-    " This is the main function that runs a single uniprot update for a list of DBs "
-    uprotDir = options.uniprotDir
-    tmpDir = join(uprotDir, "download.tmp")
-    tabDir = options.tabDir
-    mapDir = options.mapDir
-    bigBedDir = options.bigBedDir
-    faDir = options.faDir
-    doTrembl = not options.skipTrembl
-
-    relFname = join(tabDir, "version.txt")
-
-    if not options.skipDownload and not options.skipParse and not options.onlyDbs:
-        # download NCBI -> refseq tables
-        downloadAndSplitNcbi(taxIdDbs)
-
-        # download UniProt using LFTP
+def downloadUniprot(uprotDir):
+    " if necessary, download UniProt using LFTP in an atomic way, update release info file "
     localFname = join(uprotDir, "uniprot_sprot.xml.gz")
     if not isNewer(upUrl, localFname):
         logging.info("files at %s are not newer than file in %s, nothing to do. Specify -l to skip this check." % (upUrl, localFname))
         sys.exit(0)
 
     # use lftp to update uniprot and download only changed files
     # in late 2020, pget started to trigger errors, so removed -P 3 and --use-pget-n=4
-        cmd = 'lftp ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/ -e "lcd %s && mirror . --exclude-glob *.dat.gz && exit"' % tmpDir
-        run(cmd)
+    tmpDir = join(uprotDir, "download.tmp")
+    if not isdir(tmpDir):
+        os.makedirs(tmpDir)
+
+    # the uniprot FTP server is too slow and too unreliable. Switching to EBI's which has rsync !!
+    #cmd = 'lftp ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/ -e "lcd %s && mirror . --exclude-glob *.dat.gz && exit"' % tmpDir
+    count = 0
+    while True:
+        cmd = "rsync -av --partial rsync://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/ %s/" % tmpDir
+        ret = run(cmd, ignoreErr=True)
+        if ret==0:
+            break
+
+        logging.info("Problem with rsync, waiting for 5 hours, then retrying")
+        time.sleep(5*60*60)
+        count += 1
+        if count > 5:
+            errAbort("Cannot run rsync cmd: %s" % cmd)
 
     # make sure that download interruptions don't mess up the files in the target dir
     logging.info("Moving files from %s to %s" % (tmpDir, uprotDir))
-        file_names = os.listdir(tmpDir)
-        for file_name in file_names:
-            outPath = join(uprotDir, file_name)
-            if file_name=="docs":
-                shutil.rmtree(outPath)
-
-            inPath = os.path.join(tmpDir, file_name)
-            if isdir(inPath):
-                shutil.move(inPath, outPath)
+    cmd = "rsync -a --remove-source-files %s/* %s" % (tmpDir, uprotDir)
+    run(cmd)
+    shutil.rmtree(tmpDir)
 
-        # only use the first four words of version string
+def readRelStringUniprot(uprotDir):
+    # only use the first four words of version string, the four words are
     # 'UniProt Knowledgebase Release 2017_09 consists of:'
     relString = open(join(uprotDir, "reldate.txt")).read().splitlines()[0]
     relString = " ".join(relString.split()[:4])
+    return relString
+
+def writeReleaseString(uprotDir, tabDir):
+    " read release info file from uprotDir and create a shorter version of it in tabDir "
+    relString = readRelStringUniprot(uprotDir)
+
+    relFname = join(tabDir, "version.txt")
     relFh = open(relFname, "w")
     relFh.write(relString)
     relFh.close()
     logging.debug("Wrote release version string '%s' to %s" % (relString, relFname))
 
+def updateUniprot(args, onlyDbs, taxIdDbs, options):
+    " This is the main function that runs a single uniprot update for a list of DBs "
+    uprotDir = options.uniprotDir
+    tabDir = options.tabDir
+    mapDir = options.mapDir
+    bigBedDir = options.bigBedDir
+    faDir = options.faDir
+    doTrembl = not options.skipTrembl
+
+    if not options.skipDownload and not options.skipParse and not options.onlyDbs:
+        # download NCBI -> refseq tables
+        downloadAndSplitNcbi(taxIdDbs)
+        downloadUniprot(uprotDir)
+
+    downVersion = readRelStringUniprot(uprotDir)
+    tabVersion = open(join(tabDir, "version.txt")).read()
+
+    doParse = True
+    if downVersion==tabVersion:
+        logging.info("Not converting to tab again, found same version is in tab directory")
+        doParse = False
+
+    if options.skipParse:
+        logging.info("Not converting to tab, was switched off by option")
+        doParse = False
+
+    if doParse:
+        logging.info("downloaded version is %s, parsed version is %s" % (downVersion, tabVersion))
         logging.info("Converting uniprot from %s to %s, using maps in %s" % (uprotDir, tabDir, mapDir))
-    if not options.skipParse and not options.onlyDbs:
         # parsing the UniProt XML files
         logging.info("Converting uniprot XML from %s to tab-sep in %s, this can take 2-3 days!! Specify -p to skip this." % (uprotDir, tabDir))
         dbToTaxIds = {}
         if onlyDbs:
             for taxId, dbs in taxIdDbs.items():
                 for db in dbs:
                     dbToTaxIds[db] = taxId
             taxIds = [dbToTaxIds[db] for db in onlyDbs]
         else:
             taxIds = [x for (x,y) in taxIdDbs.items()] # just the taxIds themselves
         taxIdStr = ",".join([str(x) for x in taxIds])
 
         myDir = dirname(__file__) # directory of doUniprot
         # takes an hour or so
         cmd="%s/uniprotToTab %s %s %s" % (myDir, uprotDir, taxIdStr, tabDir)
         run(cmd)
 
         # takes 2-3 days on hgwdev - big data and XML don't mix...
         cmd="%s/uniprotToTab %s %s %s --trembl" % (myDir, uprotDir, taxIdStr, tabDir)
         run(cmd)
 
+        writeReleaseString(uprotDir, tabDir)
+
     # if the uniprot update changed the sequences, update the corresponding pslMap files of that genome
     logging.info("checking/creating pslMap files")
     run("mkdir -p %s" % mapDir)
 
     # parasol cluster name
     assert(isfile(clusterFname)) # not running at UCSC?
     cluster = open(clusterFname).read().strip()
 
     # get the uniProt version for the trackVersion table that we will update later
     relFname = join(tabDir, "version.txt")
     versionString = open(relFname).read()
 
     for taxId, dbs in taxIdDbs.items():
         if onlyDbs is not None and len(set(dbs).intersection(onlyDbs))==0:
             continue
@@ -1832,96 +2038,107 @@
             tabFnames.append( "tab/trembl.%d.tab" % taxId )
 
         # find the best gene table for each database, create a mapping 
         # protein -> genome and lift the uniprot annotations to bigBed files
         for db in dbs:
             if onlyDbs is not None and db not in onlyDbs:
                 continue
 
             logging.debug("Annotating assembly %s" % db)
             chromSizesFname = "/hive/data/genomes/%s/chrom.sizes" % db
 
             accToMeta = parseRecordMeta(tabDir, taxId, doTrembl)
 
             dbMapDir = makeSubDir(mapDir, db)
             mapDescFname = join(dbMapDir, "liftInfo.json")
-            mapFname, geneTable, mapMd5, accToMapSource, isNewMap = \
+            mapFname, geneTable, mapMd5, accToMapSource, accToTrans, isNewMap = \
                 makeProteinGenomePsl(taxId, fullFaFname, tabFnames, db, cluster, options.force, dbMapDir, mapDescFname)
 
-            if isNewMap:
-                convMapToBigPsl(fullFaFname, mapFname, bigBedDir, db, accToDb, accToMeta, accToMapSource, chromSizesFname, doTrembl)
+            dbBigBedDir = makeSubDir(bigBedDir, db)
+
+            convMapToBigPsl(fullFaFname, mapFname, dbBigBedDir, accToDb, accToMeta, accToMapSource, accToTrans, chromSizesFname, doTrembl)
             shutil.copy(mapDescFname, mapDir)
 
             if options.onlyMap:
                 continue
 
-            dbBigBedDir = makeSubDir(bigBedDir, db)
-
             annotFnames = ["tab/swissprot.%d.annots.tab" % taxId]
             if doTrembl:
                 annotFnames.append( "tab/trembl.%d.annots.tab" % taxId )
 
             uniprotLift(fullFaFname, annotFnames, chromSizesFname, mapFname, dbBigBedDir, accToDb, accToMeta, options)
 
             shutil.copyfile(mapDescFname, join(dbBigBedDir, "liftInfo.json"))
 
 def makeSymlink(target, linkName):
-    if isfile(linkName):
-        os.remove(linkName)
-    if target is None or not isfile(target):
-        logging.error("Cannot symlink: %s does not exist" % str(target))
+    assert(".new" not in linkName) # make sure that this bug never happens again
+
+    if target is None:
+        logging.error("internal error, but not stopping: Symlink to a None target?")
         return
 
     targetPath = abspath(target)
+
+    if not isfile(targetPath):
+        logging.error("Cannot symlink: %s does not exist" % str(target))
+        return
+
+    if isfile(linkName):
+        currTarget = os.readlink(linkName)
+        logging.debug("Symlink %s already exists, points to %s. But should point to %s" % (linkName, currTarget, targetPath))
+        if targetPath != currTarget:
+            os.remove(linkName)
+
     linkPath = abspath(linkName)
     cmd = "ln -sf %(targetPath)s %(linkPath)s " % locals()
     run(cmd)
 
 def findForMask(fileMask):
     fnames = glob.glob(fileMask)
     if len(fnames)==0:
         logging.error("NOT FOUND: %s" % fileMask)
         return None
     assert(len(fnames)==1)
     fname = fnames[0]
     return fname
 
-def makeLinks(bigBedDir, faDir, onlyDbs, taxIdDbs):
+def makeLinks(bigBedDir, onlyDbs, taxIdDbs):
     " check the /gbdb symlinks "
     for taxId, dbs in taxIdDbs.items():
-        fullFaFname = join(faDir, str(taxId)+".fa")
-        md5 = None
 
         for db in dbs:
             if onlyDbs is not None and db not in onlyDbs:
                 continue
-            if md5 is None:
-                md5 = fastaMd5(fullFaFname) # do only once per organism, seqs don't change
 
             dbBigBedDir = join(bigBedDir, db)
 
             # find the bigBed files
             bbTargetDir = makeSubDir(join("/gbdb", db), "uniprot")
             bbFnames = glob.glob(join(dbBigBedDir, "*.bb"))
             logging.debug("Found %d bigBed files in %s" % (len(bbFnames), dbBigBedDir))
 
             # and create links to them
             for bbFname in bbFnames:
-                if "_" not in bbFname: # skip the bigPsl files with the md5 in them
+                # the following 'if' should not be necessary, but it has happened to me that during testing I had 
+                # the liftOver files or .new.bb files in bigBed/, so make sure that they never get linked, because our
+                # qa symlink checker will immediately complain
+                if not "_" in basename(bbFname) and ".new" not in basename(bbFname):
                     bbLink = join(bbTargetDir, basename(bbFname))
                     makeSymlink(bbFname, bbLink)
 
+            # and add the link to the bigPsls
+
 def checkPsl(pslName, faFname):
     " checking the qNames in a psl input (!) file "
     if pslName is None:
         logging.debug("Not found: %s" % pslName)
         return
 
     seqIds = list(parseFasta(faFname).keys())
     accs = set([x.split("-")[0] for x in seqIds])
 
     qNames = Counter()
     pslCount = 0
     for line in open(pslName):
         row = line.rstrip("\n").split("\t")
         qName = row[3]
         qNames[qName]+=1
@@ -1966,52 +2183,52 @@
     logging.info("10 worst multi-mappers: %s" % (", ".join(worstMultis)))
 
 def bedInfo(fname):
     " return number of features and coverage in bigBed file "
     cmd = "bigBedInfo %s" % fname
     proc = subprocess.Popen(cmd, encoding="latin1", stdout=PIPE, shell=True)
     for line in proc.stdout:
         # itemCount: 2,662
         # basesCovered: 341,994
         if line.startswith("itemCount"):
             itemCount = int(line.split()[1].replace(",",""))
         if line.startswith("basesCovered"):
             basesCovered = int(line.split()[1].replace(",",""))
     return (itemCount, basesCovered)
 
-def flipNewVersionCheckMaxDiff(bigBedDir, onlyDbs, taxIdDbs, maxDiff=None):
+def flipNewVersionCheckMaxDiff(bigBedDir, onlyDbs, taxIdDbs, doForce, maxDiff=None):
     """ rename all .new.bb files to .bb files, but only if they do not diff in more than factor maxDiff  
     If any database fails the check, no files will be moved.
     """
     for taxId, dbs in taxIdDbs.items():
         for db in dbs:
             if onlyDbs is not None and db not in onlyDbs:
                 continue
             logging.info("Flipping bigBed files for %s" % db)
             for newFname in glob.glob(join(bigBedDir, db, "*.new.bb")):
                 if maxDiff is None:
                     logging.info("Not doing maxDiff check, --force or --flipOnly specified")
                     continue
 
                 oldFname = newFname.replace(".new.bb", ".bb")
                 if not isfile(oldFname):
                     logging.warning("%s does not exist, cannot do max-diff check" % oldFname)
                 else:
                     newCount, newCov = bedInfo(newFname)
                     oldCount, oldCov = bedInfo(oldFname)
 
-                    if db in checkDbs:
+                    if db in checkDbs and not doForce:
                         if oldCount > 10000 and newCount / oldCount > maxDiff:
                             logging.error("%s: old feature count %d, new feature count %d. Too big difference." %
                                     (oldFname, oldCount, newCount))
                             logging.error("Stopping now. Re-Run with --flipOnly to move files over.")
                             sys.exit(1)
                         if oldCov > 10000 and newCov / oldCov > maxDiff:
                             logging.error("%s: old coverage %d, new coverage %d. Too big difference." % (oldFname, oldCov, newCov))
                             logging.error("Stopping now. Re-Run with --flipOnly to move files over.")
                             sys.exit(1)
 
 
     # only when everything is OK, do the flip
     for taxId, dbs in taxIdDbs.items():
         for db in dbs:
             if onlyDbs is not None and db not in onlyDbs:
@@ -2036,69 +2253,72 @@
 
             # find the psl file
             pslMask = join(mapDir, "%(taxId)s_%(db)s_*_%(md5)s.swissprot.pslInput" % locals())
             pslName = findForMask(pslMask)
             faFname = join(tabDir, "swissprot.%s.fa.gz" % taxId)
             checkPsl(pslName, faFname)
 
             pslMask = join(mapDir, "%(taxId)s_%(db)s_*_%(md5)s.trembl.pslInput" % locals())
             pslName = findForMask(pslMask)
             faFname = join(tabDir, "trembl.%s.fa.gz" % taxId)
             checkPsl(pslName, faFname)
 
 def createVersionFiles(tabDir, bigBedDir, onlyDbs, taxIdDbs):
     " update the release version files. This is the indicator for the auto-archiver to create a new archive "
     relFname = join(tabDir, "version.txt")
-    versionString = open(relFname, encoding="utf8").read()
+    versionString = open(relFname, encoding="utf8").read().strip()
     shortVersion = versionString.split()[-1]
     if (len(shortVersion)!=7):
         logging.warning("UniProt Version string is not seven characters long")
     logging.debug("Release is: %s" % shortVersion)
 
     for taxId, dbs in taxIdDbs.items():
         for db in dbs:
             if onlyDbs is not None and db not in onlyDbs:
                 continue
             dbDir = join(bigBedDir, db)
             if not isdir(dbDir):
                 continue
 
             liftInfo = json.load(open(join(dbDir, "liftInfo.json")))
             versionOfh = open(join(dbDir, "version.txt"), "w")
 
-            fullVersion = versionString + (", lifted through: %s %s on %s (taxId %s, %s)" % \
+            fullVersion = versionString + (", mapped to genome through gene transcripts from %s %s on %s (taxId %s, %s)" % \
                 (liftInfo["geneTable"], liftInfo["version"], liftInfo["createdDate"], liftInfo["taxId"], liftInfo["fullMd5"]))
             versionOfh.write(fullVersion)
             versionOfh.close()
 
             linkName = join("/gbdb", db, "uniprot", "version.txt")
             makeSymlink(versionOfh.name, linkName)
 
             logging.debug("Wrote release string to %s, symlink from %s" % (versionOfh.name, linkName))
 
     return versionString, shortVersion
 
 def makeTrackDb(archDir, shortVersion):
     " create a trackDb.txt file for the archive hub in archDir "
     templFname = join(dirname(__file__), "trackDb.template.txt")
     tdb = open(templFname).read()
 
     tdb = tdb.replace("$VER", shortVersion)
 
+    tdbLines = tdb.splitlines()
+    tdbLines = [l for l in tdbLines if not l.startswith("#")]
+
     tdbFname = join(archDir, "trackDb.txt")
     with open(tdbFname, "w") as ofh:
-        ofh.write(tdb)
+        ofh.write("\n".join(tdbLines))
     logging.info("Created %s" % tdbFname)
 
 def copyToArchive(bigBedDir, archRoot, shortVersion, onlyDbs):
     " make copies of the track files under the archiveDir and adapt the 'current' symlink "
     for db in os.listdir(bigBedDir):
         if onlyDbs and db not in onlyDbs:
             continue
 
         archDir = join(archRoot, db, "uniprot", shortVersion)
         if not isdir(archDir):
             os.makedirs(archDir)
 
         inDir = join(bigBedDir, db)
 
         count = 0
@@ -2167,77 +2387,81 @@
             continue
         geneId = row.GeneID
         transId = row.RNA_nucleotide_accession_version
         if transId!="-":
             geneToTrans[geneId].add(transId)
         if lastTax!=taxId and lastTax is not None:
             writeGeneTsv(geneToTrans, lastTax, outDir)
             geneToTrans = defaultdict(set)
         lastTax = taxId
 
     writeGeneTsv(geneToTrans, lastTax, outDir)
 
 def downloadAndSplitNcbi(taxIdDbs):
     " download and split the NCBI genes file with a mapping NCBI gene -> RefSeq transcripts "
     logging.info("Downloading NCBI gene2refseq file gene2refseq.tsv")
-    cmd = "curl -Ss https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz | zcat > ncbi/gene2refseq.tsv"
+    # removed -Ss trying to get error messages to show up
+    cmd = "curl https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz | zcat > ncbi/gene2refseq.tsv"
     run(cmd)
     splitGeneRefseq("ncbi/gene2refseq.gz", NCBIDIR, taxIdDbs)
 
 def delFlag():
     global flagFname
     if isfile(flagFname):
         os.remove(flagFname)
 
 def main():
     global flagFname
 
     args, options = parseArgs()
 
     onlyDbs = None
     if options.onlyDbs:
         onlyDbs = set(options.onlyDbs.split(","))
 
     taxIdDbs = getTaxIdDbs(onlyDbs)
 
     if options.mapQa:
-        #mapQa(options.tabDir, options.faDir, options.mapDir, onlyDbs, taxIdDbs)
+        mapQa(options.tabDir, options.faDir, options.mapDir, onlyDbs, taxIdDbs)
         sys.exit(0)
 
     if options.onlyFlip:
-        flipNewVersionCheckMaxDiff(options.bigBedDir, onlyDbs, taxIdDbs)
+        flipNewVersionCheckMaxDiff(options.bigBedDir, onlyDbs, taxIdDbs, options.force)
         # really need this here?
         relStr, shortVersion = createVersionFiles(options.tabDir, options.bigBedDir, onlyDbs, taxIdDbs)
         copyToArchive(options.bigBedDir, options.archiveDir, shortVersion, onlyDbs)
         makeHubs(options.archiveDir, onlyDbs)
         sys.exit(0)
 
     # create a lock file
     flagFname = join(options.uniprotDir, "doUniprot.lock")
     if isfile(flagFname) and not options.debug and not options.force:
         logging.error("%s exists. Is a doUniprot process already running ? "
             "If not, remove the flag file and restart this script." % flagFname)
         sys.exit(1)
     open(flagFname, "w")
     atexit.register(delFlag)
 
     if not options.onlyLinks:
-        # THIS IS THE MAIN FUNCTION
+        # THIS IS THE CORE PART
         updateUniprot(args, onlyDbs, taxIdDbs, options)
 
-        flipNewVersionCheckMaxDiff(options.bigBedDir, onlyDbs, taxIdDbs, maxDiff=maxDiffVersions)
+        flipNewVersionCheckMaxDiff(options.bigBedDir, onlyDbs, taxIdDbs, options.force, maxDiff=maxDiffVersions)
 
     if not options.skipLinks:
-        makeLinks(options.bigBedDir, options.faDir, onlyDbs, taxIdDbs)
+        makeLinks(options.bigBedDir, onlyDbs, taxIdDbs)
 
     relStr, shortVersion = createVersionFiles(options.tabDir, options.bigBedDir, onlyDbs, taxIdDbs)
     copyToArchive(options.bigBedDir, options.archiveDir, shortVersion, onlyDbs)
 
     makeHubs(options.archiveDir, onlyDbs)
 
     if isfile(flagFname):
         os.remove(flagFname)
 
+    logging.info("Uniprot pipeline completed.")
+
 try:
     main()
 except Exception as e:
     logging.error(e, exc_info=True)
+    sys.exit(1)