src/hg/makeDb/doc/evaSnp5.txt c874d12e8680a17fffa2d6d6f4d64a66c1a3878c

c874d12e8680a17fffa2d6d6f4d64a66c1a3878c
lrnassar
  Mon Sep 11 16:57:10 2023 -0700
Fixing a spot where I forgot to change 4 to 5, feedback from CR refs #32223

diff --git src/hg/makeDb/doc/evaSnp5.txt src/hg/makeDb/doc/evaSnp5.txt
index 11e5fb8..9eceb01 100644
--- src/hg/makeDb/doc/evaSnp5.txt
+++ src/hg/makeDb/doc/evaSnp5.txt
@@ -1,600 +1,600 @@
-# Track for EVA snp release 4  - https://www.ebi.ac.uk/eva/?RS-Release&releaseVersion=5
+# Track for EVA snp release 5  - https://www.ebi.ac.uk/eva/?RS-Release&releaseVersion=5
 # Tracks built by Lou on 9/7/2023
 
 # Track was built for the following 35 assemblies
 
 # The GCA accession on the eva release by accession list (https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/)
 # were compared to all native assemblies we have. In total there are 895,806,314 variants
 
 # All assemblies were passed by the python pipeline described below. Any special cases
 # have been baked into the pipeline itself, such as
 # bosTau9 and galGal5 where I had to remove their chrMT as we had none,
 # and rheMac8 I had to change the name of the chrMT to one that matches us. See RM #32095
 
 #####################################
 #### Beginning of python3 code ######
 #####################################
 
 import re, os, subprocess, sys
 
 def findAllAccFromDescPages():
     """Search all gbdb description pages and pull out GCA/GCF accessions"""
     listOfHits = []
     assemblyAlias = {}
     allDescPages = bash("ls /gbdb/*/html/description.html").rstrip().split("\n")
     regex = re.compile('GC[AF]_[0-9]*\.[0-9]*')
     for page in allDescPages:
         try:
             hits = bash("cat "+page+" | grep -i 'GCA\|GCF'").rstrip().split("\n")
         except:
             hits = []
         if hits != []:
             ucscName = page.split("/")[2]
             hits = str(hits)
             assemblyAlias[ucscName] = {}
             try:
                 accession = regex.findall(hits)[0]
                 if accession.startswith("GCA"):
                     assemblyAlias[ucscName]['GCA'] = accession
                 elif accession.startswith("GCF"):
                     assemblyAlias[ucscName]['GCF'] = accession
             except:
                 print("The following page could not be matched with the accession format",page,hits)
     return(assemblyAlias)
 
 def buildGCFdicFromGenbankFile(evaSnpVersionNumber):
     checkDirMakeDir("/hive/data/outside/eva"+evaSnpVersionNumber+"/")
     genbankSummaryDic = {}
     regexGenbankGCA = re.compile('GCA_[0-9]*\.[0-9]*')
     regexGenbankGCF = re.compile('GCF_[0-9]*\.[0-9]*')
     wgetFile("https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt", "/hive/data/outside/eva"+evaSnpVersionNumber+"/")
     summaryFile = open('/hive/data/outside/eva'+evaSnpVersionNumber+'/assembly_summary_genbank.txt','r')
     for line in summaryFile:
         if "GCF_" in line:
             GCAacc = regexGenbankGCA.findall(line)[0]
             GCFacc = regexGenbankGCF.findall(line)[0]
             genbankSummaryDic[GCFacc] = GCAacc
     summaryFile.close()
     return(genbankSummaryDic)
 
 def findGCAaccForUCSCgcfDbs(assemblyAlias,genbankSummaryDic):
     """Try to find matching GCA accession to UCSC dbs based on genbank file or curling web page"""
     n=0
     regexGenbankGCA = re.compile('GCA_[0-9]*\.[0-9]*')
     for key, value in assemblyAlias.items():
         if 'GCF' in value:
             if value['GCF'] in genbankSummaryDic.keys():
                 assemblyAlias[key]['GCA'] = genbankSummaryDic[value['GCF']]
                 n+=1
             else: #Try to find it online at NCBI
                 ncbiGCF = value['GCF']
                 ncbiWgetPage = bash("curl https://www.ncbi.nlm.nih.gov/assembly/"+ncbiGCF+"/").rstrip().split("\n")
                 for line in ncbiWgetPage:
                     if "GenBank assembly accession" in line and "GCA_" in line:
                         for subLine in line.split("/dd"):
                             if "GenBank assembly accession" in subLine:
                                 GCAacc = regexGenbankGCA.findall(subLine)[0]
                                 assemblyAlias[key]['GCA'] = GCAacc
                                 n+=1
     print("Total number of UCSC GCFs matched with GCAs: ",n)
     return(assemblyAlias)
 
 def sanitizeListToOnlyUCSCdbsWithGCA(assemblyAlias):
     n = 0
     m=0
     ucscGCAlist = {}
     for key, value in assemblyAlias.items():
             if 'GCA' in value:
                 if value['GCA'] != 'GCA_000':
                     ucscGCAlist[key] = value['GCA']
                 n+=1
             else:
                 m+=1
     print("Total number of UCSC databases already with a GCA:",n)
     print("Total number of UCSC databases with GCF that will need GCA lookup:",m)
     return(ucscGCAlist)
 
 def buildFinalDicOfUCSCdbsWithEvaSnpData(ucscGCAlist,evaSnpVersionNumber):
     ucscEvaSnpDbs = {}
     n=0
     regexEvaSnpGCA = re.compile('GCA_[0-9]*\.[0-9]*')
     evaReleasePage = bash("curl https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_"+evaSnpVersionNumber+"/by_assembly/").rstrip().split("\n")
     activeRRassemblies = bash("""hgsql -h genome-centdb -e "select name from dbDb where active='1'" hgcentral""")
     for key in ucscGCAlist:
         if ucscGCAlist[key] in str(evaReleasePage):
             for line in evaReleasePage:
                 if ucscGCAlist[key] in line:
                     if key in activeRRassemblies:
                         fullGCA = regexEvaSnpGCA.findall(line)[0]
                         ucscEvaSnpDbs[key] = fullGCA
                         n+=1
     ucscEvaSnpDbs['mm10']='GCA_000001635.6' #Manually add mm10 which is weird due to patches
     print("Total number of assemblies with EvaSnp data: ",n)
     return(ucscEvaSnpDbs)
 
 def checkDirMakeDir(workDir):
     if not os.path.isdir(workDir):
         os.mkdir(workDir)
 
 def bash(cmd):
     """Run the cmd in bash subprocess"""
     try:
         rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
         bashStdoutt = rawBashOutput.stdout
     except subprocess.CalledProcessError as e:
         raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
     return(bashStdoutt)
 
 def wgetFile(url, workDir):
     bash("wget -N %s -P %s" % (url, workDir))
     fileName = workDir+url.split("/")[-1]
     return(fileName)
 
 def checkDupsAndUnzip(fileName,workDir,ucscDatabaseName):
     """Check for duplicate lines (common in EVA), and unzip file"""
     lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip()
     uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip()
     currentFileName = workDir+"evaSnps.vcf"
     #Add catches below in places where the chrom names don't match or are missing from UCSC for specific dbs
     if ucscDatabaseName == 'galGal5':
         bash("zcat %s | uniq | grep -v 'X52392.1' > %s" % (fileName,currentFileName))
     elif ucscDatabaseName == 'bosTau9':
         bash("zcat %s | uniq | grep -v 'MT' > %s" % (fileName,currentFileName))
     else:
         if lineCount != uniqCount:
             print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." % (lineCount, uniqCount))
             bash("zcat %s | uniq > %s" % (fileName,currentFileName))
         else:
             bash("zcat %s > %s" % (fileName,currentFileName))
     return(currentFileName)
 
 def sanitizeChromNames(currentFileName,dbs,workDir):
     """Swaps the chrom IDs with the accession names which are indexed in chromAlias"""
     inputFile = open(currentFileName,'r')
     outputFile = open(currentFileName+".fixed","w")
     IDtoAccessionDic = {}
 
     for line in inputFile:
         if line.startswith("##contig"):
             ID = line.split('=')[2].split(',')[0]
             accession = line.split('=')[3].split('"')[1]
             IDtoAccessionDic[ID] = accession
             outputFile.write(line)
         elif line.startswith("#"):
             outputFile.write(line)
         else:
             line = line.split("\t")
             line[0] = IDtoAccessionDic[line[0]]
             line = "\t".join(line)
             outputFile.write(line)
     inputFile.close()
     outputFile.close()
     #Replacing the same name because chromAlias should be updated soon and this function will be removed
     bash("mv %s.fixed %s" % (currentFileName,currentFileName))
     currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
     print("Chroms converted")
     
 def addChrToChromNumbers(fileName,currentFileName,workDir):
     bash("zcat %s | uniq > %s" % (fileName,currentFileName))
     with open(workDir+"evaSnps.ucscChroms.vcf",'w') as newChromVcf:
         with open(currentFileName,"r") as oldChromVcf:
             for line in oldChromVcf:
                 if line.startswith("#"):
                     newChromVcf.write(line)
                 elif line.startswith("AC024175.3"): #Exception for zebrafish
                     newChromVcf.write(line.replace("AC024175.3","chrM"))
                 elif line.startswith("AY612638.1"): #Exception for rheMac8
                     newChromVcf.write(line.replace("AY612638.1","chrM"))    
                 else:
                     newChromVcf.write("chr"+line)
     currentFileName = workDir+"evaSnps.ucscChroms.vcf"
     return(currentFileName)
 
 def convertChromToUcsc(currentFileName,dbs,workDir):
     """Convert to UCSC-style chroms"""
     bash("chromToUcsc --get %s" % (dbs))
     bash("chromToUcsc -i %s -o %sevaSnps.ucscChroms.vcf -a %s" % (currentFileName,workDir,dbs+".chromAlias.tsv"))
     bash("rm %s" % (dbs+".chromAlias.tsv"))
     currentFileName = workDir+"evaSnps.ucscChroms.vcf"
     return(currentFileName)
 
 def convertVcfToBed(currentFileName,workDir):
     """Convert to bed keeping only VC and SID fields"""
     bash("bgzip %s" % (currentFileName))
     bash("tabix -p vcf %s" % (currentFileName+".gz"))
     bash("vcfToBed %s %sevaSnp -fields=VC,SID" % (currentFileName+".gz",workDir))
     currentFileName = workDir+"evaSnp.bed"
     return(currentFileName)
 
 def buildChromSizesDic(workDir,dbs):
     """Create a dictionary the chromSizes"""
     chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n")
     chromDic = {}
     for chrom in chromSizes[1:-1]:
         chrom = chrom.split("\t")
         chromDic[chrom[0]] = chrom[1]
     return(chromDic)
 
 def splitChromsAndRunHgVai(workDir,dbs):
     """Split all the chroms in tenths in order to be able to run hgVai without running out of memory"""
     chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n")
     inputFile = workDir+"evaSnps.ucscChroms.vcf.gz"
     outputFile = workDir+"evaSnps"+dbs+"VaiResults.vep"
     n=0
     allTables = bash("\hgsql -e \"show tables\" "+dbs+"")
     if dbs in ['galGal6','oviAri4','ponAbe3']: #Special exception to use refGene because ncbiRefSeq has incorrect protein sequence in Link table see #29262
         geneTableToUse = "refGene"
 #     elif "ncbiRefSeqSelect" in allTables: #Removing refseq select as the annotations are too sparse
 #         geneTableToUse = "ncbiRefSeqSelect"
     elif "ncbiRefSeqCurated" in allTables:
         geneTableToUse = "ncbiRefSeqCurated"
     elif "ensGene" in allTables:
         geneTableToUse = "ensGene"
     elif "refGene" in allTables:
         geneTableToUse = "refGene"
     elif "ncbiGene" in allTables:
         geneTableToUse = "ncbiGene"
     else:
         print(dbs)
         sys.exit("Could not find any tables to use for the following database: "+dbs)
 
     chromDic = buildChromSizesDic(workDir,dbs)
     #For function below, only bother with the chromosomes in the VCF
     chromsInVcf = bash("zcat "+inputFile+" | grep -v '^#' | cut -f1 | uniq").rstrip().split("\n")
 
     for chrom in chromsInVcf:
         chromName = chrom
         chromSize = chromDic[chrom]
         n+=1
         if (n % 100) == 0:
             print("Have currently run "+str(n)+"/"+str(len(chromsInVcf))+" chroms. Currently running "+chromName)
         if int(chromSize) < 1000000: #Don't worry about splitting if chrom is less than 1M bp
             bash("vai.pl --variantLimit=200000000 --hgVai=/usr/local/apache/cgi-bin/hgVai --position=%s:0-%s --geneTrack=%s --hgvsG=off --hgvsCN=off \
             --hgvsP=on %s %s >> %s" % (chromName,chromSize,geneTableToUse,dbs,inputFile,outputFile))
         else:
             breakPoints = [.2,.4,.6,.8,1]
             for part in breakPoints:
                 startCoord = round(int(chromSize)*(part-.2))
                 endCoord = round(int(chromSize)*part)
                 if part == breakPoints[0]: #Special exception for start
                     bash("vai.pl --variantLimit=200000000 --hgVai=/usr/local/apache/cgi-bin/hgVai --position=%s:0-%s --geneTrack=%s --hgvsG=off --hgvsCN=off \
                     --hgvsP=on %s %s >> %s" % (chromName,endCoord,geneTableToUse,dbs,inputFile,outputFile))
                 else:
                     bash("vai.pl --variantLimit=200000000 --hgVai=/usr/local/apache/cgi-bin/hgVai --position=%s:%s-%s --geneTrack=%s --hgvsG=off --hgvsCN=off \
                     --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,geneTableToUse,dbs,inputFile,outputFile))
     return(outputFile)
 
 def splitVepFileByChrom(hgVaiResultsFile,workDir):
     """Split the vep file to one file per chrom in order to index entire chrom into a dic and quickly lookup the rsIDs"""
     splitChromDir = workDir+"splitChroms/"
     checkDirMakeDir(splitChromDir)
     vepFile = open(hgVaiResultsFile,"r")
     for line in vepFile:
         if line.startswith("#") or line.startswith("Uploaded"):
             continue
         else:
             #Need to compute itemRGB, aaChange, and ucscClass
             lineContents = line.rstrip().split("\t")
             chrom = lineContents[1].split(":")[0]
             outFile = open(splitChromDir+"evaSnpVaiResults."+chrom+".vep","a")
             outFile.write(line)
             outFile.close()
     vepFile.close()
 
 def fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts,workDir):
     """Build dictionary out of vep results with rsID as index, populate aaChange and ucscClassList"""
     prevName = nameAlt
     #Check if chrom has changed and index current chrom
     if chrom != currentChrom:
         currentVepFile = open(workDir+"splitChroms/evaSnpVaiResults."+chrom+".vep","r")
         currentVepDic = {}
         for entry in currentVepFile:
             entry = entry.rstrip().split("\t")
             rsID = entry[0]
             rsIDAlt = entry[0]+entry[2]
             if rsIDAlt not in currentVepDic.keys():
                 currentVepDic[rsIDAlt] = {}
                 currentVepDic[rsIDAlt]['ucscClassList'] = []
                 currentVepDic[rsIDAlt]['ucscClassList'].append(entry[6])
                 currentVepDic[rsIDAlt]['aaChange'] = []
                 if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                     currentVepDic[rsIDAlt]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
             else:
                 if entry[6] not in currentVepDic[rsIDAlt]['ucscClassList']:
                     currentVepDic[rsIDAlt]['ucscClassList'].append(entry[6])
                 if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                     if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsIDAlt]['aaChange']:
                         currentVepDic[rsIDAlt]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
 
             if rsID not in currentVepDic.keys():
                 currentVepDic[rsID] = {}
                 currentVepDic[rsID]['ucscClassList'] = []
                 currentVepDic[rsID]['ucscClassList'].append(entry[6])
                 currentVepDic[rsID]['aaChange'] = []
                 if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                     currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
             else:
                 if entry[6] not in currentVepDic[rsID]['ucscClassList']:
                     currentVepDic[rsID]['ucscClassList'].append(entry[6])
                 if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                     if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsID]['aaChange']:
                         currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
 
         currentChrom=chrom
         currentVepFile.close()
 
     try:
         ucscClass = ",".join(currentVepDic[nameAlt]['ucscClassList'])
         aaChange = " ".join(currentVepDic[nameAlt]['aaChange'])
     except:
         tryAttempts+=1
         ucscClass = ",".join(currentVepDic[name]['ucscClassList'])
         aaChange = " ".join(currentVepDic[name]['aaChange'])
 
     return(prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts)
 
 def assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt):
     """Assign color based on most serious hgVai predicted consequence"""
 
     redVars = ['exon_loss_variant','frameshift_variant','inframe_deletion','inframe_insertion','initiator_codon_variant',
                'missense_variant','splice_acceptor_variant','splice_donor_variant','splice_region_variant','stop_gained',
                'stop_lost','coding_sequence_variant','transcript_ablation']
     greenVars = ['synonymous_variant','stop_retained_variant']
     blueVars = ['5_prime_UTR_variant','3_prime_UTR_variant','complex_transcript_variant',
                 'non_coding_transcript_exon_variant']
     blackVars = ['upstream_gene_variant','downstream_gene_variant','intron_variant','intergenic_variant',
                  'NMD_transcript_variant','no_sequence_alteration']
 
     try:
         if any(item in currentVepDic[nameAlt]['ucscClassList'] for item in redVars):
             itemRgb = '255,0,0'
         elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in greenVars):
             itemRgb = '0,128,0'
         elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in blueVars):
             itemRgb = '0,0,255'
         elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in blackVars):
             itemRgb = '0,0,0'
         else:
             sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
     except:
         if any(item in currentVepDic[name]['ucscClassList'] for item in redVars):
             itemRgb = '255,0,0'
         elif any(item in currentVepDic[name]['ucscClassList'] for item in greenVars):
             itemRgb = '0,128,0'
         elif any(item in currentVepDic[name]['ucscClassList'] for item in blueVars):
             itemRgb = '0,0,255'
         elif any(item in currentVepDic[name]['ucscClassList'] for item in blackVars):
             itemRgb = '0,0,0'
         else:
             sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
 
 
     return(itemRgb)
 
 def convertSOnumberToName(varClass):
     soDic={"SO:0001483": 'substitution', "SO:0000667": 'insertion', "SO:0000159": 'deletion',
            "SO:0001059": 'sequence alteration',"SO:1000032": 'delins',"SO:0002007": 'MNV', "SO:0000705": 'tandem_repeat'}
     varClass = soDic[varClass]
     return(varClass)
 
 def buildDuplicateRsIDset(currentFileName):
     """Look for multi-allelic variants to then try and more accurately match those hgVai results"""
     duplicateSet = set()
     inputFile = open(currentFileName,"r")
     prevName = False
     for line in inputFile:
         if line.startswith("#") or line.startswith("Uploaded"):
             continue
         else:
             lineContents = line.rstrip().split("\t")
             name = lineContents[3]
             if prevName == name:
                 duplicateSet.add(name)
             prevName = name
     print("Total number of multi-allelic variants: "+str(len(duplicateSet)))
     return(duplicateSet)
 
 def createFinalBedWithVepAnnotations(currentFileName,workDir):
     """Parse bed file and output final file without FILTER field, and with SO name, item RGB, and aaChange/ucscClass from hgVai"""
 
     inputFile = open(currentFileName,"r")
     outputFile = open(workDir+"evaSnp.final.bed","w")
     prevName = ""
     currentChrom = ""
     currentVepDic = {}
     n=0
     tryAttempts=0
     duplicateSet = buildDuplicateRsIDset(currentFileName)
 
     for line in inputFile:
         n+=1
         if n%5000000 == 0:
             print("Current line count: "+str(n))
         if line.startswith("#") or line.startswith("Uploaded"):
             continue
         else:
         #Desired schema chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt varClass submitters ucscClass aaChange
         #Need to compute itemRGB, aaChange, and ucscClass. Also need to change SO terms
 
             lineContents = line.rstrip().split("\t")
             chrom = lineContents[0]
             chromStart = lineContents[1]
             chromEnd = lineContents[2]
             name = lineContents[3]
             ref = lineContents[9]
             alt = lineContents[10]
             varClass = lineContents[12]
             submitters = lineContents[13]
 
             #Convert from SO term to name. e.g. SO:0001483 to substitution
             varClass = convertSOnumberToName(varClass)
 
             #Check if is the same rsID, they are repeated in ~5% of file
             if prevName == name:
                 print("Repeated prevName: "+prevName)
                 outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                  itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
             elif name in duplicateSet:
                 #Find the aaChange and ucscClass
                 #Try and match the allele with what hgVai outputs
                 if varClass == "insertion":
                     if ',' in alt:
                         nameAlt = name+alt.split(",")[0][1:]
                     elif len(alt)-1 >= 59:
                         nameAlt = name+alt[len(ref):25]+"..."+alt[len(alt)-24:]+"("+str(len(alt)-1)+" nt)"
                     else:
                         nameAlt = name+alt[1:]
                 elif varClass == "deletion":
                     nameAlt = name+"-"
                 elif varClass == "tandem_repeat":
                     if len(alt) > len(ref):
                         nameAlt = name+alt[len(ref):]
                     else:
                         nameAlt = name+"-"
                 else:
                     nameAlt = name+alt
 
                 prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts,workDir)
                 itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt)
                 outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                  itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
             else:
                 nameAlt = name
                 prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts,workDir)
                 itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt)
                 outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                  itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
 
     print("Total number of try attempts = "+str(tryAttempts))
     print("Total number of lines in output file = "+str(n))
     inputFile.close()
     outputFile.close()
     
 def validateFinalFile(workDir,url,dbs):
     """Compare and make sure that the final bed rsID# matches the input file"""
     originalDownloadFileName = workDir+url.split("/")[-1]
     bedFile = workDir+"evaSnp.final.bed"
     originalRsIDCount = bash("zcat %s | grep -v '^#' | cut -f3 | sort | uniq | wc -l" % (originalDownloadFileName))
     finalBedRsIDCount = bash("cut -f4 %s | sort | uniq | wc -l" % (bedFile))
     #Skip the check for galGal5 which is known to have troublesome chrMT name
     if dbs != "galGal5" or dbs != "bosTau9":
         if originalRsIDCount != finalBedRsIDCount:
             sys.exit("There was an error in the pipeline, the initial numer of uniq RS IDs does not match the final bed file.\n\n"\
     "%s - %s%s - %s" % (originalDownloadFileName,str(originalRsIDCount),bedFile,str(finalBedRsIDCount)))
     finalBedLineCount = bash("cat %s | wc -l" % (bedFile))
     finalBedUniqueLineCount = bash("cat %s | sort | uniq | wc -l" % (bedFile))
     if finalBedLineCount != finalBedUniqueLineCount:
         sys.exit("There are duplicate entries in the file: "+str(finalBedLineCount)+" vs "+str(finalBedUniqueLineCount))
 
 def createBigBed(workDir,dbs,evaSnpVersionNumber):
     bedFile = workDir+"evaSnp.final.bed"
     bash("bedSort %s %s" % (bedFile,bedFile))
     bash("bedToBigBed -tab -as=$HOME/kent/src/hg/lib/evaSnp.as -type=bed9+6 -extraIndex=name \
     %s http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes %sevaSnp%s.bb" % (bedFile,dbs,dbs,workDir,evaSnpVersionNumber))
 
 def chechIfDbsDone(key,evaSnpVersionNumber):
     if os.path.exists("/hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/evaSnp"+evaSnpVersionNumber+".bb"):
         print("DBS already completed: "+key)
         return(True)
     else:
         if os.path.exists("/hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/evaSnps.ucscChroms.vcf") or os.path.exists("/hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/evaSnps.ucscChroms.vcf.gz"):
             print("Cleaning up directory for previous attempt of "+key)
             bash("rm /hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/evaSnps*")
             if os.path.exists("/hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/splitChroms"):
                 try:
                     bash("rm /hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/splitChroms/*")
                 except:
                     print("Directly cleaned")
         return(False)
 
 def findFileUrl(GCAacc,evaSnpVersionNumber,dbs):
     dirStructure = bash("curl https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_5/by_assembly/"+GCAacc+"/ | grep '[DIR]'")
     #Loops to pick correct species name files since Ensembl began to add multiple species per GCA in eva5
     if dbs == 'oviAri3':
         speciesName = 'ovis_orientalis/'
     elif dbs == 'bosTau6':
         speciesName = 'bos_indicus_x_bos_taurus/'
     elif dbs == 'bosTau9':
         speciesName = 'bos_taurus/'
     else:
         speciesName = dirStructure.split("href")[6].split('"')[1]
     urlToFileDir = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_"+evaSnpVersionNumber+"/by_assembly/"+GCAacc+"/"+speciesName
     fileDir = bash("curl -L "+urlToFileDir+" | grep "+GCAacc)
     filespeciesNameDirHref = fileDir.split("href")
     fileName = filespeciesNameDirHref[2].split('"')[1]
     urlToFile = urlToFileDir+fileName
     return(urlToFile)
 
 def buildEvaSnpTrack(ucscDatabaseName,ucscEvaSnpDbs,evaSnpVersionNumber):
     """Build evaSnp track from start to finish"""
     dbs = ucscDatabaseName
     GCAacc = ucscEvaSnpDbs[dbs]
     workDir = "/hive/data/outside/eva"+evaSnpVersionNumber+"/"+dbs+"/"
     urlToFile = findFileUrl(GCAacc,evaSnpVersionNumber,dbs)
     checkDirMakeDir(workDir)
     fileName = wgetFile(urlToFile, workDir)
     fileName = workDir+urlToFile.split("/")[-1]
     print("Got file: ",dbs)
     currentFileName = checkDupsAndUnzip(fileName,workDir,ucscDatabaseName)
     print("File unzipped")
     try:
         currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
         print("Chroms converted")
     except:
         try:
             sanitizeChromNames(currentFileName,dbs,workDir)
             print("Chroms sanitized")
             currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
             print("Chroms converted")
         except:
             currentFileName = addChrToChromNumbers(fileName,currentFileName,workDir)
             print("Chroms completed after adding '1' to starts.")
     currentFileName = convertVcfToBed(currentFileName,workDir)
     print("File converted to bed")
     hgVaiResultsFile = splitChromsAndRunHgVai(workDir,dbs)
     print("hgVai done")
     splitVepFileByChrom(hgVaiResultsFile,workDir)
     print("hgVai file split")
     currentFileName = workDir+"evaSnp.bed"
     createFinalBedWithVepAnnotations(currentFileName,workDir)
     print("Validating file")
     validateFinalFile(workDir,urlToFile,dbs)
     print("Created final file")
     createBigBed(workDir,dbs,evaSnpVersionNumber)
     print("bigBed made")
     
 def main(mode):
     #Run main with either "all" or a specific UCSC database name
     #Find all the assemblies we have with matching evaSnp data
     evaSnpVersionNumber = "5"
     assemblyAlias = findAllAccFromDescPages()
     genbankSummaryDic = buildGCFdicFromGenbankFile(evaSnpVersionNumber)
     assemblyAlias = findGCAaccForUCSCgcfDbs(assemblyAlias,genbankSummaryDic)
     ucscGCAlist = sanitizeListToOnlyUCSCdbsWithGCA(assemblyAlias)
     ucscEvaSnpDbs = buildFinalDicOfUCSCdbsWithEvaSnpData(ucscGCAlist,evaSnpVersionNumber)
     #Build evaSNP data for either one specific assembly, or all assemblies
     if mode == 'all':
         for key in ucscEvaSnpDbs:
             if not chechIfDbsDone(key,evaSnpVersionNumber):
                 try:
                     buildEvaSnpTrack(key,ucscEvaSnpDbs,evaSnpVersionNumber)
                 except:
                     print("Problem with DBS: "+key)
                     continue
     else:
         chechIfDbsDone(mode,evaSnpVersionNumber)
         buildEvaSnpTrack(mode,ucscEvaSnpDbs,evaSnpVersionNumber)
 
 main("all")
 
 #####################################
 ######### End of python3 code ######
 #####################################
 
 # Lastly links were made with a file containing the 35 assemblies:
 
 for dbs in $(cat /hive/data/outside/eva5/assemblyReleaseList.txt);
 do
 ln -s /hive/data/outside/eva5/$dbs/evaSnp5.bb /gbdb/$dbs/bbi/;
 done