754be93f931254dfaf897d382a7b6847a2bbaa39 lrnassar Tue May 3 14:50:47 2022 -0700
Adding the updated evaSnp3 pipeline which now includes canFam3 and has better handling of multi-allelic variants. Refs #29031

diff --git src/hg/makeDb/doc/evaSnp3.txt src/hg/makeDb/doc/evaSnp3.txt
index d16f606..c46f320 100644
--- src/hg/makeDb/doc/evaSnp3.txt
+++ src/hg/makeDb/doc/evaSnp3.txt
@@ -1,377 +1,830 @@
# Track for EVA snp release 3, which was released 2/24/2022 - https://www.ebi.ac.uk/eva/?RS-Release&releaseVersion=3
# Tracks built by Lou on 4/19/2022
# Assemblies were the following: mm39, danRer11, bosTau9, dm6, mm10, galGal6, susScr11, rn6,
# rn7, danRer10, equCab3, rheMac10, macFas5, oviAri4, felCat9, galGal5
# The GCA accessions in the EVA release by-assembly list (https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/)
# were compared against the genome accession of each of our dbs with at least 100 hits/month to create that list.
# All assemblies were run through the python pipeline described below. 3 assemblies required some
# variants to be removed (danRer11, dm6, oviAri4) due to issues with hgVai, see RM #29262.
# Those variants were removed after the initial data download and are noted in the code comments.
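
# A rough sketch of that comparison step (illustration only; dbToGca.tsv and evaByAssembly.txt
# are hypothetical stand-ins for the monthly-hits report and a saved listing of the EVA
# by_assembly FTP directory):
#
#   evaAccessions = set()
#   for line in open("evaByAssembly.txt"):      # one GCA_* accession per line
#       if line.startswith("GCA_"):
#           evaAccessions.add(line.strip().rstrip("/"))
#   for line in open("dbToGca.tsv"):            # db <tab> GCA accession <tab> monthly hits
#       db, gca, hits = line.rstrip("\n").split("\t")
#       if int(hits) >= 100 and gca in evaAccessions:
#           print(db, gca)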
"https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_003339765.3/GCA_003339765.3_current_ids.vcf.gz" # dbs = "rheMac10" # macFas5 - Completes # url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000364345.1/GCA_000364345.1_current_ids.vcf.gz" # dbs = "macFas5" # oviAri4 - Completes if removed rs1089233242 # url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000298735.2/GCA_000298735.2_current_ids.vcf.gz" # dbs = "oviAri4" # felCat9 - Completes # url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000181335.4/GCA_000181335.4_current_ids.vcf.gz" # dbs = "felCat9" # galGal5 - Completes # url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.3/GCA_000002315.3_current_ids.vcf.gz" # dbs = "galGal5" workDir = "/hive/data/outside/eva/"+dbs if workDir[-1] != "/": workDir=workDir+"/" def checkDirMakeDir(workDir): if not os.path.isdir(workDir): os.mkdir(workDir) def bash(cmd): """Run the cmd in bash subprocess""" try: rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) bashStdoutt = rawBashOutput.stdout except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) return(bashStdoutt) def wgetFile(url, workDir): bash("wget %s -P %s" % (url, workDir)) fileName = workDir+url.split("/")[-1] return(fileName) def checkDupsAndUnzip(fileName,workDir): """Check for duplicate lines (common in EVA), and unzip file""" lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip() uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip() currentFileName = workDir+"evaSnps.vcf" if lineCount != uniqCount: print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." 

workDir = "/hive/data/outside/eva/"+dbs
if workDir[-1] != "/":
    workDir=workDir+"/"

def checkDirMakeDir(workDir):
    if not os.path.isdir(workDir):
        os.mkdir(workDir)

def bash(cmd):
    """Run the cmd in bash subprocess"""
    try:
        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
        bashStdoutt = rawBashOutput.stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' returned with error (code {}): {}".format(e.cmd, e.returncode, e.output))
    return(bashStdoutt)

def wgetFile(url, workDir):
    bash("wget %s -P %s" % (url, workDir))
    fileName = workDir+url.split("/")[-1]
    return(fileName)

def checkDupsAndUnzip(fileName,workDir):
    """Check for duplicate lines (common in EVA), and unzip file"""
    lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip()
    uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip()
    currentFileName = workDir+"evaSnps.vcf"
    if lineCount != uniqCount:
        print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." % (lineCount, uniqCount))
        bash("zcat %s | uniq > %s" % (fileName,currentFileName))
    else:
        bash("zcat %s > %s" % (fileName,currentFileName))
    return(currentFileName)

def sanitizeChromNames(currentFileName):
    """Swaps the chrom IDs with the accession names which are indexed in chromAlias"""
    inputFile = open(currentFileName,'r')
    outputFile = open(currentFileName+".fixed","w")
    IDtoAccessionDic = {}

    for line in inputFile:
        if line.startswith("##contig"):
            ID = line.split('=')[2].split(',')[0]
            accession = line.split('=')[3].split('"')[1]
            IDtoAccessionDic[ID] = accession
            outputFile.write(line)
        elif line.startswith("#"):
            outputFile.write(line)
        else:
            line = line.split("\t")
            line[0] = IDtoAccessionDic[line[0]]
            line = "\t".join(line)
            outputFile.write(line)
    inputFile.close()
    outputFile.close()
    #Replacing the same name because chromAlias should be updated soon and this function will be removed
    bash("mv %s.fixed %s" % (currentFileName,currentFileName))

def convertChromToUcsc(currentFileName,dbs,workDir):
    """Convert to UCSC-style chroms"""
    bash("chromToUcsc --get %s" % (dbs))
    bash("chromToUcsc -i %s -o %sevaSnps.ucscChroms.vcf -a %s" % (currentFileName,workDir,dbs+".chromAlias.tsv"))
    bash("rm %s" % (dbs+".chromAlias.tsv"))
    currentFileName = workDir+"evaSnps.ucscChroms.vcf"
    return(currentFileName)

def convertVcfToBed(currentFileName,workDir):
    """Convert to bed keeping only VC and SID fields"""
    bash("bgzip %s" % (currentFileName))
    bash("tabix -p vcf %s" % (currentFileName+".gz"))
    bash("vcfToBed %s %sevaSnp -fields=VC,SID" % (currentFileName+".gz",workDir))
    currentFileName = workDir+"evaSnp.bed"
    return(currentFileName)

def splitChromsAndRunHgVai(workDir,dbs):
    """Split all the chroms in tenths in order to be able to run hgVai without running out of memory"""
    chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n")
    inputFile = workDir+"evaSnps.ucscChroms.vcf.gz"
    outputFile = workDir+"evaSnps"+dbs+"VaiResults.vep"
    for chrom in chromSizes[1:-1]:
        chrom = chrom.split("\t")
        chromName = chrom[0]
        print("Running "+chromName)
        if int(chrom[1]) < 1000000: #Don't worry about splitting if chrom is less than 1M bp
            if dbs == 'mm39':
                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
                     --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
            else:
                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
                     --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
        else:
            breakPoints = [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
            for part in breakPoints:
                startCoord = round(int(chrom[1])*(part-.1))+1
                endCoord = round(int(chrom[1])*part)
                if part == breakPoints[0]: #Special exception for start
                    if dbs == 'mm39':
                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
                             --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
                    else:
                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
                             --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
                else:
                    if dbs == 'mm39':
                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
                             --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))
                    else:
                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
                             --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))
    return(outputFile)

def splitVepFileByChrom(hgVaiResultsFile,workDir):
    """Split the vep file to one file per chrom in order to index entire chrom into a dic and quickly lookup the rsIDs"""
    splitChromDir = workDir+"splitChroms/"
    checkDirMakeDir(splitChromDir)
    vepFile = open(hgVaiResultsFile,"r")
    for line in vepFile:
        if line.startswith("#") or line.startswith("Uploaded"):
            continue
        else:
            #Need to compute itemRGB, aaChange, and ucscClass
            lineContents = line.rstrip().split("\t")
            chrom = lineContents[1].split(":")[0]
            outFile = open(splitChromDir+"evaSnpVaiResults."+chrom+".vep","a")
            outFile.write(line)
            outFile.close()
    vepFile.close()

def fetchUcscClassAndAaChange(name,chrom,currentChrom,currentVepDic):
    """Build dictionary out of vep results with rsID as index, populate aaChange and ucscClassList"""
    prevName = name
    #Check if chrom has changed and index current chrom
    if chrom != currentChrom:
        currentVepFile = open(workDir+"splitChroms/evaSnpVaiResults."+chrom+".vep","r")
        currentVepDic = {}
        for entry in currentVepFile:
            entry = entry.rstrip().split("\t")
            rsID = entry[0]
            if rsID not in currentVepDic.keys():
                currentVepDic[rsID] = {}
                currentVepDic[rsID]['ucscClassList'] = []
                currentVepDic[rsID]['ucscClassList'].append(entry[6])
                currentVepDic[rsID]['aaChange'] = []
                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                    currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
            else:
                if entry[6] not in currentVepDic[rsID]['ucscClassList']:
                    currentVepDic[rsID]['ucscClassList'].append(entry[6])
                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                    if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsID]['aaChange']:
                        currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
        currentChrom=chrom
        currentVepFile.close()
    ucscClass = ",".join(currentVepDic[name]['ucscClassList'])
    aaChange = " ".join(currentVepDic[name]['aaChange'])
    return(prevName,currentChrom,currentVepDic,ucscClass,aaChange)

def assignRGBcolorByConsequence(currentVepDic,name):
    """Assign color based on most serious hgVai predicted consequence"""

    redVars = ['exon_loss_variant','frameshift_variant','inframe_deletion','inframe_insertion','initiator_codon_variant',
               'missense_variant','splice_acceptor_variant','splice_donor_variant','splice_region_variant','stop_gained',
               'stop_lost','coding_sequence_variant','transcript_ablation']
    greenVars = ['synonymous_variant','stop_retained_variant']
    blueVars = ['5_prime_UTR_variant','3_prime_UTR_variant','complex_transcript_variant',
                'non_coding_transcript_exon_variant']
    blackVars = ['upstream_gene_variant','downstream_gene_variant','intron_variant','intergenic_variant',
                 'NMD_transcript_variant','no_sequence_alteration']

    if any(item in currentVepDic[name]['ucscClassList'] for item in redVars):
        itemRgb = '255,0,0'
    elif any(item in currentVepDic[name]['ucscClassList'] for item in greenVars):
        itemRgb = '0,128,0'
    elif any(item in currentVepDic[name]['ucscClassList'] for item in blueVars):
        itemRgb = '0,0,255'
    elif any(item in currentVepDic[name]['ucscClassList'] for item in blackVars):
        itemRgb = '0,0,0'
    else:
        sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
    return(itemRgb)

def convertSOnumberToName(varClass):
    soDic={"SO:0001483": 'substitution', "SO:0000667": 'insertion', "SO:0000159": 'deletion',
           "SO:0001059": 'sequence alteration',"SO:1000032": 'delins',"SO:0002007": 'MNV'}
    varClass = soDic[varClass]
    return(varClass)

def createFinalBedWithVepAnnotations(currentFileName,workDir):
    """Parse bed file and output final file without FILTER field, and with SO name, item RGB, and aaChange/ucscClass from hgVai"""

    inputFile = open(currentFileName,"r")
    outputFile = open(workDir+"evaSnp.final.bed","w")
    prevName = ""
    currentChrom = ""
    currentVepDic = {}
    n=0

    for line in inputFile:
        n+=1
        if n%5000000 == 0:
            print("Current line count: "+str(n))
        if line.startswith("#") or line.startswith("Uploaded"):
            continue
        else:
            #Desired schema: chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt varClass submitters ucscClass aaChange
            #Need to compute itemRGB, aaChange, and ucscClass. Also need to change SO terms

            lineContents = line.rstrip().split("\t")
            chrom = lineContents[0]
            chromStart = lineContents[1]
            chromEnd = lineContents[2]
            name = lineContents[3]
            ref = lineContents[9]
            alt = lineContents[10]
            varClass = lineContents[12]
            submitters = lineContents[13]

            #Convert from SO term to name. e.g. SO:0001483 to substitution
            varClass = convertSOnumberToName(varClass)

            #Check if it is the same rsID as the previous line; rsIDs are repeated in ~5% of the file
            if prevName == name:
                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
            else:
                #Find the aaChange and ucscClass
                prevName,currentChrom,currentVepDic,ucscClass,aaChange = fetchUcscClassAndAaChange(name,chrom,currentChrom,currentVepDic)
                itemRgb = assignRGBcolorByConsequence(currentVepDic,name)
                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
    inputFile.close()
    outputFile.close()

def validateFinalFile(workDir,url):
    """Compare and make sure that the final bed rsID count matches the input file"""
    originalDownloadFileName = workDir+url.split("/")[-1]
    bedFile = workDir+"evaSnp.final.bed"
    originalRsIDCount = bash("zcat %s | grep -v '^#' | cut -f3 | sort | uniq | wc -l" % (originalDownloadFileName))
    finalBedRsIDCount = bash("cut -f4 %s | sort | uniq | wc -l" % (bedFile))
    if originalRsIDCount != finalBedRsIDCount:
        sys.exit("There was an error in the pipeline, the initial number of uniq RS IDs does not match the final bed file.\n\n"\
                 "%s - %s\n%s - %s" % (originalDownloadFileName,str(originalRsIDCount),bedFile,str(finalBedRsIDCount)))

def createBigBed(workDir,dbs):
    bedFile = workDir+"evaSnp.final.bed"
    bash("bedSort %s %s" % (bedFile,bedFile))
    bash("bedToBigBed -tab -as=$HOME/kent/src/hg/lib/evaSnp.as -type=bed9+6 -extraIndex=name \
         %s http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes %sevaSnp.bb" % (bedFile,dbs,dbs,workDir))

checkDirMakeDir(workDir)
fileName = wgetFile(url, workDir)
fileName = workDir+url.split("/")[-1]
print("Got file")
currentFileName = checkDupsAndUnzip(fileName,workDir)
print("File unzipped")
try:
    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
    print("Chroms converted")
except:
    sanitizeChromNames(currentFileName)
    print("Chroms sanitized")
    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
    print("Chroms converted")
currentFileName = convertVcfToBed(currentFileName,workDir)
print("File converted to bed")
hgVaiResultsFile = splitChromsAndRunHgVai(workDir,dbs)
print("hgVai done")
splitVepFileByChrom(hgVaiResultsFile,workDir)
print("hgVai file split")
currentFileName = workDir+"evaSnp.bed"
createFinalBedWithVepAnnotations(currentFileName,workDir)
print("Validating file")
file") validateFinalFile(workDir,url) print("Created final file") createBigBed(workDir,dbs) print("bigBed made") ##################################### ######### End of python3 code ###### ##################################### # Lastly links were made with a file containing the 16 assemblies: for dbs in $(cat $1); do ln -s /hive/data/outside/eva/$dbs/evaSnp.bb /gbdb/$dbs/bbi/; done + +####################### + +# On 5/3 the tracks were rebuilt to add better logic handling for multi-allele variants (rsIDs +# that come up multiple times in the file). These IDs now get matched with their unique +# hgVai alts. See #29031 for more info. Entire new updated pipeline included below. +# canFam3 was also added to the pipeline. + +##################################### +######### Start of python3 ######### +##################################### + +import subprocess,sys,argparse,os + +#All databases with monthly hit counts of at least 100 + +# mm39 - Completes +# url = "http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/mus_musculus/GCA_000001635.9/GCA_000001635.9_current_ids.vcf.gz" +# dbs='mm39' + +#danRer11 - Completes if I remove rs499587024 +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.4/GCA_000002035.4_current_ids.vcf.gz" +# dbs='danRer11' + +#bosTau9 - completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002263795.2/GCA_002263795.2_current_ids.vcf.gz" +# dbs="bosTau9" + +#dm6 - Completes if I remove /hive/data/outside/eva/dm6/problematicRSIDs.txt +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001215.4/GCA_000001215.4_current_ids.vcf.gz" +# dbs = "dm6" + +#mm10 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001635.6/GCA_000001635.6_current_ids.vcf.gz" +# dbs = "mm10" + +# #galGal6 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.5/GCA_000002315.5_current_ids.vcf.gz" +# dbs = "galGal6" + +# #susScr11 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000003025.6/GCA_000003025.6_current_ids.vcf.gz" +# dbs = "susScr11" + +# rn6 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001895.4/GCA_000001895.4_current_ids.vcf.gz" +# dbs = "rn6" + +# rn7 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_015227675.2/GCA_015227675.2_current_ids.vcf.gz" +# dbs = "rn7 + +# danRer10 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.3/GCA_000002035.3_current_ids.vcf.gz" +# dbs = "danRer10" + +# equCab3 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002863925.1/GCA_002863925.1_current_ids.vcf.gz" +# dbs = "equCab3" + +# rheMac10 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_003339765.3/GCA_003339765.3_current_ids.vcf.gz" +# dbs = "rheMac10" + +# macFas5 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000364345.1/GCA_000364345.1_current_ids.vcf.gz" +# dbs = "macFas5" + +# oviAri4 - Completes if removed rs1089233242 +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000298735.2/GCA_000298735.2_current_ids.vcf.gz" +# dbs = "oviAri4" + +# felCat9 
+
+#####################################
+######### Start of python3 #########
+#####################################
+
+import subprocess,sys,argparse,os
+
+#All databases with monthly hit counts of at least 100
+
+# mm39 - Completes
+# url = "http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/mus_musculus/GCA_000001635.9/GCA_000001635.9_current_ids.vcf.gz"
+# dbs='mm39'
+
+#danRer11 - Completes if I remove rs499587024
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.4/GCA_000002035.4_current_ids.vcf.gz"
+# dbs='danRer11'
+
+#bosTau9 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002263795.2/GCA_002263795.2_current_ids.vcf.gz"
+# dbs="bosTau9"
+
+#dm6 - Completes if I remove /hive/data/outside/eva/dm6/problematicRSIDs.txt
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001215.4/GCA_000001215.4_current_ids.vcf.gz"
+# dbs = "dm6"
+
+#mm10 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001635.6/GCA_000001635.6_current_ids.vcf.gz"
+# dbs = "mm10"
+
+#galGal6 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.5/GCA_000002315.5_current_ids.vcf.gz"
+# dbs = "galGal6"
+
+#susScr11 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000003025.6/GCA_000003025.6_current_ids.vcf.gz"
+# dbs = "susScr11"
+
+# rn6 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001895.4/GCA_000001895.4_current_ids.vcf.gz"
+# dbs = "rn6"
+
+# rn7 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_015227675.2/GCA_015227675.2_current_ids.vcf.gz"
+# dbs = "rn7"
+
+# danRer10 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.3/GCA_000002035.3_current_ids.vcf.gz"
+# dbs = "danRer10"
+
+# equCab3 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002863925.1/GCA_002863925.1_current_ids.vcf.gz"
+# dbs = "equCab3"
+
+# rheMac10 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_003339765.3/GCA_003339765.3_current_ids.vcf.gz"
+# dbs = "rheMac10"
+
+# macFas5 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000364345.1/GCA_000364345.1_current_ids.vcf.gz"
+# dbs = "macFas5"
+
+# oviAri4 - Completes if rs1089233242 is removed
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000298735.2/GCA_000298735.2_current_ids.vcf.gz"
+# dbs = "oviAri4"
+
+# felCat9 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000181335.4/GCA_000181335.4_current_ids.vcf.gz"
+# dbs = "felCat9"
+
+# galGal5 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.3/GCA_000002315.3_current_ids.vcf.gz"
+# dbs = "galGal5"
+
+# canFam3
+url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz"
+dbs = "canFam3"
+
+workDir = "/hive/data/outside/eva/"+dbs
+if workDir[-1] != "/":
+    workDir=workDir+"/"
+
+def checkDirMakeDir(workDir):
+    if not os.path.isdir(workDir):
+        os.mkdir(workDir)
+
+def bash(cmd):
+    """Run the cmd in bash subprocess"""
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' returned with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return(bashStdoutt)
+
+def wgetFile(url, workDir):
+    bash("wget %s -P %s" % (url, workDir))
+    fileName = workDir+url.split("/")[-1]
+    return(fileName)
+
+def checkDupsAndUnzip(fileName,workDir):
+    """Check for duplicate lines (common in EVA), and unzip file"""
+    lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip()
+    uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip()
+    currentFileName = workDir+"evaSnps.vcf"
+    if lineCount != uniqCount:
+        print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." % (lineCount, uniqCount))
+        bash("zcat %s | uniq > %s" % (fileName,currentFileName))
+    else:
+        bash("zcat %s > %s" % (fileName,currentFileName))
+    return(currentFileName)
+
+def sanitizeChromNames(currentFileName):
+    """Swaps the chrom IDs with the accession names which are indexed in chromAlias"""
+    inputFile = open(currentFileName,'r')
+    outputFile = open(currentFileName+".fixed","w")
+    IDtoAccessionDic = {}
+
+    for line in inputFile:
+        if line.startswith("##contig"):
+            ID = line.split('=')[2].split(',')[0]
+            accession = line.split('=')[3].split('"')[1]
+            IDtoAccessionDic[ID] = accession
+            outputFile.write(line)
+        elif line.startswith("#"):
+            outputFile.write(line)
+        else:
+            line = line.split("\t")
+            line[0] = IDtoAccessionDic[line[0]]
+            line = "\t".join(line)
+            outputFile.write(line)
+    inputFile.close()
+    outputFile.close()
+    #Replacing the same name because chromAlias should be updated soon and this function will be removed
+    bash("mv %s.fixed %s" % (currentFileName,currentFileName))
+
+def convertChromToUcsc(currentFileName,dbs,workDir):
+    """Convert to UCSC-style chroms"""
+    bash("chromToUcsc --get %s" % (dbs))
+    bash("chromToUcsc -i %s -o %sevaSnps.ucscChroms.vcf -a %s" % (currentFileName,workDir,dbs+".chromAlias.tsv"))
+    bash("rm %s" % (dbs+".chromAlias.tsv"))
+    currentFileName = workDir+"evaSnps.ucscChroms.vcf"
+    return(currentFileName)
+
+def convertVcfToBed(currentFileName,workDir):
+    """Convert to bed keeping only VC and SID fields"""
+    bash("bgzip %s" % (currentFileName))
+    bash("tabix -p vcf %s" % (currentFileName+".gz"))
+    bash("vcfToBed %s %sevaSnp -fields=VC,SID" % (currentFileName+".gz",workDir))
+    currentFileName = workDir+"evaSnp.bed"
+    return(currentFileName)
+
+def splitChromsAndRunHgVai(workDir,dbs):
+    """Split all the chroms in tenths in order to be able to run hgVai without running out of memory"""
+    chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n")
+    inputFile = workDir+"evaSnps.ucscChroms.vcf.gz"
+    outputFile = workDir+"evaSnps"+dbs+"VaiResults.vep"
+    for chrom in chromSizes[1:-1]:
+        chrom = chrom.split("\t")
+        chromName = chrom[0]
+        print("Running "+chromName)
+        if int(chrom[1]) < 1000000: #Don't worry about splitting if chrom is less than 1M bp
+            if dbs == 'mm39':
+                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
+                     --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
+            else:
+                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
+                     --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
+        else:
+            breakPoints = [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
+            for part in breakPoints:
+                startCoord = round(int(chrom[1])*(part-.1))+1
+                endCoord = round(int(chrom[1])*part)
+                if part == breakPoints[0]: #Special exception for start
+                    if dbs == 'mm39':
+                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
+                             --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
+                    else:
+                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
+                             --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
+                else:
+                    if dbs == 'mm39':
+                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
+                             --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))
+                    else:
+                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
+                             --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))
+    return(outputFile)
+
+def splitVepFileByChrom(hgVaiResultsFile,workDir):
+    """Split the vep file to one file per chrom in order to index entire chrom into a dic and quickly lookup the rsIDs"""
+    splitChromDir = workDir+"splitChroms/"
+    checkDirMakeDir(splitChromDir)
+    vepFile = open(hgVaiResultsFile,"r")
+    for line in vepFile:
+        if line.startswith("#") or line.startswith("Uploaded"):
+            continue
+        else:
+            #Need to compute itemRGB, aaChange, and ucscClass
+            lineContents = line.rstrip().split("\t")
+            chrom = lineContents[1].split(":")[0]
+            outFile = open(splitChromDir+"evaSnpVaiResults."+chrom+".vep","a")
+            outFile.write(line)
+            outFile.close()
+    vepFile.close()
+
+def fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts):
+    """Build dictionary out of vep results keyed by rsID and by rsID+alt, populate aaChange and ucscClassList"""
+    prevName = nameAlt
+    #Check if chrom has changed and index current chrom
+    if chrom != currentChrom:
+        currentVepFile = open(workDir+"splitChroms/evaSnpVaiResults."+chrom+".vep","r")
+        currentVepDic = {}
+        for entry in currentVepFile:
+            entry = entry.rstrip().split("\t")
+            rsID = entry[0]
+            rsIDAlt = entry[0]+entry[2]
+            #print(rsID)
+            #altAllele = entry[2]
+
+            if rsIDAlt not in currentVepDic.keys():
+                currentVepDic[rsIDAlt] = {}
+                currentVepDic[rsIDAlt]['ucscClassList'] = []
+                currentVepDic[rsIDAlt]['ucscClassList'].append(entry[6])
+                currentVepDic[rsIDAlt]['aaChange'] = []
+                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
+                    currentVepDic[rsIDAlt]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
+            else:
+                if entry[6] not in currentVepDic[rsIDAlt]['ucscClassList']:
+                    currentVepDic[rsIDAlt]['ucscClassList'].append(entry[6])
+                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
+                    if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsIDAlt]['aaChange']:
+                        currentVepDic[rsIDAlt]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
+
+            if rsID not in currentVepDic.keys():
+                currentVepDic[rsID] = {}
+                currentVepDic[rsID]['ucscClassList'] = []
+                currentVepDic[rsID]['ucscClassList'].append(entry[6])
+                currentVepDic[rsID]['aaChange'] = []
+                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
+                    currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
+            else:
+                if entry[6] not in currentVepDic[rsID]['ucscClassList']:
+                    currentVepDic[rsID]['ucscClassList'].append(entry[6])
+                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
+                    if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsID]['aaChange']:
+                        currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
+
+        currentChrom=chrom
+        currentVepFile.close()
+
+    try:
+        ucscClass = ",".join(currentVepDic[nameAlt]['ucscClassList'])
+        aaChange = " ".join(currentVepDic[nameAlt]['aaChange'])
+    except:
+        tryAttempts+=1
+        ucscClass = ",".join(currentVepDic[name]['ucscClassList'])
+        aaChange = " ".join(currentVepDic[name]['aaChange'])
+
+    return(prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts)
+
+def assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt):
+    """Assign color based on most serious hgVai predicted consequence"""
+
+    redVars = ['exon_loss_variant','frameshift_variant','inframe_deletion','inframe_insertion','initiator_codon_variant',
+               'missense_variant','splice_acceptor_variant','splice_donor_variant','splice_region_variant','stop_gained',
+               'stop_lost','coding_sequence_variant','transcript_ablation']
+    greenVars = ['synonymous_variant','stop_retained_variant']
+    blueVars = ['5_prime_UTR_variant','3_prime_UTR_variant','complex_transcript_variant',
+                'non_coding_transcript_exon_variant']
+    blackVars = ['upstream_gene_variant','downstream_gene_variant','intron_variant','intergenic_variant',
+                 'NMD_transcript_variant','no_sequence_alteration']
+
+    try:
+        if any(item in currentVepDic[nameAlt]['ucscClassList'] for item in redVars):
+            itemRgb = '255,0,0'
+        elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in greenVars):
+            itemRgb = '0,128,0'
+        elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in blueVars):
+            itemRgb = '0,0,255'
+        elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in blackVars):
+            itemRgb = '0,0,0'
+        else:
+            sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
+    except:
+        if any(item in currentVepDic[name]['ucscClassList'] for item in redVars):
+            itemRgb = '255,0,0'
+        elif any(item in currentVepDic[name]['ucscClassList'] for item in greenVars):
+            itemRgb = '0,128,0'
+        elif any(item in currentVepDic[name]['ucscClassList'] for item in blueVars):
+            itemRgb = '0,0,255'
+        elif any(item in currentVepDic[name]['ucscClassList'] for item in blackVars):
+            itemRgb = '0,0,0'
+        else:
+            sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
+
+    return(itemRgb)
+
+def convertSOnumberToName(varClass):
+    soDic={"SO:0001483": 'substitution', "SO:0000667": 'insertion', "SO:0000159": 'deletion',
+           "SO:0001059": 'sequence alteration',"SO:1000032": 'delins',"SO:0002007": 'MNV', "SO:0000705": 'tandem_repeat'}
+    varClass = soDic[varClass]
+    return(varClass)
+
+def buildDuplicateRsIDset(currentFileName):
+    """Collect rsIDs that appear more than once (multi-allelic variants) so their hgVai results can be matched more accurately"""
+    duplicateSet = set()
+    inputFile = open(currentFileName,"r")
+    prevName = False
+    for line in inputFile:
+        if line.startswith("#") or line.startswith("Uploaded"):
+            continue
+        else:
+            lineContents = line.rstrip().split("\t")
+            name = lineContents[3]
+            if prevName == name:
+                duplicateSet.add(name)
+            prevName = name
+    print("Total number of multi-allelic variants: "+str(len(duplicateSet)))
+    return(duplicateSet)
+
+def createFinalBedWithVepAnnotations(currentFileName,workDir):
+    """Parse bed file and output final file without FILTER field, and with SO name, item RGB, and aaChange/ucscClass from hgVai"""
+
+    inputFile = open(currentFileName,"r")
+    outputFile = open(workDir+"evaSnp.final.bed","w")
+    prevName = ""
+    currentChrom = ""
+    currentVepDic = {}
+    n=0
+    tryAttempts=0
+    duplicateSet = buildDuplicateRsIDset(currentFileName)
+
+    for line in inputFile:
+        n+=1
+        if n%5000000 == 0:
+            print("Current line count: "+str(n))
+        if line.startswith("#") or line.startswith("Uploaded"):
+            continue
+        else:
+            #Desired schema: chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt varClass submitters ucscClass aaChange
+            #Need to compute itemRGB, aaChange, and ucscClass. Also need to change SO terms
+
+            lineContents = line.rstrip().split("\t")
+            chrom = lineContents[0]
+            chromStart = lineContents[1]
+            chromEnd = lineContents[2]
+            name = lineContents[3]
+            ref = lineContents[9]
+            alt = lineContents[10]
+            varClass = lineContents[12]
+            submitters = lineContents[13]
+
+            #Convert from SO term to name. e.g. SO:0001483 to substitution
+            varClass = convertSOnumberToName(varClass)
+
+            #Check if it is the same rsID as the previous line; rsIDs are repeated in ~5% of the file
+            if prevName == name:
+                print("Repeated prevName: "+prevName)
+                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
+                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
+            elif name in duplicateSet:
+                #Find the aaChange and ucscClass
+                #Try to match the allele with what hgVai outputs
+                if varClass == "insertion":
+                    if ',' in alt:
+                        nameAlt = name+alt.split(",")[0][1:]
+                    elif len(alt)-1 >= 59:
+                        nameAlt = name+alt[len(ref):25]+"..."+alt[len(alt)-24:]+"("+str(len(alt)-1)+" nt)"
+                    else:
+                        nameAlt = name+alt[1:]
+                elif varClass == "deletion":
+                    nameAlt = name+"-"
+                elif varClass == "tandem_repeat":
+                    if len(alt) > len(ref):
+                        nameAlt = name+alt[len(ref):]
+                    else:
+                        nameAlt = name+"-"
+                else:
+                    nameAlt = name+alt
+
+                prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts)
+                itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt)
+                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
+                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
+            else:
+                nameAlt = name
+                prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts)
+                itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt)
+                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
+                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
+
+    print("Total number of try attempts = "+str(tryAttempts))
+    print("Total number of lines in output file = "+str(n))
+    inputFile.close()
+    outputFile.close()
+
+def validateFinalFile(workDir,url):
+    """Compare and make sure that the final bed rsID count matches the input file"""
+    originalDownloadFileName = workDir+url.split("/")[-1]
+    bedFile = workDir+"evaSnp.final.bed"
+    originalRsIDCount = bash("zcat %s | grep -v '^#' | cut -f3 | sort | uniq | wc -l" % (originalDownloadFileName))
+    finalBedRsIDCount = bash("cut -f4 %s | sort | uniq | wc -l" % (bedFile))
+    if originalRsIDCount != finalBedRsIDCount:
+        sys.exit("There was an error in the pipeline, the initial number of uniq RS IDs does not match the final bed file.\n\n"\
+                 "%s - %s\n%s - %s" % (originalDownloadFileName,str(originalRsIDCount),bedFile,str(finalBedRsIDCount)))
+
+def createBigBed(workDir,dbs):
+    bedFile = workDir+"evaSnp.final.bed"
+    bash("bedSort %s %s" % (bedFile,bedFile))
+    bash("bedToBigBed -tab -as=$HOME/kent/src/hg/lib/evaSnp.as -type=bed9+6 -extraIndex=name \
+         %s http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes %sevaSnp.bb" % (bedFile,dbs,dbs,workDir))
+
+checkDirMakeDir(workDir)
+fileName = wgetFile(url, workDir)
+fileName = workDir+url.split("/")[-1]
+print("Got file")
+currentFileName = checkDupsAndUnzip(fileName,workDir)
+print("File unzipped")
+try:
+    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
+    print("Chroms converted")
+except:
+    sanitizeChromNames(currentFileName)
+    print("Chroms sanitized")
+    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
+    print("Chroms converted")
+currentFileName = convertVcfToBed(currentFileName,workDir)
+print("File converted to bed")
to bed") +hgVaiResultsFile = splitChromsAndRunHgVai(workDir,dbs) +print("hgVai done") +splitVepFileByChrom(hgVaiResultsFile,workDir) +print("hgVai file split") +currentFileName = workDir+"evaSnp.bed" +createFinalBedWithVepAnnotations(currentFileName,workDir) +print("Validating file") +validateFinalFile(workDir,url) +print("Created final file") +createBigBed(workDir,dbs) +print("bigBed made") + +##################################### +#### End of python3 code ############ +#####################################