754be93f931254dfaf897d382a7b6847a2bbaa39 lrnassar Tue May 3 14:50:47 2022 -0700
Adding the updated evaSnp3 pipeline which now includes canFam3 and has better handling of multi-allelic variants. Refs #29031

diff --git src/hg/makeDb/doc/evaSnp3.txt src/hg/makeDb/doc/evaSnp3.txt
index d16f606..c46f320 100644
--- src/hg/makeDb/doc/evaSnp3.txt
+++ src/hg/makeDb/doc/evaSnp3.txt
@@ -1,377 +1,830 @@
# Track for EVA snp release 3, which was released 2/24/2022 - https://www.ebi.ac.uk/eva/?RS-Release&releaseVersion=3
# Tracks built by Lou on 4/19/2022
# Assemblies were the following: mm39, danRer11, bosTau9, dm6, mm10, galGal6, susScr11, rn6,
# rn7, danRer10, equCab3, rheMac10, macFas5, oviAri4, felCat9, galGal5
# The GCA accessions in the EVA release by-assembly list (https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/)
# were compared against the genome accession of each of our dbs with at least 100 hits/month to create that list.
# All assemblies were run through the python pipeline described below. 3 assemblies required some
# variants to be removed (danRer11, dm6, oviAri4) due to issues with hgVai, see RM #29262.
# Those variants were removed after the initial data download and are noted in the code comments.
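
# A rough sketch of that comparison step (illustration only; dbToGca.tsv and evaByAssembly.txt
# are hypothetical stand-ins for the monthly-hits report and a saved listing of the EVA
# by_assembly FTP directory):
#
#   evaAccessions = set()
#   for line in open("evaByAssembly.txt"):      # one GCA_* accession per line
#       if line.startswith("GCA_"):
#           evaAccessions.add(line.strip().rstrip("/"))
#   for line in open("dbToGca.tsv"):            # db <tab> GCA accession <tab> monthly hits
#       db, gca, hits = line.rstrip("\n").split("\t")
#       if int(hits) >= 100 and gca in evaAccessions:
#           print(db, gca)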
"https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_003339765.3/GCA_003339765.3_current_ids.vcf.gz" # dbs = "rheMac10" # macFas5 - Completes # url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000364345.1/GCA_000364345.1_current_ids.vcf.gz" # dbs = "macFas5" # oviAri4 - Completes if removed rs1089233242 # url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000298735.2/GCA_000298735.2_current_ids.vcf.gz" # dbs = "oviAri4" # felCat9 - Completes # url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000181335.4/GCA_000181335.4_current_ids.vcf.gz" # dbs = "felCat9" # galGal5 - Completes # url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.3/GCA_000002315.3_current_ids.vcf.gz" # dbs = "galGal5" workDir = "/hive/data/outside/eva/"+dbs if workDir[-1] != "/": workDir=workDir+"/" def checkDirMakeDir(workDir): if not os.path.isdir(workDir): os.mkdir(workDir) def bash(cmd): """Run the cmd in bash subprocess""" try: rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) bashStdoutt = rawBashOutput.stdout except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) return(bashStdoutt) def wgetFile(url, workDir): bash("wget %s -P %s" % (url, workDir)) fileName = workDir+url.split("/")[-1] return(fileName) def checkDupsAndUnzip(fileName,workDir): """Check for duplicate lines (common in EVA), and unzip file""" lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip() uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip() currentFileName = workDir+"evaSnps.vcf" if lineCount != uniqCount: print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." 

workDir = "/hive/data/outside/eva/"+dbs
if workDir[-1] != "/":
    workDir=workDir+"/"

def checkDirMakeDir(workDir):
    if not os.path.isdir(workDir):
        os.mkdir(workDir)

def bash(cmd):
    """Run the cmd in bash subprocess"""
    try:
        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
        bashStdoutt = rawBashOutput.stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' returned with error (code {}): {}".format(e.cmd, e.returncode, e.output))
    return(bashStdoutt)

def wgetFile(url, workDir):
    bash("wget %s -P %s" % (url, workDir))
    fileName = workDir+url.split("/")[-1]
    return(fileName)

def checkDupsAndUnzip(fileName,workDir):
    """Check for duplicate lines (common in EVA), and unzip file"""
    lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip()
    uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip()
    currentFileName = workDir+"evaSnps.vcf"
    if lineCount != uniqCount:
        print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." % (lineCount, uniqCount))
        bash("zcat %s | uniq > %s" % (fileName,currentFileName))
    else:
        bash("zcat %s > %s" % (fileName,currentFileName))
    return(currentFileName)

def sanitizeChromNames(currentFileName):
    """Swaps the chrom IDs with the accession names which are indexed in chromAlias"""
    inputFile = open(currentFileName,'r')
    outputFile = open(currentFileName+".fixed","w")
    IDtoAccessionDic = {}

    for line in inputFile:
        if line.startswith("##contig"):
            ID = line.split('=')[2].split(',')[0]
            accession = line.split('=')[3].split('"')[1]
            IDtoAccessionDic[ID] = accession
            outputFile.write(line)
        elif line.startswith("#"):
            outputFile.write(line)
        else:
            line = line.split("\t")
            line[0] = IDtoAccessionDic[line[0]]
            line = "\t".join(line)
            outputFile.write(line)
    inputFile.close()
    outputFile.close()
    #Replacing the same name because chromAlias should be updated soon and this function will be removed
    bash("mv %s.fixed %s" % (currentFileName,currentFileName))

def convertChromToUcsc(currentFileName,dbs,workDir):
    """Convert to UCSC-style chroms"""
    bash("chromToUcsc --get %s" % (dbs))
    bash("chromToUcsc -i %s -o %sevaSnps.ucscChroms.vcf -a %s" % (currentFileName,workDir,dbs+".chromAlias.tsv"))
    bash("rm %s" % (dbs+".chromAlias.tsv"))
    currentFileName = workDir+"evaSnps.ucscChroms.vcf"
    return(currentFileName)

def convertVcfToBed(currentFileName,workDir):
    """Convert to bed keeping only VC and SID fields"""
    bash("bgzip %s" % (currentFileName))
    bash("tabix -p vcf %s" % (currentFileName+".gz"))
    bash("vcfToBed %s %sevaSnp -fields=VC,SID" % (currentFileName+".gz",workDir))
    currentFileName = workDir+"evaSnp.bed"
    return(currentFileName)

def splitChromsAndRunHgVai(workDir,dbs):
    """Split all the chroms in tenths in order to be able to run hgVai without running out of memory"""
    chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n")
    inputFile = workDir+"evaSnps.ucscChroms.vcf.gz"
    outputFile = workDir+"evaSnps"+dbs+"VaiResults.vep"
    for chrom in chromSizes[1:-1]:
        chrom = chrom.split("\t")
        chromName = chrom[0]
        print("Running "+chromName)
        if int(chrom[1]) < 1000000: #Don't worry about splitting if chrom is less than 1M bp
            if dbs == 'mm39':
                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
                     --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
            else:
                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
                     --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
        else:
            breakPoints = [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
            for part in breakPoints:
                startCoord = round(int(chrom[1])*(part-.1))+1
                endCoord = round(int(chrom[1])*part)
                if part == breakPoints[0]: #Special exception for start
                    if dbs == 'mm39':
                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
                             --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
                    else:
                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
                             --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
                else:
                    if dbs == 'mm39':
                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
                             --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))
                    else:
                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
                             --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))
    return(outputFile)

def splitVepFileByChrom(hgVaiResultsFile,workDir):
    """Split the vep file to one file per chrom in order to index entire chrom into a dic and quickly lookup the rsIDs"""
    splitChromDir = workDir+"splitChroms/"
    checkDirMakeDir(splitChromDir)
    vepFile = open(hgVaiResultsFile,"r")
    for line in vepFile:
        if line.startswith("#") or line.startswith("Uploaded"):
            continue
        else:
            #Need to compute itemRGB, aaChange, and ucscClass
            lineContents = line.rstrip().split("\t")
            chrom = lineContents[1].split(":")[0]
            outFile = open(splitChromDir+"evaSnpVaiResults."+chrom+".vep","a")
            outFile.write(line)
            outFile.close()
    vepFile.close()

def fetchUcscClassAndAaChange(name,chrom,currentChrom,currentVepDic):
    """Build dictionary out of vep results with rsID as index, populate aaChange and ucscClassList"""
    prevName = name
    #Check if chrom has changed and index current chrom
    if chrom != currentChrom:
        currentVepFile = open(workDir+"splitChroms/evaSnpVaiResults."+chrom+".vep","r")
        currentVepDic = {}
        for entry in currentVepFile:
            entry = entry.rstrip().split("\t")
            rsID = entry[0]
            if rsID not in currentVepDic.keys():
                currentVepDic[rsID] = {}
                currentVepDic[rsID]['ucscClassList'] = []
                currentVepDic[rsID]['ucscClassList'].append(entry[6])
                currentVepDic[rsID]['aaChange'] = []
                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                    currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
            else:
                if entry[6] not in currentVepDic[rsID]['ucscClassList']:
                    currentVepDic[rsID]['ucscClassList'].append(entry[6])
                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                    if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsID]['aaChange']:
                        currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
        currentChrom=chrom
        currentVepFile.close()
    ucscClass = ",".join(currentVepDic[name]['ucscClassList'])
    aaChange = " ".join(currentVepDic[name]['aaChange'])
    return(prevName,currentChrom,currentVepDic,ucscClass,aaChange)

def assignRGBcolorByConsequence(currentVepDic,name):
    """Assign color based on most serious hgVai predicted consequence"""

    redVars = ['exon_loss_variant','frameshift_variant','inframe_deletion','inframe_insertion','initiator_codon_variant',
               'missense_variant','splice_acceptor_variant','splice_donor_variant','splice_region_variant','stop_gained',
               'stop_lost','coding_sequence_variant','transcript_ablation']
    greenVars = ['synonymous_variant','stop_retained_variant']
    blueVars = ['5_prime_UTR_variant','3_prime_UTR_variant','complex_transcript_variant',
                'non_coding_transcript_exon_variant']
    blackVars = ['upstream_gene_variant','downstream_gene_variant','intron_variant','intergenic_variant',
                 'NMD_transcript_variant','no_sequence_alteration']

    if any(item in currentVepDic[name]['ucscClassList'] for item in redVars):
        itemRgb = '255,0,0'
    elif any(item in currentVepDic[name]['ucscClassList'] for item in greenVars):
        itemRgb = '0,128,0'
    elif any(item in currentVepDic[name]['ucscClassList'] for item in blueVars):
        itemRgb = '0,0,255'
    elif any(item in currentVepDic[name]['ucscClassList'] for item in blackVars):
        itemRgb = '0,0,0'
    else:
        sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
    return(itemRgb)

def convertSOnumberToName(varClass):
    soDic={"SO:0001483": 'substitution', "SO:0000667": 'insertion', "SO:0000159": 'deletion',
           "SO:0001059": 'sequence alteration',"SO:1000032": 'delins',"SO:0002007": 'MNV'}
    varClass = soDic[varClass]
    return(varClass)

def createFinalBedWithVepAnnotations(currentFileName,workDir):
    """Parse bed file and output final file without FILTER field, and with SO name, item RGB, and aaChange/ucscClass from hgVai"""

    inputFile = open(currentFileName,"r")
    outputFile = open(workDir+"evaSnp.final.bed","w")
    prevName = ""
    currentChrom = ""
    currentVepDic = {}
    n=0

    for line in inputFile:
        n+=1
        if n%5000000 == 0:
            print("Current line count: "+str(n))
        if line.startswith("#") or line.startswith("Uploaded"):
            continue
        else:
            #Desired schema: chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt varClass submitters ucscClass aaChange
            #Need to compute itemRGB, aaChange, and ucscClass. Also need to change SO terms

            lineContents = line.rstrip().split("\t")
            chrom = lineContents[0]
            chromStart = lineContents[1]
            chromEnd = lineContents[2]
            name = lineContents[3]
            ref = lineContents[9]
            alt = lineContents[10]
            varClass = lineContents[12]
            submitters = lineContents[13]

            #Convert from SO term to name. e.g. SO:0001483 to substitution
            varClass = convertSOnumberToName(varClass)

            #Check if it is the same rsID as the previous line; rsIDs are repeated in ~5% of the file
            if prevName == name:
                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
            else:
                #Find the aaChange and ucscClass
                prevName,currentChrom,currentVepDic,ucscClass,aaChange = fetchUcscClassAndAaChange(name,chrom,currentChrom,currentVepDic)
                itemRgb = assignRGBcolorByConsequence(currentVepDic,name)
                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
    inputFile.close()
    outputFile.close()

def validateFinalFile(workDir,url):
    """Compare and make sure that the final bed rsID count matches the input file"""
    originalDownloadFileName = workDir+url.split("/")[-1]
    bedFile = workDir+"evaSnp.final.bed"
    originalRsIDCount = bash("zcat %s | grep -v '^#' | cut -f3 | sort | uniq | wc -l" % (originalDownloadFileName))
    finalBedRsIDCount = bash("cut -f4 %s | sort | uniq | wc -l" % (bedFile))
    if originalRsIDCount != finalBedRsIDCount:
        sys.exit("There was an error in the pipeline, the initial number of uniq RS IDs does not match the final bed file.\n\n"\
                 "%s - %s\n%s - %s" % (originalDownloadFileName,str(originalRsIDCount),bedFile,str(finalBedRsIDCount)))

def createBigBed(workDir,dbs):
    bedFile = workDir+"evaSnp.final.bed"
    bash("bedSort %s %s" % (bedFile,bedFile))
    bash("bedToBigBed -tab -as=$HOME/kent/src/hg/lib/evaSnp.as -type=bed9+6 -extraIndex=name \
         %s http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes %sevaSnp.bb" % (bedFile,dbs,dbs,workDir))

checkDirMakeDir(workDir)
fileName = wgetFile(url, workDir)
fileName = workDir+url.split("/")[-1]
print("Got file")
currentFileName = checkDupsAndUnzip(fileName,workDir)
print("File unzipped")
try:
    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
    print("Chroms converted")
except:
    sanitizeChromNames(currentFileName)
    print("Chroms sanitized")
    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
    print("Chroms converted")
currentFileName = convertVcfToBed(currentFileName,workDir)
print("File converted to bed")
hgVaiResultsFile = splitChromsAndRunHgVai(workDir,dbs)
print("hgVai done")
splitVepFileByChrom(hgVaiResultsFile,workDir)
print("hgVai file split")
currentFileName = workDir+"evaSnp.bed"
createFinalBedWithVepAnnotations(currentFileName,workDir)
print("Validating file")
file") validateFinalFile(workDir,url) print("Created final file") createBigBed(workDir,dbs) print("bigBed made") ##################################### ######### End of python3 code ###### ##################################### # Lastly links were made with a file containing the 16 assemblies: for dbs in $(cat $1); do ln -s /hive/data/outside/eva/$dbs/evaSnp.bb /gbdb/$dbs/bbi/; done + +####################### + +# On 5/3 the tracks were rebuilt to add better logic handling for multi-allele variants (rsIDs +# that come up multiple times in the file). These IDs now get matched with their unique +# hgVai alts. See #29031 for more info. Entire new updated pipeline included below. +# canFam3 was also added to the pipeline. + +##################################### +######### Start of python3 ######### +##################################### + +import subprocess,sys,argparse,os + +#All databases with monthly hit counts of at least 100 + +# mm39 - Completes +# url = "http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/mus_musculus/GCA_000001635.9/GCA_000001635.9_current_ids.vcf.gz" +# dbs='mm39' + +#danRer11 - Completes if I remove rs499587024 +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.4/GCA_000002035.4_current_ids.vcf.gz" +# dbs='danRer11' + +#bosTau9 - completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002263795.2/GCA_002263795.2_current_ids.vcf.gz" +# dbs="bosTau9" + +#dm6 - Completes if I remove /hive/data/outside/eva/dm6/problematicRSIDs.txt +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001215.4/GCA_000001215.4_current_ids.vcf.gz" +# dbs = "dm6" + +#mm10 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001635.6/GCA_000001635.6_current_ids.vcf.gz" +# dbs = "mm10" + +# #galGal6 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.5/GCA_000002315.5_current_ids.vcf.gz" +# dbs = "galGal6" + +# #susScr11 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000003025.6/GCA_000003025.6_current_ids.vcf.gz" +# dbs = "susScr11" + +# rn6 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001895.4/GCA_000001895.4_current_ids.vcf.gz" +# dbs = "rn6" + +# rn7 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_015227675.2/GCA_015227675.2_current_ids.vcf.gz" +# dbs = "rn7 + +# danRer10 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.3/GCA_000002035.3_current_ids.vcf.gz" +# dbs = "danRer10" + +# equCab3 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002863925.1/GCA_002863925.1_current_ids.vcf.gz" +# dbs = "equCab3" + +# rheMac10 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_003339765.3/GCA_003339765.3_current_ids.vcf.gz" +# dbs = "rheMac10" + +# macFas5 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000364345.1/GCA_000364345.1_current_ids.vcf.gz" +# dbs = "macFas5" + +# oviAri4 - Completes if removed rs1089233242 +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000298735.2/GCA_000298735.2_current_ids.vcf.gz" +# dbs = "oviAri4" + +# felCat9 
+
+#####################################
+######### Start of python3 #########
+#####################################
+
+import subprocess,sys,argparse,os
+
+#All databases with monthly hit counts of at least 100
+
+# mm39 - Completes
+# url = "http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/mus_musculus/GCA_000001635.9/GCA_000001635.9_current_ids.vcf.gz"
+# dbs='mm39'
+
+#danRer11 - Completes if I remove rs499587024
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.4/GCA_000002035.4_current_ids.vcf.gz"
+# dbs='danRer11'
+
+#bosTau9 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002263795.2/GCA_002263795.2_current_ids.vcf.gz"
+# dbs="bosTau9"
+
+#dm6 - Completes if I remove /hive/data/outside/eva/dm6/problematicRSIDs.txt
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001215.4/GCA_000001215.4_current_ids.vcf.gz"
+# dbs = "dm6"
+
+#mm10 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001635.6/GCA_000001635.6_current_ids.vcf.gz"
+# dbs = "mm10"
+
+#galGal6 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.5/GCA_000002315.5_current_ids.vcf.gz"
+# dbs = "galGal6"
+
+#susScr11 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000003025.6/GCA_000003025.6_current_ids.vcf.gz"
+# dbs = "susScr11"
+
+# rn6 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001895.4/GCA_000001895.4_current_ids.vcf.gz"
+# dbs = "rn6"
+
+# rn7 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_015227675.2/GCA_015227675.2_current_ids.vcf.gz"
+# dbs = "rn7"
+
+# danRer10 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.3/GCA_000002035.3_current_ids.vcf.gz"
+# dbs = "danRer10"
+
+# equCab3 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002863925.1/GCA_002863925.1_current_ids.vcf.gz"
+# dbs = "equCab3"
+
+# rheMac10 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_003339765.3/GCA_003339765.3_current_ids.vcf.gz"
+# dbs = "rheMac10"
+
+# macFas5 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000364345.1/GCA_000364345.1_current_ids.vcf.gz"
+# dbs = "macFas5"
+
+# oviAri4 - Completes if rs1089233242 is removed
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000298735.2/GCA_000298735.2_current_ids.vcf.gz"
+# dbs = "oviAri4"
+
+# felCat9 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000181335.4/GCA_000181335.4_current_ids.vcf.gz"
+# dbs = "felCat9"
+
+# galGal5 - Completes
+# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.3/GCA_000002315.3_current_ids.vcf.gz"
+# dbs = "galGal5"
+
+# canFam3
+url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz"
+dbs = "canFam3"
+
+workDir = "/hive/data/outside/eva/"+dbs
+if workDir[-1] != "/":
+    workDir=workDir+"/"
+
+def checkDirMakeDir(workDir):
+    if not os.path.isdir(workDir):
+        os.mkdir(workDir)
+
+def bash(cmd):
+    """Run the cmd in bash subprocess"""
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' returned with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return(bashStdoutt)
+
+def wgetFile(url, workDir):
+    bash("wget %s -P %s" % (url, workDir))
+    fileName = workDir+url.split("/")[-1]
+    return(fileName)
+
+def checkDupsAndUnzip(fileName,workDir):
+    """Check for duplicate lines (common in EVA), and unzip file"""
+    lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip()
+    uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip()
+    currentFileName = workDir+"evaSnps.vcf"
+    if lineCount != uniqCount:
+        print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." % (lineCount, uniqCount))
+        bash("zcat %s | uniq > %s" % (fileName,currentFileName))
+    else:
+        bash("zcat %s > %s" % (fileName,currentFileName))
+    return(currentFileName)
+
+def sanitizeChromNames(currentFileName):
+    """Swaps the chrom IDs with the accession names which are indexed in chromAlias"""
+    inputFile = open(currentFileName,'r')
+    outputFile = open(currentFileName+".fixed","w")
+    IDtoAccessionDic = {}
+
+    for line in inputFile:
+        if line.startswith("##contig"):
+            ID = line.split('=')[2].split(',')[0]
+            accession = line.split('=')[3].split('"')[1]
+            IDtoAccessionDic[ID] = accession
+            outputFile.write(line)
+        elif line.startswith("#"):
+            outputFile.write(line)
+        else:
+            line = line.split("\t")
+            line[0] = IDtoAccessionDic[line[0]]
+            line = "\t".join(line)
+            outputFile.write(line)
+    inputFile.close()
+    outputFile.close()
+    #Replacing the same name because chromAlias should be updated soon and this function will be removed
+    bash("mv %s.fixed %s" % (currentFileName,currentFileName))
+
+def convertChromToUcsc(currentFileName,dbs,workDir):
+    """Convert to UCSC-style chroms"""
+    bash("chromToUcsc --get %s" % (dbs))
+    bash("chromToUcsc -i %s -o %sevaSnps.ucscChroms.vcf -a %s" % (currentFileName,workDir,dbs+".chromAlias.tsv"))
+    bash("rm %s" % (dbs+".chromAlias.tsv"))
+    currentFileName = workDir+"evaSnps.ucscChroms.vcf"
+    return(currentFileName)
+
+def convertVcfToBed(currentFileName,workDir):
+    """Convert to bed keeping only VC and SID fields"""
+    bash("bgzip %s" % (currentFileName))
+    bash("tabix -p vcf %s" % (currentFileName+".gz"))
+    bash("vcfToBed %s %sevaSnp -fields=VC,SID" % (currentFileName+".gz",workDir))
+    currentFileName = workDir+"evaSnp.bed"
+    return(currentFileName)
+
+def splitChromsAndRunHgVai(workDir,dbs):
+    """Split all the chroms in tenths in order to be able to run hgVai without running out of memory"""
+    chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n")
+    inputFile = workDir+"evaSnps.ucscChroms.vcf.gz"
+    outputFile = workDir+"evaSnps"+dbs+"VaiResults.vep"
+    for chrom in chromSizes[1:-1]:
+        chrom = chrom.split("\t")
+        chromName = chrom[0]
+        print("Running "+chromName)
+        if int(chrom[1]) < 1000000: #Don't worry about splitting if chrom is less than 1M bp
+            if dbs == 'mm39':
+                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
+                     --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
+            else:
+                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
+                     --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
+        else:
+            breakPoints = [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
+            for part in breakPoints:
+                startCoord = round(int(chrom[1])*(part-.1))+1
+                endCoord = round(int(chrom[1])*part)
+                if part == breakPoints[0]: #Special exception for start
+                    if dbs == 'mm39':
+                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
+                             --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
+                    else:
+                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
+                             --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
+                else:
+                    if dbs == 'mm39':
+                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
+                             --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))
+                    else:
+                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
+                             --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))
+    return(outputFile)
+
+def splitVepFileByChrom(hgVaiResultsFile,workDir):
+    """Split the vep file to one file per chrom in order to index entire chrom into a dic and quickly lookup the rsIDs"""
+    splitChromDir = workDir+"splitChroms/"
+    checkDirMakeDir(splitChromDir)
+    vepFile = open(hgVaiResultsFile,"r")
+    for line in vepFile:
+        if line.startswith("#") or line.startswith("Uploaded"):
+            continue
+        else:
+            #Need to compute itemRGB, aaChange, and ucscClass
+            lineContents = line.rstrip().split("\t")
+            chrom = lineContents[1].split(":")[0]
+            outFile = open(splitChromDir+"evaSnpVaiResults."+chrom+".vep","a")
+            outFile.write(line)
+            outFile.close()
+    vepFile.close()
+
+def fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts):
+    """Build dictionary out of vep results keyed by rsID and by rsID+alt, populate aaChange and ucscClassList"""
+    prevName = nameAlt
+    #Check if chrom has changed and index current chrom
+    if chrom != currentChrom:
+        currentVepFile = open(workDir+"splitChroms/evaSnpVaiResults."+chrom+".vep","r")
+        currentVepDic = {}
+        for entry in currentVepFile:
+            entry = entry.rstrip().split("\t")
+            rsID = entry[0]
+            rsIDAlt = entry[0]+entry[2]
+            #print(rsID)
+            #altAllele = entry[2]
+
+            if rsIDAlt not in currentVepDic.keys():
+                currentVepDic[rsIDAlt] = {}
+                currentVepDic[rsIDAlt]['ucscClassList'] = []
+                currentVepDic[rsIDAlt]['ucscClassList'].append(entry[6])
+                currentVepDic[rsIDAlt]['aaChange'] = []
+                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
+                    currentVepDic[rsIDAlt]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
+            else:
+                if entry[6] not in currentVepDic[rsIDAlt]['ucscClassList']:
+                    currentVepDic[rsIDAlt]['ucscClassList'].append(entry[6])
+                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
+                    if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsIDAlt]['aaChange']:
+                        currentVepDic[rsIDAlt]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
+
+            if rsID not in currentVepDic.keys():
+                currentVepDic[rsID] = {}
+                currentVepDic[rsID]['ucscClassList'] = []
+                currentVepDic[rsID]['ucscClassList'].append(entry[6])
+                currentVepDic[rsID]['aaChange'] = []
+                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
+                    currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
+            else:
+                if entry[6] not in currentVepDic[rsID]['ucscClassList']:
+                    currentVepDic[rsID]['ucscClassList'].append(entry[6])
+                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
+                    if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsID]['aaChange']:
+                        currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
+
+        currentChrom=chrom
+        currentVepFile.close()
+
+    try:
+        ucscClass = ",".join(currentVepDic[nameAlt]['ucscClassList'])
+        aaChange = " ".join(currentVepDic[nameAlt]['aaChange'])
+    except:
+        tryAttempts+=1
+        ucscClass = ",".join(currentVepDic[name]['ucscClassList'])
+        aaChange = " ".join(currentVepDic[name]['aaChange'])
+
+    return(prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts)
+
+def assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt):
+    """Assign color based on most serious hgVai predicted consequence"""
+
+    redVars = ['exon_loss_variant','frameshift_variant','inframe_deletion','inframe_insertion','initiator_codon_variant',
+               'missense_variant','splice_acceptor_variant','splice_donor_variant','splice_region_variant','stop_gained',
+               'stop_lost','coding_sequence_variant','transcript_ablation']
+    greenVars = ['synonymous_variant','stop_retained_variant']
+    blueVars = ['5_prime_UTR_variant','3_prime_UTR_variant','complex_transcript_variant',
+                'non_coding_transcript_exon_variant']
+    blackVars = ['upstream_gene_variant','downstream_gene_variant','intron_variant','intergenic_variant',
+                 'NMD_transcript_variant','no_sequence_alteration']
+
+    try:
+        if any(item in currentVepDic[nameAlt]['ucscClassList'] for item in redVars):
+            itemRgb = '255,0,0'
+        elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in greenVars):
+            itemRgb = '0,128,0'
+        elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in blueVars):
+            itemRgb = '0,0,255'
+        elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in blackVars):
+            itemRgb = '0,0,0'
+        else:
+            sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
+    except:
+        if any(item in currentVepDic[name]['ucscClassList'] for item in redVars):
+            itemRgb = '255,0,0'
+        elif any(item in currentVepDic[name]['ucscClassList'] for item in greenVars):
+            itemRgb = '0,128,0'
+        elif any(item in currentVepDic[name]['ucscClassList'] for item in blueVars):
+            itemRgb = '0,0,255'
+        elif any(item in currentVepDic[name]['ucscClassList'] for item in blackVars):
+            itemRgb = '0,0,0'
+        else:
+            sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
+
+    return(itemRgb)
+
+def convertSOnumberToName(varClass):
+    soDic={"SO:0001483": 'substitution', "SO:0000667": 'insertion', "SO:0000159": 'deletion',
+           "SO:0001059": 'sequence alteration',"SO:1000032": 'delins',"SO:0002007": 'MNV', "SO:0000705": 'tandem_repeat'}
+    varClass = soDic[varClass]
+    return(varClass)
+
+def buildDuplicateRsIDset(currentFileName):
+    """Collect rsIDs that appear more than once (multi-allelic variants) so their hgVai results can be matched more accurately"""
+    duplicateSet = set()
+    inputFile = open(currentFileName,"r")
+    prevName = False
+    for line in inputFile:
+        if line.startswith("#") or line.startswith("Uploaded"):
+            continue
+        else:
+            lineContents = line.rstrip().split("\t")
+            name = lineContents[3]
+            if prevName == name:
+                duplicateSet.add(name)
+            prevName = name
+    print("Total number of multi-allelic variants: "+str(len(duplicateSet)))
+    return(duplicateSet)
+
+def createFinalBedWithVepAnnotations(currentFileName,workDir):
+    """Parse bed file and output final file without FILTER field, and with SO name, item RGB, and aaChange/ucscClass from hgVai"""
+
+    inputFile = open(currentFileName,"r")
+    outputFile = open(workDir+"evaSnp.final.bed","w")
+    prevName = ""
+    currentChrom = ""
+    currentVepDic = {}
+    n=0
+    tryAttempts=0
+    duplicateSet = buildDuplicateRsIDset(currentFileName)
+
+    for line in inputFile:
+        n+=1
+        if n%5000000 == 0:
+            print("Current line count: "+str(n))
+        if line.startswith("#") or line.startswith("Uploaded"):
+            continue
+        else:
+            #Desired schema: chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt varClass submitters ucscClass aaChange
+            #Need to compute itemRGB, aaChange, and ucscClass. Also need to change SO terms
+
+            lineContents = line.rstrip().split("\t")
+            chrom = lineContents[0]
+            chromStart = lineContents[1]
+            chromEnd = lineContents[2]
+            name = lineContents[3]
+            ref = lineContents[9]
+            alt = lineContents[10]
+            varClass = lineContents[12]
+            submitters = lineContents[13]
+
+            #Convert from SO term to name. e.g. SO:0001483 to substitution
+            varClass = convertSOnumberToName(varClass)
+
+            #Check if it is the same rsID as the previous line; rsIDs are repeated in ~5% of the file
+            if prevName == name:
+                print("Repeated prevName: "+prevName)
+                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
+                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
+            elif name in duplicateSet:
+                #Find the aaChange and ucscClass
+                #Try to match the allele with what hgVai outputs
+                if varClass == "insertion":
+                    if ',' in alt:
+                        nameAlt = name+alt.split(",")[0][1:]
+                    elif len(alt)-1 >= 59:
+                        nameAlt = name+alt[len(ref):25]+"..."+alt[len(alt)-24:]+"("+str(len(alt)-1)+" nt)"
+                    else:
+                        nameAlt = name+alt[1:]
+                elif varClass == "deletion":
+                    nameAlt = name+"-"
+                elif varClass == "tandem_repeat":
+                    if len(alt) > len(ref):
+                        nameAlt = name+alt[len(ref):]
+                    else:
+                        nameAlt = name+"-"
+                else:
+                    nameAlt = name+alt
+
+                prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts)
+                itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt)
+                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
+                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
+            else:
+                nameAlt = name
+                prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts)
+                itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt)
+                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
+                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
+
+    print("Total number of try attempts = "+str(tryAttempts))
+    print("Total number of lines in output file = "+str(n))
+    inputFile.close()
+    outputFile.close()
+
+def validateFinalFile(workDir,url):
+    """Compare and make sure that the final bed rsID count matches the input file"""
+    originalDownloadFileName = workDir+url.split("/")[-1]
+    bedFile = workDir+"evaSnp.final.bed"
+    originalRsIDCount = bash("zcat %s | grep -v '^#' | cut -f3 | sort | uniq | wc -l" % (originalDownloadFileName))
+    finalBedRsIDCount = bash("cut -f4 %s | sort | uniq | wc -l" % (bedFile))
+    if originalRsIDCount != finalBedRsIDCount:
+        sys.exit("There was an error in the pipeline, the initial number of uniq RS IDs does not match the final bed file.\n\n"\
+                 "%s - %s\n%s - %s" % (originalDownloadFileName,str(originalRsIDCount),bedFile,str(finalBedRsIDCount)))
+
+def createBigBed(workDir,dbs):
+    bedFile = workDir+"evaSnp.final.bed"
+    bash("bedSort %s %s" % (bedFile,bedFile))
+    bash("bedToBigBed -tab -as=$HOME/kent/src/hg/lib/evaSnp.as -type=bed9+6 -extraIndex=name \
+         %s http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes %sevaSnp.bb" % (bedFile,dbs,dbs,workDir))
+
+checkDirMakeDir(workDir)
+fileName = wgetFile(url, workDir)
+fileName = workDir+url.split("/")[-1]
+print("Got file")
+currentFileName = checkDupsAndUnzip(fileName,workDir)
+print("File unzipped")
+try:
+    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
+    print("Chroms converted")
+except:
+    sanitizeChromNames(currentFileName)
+    print("Chroms sanitized")
+    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
+    print("Chroms converted")
+currentFileName = convertVcfToBed(currentFileName,workDir)
+print("File converted to bed")
to bed") +hgVaiResultsFile = splitChromsAndRunHgVai(workDir,dbs) +print("hgVai done") +splitVepFileByChrom(hgVaiResultsFile,workDir) +print("hgVai file split") +currentFileName = workDir+"evaSnp.bed" +createFinalBedWithVepAnnotations(currentFileName,workDir) +print("Validating file") +validateFinalFile(workDir,url) +print("Created final file") +createBigBed(workDir,dbs) +print("bigBed made") + +##################################### +#### End of python3 code ############ +#####################################