26aac34f999f66703d253baef1ff5f084c2d5007 lrnassar Tue Apr 19 17:12:41 2022 -0700 EVA SNP3 track now QA ready, refs #29031 diff --git src/hg/makeDb/doc/evaSnp3.txt src/hg/makeDb/doc/evaSnp3.txt new file mode 100644 index 0000000..d16f606 --- /dev/null +++ src/hg/makeDb/doc/evaSnp3.txt @@ -0,0 +1,377 @@ +# Track for EVA snp release 3, which was released 2/24/2022 - https://www.ebi.ac.uk/eva/?RS-Release&releaseVersion=3 +# Tracks built by Lou on 4/19/2022 + +# Assemblies were the following: mm39, danRer11, bosTau9, dm6, mm10, galGal6, susScr11, rn6, +# rn7, danRer10, equCab3, rheMac10, macFas5, oviAri4, felCat9, galGal5 + +# The GCA accession on the eva release by accession list (https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/) +# were compared to each of our dbs genome accessions with at least 100 hits/month to create that list + +# All assemblies were passed by the python pipeline described below. 3 assemblies required some +# variants to be removed (danRer11, dm6, oviAri4) due to issues with hgVai, see RM #29262. +# The removed variants were done so after initial data download and noted in the code comment + +##################################### +#### Beginning of python3 code ###### +##################################### + +import subprocess,sys,argparse,os + +#All databases with monthly hit counts of at least 100 + +#mm39 - Completes +# url = "http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/mus_musculus/GCA_000001635.9/GCA_000001635.9_current_ids.vcf.gz" +# dbs='mm39' + +#danRer11 - Completes if I remove rs499587024 +#url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.4/GCA_000002035.4_current_ids.vcf.gz" +#dbs='danRer11' + +#bosTau9 - completes +#url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002263795.2/GCA_002263795.2_current_ids.vcf.gz" +#dbs="bosTau9" + +#dm6 - Completes if I remove /hive/data/outside/eva/dm6/problematicRSIDs.txt +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001215.4/GCA_000001215.4_current_ids.vcf.gz" +# dbs = "dm6" + +#mm10 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001635.6/GCA_000001635.6_current_ids.vcf.gz" +# dbs = "mm10" + +# #galGal6 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.5/GCA_000002315.5_current_ids.vcf.gz" +# dbs = "galGal6" + +# #susScr11 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000003025.6/GCA_000003025.6_current_ids.vcf.gz" +# dbs = "susScr11" + +# rn6 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001895.4/GCA_000001895.4_current_ids.vcf.gz" +# dbs = "rn6" + +# rn7 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_015227675.2/GCA_015227675.2_current_ids.vcf.gz" +# dbs = "rn7 + +# danRer10 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.3/GCA_000002035.3_current_ids.vcf.gz" +# dbs = "danRer10" + +# equCab3 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002863925.1/GCA_002863925.1_current_ids.vcf.gz" +# dbs = "equCab3" + +# rheMac10 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_003339765.3/GCA_003339765.3_current_ids.vcf.gz" +# dbs = "rheMac10" + +# macFas5 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000364345.1/GCA_000364345.1_current_ids.vcf.gz" +# dbs = "macFas5" + +# oviAri4 - Completes if removed rs1089233242 +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000298735.2/GCA_000298735.2_current_ids.vcf.gz" +# dbs = "oviAri4" + +# felCat9 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000181335.4/GCA_000181335.4_current_ids.vcf.gz" +# dbs = "felCat9" + +# galGal5 - Completes +# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.3/GCA_000002315.3_current_ids.vcf.gz" +# dbs = "galGal5" + +workDir = "/hive/data/outside/eva/"+dbs +if workDir[-1] != "/": + workDir=workDir+"/" + +def checkDirMakeDir(workDir): + if not os.path.isdir(workDir): + os.mkdir(workDir) + +def bash(cmd): + """Run the cmd in bash subprocess""" + try: + rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ + stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) + bashStdoutt = rawBashOutput.stdout + except subprocess.CalledProcessError as e: + raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) + return(bashStdoutt) + +def wgetFile(url, workDir): + bash("wget %s -P %s" % (url, workDir)) + fileName = workDir+url.split("/")[-1] + return(fileName) + +def checkDupsAndUnzip(fileName,workDir): + """Check for duplicate lines (common in EVA), and unzip file""" + lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip() + uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip() + currentFileName = workDir+"evaSnps.vcf" + if lineCount != uniqCount: + print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." % (lineCount, uniqCount)) + bash("zcat %s | uniq > %s" % (fileName,currentFileName)) + else: + bash("zcat %s > %s" % (fileName,currentFileName)) + return(currentFileName) + +def sanitizeChromNames(currentFileName): + """Swaps the chrom IDs with the accession names which are indexed in chromAlias""" + inputFile = open(currentFileName,'r') + outputFile = open(currentFileName+".fixed","w") + IDtoAccessionDic = {} + + for line in inputFile: + if line.startswith("##contig"): + ID = line.split('=')[2].split(',')[0] + accession = line.split('=')[3].split('"')[1] + IDtoAccessionDic[ID] = accession + outputFile.write(line) + elif line.startswith("#"): + outputFile.write(line) + else: + line = line.split("\t") + line[0] = IDtoAccessionDic[line[0]] + line = "\t".join(line) + outputFile.write(line) + inputFile.close() + outputFile.close() + #Replacing the same name because chromAlias should be updated soon and this function will be removed + bash("mv %s.fixed %s" % (currentFileName,currentFileName)) + +def convertChromToUcsc(currentFileName,dbs,workDir): + """Convert to UCSC-style chroms""" + bash("chromToUcsc --get %s" % (dbs)) + bash("chromToUcsc -i %s -o %sevaSnps.ucscChroms.vcf -a %s" % (currentFileName,workDir,dbs+".chromAlias.tsv")) + bash("rm %s" % (dbs+".chromAlias.tsv")) + currentFileName = workDir+"evaSnps.ucscChroms.vcf" + return(currentFileName) + +def convertVcfToBed(currentFileName,workDir): + """Convert to bed keeping only VC and SID fields""" + bash("bgzip %s" % (currentFileName)) + bash("tabix -p vcf %s" % (currentFileName+".gz")) + bash("vcfToBed %s %sevaSnp -fields=VC,SID" % (currentFileName+".gz",workDir)) + currentFileName = workDir+"evaSnp.bed" + return(currentFileName) + +def splitChromsAndRunHgVai(workDir,dbs): + """Split all the chroms in tenths in order to be able to run hgVai without running out of memory""" + chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n") + inputFile = workDir+"evaSnps.ucscChroms.vcf.gz" + outputFile = workDir+"evaSnps"+dbs+"VaiResults.vep" + for chrom in chromSizes[1:-1]: + chrom = chrom.split("\t") + chromName = chrom[0] + print("Running "+chromName) + if int(chrom[1]) < 1000000: #Don't worry about splitting if chrom is less than 1M bp + if dbs == 'mm39': + bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \ + --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile)) + else: + bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \ + --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile)) + else: + breakPoints = [.1,.2,.3,.4,.5,.6,.7,.8,.9,1] + for part in breakPoints: + startCoord = round(int(chrom[1])*(part-.1))+1 + endCoord = round(int(chrom[1])*part) + if part == breakPoints[0]: #Special exception for start + if dbs == 'mm39': + bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \ + --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile)) + else: + bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \ + --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile)) + else: + if dbs == 'mm39': + bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \ + --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile)) + else: + bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \ + --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile)) + return(outputFile) + +def splitVepFileByChrom(hgVaiResultsFile,workDir): + """Split the vep file to one file per chrom in order to index entire chrom into a dic and quickly lookup the rsIDs""" + splitChromDir = workDir+"splitChroms/" + checkDirMakeDir(splitChromDir) + vepFile = open(hgVaiResultsFile,"r") + for line in vepFile: + if line.startswith("#") or line.startswith("Uploaded"): + continue + else: + #Need to compute itemRGB, aaChange, and ucscClass + lineContents = line.rstrip().split("\t") + chrom = lineContents[1].split(":")[0] + outFile = open(splitChromDir+"evaSnpVaiResults."+chrom+".vep","a") + outFile.write(line) + outFile.close() + vepFile.close() + +def fetchUcscClassAndAaChange(name,chrom,currentChrom,currentVepDic): + """Build dictionary out of vep results with rsID as index, populate aaChange and ucscClassList""" + prevName = name + #Check if chrom has changed and index current chrom + if chrom != currentChrom: + currentVepFile = open(workDir+"splitChroms/evaSnpVaiResults."+chrom+".vep","r") + currentVepDic = {} + for entry in currentVepFile: + entry = entry.rstrip().split("\t") + rsID = entry[0] + if rsID not in currentVepDic.keys(): + currentVepDic[rsID] = {} + currentVepDic[rsID]['ucscClassList'] = [] + currentVepDic[rsID]['ucscClassList'].append(entry[6]) + currentVepDic[rsID]['aaChange'] = [] + if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]: + currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1]) + else: + if entry[6] not in currentVepDic[rsID]['ucscClassList']: + currentVepDic[rsID]['ucscClassList'].append(entry[6]) + if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]: + if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsID]['aaChange']: + currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1]) + currentChrom=chrom + currentVepFile.close() + + ucscClass = ",".join(currentVepDic[name]['ucscClassList']) + aaChange = " ".join(currentVepDic[name]['aaChange']) + + return(prevName,currentChrom,currentVepDic,ucscClass,aaChange) + +def assignRGBcolorByConsequence(currentVepDic,name): + """Assign color based on most serious hgVai predicted consequence""" + + redVars = ['exon_loss_variant','frameshift_variant','inframe_deletion','inframe_insertion','initiator_codon_variant', + 'missense_variant','splice_acceptor_variant','splice_donor_variant','splice_region_variant','stop_gained', + 'stop_lost','coding_sequence_variant','transcript_ablation'] + greenVars = ['synonymous_variant','stop_retained_variant'] + blueVars = ['5_prime_UTR_variant','3_prime_UTR_variant','complex_transcript_variant', + 'non_coding_transcript_exon_variant'] + blackVars = ['upstream_gene_variant','downstream_gene_variant','intron_variant','intergenic_variant', + 'NMD_transcript_variant','no_sequence_alteration'] + + if any(item in currentVepDic[name]['ucscClassList'] for item in redVars): + itemRgb = '255,0,0' + elif any(item in currentVepDic[name]['ucscClassList'] for item in greenVars): + itemRgb = '0,128,0' + elif any(item in currentVepDic[name]['ucscClassList'] for item in blueVars): + itemRgb = '0,0,255' + elif any(item in currentVepDic[name]['ucscClassList'] for item in blackVars): + itemRgb = '0,0,0' + else: + sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList'])) + + return(itemRgb) + +def convertSOnumberToName(varClass): + soDic={"SO:0001483": 'substitution', "SO:0000667": 'insertion', "SO:0000159": 'deletion', + "SO:0001059": 'sequence alteration',"SO:1000032": 'delins',"SO:0002007": 'MNV'} + varClass = soDic[varClass] + return(varClass) + +def createFinalBedWithVepAnnotations(currentFileName,workDir): + """Parse bed file and output final file without FILTER field, and with SO name, item RGB, and aaChange/ucscClass from hgVai""" + + inputFile = open(currentFileName,"r") + outputFile = open(workDir+"evaSnp.final.bed","w") + prevName = "" + currentChrom = "" + currentVepDic = {} + n=0 + + for line in inputFile: + n+=1 + if n%5000000 == 0: + print("Current line count: "+str(n)) + if line.startswith("#") or line.startswith("Uploaded"): + continue + else: + #Desired schema chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt varClass submitters ucscClass aaChange + #Need to compute itemRGB, aaChange, and ucscClass. Also need to change SO terms + + lineContents = line.rstrip().split("\t") + chrom = lineContents[0] + chromStart = lineContents[1] + chromEnd = lineContents[2] + name = lineContents[3] + ref = lineContents[9] + alt = lineContents[10] + varClass = lineContents[12] + submitters = lineContents[13] + + #Convert from SO term to name. e.g. SO:0001483 to substitution + varClass = convertSOnumberToName(varClass) + + #Check if is the same rsID, they are repeated in ~5% of file + if prevName == name: + outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+ + itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n") + else: + #Find the aaChange and ucscClass + prevName,currentChrom,currentVepDic,ucscClass,aaChange = fetchUcscClassAndAaChange(name,chrom,currentChrom,currentVepDic) + itemRgb = assignRGBcolorByConsequence(currentVepDic,name) + outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+ + itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n") + + inputFile.close() + outputFile.close() + +def validateFinalFile(workDir,url): + """Compare and make sure that the final bed rsID# matches the input file""" + originalDownloadFileName = workDir+url.split("/")[-1] + bedFile = workDir+"evaSnp.final.bed" + originalRsIDCount = bash("zcat %s | grep -v '^#' | cut -f3 | sort | uniq | wc -l" % (originalDownloadFileName)) + finalBedRsIDCount = bash("cut -f4 %s | sort | uniq | wc -l" % (bedFile)) + if originalRsIDCount != finalBedRsIDCount: + sys.exit("There was an error in the pipeline, the initial numer of uniq RS IDs does not match the final bed file.\n\n"\ + "%s - %s%s - %s" % (originalDownloadFileName,str(originalRsIDCount),bedFile,str(finalBedRsIDCount))) + +def createBigBed(workDir,dbs): + bedFile = workDir+"evaSnp.final.bed" + bash("bedSort %s %s" % (bedFile,bedFile)) + bash("bedToBigBed -tab -as=$HOME/kent/src/hg/lib/evaSnp.as -type=bed9+6 -extraIndex=name \ + %s http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes %sevaSnp.bb" % (bedFile,dbs,dbs,workDir)) + +checkDirMakeDir(workDir) +fileName = wgetFile(url, workDir) +fileName = workDir+url.split("/")[-1] +print("Got file") +currentFileName = checkDupsAndUnzip(fileName,workDir) +print("File unzipped") +try: + currentFileName = convertChromToUcsc(currentFileName,dbs,workDir) + print("Chroms converted") +except: + sanitizeChromNames(currentFileName) + print("Chroms sanitized") + currentFileName = convertChromToUcsc(currentFileName,dbs,workDir) + print("Chroms converted") +currentFileName = convertVcfToBed(currentFileName,workDir) +print("File converted to bed") +hgVaiResultsFile = splitChromsAndRunHgVai(workDir,dbs) +print("hgVai done") +splitVepFileByChrom(hgVaiResultsFile,workDir) +print("hgVai file split") +currentFileName = workDir+"evaSnp.bed" +createFinalBedWithVepAnnotations(currentFileName,workDir) +print("Validating file") +validateFinalFile(workDir,url) +print("Created final file") +createBigBed(workDir,dbs) +print("bigBed made") + +##################################### +######### End of python3 code ###### +##################################### + +# Lastly links were made with a file containing the 16 assemblies: + +for dbs in $(cat $1); +do +ln -s /hive/data/outside/eva/$dbs/evaSnp.bb /gbdb/$dbs/bbi/; +done