7e690f333eb9b803b3a6fcceed6a404f3fdf2fb7 lrnassar Thu Sep 7 12:23:00 2023 -0700 Staging new EvaSnp5 data QA Ready, refs #32095 diff --git src/hg/makeDb/doc/evaSnp5.txt src/hg/makeDb/doc/evaSnp5.txt new file mode 100644 index 0000000..11e5fb8 --- /dev/null +++ src/hg/makeDb/doc/evaSnp5.txt @@ -0,0 +1,600 @@ +# Track for EVA snp release 4 - https://www.ebi.ac.uk/eva/?RS-Release&releaseVersion=5 +# Tracks built by Lou on 9/7/2023 + +# Track was built for the following 35 assemblies + +# The GCA accession on the eva release by accession list (https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/) +# were compared to all native assemblies we have. In total there are 895,806,314 variants + +# All assemblies were passed by the python pipeline described below. Any special cases +# have been baked into the pipeline itself, such as +# bosTau9 and galGal5 where I had to remove their chrMT as we had none, +# and rheMac8 I had to change the name of the chrMT to one that matches us. See RM #32095 + +##################################### +#### Beginning of python3 code ###### +##################################### + +import re, os, subprocess, sys + +def findAllAccFromDescPages(): + """Search all gbdb description pages and pull out GCA/GCF accessions""" + listOfHits = [] + assemblyAlias = {} + allDescPages = bash("ls /gbdb/*/html/description.html").rstrip().split("\n") + regex = re.compile('GC[AF]_[0-9]*\.[0-9]*') + for page in allDescPages: + try: + hits = bash("cat "+page+" | grep -i 'GCA\|GCF'").rstrip().split("\n") + except: + hits = [] + if hits != []: + ucscName = page.split("/")[2] + hits = str(hits) + assemblyAlias[ucscName] = {} + try: + accession = regex.findall(hits)[0] + if accession.startswith("GCA"): + assemblyAlias[ucscName]['GCA'] = accession + elif accession.startswith("GCF"): + assemblyAlias[ucscName]['GCF'] = accession + except: + print("The following page could not be matched with the accession format",page,hits) + return(assemblyAlias) + +def buildGCFdicFromGenbankFile(evaSnpVersionNumber): + checkDirMakeDir("/hive/data/outside/eva"+evaSnpVersionNumber+"/") + genbankSummaryDic = {} + regexGenbankGCA = re.compile('GCA_[0-9]*\.[0-9]*') + regexGenbankGCF = re.compile('GCF_[0-9]*\.[0-9]*') + wgetFile("https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt", "/hive/data/outside/eva"+evaSnpVersionNumber+"/") + summaryFile = open('/hive/data/outside/eva'+evaSnpVersionNumber+'/assembly_summary_genbank.txt','r') + for line in summaryFile: + if "GCF_" in line: + GCAacc = regexGenbankGCA.findall(line)[0] + GCFacc = regexGenbankGCF.findall(line)[0] + genbankSummaryDic[GCFacc] = GCAacc + summaryFile.close() + return(genbankSummaryDic) + +def findGCAaccForUCSCgcfDbs(assemblyAlias,genbankSummaryDic): + """Try to find matching GCA accession to UCSC dbs based on genbank file or curling web page""" + n=0 + regexGenbankGCA = re.compile('GCA_[0-9]*\.[0-9]*') + for key, value in assemblyAlias.items(): + if 'GCF' in value: + if value['GCF'] in genbankSummaryDic.keys(): + assemblyAlias[key]['GCA'] = genbankSummaryDic[value['GCF']] + n+=1 + else: #Try to find it online at NCBI + ncbiGCF = value['GCF'] + ncbiWgetPage = bash("curl https://www.ncbi.nlm.nih.gov/assembly/"+ncbiGCF+"/").rstrip().split("\n") + for line in ncbiWgetPage: + if "GenBank assembly accession" in line and "GCA_" in line: + for subLine in line.split("/dd"): + if "GenBank assembly accession" in subLine: + GCAacc = regexGenbankGCA.findall(subLine)[0] + assemblyAlias[key]['GCA'] = GCAacc + n+=1 + print("Total number of UCSC GCFs matched with GCAs: ",n) + return(assemblyAlias) + +def sanitizeListToOnlyUCSCdbsWithGCA(assemblyAlias): + n = 0 + m=0 + ucscGCAlist = {} + for key, value in assemblyAlias.items(): + if 'GCA' in value: + if value['GCA'] != 'GCA_000': + ucscGCAlist[key] = value['GCA'] + n+=1 + else: + m+=1 + print("Total number of UCSC databases already with a GCA:",n) + print("Total number of UCSC databases with GCF that will need GCA lookup:",m) + return(ucscGCAlist) + +def buildFinalDicOfUCSCdbsWithEvaSnpData(ucscGCAlist,evaSnpVersionNumber): + ucscEvaSnpDbs = {} + n=0 + regexEvaSnpGCA = re.compile('GCA_[0-9]*\.[0-9]*') + evaReleasePage = bash("curl https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_"+evaSnpVersionNumber+"/by_assembly/").rstrip().split("\n") + activeRRassemblies = bash("""hgsql -h genome-centdb -e "select name from dbDb where active='1'" hgcentral""") + for key in ucscGCAlist: + if ucscGCAlist[key] in str(evaReleasePage): + for line in evaReleasePage: + if ucscGCAlist[key] in line: + if key in activeRRassemblies: + fullGCA = regexEvaSnpGCA.findall(line)[0] + ucscEvaSnpDbs[key] = fullGCA + n+=1 + ucscEvaSnpDbs['mm10']='GCA_000001635.6' #Manually add mm10 which is weird due to patches + print("Total number of assemblies with EvaSnp data: ",n) + return(ucscEvaSnpDbs) + +def checkDirMakeDir(workDir): + if not os.path.isdir(workDir): + os.mkdir(workDir) + +def bash(cmd): + """Run the cmd in bash subprocess""" + try: + rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ + stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) + bashStdoutt = rawBashOutput.stdout + except subprocess.CalledProcessError as e: + raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) + return(bashStdoutt) + +def wgetFile(url, workDir): + bash("wget -N %s -P %s" % (url, workDir)) + fileName = workDir+url.split("/")[-1] + return(fileName) + +def checkDupsAndUnzip(fileName,workDir,ucscDatabaseName): + """Check for duplicate lines (common in EVA), and unzip file""" + lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip() + uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip() + currentFileName = workDir+"evaSnps.vcf" + #Add catches below in places where the chrom names don't match or are missing from UCSC for specific dbs + if ucscDatabaseName == 'galGal5': + bash("zcat %s | uniq | grep -v 'X52392.1' > %s" % (fileName,currentFileName)) + elif ucscDatabaseName == 'bosTau9': + bash("zcat %s | uniq | grep -v 'MT' > %s" % (fileName,currentFileName)) + else: + if lineCount != uniqCount: + print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." % (lineCount, uniqCount)) + bash("zcat %s | uniq > %s" % (fileName,currentFileName)) + else: + bash("zcat %s > %s" % (fileName,currentFileName)) + return(currentFileName) + +def sanitizeChromNames(currentFileName,dbs,workDir): + """Swaps the chrom IDs with the accession names which are indexed in chromAlias""" + inputFile = open(currentFileName,'r') + outputFile = open(currentFileName+".fixed","w") + IDtoAccessionDic = {} + + for line in inputFile: + if line.startswith("##contig"): + ID = line.split('=')[2].split(',')[0] + accession = line.split('=')[3].split('"')[1] + IDtoAccessionDic[ID] = accession + outputFile.write(line) + elif line.startswith("#"): + outputFile.write(line) + else: + line = line.split("\t") + line[0] = IDtoAccessionDic[line[0]] + line = "\t".join(line) + outputFile.write(line) + inputFile.close() + outputFile.close() + #Replacing the same name because chromAlias should be updated soon and this function will be removed + bash("mv %s.fixed %s" % (currentFileName,currentFileName)) + currentFileName = convertChromToUcsc(currentFileName,dbs,workDir) + print("Chroms converted") + +def addChrToChromNumbers(fileName,currentFileName,workDir): + bash("zcat %s | uniq > %s" % (fileName,currentFileName)) + with open(workDir+"evaSnps.ucscChroms.vcf",'w') as newChromVcf: + with open(currentFileName,"r") as oldChromVcf: + for line in oldChromVcf: + if line.startswith("#"): + newChromVcf.write(line) + elif line.startswith("AC024175.3"): #Exception for zebrafish + newChromVcf.write(line.replace("AC024175.3","chrM")) + elif line.startswith("AY612638.1"): #Exception for rheMac8 + newChromVcf.write(line.replace("AY612638.1","chrM")) + else: + newChromVcf.write("chr"+line) + currentFileName = workDir+"evaSnps.ucscChroms.vcf" + return(currentFileName) + +def convertChromToUcsc(currentFileName,dbs,workDir): + """Convert to UCSC-style chroms""" + bash("chromToUcsc --get %s" % (dbs)) + bash("chromToUcsc -i %s -o %sevaSnps.ucscChroms.vcf -a %s" % (currentFileName,workDir,dbs+".chromAlias.tsv")) + bash("rm %s" % (dbs+".chromAlias.tsv")) + currentFileName = workDir+"evaSnps.ucscChroms.vcf" + return(currentFileName) + +def convertVcfToBed(currentFileName,workDir): + """Convert to bed keeping only VC and SID fields""" + bash("bgzip %s" % (currentFileName)) + bash("tabix -p vcf %s" % (currentFileName+".gz")) + bash("vcfToBed %s %sevaSnp -fields=VC,SID" % (currentFileName+".gz",workDir)) + currentFileName = workDir+"evaSnp.bed" + return(currentFileName) + +def buildChromSizesDic(workDir,dbs): + """Create a dictionary the chromSizes""" + chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n") + chromDic = {} + for chrom in chromSizes[1:-1]: + chrom = chrom.split("\t") + chromDic[chrom[0]] = chrom[1] + return(chromDic) + +def splitChromsAndRunHgVai(workDir,dbs): + """Split all the chroms in tenths in order to be able to run hgVai without running out of memory""" + chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n") + inputFile = workDir+"evaSnps.ucscChroms.vcf.gz" + outputFile = workDir+"evaSnps"+dbs+"VaiResults.vep" + n=0 + allTables = bash("\hgsql -e \"show tables\" "+dbs+"") + if dbs in ['galGal6','oviAri4','ponAbe3']: #Special exception to use refGene because ncbiRefSeq has incorrect protein sequence in Link table see #29262 + geneTableToUse = "refGene" +# elif "ncbiRefSeqSelect" in allTables: #Removing refseq select as the annotations are too sparse +# geneTableToUse = "ncbiRefSeqSelect" + elif "ncbiRefSeqCurated" in allTables: + geneTableToUse = "ncbiRefSeqCurated" + elif "ensGene" in allTables: + geneTableToUse = "ensGene" + elif "refGene" in allTables: + geneTableToUse = "refGene" + elif "ncbiGene" in allTables: + geneTableToUse = "ncbiGene" + else: + print(dbs) + sys.exit("Could not find any tables to use for the following database: "+dbs) + + chromDic = buildChromSizesDic(workDir,dbs) + #For function below, only bother with the chromosomes in the VCF + chromsInVcf = bash("zcat "+inputFile+" | grep -v '^#' | cut -f1 | uniq").rstrip().split("\n") + + for chrom in chromsInVcf: + chromName = chrom + chromSize = chromDic[chrom] + n+=1 + if (n % 100) == 0: + print("Have currently run "+str(n)+"/"+str(len(chromsInVcf))+" chroms. Currently running "+chromName) + if int(chromSize) < 1000000: #Don't worry about splitting if chrom is less than 1M bp + bash("vai.pl --variantLimit=200000000 --hgVai=/usr/local/apache/cgi-bin/hgVai --position=%s:0-%s --geneTrack=%s --hgvsG=off --hgvsCN=off \ + --hgvsP=on %s %s >> %s" % (chromName,chromSize,geneTableToUse,dbs,inputFile,outputFile)) + else: + breakPoints = [.2,.4,.6,.8,1] + for part in breakPoints: + startCoord = round(int(chromSize)*(part-.2)) + endCoord = round(int(chromSize)*part) + if part == breakPoints[0]: #Special exception for start + bash("vai.pl --variantLimit=200000000 --hgVai=/usr/local/apache/cgi-bin/hgVai --position=%s:0-%s --geneTrack=%s --hgvsG=off --hgvsCN=off \ + --hgvsP=on %s %s >> %s" % (chromName,endCoord,geneTableToUse,dbs,inputFile,outputFile)) + else: + bash("vai.pl --variantLimit=200000000 --hgVai=/usr/local/apache/cgi-bin/hgVai --position=%s:%s-%s --geneTrack=%s --hgvsG=off --hgvsCN=off \ + --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,geneTableToUse,dbs,inputFile,outputFile)) + return(outputFile) + +def splitVepFileByChrom(hgVaiResultsFile,workDir): + """Split the vep file to one file per chrom in order to index entire chrom into a dic and quickly lookup the rsIDs""" + splitChromDir = workDir+"splitChroms/" + checkDirMakeDir(splitChromDir) + vepFile = open(hgVaiResultsFile,"r") + for line in vepFile: + if line.startswith("#") or line.startswith("Uploaded"): + continue + else: + #Need to compute itemRGB, aaChange, and ucscClass + lineContents = line.rstrip().split("\t") + chrom = lineContents[1].split(":")[0] + outFile = open(splitChromDir+"evaSnpVaiResults."+chrom+".vep","a") + outFile.write(line) + outFile.close() + vepFile.close() + +def fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts,workDir): + """Build dictionary out of vep results with rsID as index, populate aaChange and ucscClassList""" + prevName = nameAlt + #Check if chrom has changed and index current chrom + if chrom != currentChrom: + currentVepFile = open(workDir+"splitChroms/evaSnpVaiResults."+chrom+".vep","r") + currentVepDic = {} + for entry in currentVepFile: + entry = entry.rstrip().split("\t") + rsID = entry[0] + rsIDAlt = entry[0]+entry[2] + if rsIDAlt not in currentVepDic.keys(): + currentVepDic[rsIDAlt] = {} + currentVepDic[rsIDAlt]['ucscClassList'] = [] + currentVepDic[rsIDAlt]['ucscClassList'].append(entry[6]) + currentVepDic[rsIDAlt]['aaChange'] = [] + if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]: + currentVepDic[rsIDAlt]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1]) + else: + if entry[6] not in currentVepDic[rsIDAlt]['ucscClassList']: + currentVepDic[rsIDAlt]['ucscClassList'].append(entry[6]) + if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]: + if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsIDAlt]['aaChange']: + currentVepDic[rsIDAlt]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1]) + + if rsID not in currentVepDic.keys(): + currentVepDic[rsID] = {} + currentVepDic[rsID]['ucscClassList'] = [] + currentVepDic[rsID]['ucscClassList'].append(entry[6]) + currentVepDic[rsID]['aaChange'] = [] + if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]: + currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1]) + else: + if entry[6] not in currentVepDic[rsID]['ucscClassList']: + currentVepDic[rsID]['ucscClassList'].append(entry[6]) + if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]: + if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsID]['aaChange']: + currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1]) + + currentChrom=chrom + currentVepFile.close() + + try: + ucscClass = ",".join(currentVepDic[nameAlt]['ucscClassList']) + aaChange = " ".join(currentVepDic[nameAlt]['aaChange']) + except: + tryAttempts+=1 + ucscClass = ",".join(currentVepDic[name]['ucscClassList']) + aaChange = " ".join(currentVepDic[name]['aaChange']) + + return(prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts) + +def assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt): + """Assign color based on most serious hgVai predicted consequence""" + + redVars = ['exon_loss_variant','frameshift_variant','inframe_deletion','inframe_insertion','initiator_codon_variant', + 'missense_variant','splice_acceptor_variant','splice_donor_variant','splice_region_variant','stop_gained', + 'stop_lost','coding_sequence_variant','transcript_ablation'] + greenVars = ['synonymous_variant','stop_retained_variant'] + blueVars = ['5_prime_UTR_variant','3_prime_UTR_variant','complex_transcript_variant', + 'non_coding_transcript_exon_variant'] + blackVars = ['upstream_gene_variant','downstream_gene_variant','intron_variant','intergenic_variant', + 'NMD_transcript_variant','no_sequence_alteration'] + + try: + if any(item in currentVepDic[nameAlt]['ucscClassList'] for item in redVars): + itemRgb = '255,0,0' + elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in greenVars): + itemRgb = '0,128,0' + elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in blueVars): + itemRgb = '0,0,255' + elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in blackVars): + itemRgb = '0,0,0' + else: + sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList'])) + except: + if any(item in currentVepDic[name]['ucscClassList'] for item in redVars): + itemRgb = '255,0,0' + elif any(item in currentVepDic[name]['ucscClassList'] for item in greenVars): + itemRgb = '0,128,0' + elif any(item in currentVepDic[name]['ucscClassList'] for item in blueVars): + itemRgb = '0,0,255' + elif any(item in currentVepDic[name]['ucscClassList'] for item in blackVars): + itemRgb = '0,0,0' + else: + sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList'])) + + + return(itemRgb) + +def convertSOnumberToName(varClass): + soDic={"SO:0001483": 'substitution', "SO:0000667": 'insertion', "SO:0000159": 'deletion', + "SO:0001059": 'sequence alteration',"SO:1000032": 'delins',"SO:0002007": 'MNV', "SO:0000705": 'tandem_repeat'} + varClass = soDic[varClass] + return(varClass) + +def buildDuplicateRsIDset(currentFileName): + """Look for multi-allelic variants to then try and more accurately match those hgVai results""" + duplicateSet = set() + inputFile = open(currentFileName,"r") + prevName = False + for line in inputFile: + if line.startswith("#") or line.startswith("Uploaded"): + continue + else: + lineContents = line.rstrip().split("\t") + name = lineContents[3] + if prevName == name: + duplicateSet.add(name) + prevName = name + print("Total number of multi-allelic variants: "+str(len(duplicateSet))) + return(duplicateSet) + +def createFinalBedWithVepAnnotations(currentFileName,workDir): + """Parse bed file and output final file without FILTER field, and with SO name, item RGB, and aaChange/ucscClass from hgVai""" + + inputFile = open(currentFileName,"r") + outputFile = open(workDir+"evaSnp.final.bed","w") + prevName = "" + currentChrom = "" + currentVepDic = {} + n=0 + tryAttempts=0 + duplicateSet = buildDuplicateRsIDset(currentFileName) + + for line in inputFile: + n+=1 + if n%5000000 == 0: + print("Current line count: "+str(n)) + if line.startswith("#") or line.startswith("Uploaded"): + continue + else: + #Desired schema chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt varClass submitters ucscClass aaChange + #Need to compute itemRGB, aaChange, and ucscClass. Also need to change SO terms + + lineContents = line.rstrip().split("\t") + chrom = lineContents[0] + chromStart = lineContents[1] + chromEnd = lineContents[2] + name = lineContents[3] + ref = lineContents[9] + alt = lineContents[10] + varClass = lineContents[12] + submitters = lineContents[13] + + #Convert from SO term to name. e.g. SO:0001483 to substitution + varClass = convertSOnumberToName(varClass) + + #Check if is the same rsID, they are repeated in ~5% of file + if prevName == name: + print("Repeated prevName: "+prevName) + outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+ + itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n") + elif name in duplicateSet: + #Find the aaChange and ucscClass + #Try and match the allele with what hgVai outputs + if varClass == "insertion": + if ',' in alt: + nameAlt = name+alt.split(",")[0][1:] + elif len(alt)-1 >= 59: + nameAlt = name+alt[len(ref):25]+"..."+alt[len(alt)-24:]+"("+str(len(alt)-1)+" nt)" + else: + nameAlt = name+alt[1:] + elif varClass == "deletion": + nameAlt = name+"-" + elif varClass == "tandem_repeat": + if len(alt) > len(ref): + nameAlt = name+alt[len(ref):] + else: + nameAlt = name+"-" + else: + nameAlt = name+alt + + prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts,workDir) + itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt) + outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+ + itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n") + else: + nameAlt = name + prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts,workDir) + itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt) + outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+ + itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n") + + print("Total number of try attempts = "+str(tryAttempts)) + print("Total number of lines in output file = "+str(n)) + inputFile.close() + outputFile.close() + +def validateFinalFile(workDir,url,dbs): + """Compare and make sure that the final bed rsID# matches the input file""" + originalDownloadFileName = workDir+url.split("/")[-1] + bedFile = workDir+"evaSnp.final.bed" + originalRsIDCount = bash("zcat %s | grep -v '^#' | cut -f3 | sort | uniq | wc -l" % (originalDownloadFileName)) + finalBedRsIDCount = bash("cut -f4 %s | sort | uniq | wc -l" % (bedFile)) + #Skip the check for galGal5 which is known to have troublesome chrMT name + if dbs != "galGal5" or dbs != "bosTau9": + if originalRsIDCount != finalBedRsIDCount: + sys.exit("There was an error in the pipeline, the initial numer of uniq RS IDs does not match the final bed file.\n\n"\ + "%s - %s%s - %s" % (originalDownloadFileName,str(originalRsIDCount),bedFile,str(finalBedRsIDCount))) + finalBedLineCount = bash("cat %s | wc -l" % (bedFile)) + finalBedUniqueLineCount = bash("cat %s | sort | uniq | wc -l" % (bedFile)) + if finalBedLineCount != finalBedUniqueLineCount: + sys.exit("There are duplicate entries in the file: "+str(finalBedLineCount)+" vs "+str(finalBedUniqueLineCount)) + +def createBigBed(workDir,dbs,evaSnpVersionNumber): + bedFile = workDir+"evaSnp.final.bed" + bash("bedSort %s %s" % (bedFile,bedFile)) + bash("bedToBigBed -tab -as=$HOME/kent/src/hg/lib/evaSnp.as -type=bed9+6 -extraIndex=name \ + %s http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes %sevaSnp%s.bb" % (bedFile,dbs,dbs,workDir,evaSnpVersionNumber)) + +def chechIfDbsDone(key,evaSnpVersionNumber): + if os.path.exists("/hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/evaSnp"+evaSnpVersionNumber+".bb"): + print("DBS already completed: "+key) + return(True) + else: + if os.path.exists("/hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/evaSnps.ucscChroms.vcf") or os.path.exists("/hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/evaSnps.ucscChroms.vcf.gz"): + print("Cleaning up directory for previous attempt of "+key) + bash("rm /hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/evaSnps*") + if os.path.exists("/hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/splitChroms"): + try: + bash("rm /hive/data/outside/eva"+evaSnpVersionNumber+"/"+key+"/splitChroms/*") + except: + print("Directly cleaned") + return(False) + +def findFileUrl(GCAacc,evaSnpVersionNumber,dbs): + dirStructure = bash("curl https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_5/by_assembly/"+GCAacc+"/ | grep '[DIR]'") + #Loops to pick correct species name files since Ensembl began to add multiple species per GCA in eva5 + if dbs == 'oviAri3': + speciesName = 'ovis_orientalis/' + elif dbs == 'bosTau6': + speciesName = 'bos_indicus_x_bos_taurus/' + elif dbs == 'bosTau9': + speciesName = 'bos_taurus/' + else: + speciesName = dirStructure.split("href")[6].split('"')[1] + urlToFileDir = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_"+evaSnpVersionNumber+"/by_assembly/"+GCAacc+"/"+speciesName + fileDir = bash("curl -L "+urlToFileDir+" | grep "+GCAacc) + filespeciesNameDirHref = fileDir.split("href") + fileName = filespeciesNameDirHref[2].split('"')[1] + urlToFile = urlToFileDir+fileName + return(urlToFile) + +def buildEvaSnpTrack(ucscDatabaseName,ucscEvaSnpDbs,evaSnpVersionNumber): + """Build evaSnp track from start to finish""" + dbs = ucscDatabaseName + GCAacc = ucscEvaSnpDbs[dbs] + workDir = "/hive/data/outside/eva"+evaSnpVersionNumber+"/"+dbs+"/" + urlToFile = findFileUrl(GCAacc,evaSnpVersionNumber,dbs) + checkDirMakeDir(workDir) + fileName = wgetFile(urlToFile, workDir) + fileName = workDir+urlToFile.split("/")[-1] + print("Got file: ",dbs) + currentFileName = checkDupsAndUnzip(fileName,workDir,ucscDatabaseName) + print("File unzipped") + try: + currentFileName = convertChromToUcsc(currentFileName,dbs,workDir) + print("Chroms converted") + except: + try: + sanitizeChromNames(currentFileName,dbs,workDir) + print("Chroms sanitized") + currentFileName = convertChromToUcsc(currentFileName,dbs,workDir) + print("Chroms converted") + except: + currentFileName = addChrToChromNumbers(fileName,currentFileName,workDir) + print("Chroms completed after adding '1' to starts.") + currentFileName = convertVcfToBed(currentFileName,workDir) + print("File converted to bed") + hgVaiResultsFile = splitChromsAndRunHgVai(workDir,dbs) + print("hgVai done") + splitVepFileByChrom(hgVaiResultsFile,workDir) + print("hgVai file split") + currentFileName = workDir+"evaSnp.bed" + createFinalBedWithVepAnnotations(currentFileName,workDir) + print("Validating file") + validateFinalFile(workDir,urlToFile,dbs) + print("Created final file") + createBigBed(workDir,dbs,evaSnpVersionNumber) + print("bigBed made") + +def main(mode): + #Run main with either "all" or a specific UCSC database name + #Find all the assemblies we have with matching evaSnp data + evaSnpVersionNumber = "5" + assemblyAlias = findAllAccFromDescPages() + genbankSummaryDic = buildGCFdicFromGenbankFile(evaSnpVersionNumber) + assemblyAlias = findGCAaccForUCSCgcfDbs(assemblyAlias,genbankSummaryDic) + ucscGCAlist = sanitizeListToOnlyUCSCdbsWithGCA(assemblyAlias) + ucscEvaSnpDbs = buildFinalDicOfUCSCdbsWithEvaSnpData(ucscGCAlist,evaSnpVersionNumber) + #Build evaSNP data for either one specific assembly, or all assemblies + if mode == 'all': + for key in ucscEvaSnpDbs: + if not chechIfDbsDone(key,evaSnpVersionNumber): + try: + buildEvaSnpTrack(key,ucscEvaSnpDbs,evaSnpVersionNumber) + except: + print("Problem with DBS: "+key) + continue + else: + chechIfDbsDone(mode,evaSnpVersionNumber) + buildEvaSnpTrack(mode,ucscEvaSnpDbs,evaSnpVersionNumber) + +main("all") + +##################################### +######### End of python3 code ###### +##################################### + +# Lastly links were made with a file containing the 35 assemblies: + +for dbs in $(cat /hive/data/outside/eva5/assemblyReleaseList.txt); +do +ln -s /hive/data/outside/eva5/$dbs/evaSnp5.bb /gbdb/$dbs/bbi/; +done