a64bc5a6cfe9756104a86f633b0f0a01772c4b83 lrnassar Wed Oct 6 16:55:59 2021 -0700 Updating a few things on otto. For the most part these are changes to the wrapper emails, which had not been updated in git, and the big changes are were done by engineers directly to the hive copy without updating the tree version. The big updates for clinvar were Max, and decipher was Braney I believe. To clarify, these now updated versions are the ones that have been running daily/weekly, so all of it has been in action already. diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index 77d6de0..c40f520 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -1,42 +1,42 @@ #!/usr/bin/env python3 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess -import tempfile +import tempfile, json from collections import defaultdict -from os.path import join, basename, dirname, isfile, abspath +from os.path import join, basename, dirname, isfile, abspath, isdir from datetime import date, datetime, timedelta dbToUrl = { "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/", "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s", "OMIM Allelic Variant" : "http://www.omim.org/entry/%s", "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s", "OMIM" : "http://www.omim.org/entry/%s", "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s" } # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file # if they add a new one, this script must fail and tdb must be updated. possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiatior codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", ""]) # === COMMAND LINE INTERFACE, OPTIONS AND HELP === -parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed +parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed. Output goes into the current dir -input file is ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz +Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/ """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed") parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing") parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.") (options, args) = parser.parse_args() if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n" # ==== FUNCTIONs ===== @@ -408,57 +408,82 @@ # print(("Header of variation_allele.txt.gz has changed again. Code needs fixing")) # print("found:",line) # print("expected:",expHead) # else: # foundHead = True # # if line.startswith("#"): # continue # if not foundHead: # raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing") # fields = line.split("\t") # allToVar[fields[2]]=fields[0] # return allToVar def parseHgvsFile(hgvsFname): - " return dict: allele Id -> (hgvs coding, hgvs protein) " + " return dict: allele Id -> list of (hgvs coding, hgvs protein) " logging.info("Parsing %s" % hgvsFname) expHead = "#Symbol GeneID VariationID AlleleID Type Assembly NucleotideExpression NucleotideChange ProteinExpression ProteinChange UsedForNaming Submitted OnRefSeqGene" - allToHgvs = {} + hg19Hgvs = defaultdict(list) + hg38Hgvs = defaultdict(list) foundHead = False for line in gzip.open(hgvsFname, "rt"): line = line.rstrip("\n") if line==expHead: foundHead = True if line.startswith("#Symbol") and "GeneID" in line: if not line == expHead: - print ("Header of hgvs file has changed again. Code needs fixing") + print ("Header of hgvs file has changed again. Code needs fixing. Look at the new fields and decide if we should show them.") print(("found:",line)) print(("expected:",expHead)) else: foundHead = True if line.startswith("#"): continue if not foundHead: raise Exception("Invalid Header. Code needs fixing, again.") fields = line.split("\t") - varType = fields[4] - if "coding" in varType and not "LRG" in fields[6]: - allToHgvs[fields[3]]=(fields[6], fields[8]) - return allToHgvs + sym, geneId, varId, alleleId, varType, assembly, nuclExpr, nuclChange, protExpr, protChange, usedForNaming, submitted, onLrg = fields + #varType = fields[4] + #if "coding" in varType and not "LRG" in fields[6]: + #assert("|" not in nuclChange and "|" not in protChange) + #hgvsRow = "|".join((nuclChange, protChange)) + #if ";" in hgvsRow: + #print(sym, geneId, varId, hgvsRow) + # this creates invalid HGVS, but it's the only way for now, until we have real JSON tables + #hgvsRow = hgvsRow.replace(";", ",") + #assert(";" not in hgvsRow) + hgvsRow = (nuclChange, protChange) + + if assembly=="GRCh37": + addTo = [hg19Hgvs] + elif assembly=="GRCh38": + addTo = [hg38Hgvs] + elif assembly=="na": # protein changes have no assembly and must be shown on all assemblies + addTo = [hg19Hgvs, hg38Hgvs] + elif assembly=="NCBI36": + continue + else: + print("invalid assembly in HGVS: %s, in %s" % (assembly, line)) + assert(False) + + for data in addTo: + data[alleleId].append( hgvsRow ) + + return hg19Hgvs, hg38Hgvs def accListToHtml(inStr): """ given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions to HTML links to the databases. Also, pull out the dbVar SSV accession for its own field later. """ dbVarAcc = "" newParts = [] for part in inStr.split(","): fields = part.split(":") if len(fields)!=2: newParts.append(part) continue db, acc = fields @@ -597,57 +622,57 @@ #1 930248 789256 G A . . ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS= if line.startswith("#"): continue chrom, start, end, ref, alt, qual, filter, info = line.rstrip("\n").split("\t") infoParts = info.split(";") alleleId = None mc = None for info in infoParts: key, val = info.split("=") if key=="ALLELEID": alleleId = val elif key=="MC": mc = val.split("|")[1].split(",")[0].replace("_", " ") if not mc in possMolConseqs: - print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. ", + print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. " "Please update otto script %s" % (mc, __file__)) sys.exit(1) if mc and alleleId: ret[int(alleleId)] = mc return ret # ----------- MAIN -------------- if args==[] and not options.auto: parser.print_help() exit(1) outSuffix = "" if options.auto: filename, varFName, hgvsFname, vcfFname, summFname = downloadFromNcbi(options.skipDownload) else: filename = args[0] varFname = args[1] hgvsFname = args[2] vcfFname = args[3] #allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change allToVcf = parseVcf(vcfFname) -allToHgvs = parseHgvsFile(hgvsFname) +hg19Hgvs, hg38Hgvs = parseHgvsFile(hgvsFname) # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445) # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106) # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant # NC_000010.11:g.(?_87925503)_(87965482_?)del cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*') hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)') # e.g. # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA # c.80A>T # c.740+356C>T # c.3839_3855del17 @@ -679,46 +704,59 @@ line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions line = line.rstrip("\n") row = line.split("\t") row = [f.replace("\0", "") for f in row ] alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \ irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \ refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \ submCategories, varId, posVcf, refAllVcf, altAllVcf = row # changed in July 2018 when the "varId" was merged into this table. It used to be # in allToVar if varId=="": logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc)) - hgvsCod, hgvsProt = allToHgvs.get(alleleId, ("", "")) + if chrom=="" or assembly=="" or assembly=="NCBI36" or assembly=="na": + noAssCount += 1 + continue + + if assembly=="GRCh37": + hgvs = hg19Hgvs + elif assembly=="GRCh38": + hgvs = hg38Hgvs + else: + print("Invalid assembly: %s, row %s" % (repr(assembly), repr(row))) + assert(False) + + #hgvsTable = ";".join(hgvs[alleleId]) + hgvsTable = json.dumps(hgvs[alleleId]) molConseq = allToVcf.get(int(alleleId)) if molConseq is None: logging.debug("%s has no molConseq" % alleleId) noMcCount += 1 molConseq = "" - if chrom=="" or assembly=="" or assembly=="NCBI36": - noAssCount += 1 - continue + if chromAcc=="NT_187513.1": + chrom = "chrUn_KI270742v1" + else: + chrom = "chr"+chrom - if chrom=="Un" and assembly=="GRCh38": + if chrom=="chrUn" and assembly=="GRCh38": print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc)) continue - chrom = "chr"+chrom if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ? if assembly=="GRCh37": chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 else: chrom = "chrM" shortName, longName = shortenName(name) if len(shortName)>20: shortName = shortName[:17]+"..." longCount+=1 if len(longName)>60: longName = longName[:60]+"..." @@ -805,76 +843,74 @@ if isCnv and dbVarSsvAcc!="": shortName = dbVarSsvAcc elif name=="": name = "No display name was provided by ClinVar for this variant" shortName = "NoName" mouseOverName = "" if len(name)>90: name = name[:90]+"..." name = varId+"|"+name if len(mouseOverName)>80: mouseOverName = mouseOverName[:80]+"..." - if len(hgvsProt) > 90: - hgvsProt = hgvsProt[:90]+"..." - if len(hgvsCod) > 90: - hgvsCod = hgvsCod[:90]+"..." + #if len(hgvsProt) > 90: + #hgvsProt = hgvsProt[:90]+"..." + #if len(hgvsCod) > 90: + #hgvsCod = hgvsCod[:90]+"..." phenotypeIds, _ = accListToHtml(phenotypeIds) starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus) mouseOverParts = [mouseOverName, "Type: "+allType, "Consequence: "+ molConseq, "Significance: "+clinSign, "Origin: "+origin, "Phenotypes: "+phenotypeList] # removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars mouseOver = ", ".join(mouseOverParts) row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \ blockCount, blockSizes, blockStarts, name, clinSign, starRatingHtml, allType, geneStr, molConseq, snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic, - hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver, + hgvsTable, "", numberSubmitters, lastEval, guidelines, otherIds, mouseOver, # these fields are the for the filters, not for the display pathoCode, originCode, allTypeCode, str(varLen), str(starCount), # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field varId, dbVarSsvAcc] # replace clinvar's placeholders with real empty fields newRow = [] for x in row: if x in ["-1", "-"]: newRow.append("") else: newRow.append(x) row = newRow if assembly=="GRCh37": ofh = hg19Bed if isCnv: ofh = hg19BedCnv elif assembly=="GRCh38": ofh = hg38Bed if isCnv: ofh = hg38BedCnv else: noAssCount +=1 - #for o in row: - #print(o, type(o)) ofh.write("\t".join(row)) ofh.write("\n") hg19Bed.close() hg38Bed.close() hg19BedCnv.close() hg38BedCnv.close() logging.info("%d lines with feature name that was too long, shortened them" % longCount) logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount) logging.info("%d lines with -1/-1 start/end positions skipped. Examples: %s" % (len(minusAccs), minusAccs[:3])) fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name] # sort the files @@ -901,22 +937,33 @@ tmpFname = "%s.%s.temp.bb" % (base, db) outFname = basename(outFname) # strip the archive/ part, put into current dir cmd = "bedToBigBed -extraIndex=_dbVarSsvId -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname) mustRun(cmd) tableName = outFname.split('.')[0] # e.g. clinvarMain tmpFnames.append( (db, tableName, tmpFname, outFname) ) # make it atomic by renaming the temp files to the final file names at the very end for db, tableName, tmpFname, finalFname in tmpFnames: os.rename(tmpFname, finalFname) # make sure that the "Data last updated" in the table browser is correct +monYear = date.today().strftime("%Y-%m") for db, tableName, tmpFname, finalFname in tmpFnames: - cmd = "hgBbiDbLink %s %s /gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName, db, tableName) + gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName) + cmd = "hgBbiDbLink %s %s %s" % (db, tableName, gbdbFname) + mustRun(cmd) + + # put a copy into the archive + outDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, monYear) + if not isdir(outDir): + os.mkdir(outDir) + archiveFname = join(outDir, basename(gbdbFname)) + logging.info("Creating %s" % archiveFname) + cmd = "cp %s %s" % (finalFname, archiveFname) mustRun(cmd) loadSubmissions(summFname) print("ClinVar update end: OK")