1942abfbe86d5b3f5ed3ad0482a31a9354587e18 max Tue Mar 26 09:30:03 2024 -0700 preparing clinvar for somatic mutations, no ticket yet diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index 79e71db..fa88597 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -35,31 +35,33 @@ parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing") parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.") (options, args) = parser.parse_args() if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) if not options.isAlpha and os.getlogin()!="otto": print("Please do not run this script as a normal user, you will run into all sorts of permission problems.") print("to develop and test, ssh to otto, then run it with doUpdate.sh --alpha to generate the dev-only tracks.") print("When you're done, run doUpdate.sh without the --alpha and commit the change to the kent tree.") sys.exit(1) -clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n" +#clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n" +clinvarExpectHeaders = "#AlleleID|Type|Name|GeneID|GeneSymbol|HGNC_ID|ClinicalSignificance|ClinSigSimple|LastEvaluated|RS# (dbSNP)|nsv/esv (dbVar)|RCVaccession|PhenotypeIDS|PhenotypeList|Origin|OriginSimple|Assembly|ChromosomeAccession|Chromosome|Start|Stop|ReferenceAllele|AlternateAllele|Cytogenetic|ReviewStatus|NumberSubmitters|Guidelines|TestedInGTR|OtherIDs|SubmitterCategories|VariationID|PositionVCF|ReferenceAlleleVCF|AlternateAlleleVCF|SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity".split("|") + # ==== FUNCTIONs ===== # benign = BN, likely benign = LB, conflicting = CF, likely pathogenic= LP, # pathogenic = PG, other = OT, uncertain = VUS, RF=risk factor # colors were defined by Ana Benet Pages # - WARNING ON FILTER CHANGES: - see #28562 # Never change these codes. They are in old sessions. If you change them, change the across the whole script # and also change the name of the .as field, so the cart variables will be different. You can add new values through: cnvColors = { "INS" : { "OT" : "225,165,0", "BN" : "250,214,69", "LB" : "250,214,69", "CF" : "225,165,0", "RF" : "225,165,0", @@ -428,31 +430,31 @@ # if line.startswith("#"): # continue # if not foundHead: # raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing") # fields = line.split("\t") # allToVar[fields[2]]=fields[0] # return allToVar def parseHgvsFile(hgvsFname): " return dict: allele Id -> list of (hgvs coding, hgvs protein) " logging.info("Parsing %s" % hgvsFname) expHead = "#Symbol GeneID VariationID AlleleID Type Assembly NucleotideExpression NucleotideChange ProteinExpression ProteinChange UsedForNaming Submitted OnRefSeqGene" hg19Hgvs = defaultdict(list) hg38Hgvs = defaultdict(list) foundHead = False - for line in gzip.open(hgvsFname, "rt"): + for line in gzip.open(hgvsFname, "rt", encoding="utf8"): line = line.rstrip("\n") if line==expHead: foundHead = True if line.startswith("#Symbol") and "GeneID" in line: if not line == expHead: print ("Header of hgvs file has changed again. Code needs fixing. Look at the new fields and decide if we should show them.") print(("found:",line)) print(("expected:",expHead)) else: foundHead = True if line.startswith("#"): continue if not foundHead: raise Exception("Invalid Header. Code needs fixing, again.") @@ -525,40 +527,47 @@ 2-criteria provided, multiple submitters, no conflicts 1-criteria provided, single submitter 0-no assertion criteria provided 0-no assertion for the individual variant 0-no assertion provided 4-practice guideline 3-reviewed by expert panel given the review status, return an html string with the correct number of black stars """ revStatus = reviewStatus.strip().lower().replace(" ", "") starCount = None if revStatus == "nointerpretationforthesinglevariant": starCount = 0 + if revStatus == "noclassificationprovided" or revStatus == "noclassificationsfromunflaggedrecords": + starCount = 0 if revStatus=="criteriaprovided,conflictinginterpretations": starCount = 1 + if revStatus=="criteriaprovided,conflictingclassifications": # first seen in Mar 2024 + starCount = 1 if revStatus=="criteriaprovided,multiplesubmitters,noconflicts": starCount = 2 if revStatus=="criteriaprovided,singlesubmitter": starCount = 1 if revStatus == "practiceguideline": starCount = 4 if revStatus == "reviewedbyexpertpanel": starCount = 3 + + if revStatus == "noclassificationforthesinglevariant": + starCount = 0 if "noassertion" in revStatus or revStatus=="-": starCount = 0 if starCount == None: print(("Illegal review status: %s" % revStatus)) assert(False) emptyStarCount = 4-starCount htmlList = ["★"]*starCount htmlList.extend( ["☆"]*emptyStarCount ) htmlStr = "".join(htmlList) htmlStr += " <small>based on: %s</small>" % reviewStatus @@ -576,34 +585,34 @@ tmpFname = fh.name cmd= "zgrep -v '^#' %s > %s" % (summFname, tmpFname) assert(os.system(cmd)==0) cmd = "chmod a+r %s" % (tmpFname) # otherwise mysqld can't read the file assert(os.system(cmd)==0) cmd = "hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % (tmpFname) assert(os.system(cmd)==0) cmd = "hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % (tmpFname) assert(os.system(cmd)==0) def mostCurrentVersionName(): " return name of version on FTP site " - cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1' + cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1' verLine = subprocess.check_output(cmd, shell=True, encoding="utf8") - #<ReleaseSet Dated="2023-09-03" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Type="full" xsi:noNamespaceSchemaLocation="http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/clinvar_public_1.71.xsd"> - version = verLine.split(" ")[1].split("=")[1].strip('"') + #<ClinVarVariationRelease xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/ClinVar_VCV_2.0.xsd" ReleaseDate="2024-03-06"> + version = verLine.split(" ")[3].split("=")[1].split('"')[1] return version def downloadFromNcbi(skipDownload): " download files from NCBI and return list of filenames " today = date.today().isoformat() downDir = join("downloads", today) try: os.mkdir(downDir) except FileExistsError: pass version = mostCurrentVersionName() logging.info("Version on NCBI server is %s" % version) versionFname = join(downDir, "version.txt") open(versionFname, "w").write(version) @@ -663,30 +672,33 @@ key, val = info.split("=") if key=="ALLELEID": alleleId = val elif key=="MC": mc = val.split("|")[1].split(",")[0].replace("_", " ") if not mc in possMolConseqs: print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. " "Please update otto script %s" % (mc, __file__)) sys.exit(1) if mc and alleleId: ret[int(alleleId)] = mc return ret +def shortenSeq(s): + return s[:25]+"..."+s[-25:] + # ----------- MAIN -------------- if args==[] and not options.auto: parser.print_help() exit(1) outSuffix = "" if options.auto: filename, varFName, hgvsFname, vcfFname, summFname, versionFname = downloadFromNcbi(options.skipDownload) else: filename = args[0] varFname = args[1] hgvsFname = args[2] vcfFname = args[3] versionFname = args[4] @@ -703,58 +715,60 @@ cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*') hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)') # e.g. # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA # c.80A>T # c.740+356C>T # c.3839_3855del17 posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$') ifh = gzip.open(filename, "rt", encoding="latin_1") # check the header line1 = ifh.readline() -if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again +fileHeaders = line1.rstrip().split("\t") +if (fileHeaders != clinvarExpectHeaders): # test if clinvar has changed their header line again logging.error("ClinVar has changed their header line again") - logging.error("current header line: %s" % line1.replace("\t","|")) - logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|")) - raise Exception("code needs fixing") + logging.error("current header line: %s" % fileHeaders) + logging.error("expected header line: %s" % clinvarExpectHeaders) + raise Exception("Sorry. The code will need to be fixed.") # open out files hg19Bed = open("bed/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1") hg38Bed = open("bed/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1") hg19BedCnv = open("bed/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1") hg38BedCnv = open("bed/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1") longCount = 0 noAssCount = 0 noMcCount = 0 minusAccs = [] # convert lines for line in ifh: line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions line = line.rstrip("\n") row = line.split("\t") row = [f.replace("\0", "") for f in row ] alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \ irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \ refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \ - submCategories, varId, posVcf, refAllVcf, altAllVcf = row + submCategories, varId, posVcf, refAllVcf, altAllVcf, somClinImpact, somClinLastEval, revStatusClinImpact, oncogen, oncogenLastEval, revStatusOncogen = row + # SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity # changed in July 2018 when the "varId" was merged into this table. It used to be # in allToVar if varId=="": logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc)) if chrom=="" or assembly=="" or assembly=="NCBI36" or assembly=="na": noAssCount += 1 continue if assembly=="GRCh37": hgvs = hg19Hgvs elif assembly=="GRCh38": hgvs = hg38Hgvs else: @@ -762,30 +776,32 @@ assert(False) #hgvsTable = ";".join(hgvs[alleleId]) hgvsTable = json.dumps(hgvs[alleleId]) molConseq = allToVcf.get(int(alleleId)) if molConseq is None: #logging.debug("%s has no molConseq" % alleleId) noMcCount += 1 molConseq = "" if chromAcc=="NT_187513.1": chrom = "chrUn_KI270742v1" elif chromAcc=="NT_167222.1": chrom = "chrUn_gl000228" + elif chromAcc=='NT_187499.1': + chrom = "chrUn_KI270744v1" elif chromAcc=="NT_113889.1": if assembly=="GRCh37": chrom = "chrUn_gl000218" else: chrom = "chrUn_GL000218v1" elif chrom=="Un": print("Unknown fix/alt/patch chrom, please adapt clinVarToBed: %s, row: %s" % (chrom, row)) sys.exit(1) else: chrom = "chr"+chrom if chrom=="chrUn" and assembly=="GRCh38": print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc)) continue @@ -913,35 +929,52 @@ mouseOverParts = [ mouseOverName, "<b>Review Status: </b>"+starRatingHtml, "<b>Type: </b>"+allType, "<b>Consequence: </b>"+ molConseq, "<b>Significance: </b>"+clinSign, "<b>Origin: </b>"+origin, "<b>Phenotypes: </b>"+phenotypeList ] # removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars mouseOver = "<br>".join(mouseOverParts) snpAcc = "rs"+snpAcc # dbSnp links changed in mid/late 2022 + vcfDesc = chrom+":"+posVcf+":"+refAllVcf+">"+altAllVcf + if len(vcfDesc)>=255: + refAllVcf = shortenSeq(refAllVcf) + altAllVcf = shortenSeq(altAllVcf) + vcfDesc = chrom+":"+posVcf+":"+refAllVcf+">"+altAllVcf + + somImpactDesc = "" + if somClinImpact!="" and somClinImpact!="-": + somImpactDesc = "%s (%s, %s)" % (somClinImpact, somClinLastEval, revStatusClinImpact) + + oncogenDesc = "" + if oncogen!="" and oncogen!="-": + oncogenDesc = "%s (%s, %s)" % (oncogen, oncogenLastEval, revStatusOncogen) + row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \ blockCount, blockSizes, blockStarts, name, clinSign, starRatingHtml, allType, geneStr, molConseq, snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic, hgvsTable, "", numberSubmitters, lastEval, guidelines, otherIds, mouseOver, + vcfDesc, + somImpactDesc, + oncogenDesc, # these fields are the for the filters, not for the display pathoCode, originCode, allTypeCode, str(varLen), str(starCount), # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field varId, dbVarSsvAcc] # replace clinvar's placeholders with real empty fields newRow = [] for x in row: if x in ["-1", "-"]: newRow.append("") else: newRow.append(x) row = newRow if assembly=="GRCh37":