src/hg/utils/otto/clinvar/clinVarToBed 1942abfbe86d5b3f5ed3ad0482a31a9354587e18

1942abfbe86d5b3f5ed3ad0482a31a9354587e18
max
  Tue Mar 26 09:30:03 2024 -0700
preparing clinvar for somatic mutations, no ticket yet

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index 79e71db..fa88597 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -35,31 +35,33 @@
 parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
 parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
 (options, args) = parser.parse_args()
 
 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
 else:
     logging.basicConfig(level=logging.INFO)
 
 if not options.isAlpha and os.getlogin()!="otto":
     print("Please do not run this script as a normal user, you will run into all sorts of permission problems.")
     print("to develop and test, ssh to otto, then run it with doUpdate.sh --alpha to generate the dev-only tracks.")
     print("When you're done, run doUpdate.sh without the --alpha and commit the change to the kent tree.")
     sys.exit(1)
 
-clinvarExpectHeaders = "#AlleleID	Type	Name	GeneID	GeneSymbol	HGNC_ID	ClinicalSignificance	ClinSigSimple	LastEvaluated	RS# (dbSNP)	nsv/esv (dbVar)	RCVaccession	PhenotypeIDS	PhenotypeList	Origin	OriginSimple	Assembly	ChromosomeAccession	Chromosome	Start	Stop	ReferenceAllele	AlternateAllele	Cytogenetic	ReviewStatus	NumberSubmitters	Guidelines	TestedInGTR	OtherIDs	SubmitterCategories	VariationID	PositionVCF	ReferenceAlleleVCF	AlternateAlleleVCF\n"
+#clinvarExpectHeaders = "#AlleleID	Type	Name	GeneID	GeneSymbol	HGNC_ID	ClinicalSignificance	ClinSigSimple	LastEvaluated	RS# (dbSNP)	nsv/esv (dbVar)	RCVaccession	PhenotypeIDS	PhenotypeList	Origin	OriginSimple	Assembly	ChromosomeAccession	Chromosome	Start	Stop	ReferenceAllele	AlternateAllele	Cytogenetic	ReviewStatus	NumberSubmitters	Guidelines	TestedInGTR	OtherIDs	SubmitterCategories	VariationID	PositionVCF	ReferenceAlleleVCF	AlternateAlleleVCF\n"
+clinvarExpectHeaders = "#AlleleID|Type|Name|GeneID|GeneSymbol|HGNC_ID|ClinicalSignificance|ClinSigSimple|LastEvaluated|RS# (dbSNP)|nsv/esv (dbVar)|RCVaccession|PhenotypeIDS|PhenotypeList|Origin|OriginSimple|Assembly|ChromosomeAccession|Chromosome|Start|Stop|ReferenceAllele|AlternateAllele|Cytogenetic|ReviewStatus|NumberSubmitters|Guidelines|TestedInGTR|OtherIDs|SubmitterCategories|VariationID|PositionVCF|ReferenceAlleleVCF|AlternateAlleleVCF|SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity".split("|")
+
 # ==== FUNCTIONs =====
 
 # benign = BN, likely benign = LB, conflicting = CF, likely pathogenic= LP, 
 # pathogenic = PG, other = OT, uncertain = VUS, RF=risk factor
 # colors were defined by Ana Benet Pages
 # - WARNING ON FILTER CHANGES:  -  see #28562
 # Never change these codes. They are in old sessions. If you change them, change the across the whole script
 # and also change the name of the .as field, so the cart variables will be different. You can add new values through:
 cnvColors = {
         "INS" : {
             "OT" : "225,165,0",
             "BN" : "250,214,69",
             "LB" : "250,214,69",
             "CF" : "225,165,0",
             "RF" : "225,165,0",
@@ -428,31 +430,31 @@
 #        if line.startswith("#"):
 #            continue
 #        if not foundHead:
 #            raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing")
 #        fields = line.split("\t")
 #        allToVar[fields[2]]=fields[0]
 #    return allToVar
 
 def parseHgvsFile(hgvsFname):
     " return dict: allele Id -> list of (hgvs coding, hgvs protein) "
     logging.info("Parsing %s" % hgvsFname)
     expHead	= "#Symbol	GeneID	VariationID	AlleleID	Type	Assembly	NucleotideExpression	NucleotideChange	ProteinExpression	ProteinChange	UsedForNaming	Submitted	OnRefSeqGene"
     hg19Hgvs = defaultdict(list)
     hg38Hgvs = defaultdict(list)
     foundHead = False
-    for line in gzip.open(hgvsFname, "rt"):
+    for line in gzip.open(hgvsFname, "rt", encoding="utf8"):
         line = line.rstrip("\n")
         if line==expHead:
             foundHead = True
         if line.startswith("#Symbol") and "GeneID" in line:
             if not line == expHead:
                 print ("Header of hgvs file has changed again. Code needs fixing. Look at the new fields and decide if we should show them.")
                 print(("found:",line))
                 print(("expected:",expHead))
             else:
                 foundHead = True
 
         if line.startswith("#"):
             continue
         if not foundHead:
             raise Exception("Invalid Header. Code needs fixing, again.")
@@ -525,40 +527,47 @@
         2-criteria provided, multiple submitters, no conflicts
         1-criteria provided, single submitter
         0-no assertion criteria provided
         0-no assertion for the individual variant
         0-no assertion provided
         4-practice guideline
         3-reviewed by expert panel
 
     given the review status, return an html string with the correct number of black stars
     """
     revStatus = reviewStatus.strip().lower().replace(" ", "")
     starCount = None
 
     if revStatus == "nointerpretationforthesinglevariant":
         starCount = 0
+    if revStatus == "noclassificationprovided" or revStatus == "noclassificationsfromunflaggedrecords":
+        starCount = 0
     if revStatus=="criteriaprovided,conflictinginterpretations":
         starCount = 1
+    if revStatus=="criteriaprovided,conflictingclassifications": # first seen in Mar 2024
+        starCount = 1
     if revStatus=="criteriaprovided,multiplesubmitters,noconflicts":
         starCount = 2
     if revStatus=="criteriaprovided,singlesubmitter":
         starCount = 1
     if revStatus == "practiceguideline":
         starCount = 4
     if revStatus == "reviewedbyexpertpanel":
         starCount = 3
+
+    if revStatus == "noclassificationforthesinglevariant":
+        starCount = 0
     if "noassertion" in revStatus or revStatus=="-":
         starCount = 0
 
     if starCount == None:
         print(("Illegal review status: %s" % revStatus))
         assert(False)
 
     emptyStarCount = 4-starCount
 
     htmlList = ["&#9733;"]*starCount 
     htmlList.extend( ["&#9734;"]*emptyStarCount )
 
     htmlStr = "".join(htmlList)
     htmlStr += "&nbsp;&nbsp;<small>based on: %s</small>" % reviewStatus
 
@@ -576,34 +585,34 @@
     tmpFname = fh.name
     cmd= "zgrep -v '^#' %s > %s" % (summFname, tmpFname)
     assert(os.system(cmd)==0)
 
     cmd = "chmod a+r %s" % (tmpFname) # otherwise mysqld can't read the file
     assert(os.system(cmd)==0)
 
     cmd = "hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % (tmpFname)
     assert(os.system(cmd)==0)
 
     cmd = "hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % (tmpFname)
     assert(os.system(cmd)==0)
 
 def mostCurrentVersionName():
     " return name of version on FTP site "
-    cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1'
+    cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1'
     verLine = subprocess.check_output(cmd, shell=True, encoding="utf8")
-    #<ReleaseSet Dated="2023-09-03" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Type="full" xsi:noNamespaceSchemaLocation="http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/clinvar_public_1.71.xsd">
-    version = verLine.split(" ")[1].split("=")[1].strip('"')
+    #<ClinVarVariationRelease xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/ClinVar_VCV_2.0.xsd" ReleaseDate="2024-03-06">
+    version = verLine.split(" ")[3].split("=")[1].split('"')[1]
     return version
 
 def downloadFromNcbi(skipDownload):
     " download files from NCBI and return list of filenames "
     today = date.today().isoformat()
     downDir = join("downloads", today)
     try:
         os.mkdir(downDir)
     except FileExistsError:
         pass
 
     version = mostCurrentVersionName()
     logging.info("Version on NCBI server is %s" % version)
     versionFname = join(downDir, "version.txt")
     open(versionFname, "w").write(version)
@@ -663,30 +672,33 @@
             key, val = info.split("=")
             if key=="ALLELEID":
                 alleleId = val
             elif key=="MC":
                 mc = val.split("|")[1].split(",")[0].replace("_", " ")
                 if not mc in possMolConseqs:
                     print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. "
                             "Please update otto script %s" % (mc, __file__))
                     sys.exit(1)
 
         if mc and alleleId:
             ret[int(alleleId)] = mc
 
     return ret
 
+def shortenSeq(s):
+    return s[:25]+"..."+s[-25:]
+
 # ----------- MAIN --------------
 if args==[] and not options.auto:
     parser.print_help()
     exit(1)
 
 outSuffix = ""
 
 if options.auto:
     filename, varFName, hgvsFname, vcfFname, summFname, versionFname = downloadFromNcbi(options.skipDownload)
 else:
     filename = args[0]
     varFname = args[1]
     hgvsFname = args[2]
     vcfFname = args[3]
     versionFname = args[4]
@@ -703,58 +715,60 @@
 cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
 
 hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
 
 # e.g.
 # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
 # c.80A>T
 # c.740+356C>T
 # c.3839_3855del17
 posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')
 
 ifh = gzip.open(filename, "rt", encoding="latin_1")
 
 # check the header
 line1 = ifh.readline()
-if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again
+fileHeaders = line1.rstrip().split("\t")
+if (fileHeaders != clinvarExpectHeaders): # test if clinvar has changed their header line again
     logging.error("ClinVar has changed their header line again")
-    logging.error("current header line: %s" % line1.replace("\t","|"))
-    logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|"))
-    raise Exception("code needs fixing")
+    logging.error("current header line: %s" % fileHeaders)
+    logging.error("expected header line: %s" % clinvarExpectHeaders)
+    raise Exception("Sorry. The code will need to be fixed.")
 
 # open out files
 hg19Bed = open("bed/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1")
 hg38Bed = open("bed/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1")
 hg19BedCnv = open("bed/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1")
 hg38BedCnv = open("bed/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1")
 
 longCount = 0
 noAssCount = 0
 noMcCount = 0
 minusAccs = []
 
 # convert lines
 for line in ifh:
     line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
     line = line.rstrip("\n")
     row = line.split("\t")
     row = [f.replace("\0", "") for f in row ]
 
     alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
       irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
       refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
-      submCategories, varId, posVcf, refAllVcf, altAllVcf = row
+      submCategories, varId, posVcf, refAllVcf, altAllVcf, somClinImpact, somClinLastEval, revStatusClinImpact, oncogen, oncogenLastEval, revStatusOncogen = row
+   # SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity
 
     # changed in July 2018 when the "varId" was merged into this table. It used to be 
     # in allToVar
     if varId=="":
         logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
 
     if chrom=="" or assembly=="" or assembly=="NCBI36" or assembly=="na":
         noAssCount += 1
         continue
 
     if assembly=="GRCh37":
         hgvs = hg19Hgvs
     elif assembly=="GRCh38":
         hgvs = hg38Hgvs
     else:
@@ -762,30 +776,32 @@
         assert(False)
 
     #hgvsTable =  ";".join(hgvs[alleleId])
     hgvsTable =  json.dumps(hgvs[alleleId])
 
     molConseq = allToVcf.get(int(alleleId))
     if molConseq is None:
         #logging.debug("%s has no molConseq" % alleleId)
         noMcCount += 1
         molConseq = ""
 
     if chromAcc=="NT_187513.1":
         chrom = "chrUn_KI270742v1"
     elif chromAcc=="NT_167222.1":
         chrom = "chrUn_gl000228"
+    elif chromAcc=='NT_187499.1':
+        chrom = "chrUn_KI270744v1"
     elif chromAcc=="NT_113889.1":
         if assembly=="GRCh37":
             chrom = "chrUn_gl000218"
         else:
             chrom = "chrUn_GL000218v1"
     elif chrom=="Un":
         print("Unknown fix/alt/patch chrom, please adapt clinVarToBed: %s, row: %s" % (chrom, row))
         sys.exit(1)
     else:
         chrom = "chr"+chrom
 
     if chrom=="chrUn" and assembly=="GRCh38":
         print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc))
         continue
 
@@ -913,35 +929,52 @@
 
     mouseOverParts = [
             mouseOverName,
             "<b>Review Status: </b>"+starRatingHtml,
             "<b>Type: </b>"+allType,
             "<b>Consequence: </b>"+ molConseq,
             "<b>Significance: </b>"+clinSign,
             "<b>Origin: </b>"+origin,
             "<b>Phenotypes: </b>"+phenotypeList
             ]
     # removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars
     mouseOver = "<br>".join(mouseOverParts)
 
     snpAcc = "rs"+snpAcc # dbSnp links changed in mid/late 2022
 
+    vcfDesc = chrom+":"+posVcf+":"+refAllVcf+">"+altAllVcf
+    if len(vcfDesc)>=255:
+	    refAllVcf = shortenSeq(refAllVcf)
+	    altAllVcf = shortenSeq(altAllVcf)
+	    vcfDesc = chrom+":"+posVcf+":"+refAllVcf+">"+altAllVcf
+
+    somImpactDesc = ""
+    if somClinImpact!="" and somClinImpact!="-":
+	    somImpactDesc = "%s (%s, %s)" % (somClinImpact, somClinLastEval, revStatusClinImpact)
+
+    oncogenDesc = ""
+    if oncogen!="" and oncogen!="-":
+	    oncogenDesc = "%s (%s, %s)" % (oncogen, oncogenLastEval, revStatusOncogen)
+
     row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \
         blockCount, blockSizes, blockStarts,
         name, clinSign, starRatingHtml, allType, geneStr, molConseq,
         snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
         hgvsTable, "", numberSubmitters, lastEval, guidelines, otherIds, mouseOver,
+	vcfDesc, 
+	somImpactDesc, 
+	oncogenDesc,
         # these fields are the for the filters, not for the display
         pathoCode, originCode, allTypeCode, str(varLen), str(starCount),
         # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field
         varId, dbVarSsvAcc]
 
     # replace clinvar's placeholders with real empty fields
     newRow = []
     for x in row:
         if x in ["-1", "-"]:
             newRow.append("")
         else:
              newRow.append(x)
     row = newRow
 
     if assembly=="GRCh37":