src/hg/utils/otto/clinvar/clinVarToBed a64bc5a6cfe9756104a86f633b0f0a01772c4b83

a64bc5a6cfe9756104a86f633b0f0a01772c4b83
lrnassar
  Wed Oct 6 16:55:59 2021 -0700
Updating a few things on otto. For the most part these are changes to the wrapper emails, which had not been updated in git, and the big changes are were done by engineers directly to the hive copy without updating the tree version. The big updates for clinvar were Max, and decipher was Braney I believe. To clarify, these now updated versions are the ones that have been running daily/weekly, so all of it has been in action already.

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index 77d6de0..c40f520 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -1,42 +1,42 @@
 #!/usr/bin/env python3
 
 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
-import tempfile
+import tempfile, json
 from collections import defaultdict
-from os.path import join, basename, dirname, isfile, abspath
+from os.path import join, basename, dirname, isfile, abspath, isdir
 from datetime import date, datetime, timedelta
 
 dbToUrl = {
     "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
     "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
     "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
     "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
     "OMIM" : "http://www.omim.org/entry/%s",
     "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
 }
 
 # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file
 # if they add a new one, this script must fail and tdb must be updated.
 possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiatior codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", ""])
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
-parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed
+parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed.
 
 Output goes into the current dir
 
-input file is ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
+Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/
 """) 
 
 parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") 
 parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed")
 parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
 parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
 (options, args) = parser.parse_args()
 
 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
 else:
     logging.basicConfig(level=logging.INFO)
 
 clinvarExpectHeaders = "#AlleleID	Type	Name	GeneID	GeneSymbol	HGNC_ID	ClinicalSignificance	ClinSigSimple	LastEvaluated	RS# (dbSNP)	nsv/esv (dbVar)	RCVaccession	PhenotypeIDS	PhenotypeList	Origin	OriginSimple	Assembly	ChromosomeAccession	Chromosome	Start	Stop	ReferenceAllele	AlternateAllele	Cytogenetic	ReviewStatus	NumberSubmitters	Guidelines	TestedInGTR	OtherIDs	SubmitterCategories	VariationID	PositionVCF	ReferenceAlleleVCF	AlternateAlleleVCF\n"
 # ==== FUNCTIONs =====
@@ -408,57 +408,82 @@
 #                print(("Header of variation_allele.txt.gz has changed again. Code needs fixing"))
 #                print("found:",line)
 #                print("expected:",expHead)
 #            else:
 #                foundHead = True
 #
 #        if line.startswith("#"):
 #            continue
 #        if not foundHead:
 #            raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing")
 #        fields = line.split("\t")
 #        allToVar[fields[2]]=fields[0]
 #    return allToVar
 
 def parseHgvsFile(hgvsFname):
-    " return dict: allele Id -> (hgvs coding, hgvs protein) "
+    " return dict: allele Id -> list of (hgvs coding, hgvs protein) "
     logging.info("Parsing %s" % hgvsFname)
     expHead	= "#Symbol	GeneID	VariationID	AlleleID	Type	Assembly	NucleotideExpression	NucleotideChange	ProteinExpression	ProteinChange	UsedForNaming	Submitted	OnRefSeqGene"
-    allToHgvs = {}
+    hg19Hgvs = defaultdict(list)
+    hg38Hgvs = defaultdict(list)
     foundHead = False
     for line in gzip.open(hgvsFname, "rt"):
         line = line.rstrip("\n")
         if line==expHead:
             foundHead = True
         if line.startswith("#Symbol") and "GeneID" in line:
             if not line == expHead:
-                print ("Header of hgvs file has changed again. Code needs fixing")
+                print ("Header of hgvs file has changed again. Code needs fixing. Look at the new fields and decide if we should show them.")
                 print(("found:",line))
                 print(("expected:",expHead))
             else:
                 foundHead = True
 
         if line.startswith("#"):
             continue
         if not foundHead:
             raise Exception("Invalid Header. Code needs fixing, again.")
 
         fields = line.split("\t")
-        varType = fields[4]
-        if "coding" in varType and not "LRG" in fields[6]:
-            allToHgvs[fields[3]]=(fields[6], fields[8])
-    return allToHgvs
+        sym, geneId, varId, alleleId, varType, assembly, nuclExpr, nuclChange, protExpr, protChange, usedForNaming, submitted, onLrg = fields
+        #varType = fields[4]
+        #if "coding" in varType and not "LRG" in fields[6]:
+        #assert("|" not in nuclChange and "|" not in protChange)
+        #hgvsRow = "|".join((nuclChange, protChange))
+        #if ";" in hgvsRow:
+            #print(sym, geneId, varId, hgvsRow)
+        # this creates invalid HGVS, but it's the only way for now, until we have real JSON tables
+        #hgvsRow = hgvsRow.replace(";", ",")
+        #assert(";" not in hgvsRow)
+        hgvsRow = (nuclChange, protChange)
+
+        if assembly=="GRCh37":
+            addTo = [hg19Hgvs]
+        elif assembly=="GRCh38":
+            addTo = [hg38Hgvs]
+        elif assembly=="na": # protein changes have no assembly and must be shown on all assemblies
+            addTo = [hg19Hgvs, hg38Hgvs]
+        elif assembly=="NCBI36":
+            continue
+        else:
+            print("invalid assembly in HGVS: %s, in %s" % (assembly, line))
+            assert(False)
+
+        for data in addTo:
+            data[alleleId].append( hgvsRow )
+
+    return hg19Hgvs, hg38Hgvs
 
 def accListToHtml(inStr):
     """
     given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions
     to HTML links to the databases. Also, pull out the dbVar SSV accession for its own field later.
     """
     dbVarAcc = ""
     newParts = []
     for part in inStr.split(","):
         fields = part.split(":")
         if len(fields)!=2:
             newParts.append(part)
             continue
 
         db, acc = fields
@@ -597,57 +622,57 @@
         #1       930248  789256  G       A       .       .       ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=
         if line.startswith("#"):
             continue
         chrom, start, end, ref, alt, qual, filter, info = line.rstrip("\n").split("\t")
         infoParts = info.split(";")
 
         alleleId = None
         mc = None
         for info in infoParts:
             key, val = info.split("=")
             if key=="ALLELEID":
                 alleleId = val
             elif key=="MC":
                 mc = val.split("|")[1].split(",")[0].replace("_", " ")
                 if not mc in possMolConseqs:
-                    print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. ",
+                    print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. "
                             "Please update otto script %s" % (mc, __file__))
                     sys.exit(1)
 
         if mc and alleleId:
             ret[int(alleleId)] = mc
 
     return ret
 
 # ----------- MAIN --------------
 if args==[] and not options.auto:
     parser.print_help()
     exit(1)
 
 outSuffix = ""
 
 if options.auto:
     filename, varFName, hgvsFname, vcfFname, summFname = downloadFromNcbi(options.skipDownload)
 else:
     filename = args[0]
     varFname = args[1]
     hgvsFname = args[2]
     vcfFname = args[3]
 
 #allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change
 allToVcf = parseVcf(vcfFname)
-allToHgvs = parseHgvsFile(hgvsFname)
+hg19Hgvs, hg38Hgvs = parseHgvsFile(hgvsFname)
 
 # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
 # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
 # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
 # NC_000010.11:g.(?_87925503)_(87965482_?)del
 cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
 
 hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
 
 # e.g.
 # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
 # c.80A>T
 # c.740+356C>T
 # c.3839_3855del17
@@ -679,46 +704,59 @@
     line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
     line = line.rstrip("\n")
     row = line.split("\t")
     row = [f.replace("\0", "") for f in row ]
 
     alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
       irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
       refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
       submCategories, varId, posVcf, refAllVcf, altAllVcf = row
 
     # changed in July 2018 when the "varId" was merged into this table. It used to be 
     # in allToVar
     if varId=="":
         logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
 
-    hgvsCod, hgvsProt =  allToHgvs.get(alleleId, ("", ""))
+    if chrom=="" or assembly=="" or assembly=="NCBI36" or assembly=="na":
+        noAssCount += 1
+        continue
+
+    if assembly=="GRCh37":
+        hgvs = hg19Hgvs
+    elif assembly=="GRCh38":
+        hgvs = hg38Hgvs
+    else:
+        print("Invalid assembly: %s, row %s" % (repr(assembly), repr(row)))
+        assert(False)
+
+    #hgvsTable =  ";".join(hgvs[alleleId])
+    hgvsTable =  json.dumps(hgvs[alleleId])
 
     molConseq = allToVcf.get(int(alleleId))
     if molConseq is None:
         logging.debug("%s has no molConseq" % alleleId)
         noMcCount += 1
         molConseq = ""
 
-    if chrom=="" or assembly=="" or assembly=="NCBI36":
-        noAssCount += 1
-        continue
+    if chromAcc=="NT_187513.1":
+        chrom = "chrUn_KI270742v1"
+    else:
+        chrom = "chr"+chrom
 
-    if chrom=="Un" and assembly=="GRCh38":
+    if chrom=="chrUn" and assembly=="GRCh38":
         print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc))
         continue
-    chrom = "chr"+chrom
 
     if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ?
         if assembly=="GRCh37":
             chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 
         else:
             chrom = "chrM"
 
     shortName, longName = shortenName(name)
 
     if len(shortName)>20:
         shortName = shortName[:17]+"..."
         longCount+=1
 
     if len(longName)>60:
         longName = longName[:60]+"..."
@@ -805,76 +843,74 @@
 
     if isCnv and dbVarSsvAcc!="":
         shortName = dbVarSsvAcc
     elif name=="":
         name = "No display name was provided by ClinVar for this variant"
         shortName = "NoName"
         mouseOverName = ""
 
     if len(name)>90:
         name = name[:90]+"..."
     name = varId+"|"+name
 
     if len(mouseOverName)>80:
         mouseOverName = mouseOverName[:80]+"..."
 
-    if len(hgvsProt) > 90:
-        hgvsProt = hgvsProt[:90]+"..."
-    if len(hgvsCod) > 90:
-        hgvsCod = hgvsCod[:90]+"..."
+    #if len(hgvsProt) > 90:
+        #hgvsProt = hgvsProt[:90]+"..."
+    #if len(hgvsCod) > 90:
+        #hgvsCod = hgvsCod[:90]+"..."
 
     phenotypeIds, _ = accListToHtml(phenotypeIds)
 
     starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus)
 
     mouseOverParts = [mouseOverName, "Type: "+allType, "Consequence: "+ molConseq, "Significance: "+clinSign,
             "Origin: "+origin, "Phenotypes: "+phenotypeList]
     # removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars
     mouseOver = ", ".join(mouseOverParts)
 
     row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \
         blockCount, blockSizes, blockStarts,
         name, clinSign, starRatingHtml, allType, geneStr, molConseq,
         snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
-        hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver,
+        hgvsTable, "", numberSubmitters, lastEval, guidelines, otherIds, mouseOver,
         # these fields are the for the filters, not for the display
         pathoCode, originCode, allTypeCode, str(varLen), str(starCount),
         # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field
         varId, dbVarSsvAcc]
 
     # replace clinvar's placeholders with real empty fields
     newRow = []
     for x in row:
         if x in ["-1", "-"]:
             newRow.append("")
         else:
              newRow.append(x)
     row = newRow
 
     if assembly=="GRCh37":
         ofh = hg19Bed
         if isCnv:
             ofh = hg19BedCnv
     elif assembly=="GRCh38":
         ofh = hg38Bed
         if isCnv:
             ofh = hg38BedCnv
     else:
         noAssCount +=1
         
-    #for o in row:
-        #print(o, type(o))
     ofh.write("\t".join(row))
     ofh.write("\n")
 
 hg19Bed.close()
 hg38Bed.close()
 hg19BedCnv.close()
 hg38BedCnv.close()
 
 logging.info("%d lines with feature name that was too long, shortened them" % longCount)
 logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount)
 logging.info("%d lines with -1/-1 start/end positions skipped. Examples: %s" % (len(minusAccs), minusAccs[:3]))
 
 fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name]
 
 # sort the files
@@ -901,22 +937,33 @@
     tmpFname = "%s.%s.temp.bb" % (base, db)
 
     outFname = basename(outFname) # strip the archive/ part, put into current dir
     cmd = "bedToBigBed -extraIndex=_dbVarSsvId -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname)
     mustRun(cmd)
 
     tableName = outFname.split('.')[0] # e.g. clinvarMain
 
     tmpFnames.append( (db, tableName, tmpFname, outFname) )
 
 # make it atomic by renaming the temp files to the final file names at the very end
 for db, tableName, tmpFname, finalFname in tmpFnames:
     os.rename(tmpFname, finalFname)
 
 # make sure that the "Data last updated" in the table browser is correct
+monYear = date.today().strftime("%Y-%m")
 for db, tableName, tmpFname, finalFname in tmpFnames:
-    cmd = "hgBbiDbLink %s %s /gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName, db, tableName)
+    gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName)
+    cmd = "hgBbiDbLink %s %s %s" % (db, tableName, gbdbFname)
+    mustRun(cmd)
+
+    # put a copy into the archive
+    outDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, monYear)
+    if not isdir(outDir):
+        os.mkdir(outDir)
+    archiveFname = join(outDir, basename(gbdbFname))
+    logging.info("Creating %s" % archiveFname)
+    cmd = "cp %s %s" % (finalFname, archiveFname)
     mustRun(cmd)
 
 loadSubmissions(summFname)
 
 print("ClinVar update end: OK")