61dccdb3f1405aeee1c397b5247e673cab010911 max Thu Dec 14 05:02:54 2023 -0800 adding version.txt files to clinvar. Also starting work on the --alpha switch, but not tested yet

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index 85c298c..188ac7c 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -1,47 +1,54 @@
 #!/usr/bin/env python3
 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
 import tempfile, json
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile, abspath, isdir
 from datetime import date, datetime, timedelta
 
+if os.getlogin()!="otto":
+    print("Please do not run this script as a normal user, you will run into all sorts of permission problems.")
+    print("to develop and test, ssh to otto, then run it with doUpdate.sh --alpha to generate the dev-only tracks.")
+    print("When you're done, run doUpdate.sh without the --alpha and commit the change to the kent tree.")
+    sys.exit(1)
+
 dbToUrl = {
     "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
     "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
     "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
     "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
     "OMIM" : "http://www.omim.org/entry/%s",
     "MONDO" : "https://monarchinitiative.org/disease/%s",
     "ClinGen" : "https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=%s",
     "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
 }
 
 # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file
 # if they add a new one, this script must fail and tdb must be updated.
-possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiatior codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", ""])
+possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiatior codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", "initiator codon variant", ""])
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed. Output goes into the current dir
 
 Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/
 """)
 
 parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+parser.add_option("", "--alpha", dest="isAlpha", action="store_true", help="Default target is /gbdb/{hg19,hg38}/bbi/clinvar. With this option, we use /gbdb/{hg19,hg38}/bbi/clinvarAlpha, the hgwdev only version of the track, see human/clinvarAlpha.ra in trackDb")
 parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed")
 parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
 parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
 (options, args) = parser.parse_args()
 
 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
 else:
     logging.basicConfig(level=logging.INFO)
 
 clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n"
 # ==== FUNCTIONs =====
 
 # benign = BN, likely benign = LB, conflicting = CF, likely pathogenic= LP,
 # pathogenic = PG, other = OT, uncertain = VUS, RF=risk factor
@@ -381,31 +388,31 @@
     percDiff = (todayCount-previousCount)/float(todayCount)
     if percDiff < 0 and maxDiff:
         raise Exception("line count change negative: %s %d, %s %d. Run without --maxDiff to ignore this error. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % \
             (previousFname, previousCount, fname, todayCount))
     if percDiff > maxDiff:
         raise Exception("line count increased too much: yesterday %d , today %d. Check if the input file is OK. If yes, remove the --maxDiff option from the clinVarToBed, call it only with the --auto option. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % (previousCount, todayCount))
 
     # see RM 14350 do more than just linecount
     # see http://stackoverflow.com/questions/1566461/how-to-count-differences-between-two-files-on-linux
     cmd = "diff %s %s --speed-large-files | grep '^[\>\<]' | wc -l" % (fname, previousFname)
     diffCount = int(subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT))
     logging.info("%s: %d lines changed" % (fname, diffCount))
 
     percDiff = float(diffCount)/todayCount
     if percDiff > maxDiff:
         logging.info(cmd)
-        diffFname = abspath(join("archive", "lastDiff.txt"))
+        diffFname = abspath(join("downloads", "lastDiff.txt"))
         cmd = "diff %s %s --speed-large-files > %s" % (fname, previousFname, diffFname)
         os.system(cmd) # ret code is not 0, as there are differences, so no need to check
         raise Exception("too many differences: %d out of %d lines. Look at the differences n the file %s. If you are OK with them, run '/hive/data/outside/otto/clinvar/doUpdate.sh -nocheck'" % (diffCount, todayCount, diffFname))
 
 # I'm leaving this in the code, because I would not be surprised if one day we have to go back and parse
 # the other files again
 #def parseVariationAlleleFile(varFname):
 #    """ return dict allele Id -> variant ID """
 #    logging.info("Parsing %s" % varFname)
 #    allToVar = {}
 #    foundHead = False
 #    expHead = "#VariationID Type AlleleID Interpreted"
 #    for line in gzip.open(varFname):
 #        line = line.rstrip("\n")
 #        if line==expHead:
@@ -567,71 +574,88 @@
     logging.info("Loading submissions table %s into mysql, hg19 and also hg38" % summFname)
     fh = tempfile.NamedTemporaryFile(prefix="clinVarToBed", suffix=".tsv")
     tmpFname = fh.name
     cmd= "zgrep -v '^#' %s > %s" % (summFname, tmpFname)
     assert(os.system(cmd)==0)
     cmd = "chmod a+r %s" % (tmpFname) # otherwise mysqld can't read the file
     assert(os.system(cmd)==0)
     cmd = "hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % (tmpFname)
     assert(os.system(cmd)==0)
     cmd = "hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % (tmpFname)
     assert(os.system(cmd)==0)
 
+def mostCurrentVersionName():
+    " return name of version on FTP site "
+    cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1'
+    verLine = subprocess.check_output(cmd, shell=True, encoding="utf8")
+    #<ReleaseSet Dated="2023-09-03" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Type="full" xsi:noNamespaceSchemaLocation="http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/clinvar_public_1.71.xsd">
+    version = verLine.split(" ")[1].split("=")[1].strip('"')
+    return version
+
 def downloadFromNcbi(skipDownload):
-    " download files from NCBI and return triple of filenames "
+    " download files from NCBI and return list of filenames "
     today = date.today().isoformat()
-    outSuffix = "%s." % today
+    downDir = join("downloads", today)
+    try:
+        os.mkdir(downDir)
+    except FileExistsError:
+        pass
+
+    version = mostCurrentVersionName()
+    logging.info("Version on NCBI server is %s" % version)
+    versionFname = join(downDir, "version.txt")
+    open(versionFname, "w").write(version)
 
     baseUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/"
     url = baseUrl+"variant_summary.txt.gz"
-    outFname = "archive/variant_summary.%s.txt.gz" % today
+    outFname = join(downDir, "variant_summary.txt.gz")
     filename = outFname
 
     # get the new variant allele file
     varUrl = baseUrl+"variation_allele.txt.gz"
-    varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today)
+    varFname = join(downDir, "variation_allele.txt.gz")
 
     # get the new hgvs allele file
     hgvsUrl = baseUrl+"hgvs4variation.txt.gz"
-    hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today)
+    hgvsFname = join(downDir, "hgvs4variation.txt.gz")
 
     # get the summary file
     summUrl = baseUrl+"submission_summary.txt.gz"
-    summFname = join(dirname(filename), "submission_summary.%s.txt.gz" % today)
+    summFname = join(downDir, "submission_summary.txt.gz")
 
     # get the new VCF file
     vcfUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz"
-    vcfFname = join(dirname(filename), "GRCh38.clinvar.%s.vcf.gz" % today)
+    vcfFname = join(downDir, "GRCh38.clinvar.vcf.gz")
     if not (isfile(vcfFname)):
         logging.info("Downlading %s to %s" % (vcfUrl, vcfFname))
         open(vcfFname, "wb").write(urllib.request.urlopen(vcfUrl).read())
 
     if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname) and isfile(summFname)):
         logging.info("Downlading %s to %s" % (url, outFname))
         open(outFname, "wb").write(urllib.request.urlopen(url).read())
         logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname))
         open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read())
         logging.info("Downlading %s to %s" % (varUrl, varFname))
         open(varFname, "wb").write(urllib.request.urlopen(varUrl).read())
         logging.info("Downlading %s to %s" % (summUrl, summFname))
         open(summFname, "wb").write(urllib.request.urlopen(summUrl).read())
 
-    return filename, varFname, hgvsFname, vcfFname, summFname
+    return filename, varFname, hgvsFname, vcfFname, summFname, versionFname
 
 def parseVcf(vcfFname):
     " parse VCF and return as dict alleleId -> (molConseq) "
     logging.info("Parsing %s" % vcfFname)
     ret = {}
     for line in gzip.open(vcfFname, "rt", encoding="utf_8"):
         # #CHROM POS ID REF ALT QUAL FILTER INFO
         #1 930248 789256 G A . . ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=
         if line.startswith("#"):
             continue
         chrom, start, end, ref, alt, qual, filter, info = line.rstrip("\n").split("\t")
         infoParts = info.split(";")
         alleleId = None
         mc = None
@@ -647,36 +671,37 @@
             sys.exit(1)
         if mc and alleleId:
             ret[int(alleleId)] = mc
     return ret
 
 # ----------- MAIN --------------
 if args==[] and not options.auto:
     parser.print_help()
     exit(1)
 
 outSuffix = ""
 
 if options.auto:
-    filename, varFName, hgvsFname, vcfFname, summFname = downloadFromNcbi(options.skipDownload)
+    filename, varFName, hgvsFname, vcfFname, summFname, versionFname = downloadFromNcbi(options.skipDownload)
 else:
     filename = args[0]
     varFname = args[1]
     hgvsFname = args[2]
     vcfFname = args[3]
+    versionFname = args[4]
 
 #allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change
 allToVcf = parseVcf(vcfFname)
 hg19Hgvs, hg38Hgvs = parseHgvsFile(hgvsFname)
 
 # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
 # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
 # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
 # NC_000010.11:g.(?_87925503)_(87965482_?)del
 cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
 
 hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
 
 # e.g.
@@ -685,34 +710,34 @@
 # c.740+356C>T
 # c.3839_3855del17
 posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')
 
 ifh = gzip.open(filename, "rt", encoding="latin_1")
 
 # check the header
 line1 = ifh.readline()
 if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again
     logging.error("ClinVar has changed their header line again")
     logging.error("current header line: %s" % line1.replace("\t","|"))
     logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|"))
     raise Exception("code needs fixing")
 
 # open out files
-hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1")
-hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1")
-hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1")
-hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1")
+hg19Bed = open("bed/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1")
+hg38Bed = open("bed/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1")
+hg19BedCnv = open("bed/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1")
+hg38BedCnv = open("bed/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1")
 
 longCount = 0
 noAssCount = 0
 noMcCount = 0
 minusAccs = []
 
 # convert lines
 for line in ifh:
     line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
     line = line.rstrip("\n")
     row = line.split("\t")
     row = [f.replace("\0", "") for f in row ]
     alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
         irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
@@ -936,58 +961,76 @@
     logging.info("%d lines with -1/-1 start/end positions skipped. Examples: %s" % (len(minusAccs), minusAccs[:3]))
 
 fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name]
 
 # sort the files
 for fname in fnames:
     cmd = "bedSort %s %s" % (fname, fname)
     logging.debug(cmd)
     assert(os.system(cmd)==0)
 
 # check new bed files against last months's bed files
 if options.maxDiff:
     for fname in fnames:
         compareWithLastMonth(fname, options.maxDiff)
 
+bigBedDir = "bigBed"
+if options.isAlpha:
+    bigBedDir = "bigBedAlpha"
+
 # convert to bigBed without the date in the fname, like clinvarMain.hg19.bb
 tmpFnames = []
 for fname in fnames:
     parts = fname.split(".")
     if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed
         base, db, date, ext = parts
     else:
         base, db, ext = parts
-    outFname = "%s.%s.bb" % (base, db)
     tmpFname = "%s.%s.temp.bb" % (base, db)
+    outFname = "%s/%s.%s.bb" % (bigBedDir, base, db)
     outFname = basename(outFname) # strip the archive/ part, put into current dir
 
     cmd = "bedToBigBed -extraIndex=_dbVarSsvId -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname)
     mustRun(cmd)
 
     tableName = outFname.split('.')[0] # e.g. clinvarMain
     tmpFnames.append( (db, tableName, tmpFname, outFname) )
 
 # make it atomic by renaming the temp files to the final file names at the very end
 for db, tableName, tmpFname, finalFname in tmpFnames:
     os.rename(tmpFname, finalFname)
 
-# make sure that the "Data last updated" in the table browser is correct
-monYear = date.today().strftime("%Y-%m")
+# create the version.txt file
+clinvarVersion = open(versionFname).read()
+versionString = "ClinVar Release: %s, converted at UCSC on %s" % (clinvarVersion, date.today().strftime("%Y-%m-%d"))
+outFname = join(bigBedDir, "version.txt")
+with open(outFname, "w") as ofh:
+    ofh.write(versionString)
+
+if not options.isAlpha:
 for db, tableName, tmpFname, finalFname in tmpFnames:
-    gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName)
-    cmd = "hgBbiDbLink %s %s %s" % (db, tableName, gbdbFname)
-    mustRun(cmd)
+    # not doing this anymore, using bigDataUrl now in trackDb
+    #cmd = "hgBbiDbLink %s %s %s" % (db, tableName, gbdbFname)
+    #mustRun(cmd)
+    yearMonDay = date.today().strftime("%Y-%m-%d")
+    gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName)
     # put a copy into the archive
-    outDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, monYear)
+    outDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay)
     if not isdir(outDir):
         os.mkdir(outDir)
     archiveFname = join(outDir, basename(gbdbFname))
     logging.info("Creating %s" % archiveFname)
     cmd = "cp %s %s" % (finalFname, archiveFname)
     mustRun(cmd)
 
+    # add the version.txt files
+    for db in ["hg38", "hg19"]:
+        archDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay)
+        cmd = "cp %s/version.txt %s" % (bigBedDir, archDir)
+        mustRun(cmd)
+
 loadSubmissions(summFname)
 
 print("ClinVar update end: OK")