4ae81a7984cf1609801f0e4e96d3c9be009ab7e5 max Thu Dec 21 03:48:55 2023 -0800 trying to make clinvar otto job work again diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index 188ac7c..b3dfbf6 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -1,29 +1,23 @@ #!/usr/bin/env python3 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess import tempfile, json from collections import defaultdict from os.path import join, basename, dirname, isfile, abspath, isdir from datetime import date, datetime, timedelta -if os.getlogin()!="otto": - print("Please do not run this script as a normal user, you will run into all sorts of permission problems.") - print("to develop and test, ssh to otto, then run it with doUpdate.sh --alpha to generate the dev-only tracks.") - print("When you're done, run doUpdate.sh without the --alpha and commit the change to the kent tree.") - sys.exit(1) - dbToUrl = { "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/", "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s", "OMIM Allelic Variant" : "http://www.omim.org/entry/%s", "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s", "OMIM" : "http://www.omim.org/entry/%s", "MONDO" : "https://monarchinitiative.org/disease/%s", "ClinGen" : "https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=%s", "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s" } # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file # if they add a new one, this script must fail and tdb must be updated. possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiatior codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", "initiator codon variant", ""]) @@ -35,30 +29,36 @@ Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/ """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("", "--alpha", dest="isAlpha", action="store_true", help="Default target is /gbdb/{hg19,hg38}/bbi/clinvar. With this option, we use /gbdb/{hg19,hg38}/bbi/clinvarAlpha, the hgwdev only version of the track, see human/clinvarAlpha.ra in trackDb") parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed") parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing") parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.") (options, args) = parser.parse_args() if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) +if not options.isAlpha and os.getlogin()!="otto": + print("Please do not run this script as a normal user, you will run into all sorts of permission problems.") + print("to develop and test, ssh to otto, then run it with doUpdate.sh --alpha to generate the dev-only tracks.") + print("When you're done, run doUpdate.sh without the --alpha and commit the change to the kent tree.") + sys.exit(1) + clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n" # ==== FUNCTIONs ===== # benign = BN, likely benign = LB, conflicting = CF, likely pathogenic= LP, # pathogenic = PG, other = OT, uncertain = VUS, RF=risk factor # colors were defined by Ana Benet Pages # - WARNING ON FILTER CHANGES: - see #28562 # Never change these codes. They are in old sessions. If you change them, change the across the whole script # and also change the name of the .as field, so the cart variables will be different. You can add new values through: cnvColors = { "INS" : { "OT" : "225,165,0", "BN" : "250,214,69", "LB" : "250,214,69", "CF" : "225,165,0", @@ -754,31 +754,31 @@ continue if assembly=="GRCh37": hgvs = hg19Hgvs elif assembly=="GRCh38": hgvs = hg38Hgvs else: print("Invalid assembly: %s, row %s" % (repr(assembly), repr(row))) assert(False) #hgvsTable = ";".join(hgvs[alleleId]) hgvsTable = json.dumps(hgvs[alleleId]) molConseq = allToVcf.get(int(alleleId)) if molConseq is None: - logging.debug("%s has no molConseq" % alleleId) + #logging.debug("%s has no molConseq" % alleleId) noMcCount += 1 molConseq = "" if chromAcc=="NT_187513.1": chrom = "chrUn_KI270742v1" elif chromAcc=="NT_167222.1": chrom = "chrUn_gl000228" elif chromAcc=="NT_113889.1": if assembly=="GRCh37": chrom = "chrUn_gl000218" else: chrom = "chrUn_GL000218v1" elif chrom=="Un": print("Unknown fix/alt/patch chrom, please adapt clinVarToBed: %s, row: %s" % (chrom, row)) sys.exit(1) @@ -974,52 +974,55 @@ compareWithLastMonth(fname, options.maxDiff) bigBedDir = "bigBed" if options.isAlpha: bigBedDir = "bigBedAlpha" # convert to bigBed without the date in the fname, like clinvarMain.hg19.bb tmpFnames = [] for fname in fnames: parts = fname.split(".") if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed base, db, date, ext = parts else: base, db, ext = parts + base = basename(base) + tmpFname = "%s.%s.temp.bb" % (base, db) - outFname = "%s/%s.%s.bb" % (bigBedDir, base, db) - outFname = basename(outFname) # strip the archive/ part, put into current dir cmd = "bedToBigBed -extraIndex=_dbVarSsvId -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname) mustRun(cmd) - tableName = outFname.split('.')[0] # e.g. clinvarMain + finalFname = "%s/%s.%s.bb" % (bigBedDir, base, db) + tableName = finalFname.split('.')[0] # e.g. clinvarMain - tmpFnames.append( (db, tableName, tmpFname, outFname) ) + tmpFnames.append( (db, tableName, tmpFname, finalFname) ) # make it atomic by renaming the temp files to the final file names at the very end for db, tableName, tmpFname, finalFname in tmpFnames: + logging.debug("Renaming %s to %s" % (tmpFname, finalFname)) os.rename(tmpFname, finalFname) # create the version.txt file clinvarVersion = open(versionFname).read() versionString = "ClinVar Release: %s, converted at UCSC on %s" % (clinvarVersion, date.today().strftime("%Y-%m-%d")) outFname = join(bigBedDir, "version.txt") with open(outFname, "w") as ofh: ofh.write(versionString) +# put a copy of the files into the archive if not options.isAlpha: for db, tableName, tmpFname, finalFname in tmpFnames: # not doing this anymore, using bigDataUrl now in trackDb #cmd = "hgBbiDbLink %s %s %s" % (db, tableName, gbdbFname) #mustRun(cmd) yearMonDay = date.today().strftime("%Y-%m-%d") gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName) # put a copy into the archive outDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay) if not isdir(outDir): os.mkdir(outDir) archiveFname = join(outDir, basename(gbdbFname)) logging.info("Creating %s" % archiveFname) cmd = "cp %s %s" % (finalFname, archiveFname)