70dea5ffaf352ada83e54b03aa0774bf515cbb65
max
Mon Sep 21 03:39:46 2020 -0700
adding the Benet-Pages colorscheme to clinvar, fixing up the 'g.' names
and auto-loading the clinvarSub table for Braney's new bigLolli clinvar
submissions track. Also fixing the categorization of submissions. refs #25207, #25405 and emails with Chris/Braney.
diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index c3b479d..de72ff2 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -1,18 +1,19 @@
#!/usr/bin/env python3
import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
+import tempfile
from collections import defaultdict
from os.path import join, basename, dirname, isfile, abspath
from datetime import date, datetime, timedelta
dbToUrl = {
"dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
"UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
"OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
"MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
"OMIM" : "http://www.omim.org/entry/%s",
"Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
}
# since we're filtering on it, we make sure that we have all molecular consequences in the tdb file
# if they add a new one, this script must fail and tdb must be updated.
@@ -25,33 +26,120 @@
input file is ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
""")
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed")
parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
(options, args) = parser.parse_args()
if options.debug:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
-clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID\n"
+clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n"
# ==== FUNCTIONs =====
+# benign = B, likely benign = LB, conflicting = CF, likely pathogenic= LP,
+# pathogenic = P, other = OT, uncertain = VUS, RF=risk factor
+# colors were defined by Ana Benet Pages
+# see colortable at https://drive.google.com/file/d/1-iqHrZTwT1wuatNA_A5b6knyqE6JG14f/view?usp=sharing
+cnvColors = {
+ "INS" : {
+ "OT" : "225,165,0",
+ "B" : "250,214,69",
+ "LB" : "250,214,69",
+ "CF" : "225,165,0",
+ "RF" : "225,165,0",
+ "VUS" : "225,165,0",
+ "LP" : "199,114,3",
+ "P" : "199,114,3"
+ },
+ "LOSS" : {
+ "OT" : "255,0,0",
+ "B" : "255,98,119",
+ "LB" : "255,98,119",
+ "CF" : "255,0,0",
+ "VUS" : "255,0,0",
+ "RF" : "255,0,0",
+ "LP" : "153,0,0",
+ "P" : "153,0,0"
+ },
+ "GAIN" : {
+ "OT" : "0,0,225",
+ "B" : "77,184,255",
+ "LB" : "77,184,255",
+ "CF" : "0,0,225",
+ "VUS" : "0,0,225",
+ "RF" : "0,0,225",
+ "LP" : "0,0,179",
+ "P" : "0,0,179"
+ },
+ "STRUCT" : {
+ "OT" : "0,210,0",
+ "B" : "0,255,0",
+ "LB" : "0,255,0",
+ "CF" : "0,210,0",
+ "VUS" : "0,210,0",
+ "RF" : "0,210,0",
+ "LP" : "0,128,0",
+ "P" : "0,128,0"
+ },
+ "INV" : {
+ "OT" : "192,0,192",
+ "B" : "255,128,255",
+ "LB" : "255,128,255",
+ "CF" : "192,0,192",
+ "VUS" : "192,0,192",
+ "RF" : "192,0,192",
+ "LP" : "128,0,128",
+ "P" : "128,0,128"
+ },
+ "SEQALT" : {
+ "OT" : "128,128,128",
+ "B" : "191,191,191",
+ "LB" : "191,191,191",
+ "CF" : "128,128,128",
+ "VUS" : "128,128,128",
+ "RF" : "128,128,128",
+ "LP" : "64,64,64",
+ "P" : "64,64,64"
+ },
+ "SEQLEN" : {
+ "OT" : "0,179,179",
+ "B" : "0,255,255",
+ "LB" : "0,255,255",
+ "CF" : "0,179,179",
+ "VUS" : "0,179,179",
+ "RF" : "0,179,179",
+ "LP" : "0,102,102",
+ "P" : "0,102,102"
+ },
+ "SUBST" : {
+ "OT" : "128,128,128",
+ "B" : "191,191,191",
+ "LB" : "191,191,191",
+ "CF" : "128,128,128",
+ "VUS" : "128,128,128",
+ "RF" : "128,128,128",
+ "LP" : "64,64,64",
+ "P" : "64,64,64"
+ },
+}
+
def mustRun(cmd):
logging.debug(cmd)
ret = os.system(cmd)
if ret!=0:
print(("Could not run: %s" % cmd))
sys.exit(1)
def shortenName(name):
"make the clinvar name shorter and keep a long version for the mouse over "
cnvMatch = cnvRe.match(name)
#hgvsMatch = hgvsRe.match(name)
if ":" in name and not "chr" in name:
longName = name.split(":")[0]
else:
@@ -63,96 +151,120 @@
# some names include the original assembly as a prefix
band, copy = cnvMatch.groups()
if copy == None:
copy = ""
shortName = band+copy
elif name.startswith("GRCh"):
# GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
print(name)
assert False # name that starts with GRCh but does not match our regex should not happen
elif "." in name:
# many names look like NM_000274.3(OAT):c.550_552delGCT (p.Ala184del)
# so we remove the gene and the protein description
if ":" in name:
name = name.split(":")[1]
longName = name.split(" ")[0]
+
+ # handle genomic HGVS NC_000010.11:g.(?_87925503)_(87965482_?)del
+ if name.startswith("NC_") and "g." in name and ")" in name:
+ shortName = name
+ else:
+ # strip away the final description like " (p.Ala184del)"
name = name.split(" ")[0].split("[")[0].split("(")[0]
shortName = name
if name.endswith("del"):
shortName = "del"
elif name.endswith("ins"):
shortName = "ins"
elif name.endswith("dup"):
shortName = "dup"
else:
posMatch = posRe.match(name)
if posMatch!=None:
pos, dupDelIns, change = posMatch.groups()
if dupDelIns==None:
dupDelIns= ""
if change==None or change=="":
shortName = pos+dupDelIns
else:
shortName = dupDelIns+change
return shortName, longName
-def allTypeToCode(allType):
+def allTypeToCode(allType, debugId):
"""
Map the allele type field from ClinVal to one of these classes for filtering:
- SNV = single nucleotide variant
- INS = insertion
- DEL = deletion
- INDEL = indel
- DUPL = duplication
- OTH = other
+ STRUCT = structual alteration
+ LOSS = deletion, copy number loss, mobile element deletion
+ GAIN = duplication, tandem dupl, intrachrom transp, copy number gain
+ INS = insertion, mobile or novel element insertion
+ INV = inversion
+ SEQALT = catch-all category: multiallele cnv, undetermined, others, not defined
+ SEQLEN = short tandem repeat, short repeat, rep exp, rep contraction, NT expansion
+ SUBST = SNV, MNV, multi nucl polymophrism, indel
+ OTH = protein only
+
+ # categories were defined by Ana Benet Pages
+ # see table at https://drive.google.com/file/d/1-iqHrZTwT1wuatNA_A5b6knyqE6JG14f/view?usp=sharing
In 3/2020, these were all possible values:
fusion 6 0.000%
complex 61 0.005%
protein only 103 0.008%
NT expansion 142 0.011%
Translocation 302 0.022%
inversion 583 0.043%
undetermined variant 866 0.064%
insertion 7133 0.530%
indel 8490 0.630%
short repeat 18778 1.394%
duplication 34148 2.535%
copy number loss 40441 3.002%
copy number gain 40671 3.019%
deletion 77325 5.740%
single nucleotide variant 1118033 82.997%
"""
- if allType=="single nucleotide variant":
- return "SNV"
+ allType = allType.lower() # they changed the casing in Sep 2020
+ # doing this first, as this is 80% of the data -> faster
+ if allType in ["single nucleotide variant", "variation"]:
+ # 'variation' doesn't fit anywhere, but we think that these are mostly OMIM imports, so SNV sounds OK
+ return "SUBST"
+ # the order here matches Ana's coloring table for easier reading
+ elif allType in ["translocation", "fusion", "complex"]:
+ return "STRUCT"
elif allType in ["deletion", "copy number loss"]:
- return "DEL"
+ return "LOSS"
elif allType in ["duplication", "copy number gain", "tandem duplication"]:
- return "DUPL"
- elif allType=="indel":
- return "INDEL"
- elif allType in ["insertion", "NT expansion", "short repeat"]:
+ return "GAIN"
+ elif allType in ["indel", "insertion"]:
return "INS"
- elif allType in ["Translocation", "fusion", "complex", "protein only", "inversion", "undetermined variant"]:
- return "OTH"
+ elif allType in ["inversion"]:
+ return "INV"
+ elif allType in ["undetermined variant"]:
+ return "SEQALT"
+ elif allType in ["short repeat", "nt expansion", "microsatellite"]:
+ return "SEQLEN"
+ #elif allType in ["variation"]: # appeared in Sep 2020
+ #return "OTH"
+ #elif allType in ["protein only"]: # looks like they removed protein only sometimes in early 2020
+ #return "OTH"
else:
- print("found unknown allele type: "+allType)
+ print("found unknown allele type '%s' for accession %s"%(allType, debugId))
assert(False) # found new allele type in ClinVar table : code needs fixing
def originSimpleToCode(originSimple):
"""
for filtering, group the simplified origins into simpler origin codes:
return one of these values:
"GERM" = germline
"SOM" = somatic
"GERMSOM" = germline/somatic
"NOVO" = de novo
"UNK" = unknown
as of 3/2020, these are the values in originSimple and how often they appear:
tested-inconclusive 91 0.007%
@@ -203,42 +315,47 @@
no interpretation for the single variant 564 0.092%
other 1687 0.274%
Pathogenic/Likely pathogenic 5209 0.846%
not provided 8860 1.440%
Benign/Likely benign 19489 3.167%
Likely pathogenic 30200 4.907%
Conflicting interpretations of pathogenicity 30910 5.023%
Pathogenic 66679 10.835%
Benign 77551 12.602%
Likely benign 154870 25.166%
Uncertain significance 217838 35.399%
"""
pathoStr = pathoStr.lower()
# don't change the order of these unless you tested it. The order is crucial.
+ # the aim of this order is to err on the side of caution: if a variant is VUS and pathogenic,
+ # we rather classify it as VUS (=triggering manual review). If it's pathogenic and benign at the same
+ # time, we rather classify it as pathogenic
if "conflicting" in pathoStr:
return "CF"
+ if "uncertain" in pathoStr:
+ return "VUS"
+ if "risk factor" in pathoStr:
+ return "RF"
if "likely pathogenic" in pathoStr:
return "LP"
+ if "pathogenic" in pathoStr:
+ return "P"
if "likely benign" in pathoStr:
return "LB"
- if "uncertain" in pathoStr:
- return "UC"
- if "pathogenic" in pathoStr:
- return "PG"
if "benign" in pathoStr:
- return "BN"
+ return "B"
return "OT"
def lastMonth(d,x=1):
""" returns same day as this month, but last month, e.g. for 2013-01-01
return 2012-12-01
"""
days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
newmonth = ((( d.month - 1) - x ) % 12 ) + 1
newyear = d.year + ((( d.month - 1) - x ) // 12 )
if d.day > days_of_month[newmonth-1]:
newday = days_of_month[newmonth-1]
else:
newday = d.day
return datetime( newyear, newmonth, newday)
@@ -323,51 +440,56 @@
if line.startswith("#"):
continue
if not foundHead:
raise Exception("Invalid Header. Code needs fixing, again.")
fields = line.split("\t")
varType = fields[4]
if "coding" in varType and not "LRG" in fields[6]:
allToHgvs[fields[3]]=(fields[6], fields[8])
return allToHgvs
def accListToHtml(inStr):
"""
given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions
- to HTML links to the databases
+ to HTML links to the databases. Also, pull out the dbVar SSV accession for its own field later.
"""
+ dbVarAcc = ""
newParts = []
for part in inStr.split(","):
fields = part.split(":")
if len(fields)!=2:
newParts.append(part)
continue
db, acc = fields
accForUrl = acc
+ if db=="dbVar" and len(acc)>5 and acc[1:4]=="ssv":
+ # can be nssv/essv/essv, see https://www.ncbi.nlm.nih.gov/dbvar/content/faq/#nsvnssv
+ dbVarAcc = acc
+
if db in dbToUrl:
if "OMIM" in db:
accForUrl = accForUrl.replace(".", "#")
url = (dbToUrl[db] % accForUrl)
newPart = "%s:%s" % (url, db, acc)
else:
newPart = part
newParts.append(newPart)
- return ", ".join(newParts)
+ return (", ".join(newParts)), dbVarAcc
def reviewStatusToHtmlStars(reviewStatus):
"""
In email from Donna maglott@ncbi.nlm.nih.gov mapping is like this:
1-criteria provided, conflicting interpretations
2-criteria provided, multiple submitters, no conflicts
1-criteria provided, single submitter
0-no assertion criteria provided
0-no assertion for the individual variant
0-no assertion provided
4-practice guideline
3-reviewed by expert panel
given the review status, return an html string with the correct number of black stars
"""
@@ -395,65 +517,89 @@
emptyStarCount = 4-starCount
htmlList = ["★"]*starCount
htmlList.extend( ["☆"]*emptyStarCount )
htmlStr = "".join(htmlList)
htmlStr += " based on: %s" % reviewStatus
if starCount==1:
asciiDesc = "%d star" % starCount
else:
asciiDesc = "%d stars" % starCount
return htmlStr, asciiDesc, starCount
+def loadSubmissions(summFname):
+ " load the submissions into the Mysql table for the lollipop track details page "
+ # not doing the load+rename strategy yet, I don't think it makes a difference here
+ logging.info("Loading submissions table %s into mysql, hg19 and also hg38" % summFname)
+ fh = tempfile.NamedTemporaryFile(prefix="clinVarToBed", suffix=".tsv")
+ tmpFname = fh.name
+ cmd= "zgrep -v '^#' %s > %s" % (summFname, tmpFname)
+ assert(os.system(cmd)==0)
+
+ cmd = "chmod a+r %s" % (tmpFname) # otherwise mysqld can't read the file
+ assert(os.system(cmd)==0)
+
+ cmd = "hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % (tmpFname)
+ assert(os.system(cmd)==0)
+
+ cmd = "hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % (tmpFname)
+ assert(os.system(cmd)==0)
+
def downloadFromNcbi(skipDownload):
" download files from NCBI and return triple of filenames "
today = date.today().isoformat()
outSuffix = "%s." % today
baseUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/"
url = baseUrl+"variant_summary.txt.gz"
outFname = "archive/variant_summary.%s.txt.gz" % today
filename = outFname
# get the new variant allele file
varUrl = baseUrl+"variation_allele.txt.gz"
varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today)
# get the new hgvs allele file
hgvsUrl = baseUrl+"hgvs4variation.txt.gz"
hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today)
+ # get the summary file
+ summUrl = baseUrl+"submission_summary.txt.gz"
+ summFname = join(dirname(filename), "submission_summary.%s.txt.gz" % today)
+
# get the new VCF file
vcfUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz"
vcfFname = join(dirname(filename), "GRCh38.clinvar.%s.vcf.gz" % today)
if not (isfile(vcfFname)):
logging.info("Downlading %s to %s" % (vcfUrl, vcfFname))
open(vcfFname, "wb").write(urllib.request.urlopen(vcfUrl).read())
- if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)):
+ if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname) and isfile(summFname)):
logging.info("Downlading %s to %s" % (url, outFname))
open(outFname, "wb").write(urllib.request.urlopen(url).read())
logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname))
open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read())
logging.info("Downlading %s to %s" % (varUrl, varFname))
open(varFname, "wb").write(urllib.request.urlopen(varUrl).read())
+ logging.info("Downlading %s to %s" % (summUrl, summFname))
+ open(summFname, "wb").write(urllib.request.urlopen(summUrl).read())
- return filename, varFname, hgvsFname, vcfFname
+ return filename, varFname, hgvsFname, vcfFname, summFname
def parseVcf(vcfFname):
" parse VCF and return as dict alleleId -> (molConseq) "
logging.info("Parsing %s" % vcfFname)
ret = {}
for line in gzip.open(vcfFname, "rt", encoding="utf_8"):
# #CHROM POS ID REF ALT QUAL FILTER INFO
#1 930248 789256 G A . . ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=
if line.startswith("#"):
continue
chrom, start, end, ref, alt, qual, filter, info = line.rstrip("\n").split("\t")
infoParts = info.split(";")
alleleId = None
mc = None
@@ -469,45 +615,46 @@
sys.exit(1)
if mc and alleleId:
ret[int(alleleId)] = mc
return ret
# ----------- MAIN --------------
if args==[] and not options.auto:
parser.print_help()
exit(1)
outSuffix = ""
if options.auto:
- filename, varFName, hgvsFname, vcfFname = downloadFromNcbi(options.skipDownload)
+ filename, varFName, hgvsFname, vcfFname, summFname = downloadFromNcbi(options.skipDownload)
else:
filename = args[0]
varFname = args[1]
hgvsFname = args[2]
vcfFname = args[3]
#allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change
allToVcf = parseVcf(vcfFname)
allToHgvs = parseHgvsFile(hgvsFname)
# e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
# GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
# GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
# GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
+# NC_000010.11:g.(?_87925503)_(87965482_?)del
cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
# e.g.
# c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
# c.80A>T
# c.740+356C>T
# c.3839_3855del17
posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')
ifh = gzip.open(filename, "rt", encoding="latin_1")
# check the header
line1 = ifh.readline()
@@ -525,31 +672,31 @@
longCount = 0
noAssCount = 0
noMcCount = 0
# convert lines
for line in ifh:
line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
line = line.rstrip("\n")
row = line.split("\t")
row = [f.replace("\0", "") for f in row ]
alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
- submCategories, varId = row
+ submCategories, varId, posVcf, refAllVcf, altAllVcf = row
# changed in July 2018 when the "varId" was merged into this table. It used to be
# in allToVar
if varId=="":
logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
hgvsCod, hgvsProt = allToHgvs.get(alleleId, ("", ""))
molConseq = allToVcf.get(int(alleleId))
if molConseq is None:
logging.debug("%s has no molConseq" % alleleId)
noMcCount += 1
molConseq = ""
if chrom=="" or assembly=="" or assembly=="NCBI36":
@@ -565,35 +712,35 @@
if assembly=="GRCh37":
chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020
else:
chrom = "chrM"
shortName, longName = shortenName(name)
if len(shortName)>20:
shortName = shortName[:17]+"..."
longCount+=1
if len(longName)>60:
longName = longName[:60]+"..."
if start=="" or end=="":
- print(("undefined start or end coordinate. record %s"% irvcAcc))
+ print("empty start or end coordinate. record %s, start/end = %s/%s "% (irvcAcc, start, end))
continue
if int(start)<0 or int(end)<0:
- print(("illegal start or end coordinate. record %s"% irvcAcc))
+ print("illegal start or end coordinate. record %s, start/end = %s/%s"% (irvcAcc, start, end))
continue
# sometimes they seem to inverse their coordinates
if int(start)>int(end):
start, end = end, start
if chrom=="chr17" and (end=="217748468" or end=="84062405"):
print("blacklisted feature")
continue
# reorder columns for bigBed
start = str(int(start)-1) # convert to zero-based, half-open
score = "0"
strand = "."
thickStart = start
@@ -602,117 +749,128 @@
blockCount = "1"
blockSizes = str(int(end)-int(start))
blockStarts = "0"
# used until Oct 2017
# if dbVarAcc.startswith("nsv") or "copy number" in allType:
# now using just size on genome, see #20268
# shortening to >= 50bp to bring into line with LOVD and gnomAD
if (int(end)-int(start))>=50:
isCnv = True
else:
isCnv = False
pathoCode = clinSignToCode(clinSign)
originCode = originSimpleToCode(originSimple)
- allTypeCode = allTypeToCode(allType)
+ allTypeCode = allTypeToCode(allType, "|".join(row))
if isCnv:
- if "gain" in allType or "duplication" in allType or "insertion" in allType:
- itemRgb = "0,0,255"
- if "deletion" in allType or "loss" in allType:
- itemRgb = "255,0,0"
+ typeColors = cnvColors[allTypeCode]
+ #else:
+ #print("CNV encountered with unknown type: %s / %s"%(allType, allTypeCode))
+ #assert(False)
+ itemRgb = typeColors.get(pathoCode)
+ if itemRgb is None:
+ print("CNV encountered with unknown pathoCode: "+pathoCode)
+ assert(False)
+
else:
- if pathoCode in ["PG", "LP"]: # pathogenic or likely pathogenic
+ if pathoCode in ["P", "LP"]: # pathogenic or likely pathogenic
itemRgb = "210,0,0" # red
- elif pathoCode in ["BN", "LB"]: # benign or likely benign
+ elif pathoCode in ["B", "LB"]: # benign or likely benign
itemRgb = "0,210,0" # green
- elif pathoCode in ["UC"]: # uncertain
+ elif pathoCode in ["VUS", "RF"]: # uncertain
itemRgb = "0,0,128" # dark-blue
elif pathoCode in ["CF"]: # conflicting
itemRgb = "137,121,212" # light-blue
elif pathoCode in ["OT"]: # other
itemRgb = "128,128,128" # grey
else:
assert(False)# should never happen
varLen = int(end)-int(start)
geneStr = geneSymbol
if geneId.isdigit() and geneId!="-1":
geneStr = geneId+"|"+geneSymbol
if len(geneStr)>250:
geneStr = "not shown, too many genes"
mouseOverName = name
- if name=="":
+ otherIds, dbVarSsvAcc = accListToHtml(otherIds)
+
+ if isCnv and dbVarSsvAcc!="":
+ shortName = dbVarSsvAcc
+ elif name=="":
name = "No display name was provided by ClinVar for this variant"
shortName = "NoName"
mouseOverName = ""
if len(name)>90:
name = name[:90]+"..."
name = varId+"|"+name
if len(mouseOverName)>80:
mouseOverName = mouseOverName[:80]+"..."
if len(hgvsProt) > 90:
hgvsProt = hgvsProt[:90]+"..."
if len(hgvsCod) > 90:
hgvsCod = hgvsCod[:90]+"..."
- otherIds = accListToHtml(otherIds)
-
- phenotypeIds = accListToHtml(phenotypeIds)
+ phenotypeIds, _ = accListToHtml(phenotypeIds)
starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus)
mouseOverParts = [mouseOverName, "Type: "+allType, "Consequence: "+ molConseq, "Significance: "+clinSign,
"Origin: "+origin, "Phenotypes: "+phenotypeList]
# removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars
mouseOver = ", ".join(mouseOverParts)
row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \
blockCount, blockSizes, blockStarts,
name, clinSign, starRatingHtml, allType, geneStr, molConseq,
snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver,
# these fields are the for the filters, not for the display
- pathoCode, originCode, allTypeCode, str(varLen), str(starCount)]
+ pathoCode, originCode, allTypeCode, str(varLen), str(starCount),
+ # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field
+ varId, dbVarSsvAcc]
# replace clinvar's placeholders with real empty fields
newRow = []
for x in row:
if x in ["-1", "-"]:
newRow.append("")
else:
newRow.append(x)
row = newRow
if assembly=="GRCh37":
ofh = hg19Bed
if isCnv:
ofh = hg19BedCnv
elif assembly=="GRCh38":
ofh = hg38Bed
if isCnv:
ofh = hg38BedCnv
else:
noAssCount +=1
+ #for o in row:
+ #print(o, type(o))
ofh.write("\t".join(row))
ofh.write("\n")
hg19Bed.close()
hg38Bed.close()
hg19BedCnv.close()
hg38BedCnv.close()
logging.info("%d lines with feature name that was too long, shortened them" % longCount)
logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount)
fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name]
# sort the files
for fname in fnames:
@@ -726,32 +884,34 @@
compareWithLastMonth(fname, options.maxDiff)
# convert to bigBed without the date in the fname, like clinvarMain.hg19.bb
tmpFnames = []
for fname in fnames:
parts = fname.split(".")
if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed
base, db, date, ext = parts
else:
base, db, ext = parts
outFname = "%s.%s.bb" % (base, db)
tmpFname = "%s.%s.temp.bb" % (base, db)
outFname = basename(outFname) # strip the archive/ part, put into current dir
- cmd = "bedToBigBed -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname)
+ cmd = "bedToBigBed -extraIndex=_dbVarSsvId -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname)
mustRun(cmd)
tableName = outFname.split('.')[0] # e.g. clinvarMain
tmpFnames.append( (db, tableName, tmpFname, outFname) )
# make it atomic by renaming the temp files to the final file names at the very end
for db, tableName, tmpFname, finalFname in tmpFnames:
os.rename(tmpFname, finalFname)
# make sure that the "Data last updated" in the table browser is correct
for db, tableName, tmpFname, finalFname in tmpFnames:
cmd = "hgBbiDbLink %s %s /gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName, db, tableName)
mustRun(cmd)
+loadSubmissions(summFname)
+
print("ClinVar update end: OK")