1942abfbe86d5b3f5ed3ad0482a31a9354587e18
max
Tue Mar 26 09:30:03 2024 -0700
preparing clinvar for somatic mutations, no ticket yet
diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index 79e71db..fa88597 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -35,31 +35,33 @@
parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
(options, args) = parser.parse_args()
if options.debug:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
if not options.isAlpha and os.getlogin()!="otto":
print("Please do not run this script as a normal user, you will run into all sorts of permission problems.")
print("to develop and test, ssh to otto, then run it with doUpdate.sh --alpha to generate the dev-only tracks.")
print("When you're done, run doUpdate.sh without the --alpha and commit the change to the kent tree.")
sys.exit(1)
-clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n"
+#clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n"
+clinvarExpectHeaders = "#AlleleID|Type|Name|GeneID|GeneSymbol|HGNC_ID|ClinicalSignificance|ClinSigSimple|LastEvaluated|RS# (dbSNP)|nsv/esv (dbVar)|RCVaccession|PhenotypeIDS|PhenotypeList|Origin|OriginSimple|Assembly|ChromosomeAccession|Chromosome|Start|Stop|ReferenceAllele|AlternateAllele|Cytogenetic|ReviewStatus|NumberSubmitters|Guidelines|TestedInGTR|OtherIDs|SubmitterCategories|VariationID|PositionVCF|ReferenceAlleleVCF|AlternateAlleleVCF|SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity".split("|")
+
# ==== FUNCTIONs =====
# benign = BN, likely benign = LB, conflicting = CF, likely pathogenic= LP,
# pathogenic = PG, other = OT, uncertain = VUS, RF=risk factor
# colors were defined by Ana Benet Pages
# - WARNING ON FILTER CHANGES: - see #28562
# Never change these codes. They are in old sessions. If you change them, change the across the whole script
# and also change the name of the .as field, so the cart variables will be different. You can add new values through:
cnvColors = {
"INS" : {
"OT" : "225,165,0",
"BN" : "250,214,69",
"LB" : "250,214,69",
"CF" : "225,165,0",
"RF" : "225,165,0",
@@ -428,31 +430,31 @@
# if line.startswith("#"):
# continue
# if not foundHead:
# raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing")
# fields = line.split("\t")
# allToVar[fields[2]]=fields[0]
# return allToVar
def parseHgvsFile(hgvsFname):
" return dict: allele Id -> list of (hgvs coding, hgvs protein) "
logging.info("Parsing %s" % hgvsFname)
expHead = "#Symbol GeneID VariationID AlleleID Type Assembly NucleotideExpression NucleotideChange ProteinExpression ProteinChange UsedForNaming Submitted OnRefSeqGene"
hg19Hgvs = defaultdict(list)
hg38Hgvs = defaultdict(list)
foundHead = False
- for line in gzip.open(hgvsFname, "rt"):
+ for line in gzip.open(hgvsFname, "rt", encoding="utf8"):
line = line.rstrip("\n")
if line==expHead:
foundHead = True
if line.startswith("#Symbol") and "GeneID" in line:
if not line == expHead:
print ("Header of hgvs file has changed again. Code needs fixing. Look at the new fields and decide if we should show them.")
print(("found:",line))
print(("expected:",expHead))
else:
foundHead = True
if line.startswith("#"):
continue
if not foundHead:
raise Exception("Invalid Header. Code needs fixing, again.")
@@ -525,40 +527,47 @@
2-criteria provided, multiple submitters, no conflicts
1-criteria provided, single submitter
0-no assertion criteria provided
0-no assertion for the individual variant
0-no assertion provided
4-practice guideline
3-reviewed by expert panel
given the review status, return an html string with the correct number of black stars
"""
revStatus = reviewStatus.strip().lower().replace(" ", "")
starCount = None
if revStatus == "nointerpretationforthesinglevariant":
starCount = 0
+ if revStatus == "noclassificationprovided" or revStatus == "noclassificationsfromunflaggedrecords":
+ starCount = 0
if revStatus=="criteriaprovided,conflictinginterpretations":
starCount = 1
+ if revStatus=="criteriaprovided,conflictingclassifications": # first seen in Mar 2024
+ starCount = 1
if revStatus=="criteriaprovided,multiplesubmitters,noconflicts":
starCount = 2
if revStatus=="criteriaprovided,singlesubmitter":
starCount = 1
if revStatus == "practiceguideline":
starCount = 4
if revStatus == "reviewedbyexpertpanel":
starCount = 3
+
+ if revStatus == "noclassificationforthesinglevariant":
+ starCount = 0
if "noassertion" in revStatus or revStatus=="-":
starCount = 0
if starCount == None:
print(("Illegal review status: %s" % revStatus))
assert(False)
emptyStarCount = 4-starCount
htmlList = ["★"]*starCount
htmlList.extend( ["☆"]*emptyStarCount )
htmlStr = "".join(htmlList)
htmlStr += " based on: %s" % reviewStatus
@@ -576,34 +585,34 @@
tmpFname = fh.name
cmd= "zgrep -v '^#' %s > %s" % (summFname, tmpFname)
assert(os.system(cmd)==0)
cmd = "chmod a+r %s" % (tmpFname) # otherwise mysqld can't read the file
assert(os.system(cmd)==0)
cmd = "hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % (tmpFname)
assert(os.system(cmd)==0)
cmd = "hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % (tmpFname)
assert(os.system(cmd)==0)
def mostCurrentVersionName():
" return name of version on FTP site "
- cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1'
+ cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1'
verLine = subprocess.check_output(cmd, shell=True, encoding="utf8")
- #
- version = verLine.split(" ")[1].split("=")[1].strip('"')
+ #
+ version = verLine.split(" ")[3].split("=")[1].split('"')[1]
return version
def downloadFromNcbi(skipDownload):
" download files from NCBI and return list of filenames "
today = date.today().isoformat()
downDir = join("downloads", today)
try:
os.mkdir(downDir)
except FileExistsError:
pass
version = mostCurrentVersionName()
logging.info("Version on NCBI server is %s" % version)
versionFname = join(downDir, "version.txt")
open(versionFname, "w").write(version)
@@ -663,30 +672,33 @@
key, val = info.split("=")
if key=="ALLELEID":
alleleId = val
elif key=="MC":
mc = val.split("|")[1].split(",")[0].replace("_", " ")
if not mc in possMolConseqs:
print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. "
"Please update otto script %s" % (mc, __file__))
sys.exit(1)
if mc and alleleId:
ret[int(alleleId)] = mc
return ret
+def shortenSeq(s):
+ return s[:25]+"..."+s[-25:]
+
# ----------- MAIN --------------
if args==[] and not options.auto:
parser.print_help()
exit(1)
outSuffix = ""
if options.auto:
filename, varFName, hgvsFname, vcfFname, summFname, versionFname = downloadFromNcbi(options.skipDownload)
else:
filename = args[0]
varFname = args[1]
hgvsFname = args[2]
vcfFname = args[3]
versionFname = args[4]
@@ -703,58 +715,60 @@
cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
# e.g.
# c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
# c.80A>T
# c.740+356C>T
# c.3839_3855del17
posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')
ifh = gzip.open(filename, "rt", encoding="latin_1")
# check the header
line1 = ifh.readline()
-if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again
+fileHeaders = line1.rstrip().split("\t")
+if (fileHeaders != clinvarExpectHeaders): # test if clinvar has changed their header line again
logging.error("ClinVar has changed their header line again")
- logging.error("current header line: %s" % line1.replace("\t","|"))
- logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|"))
- raise Exception("code needs fixing")
+ logging.error("current header line: %s" % fileHeaders)
+ logging.error("expected header line: %s" % clinvarExpectHeaders)
+ raise Exception("Sorry. The code will need to be fixed.")
# open out files
hg19Bed = open("bed/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1")
hg38Bed = open("bed/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1")
hg19BedCnv = open("bed/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1")
hg38BedCnv = open("bed/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1")
longCount = 0
noAssCount = 0
noMcCount = 0
minusAccs = []
# convert lines
for line in ifh:
line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
line = line.rstrip("\n")
row = line.split("\t")
row = [f.replace("\0", "") for f in row ]
alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
- submCategories, varId, posVcf, refAllVcf, altAllVcf = row
+ submCategories, varId, posVcf, refAllVcf, altAllVcf, somClinImpact, somClinLastEval, revStatusClinImpact, oncogen, oncogenLastEval, revStatusOncogen = row
+ # SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity
# changed in July 2018 when the "varId" was merged into this table. It used to be
# in allToVar
if varId=="":
logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
if chrom=="" or assembly=="" or assembly=="NCBI36" or assembly=="na":
noAssCount += 1
continue
if assembly=="GRCh37":
hgvs = hg19Hgvs
elif assembly=="GRCh38":
hgvs = hg38Hgvs
else:
@@ -762,30 +776,32 @@
assert(False)
#hgvsTable = ";".join(hgvs[alleleId])
hgvsTable = json.dumps(hgvs[alleleId])
molConseq = allToVcf.get(int(alleleId))
if molConseq is None:
#logging.debug("%s has no molConseq" % alleleId)
noMcCount += 1
molConseq = ""
if chromAcc=="NT_187513.1":
chrom = "chrUn_KI270742v1"
elif chromAcc=="NT_167222.1":
chrom = "chrUn_gl000228"
+ elif chromAcc=='NT_187499.1':
+ chrom = "chrUn_KI270744v1"
elif chromAcc=="NT_113889.1":
if assembly=="GRCh37":
chrom = "chrUn_gl000218"
else:
chrom = "chrUn_GL000218v1"
elif chrom=="Un":
print("Unknown fix/alt/patch chrom, please adapt clinVarToBed: %s, row: %s" % (chrom, row))
sys.exit(1)
else:
chrom = "chr"+chrom
if chrom=="chrUn" and assembly=="GRCh38":
print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc))
continue
@@ -913,35 +929,52 @@
mouseOverParts = [
mouseOverName,
"Review Status: "+starRatingHtml,
"Type: "+allType,
"Consequence: "+ molConseq,
"Significance: "+clinSign,
"Origin: "+origin,
"Phenotypes: "+phenotypeList
]
# removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars
mouseOver = "
".join(mouseOverParts)
snpAcc = "rs"+snpAcc # dbSnp links changed in mid/late 2022
+ vcfDesc = chrom+":"+posVcf+":"+refAllVcf+">"+altAllVcf
+ if len(vcfDesc)>=255:
+ refAllVcf = shortenSeq(refAllVcf)
+ altAllVcf = shortenSeq(altAllVcf)
+ vcfDesc = chrom+":"+posVcf+":"+refAllVcf+">"+altAllVcf
+
+ somImpactDesc = ""
+ if somClinImpact!="" and somClinImpact!="-":
+ somImpactDesc = "%s (%s, %s)" % (somClinImpact, somClinLastEval, revStatusClinImpact)
+
+ oncogenDesc = ""
+ if oncogen!="" and oncogen!="-":
+ oncogenDesc = "%s (%s, %s)" % (oncogen, oncogenLastEval, revStatusOncogen)
+
row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \
blockCount, blockSizes, blockStarts,
name, clinSign, starRatingHtml, allType, geneStr, molConseq,
snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
hgvsTable, "", numberSubmitters, lastEval, guidelines, otherIds, mouseOver,
+ vcfDesc,
+ somImpactDesc,
+ oncogenDesc,
# these fields are the for the filters, not for the display
pathoCode, originCode, allTypeCode, str(varLen), str(starCount),
# the variant ID got forgotten in the original bigBed schema, it was only part of the origName field
varId, dbVarSsvAcc]
# replace clinvar's placeholders with real empty fields
newRow = []
for x in row:
if x in ["-1", "-"]:
newRow.append("")
else:
newRow.append(x)
row = newRow
if assembly=="GRCh37":