f286e5c10c6344503cf9cd73d594070b5f22a671 max Wed Feb 23 03:13:57 2022 -0800 adding the new links that ClinVar added a few months ago to our hgc page, refs #28926 diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index 79f795a..8b3e581 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -1,29 +1,31 @@ #!/usr/bin/env python3 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess import tempfile, json from collections import defaultdict from os.path import join, basename, dirname, isfile, abspath, isdir from datetime import date, datetime, timedelta dbToUrl = { "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/", "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s", "OMIM Allelic Variant" : "http://www.omim.org/entry/%s", "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s", "OMIM" : "http://www.omim.org/entry/%s", + "MONDO" : "https://monarchinitiative.org/disease/%s", + "ClinGen" : "https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=%s", "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s" } # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file # if they add a new one, this script must fail and tdb must be updated. possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiatior codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", ""]) # === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed. Output goes into the current dir Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/ """) @@ -470,35 +472,36 @@ elif assembly=="NCBI36": continue else: print("invalid assembly in HGVS: %s, in %s" % (assembly, line)) assert(False) for data in addTo: data[alleleId].append( hgvsRow ) return hg19Hgvs, hg38Hgvs def accListToHtml(inStr): """ given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions to HTML links to the databases. Also, pull out the dbVar SSV accession for its own field later. + A more recent example is MONDO:MONDO:0019667, which meant that I had to add the "maxsplit" option. """ dbVarAcc = "" newParts = [] for part in inStr.split(","): - fields = part.split(":") + fields = part.split(":", maxsplit=1) if len(fields)!=2: newParts.append(part) continue db, acc = fields accForUrl = acc if db=="dbVar" and len(acc)>5 and acc[1:4]=="ssv": # can be nssv/essv/essv, see https://www.ncbi.nlm.nih.gov/dbvar/content/faq/#nsvnssv dbVarAcc = acc if db in dbToUrl: if "OMIM" in db: accForUrl = accForUrl.replace(".", "#") url = (dbToUrl[db] % accForUrl)