src/hg/utils/otto/clinvar/clinVarToBed f286e5c10c6344503cf9cd73d594070b5f22a671

f286e5c10c6344503cf9cd73d594070b5f22a671
max
  Wed Feb 23 03:13:57 2022 -0800
adding the new links that ClinVar added a few months ago to our hgc page, refs #28926

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index 79f795a..8b3e581 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -1,29 +1,31 @@
 #!/usr/bin/env python3
 
 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
 import tempfile, json
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile, abspath, isdir
 from datetime import date, datetime, timedelta
 
 dbToUrl = {
     "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
     "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
     "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
     "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
     "OMIM" : "http://www.omim.org/entry/%s",
+    "MONDO" : "https://monarchinitiative.org/disease/%s",
+    "ClinGen" : "https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=%s",
     "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
 }
 
 # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file
 # if they add a new one, this script must fail and tdb must be updated.
 possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiatior codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", ""])
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed.
 
 Output goes into the current dir
 
 Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/
 """) 
 
@@ -470,35 +472,36 @@
         elif assembly=="NCBI36":
             continue
         else:
             print("invalid assembly in HGVS: %s, in %s" % (assembly, line))
             assert(False)
 
         for data in addTo:
             data[alleleId].append( hgvsRow )
 
     return hg19Hgvs, hg38Hgvs
 
 def accListToHtml(inStr):
     """
     given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions
     to HTML links to the databases. Also, pull out the dbVar SSV accession for its own field later.
+    A more recent example is MONDO:MONDO:0019667, where the accession itself contains a colon, so the split below uses maxsplit=1.
     """
     dbVarAcc = ""
     newParts = []
     for part in inStr.split(","):
-        fields = part.split(":")
+        fields = part.split(":", maxsplit=1)
         if len(fields)!=2:
             newParts.append(part)
             continue
 
         db, acc = fields
         accForUrl = acc
 
         if db=="dbVar" and len(acc)>5 and acc[1:4]=="ssv":
             # can be nssv/essv/dssv, see https://www.ncbi.nlm.nih.gov/dbvar/content/faq/#nsvnssv
             dbVarAcc = acc
 
         if db in dbToUrl:
             if "OMIM" in db:
                 accForUrl = accForUrl.replace(".", "#")
             url = (dbToUrl[db] % accForUrl)