src/hg/utils/otto/clinvar/clinVarToBed 41c716e03c0390db4fd288b8df894277d1c4e998

41c716e03c0390db4fd288b8df894277d1c4e998
max
  Fri Jan 17 04:50:29 2020 -0800
converting clinvarToBed to python3, I simply typed 2To3 -w and it seems to have worked

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index b25275d..891b086 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -1,18 +1,18 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 
-import logging, sys, optparse, re, os, datetime, gzip, urllib2, subprocess
+import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile, abspath
 from datetime import date, datetime, timedelta
 
 dbToUrl = {
     "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
     "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
     "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
     "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
     "OMIM" : "http://www.omim.org/entry/%s",
     "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
 }
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed
@@ -28,31 +28,31 @@
 parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
 (options, args) = parser.parse_args()
 
 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
 else:
     logging.basicConfig(level=logging.INFO)
 
 clinvarExpectHeaders = "#AlleleID	Type	Name	GeneID	GeneSymbol	HGNC_ID	ClinicalSignificance	ClinSigSimple	LastEvaluated	RS# (dbSNP)	nsv/esv (dbVar)	RCVaccession	PhenotypeIDS	PhenotypeList	Origin	OriginSimple	Assembly	ChromosomeAccession	Chromosome	Start	Stop	ReferenceAllele	AlternateAllele	Cytogenetic	ReviewStatus	NumberSubmitters	Guidelines	TestedInGTR	OtherIDs	SubmitterCategories	VariationID\n"
 # ==== FUNCTIONs =====
 
 def mustRun(cmd):
     logging.debug(cmd)
     ret = os.system(cmd)
     if ret!=0:
-        print("Could not run: %s" % cmd)
+        print(("Could not run: %s" % cmd))
         sys.exit(1)
 
 def shortenName(name):
     "make the clinvar name shorter and keep a long version for the mouse over "
     cnvMatch = cnvRe.match(name)
     #hgvsMatch = hgvsRe.match(name)
 
     if ":" in name and not "chr" in name:
         longName = name.split(":")[0]
     else:
         longName = name
 
     shortName = name
 
     if cnvMatch != None:
@@ -103,31 +103,31 @@
     newmonth = ((( d.month - 1) - x ) % 12 ) + 1
     newyear  = d.year + ((( d.month - 1) - x ) // 12 ) 
     if d.day > days_of_month[newmonth-1]:
       newday = days_of_month[newmonth-1]
     else:
       newday = d.day
     return datetime( newyear, newmonth, newday)
 
 def compareWithLastMonth(fname, maxDiff):
     " go back one month from current date, make sure that linecount diff is at most maxDiff "
     todayStr = date.today().isoformat()
     previousDate = lastMonth(date.today())
     previousDateStr = previousDate.date().isoformat()
     previousFname = fname.replace(todayStr, previousDateStr)
     if not isfile(previousFname):
-        print("Cannot find %s, cannot compare new data with previous month" % previousFname)
+        print(("Cannot find %s, cannot compare new data with previous month" % previousFname))
         return
         
     previousCount = sum(1 for line in open(previousFname))
     todayCount = sum(1 for line in open(fname))
     percDiff = (todayCount-previousCount)/float(todayCount)
 
     if percDiff < 0 and maxDiff:
         raise Exception("line count change negative: %s %d, %s %d. Run without --maxDiff to ignore this error. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % \
             (previousFname, previousCount, fname, todayCount))
     if percDiff > maxDiff:
         raise Exception("line count increased too much: yesterday %d , today %d. Check if the input file is OK. If yes, remove the --maxDiff option from the clinVarToBed, call it only with the --auto option. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % (previousCount, todayCount))
     # see RM 14350 do more than just linecount
     # see http://stackoverflow.com/questions/1566461/how-to-count-differences-between-two-files-on-linux
     cmd = "diff %s %s --speed-large-files | grep '^[\>\<]'  | wc -l" % (fname, previousFname)
     diffCount = int(subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT))
@@ -169,32 +169,32 @@
 #    return allToVar
 
 def parseHgvsFile(hgvsFname):
     " return dict: allele Id -> (hgvs coding, hgvs protein) "
     logging.info("Parsing %s" % hgvsFname)
     expHead	= "#Symbol	GeneID	VariationID	AlleleID	Type	Assembly	NucleotideExpression	NucleotideChange	ProteinExpression	ProteinChange	UsedForNaming	Submitted	OnRefSeqGene"
     allToHgvs = {}
     foundHead = False
     for line in gzip.open(hgvsFname):
         line = line.rstrip("\n")
         if line==expHead:
             foundHead = True
         if line.startswith("#Symbol") and "GeneID" in line:
             if not line == expHead:
                 print ("Header of hgvs file has changed again. Code needs fixing")
-                print ("found:",line)
-                print ("expected:",expHead)
+                print(("found:",line))
+                print(("expected:",expHead))
             else:
                 foundHead = True
 
         if line.startswith("#"):
             continue
         if not foundHead:
             raise Exception("Invalid Header. Code needs fixing, again.")
 
         fields = line.split("\t")
         varType = fields[4]
         if "coding" in varType and not "LRG" in fields[6]:
             allToHgvs[fields[3]]=(fields[6], fields[8])
     return allToHgvs
 
 def accListToHtml(inStr):
@@ -243,31 +243,31 @@
         starCount = 0
     if revStatus=="criteriaprovided,conflictinginterpretations":
         starCount = 1
     if revStatus=="criteriaprovided,multiplesubmitters,noconflicts":
         starCount = 2
     if revStatus=="criteriaprovided,singlesubmitter":
         starCount = 1
     if revStatus == "practiceguideline":
         starCount = 4
     if revStatus == "reviewedbyexpertpanel":
         starCount = 3
     if "noassertion" in revStatus or revStatus=="-":
         starCount = 0
 
     if starCount == None:
-        print("Illegal review status: %s" % revStatus)
+        print(("Illegal review status: %s" % revStatus))
         assert(False)
 
     emptyStarCount = 4-starCount
 
     htmlList = ["&#9733;"]*starCount 
     htmlList.extend( ["&#9734;"]*emptyStarCount )
 
     htmlStr = "".join(htmlList)
     htmlStr += "&nbsp;&nbsp;<small>based on: %s</small>" % reviewStatus
 
     asciiDesc = "%d stars" % starCount
     return htmlStr, asciiDesc, starCount
 
 def downloadFromNcbi(skipDownload):
     " download files from NCBI and return triple of filenames "
@@ -276,35 +276,35 @@
 
     url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
     outFname = "archive/variant_summary.%s.txt.gz" % today
     filename = outFname
 
     # get the new variant allele file
     varUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz"
     varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today)
 
     # get the new hgvs allele file
     hgvsUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/hgvs4variation.txt.gz"
     hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today)
 
     if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)):
         logging.info("Downlading %s to %s" % (url, outFname))
-        open(outFname, "w").write(urllib2.urlopen(url).read())
+        open(outFname, "w").write(urllib.request.urlopen(url).read())
         logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname))
-        open(hgvsFname, "w").write(urllib2.urlopen(hgvsUrl).read())
+        open(hgvsFname, "w").write(urllib.request.urlopen(hgvsUrl).read())
         logging.info("Downlading %s to %s" % (varUrl, varFname))
-        open(varFname, "w").write(urllib2.urlopen(varUrl).read())
+        open(varFname, "w").write(urllib.request.urlopen(varUrl).read())
     return filename, varFname, hgvsFname
 
 # ----------- MAIN --------------
 if args==[] and not options.auto:
     parser.print_help()
     exit(1)
 
 outSuffix = ""
 
 if options.auto:
     filename, varFName, hgvsFname = downloadFromNcbi(options.skipDownload)
 else:
     filename = args[0]
     varFname = args[1]
     hgvsFname = args[2]
@@ -358,56 +358,56 @@
       refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
       submCategories, varId = row
 
     # changed in July 2018 when the "varId" was merged into this table. It used to be 
     # in allToVar
     if varId=="":
         logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
 
     hgvsCod, hgvsProt =  allToHgvs.get(alleleId, ("", ""))
 
     if chrom=="" or assembly=="" or assembly=="NCBI36":
         noAssCount += 1
         continue
 
     if chrom=="Un" and assembly=="GRCh38":
-        print("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc)
+        print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc))
         continue
     chrom = "chr"+chrom
 
     # Code-review/QA: Is it OK to pass through coordinates on chrM for hg19?
     if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ?
         if assembly=="GRCh37":
             chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 
         else:
             chrom = "chrM"
 
     shortName, longName = shortenName(name)
 
     if len(shortName)>20:
         shortName = shortName[:17]+"..."
         longCount+=1
 
     if len(longName)>60:
         longName = longName[:60]+"..."
 
     if start=="" or end=="":
-        print("undefined start or end coordinate. record %s"% irvcAcc)
+        print(("undefined start or end coordinate. record %s"% irvcAcc))
         continue
 
     if int(start)<0 or int(end)<0:
-        print("illegal start or end coordinate. record %s"% irvcAcc)
+        print(("illegal start or end coordinate. record %s"% irvcAcc))
         continue
 
     # sometimes they seem to inverse their coordinates
     if int(start)>int(end):
         start, end = end, start
 
     if chrom=="chr17" and (end=="217748468" or end=="84062405"):
         print("blacklisted feature")
         continue
 
     # reorder columns for bigBed
     start = str(int(start)-1) # convert to zero-based, half-open
     score = "0"
     strand = "."
     thickStart = start