0ce9210dc769e64e79daf51e11ab0f08774c287f
max
  Mon Nov 9 05:12:43 2020 -0800
making clinvar otto a little bit less verbose, no redmine

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index a23a3c0..77d6de0 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -660,30 +660,31 @@
 if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again
     logging.error("ClinVar has changed their header line again")
     logging.error("current header line: %s" % line1.replace("\t","|"))
     logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|"))
     raise Exception("code needs fixing")
 
 # open out files
 hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1")
 hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1")
 hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1")
 hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1")
 
 longCount = 0
 noAssCount = 0
 noMcCount = 0
+minusAccs = []
 
 # convert lines
 for line in ifh:
     line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
     line = line.rstrip("\n")
     row = line.split("\t")
     row = [f.replace("\0", "") for f in row ]
 
     alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
       irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
       refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
       submCategories, varId, posVcf, refAllVcf, altAllVcf = row
 
     # changed in July 2018 when the "varId" was merged into this table. It used to be 
     # in allToVar
@@ -710,30 +711,34 @@
     if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ?
         if assembly=="GRCh37":
             chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 
         else:
             chrom = "chrM"
 
     shortName, longName = shortenName(name)
 
     if len(shortName)>20:
         shortName = shortName[:17]+"..."
         longCount+=1
 
     if len(longName)>60:
         longName = longName[:60]+"..."
 
+    if start=="-1" and end=="-1":
+        minusAccs.append(irvcAcc)
+        continue
+
     if start=="" or end=="":
         print("empty start or end coordinate. record %s, start/end = %s/%s "% (irvcAcc, start, end))
         continue
 
     if int(start)<0 or int(end)<0:
         print("illegal start or end coordinate. record %s, start/end = %s/%s"% (irvcAcc, start, end))
         continue
 
     # sometimes they seem to inverse their coordinates
     if int(start)>int(end):
         start, end = end, start
 
     if chrom=="chr17" and (end=="217748468" or end=="84062405"):
         print("blacklisted feature")
         continue
@@ -856,30 +861,31 @@
     else:
         noAssCount +=1
         
     #for o in row:
         #print(o, type(o))
     ofh.write("\t".join(row))
     ofh.write("\n")
 
 hg19Bed.close()
 hg38Bed.close()
 hg19BedCnv.close()
 hg38BedCnv.close()
 
 logging.info("%d lines with feature name that was too long, shortened them" % longCount)
 logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount)
+logging.info("%d lines with -1/-1 start/end positions skipped. Examples: %s" % (len(minusAccs), minusAccs[:3]))
 
 fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name]
 
 # sort the files
 for fname in fnames:
     cmd = "bedSort %s %s" % (fname, fname)
     logging.debug(cmd)
     assert(os.system(cmd)==0)
 
 # check new bed files against last months's bed files
 if options.maxDiff:
     for fname in fnames:
         compareWithLastMonth(fname, options.maxDiff)
 
 # convert to bigBed without the date in the fname, like clinvarMain.hg19.bb