cf24bf1f4d870e1b6d2d9067c04ff480cfb469b2
lrnassar
  Fri Oct 20 12:54:49 2023 -0700
Adding an updated cosmic makedoc after a user wrote in with multiple errors found, refs #32430

diff --git src/hg/makeDb/doc/hg38/cosmicV98.txt src/hg/makeDb/doc/hg38/cosmicV98.txt
new file mode 100644
index 0000000..c3a9718
--- /dev/null
+++ src/hg/makeDb/doc/hg38/cosmicV98.txt
@@ -0,0 +1,159 @@
+#This makedoc is for the update to the v98 track (#29625) originally in the hg38.txt makedoc
+#A user wrote in and many errors were found, largely due to the provider not correctly converting to proper BED file coordinates
+#The following is the python script run to generate the new file, followed by the output
+
+#Parse the COSMIC input file and try to correct the incorrect coordinates. See https://redmine.soe.ucsc.edu/issues/32430
+
+import subprocess
+
+def writeOutToFile(outputFile,chrom,chromStart,chromEnd,refAllele,altAllele,strand,mutationID,legacyID):  # Write one tab-separated, newline-terminated 9-column line to the open file handle outputFile
+    outputFile.write(chrom+"\t"+str(chromStart)+"\t"+str(chromEnd)+"\t"+mutationID+"\t0\t"+strand+"\t"+refAllele+"\t"+altAllele+"\t"+legacyID+"\n")  # Column order: chrom, chromStart, chromEnd, mutationID, literal 0, strand, refAllele, altAllele, legacyID (input order is ref, alt, strand, mutationID, legacyID)
+
+def bash(cmd):  # Run the command string cmd through a shell; return its captured output as text, or raise RuntimeError if it exits non-zero
+    """Run the cmd in bash subprocess"""  # NOTE(review): shell=True uses the platform default shell (/bin/sh on POSIX per the subprocess docs), which may not be bash - confirm
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)  # check=True raises CalledProcessError on a non-zero exit; stderr is merged into the captured stdout, which is decoded as text
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))  # Re-raise with the command, exit code and captured output in the message
+    return(bashStdoutt)  # Combined stdout+stderr text of the successful command
+
+#Vars: input/output file handles, the output path reused by the bedSort/bedToBigBed commands, and the per-category tallies printed in the summary
+inputFile = open("/hive/data/outside/cosmic/hg38/v98/ucsc_export.bed","r")
+outputFile = open("/hive/data/outside/cosmic/hg38/v98/cosmicV93.bed","w")  # NOTE(review): file is named cosmicV93.bed although the directory and this makedoc are for v98 - confirm the name is intentional
+outputFilePath = "/hive/data/outside/cosmic/hg38/v98/cosmicV93.bed"  # Same path as outputFile; used to build the shell commands run after the loop
+badItems = 0  # Entries with chromStart > chromEnd
+insertionsFixed = 0  # Insertions (empty ref) spanning 1 or 2 bases whose coordinates were shifted
+deletionsFixed = 0  # Deletions (empty alt) of 1-2 bases whose start was shifted
+delinsFixed = 0  # Entries with non-empty ref and alt of different lengths
+pointSubsFixed = 0  # Entries with single-base ref and single-base alt
+correctEntries = 0  # Entries that reached the final else branch and were written unchanged
+n = 0  # Incremented once per input line (and once more in the insertion fall-through branch); printed as "Total items"
+criticalErrors = 0  # Entries reaching the final sanity check with chromStart == chromEnd
+
+print("Printing entries that are illegal (chromStart > chromEnd)")
+
+#Example input line (8 tab-separated fields: chrom, start, end, ref, alt, strand, mutationID, legacyID): chr19   56183614        56183614        G       T       +       COSV61999794    COSN20232200
+for entry in inputFile:
+    n+=1
+    entry = entry.rstrip()  # NOTE(review): rstrip() with no argument also strips trailing tabs, so a line whose last field is empty would lose a column and fail at entrySplit[7] - confirm input never ends in an empty field
+    entrySplit = entry.split("\t")
+    chrom = entrySplit[0]
+    chromStart = int(entrySplit[1])
+    chromEnd = int(entrySplit[2])
+    refAllele = entrySplit[3]
+    altAllele = entrySplit[4]
+    strand = entrySplit[5]
+    mutationID = entrySplit[6]
+    legacyID = entrySplit[7]
+
+    # Look for incorrect insertions which have start > end; these get -2 to start and +1 to end
+    if chromStart > chromEnd:
+        badItems+=1
+        chromStart = chromStart-2
+        chromEnd = chromEnd+1
+        print(entry)  # Prints the original, uncorrected input line
+        
+    # Handle insertions (empty ref, non-empty alt) which can also be dups: a span of 2 gets -2 to end and -1 to start, a span of 1 gets -1 to both, any other span is written unchanged
+    elif refAllele == "" and altAllele != "":
+        if chromEnd-chromStart==2:
+            chromEnd = chromEnd-2
+            chromStart = chromStart-1
+            insertionsFixed+=1
+        elif chromEnd-chromStart==1:
+            insertionsFixed+=1
+            chromEnd = chromEnd-1
+            chromStart = chromStart-1
+        else:
+            n+=1  # NOTE(review): n was already incremented for this entry at the top of the loop, so this double-counts it in "Total items"; possibly meant correctEntries+=1 - confirm
+        
+    # Handle deletions (empty alt) which are wrong when they are only 1 or 2 bases in size. Some 2 size items are correct, so check for that too
+    elif altAllele == "" and refAllele != "" and len(refAllele) < 3:
+        if chromEnd-chromStart != 2:  # A span of exactly 2 is left unchanged and is not counted in any tally
+            deletionsFixed+=1
+            chromStart = chromStart-1
+        
+    # Handle delins (non-empty ref and alt) of different sizes which are wrong; all of them get -1 to start
+    elif altAllele != "" and refAllele != "" and len(altAllele) != len(refAllele):
+        delinsFixed+=1
+        chromStart = chromStart-1
+        
+    # Handle point substitutions (single-base ref and alt) which are wrong and need -1 to start
+    elif altAllele != "" and refAllele != "" and len(altAllele) == 1 and len(refAllele) == 1:
+        pointSubsFixed+=1
+        chromStart = chromStart-1
+        
+    # Final check for any illegal coordinates among entries no earlier branch matched (both alleles empty, deletions of 3+ bases, same-length multi-base substitutions)
+    elif chromStart == chromEnd or chromStart > chromEnd:  # The chromStart > chromEnd half cannot be true here: the first if in this chain already caught it
+        criticalErrors+=1
+        print("ERROR: This should never come up. Problem with variant below.")
+        print(entry)
+    
+    # Final count of items that were properly formatted
+    else:
+        correctEntries+=1
+    
+    #Write out corrected (if needed) coordinates to file; every entry is written, including those flagged as critical errors
+    writeOutToFile(outputFile,chrom,chromStart,chromEnd,refAllele,altAllele,strand,mutationID,legacyID)
+
+outputFile.close()  # Close before the shell commands below read the file
+inputFile.close()
+
+# Summary of pointSubsFixed delinsFixed deletionsFixed insertionsFixed badItems criticalErrors correctEntries n. NOTE(review): each round(count/n,3) is a fraction between 0 and 1 but is printed with a "%" sign (e.g. "0.938%" means about 93.8%)
+print("Point substitutions fixed: "+str(pointSubsFixed)+" ("+str(round(pointSubsFixed/n,3))+"%)\nDeletionInsertions fixed: "+str(delinsFixed)+" ("+str(round(delinsFixed/n,3))+"%)\n" \
+     +"Deletions fixed: "+str(deletionsFixed)+" ("+str(round(deletionsFixed/n,3))+"%)\nInsertions fixed: "+str(insertionsFixed)+" ("+str(round(insertionsFixed/n,3))+"%)\n" \
+     +"Illegal items with start > end: "+str(badItems)+" ("+str(round(badItems/n,3))+"%)\nCritical errors: "+str(criticalErrors)+" ("+str(round(criticalErrors/n,3))+"%)\n" \
+     +"Correct entries not changed: "+str(correctEntries)+" ("+str(round(correctEntries/n,3))+"%)\nTotal items: "+str(n))  # Divides by n, so an empty input file (n == 0) raises ZeroDivisionError here
+
+bash("bedSort "+outputFilePath+" "+outputFilePath+".sorted")  # Sort the corrected BED into <outputFilePath>.sorted
+bash("bedToBigBed -type=bed6+3 -as=/hive/data/outside/cosmic/hg38/v98/cosmic.as -tab "+outputFilePath+".sorted /cluster/data/hg38/chrom.sizes /hive/data/outside/cosmic/hg38/v98/cosmic.bb")  # Build cosmic.bb from the sorted BED as bed6+3, using cosmic.as and the hg38 chrom.sizes
+#make symlink from /gbdb to the new bigBed. NOTE(review): plain "ln -s" fails if the link already exists, which bash() turns into a RuntimeError - confirm before re-running
+bash("ln -s /hive/data/outside/cosmic/hg38/v98/cosmic.bb /gbdb/hg38/cosmic/cosmic.bb")
+
+####OUTPUT#####
+
+Printing entries that are illegal (chromStart > chromEnd)
+chr1	54139647	54139646		G	-	COSV61283612	COSM4967518
+chr1	92832120	92832119		C	+	COSV59873663	COSM5751392
+chr10	14908622	14908621		T	-	COSV57954760	COSM5751422
+chr11	32434699	32434698		T	-	COSV60065461	COSM5751469
+chr11	61967312	61967311		TCTTACTACTTTGACCGCGATGATGTGGCTTTGAAGAACTTTGCCAAATACTTTCTTCACCAATCTCATGAGGA	-	COSV56445358	COSM4746415
+chr12	25225627	25225626		T	-	COSV55926705	COSM5752083
+chr12	25225676	25225675		T	-	COSV55736226	COSM5751707
+chr14	32822108	32822107		A	+	COSV55234562	COSM5751856
+chr14	32822293	32822292		A	+	COSV55225526	COSM5751218
+chr14	32822984	32822983		A	+	COSV55217470	COSM5751765
+chr14	72465624	72465623		GGAT	+	COSV59575192	COSN190986
+chr14	99257556	99257555		T	-	COSV61735888	COSM5751532
+chr14	99257788	99257787		G	-	COSV61733306	COSM5751244
+chr16	67611246	67611245		A	+	COSV50461550	COSM5751489
+chr16	67611510	67611509		A	+	COSV50465990	COSM5751202
+chr17	7669666	7669665		T	-	COSV53205989	COSM5751520
+chr17	31327838	31327837		AGAGTTTA	+	COSV106105962	COSM34184
+chr19	10795440	10795439		A	+	COSV58965104	COSM5752008
+chr19	44274080	44274079		AACTCTCTGGTGAAGACCAGAATTCCTATTAAATATCCTGTCACTTACTT	-	COSV61933827	COSM5016626
+chr19	54145818	54145817		A	+	COSV55364997	COSM5751982
+chr19	54148744	54148743		A	+	COSV55366597	COSM5751751
+chr2	178443266	178443265		AAAGGGGGCATCAAAAAAGCAAGCCAAAAGGAACGCTGCT	-	COSV57870943	COSM4746427
+chr2	189854347	189854346		A	+	COSV59720649	COSM5751951
+chr2	241740952	241740951		G	+	COSV100345173	COSN31769167
+chr21	34834466	34834465		T	-	COSV55877103	COSM5751784
+chr3	52587352	52587351		A	-	COSV56310253	COSM422809
+chr3	195781704	195781703		CACGCCACCCCTCTTCATGTCACCAGCCCTTCCTCAGCATCCACAGGTGA	-	COSV57801139	COSM5016048
+chr6	31669484	31669483		CCT	+	COSV65507799	COSM306820
+chr6	134173149	134173148		A	-	COSV52804500	COSM1161565
+chr6	135195910	135195909		A	+	COSV57199385	COSM5751503
+chr6	138885581	138885580		G	+	COSV62884323	COSM5752103
+chr7	5986937	5986936		G	-	COSV56220859	COSM5751600
+chr7	50327638	50327637		A	+	COSV58792006	COSM5751834
+chrX	37453358	37453357		C	+	COSV66158042	COSM1161821
+chrX	41125720	41125719		TCTCGC	+	COSV61067912	COSN19269805
+Point substitutions fixed: 12394972 (0.938%)
+DeletionInsertions fixed: 4430 (0.0%)
+Deletions fixed: 322858 (0.024%)
+Insertions fixed: 316932 (0.024%)
+Illegal items with start > end: 35 (0.0%)
+Critical errors: 0 (0.0%)
+Correct entries not changed: 162397 (0.012%)
+Total items: 13218527