src/hg/makeDb/scripts/prsEmerge/prs2bed.py 1dedbd46c366091fd9d93d961b4b7b91d83ad743

1dedbd46c366091fd9d93d961b4b7b91d83ad743
max
  Mon Oct 2 07:50:55 2023 -0700
adding polygenic risk scores track, refs #31546

diff --git src/hg/makeDb/scripts/prsEmerge/prs2bed.py src/hg/makeDb/scripts/prsEmerge/prs2bed.py
new file mode 100644
index 0000000..436f915
--- /dev/null
+++ src/hg/makeDb/scripts/prsEmerge/prs2bed.py
@@ -0,0 +1,38 @@
+# Written by Zia Truong, committed by Max
+import pathlib, gzip, csv
+
+#I put the PRS SNP files in a directory called "data"
+#iterate through all the filenames in the directory
+for filename in map(str, pathlib.Path("data").iterdir()):
+    disease = filename.split("_PRS")[0][5:]
+
+    #BED file to write to
+    bed_file = open("output/"+disease+".bed", 'w')
+
+    #the PRS file is gzipped, read it as a text file
+    prs_file = gzip.open(filename, 'rt')
+
+    #skip header line
+    next(prs_file)
+
+    for line in csv.reader(prs_file, delimiter='\t'):
+        #compute field values
+        effect_allele, weight = line[1], line[2]
+
+        #id contains the chromosome, start position
+        id_info = line[0].split(':')
+
+        #make start positions zero-based
+        chr, start_pos = id_info[0], int(id_info[1])-1
+
+        #get the reference allele (the allele in the id that isn't the effect allele)
+        ref_allele = id_info[2] if effect_allele != id_info[2] else id_info[3]
+
+        #compute 1-based end position
+        end_pos = start_pos + len(ref_allele)
+
+        #chr  start  end  id  score  strand  thick_start  thick_end  color  effect_allele  weight
+        bed_file.write("\t".join(("chr"+chr, str(start_pos), str(end_pos), line[0], '0', '.', str(start_pos), str(start_pos), "0,0,0", effect_allele, weight))+"\n")
+
+    prs_file.close()
+    bed_file.close()