#!/usr/bin/env python3
"""Convert Saudi Arabian variant frequency TSV file to VCF format.

Provenance: src/hg/makeDb/scripts/saudiToVcf.py, added in commit
676b58d841804f049f720cc9ba3fddec216dae61 (max, Tue Dec 2 06:22:46 2025 -0800,
"adding saudi arabia to variant frequencies track").
"""

import gzip
import sys

INPUT_FILE = "51297884.tsv.gz"
OUTPUT_FILE = "saudi_variants.vcf"

# Meta-information and column header written at the top of the VCF.  Every
# key emitted in the INFO column below (AN, AC, AF) is declared here with a
# structured ##INFO line.
VCF_HEADER_LINES = (
    "##fileformat=VCFv4.2\n",
    "##source=SaudiArabianVariantFrequencies\n",
    '##INFO=<ID=AN,Number=1,Type=Integer,'
    'Description="Total number of alleles in called genotypes">\n',
    '##INFO=<ID=AC,Number=A,Type=Integer,'
    'Description="Allele count in genotypes">\n',
    '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele frequency">\n',
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
)


def main(input_path=INPUT_FILE, output_path=OUTPUT_FILE):
    """Convert a gzipped variant-frequency TSV into an uncompressed VCF.

    The first six tab-separated columns of each input row are read as
    chrom, pos, ref, alt, AN, AC (NOTE(review): column order is taken from
    the unpacking below -- confirm against the actual TSV).  AF is computed
    as AC / AN, or 0 when AN is not positive.

    Rows are skipped silently when they are blank, start with '#', have
    fewer than six columns, or carry a non-integer AN/AC.

    Args:
        input_path: Path of the gzip-compressed TSV to read.
        output_path: Path of the VCF file to write (overwritten if present).
    """
    # 'rt' is required: gzip.open defaults to binary mode, which yields bytes
    # and makes every str comparison below raise TypeError.
    with gzip.open(input_path, "rt", encoding="utf-8") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:
        outfile.writelines(VCF_HEADER_LINES)

        for line in infile:
            stripped = line.strip()
            # Skip blank lines and comment/header lines ('#' also covers '##').
            if not stripped or line.startswith("#"):
                continue

            fields = stripped.split("\t")
            if len(fields) < 6:
                continue

            chrom, pos, ref, alt, an, ac = fields[:6]

            # Non-numeric counts raise ValueError and the row is dropped;
            # presumably this is also what discards an un-commented column
            # header row -- verify against the input file.
            try:
                total = int(an)
                count = int(ac)
            except ValueError:
                continue
            af = count / total if total > 0 else 0

            # AN and AC are echoed exactly as they appear in the input.
            info = f"AN={an};AC={ac};AF={af:.6f}"

            # Columns: CHROM POS ID REF ALT QUAL FILTER INFO
            outfile.write(
                f"{chrom}\t{pos}\t.\t{ref}\t{alt}\t.\tPASS\t{info}\n"
            )

    print(f"Converted {input_path} to {output_path}")


if __name__ == "__main__":
    main()