src/hg/makeDb/scripts/kovaToVcf.py aa8e2e87c3b54e9662d87cd5b921d5607fc846ad

aa8e2e87c3b54e9662d87cd5b921d5607fc846ad
max
  Mon Nov 10 05:51:08 2025 -0800
adding korea to varFreqs supertrack, refs #36642

diff --git src/hg/makeDb/scripts/kovaToVcf.py src/hg/makeDb/scripts/kovaToVcf.py
new file mode 100644
index 00000000000..ec1f5a02008
--- /dev/null
+++ src/hg/makeDb/scripts/kovaToVcf.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+"""
+Convert KOVA biobank TSV.gz files to VCF format.
+Handles field renaming and preserves all information in INFO field.
+"""
+
+import gzip
+import sys
+import re
+from datetime import datetime
+import argparse
+
+def clean_value(value):
+    """Clean values for VCF INFO field."""
+    if value == 'NA' or value == '.' or value == '':
+        return '.'
+    # Replace spaces and special characters that might cause issues
+    value = value.replace(' ', '_')
+    value = value.replace(';', ',')
+    return value
+
+def format_info_value(value):
+    """Format a value for the INFO field."""
+    if value == 'NA' or value == '.' or value == '':
+        return None  # Will be skipped in INFO field
+    return clean_value(value)
+
+def convert_tsv_to_vcf(input_file, output_file):
+    """Convert KOVA TSV.gz file to VCF format."""
+    
+    # Field name mappings
+    field_mappings = {
+        'kova_AC': 'AC',
+        'kova_AN': 'AN', 
+        'kova_AF': 'AF'
+    }
+    
+    # Fields to exclude from INFO (already used in other VCF columns)
+    exclude_from_info = {'chrom', 'pos', 'ref_allele', 'alt_allele', 'rsid', 'qual', 'filters'}
+    
+    with gzip.open(input_file, 'rt') if input_file.endswith('.gz') else open(input_file, 'r') as infile:
+        # Read header line
+        header_line = infile.readline().strip()
+        headers = header_line.split('\t')
+        
+        # Prepare output file
+        outfile = gzip.open(output_file, 'wt') if output_file.endswith('.gz') else open(output_file, 'w')
+        
+        # Write VCF header
+        outfile.write("##fileformat=VCFv4.3\n")
+        outfile.write(f"##fileDate={datetime.now().strftime('%Y%m%d')}\n")
+        outfile.write(f"##source=KOVA_biobank_converter\n")
+        outfile.write(f"##reference=GRCh38\n")
+        
+        # Write INFO field descriptions
+        # Map field names and write descriptions
+        info_fields = []
+        for field in headers:
+            if field not in exclude_from_info:
+                # Apply field mapping if exists
+                info_field = field_mappings.get(field, field)
+                # Clean field name for VCF compatibility
+                info_field = re.sub(r'[^A-Za-z0-9_]', '_', info_field)
+                info_fields.append((field, info_field))
+                
+                # Determine field type based on name
+                field_type = "String"  # Default
+                if 'AC' in info_field or 'AN' in info_field or 'count' in field.lower():
+                    field_type = "Integer"
+                elif 'AF' in info_field or 'pvalue' in field.lower() or 'percentile' in field.lower():
+                    field_type = "Float"
+                
+                outfile.write(f'##INFO=<ID={info_field},Number=.,Type={field_type},Description="{field}">\n')
+        
+        # Write VCF column header
+        outfile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
+        
+        # Process data lines
+        for line_num, line in enumerate(infile, start=2):
+            fields = line.strip().split('\t')
+            
+            if len(fields) != len(headers):
+                print(f"Warning: Line {line_num} has {len(fields)} fields, expected {len(headers)}")
+                continue
+            
+            # Create dictionary of field values
+            data = dict(zip(headers, fields))
+            
+            # Extract main VCF fields
+            chrom = data.get('chrom', '.')
+            # Remove 'chr' prefix if present for standard VCF
+            if chrom.startswith('chr'):
+                chrom = chrom[3:]
+            
+            # Position is 1-based in both TSV and VCF, no conversion needed
+            pos = data.get('pos', '.')
+            
+            # ID field (using rsid)
+            id_field = data.get('rsid', '.')
+            if id_field == '.' or id_field == 'NA' or id_field == '':
+                id_field = '.'
+            
+            ref = data.get('ref_allele', '.')
+            alt = data.get('alt_allele', '.')
+            
+            # QUAL field
+            qual = data.get('qual', '.')
+            if qual == '.' or qual == 'NA' or qual == '':
+                qual = '.'
+            
+            # FILTER field
+            filter_field = data.get('filters', '.')
+            if filter_field == '.' or filter_field == 'NA' or filter_field == '':
+                filter_field = 'PASS'
+            
+            # Build INFO field
+            info_parts = []
+            for orig_field, info_field in info_fields:
+                value = data.get(orig_field, '.')
+                formatted_value = format_info_value(value)
+                if formatted_value is not None:
+                    # For flag fields (present/absent), just include the key
+                    if formatted_value == 'true' or formatted_value == 'True':
+                        info_parts.append(info_field)
+                    else:
+                        info_parts.append(f"{info_field}={formatted_value}")
+            
+            info = ';'.join(info_parts) if info_parts else '.'
+            
+            # Write VCF line
+            vcf_line = f"{chrom}\t{pos}\t{id_field}\t{ref}\t{alt}\t{qual}\t{filter_field}\t{info}\n"
+            outfile.write(vcf_line)
+        
+        outfile.close()
+    
+    print(f"Conversion complete. VCF file written to: {output_file}")
+
+def main():
+    parser = argparse.ArgumentParser(description='Convert KOVA biobank TSV.gz files to VCF format')
+    parser.add_argument('input', help='Input TSV or TSV.gz file')
+    parser.add_argument('output', help='Output VCF file (use .vcf.gz for compressed output)')
+    
+    args = parser.parse_args()
+    
+    convert_tsv_to_vcf(args.input, args.output)
+
+if __name__ == "__main__":
+    main()