src/hg/makeDb/scripts/varFreqs/abraomToVcf.py aa61ebc800429515f9ced7e28f669c6042219f43

aa61ebc800429515f9ced7e28f669c6042219f43
max
  Wed Mar 18 09:09:13 2026 -0700
varFreqs supertrack: add GREGoR track, update all HTML docs, move scripts to varFreqs/, refs #36642

Add GREGoR R04 WGS track to varFreqs superTrack. Update Data Access and
Methods sections for all 20+ subtrack HTML files with consistent formatting,
sequencing methods from source papers, and links to makeDoc and Github scripts.
Move all varFreqs conversion scripts into scripts/varFreqs/ subdirectory and
update makeDoc paths accordingly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/varFreqs/abraomToVcf.py src/hg/makeDb/scripts/varFreqs/abraomToVcf.py
new file mode 100644
index 00000000000..1f0182e0729
--- /dev/null
+++ src/hg/makeDb/scripts/varFreqs/abraomToVcf.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+"""
+Convert TSV variant file to VCF format with proper indel formatting using pyfaidx
+"""
+
+import gzip
+import sys
+import argparse
+from datetime import datetime
+import pyfaidx
+from collections import defaultdict
+
+def open_file(filename):
+    """Open regular or gzipped file"""
+    if filename.endswith('.gz'):
+        return gzip.open(filename, 'rt')
+    else:
+        return open(filename, 'r')
+
+def format_indel(chrom, pos, ref, alt, fasta):
+    """
+    Format indels for VCF by adding flanking base
+    VCF requires the preceding base for indels
+    """
+    # Convert position to 0-based for pyfaidx (TSV appears to be 1-based)
+    pos_0based = pos - 1
+    
+    # Get the preceding base from reference
+    try:
+        if pos_0based > 0:
+            preceding_base = fasta[chrom][pos_0based - 1:pos_0based].seq.upper()
+        else:
+            # Edge case: indel at position 1
+            preceding_base = 'N'
+    except:
+        # If chromosome not found or other error, use N
+        preceding_base = 'N'
+    
+    # Handle deletions (Alt is '-')
+    if alt == '-':
+        # For deletion: REF = preceding_base + deleted_sequence, ALT = preceding_base
+        vcf_ref = preceding_base + ref
+        vcf_alt = preceding_base
+        vcf_pos = pos - 1  # Shift position back by 1
+    # Handle insertions (Ref is '-')
+    elif ref == '-':
+        # For insertion: REF = preceding_base, ALT = preceding_base + inserted_sequence
+        vcf_ref = preceding_base
+        vcf_alt = preceding_base + alt
+        vcf_pos = pos - 1  # Shift position back by 1
+    else:
+        # Not an indel, shouldn't happen but handle gracefully
+        vcf_ref = ref
+        vcf_alt = alt
+        vcf_pos = pos
+    
+    return vcf_pos, vcf_ref, vcf_alt
+
+def natural_sort_key(chrom):
+    """Sort chromosomes naturally (1-22, X, Y, M, others)"""
+    chrom = chrom.replace('chr', '')
+    if chrom.isdigit():
+        return (0, int(chrom))
+    elif chrom == 'X':
+        return (1, 23)
+    elif chrom == 'Y':
+        return (1, 24)
+    elif chrom in ['M', 'MT']:
+        return (1, 25)
+    else:
+        return (2, chrom)
+
+def tsv_to_vcf(tsv_file, vcf_file, reference_fasta):
+    """Convert TSV file to VCF format"""
+    
+    # Load reference genome
+    print(f"Loading reference genome from {reference_fasta}...")
+    fasta = pyfaidx.Fasta(reference_fasta)
+    
+    # Store all variants in memory for sorting
+    variants = []
+    
+    # Process TSV file
+    with open_file(tsv_file) as f:
+        # Read header
+        header = f.readline().strip().split('\t')
+        
+        # Find column indices
+        chr_idx = header.index('Chr')
+        start_idx = header.index('Start')
+        ref_idx = header.index('Ref')
+        alt_idx = header.index('Alt')
+        
+        # Optional fields to include in INFO
+        info_field_indices = {}
+        if 'PredictedFunc.refGene' in header:
+            info_field_indices['Function'] = header.index('PredictedFunc.refGene')
+        
+        if 'Gene.refGene' in header:
+            info_field_indices['Gene'] = header.index('Gene.refGene')
+        
+        if 'avsnp150' in header:
+            rsid_idx = header.index('avsnp150')
+        else:
+            rsid_idx = None
+        
+        if 'Frequencies' in header:
+            info_field_indices['AF'] = header.index('Frequencies')
+        
+        if 'FILTER' in header:
+            filter_idx = header.index('FILTER')
+        else:
+            filter_idx = None
+        
+        # Process each variant
+        for line_num, line in enumerate(f, start=2):
+            if not line.strip():
+                continue
+            
+            fields = line.strip().split('\t')
+            
+            try:
+                chrom = fields[chr_idx]
+                if not chrom.startswith('chr'):
+                    chrom = 'chr' + chrom
+                
+                pos = int(fields[start_idx])
+                ref = fields[ref_idx]
+                alt = fields[alt_idx]
+                
+                # Handle indels
+                if ref == '-' or alt == '-':
+                    vcf_pos, vcf_ref, vcf_alt = format_indel(chrom, pos, ref, alt, fasta)
+                else:
+                    # SNP or MNP - no modification needed
+                    vcf_pos = pos
+                    vcf_ref = ref
+                    vcf_alt = alt
+                
+                # Get ID (rsID if available)
+                if rsid_idx is not None and fields[rsid_idx] and fields[rsid_idx] not in ['', '.']:
+                    variant_id = fields[rsid_idx]
+                else:
+                    variant_id = '.'
+                
+                # Get FILTER
+                if filter_idx is not None:
+                    filter_val = fields[filter_idx] if fields[filter_idx] else 'PASS'
+                else:
+                    filter_val = 'PASS'
+                
+                # Build INFO field
+                info_parts = []
+                for info_name, idx in info_field_indices.items():
+                    if fields[idx] and fields[idx] not in ['', '.']:
+                        # Clean up the value (remove special characters if needed)
+                        value = fields[idx].replace(';', ',').replace(' ', '_')
+                        info_parts.append(f"{info_name}={value}")
+                
+                info_field = ';'.join(info_parts) if info_parts else '.'
+                
+                # Store variant for sorting
+                variants.append({
+                    'chrom': chrom,
+                    'pos': vcf_pos,
+                    'id': variant_id,
+                    'ref': vcf_ref,
+                    'alt': vcf_alt,
+                    'filter': filter_val,
+                    'info': info_field,
+                    'original_line': line_num
+                })
+                
+            except Exception as e:
+                print(f"Error processing line {line_num}: {e}", file=sys.stderr)
+                print(f"Line content: {line}", file=sys.stderr)
+                continue
+    
+    # Sort variants by chromosome and position
+    print(f"Sorting {len(variants)} variants...")
+    variants.sort(key=lambda x: (natural_sort_key(x['chrom']), x['pos'], x['ref'], x['alt']))
+    
+    # Check for duplicates at same position and warn
+    prev_variant = None
+    for variant in variants:
+        if prev_variant and prev_variant['chrom'] == variant['chrom'] and prev_variant['pos'] == variant['pos']:
+            print(f"Warning: Multiple variants at {variant['chrom']}:{variant['pos']} (original lines {prev_variant['original_line']}, {variant['original_line']})", 
+                  file=sys.stderr)
+        prev_variant = variant
+    
+    # Open output VCF file
+    out_handle = gzip.open(vcf_file, 'wt') if vcf_file.endswith('.gz') else open(vcf_file, 'w')
+    
+    # Write VCF header
+    out_handle.write("##fileformat=VCFv4.2\n")
+    out_handle.write(f"##fileDate={datetime.now().strftime('%Y%m%d')}\n")
+    out_handle.write(f"##source=TSV_to_VCF_converter\n")
+    out_handle.write(f"##reference={reference_fasta}\n")
+    
+    # Write INFO field descriptions
+    if 'Function' in info_field_indices:
+        out_handle.write('##INFO=<ID=Function,Number=1,Type=String,Description="Predicted function from refGene">\n')
+    if 'Gene' in info_field_indices:
+        out_handle.write('##INFO=<ID=Gene,Number=1,Type=String,Description="Gene from refGene">\n')
+    if 'AF' in info_field_indices:
+        out_handle.write('##INFO=<ID=AF,Number=A,Type=Float,Description="Allele frequency">\n')
+    
+    # Write contigs in order (optional but good practice)
+    seen_chroms = sorted(set(v['chrom'] for v in variants), key=natural_sort_key)
+    for chrom in seen_chroms:
+        try:
+            length = len(fasta[chrom])
+            out_handle.write(f"##contig=<ID={chrom},length={length}>\n")
+        except:
+            # If we can't get the length, just write the contig ID
+            out_handle.write(f"##contig=<ID={chrom}>\n")
+    
+    # Write VCF column headers
+    out_handle.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
+    
+    # Write sorted variants
+    for variant in variants:
+        out_handle.write(f"{variant['chrom']}\t{variant['pos']}\t{variant['id']}\t"
+                        f"{variant['ref']}\t{variant['alt']}\t.\t{variant['filter']}\t{variant['info']}\n")
+    
+    out_handle.close()
+    fasta.close()
+    print(f"VCF file written to {vcf_file}")
+    print(f"Total variants written: {len(variants)}")
+
+def main():
+    parser = argparse.ArgumentParser(description='Convert TSV variant file to VCF format')
+    parser.add_argument('tsv_file', help='Input TSV file (can be gzipped)')
+    parser.add_argument('vcf_file', help='Output VCF file (use .gz extension for compressed output)')
+    parser.add_argument('reference_fasta', help='Reference genome FASTA file (must be indexed with samtools faidx)')
+    
+    args = parser.parse_args()
+    
+    tsv_to_vcf(args.tsv_file, args.vcf_file, args.reference_fasta)
+
+if __name__ == '__main__':
+    main()