src/hg/makeDb/scripts/ncbiCloneEndsCH1073/makeBed.py 8faeb3cba60c7cb842bc17c17a57c9b53ef1b478

8faeb3cba60c7cb842bc17c17a57c9b53ef1b478
max
  Tue Apr 21 02:51:32 2026 -0700
ncbiCloneEndsCH1073: add NCBI CH1073 BAC library clone end placements track on danRer11, refs #35059

210,777 unique-concordant clone-insert placements from NCBI's CH1073
(RZPD-1073 / DanioKey) library clone report. Separate from the existing
bacEndPairsLift (danRer4 -> danRer11 UCSC-BLAT lift), which is left in place.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/ncbiCloneEndsCH1073/makeBed.py src/hg/makeDb/scripts/ncbiCloneEndsCH1073/makeBed.py
new file mode 100755
index 00000000000..7a5b9ffa315
--- /dev/null
+++ src/hg/makeDb/scripts/ncbiCloneEndsCH1073/makeBed.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""Convert an NCBI clone-reports GFF (clone_insert records only) to a
+BED 6+7 file matching cloneEnds.as.  Maps RefSeq contig names to UCSC
+chrom names via refSeq.ucscName.tab and clamps to chrom.sizes.
+
+Usage:
+  makeBed.py <refSeqMap.tab> <chrom.sizes> <input.gff> > out.bed
+"""
+
+import sys
+
+OVERSIZE_THRESHOLD = 500_000
+
+
+def load_map(path):
+    m = {}
+    with open(path) as fh:
+        for line in fh:
+            line = line.rstrip("\n")
+            if not line:
+                continue
+            k, v = line.split("\t", 1)
+            m[k] = v
+    return m
+
+
+def load_sizes(path):
+    s = {}
+    with open(path) as fh:
+        for line in fh:
+            line = line.rstrip("\n")
+            if not line:
+                continue
+            name, size = line.split("\t")
+            s[name] = int(size)
+    return s
+
+
+def parse_attrs(field):
+    out = {}
+    for kv in field.split(";"):
+        if "=" in kv:
+            k, v = kv.split("=", 1)
+            out[k] = v
+    return out
+
+
+def main(map_path, sizes_path, gff_path):
+    refseq_to_ucsc = load_map(map_path)
+    sizes = load_sizes(sizes_path)
+
+    n_in = n_out = n_unmapped = n_oversize = n_nosize = 0
+    unmapped_contigs = {}
+
+    with open(gff_path) as fh:
+        for line in fh:
+            if line.startswith("#"):
+                continue
+            parts = line.rstrip("\n").split("\t")
+            if len(parts) < 9:
+                continue
+            if parts[2] != "clone_insert":
+                continue
+            n_in += 1
+
+            refseq = parts[0]
+            ucsc = refseq_to_ucsc.get(refseq)
+            if not ucsc or ucsc not in sizes:
+                n_unmapped += 1
+                unmapped_contigs[refseq] = unmapped_contigs.get(refseq, 0) + 1
+                continue
+
+            start = int(parts[3]) - 1
+            end = int(parts[4])
+            chrom_size = sizes[ucsc]
+            if start < 0:
+                start = 0
+            if end > chrom_size:
+                sys.stderr.write(
+                    f"# warn: clamping end {end} -> {chrom_size} on {ucsc}\n")
+                end = chrom_size
+            if start >= end:
+                n_nosize += 1
+                continue
+
+            attrs = parse_attrs(parts[8])
+            name = attrs.get("Name", "?")
+            ncbi_id = attrs.get("ID", "?")
+            concordant = attrs.get("concordant", "?")
+            unique = attrs.get("unique", "?")
+            method = attrs.get("placement-method", "?")
+            assm_unit = attrs.get("assm_unit_name", "?").replace(" ", "_")
+
+            insert_size = end - start
+            oversize = "TRUE" if insert_size > OVERSIZE_THRESHOLD else "FALSE"
+            if oversize == "TRUE":
+                n_oversize += 1
+
+            row = [
+                ucsc, str(start), str(end), name, "0", "+",
+                str(insert_size), concordant, unique, assm_unit,
+                oversize, ncbi_id, method,
+            ]
+            print("\t".join(row))
+            n_out += 1
+
+    sys.stderr.write(f"# input clone_insert rows:  {n_in}\n")
+    sys.stderr.write(f"# output bed rows:          {n_out}\n")
+    sys.stderr.write(f"# unmapped contig rows:     {n_unmapped}\n")
+    sys.stderr.write(f"# zero-length rows skipped: {n_nosize}\n")
+    sys.stderr.write(f"# oversize (>500kb) flagged:{n_oversize}\n")
+    if unmapped_contigs:
+        sys.stderr.write("# unmapped contigs (top 10 by count):\n")
+        top = sorted(unmapped_contigs.items(),
+                     key=lambda kv: -kv[1])[:10]
+        for k, v in top:
+            sys.stderr.write(f"#   {k}\t{v}\n")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        sys.stderr.write(
+            "usage: makeBed.py <refSeqMap.tab> <chrom.sizes> <input.gff>\n")
+        sys.exit(1)
+    main(sys.argv[1], sys.argv[2], sys.argv[3])