#!/usr/bin/env python3
# Provenance (git log entry this script was introduced with):
#   commit 8faeb3cba60c7cb842bc17c17a57c9b53ef1b478
#   author max, Tue Apr 21 02:51:32 2026 -0700
#   ncbiCloneEndsCH1073: add NCBI CH1073 BAC library clone end placements
#   track on danRer11, refs #35059
#
#   210,777 unique-concordant clone-insert placements from NCBI's CH1073
#   (RZPD-1073 / DanioKey) library clone report. Separate from the existing
#   bacEndPairsLift (danRer4 -> danRer11 UCSC-BLAT lift), which is left in
#   place.
#
#   Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
#
# File: src/hg/makeDb/scripts/ncbiCloneEndsCH1073/refSeqNames.py
#   (new file, mode 100755)
"""Build a RefSeq-accession to UCSC-chrom-name map from an NCBI assembly
report (e.g. GCF_000002035.6_GRCz11_assembly_report.txt).  The report
already has the UCSC-style name in column 10; we just emit
`<refSeqAcc>\\t<ucscName>` for rows that have a RefSeq accession.
Writes tab-separated pairs to stdout."""

import sys

# Field values that mean "no value in this column": an empty field or the
# literal "na" placeholder.
_MISSING = ("", "na")


def _iter_name_pairs(lines):
    """Yield (refSeqAcc, ucscName) for every usable row in *lines*.

    *lines* is any iterable of text lines.  A row is skipped when it
    starts with "#", has fewer than 10 tab-separated columns, or has an
    empty / "na" value in column 7 (RefSeq accession) or column 10
    (UCSC-style name).
    """
    for line in lines:
        if line.startswith("#"):
            continue
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 10:
            continue
        refseq, ucsc = fields[6], fields[9]
        if refseq in _MISSING or ucsc in _MISSING:
            continue
        yield refseq, ucsc


def main(path: str) -> None:
    """Print one `<refSeqAcc>\\t<ucscName>` line per usable row of *path*.

    path: file name of the assembly report to read.
    Raises OSError (from open()) when the file cannot be opened.
    """
    # Decode explicitly instead of relying on the locale default, so the
    # same report parses the same way on every host.  errors="replace"
    # keeps a stray non-UTF-8 byte from aborting the whole run.
    with open(path, encoding="utf-8", errors="replace") as fh:
        for refseq, ucsc in _iter_name_pairs(fh):
            print(f"{refseq}\t{ucsc}")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.stderr.write(
            "usage: refSeqNames.py <GCF_xxxxxx.assembly.txt>\n")
        sys.exit(1)
    main(sys.argv[1])