d46e25fbafa02b136e9cdbb6eafcbf01583d2a8a hiram Tue Jan 14 10:51:47 2025 -0800 dramatically faster than the perl equivalent no redmine diff --git src/hg/utils/automation/kmerPrint.py src/hg/utils/automation/kmerPrint.py new file mode 100755 index 0000000..00319eb --- /dev/null +++ src/hg/utils/automation/kmerPrint.py @@ -0,0 +1,61 @@ +#!/cluster/software/bin/python3 + +import sys +import gzip + +def countKmers(sequence, chrName, kmerSize, chrStart): + """Count and print kmers from a sequence.""" + sequenceLen = len(sequence) + for i in range(sequenceLen - kmerSize + 1): + kmer = sequence[i:i + kmerSize] + if all(base in "ACGT" for base in kmer): + print(f"{kmer}\t{chrName}\t{chrStart}") + chrStart += 1 + +def processFastaFiles(kmerSize, fastaFiles): + """Process each FASTA file, reading sequences and generating kmers.""" + chrName = "" + chrStart = 0 + for file in fastaFiles: + sequence = "" + if file == "stdin": + fileHandle = sys.stdin + elif file.endswith(".gz"): + fileHandle = gzip.open(file, "rt") + else: + fileHandle = open(file, "r") + + with fileHandle as fh: + for line in fh: + line = line.strip() + if not line or line.startswith("#"): # Skip empty and comment lines + continue + if line.startswith(">"): + if sequence: + countKmers(sequence, chrName, kmerSize, chrStart) + sequence = "" + chrName = line[1:] # Remove ">" + chrStart = 0 + else: + sequence += line.upper() + if sequence: + countKmers(sequence, chrName, kmerSize, chrStart) + sequence = "" + +def main(): + """Main function to parse arguments and execute the script.""" + if len(sys.argv) < 3: + sys.stderr.write( + "usage: kmerPrint.py <N> <fastaFile.fa> [moreFastaFiles.fa] | gzip -c > result.gz\n" + "where <N> is kmer size\n" + "followed by fasta file names (can be 'stdin')\n" + ) + sys.exit(255) + + kmerSize = int(sys.argv[1]) + fastaFiles = sys.argv[2:] + + processFastaFiles(kmerSize, fastaFiles) + +if __name__ == "__main__": + main()