c31d6f7174025d44270f64f086b06df10b4196a6 hiram Sat Nov 22 17:14:37 2025 -0800 now correctly managing GenArk GCA and GCF assembly names refs #34370 diff --git src/hg/ratStuff/mafSplit/mafSplit.c src/hg/ratStuff/mafSplit/mafSplit.c index 102d5cd20d4..d671d296afe 100644 --- src/hg/ratStuff/mafSplit/mafSplit.c +++ src/hg/ratStuff/mafSplit/mafSplit.c @@ -37,30 +37,33 @@ " Instead of auto-incrementing an integer to determine\n" " output filename, use the target sequence name\n" " to tack onto outRoot.\n" " -useHashedName=N For use only with -byTarget.\n" " Instead of auto-incrementing an integer or requiring\n" " a unique number in the sequence name, use a hash\n" " function on the sequence name to compute an N-bit\n" " number. This limits the max #filenames to 2^N and\n" " ensures that even if different subsets of sequences\n" " appear in different pairwise mafs, the split file\n" " names will be consistent (due to hash function).\n" " This option is useful when a \"scaffold-based\"\n" " assembly has more than one sequence name pattern,\n" " e.g. both chroms and scaffolds.\n" "\n" + "NOTE: as of November 2025 - can manage GenArk assembly names GCA_...\n" + " and GCF_... with their .n extensions. Can only work with such\n" + " such names that begin with GC." ); } static struct optionSpec options[] = { {"byTarget", OPTION_BOOLEAN}, {"outDirDepth", OPTION_INT}, {"useSequenceName", OPTION_BOOLEAN}, {"useFullSequenceName", OPTION_BOOLEAN}, {"useHashedName", OPTION_INT}, {NULL, 0}, }; /* Option variables */ static boolean byTarget = FALSE; static int outDirDepth = 0; @@ -164,31 +167,43 @@ targetName, path); f = mustOpen(path, "a"); } else { int seqNum = (*pSeqNum)++; if (useSequenceName) seqNum = numberFromName(targetName); else if (hashedNameBits > 0) seqNum = numberFromHashedName(targetName, hashedNameBits); if (useFullSequenceName) { /* skip over db. 
prefix if any */ char *target = strchr(targetName,'.'); if (target) + { ++target; + /* if GenArk assembly, get to the next dot */ + if (startsWith("GC", targetName)) + { + char *nextDot = strchr(target,'.'); + if (nextDot) + { + ++nextDot; + target = nextDot; + } + } /* else: no next dot, leave target it where it is */ + } else target = targetName; path = mkOutPath(outRootDir, outRootFile, seqNum, target); } else path = mkOutPath(outRootDir, outRootFile, seqNum, NULL); verbose(3, "Opening path %s for writing and adding it to hash " "for %s\n", path, targetName); f = mustOpen(path, "w"); fprintf(f, "##maf version=1 scoring=blastz\n"); if (hashedNameBits > 0) hashAdd(pathHash, path, path); else hashAdd(pathHash, targetName, path); } @@ -219,31 +234,41 @@ hashAdd(splitHash, bed->chrom, cloneBed(bed)); } else slAddTail(&bedList, cloneBed(bed)); freeMem(bed); } return splitHash; } char *chromFromSrc(char *src) /* get chrom name from <db>.<chrom> */ { char *p; if ((p = strchr(src, '.')) == NULL) errAbort("Can't find chrom in MAF component src: %s\n", src); -return ++p; +++p; /* skip the dot to the word following */ +if (startsWith("GC", src)) + { + char *nextDot = strchr(p,'.'); + if (nextDot) + { + ++nextDot; /* skip the dot to the word following */ + p = nextDot; /* new answer */ + } + } /* else: no next dot, leave p it where it is */ +return p; } void splitMafFile(char *file, char *outDir, char *outPrefix, struct hash *splitHash) /* split file based on positions in hash */ { char *chrom = NULL; char outFile[PATH_LEN]; int ix = 0; FILE *f; boolean nextFile = TRUE; struct bed *bed, *bedList = NULL; int splitPos = 0; struct mafFile *mf = mafOpen(file); struct mafAli *maf = NULL;