static char *chromFromSrc(char *src)
/* Locate the separator between the database (assembly) name and the
 * sequence name in a MAF component src string of the form <db>.<chrom>.
 *
 * Returns a pointer to the '.' separator itself, not to the chrom name;
 * a caller that wants the chrom name steps one character past it.
 * Calls errAbort when src contains no '.' at all.
 *
 * Names that begin with "GC" (GenArk GCA_... / GCF_... accessions) carry
 * a numeric ".n" version as part of the database name, for example
 * GCF_000001405.40.chr1, so for those the separator is the second dot.
 * The second dot is accepted only when the text between the two dots is
 * entirely digits.  That keeps an unversioned name followed by a dotted
 * sequence name, such as GCF_000001405.NW_003613580.1, from being split
 * in the middle of the sequence name. */
{
char *firstDot = strchr(src, '.');
if (firstDot == NULL)
    errAbort("Can't find chrom in MAF component src: %s\n", src);

/* ordinary <db>.<chrom> name: the first dot is the answer */
if (!startsWith("GC", src))
    return firstDot;

/* walk over the candidate version number that follows the first dot */
char *versionEnd = firstDot + 1;
while (*versionEnd >= '0' && *versionEnd <= '9')
    ++versionEnd;

/* at least one digit, immediately followed by another dot: that second
 * dot separates <accession>.<version> from the chrom name */
if (versionEnd > firstDot + 1 && *versionEnd == '.')
    return versionEnd;

/* no well formed version present, fall back to the first dot */
return firstDot;
}
subSpecies->blockStatus.masterEnd = masterMc->start + masterMc->size ; sprintf(buffer2, "%s%c%s", masterChrom,mc->strand,chrom); @@ -351,66 +374,64 @@ { struct mafComp *mc = NULL, *masterMc, *lastMc = NULL; struct mafAli *newMaf = NULL; struct blockStatus *blockStatus; nextMaf = maf->next; masterMc=maf->components; if (masterMc->start > lastEnd) { struct subSpecies *species; for(species = speciesList; species; species = species->next) { mc = NULL; -// printf("looking at %s\n",species->name); blockStatus = &species->blockStatus; if (blockStatus->mc) { -// printf("should match at %s\n",blockStatus->mc->src); switch (blockStatus->mc->rightStatus) { case MAF_MISSING_STATUS: //printf("missing right\n"); case MAF_NEW_NESTED_STATUS: case MAF_MAYBE_NEW_NESTED_STATUS: case MAF_CONTIG_STATUS: case MAF_TANDEM_STATUS: case MAF_INSERT_STATUS: AllocVar(mc); mc->rightStatus = mc->leftStatus = blockStatus->mc->rightStatus; mc->rightLen = mc->leftLen = blockStatus->mc->rightLen; mc->src = blockStatus->mc->src; mc->srcSize = blockStatus->mc->srcSize; mc->strand = blockStatus->mc->strand; mc->start = blockStatus->mc->start + blockStatus->mc->size; if (lastMc == NULL) { struct mafComp *miniMasterMc = NULL; char *seqName; struct dnaSeq *seq; AllocVar(miniMasterMc); miniMasterMc->next = mc; miniMasterMc->strand = '+'; miniMasterMc->srcSize = masterMc->srcSize; miniMasterMc->src = masterMc->src; miniMasterMc->start = lastEnd; miniMasterMc->size = masterMc->start - lastEnd; - if ((seqName = strchr(miniMasterMc->src, '.')) != NULL) + if ((seqName = chromFromSrc(miniMasterMc->src)) != NULL) seqName++; else seqName = miniMasterMc->src; // printf("hole filled from %d to %d\n",lastEnd, masterMc->start); seq = twoBitReadSeqFrag(twoBit, seqName, lastEnd, masterMc->start); miniMasterMc->text = seq->dna; AllocVar(newMaf); newMaf->textSize = maf->textSize; newMaf->components = miniMasterMc; newMaf->next = maf; if (prevMaf) prevMaf->next = newMaf; else @@ -502,31 +523,30 @@ mc->srcSize = 200000; 
mc->size = maf->textSize; mc->text = needMem(mc->size + 1); memset(mc->text, 'N', mc->size); } else if (addDash) { mc->size = masterMc->size; mc->text = needMem(mc->size + 1); if (mc->size == 0) errAbort("bad dash add"); memset(mc->text, '-', mc->size); mc->text[mc->size] = 0; mc->size = 0; } - break; default: break; } } } if (mc) { blockStatus->mc = mc; } } } } struct hash *readBed(char *fileName) @@ -541,33 +561,30 @@ while (lineFileRow(lf, row)) { hel = hashLookup(hash, row[0]); if ((lastHel) && (hel != lastHel)) { assert(bedHead != NULL); slReverse(&bedHead->list); } if (hel == NULL) { char *ptr; AllocVar(bedHead); - if ((ptr = strchr(row[0], '.')) != NULL) - ptr++; - else ptr = row[0]; hel = hashAdd(hash, ptr, bedHead); } bedHead = hel->val; AllocVar(bed); bed->chrom = hel->name; bed->chromStart = lineFileNeedNum(lf, row, 1); bed->chromEnd = lineFileNeedNum(lf, row, 2); if (bed->chromStart > bed->chromEnd) errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); slAddHead(&bedHead->list, (struct bed *)bed); lastHel = hel; } if (bedHead != NULL)