c6eed4550b6684cc7cde7779b9084a72573b5d83 markd Fri Mar 11 19:54:00 2022 -0800 added ability to read ascii version of bigRmskAlign files diff --git src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c index 9f67c94..ebb039d 100644 --- src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c +++ src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c @@ -1,45 +1,49 @@ /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */ #include "common.h" #include "linefile.h" #include "rmskAlign.h" +#include "bigRmskAlignBed.h" #include "psl.h" #include "options.h" void usage() /* Explain usage and exit. */ { errAbort( "rmskAlignToPsl - convert repeatmasker alignments to PSLs\n" "\n" "usage:\n" " rmskAlignToPsl rmskAlignTab rmskPslFile\n" "\n" + " -bigRmsk - input is the text version of bigRmskAlignBed files.\n" " -dump - print alignments to stdout for debugging purposes\n" "\n" "This convert *.fa.align.tsv file, created by\n" "RepeatMasker/util/rmToUCSCTables.pl into a PSL file.\n" "Non-TE Repeats without consensus sequence are not included.\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { + {"bigRmsk", OPTION_BOOLEAN}, {"dump", OPTION_BOOLEAN}, {NULL, 0}, }; +static boolean bigRmsk = FALSE; static boolean dump = FALSE; /* * rmskAlign notes * * - genoStart - genoEnd are zero-base, half-open * - repStart, repEnd - are one-based * - rmskAlign records are multi-part, grouped by id. * - A zero-repeat length record in the middle of group * will have repStart == repEnd, appearing to have a length of 1. * It appears these all have a alignment starting with '-' * * Alignment string format: * * from hg/hgc/rmskJoinedClick.c @@ -162,57 +166,112 @@ static void rmskAlignListPrint(struct rmskAlign *alignParts, FILE *fh) /* print out alignments in list */ { for (struct rmskAlign *ap = alignParts; ap != NULL; ap = ap->next) rmskAlignPrint(ap, alignParts, fh); } static boolean shouldConvert(struct rmskAlign *rmskAlign) /* does this repeat type have a consensus and should be converted? */ { return !sameString(rmskAlign->repClass, "Simple_repeat"); } -static struct rmskAlign **groupRmskAligns(struct rmskAlign **rmskAlignsPtr, +static struct rmskAlign **groupRmskAligns(struct rmskAlign *rmskAligns, unsigned *maxAlignIdPtr) /* build array of alignment by ids */ { unsigned maxAlignId = 0; -for (struct rmskAlign *ra = *rmskAlignsPtr; ra != NULL; ra = ra->next) +for (struct rmskAlign *ra = rmskAligns; ra != NULL; ra = ra->next) maxAlignId = max(maxAlignId, ra->id); struct rmskAlign **rmskAlignsById = AllocN(struct rmskAlign *, maxAlignId + 1); -for (struct rmskAlign *ra = slPopHead(rmskAlignsPtr); ra != NULL; ra = slPopHead(rmskAlignsPtr)) +for (struct rmskAlign *ra = slPopHead(&rmskAligns); ra != NULL; ra = slPopHead(&rmskAligns)) slAddTail((&rmskAlignsById[ra->id]), ra); // keep in genomic order *maxAlignIdPtr = maxAlignId; return rmskAlignsById; } int rmskAlignGenomeCmp(const void *va, const void *vb) /* sort in ascending genomic location */ { const struct rmskAlign *a = *((struct rmskAlign **)va); const struct rmskAlign *b = *((struct rmskAlign **)vb); int diff = strcmp(a->genoName, b->genoName); if (diff != 0) return diff; return a->genoStart - b->genoStart; } +static struct rmskAlign **loadRmskAlign(char *rmskAlignFile, + unsigned *maxAlignIdPtr) +/* load rmskAlign file into an array of lists by indexed by alignment id */ +{ +struct rmskAlign *rmskAligns = rmskAlignLoadAllByTab(rmskAlignFile); +slSort(&rmskAligns, rmskAlignGenomeCmp); + +return groupRmskAligns(rmskAligns, maxAlignIdPtr); +} + +static struct rmskAlign *bigRmskAlignToRmskAlign(struct bigRmskAlignBed* bigRmskAlign) +/* convert a bigRmskAlign record to an rmskAlign record */ +{ +struct rmskAlign *rmskAlign; +AllocVar(rmskAlign); + +rmskAlign->swScore = bigRmskAlign->score; +rmskAlign->milliDiv = 1000 * bigRmskAlign->percSubst; +rmskAlign->milliDel = 1000 * bigRmskAlign->percDel; +rmskAlign->milliIns = 1000 * bigRmskAlign->percIns; +rmskAlign->genoName = cloneString(bigRmskAlign->chrom); +rmskAlign->genoStart = bigRmskAlign->chromStart; +rmskAlign->genoEnd = bigRmskAlign->chromEnd; +rmskAlign->genoLeft = bigRmskAlign->chromRemain; +rmskAlign->strand[0] = bigRmskAlign->strand[0]; +rmskAlign->repName = cloneString(bigRmskAlign->repName); +rmskAlign->repClass = cloneString(bigRmskAlign->repType); +rmskAlign->repFamily = cloneString(bigRmskAlign->repSubtype); +rmskAlign->repStart = bigRmskAlign->repStart; +rmskAlign->repEnd = bigRmskAlign->repEnd; +rmskAlign->repLeft = bigRmskAlign->repRemain; +rmskAlign->id = bigRmskAlign->id; +rmskAlign->alignment = cloneString(bigRmskAlign->calignData); + +return rmskAlign; +} + +static struct rmskAlign **loadBigRmskAlign(char *bigRmskAlignFile, + unsigned *maxAlignIdPtr) +/* load bigRmskAlignBed file into an array of lists by indexed by alignment id */ +{ +struct bigRmskAlignBed *bigRmskAligns = bigRmskAlignBedLoadAllByTab(bigRmskAlignFile); +struct rmskAlign *rmskAligns = NULL; +struct bigRmskAlignBed *bigRmskAlign; + +for (bigRmskAlign = slPopHead(&bigRmskAligns); bigRmskAlign != NULL; bigRmskAlign = slPopHead(&bigRmskAligns)) + { + slAddHead(&rmskAligns, bigRmskAlignToRmskAlign(bigRmskAlign)); + bigRmskAlignBedFree(&bigRmskAlign); + } + +slSort(&rmskAligns, rmskAlignGenomeCmp); +return groupRmskAligns(rmskAligns, maxAlignIdPtr); +} + static boolean rmskLinear(struct rmskAlign *prev, struct rmskAlign *next) /* are the two alignments linear? assumes genomic sort */ { // rmskAlign 1-based and not in strand-specific order if (prev->strand[0] == '+') return next->repStart >= prev->repEnd; else return prev->repEnd > next->repStart; } static boolean rmskOverlap(struct rmskAlign *rm1, struct rmskAlign *rm2) /* do they overlap in genomic or repeat coordinates */ { @@ -549,38 +608,39 @@ if (psl != NULL) { pslTabOut(psl, outFh); pslFree(&psl); } } } } static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile) /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */ { // load all, so we can join ones split by other insertions by id // don't bother freeing -struct rmskAlign *rmskAligns = rmskAlignLoadAllByTab(rmskAlignFile); -slSort(&rmskAligns, rmskAlignGenomeCmp); - -// get counts parts for each id -unsigned maxAlignId; -struct rmskAlign **rmskAlignGroups = groupRmskAligns(&rmskAligns, &maxAlignId); +struct rmskAlign **rmskAlignGroups = NULL; +unsigned maxAlignId = 0; +if (bigRmsk) + rmskAlignGroups = loadBigRmskAlign(rmskAlignFile, &maxAlignId); +else + rmskAlignGroups = loadRmskAlign(rmskAlignFile, &maxAlignId); FILE* outFh = mustOpen(rmskPslFile, "w"); for (unsigned id = 0; id <= maxAlignId; id++) convertAlignGroup(&(rmskAlignGroups[id]), outFh); carefulClose(&outFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); +bigRmsk = optionExists("bigRmsk"); dump = optionExists("dump"); rmskAlignToPsl(argv[1], argv[2]); return 0; }