57d31ee9aede2bf610907285b05af8d9cbfe9bec markd Sun Mar 13 10:45:03 2022 -0700 added ability to override incorrect sizes often found in RepeatMaster alignments diff --git src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c index ebb039d..2c72ad4 100644 --- src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c +++ src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c @@ -1,44 +1,49 @@ /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */ #include "common.h" #include "linefile.h" #include "rmskAlign.h" #include "bigRmskAlignBed.h" #include "psl.h" +#include "hash.h" #include "options.h" void usage() /* Explain usage and exit. */ { errAbort( "rmskAlignToPsl - convert repeatmasker alignments to PSLs\n" "\n" "usage:\n" " rmskAlignToPsl rmskAlignTab rmskPslFile\n" "\n" " -bigRmsk - input is the text version of bigRmskAlignBed files.\n" + " -repSizes=tab - two column tab file with repeat name and size.\n" + " Sometimes the repeat sizes are incorrect in the align file.\n" + " Obtain the sizes from this file instead.\n" " -dump - print alignments to stdout for debugging purposes\n" "\n" "This convert *.fa.align.tsv file, created by\n" "RepeatMasker/util/rmToUCSCTables.pl into a PSL file.\n" "Non-TE Repeats without consensus sequence are not included.\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"bigRmsk", OPTION_BOOLEAN}, + {"repSizes", OPTION_STRING}, {"dump", OPTION_BOOLEAN}, {NULL, 0}, }; static boolean bigRmsk = FALSE; static boolean dump = FALSE; /* * rmskAlign notes * * - genoStart - genoEnd are zero-base, half-open * - repStart, repEnd - are one-based * - rmskAlign records are multi-part, grouped by id. * - A zero-repeat length record in the middle of group * will have repStart == repEnd, appearing to have a length of 1. @@ -50,54 +55,79 @@ * Print RepeatMasker alignment data stored in RM's cAlign format. * The format is basically a lightly compressed diff format where * the query and subject are merged into one squence line. The * runs of exact matching sequences are interrupted by either * single base substitutions annotated as queryBase "/" subjectBase, * insertions in the subject annotated as "+ACTAT+", or deletions * in the query annotated as "-ACTTTG-". * * Alignment parts must be decoded separately and then concatenated. * as blocks starting with a '-' or '+' maybe terminated by the end of the string * * Alignments parts may overlap and must then be split. A given id might * have multiple repNames and are split into multiple PSL. */ + +static struct hash* loadRepSizes(char *repSizesFile) +/* load repeat sizes file into a hash */ +{ +struct hash* repSizes = hashNew(20); +struct lineFile* lf = lineFileOpen(repSizesFile, TRUE); +char *row[2]; +while (lineFileNextRowTab(lf, row, ArraySize(row))) + hashAddInt(repSizes, row[0], lineFileNeedNum(lf, row, 1)); +lineFileClose(&lf); +return repSizes; +} + static unsigned getGenomeSize(struct rmskAlign *alignParts) /* get the length or the chromosome */ { return alignParts->genoEnd + alignParts->genoLeft; } static unsigned getRepeatSize(struct rmskAlign *alignParts) /* Get the length of repeat sequence. This is not all that easy. When a * repeat is split, CrossMatch reports the same repLeft for other alignments, * at least in some cases. RepeatMasker compensates for this when creating * the .out. However, rmToUCSCTables.pl doesn't correct for this. So we just * take the largest length from all alignments. * * Example .align problem: * 103 20.51 0.00 9.86 chr1 147061 147161 (248809261) C AluJr#SINE/Alu (146) 166 70 [m_b3s356i8] 1 * 103 20.51 0.00 9.86 chr1 147481 147535 (248808887) C AluJr#SINE/Alu (146) 69 25 [m_b3s356i8] 1 + * + * Still this doesn't always get the correct size. */ { unsigned size = 0; for (struct rmskAlign *rmskAlign = alignParts; rmskAlign != NULL; rmskAlign = rmskAlign->next) size = max(rmskAlign->repEnd + rmskAlign->repLeft, size); return size; } +static unsigned lookupRepeatSize(struct rmskAlign *alignParts, + struct hash* repSizes) +/* get repeat size from supplied table in available, otherwise from + * alignments */ +{ +return (repSizes != NULL) + ? hashIntVal(repSizes, alignParts->repName) + : getRepeatSize(alignParts); +} + static void rmskAlignToPairwise(struct rmskAlign *rmskAlign, char *tStr, char *qStr) /* convert a rmsk alignment string to a pairwise string to print. Output * tStr/qStr should be allocated to the total alignment string from all parts * plus one. */ { /// see 'Alignment string format' above char *t = tStr, *q = qStr; char *a = rmskAlign->alignment; while (*a != '\0') { if (*a == '+') { // query insert a++; @@ -466,34 +496,35 @@ return NULL; } return blkCoords; } static void trimLeadingIndels(struct blkCoord **blkCoordsPtr) /* drop blocks blocks with indels at the start */ { struct blkCoord *blkCoords = *blkCoordsPtr; while ((blkCoords != NULL) && ((blkCoords->qSize == 0) || (blkCoords->tSize == 0))) freeMem(slPopHead(&blkCoords)); *blkCoordsPtr = blkCoords; } -static struct blkCoord *parseCAligns(struct rmskAlign *alignParts) +static struct blkCoord *parseCAligns(struct rmskAlign *alignParts, + struct hash* repSizes) /* parses alignment strings from all parts to go into a PSL */ { -unsigned repSize = getRepeatSize(alignParts); +unsigned repSize = lookupRepeatSize(alignParts, repSizes); // build list struct blkCoord *blkCoords = NULL; for (struct rmskAlign *alignPart = alignParts; alignPart != NULL; alignPart = alignPart->next) { struct blkCoord *blkCoordsPart = parseCAlign(alignPart, repSize); if (blkCoordsPart == NULL) { slFreeList(&blkCoords); return NULL; } blkCoords = slCat(blkCoords, blkCoordsPart); } // trim leading and trailing indels @@ -526,121 +557,128 @@ assert(blk->qSize == blk->tSize); psl->match += blk->matches; psl->repMatch += blk->matches; psl->misMatch += blk->mismatches; if (psl->blockCount >= *blockSpacePtr) pslGrow(psl, blockSpacePtr); psl->tStarts[psl->blockCount] = blk->tStart; psl->qStarts[psl->blockCount] = blk->qStart; psl->blockSizes[psl->blockCount] = blk->tSize; psl->blockCount += 1; } } } static struct psl *convertBlocksToPsl(struct rmskAlign *alignParts, - struct blkCoord *blkCoords) + struct blkCoord *blkCoords, + struct hash* repSizes) /* create a PSL from a repeat masker alignment */ { /* must use coordinates from blocks, as end-indels have been trimmed */ struct blkCoord *blkCountN = slLastEl(blkCoords); -unsigned qSize = getRepeatSize(alignParts); +unsigned qSize = lookupRepeatSize(alignParts, repSizes); unsigned qStart = blkCoords->qStart; unsigned qEnd = blkCountN->qStart + blkCountN->qSize; if (alignParts->strand[0] == '-') reverseUnsignedRange(&qStart, &qEnd, qSize); unsigned tSize = getGenomeSize(alignParts); unsigned tStart = blkCoords->tStart; unsigned tEnd = blkCountN->tStart + blkCountN->tSize; int blockSpace = slCount(blkCoords); struct psl *psl = pslNew(alignParts->repName, qSize, qStart, qEnd, alignParts->genoName, tSize, tStart, tEnd, alignParts->strand, blockSpace, 0); addPslBlocks(blkCoords, psl, &blockSpace); pslComputeInsertCounts(psl); slFreeList(&blkCoords); return psl; } -static struct psl *convertToPsl(struct rmskAlign *alignParts) +static struct psl *convertToPsl(struct rmskAlign *alignParts, + struct hash* repSizes) /* create a PSL from a repeat masker alignment, return NULL if fails */ { -struct blkCoord *blkCoords = parseCAligns(alignParts); +struct blkCoord *blkCoords = parseCAligns(alignParts, repSizes); if (blkCoords == NULL) return NULL; if (dump) blkCoordListPrint(blkCoords, stderr); -return convertBlocksToPsl(alignParts, blkCoords); +return convertBlocksToPsl(alignParts, blkCoords, repSizes); } -static struct psl *alignToPsl(struct rmskAlign *alignParts) +static struct psl *alignToPsl(struct rmskAlign *alignParts, struct hash* repSizes) /* convert and output one set of alignment parts */ { -struct psl* psl = convertToPsl(alignParts); +struct psl* psl = convertToPsl(alignParts, repSizes); if ((psl != NULL) && (pslCheck("rmskAlign", stderr, psl) != 0)) { fprintf(stderr, "Invalid PSL created\n"); pslTabOut(psl, stderr); rmskAlignListPrint(alignParts, stderr); errAbort("BUG invalid PSL created"); } return psl; } static void convertAlignGroup(struct rmskAlign **rmskAlignGroupPtr, + struct hash* repSizes, FILE* outFh) /* convert an alignment group */ { struct rmskAlign *alignParts; while ((alignParts = popAlignParts(rmskAlignGroupPtr)) != NULL) { if (dump) rmskAlignListPrint(alignParts, stderr); if (shouldConvert(alignParts)) { - struct psl *psl = alignToPsl(alignParts); + struct psl *psl = alignToPsl(alignParts, repSizes); if (psl != NULL) { pslTabOut(psl, outFh); pslFree(&psl); } } } } -static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile) +static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile, + struct hash* repSizes) /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */ { // load all, so we can join ones split by other insertions by id // don't bother freeing struct rmskAlign **rmskAlignGroups = NULL; unsigned maxAlignId = 0; if (bigRmsk) rmskAlignGroups = loadBigRmskAlign(rmskAlignFile, &maxAlignId); else rmskAlignGroups = loadRmskAlign(rmskAlignFile, &maxAlignId); FILE* outFh = mustOpen(rmskPslFile, "w"); for (unsigned id = 0; id <= maxAlignId; id++) - convertAlignGroup(&(rmskAlignGroups[id]), outFh); + convertAlignGroup(&(rmskAlignGroups[id]), repSizes, outFh); carefulClose(&outFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); bigRmsk = optionExists("bigRmsk"); dump = optionExists("dump"); -rmskAlignToPsl(argv[1], argv[2]); +struct hash* repSizes = NULL; +if (optionExists("repSizes")) + repSizes = loadRepSizes(optionVal("repSizes", NULL)); +rmskAlignToPsl(argv[1], argv[2], repSizes); return 0; }