015db67f25bdbaffebb795b6e952c405f15e1e23 markd Sun Mar 13 14:10:06 2022 -0700 premature commit of -repSizes, this really made things worse; back out diff --git src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c index 8dcc404..1cf25e6 100644 --- src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c +++ src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c @@ -5,46 +5,41 @@ #include "bigRmskAlignBed.h" #include "psl.h" #include "hash.h" #include "options.h" void usage() /* Explain usage and exit. */ { errAbort( "rmskAlignToPsl - convert repeatmasker alignments to PSLs\n" "\n" "usage:\n" " rmskAlignToPsl rmskAlignTab rmskPslFile\n" "\n" " -bigRmsk - input is the text version of bigRmskAlignBed files.\n" - " -repSizes=tab - two column tab file with repeat name and size.\n" - " Sometimes the repeat sizes are incorrect in the align file.\n" - " Obtain the sizes from this file instead. Alignments not found\n" - " in file are dropped.\n" " -dump - print alignments to stdout for debugging purposes\n" "\n" "This convert *.fa.align.tsv file, created by\n" "RepeatMasker/util/rmToUCSCTables.pl into a PSL file.\n" "Non-TE Repeats without consensus sequence are not included.\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"bigRmsk", OPTION_BOOLEAN}, - {"repSizes", OPTION_STRING}, {"dump", OPTION_BOOLEAN}, {NULL, 0}, }; static boolean bigRmsk = FALSE; static boolean dump = FALSE; /* * rmskAlign notes * * - genoStart - genoEnd are zero-base, half-open * - repStart, repEnd - are one-based * - rmskAlign records are multi-part, grouped by id. * - A zero-repeat length record in the middle of group * will have repStart == repEnd, appearing to have a length of 1. @@ -57,78 +52,56 @@ * The format is basically a lightly compressed diff format where * the query and subject are merged into one squence line. The * runs of exact matching sequences are interrupted by either * single base substitutions annotated as queryBase "/" subjectBase, * insertions in the subject annotated as "+ACTAT+", or deletions * in the query annotated as "-ACTTTG-". * * Alignment parts must be decoded separately and then concatenated. * as blocks starting with a '-' or '+' maybe terminated by the end of the string * * Alignments parts may overlap and must then be split. A given id might * have multiple repNames and are split into multiple PSL. */ -static struct hash* loadRepSizes(char *repSizesFile) -/* load repeat sizes file into a hash */ -{ -struct hash* repSizes = hashNew(20); -struct lineFile* lf = lineFileOpen(repSizesFile, TRUE); -char *row[2]; -while (lineFileNextRowTab(lf, row, ArraySize(row))) - hashAddInt(repSizes, row[0], lineFileNeedNum(lf, row, 1)); -lineFileClose(&lf); -return repSizes; -} - static unsigned getGenomeSize(struct rmskAlign *alignParts) /* get the length or the chromosome */ { return alignParts->genoEnd + alignParts->genoLeft; } static unsigned getRepeatSize(struct rmskAlign *alignParts) /* Get the length of repeat sequence. This is not all that easy. When a * repeat is split, CrossMatch reports the same repLeft for other alignments, * at least in some cases. RepeatMasker compensates for this when creating * the .out. However, rmToUCSCTables.pl doesn't correct for this. So we just * take the largest length from all alignments. * * Example .align problem: * 103 20.51 0.00 9.86 chr1 147061 147161 (248809261) C AluJr#SINE/Alu (146) 166 70 [m_b3s356i8] 1 * 103 20.51 0.00 9.86 chr1 147481 147535 (248808887) C AluJr#SINE/Alu (146) 69 25 [m_b3s356i8] 1 * * Still this doesn't always get the correct size. */ { unsigned size = 0; for (struct rmskAlign *rmskAlign = alignParts; rmskAlign != NULL; rmskAlign = rmskAlign->next) size = max(rmskAlign->repEnd + rmskAlign->repLeft, size); return size; } -static unsigned lookupRepeatSize(struct rmskAlign *alignParts, - struct hash* repSizes) -/* get repeat size from supplied table in available, otherwise from - * alignments */ -{ -return (repSizes != NULL) - ? hashIntVal(repSizes, alignParts->repName) - : getRepeatSize(alignParts); -} - static void rmskAlignToPairwise(struct rmskAlign *rmskAlign, char *tStr, char *qStr) /* convert a rmsk alignment string to a pairwise string to print. Output * tStr/qStr should be allocated to the total alignment string from all parts * plus one. */ { /// see 'Alignment string format' above char *t = tStr, *q = qStr; char *a = rmskAlign->alignment; while (*a != '\0') { if (*a == '+') { // query insert a++; @@ -497,35 +470,34 @@ return NULL; } return blkCoords; } static void trimLeadingIndels(struct blkCoord **blkCoordsPtr) /* drop blocks blocks with indels at the start */ { struct blkCoord *blkCoords = *blkCoordsPtr; while ((blkCoords != NULL) && ((blkCoords->qSize == 0) || (blkCoords->tSize == 0))) freeMem(slPopHead(&blkCoords)); *blkCoordsPtr = blkCoords; } -static struct blkCoord *parseCAligns(struct rmskAlign *alignParts, - struct hash* repSizes) +static struct blkCoord *parseCAligns(struct rmskAlign *alignParts) /* parses alignment strings from all parts to go into a PSL */ { -unsigned repSize = lookupRepeatSize(alignParts, repSizes); +unsigned repSize = getRepeatSize(alignParts); // build list struct blkCoord *blkCoords = NULL; for (struct rmskAlign *alignPart = alignParts; alignPart != NULL; alignPart = alignPart->next) { struct blkCoord *blkCoordsPart = parseCAlign(alignPart, repSize); if (blkCoordsPart == NULL) { slFreeList(&blkCoords); return NULL; } blkCoords = slCat(blkCoords, blkCoordsPart); } // trim leading and trailing indels @@ -557,158 +529,127 @@ { assert(blk->qSize == blk->tSize); psl->repMatch += blk->matches; // all matches are repeat matches psl->misMatch += blk->mismatches; if (psl->blockCount >= *blockSpacePtr) pslGrow(psl, blockSpacePtr); psl->tStarts[psl->blockCount] = blk->tStart; psl->qStarts[psl->blockCount] = blk->qStart; psl->blockSizes[psl->blockCount] = blk->tSize; psl->blockCount += 1; } } } static struct psl *convertBlocksToPsl(struct rmskAlign *alignParts, - struct blkCoord *blkCoords, - struct hash* repSizes) + struct blkCoord *blkCoords) /* create a PSL from a repeat masker alignment */ { /* must use coordinates from blocks, as end-indels have been trimmed */ struct blkCoord *blkCountN = slLastEl(blkCoords); -unsigned qSize = lookupRepeatSize(alignParts, repSizes); +unsigned qSize = getRepeatSize(alignParts); unsigned qStart = blkCoords->qStart; unsigned qEnd = blkCountN->qStart + blkCountN->qSize; if (alignParts->strand[0] == '-') reverseUnsignedRange(&qStart, &qEnd, qSize); unsigned tSize = getGenomeSize(alignParts); unsigned tStart = blkCoords->tStart; unsigned tEnd = blkCountN->tStart + blkCountN->tSize; int blockSpace = slCount(blkCoords); struct psl *psl = pslNew(alignParts->repName, qSize, qStart, qEnd, alignParts->genoName, tSize, tStart, tEnd, alignParts->strand, blockSpace, 0); addPslBlocks(blkCoords, psl, &blockSpace); pslComputeInsertCounts(psl); slFreeList(&blkCoords); return psl; } -static struct psl *convertToPsl(struct rmskAlign *alignParts, - struct hash* repSizes) +static struct psl *convertToPsl(struct rmskAlign *alignParts) /* create a PSL from a repeat masker alignment, return NULL if fails */ { -struct blkCoord *blkCoords = parseCAligns(alignParts, repSizes); +struct blkCoord *blkCoords = parseCAligns(alignParts); if (blkCoords == NULL) return NULL; if (dump) blkCoordListPrint(blkCoords, stderr); -return convertBlocksToPsl(alignParts, blkCoords, repSizes); +return convertBlocksToPsl(alignParts, blkCoords); } -static struct psl *alignToPsl(struct rmskAlign *alignParts, struct hash* repSizes) +static struct psl *alignToPsl(struct rmskAlign *alignParts) /* convert and output one set of alignment parts */ { -struct psl* psl = convertToPsl(alignParts, repSizes); +struct psl* psl = convertToPsl(alignParts); if ((psl != NULL) && (pslCheck("rmskAlign", stderr, psl) != 0)) { fprintf(stderr, "Invalid PSL created\n"); pslTabOut(psl, stderr); rmskAlignListPrint(alignParts, stderr); errAbort("BUG invalid PSL created"); } return psl; } static void convertAlignGroup(struct rmskAlign **rmskAlignGroupPtr, - struct hash* repSizes, FILE* outFh) /* convert an alignment group */ { struct rmskAlign *alignParts; while ((alignParts = popAlignParts(rmskAlignGroupPtr)) != NULL) { if (dump) rmskAlignListPrint(alignParts, stderr); if (shouldConvert(alignParts)) { - struct psl *psl = alignToPsl(alignParts, repSizes); + struct psl *psl = alignToPsl(alignParts); if (psl != NULL) { pslTabOut(psl, outFh); pslFree(&psl); } } } } -static boolean checkMissingSize(struct rmskAlign* rmskAlignGroup, - struct hash *repSizes, - struct hash *repSizeWarned) -/* If repSizes are supplied, check if we have it for this query. If not, warn - * for first occurrence. These are skipped */ -{ -if (repSizes != NULL) - { - if (hashLookup(repSizes, rmskAlignGroup->repName) == NULL) - { - if (hashLookup(repSizeWarned, rmskAlignGroup->repName) == NULL) - { - fprintf(stderr, "Warning: '%s' size not found for query, skipping\n", rmskAlignGroup->repName); - hashAddInt(repSizeWarned, rmskAlignGroup->repName, TRUE); - } - return FALSE; - } - } -return TRUE; -} - -static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile, - struct hash* repSizes) +static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile) /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */ { // load all, so we can join ones split by other insertions by id // don't bother freeing struct rmskAlign **rmskAlignGroups = NULL; unsigned maxAlignId = 0; if (bigRmsk) rmskAlignGroups = loadBigRmskAlign(rmskAlignFile, &maxAlignId); else rmskAlignGroups = loadRmskAlign(rmskAlignFile, &maxAlignId); -struct hash* repSizeWarned = hashNew(12); // don't warn multiple times on same sequence - FILE* outFh = mustOpen(rmskPslFile, "w"); for (unsigned id = 0; id <= maxAlignId; id++) { - if ((rmskAlignGroups[id] != NULL) - && checkMissingSize(rmskAlignGroups[id], repSizes, repSizeWarned)) - convertAlignGroup(&(rmskAlignGroups[id]), repSizes, outFh); + if (rmskAlignGroups[id] != NULL) + convertAlignGroup(&(rmskAlignGroups[id]), outFh); } carefulClose(&outFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); bigRmsk = optionExists("bigRmsk"); dump = optionExists("dump"); -struct hash* repSizes = NULL; -if (optionExists("repSizes")) - repSizes = loadRepSizes(optionVal("repSizes", NULL)); -rmskAlignToPsl(argv[1], argv[2], repSizes); +rmskAlignToPsl(argv[1], argv[2]); return 0; }