47b962229a3ebcd05377a52db7d389d48a5f38fa markd Fri Mar 25 09:08:31 2022 -0700 added option to specify expected repeat sizes and discard PSLs that don't match diff --git src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c index 1cf25e6..1ad1c7d 100644 --- src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c +++ src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c @@ -5,41 +5,47 @@ #include "bigRmskAlignBed.h" #include "psl.h" #include "hash.h" #include "options.h" void usage() /* Explain usage and exit. */ { errAbort( "rmskAlignToPsl - convert repeatmasker alignments to PSLs\n" "\n" "usage:\n" " rmskAlignToPsl rmskAlignTab rmskPslFile\n" "\n" " -bigRmsk - input is the text version of bigRmskAlignBed files.\n" + " -repSizes=tab - two column tab file with repeat name and size.\n" + " Sometimes the repeat sizes are incorrect in the align file.\n" + " If a repeat alignment doesn't match the size here or is not\n" + " in the file it will be discarded.\n" + " \n" " -dump - print alignments to stdout for debugging purposes\n" "\n" "This convert *.fa.align.tsv file, created by\n" "RepeatMasker/util/rmToUCSCTables.pl into a PSL file.\n" "Non-TE Repeats without consensus sequence are not included.\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"bigRmsk", OPTION_BOOLEAN}, + {"repSizes", OPTION_STRING}, {"dump", OPTION_BOOLEAN}, {NULL, 0}, }; static boolean bigRmsk = FALSE; static boolean dump = FALSE; /* * rmskAlign notes * * - genoStart - genoEnd are zero-base, half-open * - repStart, repEnd - are one-based * - rmskAlign records are multi-part, grouped by id. * - A zero-repeat length record in the middle of group * will have repStart == repEnd, appearing to have a length of 1. @@ -52,30 +58,42 @@ * The format is basically a lightly compressed diff format where * the query and subject are merged into one squence line. The * runs of exact matching sequences are interrupted by either * single base substitutions annotated as queryBase "/" subjectBase, * insertions in the subject annotated as "+ACTAT+", or deletions * in the query annotated as "-ACTTTG-". * * Alignment parts must be decoded separately and then concatenated. * as blocks starting with a '-' or '+' maybe terminated by the end of the string * * Alignments parts may overlap and must then be split. A given id might * have multiple repNames and are split into multiple PSL. */ +static struct hash* loadRepSizes(char *repSizesFile) +/* load repeat sizes file into a hash */ +{ +struct hash* repSizes = hashNew(20); +struct lineFile* lf = lineFileOpen(repSizesFile, TRUE); +char *row[2]; +while (lineFileNextRowTab(lf, row, ArraySize(row))) + hashAddInt(repSizes, row[0], lineFileNeedNum(lf, row, 1)); +lineFileClose(&lf); +return repSizes; +} + static unsigned getGenomeSize(struct rmskAlign *alignParts) /* get the length or the chromosome */ { return alignParts->genoEnd + alignParts->genoLeft; } static unsigned getRepeatSize(struct rmskAlign *alignParts) /* Get the length of repeat sequence. This is not all that easy. When a * repeat is split, CrossMatch reports the same repLeft for other alignments, * at least in some cases. RepeatMasker compensates for this when creating * the .out. However, rmToUCSCTables.pl doesn't correct for this. So we just * take the largest length from all alignments. * * Example .align problem: * 103 20.51 0.00 9.86 chr1 147061 147161 (248809261) C AluJr#SINE/Alu (146) 166 70 [m_b3s356i8] 1 @@ -609,47 +627,86 @@ rmskAlignListPrint(alignParts, stderr); if (shouldConvert(alignParts)) { struct psl *psl = alignToPsl(alignParts); if (psl != NULL) { pslTabOut(psl, outFh); pslFree(&psl); } } } } -static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile) +static boolean checkSizeWithExpected(struct rmskAlign* rmskAlignGroup, + struct hash *repSizes, + struct hash *repSizeWarned) +/* Check if we have an expected repSize for this and they match query. If not, warn + * for first occurrence. These are skipped */ +{ +if (hashLookup(repSizes, rmskAlignGroup->repName) == NULL) + { + if (hashLookup(repSizeWarned, rmskAlignGroup->repName) == NULL) + { + fprintf(stderr, "Warning: '%s' expected not found, skipping\n", rmskAlignGroup->repName); + hashAddInt(repSizeWarned, rmskAlignGroup->repName, TRUE); + } + return FALSE; + } +int gotRepSize = getRepeatSize(rmskAlignGroup); +int expectRepSize = hashIntVal(repSizes, rmskAlignGroup->repName); +if (gotRepSize != expectRepSize) + { + if (hashLookup(repSizeWarned, rmskAlignGroup->repName) == NULL) + { + fprintf(stderr, "Warning: '%s' size %d does not match expected size %d, skipping\n", + rmskAlignGroup->repName, gotRepSize, expectRepSize); + hashAddInt(repSizeWarned, rmskAlignGroup->repName, TRUE); + } + return FALSE; + } +return TRUE; +} + +static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile, + struct hash* repSizes) /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */ { // load all, so we can join ones split by other insertions by id // don't bother freeing struct rmskAlign **rmskAlignGroups = NULL; unsigned maxAlignId = 0; if (bigRmsk) rmskAlignGroups = loadBigRmskAlign(rmskAlignFile, &maxAlignId); else rmskAlignGroups = loadRmskAlign(rmskAlignFile, &maxAlignId); +struct hash* repSizeWarned = hashNew(12); // don't warn multiple times on same sequence + FILE* outFh = mustOpen(rmskPslFile, "w"); for (unsigned id = 0; id <= maxAlignId; id++) { if (rmskAlignGroups[id] != NULL) + { + if ((repSizes == NULL) || checkSizeWithExpected(rmskAlignGroups[id], repSizes, repSizeWarned)) convertAlignGroup(&(rmskAlignGroups[id]), outFh); } + } carefulClose(&outFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); bigRmsk = optionExists("bigRmsk"); dump = optionExists("dump"); -rmskAlignToPsl(argv[1], argv[2]); +struct hash* repSizes = NULL; +if (optionExists("repSizes")) + repSizes = loadRepSizes(optionVal("repSizes", NULL)); +rmskAlignToPsl(argv[1], argv[2], repSizes); return 0; }