d99300bbd4e6250c0a72144745b3383a973adb14 markd Sun Mar 13 12:32:04 2022 -0700 drop queries not in repSizes file rather than generate an error diff --git src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c index fecc67b..8dcc404 100644 --- src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c +++ src/hg/utils/rmskAlignToPsl/rmskAlignToPsl.c @@ -7,31 +7,32 @@ #include "hash.h" #include "options.h" void usage() /* Explain usage and exit. */ { errAbort( "rmskAlignToPsl - convert repeatmasker alignments to PSLs\n" "\n" "usage:\n" " rmskAlignToPsl rmskAlignTab rmskPslFile\n" "\n" " -bigRmsk - input is the text version of bigRmskAlignBed files.\n" " -repSizes=tab - two column tab file with repeat name and size.\n" " Sometimes the repeat sizes are incorrect in the align file.\n" - " Obtain the sizes from this file instead.\n" + " Obtain the sizes from this file instead. Alignments not found\n" + " in file are dropped.\n" " -dump - print alignments to stdout for debugging purposes\n" "\n" "This convert *.fa.align.tsv file, created by\n" "RepeatMasker/util/rmToUCSCTables.pl into a PSL file.\n" "Non-TE Repeats without consensus sequence are not included.\n" ); } /* Command line validation table.
*/ static struct optionSpec options[] = { {"bigRmsk", OPTION_BOOLEAN}, {"repSizes", OPTION_STRING}, {"dump", OPTION_BOOLEAN}, {NULL, 0}, }; @@ -584,34 +585,37 @@ unsigned tSize = getGenomeSize(alignParts); unsigned tStart = blkCoords->tStart; unsigned tEnd = blkCountN->tStart + blkCountN->tSize; int blockSpace = slCount(blkCoords); struct psl *psl = pslNew(alignParts->repName, qSize, qStart, qEnd, alignParts->genoName, tSize, tStart, tEnd, alignParts->strand, blockSpace, 0); addPslBlocks(blkCoords, psl, &blockSpace); pslComputeInsertCounts(psl); slFreeList(&blkCoords); return psl; } + + static struct psl *convertToPsl(struct rmskAlign *alignParts, struct hash* repSizes) /* create a PSL from a repeat masker alignment, return NULL if fails */ { + struct blkCoord *blkCoords = parseCAligns(alignParts, repSizes); if (blkCoords == NULL) return NULL; if (dump) blkCoordListPrint(blkCoords, stderr); return convertBlocksToPsl(alignParts, blkCoords, repSizes); } static struct psl *alignToPsl(struct rmskAlign *alignParts, struct hash* repSizes) /* convert and output one set of alignment parts */ { struct psl* psl = convertToPsl(alignParts, repSizes); if ((psl != NULL) && (pslCheck("rmskAlign", stderr, psl) != 0)) @@ -636,46 +640,73 @@ rmskAlignListPrint(alignParts, stderr); if (shouldConvert(alignParts)) { struct psl *psl = alignToPsl(alignParts, repSizes); if (psl != NULL) { pslTabOut(psl, outFh); pslFree(&psl); } } } } +static boolean checkMissingSize(struct rmskAlign* rmskAlignGroup, + struct hash *repSizes, + struct hash *repSizeWarned) +/* If repSizes are supplied, check if we have it for this query. If not, warn + * for first occurrence. 
These are skipped */ +{ +if (repSizes != NULL) + { + if (hashLookup(repSizes, rmskAlignGroup->repName) == NULL) + { + if (hashLookup(repSizeWarned, rmskAlignGroup->repName) == NULL) + { + fprintf(stderr, "Warning: '%s' size not found for query, skipping\n", rmskAlignGroup->repName); + hashAddInt(repSizeWarned, rmskAlignGroup->repName, TRUE); + } + return FALSE; + } + } +return TRUE; +} + static void rmskAlignToPsl(char *rmskAlignFile, char *rmskPslFile, struct hash* repSizes) /* rmskAlignToPsl - convert repeatmasker alignment files to PSLs. */ { // load all, so we can join ones split by other insertions by id // don't bother freeing struct rmskAlign **rmskAlignGroups = NULL; unsigned maxAlignId = 0; if (bigRmsk) rmskAlignGroups = loadBigRmskAlign(rmskAlignFile, &maxAlignId); else rmskAlignGroups = loadRmskAlign(rmskAlignFile, &maxAlignId); +struct hash* repSizeWarned = hashNew(12); // don't warn multiple times on same sequence + FILE* outFh = mustOpen(rmskPslFile, "w"); for (unsigned id = 0; id <= maxAlignId; id++) + { + if ((rmskAlignGroups[id] != NULL) + && checkMissingSize(rmskAlignGroups[id], repSizes, repSizeWarned)) convertAlignGroup(&(rmskAlignGroups[id]), repSizes, outFh); + } carefulClose(&outFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); bigRmsk = optionExists("bigRmsk"); dump = optionExists("dump"); struct hash* repSizes = NULL; if (optionExists("repSizes")) repSizes = loadRepSizes(optionVal("repSizes", NULL)); rmskAlignToPsl(argv[1], argv[2], repSizes);