b766d400687cd499a41a895f249557e49a381d72 hiram Wed Feb 28 18:13:55 2024 -0800 allow bamToPsl to fixup qSize when given a querySizes file no redmine diff --git src/utils/bamToPsl/bamToPsl.c src/utils/bamToPsl/bamToPsl.c index 31143bc..d9d8880 100644 --- src/utils/bamToPsl/bamToPsl.c +++ src/utils/bamToPsl/bamToPsl.c @@ -17,54 +17,59 @@ "usage:\n" " bamToPsl [options] in.bam out.psl\n" "options:\n" " -fasta=output.fa - output query sequences to specified file\n" " -chromAlias=file - specify a two-column file: 1: alias, 2: other name\n" " for target name translation from column 1 name to column 2 name\n" " names not found are passed through intact\n" " -nohead - do not output the PSL header, default has header output\n" " -allowDups - for fasta output, allow duplicate query sequences output\n" " - default is to eliminate duplicate sequences\n" " - runs much faster without the duplicate check\n" " -noSequenceVerify - when checking for dups, do not verify each sequence\n" " - when the same name is identical, assume they are\n" " - helps speed up the dup check but not thorough\n" " -dots=N - output progress dot(.) every N alignments processed\n" + " -querySizes=file - two column tab file: 'name size' to fixup the query\n" + " - sequence sizes since bam does not provide qSize\n" "\n" "Note: a chromAlias file can be obtained from a UCSC database, e.g.:\n" " hgsql -N -e 'select alias,chrom from chromAlias;' hg38 > hg38.chromAlias.tab\n" " Or from the downloads server:\n" " wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/chromAlias.txt.gz\n" "See also our tool chromToUcsc\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"fasta", OPTION_STRING}, {"chromAlias", OPTION_STRING}, {"nohead", OPTION_BOOLEAN}, {"allowDups", OPTION_BOOLEAN}, {"noSequenceVerify", OPTION_BOOLEAN}, {"dots", OPTION_INT}, + {"querySizes", OPTION_STRING}, {NULL, 0}, }; static int dots = 0; static boolean nohead = FALSE; static boolean allowDups = FALSE; static boolean noSequenceVerify = FALSE; +static char *querySizes = NULL; // use chrom.sizes file to set query size +static struct hash *qSizeHash = NULL; static struct hash *hashChromAlias(char *fileName) /* Read two column file into hash keyed by first column */ { struct hash *hash = hashNew(0); struct lineFile *lf = netLineFileOpen(fileName); char *row[2]; while (lineFileRow(lf, row)) hashAdd(hash, row[0], cloneString(row[1])); lineFileClose(&lf); return hash; } static void bamToPsl(char *inBam, char *outPsl, char *outFasta, char *aliasFile) @@ -106,30 +111,42 @@ if (sam_read1(in, head, &one) < 0) { break; } if (one.core.n_cigar != 0) { struct psl *psl = bamToPslUnscored(&one, head); if (psl != NULL) { if (chromAlias) { struct hashEl *hel = NULL; if ((hel = hashLookup(chromAlias, psl->tName)) != NULL) psl->tName = cloneString((char *)hel->val); /* memory leak */ } + if (qSizeHash) + { + int brokenSize = psl->qSize; + psl->qSize = hashIntVal(qSizeHash, psl->qName); + int offset = psl->qSize - brokenSize; + /* check to see if negative qStarts need to be fixed */ + if ((offset > 0) && (psl->strand[0] == '-')) + { + for (int i = 0; i < psl->blockCount; ++i) + psl->qStarts[i] += offset; + } + } pslTabOut(psl, f); /* no free of this psl data, memory leak */ pslFree(&psl); } } ++processCount; if (dots) if (0 == processCount % dots) verbose(1,"."); if (faF != NULL) { char *dna = bamGetQuerySequence(&one, TRUE); char *qName = bam1_qname(&one); if (allowDups) faWriteNext(faF, qName, dna, strlen(dna)); else @@ -157,22 +174,27 @@ verbose(1,"\n"); samclose(in); carefulClose(&f); carefulClose(&faF); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); char *fastaName = optionVal("fasta", NULL); char *aliasFile = optionVal("chromAlias", NULL); + +querySizes = optionVal("querySizes", NULL); + dots = optionInt("dots", dots); nohead = optionExists("nohead"); allowDups = optionExists("allowDups"); noSequenceVerify = optionExists("noSequenceVerify"); +if (querySizes != NULL) + qSizeHash = loadSizes(querySizes); bamToPsl(argv[1], argv[2], fastaName, aliasFile); return 0; }