bd19d0f0854ea7e27017395d96715b61071b3391 markd Thu Mar 7 22:49:24 2024 -0800 restructure code to make it easier to modify and then simplified by remove fasta checksum complexity checks that were added because of a missunderstanding about how supplemental alignments were represented diff --git src/utils/bamToPsl/bamToPsl.c src/utils/bamToPsl/bamToPsl.c index d9d8880..09cc543 100644 --- src/utils/bamToPsl/bamToPsl.c +++ src/utils/bamToPsl/bamToPsl.c @@ -1,200 +1,171 @@ /* bamToPsl - Convert a bam file to a psl and optionally also a fasta file that contains the reads.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "bamFile.h" #include "psl.h" #include "fa.h" -#include "md5.h" #include "net.h" void usage() /* Explain usage and exit. */ { errAbort( "bamToPsl - Convert a bam file to a psl and optionally also a fasta file that contains the reads.\n" "usage:\n" " bamToPsl [options] in.bam out.psl\n" "options:\n" " -fasta=output.fa - output query sequences to specified file\n" " -chromAlias=file - specify a two-column file: 1: alias, 2: other name\n" " for target name translation from column 1 name to column 2 name\n" " names not found are passed through intact\n" " -nohead - do not output the PSL header, default has header output\n" - " -allowDups - for fasta output, allow duplicate query sequences output\n" - " - default is to eliminate duplicate sequences\n" - " - runs much faster without the duplicate check\n" - " -noSequenceVerify - when checking for dups, do not verify each sequence\n" - " - when the same name is identical, assume they are\n" - " - helps speed up the dup check but not thorough\n" " -dots=N - output progress dot(.) every N alignments processed\n" - " -querySizes=file - two column tab file: 'name size' to fixup the query\n" - " - sequence sizes since bam does not provide qSize\n" "\n" "Note: a chromAlias file can be obtained from a UCSC database, e.g.:\n" " hgsql -N -e 'select alias,chrom from chromAlias;' hg38 > hg38.chromAlias.tab\n" " Or from the downloads server:\n" " wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/chromAlias.txt.gz\n" "See also our tool chromToUcsc\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"fasta", OPTION_STRING}, {"chromAlias", OPTION_STRING}, {"nohead", OPTION_BOOLEAN}, {"allowDups", OPTION_BOOLEAN}, {"noSequenceVerify", OPTION_BOOLEAN}, {"dots", OPTION_INT}, {"querySizes", OPTION_STRING}, {NULL, 0}, }; static int dots = 0; static boolean nohead = FALSE; -static boolean allowDups = FALSE; -static boolean noSequenceVerify = FALSE; -static char *querySizes = NULL; // use chrom.sizes file to set query size -static struct hash *qSizeHash = NULL; static struct hash *hashChromAlias(char *fileName) /* Read two column file into hash keyed by first column */ { struct hash *hash = hashNew(0); struct lineFile *lf = netLineFileOpen(fileName); char *row[2]; while (lineFileRow(lf, row)) hashAdd(hash, row[0], cloneString(row[1])); lineFileClose(&lf); return hash; } +static void convertToPsl(bam1_t *aln, bam_header_t *head, struct hash *chromAlias, FILE *f) +/* convert one alignment to a psl */ +{ +struct psl *psl = bamToPslUnscored(aln, head); +if (psl != NULL) + { + if (chromAlias) + { + struct hashEl *hel = NULL; + if ((hel = hashLookup(chromAlias, psl->tName)) != NULL) + psl->tName = cloneString((char *)hel->val); /* memory leak */ + } + pslTabOut(psl, f); /* no free of this psl data, memory leak */ + pslFree(&psl); + } +} + +static void convertToFasta(bam1_t *aln, struct hash *fastaDoneSeqs, FILE *faF) +/* output a FASTA record of the query sequence */ +{ +char *dna = bamGetQuerySequence(aln, TRUE); +char *qName = bam1_qname(aln); +if (hashLookup(fastaDoneSeqs, qName) == NULL) // first seen + { + hashAddInt(fastaDoneSeqs, qName, TRUE); + faWriteNext(faF, qName, dna, strlen(dna)); + } +freez(&dna); +} + static void bamToPsl(char *inBam, char *outPsl, char *outFasta, char *aliasFile) /* bamToPsl - Convert a bam file to a psl and optionally also a fasta file that contains the reads.. */ { unsigned long long processCount = 0; - /* record md5sums to avoid duplicate fasta output - key is name, value is md5sum */ -struct hash *fastaSums = NULL; /* initialize later if needed */ samfile_t *in = bamMustOpenLocal(inBam, "rb", NULL); bam_header_t *head = sam_hdr_read(in); if (head == NULL) errAbort("Aborting ... bad BAM header in %s", inBam); /* Open up psl output and write header (if allowed). */ FILE *f = mustOpen(outPsl, "w"); if (! nohead) pslWriteHead(f); /* Optionally use alias file */ struct hash *chromAlias = NULL; /* initialize later if needed */ if (aliasFile != NULL) chromAlias = hashChromAlias(aliasFile); /* Optionally open up fasta output */ +struct hash *fastaDoneSeqs = NULL; // avoids duplicates FILE *faF = NULL; if (outFasta != NULL) { faF = mustOpen(outFasta, "w"); - fastaSums = newHashExt(20, TRUE); /* using stack local memory */ + fastaDoneSeqs = hashNew(20); } bam1_t one; ZeroVar(&one); // This seems to be necessary! /* Write next sequence to fa file. */ for (;;) { if (sam_read1(in, head, &one) < 0) { break; } if (one.core.n_cigar != 0) { - struct psl *psl = bamToPslUnscored(&one, head); - if (psl != NULL) - { - if (chromAlias) - { - struct hashEl *hel = NULL; - if ((hel = hashLookup(chromAlias, psl->tName)) != NULL) - psl->tName = cloneString((char *)hel->val); /* memory leak */ + convertToPsl(&one, head, chromAlias, f); } - if (qSizeHash) - { - int brokenSize = psl->qSize; - psl->qSize = hashIntVal(qSizeHash, psl->qName); - int offset = psl->qSize - brokenSize; - /* check to see if negative qStarts need to be fixed */ - if ((offset > 0) && (psl->strand[0] == '-')) + if ((faF != NULL) && ((one.core.flag & BAM_FSUPPLEMENTARY) == 0)) { - for (int i = 0; i < psl->blockCount; ++i) - psl->qStarts[i] += offset; - } - } - pslTabOut(psl, f); /* no free of this psl data, memory leak */ - pslFree(&psl); - } + // supplementary are not include as they don't have full sequence + convertToFasta(&one, fastaDoneSeqs, faF); } ++processCount; if (dots) if (0 == processCount % dots) verbose(1,"."); - if (faF != NULL) - { - char *dna = bamGetQuerySequence(&one, TRUE); - char *qName = bam1_qname(&one); - if (allowDups) - faWriteNext(faF, qName, dna, strlen(dna)); - else - { - struct hashEl *hel = NULL; - if ((hel = hashLookup(fastaSums, qName)) == NULL) // first seen - { - char *md5sum = md5HexForString(dna); - hel = hashAdd(fastaSums, qName, md5sum); - faWriteNext(faF, qName, dna, strlen(dna)); - } - else if (! noSequenceVerify) // repeated md5sum calculation - { - char *md5sum = md5HexForString(dna); - /* verify sequence is identical for same name */ - if (differentWord((char *)hel->val, md5sum)) - verbose(1, "# warning: different sequence found for '%s'\n", - qName); - } - } - freez(&dna); - } } if (dots) verbose(1,"\n"); samclose(in); carefulClose(&f); carefulClose(&faF); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); char *fastaName = optionVal("fasta", NULL); char *aliasFile = optionVal("chromAlias", NULL); -querySizes = optionVal("querySizes", NULL); - dots = optionInt("dots", dots); nohead = optionExists("nohead"); -allowDups = optionExists("allowDups"); -noSequenceVerify = optionExists("noSequenceVerify"); -if (querySizes != NULL) - qSizeHash = loadSizes(querySizes); +if (optionExists("allowDups")) + fprintf(stderr, "Note: -allowDups is obsolete and ignored"); +if (optionExists("noSequenceVerify")) + fprintf(stderr, "Note: -noSequenceVerify is obsolete and ignored"); +if (optionExists("querySizes")) + fprintf(stderr, "Note: -querySizes is obsolete and ignored"); bamToPsl(argv[1], argv[2], fastaName, aliasFile); return 0; }