533112afe2a2005e80cdb1f82904ea65032d4302 braney Sat Oct 2 11:37:34 2021 -0700 split hg/lib into two separate libaries, one only used by the cgis diff --git src/hg/cgilib/hgBam.c src/hg/cgilib/hgBam.c new file mode 100644 index 0000000..5e0ac5d --- /dev/null +++ src/hg/cgilib/hgBam.c @@ -0,0 +1,247 @@ +/* bamFile -- interface to binary alignment format files using Heng Li's samtools lib. */ + +/* Copyright (C) 2014 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#include "common.h" +#include "hdb.h" +#include "hgBam.h" + +#include "htmshell.h" +#include "samAlignment.h" + +char *bamFileNameFromTable(struct sqlConnection *conn, char *table, char *bamSeqName) +/* Return file name from table. If table has a seqName column, then grab the + * row associated with bamSeqName (which can be e.g. '1' not 'chr1' if that is the + * case in the bam file). */ +{ +boolean checkSeqName = (sqlFieldIndex(conn, table, "seqName") >= 0); +if (checkSeqName && bamSeqName == NULL) + errAbort("bamFileNameFromTable: table %s has seqName column, but NULL seqName passed in", + table); +char query[512]; +if (checkSeqName) + sqlSafef(query, sizeof(query), "select fileName from %s where seqName = '%s'", + table, bamSeqName); +else + sqlSafef(query, sizeof(query), "select fileName from %s", table); +char *fileName = sqlQuickString(conn, query); +if (fileName == NULL && checkSeqName) + { + if (startsWith("chr", bamSeqName)) + sqlSafef(query, sizeof(query), "select fileName from %s where seqName = '%s'", + table, bamSeqName+strlen("chr")); + else + sqlSafef(query, sizeof(query), "select fileName from %s where seqName = 'chr%s'", + table, bamSeqName); + fileName = sqlQuickString(conn, query); + } +if (fileName == NULL) + { + if (checkSeqName) + errAbort("Missing fileName for seqName '%s' in %s table", bamSeqName, table); + else + errAbort("Missing fileName in %s table", table); + } +return fileName; +} + +struct ffAli *bamToFfAli(const bam1_t *bam, struct dnaSeq *target, int targetOffset, + boolean useStrand, char **retQSeq) +/* Convert from bam to ffAli format. If retQSeq is non-null, set it to the + * query sequence into which ffAli needle pointers point. (Adapted from psl.c's pslToFfAli.) */ +{ +struct ffAli *ffList = NULL, *ff; +const bam1_core_t *core = &bam->core; +boolean isRc = useStrand && bamIsRc(bam); +DNA *needle = (DNA *)bamGetQuerySequence(bam, useStrand); +if (retQSeq) + *retQSeq = needle; +if (isRc) + reverseComplement(target->dna, target->size); +DNA *haystack = target->dna; +unsigned int *cigarPacked = bam1_cigar(bam); +int tStart = targetOffset, qStart = 0, i; +// If isRc, need to go through the CIGAR ops backwards, but sequence offsets still count up. +int iStart = isRc ? (core->n_cigar - 1) : 0; +int iIncr = isRc ? -1 : 1; +for (i = iStart; isRc ? (i >= 0) : (i < core->n_cigar); i += iIncr) + { + char op; + int size = bamUnpackCigarElement(cigarPacked[i], &op); + switch (op) + { + case 'M': // match or mismatch (gapless aligned block) + case '=': // match + case 'X': // mismatch + AllocVar(ff); + ff->left = ffList; + ffList = ff; + ff->nStart = needle + qStart; + ff->nEnd = ff->nStart + size; + ff->hStart = haystack + tStart - targetOffset; + ff->hEnd = ff->hStart + size; + tStart += size; + qStart += size; + break; + case 'I': // inserted in query + case 'S': // skipped query bases at beginning or end ("soft clipping") + qStart += size; + break; + case 'D': // deleted from query + case 'N': // long deletion from query (intron as opposed to small del) + tStart += size; + break; + case 'H': // skipped query bases not stored in record's query sequence ("hard clipping") + case 'P': // P="silent deletion from padded reference sequence" -- ignore these. + break; + default: + errAbort("bamToFfAli: unrecognized CIGAR op %c -- update me", op); + } + } +ffList = ffMakeRightLinks(ffList); +ffCountGoodEnds(ffList); +return ffList; +} + + +struct bamToSamHelper +/* Helper structure to help get sam alignments out of a BAM file */ + { + struct lm *lm; /* Local memory pool to allocate into */ + char *chrom; /* Chromosome name */ + struct dyString *dy; /* Dynamic temporary string */ + samfile_t *samFile; /* Open sam/bam file header. */ + struct samAlignment *samList; /* List of alignments. */ + }; + +static void addToChars(char *s, int size, char amount) +/* Add amount to each char in s of given size. */ +{ +int i; +for (i=0; ilm; +struct samAlignment *sam; +lmAllocVar(lm, sam); +const bam1_core_t *core = &bam->core; +struct dyString *dy = helper->dy; +sam->qName = lmCloneString(lm, bam1_qname(bam)); +sam->flag = core->flag; +if (helper->chrom != NULL) + sam->rName = helper->chrom; +else + sam->rName = lmCloneString(lm, header->target_name[core->tid]); +sam->pos = core->pos + 1; +sam->mapQ = core->qual; +dyStringClear(dy); +bamUnpackCigar(bam, dy); +sam->cigar = lmCloneStringZ(lm, dy->string, dy->stringSize); +if (core->mtid >= 0) + { + if (core->tid == core->mtid) + sam->rNext = "="; + else + sam->rNext = lmCloneString(lm, header->target_name[core->mtid]); + } +else + sam->rNext = "*"; +sam->pNext = core->mpos + 1; +sam->tLen = core->isize; +sam->seq = lmAlloc(lm, core->l_qseq + 1); +bamUnpackQuerySequence(bam, FALSE, sam->seq); +char *bamQual = (char *)bam1_qual(bam); +if (isAllSameChar(bamQual, core->l_qseq, -1)) + sam->qual = "*"; +else + { + sam->qual = lmCloneStringZ(lm, bamQual, core->l_qseq); + addToChars(sam->qual, core->l_qseq, 33); + } +dyStringClear(dy); +bamUnpackAux(bam, dy); +sam->tagTypeVals = lmCloneStringZ(lm, dy->string, dy->stringSize); +slAddHead(&helper->samList, sam); +return 0; +} + +struct samAlignment *bamFetchSamAlignmentPlus(char *fileOrUrl, char *chrom, int start, int end, + struct lm *lm, char *refUrl, char *cacheDir) +/* Fetch region as a list of samAlignments - which is more or less an unpacked + * bam record. Results is allocated out of lm, since it tends to be large... */ +{ +return bamAndIndexFetchSamAlignmentPlus(fileOrUrl, NULL, chrom, start, end, lm, refUrl, cacheDir); +} + +struct samAlignment *bamAndIndexFetchSamAlignmentPlus(char *fileOrUrl, char *baiUrl, char *chrom, int start, int end, + struct lm *lm, char *refUrl, char *cacheDir) +/* Like bamFetchSamAlignmentPlus but can specify bai index file url in addition to the bam file */ +{ +struct bamToSamHelper helper; +helper.lm = lm; +helper.chrom = chrom; +helper.dy = dyStringNew(0); +helper.samList = NULL; +char posForBam[256]; +safef(posForBam, sizeof(posForBam), "%s:%d-%d", chrom, start+1, end); +bamAndIndexFetchPlus(fileOrUrl, baiUrl, posForBam, bamAddOneSamAlignment, &helper, &helper.samFile, + refUrl, cacheDir); +dyStringFree(&helper.dy); +slReverse(&helper.samList); +return helper.samList; +} + +struct samAlignment *bamFetchSamAlignment(char *fileOrUrl, char *chrom, int start, int end, + struct lm *lm) +/* Fetch region as a list of samAlignments - which is more or less an unpacked + * bam record. Results is allocated out of lm, since it tends to be large... */ +{ +return bamFetchSamAlignmentPlus(fileOrUrl, chrom, start, end, lm, NULL, NULL); +} + +struct samAlignment *bamReadNextSamAlignments(samfile_t *fh, bam_hdr_t *header, int count, struct lm *lm) +/* Read next count alignments in SAM format, allocated in lm. May return less than + * count at end of file. */ +{ +/* Set up helper. */ +struct bamToSamHelper helper; +helper.lm = lm; +helper.chrom = NULL; +helper.dy = dyStringNew(0); +helper.samFile = fh; +helper.samList = NULL; + +/* Loop through calling our own fetch function */ +int i; +bam1_t *b = bam_init1(); +for (i=0; i