50d5df2a88178ea49caa2fd1b41ce28af84f219f braney Mon May 8 12:36:08 2023 -0700 support bigMaf for sequence logos diff --git src/hg/lib/hgMaf.c src/hg/lib/hgMaf.c index 00667e7..1ac3302 100644 --- src/hg/lib/hgMaf.c +++ src/hg/lib/hgMaf.c @@ -179,43 +179,37 @@ dyStringAppendMultiC(org->dy, '.', fillSize); } } static int countAlpha(char *s) /* Count up number of alphabetic characters in string */ { int count = 0; char c; while ((c = *s++) != 0) if (isalpha(c)) ++count; return count; } -struct mafBaseProbs *hgMafProbs( - char *database, /* Database, must already have hSetDb to this */ - char *track, /* Name of MAF track */ - char *chrom, /* Chromosome (in database genome) */ - int start, int end, /* start/end in chromosome */ - char strand /* strand */ +struct mafBaseProbs *hgMafProbsHelper( + int size, + struct mafAli *maf ) /* calculate the probability of each nucleotide in each column of a maf. */ { -struct mafAli *maf = hgMafFrag(database, track, chrom, start, end, '+', NULL, NULL); - struct mafBaseProbs *probs; -int size = end - start; AllocArray(probs, size); int count = 0; unsigned letterBox[256]; int ii; for(ii=0; ii < maf->textSize; ii++) { struct mafComp *comp = maf->components; char ch = comp->text[ii]; if (ch == '-') continue; memset(letterBox, 0, sizeof letterBox); @@ -229,51 +223,79 @@ double total = 0; total += letterBox['a']; total += letterBox['c']; total += letterBox['g']; total += letterBox['t']; probs[count].aProb = letterBox['a'] / total; probs[count].cProb = letterBox['c'] / total; probs[count].gProb = letterBox['g'] / total; probs[count].tProb = letterBox['t'] / total; count++; } return probs; } -struct mafAli *hgMafFrag( +struct mafBaseProbs *hgBigMafProbs( + char *database, /* Database, must already have hSetDb to this */ + struct bbiFile *bbi, + char *chrom, /* Chromosome (in database genome) */ + int start, int end, /* start/end in chromosome */ + char strand /* strand */ + ) +/* calculate the probability of each nucleotide in each column of a bigMaf. */ +{ +struct mafAli *maf = hgBigMafFrag(database, bbi, chrom, start, end, '+', NULL, NULL); + +return hgMafProbsHelper(end - start, maf); +} + +struct mafBaseProbs *hgMafProbs( char *database, /* Database, must already have hSetDb to this */ char *track, /* Name of MAF track */ char *chrom, /* Chromosome (in database genome) */ int start, int end, /* start/end in chromosome */ + char strand /* strand */ + ) +/* calculate the probability of each nucleotide in each column of a maf. */ +{ +struct mafAli *maf = hgMafFrag(database, track, chrom, start, end, '+', NULL, NULL); + +return hgMafProbsHelper(end - start, maf); +} + +static struct mafAli *hgMafFragHelper( + char *database, /* Database, must already have hSetDb to this */ + //char *track, /* Name of MAF track */ + char *chrom, /* Chromosome (in database genome) */ + int start, int end, /* start/end in chromosome */ char strand, /* Chromosome strand. */ + struct mafAli *mafList, char *outName, /* Optional name to use in first component */ struct slName *orderList /* Optional order of organisms. */ ) -/* mafFrag- Extract maf sequences for a region from database. +/* hgMafFragHelper- Extract maf sequences for a region from database. * This creates a somewhat unusual MAF that extends from start * to end whether or not there are actually alignments. Where * there are no alignments (or alignments missing a species) * a . character fills in. The score is always zero, and * the sources just indicate the species. You can mafFree this * as normal. */ { int chromSize = hChromSize(database, chrom); -struct sqlConnection *conn = hAllocConn(database); struct dnaSeq *native = hChromSeq(database, chrom, start, end); -struct mafAli *maf, *mafList = mafLoadInRegion(conn, track, chrom, start, end); +struct mafAli *maf; char masterSrc[128]; struct hash *orgHash = newHash(10); struct oneOrg *orgList = NULL, *org, *nativeOrg = NULL; int curPos = start, symCount = 0; struct slName *name; int order = 0; /* Check that the mafs are really copacetic, the particular * subtype we think is in the database that this (relatively) * simple code can handle. */ safef(masterSrc, sizeof(masterSrc), "%s.%s", database, chrom); mafCheckFirstComponentSrc(mafList, masterSrc); mafCheckFirstComponentStrand(mafList, '+'); slSort(&mafList, mafCmp); @@ -449,34 +471,68 @@ int size = countAlpha(org->dy->string); mc->src = cloneString(org->name); mc->srcSize = size; mc->strand = '+'; mc->start = 0; mc->size = size; } mc->text = cloneString(org->dy->string); dyStringFree(&org->dy); slAddHead(&maf->components, mc); } slReverse(&maf->components); slFreeList(&orgList); freeHash(&orgHash); -hFreeConn(&conn); return maf; } +struct mafAli *hgBigMafFrag( + char *database, /* Database, must already have hSetDb to this */ + struct bbiFile *bbi, + char *chrom, /* Chromosome (in database genome) */ + int start, int end, /* start/end in chromosome */ + char strand, /* Chromosome strand. */ + char *outName, /* Optional name to use in first component */ + struct slName *orderList /* Optional order of organisms. */ + ) +/* hgBigMafFrag - Extract maf sequences for a region from a bigMaf and call hgMafFragHelper. */ +{ +struct mafAli *mafList = bigMafLoadInRegion( bbi, chrom, start, end); +return hgMafFragHelper(database, chrom, start, end, strand, mafList, outName, orderList); +} + +struct mafAli *hgMafFrag( + char *database, /* Database, must already have hSetDb to this */ + char *track, /* Name of MAF track */ + char *chrom, /* Chromosome (in database genome) */ + int start, int end, /* start/end in chromosome */ + char strand, /* Chromosome strand. */ + char *outName, /* Optional name to use in first component */ + struct slName *orderList /* Optional order of organisms. */ + ) +/* hgMafFrag- Extract maf sequences for a region from database and call hgMafFragHelper. */ +{ +struct sqlConnection *conn = hAllocConn(database); +struct mafAli *mafList = mafLoadInRegion(conn, track, chrom, start, end); +struct mafAli *ret = hgMafFragHelper(database, chrom, start, end, strand, mafList, outName, orderList); + +hFreeConn(&conn); + +return ret; +} + struct consWiggle *wigMafWiggles(char *db, struct trackDb *tdb) /* get conservation wiggle table names and labels from trackDb setting, ignoring those where table doesn't exist */ { char *fields[20]; int fieldCt; int i; char *wigTable, *wigLabel; struct consWiggle *wig, *wigList = NULL; char *setting = trackDbSetting(tdb, CONS_WIGGLE); if (!setting) return NULL; fieldCt = chopLine(cloneString(setting), fields); for (i = 0; i < fieldCt; i += 3) {