a44421a79fb36cc2036fe116b97ea3bc9590cd0c braney Fri Dec 2 09:34:39 2011 -0800 removed rcsid (#295) diff --git src/hg/lib/ggCluster.c src/hg/lib/ggCluster.c index fc1a6ce..4b1580d 100644 --- src/hg/lib/ggCluster.c +++ src/hg/lib/ggCluster.c @@ -1,504 +1,503 @@ /***************************************************************************** * Copyright (C) 2000 Jim Kent. This source code may be freely used * * for personal, academic, and non-profit purposes. Commercial use * * permitted only by explicit agreement with Jim Kent (jim_kent@pacbell.net) * *****************************************************************************/ /* ggCluster - This takes as input a list of mRNAs and produces * a list of mRNA clusters. mRNAs are lumped into a cluster * if exons on the same strand overlap. */ #include "common.h" #include "dnautil.h" #include "dnaseq.h" #include "geneGraph.h" #include "ggPrivate.h" -static char const rcsid[] = "$Id: ggCluster.c,v 1.13 2007/02/12 01:58:13 kent Exp $"; static void updateActiveClusters(struct ggMrnaCluster **pActiveClusters, struct ggMrnaCluster **pFinishedClusters, char *strand, int tStart) /* Move clusters that could no longer match off active list onto finished list. */ { struct ggMrnaCluster *mc, *ggNext, *newActive = NULL; for (mc = *pActiveClusters; mc != NULL; mc = ggNext) { ggNext = mc->next; if (differentString(mc->strand, strand) || mc->tEnd < tStart) { slAddHead(pFinishedClusters, mc); } else { slAddHead(&newActive, mc); } } slReverse(&newActive); *pActiveClusters = newActive; } static boolean isGoodIntron(struct dnaSeq *targetSeq, struct ggMrnaBlock *a, struct ggMrnaBlock *b, boolean isRev) /* Return true if there's a good intron between a and b. If true will update strand.*/ { if (a->qEnd == b->qStart && b->tStart - a->tEnd >= 30) { if (targetSeq) { DNA *iStart = targetSeq->dna + a->tEnd; DNA *iEnd = targetSeq->dna + b->tStart; if (isRev) { if (iStart[0] == 'c' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'c') return TRUE; if (iStart[0] == 'c' && iStart[1] == 't' && iEnd[-2] == 'g' && iEnd[-1] == 'c') return TRUE; } else { if (iStart[0] == 'g' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'g') return TRUE; if (iStart[0] == 'g' && iStart[1] == 'c' && iEnd[-2] == 'a' && iEnd[-1] == 'g') return TRUE; } } else return TRUE; /* Done all the checking we can */ } return FALSE; } static struct ggMrnaCluster *finishClusterOfOne( struct ggMrnaAli *ma, struct dnaSeq *genoSeq, struct ggVertex *vertices, int vCount) /* Finish making up a ggMrnaCluster from a single alignment * once have already calculated the vertices that go into it. */ { struct ggMrnaCluster *mc; struct maRef *ref; struct ggAliInfo *da; AllocVar(mc); AllocVar(ref); ref->ma = ma; mc->refList = ref; AllocVar(da); AllocArray(da->vertices, vCount); CopyArray(vertices, da->vertices, vCount); da->vertexCount = vCount; da->ma = ma; mc->mrnaList = da; mc->tName = cloneString(ma->tName); snprintf(mc->strand, sizeof(mc->strand), "%s", ma->strand); mc->tStart = ma->tStart; mc->tEnd = ma->tEnd; mc->genoSeq = genoSeq; return mc; } struct ggMrnaCluster *ggMrnaClusterOfOne(struct ggMrnaAli *ma, struct dnaSeq *genoSeq) /* Make up a ggMrnaCluster with just one thing on it. Looks at each intron to see if it is good before determining which type a vertex should be (hard or soft) */ { static struct ggVertex *vertices = NULL; /* Resized array. */ static int vAlloc = 0; int vCount = 0; struct ggMrnaBlock *blocks = ma->blocks; int blockCount = ma->blockCount; int isRev = ma->orientation < 0; int i; struct ggVertex *v; for(i=0; i<blockCount; i++) { struct ggMrnaBlock *block = blocks+i; if (vCount+2 > vAlloc) { if (vAlloc == 0) vAlloc = 128; else vAlloc <<= 1; ExpandArray(vertices, vCount, vAlloc); } /* Enter first vertex, checking to see if it has consensus splice sites. */ v = &vertices[vCount++]; v->position = block->tStart; if (i == 0 || (!isGoodIntron(genoSeq, block-1, block, isRev))) v->type = ggSoftStart; else v->type = ggHardStart; /* Enter second vertex, checking to see if it has consensus splice sites. */ v = &vertices[vCount++]; v->position = block->tEnd; if(i == blockCount-1 || (!isGoodIntron(genoSeq, block, block+1, isRev))) v->type = ggSoftEnd; else v->type = ggHardEnd; } /* Allocate and fill in ggMrnaCluster. */ return finishClusterOfOne(ma, genoSeq, vertices, vCount); } struct ggMrnaCluster *ggMrnaSoftClusterOfOne(struct ggMrnaAli *ma, struct dnaSeq *genoSeq) /* Make up a ggMrnaCluster with just one thing on it. * Allow internal soft edges. */ { static struct ggVertex *vertices = NULL; /* Resized array. */ static int vAlloc = 0; int vCount = 0; struct ggMrnaBlock *blocks = ma->blocks; int i, blockCount = ma->blockCount; struct ggVertex *v; int isRev = ma->orientation < 0; /* Do initial vertex allocation. */ if (vAlloc == 0) { vAlloc = 128; AllocArray(vertices, vAlloc); } /* Put start of first block in - always soft. */ v = &vertices[vCount++]; v->position = blocks->tStart; v->type = ggSoftStart; /* Loop around putting in block end, next block start. */ for (i=1; i<blockCount; ++i) { /* Expand buffer if need be to handle these two new * points, and the one at the end too. */ if (vCount+3 > vAlloc) { vAlloc <<= 1; ExpandArray(vertices, vCount, vAlloc); } v = &vertices[vCount]; v->position = blocks->tEnd; if (isGoodIntron(genoSeq, blocks, blocks+1, isRev)) { v->type = ggHardEnd; v += 1; v->type = ggHardStart; } else { v->type = ggSoftEnd; v += 1; v->type = ggSoftStart; } blocks += 1; v->position = blocks->tStart; vCount += 2; } /* Put in end of last block - always soft. */ v = &vertices[vCount++]; v->position = blocks->tEnd; v->type = ggSoftEnd; /* Allocate and fill in ggMrnaCluster. */ return finishClusterOfOne(ma, genoSeq, vertices, vCount); } struct ggMrnaCluster *ggMrnaSoftFilteredClusterOfOne(struct ggMrnaAli *ma, struct dnaSeq *genoSeq, int minExonSize, int minNonSpliceExon) /* Make up a ggMrnaCluster with just one thing on it. * All edges here will be soft (not intended for alt-splicing use) */ { static struct ggVertex *vertices = NULL; /* Resized array. */ static int vAlloc = 0; int vCount = 0; int firstExon = -1, lastExon = -1; struct ggMrnaBlock *blocks = ma->blocks, *b; int i, size, blockCount = ma->blockCount; struct ggVertex *v; int isRev = ma->orientation < 0; struct ggMrnaCluster *mc; /* Do initial vertex allocation. */ if (vAlloc == 0) { vAlloc = 128; AllocArray(vertices, vAlloc); } /* Find first exon which is either large enough or * precedes a good intron. */ for (i=0; i<blockCount; ++i) { size = blocks[i].tEnd - blocks[i].tStart; if (size >= minNonSpliceExon) { firstExon = i; break; } if (i < blockCount-1 && size >= minExonSize && isGoodIntron(genoSeq, blocks+i, blocks+i+1, isRev)) { firstExon = i; break; } } if (firstExon < 0) return NULL; /* Find last exon which is either large enough or * follows a good intron. */ for (i=blockCount-1; i>= 0; --i) { size = blocks[i].tEnd - blocks[i].tStart; if (size >= minNonSpliceExon) { lastExon = i; break; } if (i > 0 && size >= minExonSize && isGoodIntron(genoSeq, blocks+i-1, blocks+i, isRev)) { lastExon = i; break; } } if (lastExon < 0) return NULL; for (i=firstExon; i<=lastExon; ++i) { if (vCount+2 > vAlloc) { vAlloc <<= 1; ExpandArray(vertices, vCount, vAlloc); } b = blocks+i; size = b->tEnd - b->tStart; if (size >= minExonSize) { v = vertices + vCount; v->position = b->tStart; v->type = ggSoftStart; v += 1; v->position = b->tEnd; v->type = ggSoftEnd; vCount += 2; } } /* Allocate and fill in ggMrnaCluster. */ mc = finishClusterOfOne(ma, genoSeq, vertices, vCount); mc->tStart = blocks[firstExon].tStart; mc->tEnd = blocks[lastExon].tEnd; return mc; } static boolean exonOverlaps(int eStart, int eEnd, struct ggMrnaCluster *mc) /* Returns TRUE if exon defined by eStart/eEnd overlaps with any * exon in cluster. */ { struct ggAliInfo *da; for (da = mc->mrnaList; da != NULL; da = da->next) { struct ggVertex *v = da->vertices; int vCount = da->vertexCount; int i; for (i=0; i<vCount; i+=2,v+=2) { int mStart = v[0].position; int mEnd = v[1].position; int s = max(eStart, mStart); int e = min(eEnd, mEnd); if (e - s > 0) return TRUE; } } return FALSE; } static boolean clustersOverlap(struct ggMrnaCluster *a, struct ggMrnaCluster *b) /* Return TRUE if a and b have an overlapping exon. */ { struct ggAliInfo *da; for (da = a->mrnaList; da != NULL; da = da->next) { struct ggVertex *v = da->vertices; int vCount = da->vertexCount; int i; for (i=0; i<vCount; i+=2,v+=2) { if (exonOverlaps(v[0].position, v[1].position, b)) return TRUE; } } return FALSE; } static struct ggMrnaCluster *findMergableClusters(struct ggMrnaCluster **pActiveClusters, struct ggMrnaCluster *newMc) /* Remove list of clusters that are mergable with cluster from active list * and return it as list. Assumes that only clusters on same strand of same clone * are on active list. */ { struct ggMrnaCluster *mc, *mcNext, *newActive = NULL, *mergeList = NULL; for (mc = *pActiveClusters; mc != NULL; mc = mcNext) { mcNext = mc->next; if (clustersOverlap(mc, newMc)) { slAddHead(&mergeList, mc); } else { slAddHead(&newActive, mc); } } slReverse(&newActive); *pActiveClusters = newActive; slReverse(&mergeList); return mergeList; } void ggMrnaClusterMerge(struct ggMrnaCluster *aMc, struct ggMrnaCluster *bMc) /* Merge bMc into aMc. Afterwords aMc will be bigger and * bMc will be gone. */ { aMc->refList = slCat(aMc->refList, bMc->refList); aMc->mrnaList = slCat(aMc->mrnaList, bMc->mrnaList); if (aMc->tStart > bMc->tStart) aMc->tStart = bMc->tStart; if (aMc->tEnd < bMc->tEnd) aMc->tEnd = bMc->tEnd; freez(&bMc->tName); freeMem(bMc); } static struct ggMrnaCluster *clustersFromMas(struct ggMrnaAli *maList, struct dnaSeq *genoSeq) /* Given a sorted ggMrnaAli list return list of mRNA clusters. */ { struct ggMrnaCluster *activeClusters = NULL; /* List that can still add to. */ struct ggMrnaCluster *finishedClusters = NULL; /* List that's all done. */ struct ggMrnaAli *ma; for (ma = maList; ma != NULL; ma = ma->next) { struct ggMrnaCluster *mergableClusters = NULL, *newMc = NULL; updateActiveClusters(&activeClusters, &finishedClusters, ma->strand, ma->tStart); newMc = ggMrnaClusterOfOne(ma, genoSeq); if (newMc) { mergableClusters = findMergableClusters(&activeClusters, newMc); if (mergableClusters == NULL) { slAddHead(&activeClusters, newMc); } else { struct ggMrnaCluster *mergeHead = mergableClusters; struct ggMrnaCluster *mc, *nextGg; ggMrnaClusterMerge(mergeHead, newMc); for (mc = mergeHead->next; mc != NULL; mc = nextGg) { nextGg = mc->next; ggMrnaClusterMerge(mergableClusters, mc); } slAddHead(&activeClusters, mergeHead); } } } finishedClusters = slCat(activeClusters, finishedClusters); slReverse(&finishedClusters); return finishedClusters; } struct ggMrnaCluster *ggClusterMrna(struct ggMrnaInput *ci) /* Make a list of clusters from ci. */ { struct ggMrnaCluster *mc = NULL; slSort(&ci->maList, cmpGgMrnaAliTargetStart); mc = clustersFromMas(ci->maList, ci->genoSeq); return mc; } void freeDenseAliInfo(struct ggAliInfo **pDa) /* Free up a ggAliInfo */ { struct ggAliInfo *da; if ((da = *pDa) != NULL) { freeMem(da->vertices); freez(pDa); } } void freeDenseAliInfoList(struct ggAliInfo **pDaList) /* Free up a whole ggAliInfo list. */ { struct ggAliInfo *da, *next; for (da = *pDaList; da != NULL; da = next) { next = da->next; freeDenseAliInfo(&da); } *pDaList = NULL; } void maRefFree(struct maRef **pMaRef) { struct maRef *ma; if ((ma = *pMaRef) != NULL) { ggMrnaAliFreeList(&ma->ma); } } void maRefFreeList(struct maRef **pMaRef) { struct maRef *ma, *next; for(ma = *pMaRef; ma != NULL; ma = ma->next) { next = ma->next; maRefFree(&ma); } *pMaRef = NULL; } static void freeMrnaCluster(struct ggMrnaCluster **pMc) /* Free a single mrna cluster */ { struct ggMrnaCluster *mc; if ((mc = *pMc) != NULL) { slFreeList(&mc->refList); freeDenseAliInfoList(&mc->mrnaList); freez(&mc->tName); freez(pMc); } } void ggFreeMrnaClusterList(struct ggMrnaCluster **pMcList) /* Free a list of mrna clusters. */ { struct ggMrnaCluster *mc, *next; for(mc = *pMcList; mc != NULL; mc = next) { next = mc->next; freeMrnaCluster(&mc); } *pMcList = NULL; }