3932f706eeb2d0b4722d35df64b39b0755b92861 hiram Tue Jan 14 10:55:27 2014 -0800 remove useless self assignment refs #12209 diff --git src/hg/lib/ggGraph.c src/hg/lib/ggGraph.c index c02047b..129d464 100644 --- src/hg/lib/ggGraph.c +++ src/hg/lib/ggGraph.c @@ -1,1384 +1,1383 @@ /***************************************************************************** * Copyright (C) 2000 Jim Kent. This source code may be freely used * * for personal, academic, and non-profit purposes. Commercial use * * permitted only by explicit agreement with Jim Kent (jim_kent@pacbell.net) * *****************************************************************************/ /* ggGraph - Third pass of routine that generates a representation * of possible splicing patterns of a gene as a graph from mrna * clusters. */ #include "common.h" #include "dnautil.h" #include "dnaseq.h" #include "localmem.h" #include "ggPrivate.h" #include "hdb.h" #include "rangeTree.h" static int maxEvidence = 500; enum{ softEndBleedLimit = 5}; /* For soft ends allow some bleeding into next intron */ static int ggEvidenceIx(struct ggEvidence *hayStack, struct ggEvidence *needle) /* return index of needle in haystack or -1 if not present */ { int count = 0; struct ggEvidence *ge = NULL; for(ge = hayStack; ge != NULL; ge = ge->next) { if(needle->id == ge->id) return count; count++; } return -1; } void ggEvAddHeadWrapper(struct ggEvidence **list, struct ggEvidence **el) /* Wrapper to avoid getting more than 50 pieces of evidence on an edge. 50 is enough that I believe it.*/ { int count = slCount(*list); if(count < maxEvidence) { slAddHead(list, *el); } else ggEvidenceFree(el); } void moveEvidenceIfUnique(struct ggEvidence **currList, struct ggEvidence **newList) /* add the new evidence to the old if not currently in in */ { struct ggEvidence *ge = NULL, *geNext = NULL; for(ge = *newList; ge != NULL; ge = geNext) { int index = -1; geNext = ge->next; index = ggEvidenceIx(*currList, ge); if(index == -1) ggEvAddHeadWrapper(currList, &ge); else ggEvidenceFree(&ge); } *newList = NULL; } static int vertexIx(struct ggVertex *array, int arraySize, int pos, int type) /* Return index of vertex in array, or -1 if not in array. */ { int i; /* Check for a hard vertex first. */ if(type == ggSoftStart || type == ggSoftEnd) { for (i=0; inext) { AllocVar(ret); ret->id = ge->id; ret->start = ge->start; ret->end = ge->end; ggEvAddHeadWrapper(&retList, &ret); } slReverse(&retList); return retList; } int findMrnaIdByName(char *name, char **mrnaRefs, int count) /* find the index of the name in the mrnaRef array, return -1 if not found */ { int i; for(i = 0; i < count ; ++i) { if(sameString(name, mrnaRefs[i])) return i; } return -1; } static struct geneGraph *makeInitialGraph(struct ggMrnaCluster *mc, struct ggMrnaInput *ci) /* Create initial gene graph from input input. */ { int vAlloc = 512; int i; struct ggVertex *vAll; int vAllCount = 0; struct ggAliInfo *da; struct geneGraph *gg = NULL; bool **em; int totalWithDupes = 0; struct maRef *ref; int mrnaRefCount; struct ggEvidence ***evM = NULL; AllocArray(vAll, vAlloc); /* First fill up array with unique vertices. */ for (da = mc->mrnaList; da != NULL; da = da->next) { int countOne = da->vertexCount; struct ggVertex *vOne = da->vertices; int vix; totalWithDupes += countOne; for (i=0; iposition, vOne->type); /* if vOne not in vALL vix == -1 and we need to insert it */ if (vix < 0) { if (vAllCount >= vAlloc) { vAlloc <<= 1; ExpandArray(vAll, vAllCount, vAlloc); } vAll[vAllCount++] = *vOne; } ++vOne; } } AllocVar(gg); /* Fill in other info from ci and mc. */ safef(gg->strand, sizeof(gg->strand), "%s", mc->strand); gg->tName = cloneString(ci->tName); gg->tStart = mc->tStart; gg->tEnd = mc->tEnd; gg->mrnaRefCount = mrnaRefCount = slCount(mc->refList); AllocArray(gg->mrnaRefs, mrnaRefCount); AllocArray(gg->mrnaTypes, mrnaRefCount); for (ref = mc->refList, i=0; ref != NULL; ref = ref->next, ++i) { gg->mrnaRefs[i] = cloneString(ref->ma->qName); gg->mrnaTypes[i] = ref->ma->sourceType; } /* Allocate gene graph and edge matrix. Also the evidence matrix */ gg->vertexCount = vAllCount; AllocArray(gg->vertices, vAllCount); memcpy(gg->vertices, vAll, vAllCount*sizeof(*vAll)); gg->edgeMatrix = AllocArray(em, vAllCount); gg->evidence = AllocArray(evM, vAllCount); for (i=0; imrnaList; da != NULL; da = da->next) { int countOne = da->vertexCount; struct ggVertex *vOne = da->vertices; int vix, lastVix; struct ggEvidence *ev = NULL; vix = vertexIx(vAll, vAllCount, vOne->position, vOne->type); for (i=1; iposition, vOne->type); assert(vix >= 0 && lastVix >= 0); em[lastVix][vix] = TRUE; ev->id = findMrnaIdByName(da->ma->qName, gg->mrnaRefs, gg->mrnaRefCount); if(ev->id < 0) errAbort("ggGraph::makeInitialGraph() - Couldn't find %s in mrnalist.", da->ma->tName); ev->start = vAll[lastVix].position; ev->end = vAll[vix].position; ggEvAddHeadWrapper(&evM[lastVix][vix], &ev); } } freeMem(vAll); return gg; } void printEdges(struct geneGraph *gg, int position) /* Print verices with position. */ { int i; for(i = 0; i < gg->vertexCount; i++) if(gg->vertices[i].position == position) fprintf(stderr, "%d\t%d\t%d\n", i, gg->vertices[i].position, gg->vertices[i].type); } static int findFurthestLinkedStart(struct geneGraph *gg, int endIx) /* Find index furthest connected start vertex. */ { int vCount = gg->vertexCount; struct ggVertex *vertices = gg->vertices; bool **em = gg->edgeMatrix; int startIx; int startPos; int bestStartIx = 0; int bestStartPos = 0x3fffffff; for (startIx=0; startIxvertexCount; i++) if(gg->edgeMatrix[vertIx][i] == TRUE) return TRUE; return FALSE; } static boolean anythingConnectsTo(struct geneGraph *gg, int vertIx) /* Return TRUE if anything connects to vertIx. */ { int i; for(i = 0; i < gg->vertexCount; i++) if(gg->edgeMatrix[i][vertIx] == TRUE) return TRUE; return FALSE; } static boolean softlyTrimStart(struct geneGraph *gg, int softIx) /* Try and trim one soft start. Return TRUE if did trim. */ { int vCount = gg->vertexCount; struct ggVertex *vertices = gg->vertices; bool **em = gg->edgeMatrix; bool *edgesOut = em[softIx]; int endIx; boolean anyTrim = FALSE; boolean anyLeft = FALSE; for (endIx=0; endIx < vCount; ++endIx) { if (edgesOut[endIx]) { int bestStart = findFurthestLinkedStart(gg, endIx); if (bestStart != softIx) { anyTrim = TRUE; edgesOut[endIx] = FALSE; moveEvidenceIfUnique(&gg->evidence[bestStart][endIx], &gg->evidence[softIx][endIx]); } else { anyLeft = TRUE; } } } if (!anyLeft && !anythingConnectsTo(gg, softIx)) vertices[softIx].type = ggUnused; return anyTrim; } static int findFurthestLinkedEnd(struct geneGraph *gg, int startIx) /* Find index furthest connected end vertex. */ { int vCount = gg->vertexCount; struct ggVertex *vertices = gg->vertices; bool **em = gg->edgeMatrix; bool *edgesOut = em[startIx]; int endIx; int endPos; int bestEndIx = 0; int bestEndPos = -0x3fffffff; for (endIx=0; endIx bestEndPos) { bestEndPos = endPos; bestEndIx = endIx; } } } return bestEndIx; } static boolean softlyTrimEnd(struct geneGraph *gg, int softIx) /* Try and trim one soft start. Return TRUE if did trim. */ { int vCount = gg->vertexCount; struct ggVertex *vertices = gg->vertices; bool **em = gg->edgeMatrix; int startIx; boolean anyTrim = FALSE; boolean anyLeft = FALSE; for (startIx=0; startIx < vCount; ++startIx) { if (em[startIx][softIx]) { int bestEnd = findFurthestLinkedEnd(gg, startIx); if (bestEnd != softIx) { anyTrim = TRUE; em[startIx][softIx] = FALSE; // update evidence moveEvidenceIfUnique(&gg->evidence[startIx][bestEnd], &gg->evidence[startIx][softIx]); } else { anyLeft = TRUE; } } } if (!anyLeft && !connectToAnything(gg, softIx)) vertices[softIx].type = ggUnused; return anyTrim; } static boolean softlyTrim(struct geneGraph *gg) /* Trim all redundant soft starts and ends. Return TRUE if trimmed * any. */ { int i; int vCount = gg->vertexCount; struct ggVertex *vertices = gg->vertices; boolean result = FALSE; for (i=0; ivertexCount; int sIx, eIx; bool **em = gg->edgeMatrix; struct ggVertex *vertices = gg->vertices; int softStartPos = vertices[softStartIx].position; /* for each vertice */ for (eIx = 0; eIx < vCount; ++eIx) { struct ggVertex *ev = vertices+eIx; int type = ev->type; /* if the type is an end */ if (type == ggSoftEnd || type == ggHardEnd) { int evpos = ev->position; int softBleedAdjust = (type == ggHardEnd ? softEndBleedLimit : 0); /* if the position of the vertice is > the softStartPosition, assumes coordinates on + strand? */ if (evpos > softStartPos) /* Why did jim comment this out? looks like now if there is a connection to any downstream exon we create the edge */ // if (evpos <= hardEndPos && evpos > softStartPos) { for (sIx = 0; sIx < vCount; ++sIx) { if (sIx != softStartIx && em[sIx][eIx]) { struct ggVertex *sv = vertices+sIx; if (sv->position <= softStartPos + softBleedAdjust) { struct ggEvidence *ge = NULL; ge = ggEvidenceClone(gg->evidence[softStartIx][endIx]); em[sIx][endIx] = TRUE; moveEvidenceIfUnique(&gg->evidence[sIx][endIx], &ge); result = TRUE; } } } } } } return result; } int blocksOverlap(struct ggMrnaBlock *block1, struct ggMrnaBlock *block2) /** Return TRUE if block1 is inside block2. */ { /* if(block1->tStart >= block2->tStart && block1->tEnd <= block2->tEnd) */ /* return TRUE; */ if(rangeIntersection(block1->tStart, block1->tEnd, block2->tStart, block2->tEnd) > 0) return TRUE; return FALSE; } boolean ggAliInfoOverlap(struct ggVertex *v, struct ggAliInfo *ai1, struct ggAliInfo *ai2, int startIx, int endIx, int start2Ix, int end2Ix) /** Return TRUE if the two alignments have an overlap in the area * defined by starts and ends, return FALSE otherwise. */ { struct ggMrnaBlock *blocks1 = NULL, *blocks2 = NULL; int i=0,j=0; blocks1 = ai1->ma->blocks; blocks2 = ai2->ma->blocks; for(i=0; ima->blockCount; i++) { /* Find the matching block in the other mRNA. */ for(j=0; jma->blockCount; j++) { if(blocksOverlap(&blocks1[i], &blocks2[j]) > 0) { return TRUE; } } } return FALSE; } boolean overlappingMRnas(struct geneGraph *gg, struct hash *aliHash, int startIx, int endIx, int start2Ix, int end2Ix) /** Check to see if the mRNAs contained in the edges defined by these vertexes overlap. */ { struct ggEvidence *edge1 = gg->evidence[startIx][endIx]; struct ggEvidence *edge2 = gg->evidence[start2Ix][end2Ix]; struct ggEvidence *ev1 = NULL, *ev2 = NULL; struct ggAliInfo *ai1 = NULL; struct ggAliInfo *ai2 = NULL; char buff[256]; boolean overlap = FALSE; for(ev1 = edge1; ev1 != NULL; ev1 = ev1->next) { safef(buff, sizeof(buff), "%s", gg->mrnaRefs[ev1->id]); ai1 = hashMustFindVal(aliHash, buff); - ai1 = ai1; for(ev2 = edge2; ev2 != NULL; ev2 = ev2->next) { safef(buff, sizeof(buff), "%s", gg->mrnaRefs[ev2->id]); ai2 = hashMustFindVal(aliHash, buff); if(ggAliInfoOverlap(gg->vertices, ai1, ai2, startIx, endIx, start2Ix, end2Ix )) return TRUE; } } return overlap; } void getStartEdgesOverlapping(struct geneGraph *gg, struct hash *mRnaAli, int softStartIx, int endIx, enum ggVertexType startType, int **oIndex, int *oCount) /** Loop through all the edges looking for ones that overlap em[softStartIx][endIx], start with hard edge, and have an mRNA that actually overlaps with the softStartix edge. */ { bool **em = gg->edgeMatrix; struct ggVertex *v = gg->vertices; int i=0; *oIndex = needMem(sizeof(int)*gg->vertexCount); for(i=0; ivertexCount; i++) { if(em[i][endIx] && v[i].type == startType) { if(overlappingMRnas(gg, mRnaAli, softStartIx, endIx, i, endIx)) { (*oIndex)[*oCount] = i; (*oCount)++; } } } } void getEndEdgesOverlapping(struct geneGraph *gg, struct hash *mRnaAli, int hardStartIx, int softEndIx, enum ggVertexType endType, int **oIndex, int *oCount) /** Loop through all the edges looking for ones that overlap em[hardStartIx][softEndIx], end with hard vertex, and have an mRNA that actually overlaps with the softEndIx vertex. */ { bool **em = gg->edgeMatrix; struct ggVertex *v = gg->vertices; int i=0; *oIndex = needMem(sizeof(int)*gg->vertexCount); for(i=0; ivertexCount; i++) { if(em[hardStartIx][i] && v[i].type == endType) { if(overlappingMRnas(gg, mRnaAli, hardStartIx, softEndIx, hardStartIx, i)) { (*oIndex)[*oCount] = i; (*oCount)++; } } } } struct hash *newAlignmentHash(struct ggMrnaCluster *mc) /** Hash the ggAliInfo's by the qName. */ { struct hash *hash = newHash(4); struct ggAliInfo *ai = NULL; for(ai=mc->mrnaList; ai != NULL; ai = ai->next) { hashAdd(hash, ai->ma->qName, ai); } return hash; } int consensusSoftStartVertex(struct geneGraph *gg, int softStartIx, int hardEndIx, int *oIndex, int oCount, boolean allowSmaller) /** Pick the best soft start out of oIndex. Return index of best start into gg->vertices, -1 if no vertex found. Allow smaller exon only if allowSmaller == TRUE. Best is defined by: - Number of other alignments that support it. - Distance from softStartIx. */ { int bestVertex = -1; struct ggVertex *v = gg->vertices; struct ggEvidence ***e = gg->evidence; int longerBetterPrior = 5; int maxDistance = 3000; int minDistance = 20; int i=0; int bestCount = -1; int bestDist = -1; boolean isHard = FALSE; for(i=0; i= 0 || allowSmaller) && /* Exon is larger or we're allowing smaller exons. */ (!isHard || v[oIndex[i]].type == ggHardStart) && /* Replace if not replacing hard with soft. */ (curCount > bestCount || /* and current count is better. */ (((curCount +longerBetterPrior) >= bestCount || curDist - bestDist <= minDistance) && /* end has decent evidence or is close */ curDist >= bestDist && curDist <= maxDistance) )) /* or start is farther upstream. */ { if(v[oIndex[i]].type == ggHardStart) isHard = TRUE; bestVertex = oIndex[i]; bestCount = curCount; bestDist = curDist; } } return bestVertex; } int consensusSoftEndVertex(struct geneGraph *gg, int hardStartIx, int softEndIx, int *oIndex, int oCount, boolean allowSmaller) /** Pick the best hard end out of oIndex. Return index of best hard end index into gg->vertices, -1 if no vertex found. Allow smaller exon only if allowSmaller == TRUE. Best is defined by: - Number of other alignments that support it. - Distance from softStartIx. */ { int bestVertex = -1; struct ggVertex *v = gg->vertices; struct ggEvidence ***e = gg->evidence; int i=0; int longerBetterPrior = 5; int bestCount = -1; int bestDist = -1; int maxDist = 3000; int minDist = 20; boolean isHard = FALSE; for(i=0; i= 0 || allowSmaller) && /* Exon is larger or we're allowing smaller exons. */ (!isHard || v[oIndex[i]].type == ggHardEnd) && /* Replace if not replacing hard with soft. */ (curCount > bestCount || /* and current count is better. */ (((curCount + longerBetterPrior )>= bestCount || curDist - bestDist <= minDist) && /* end has decent evidence or is close */ curDist >= bestDist && curDist <= maxDist) )) /* and end is farther downstream. */ { if(v[oIndex[i]].type == ggHardEnd) isHard = TRUE; bestVertex = oIndex[i]; bestCount = curCount; bestDist = curDist; } } return bestVertex; } static boolean softStartOverlapConsensusArcs(struct geneGraph *gg, struct hash *aliHash, int softStartIx, enum ggVertexType startType, boolean allowSmaller) /** For each edge that contains softStartIx see if there are overlapping edges that have a hard start. If there are, find the consensus hard start and extend original edge to it. Only allow replacement of a softStart with a shorter exon when allowSmaller is TRUE. */ { int *oIndex = NULL; /* Index into gg->vertexes array for overlapping. */ int oCount = 0; /* Number of items in oIndex and overlapping arrays. */ bool **em = gg->edgeMatrix; int vCount = gg->vertexCount; int i; int bestStart = -1; boolean changed = FALSE; for (i=0; i= 0 && bestStart != softStartIx) { em[softStartIx][i] = FALSE; moveEvidenceIfUnique(&gg->evidence[bestStart][i], &gg->evidence[softStartIx][i]); em[bestStart][i] = TRUE; changed=TRUE; } freez(&oIndex); bestStart = -1; } } return changed; } static boolean softEndOverlapConsensusArcs(struct geneGraph *gg, struct hash *aliHash, int softEndIx, enum ggVertexType endType, boolean allowSmaller) /** For each edge that contains softEndIx see if there are overlapping edges that have a hard end. If there are, find the consensus hard start and extend original edge to it. */ { int *oIndex = NULL; /* Index into gg->vertexes array for overlapping. */ int oCount = 0; /* Number of items in oIndex and overlapping arrays. */ bool **em = gg->edgeMatrix; int vCount = gg->vertexCount; int i; int bestEnd = -1; boolean changed = FALSE; for (i=0; i= 0 && bestEnd != softEndIx) { em[i][softEndIx] = FALSE; moveEvidenceIfUnique(&gg->evidence[i][bestEnd], &gg->evidence[i][softEndIx]); em[i][bestEnd] = TRUE; changed=TRUE; } freez(&oIndex); bestEnd = -1; } } return changed; } static boolean softStartArcs(struct geneGraph *gg, int softStartIx) /* Replace a soft start with all hard starts of overlapping, * but not conflicting exons. */ { int vCount = gg->vertexCount; bool **em = gg->edgeMatrix; boolean result = FALSE; boolean anyLeft = FALSE; int i; for (i=0; ievidence[softStartIx][i]); result = TRUE; } else { anyLeft = TRUE; } } } if (!anyLeft && !anythingConnectsTo(gg, softStartIx)) { gg->vertices[softStartIx].type = ggUnused; } return result; } static boolean arcsForSoftEndedExon(struct geneGraph *gg, int startIx, int softEndIx) /* Put in arcs between hard start, and hard ends of any compatible * exons overlapping this one */ { boolean result = FALSE; int vCount = gg->vertexCount; int sIx, eIx; bool **em = gg->edgeMatrix; struct ggVertex *vertices = gg->vertices; int softEndPos = vertices[softEndIx].position; for (sIx = 0; sIx < vCount; ++sIx) { struct ggVertex *sv = vertices+sIx; int type = sv->type; if (type == ggSoftStart || type == ggHardStart) { int svpos = sv->position; int softBleedAdjust = (type == ggHardStart ? softEndBleedLimit : 0); // if (svpos >= startPos && svpos < softEndPos) if (svpos < softEndPos) { for (eIx = 0; eIx < vCount; ++eIx) { if (eIx != softEndIx && em[sIx][eIx]) { struct ggVertex *ev = vertices+eIx; if (ev->position >= softEndPos - softBleedAdjust) { struct ggEvidence *ge = NULL; ge = ggEvidenceClone(gg->evidence[startIx][softEndIx]); em[startIx][eIx] = TRUE; moveEvidenceIfUnique(&gg->evidence[startIx][eIx], &ge); result = TRUE; } } } } } } return result; } static boolean softEndArcs(struct geneGraph *gg, int softEndIx) /* Replace a soft end with all hard ends of overlapping, * but not conflicting gexons. */ { int vCount = gg->vertexCount; bool **em = gg->edgeMatrix; boolean result = FALSE; boolean anyLeft = FALSE; int i; for (i=0; ievidence[i][softEndIx]); result = TRUE; } else { anyLeft = TRUE; } } } if (!anyLeft && !connectToAnything(gg, softEndIx)) { gg->vertices[softEndIx].type = ggUnused; } return result; } static boolean arcsForOverlaps(struct geneGraph *gg) /* Add in arcs for exons with soft ends that overlap other * exons. */ { int i; int vCount = gg->vertexCount; struct ggVertex *vertices = gg->vertices; boolean result = FALSE; for (i=0; ivertexCount; struct ggVertex *vertices = gg->vertices; bool **em = gg->edgeMatrix; int eStart = vertices[startIx].position; for (i=0; itype; if ((type == ggSoftStart || type == ggHardStart) && sv->position <= eStart) { for (j=0; jtype; if ((type == ggSoftEnd) || ((type == ggHardEnd) && (dv->position >= endIx))) { *retStartIx = i; *retEndIx = j; return TRUE; } } } } } return FALSE; } static boolean hideLittleOrphans(struct geneGraph *gg) /* Get rid of exons which are soft edged on both sides and * completely enclosed by another exon. * (The arcsForOverlaps doesn't handle these. */ { int i,j; int vCount = gg->vertexCount; struct ggVertex *vertices = gg->vertices; bool **em = gg->edgeMatrix; boolean result = FALSE; for (i=0; ievidence[otherI][otherJ], &gg->evidence[i][j]); result = TRUE; } else anyLeft = TRUE; } } if (!anyLeft) { vertices[i].type = ggUnused; } } } return result; } void ggOutputDebug(struct geneGraph *gg, int offset) /* convert to altGraphX and output to stdout */ { struct altGraphX *ag = NULL; ag = ggToAltGraphX(gg); altGraphXoffset(ag, offset); altGraphXVertPosSort(ag); altGraphXTabOut(ag, stdout); altGraphXFree(&ag); } boolean softlyTrimConsensus(struct geneGraph *gg, struct hash *aliHash) /** Extend soft starts and ends to a consensus start or end if it exists. */ { int i; int vCount = gg->vertexCount; struct ggVertex *vertices = gg->vertices; boolean result = FALSE; for (i=0; igenoSeq; struct ggVertex *v = gg->vertices; int i=0; for(i=0; i <= v[hardStart].position - v[softStart].position; i++) { if(seq->dna[v[hardStart].position - i] != seq->dna[v[upStreamEnd].position -i]) return FALSE; } return TRUE; } void tryToFixSoftStartAlignment(struct geneGraph *gg, struct ggMrnaCluster *mc, struct hash *aliHash, int softStart, int minOverlap) /** Try to fix alignments where the EST/mRNA should reach into the next exon but it isn't big enough for blat to find it. */ { int i, j, k; int vCount = gg->vertexCount; bool **em = gg->edgeMatrix; struct ggVertex *v = gg->vertices; for(i=0; i 0 && v[j].position - v[softStart].position <= 5) { for(k=0; kevidence[softStart][i]); return; } /* Else if we are less than the minimum distance drop the edge. */ else if(abs(v[softStart].position - v[j].position) < minOverlap) { em[softStart][i] = FALSE; ggEvidenceFreeList(&gg->evidence[softStart][i]); return; } } } } } } } } boolean matchAsWellSoftEnd(struct geneGraph *gg, struct ggMrnaCluster *mc, int hardStart, int softEnd, int hardEnd, int downStreamStart) /** Return TRUE if the changing the alignment to gap accross hardEnd -- downStreamStart produces as good of an alignment as hardStart -- softEnd. */ { struct dnaSeq *seq = mc->genoSeq; struct ggVertex *v = gg->vertices; int i=0; for(i=0; i < v[softEnd].position - v[hardEnd].position; i++) { if(seq->dna[v[hardEnd].position + i] != seq->dna[v[downStreamStart].position + i]) return FALSE; } return TRUE; } void tryToFixSoftEndAlignment(struct geneGraph *gg, struct ggMrnaCluster *mc, struct hash *aliHash, int softEnd, int minOverlap) /** Try to fix alignments where the EST/mRNA should reach into the next exon but it isn't big enough for blat to find it. If an overlap is less than minOverlap it will be removed anyhow. */ { int i, j, k; int vCount = gg->vertexCount; bool **em = gg->edgeMatrix; struct ggVertex *v = gg->vertices; for(i=0; i 0 && v[softEnd].position - v[j].position <= 5) { for(k=0; kevidence[i][softEnd]); return; } /* Else if we are less than the minimum distance drop the edge. */ else if( abs(v[softEnd].position - v[j].position) < minOverlap) { em[i][softEnd] = FALSE; ggEvidenceFreeList(&gg->evidence[i][softEnd]); return; } } } } } } } } void fixLargeMrnaInserts(struct geneGraph *gg) /* If there is an insert in the mRNA can end up with cases where there is a soft end and soft start at the same genomic position. Splice these out and connect the two upstream and downstream vertices. */ { struct ggVertex *v = gg->vertices; int vStart = -1, vEnd = -1, vUp = -1, vDown = -1; int vC = gg->vertexCount; bool **em = gg->edgeMatrix; for(vStart = 0; vStart < vC; vStart++) { for(vEnd = 0; vEnd < vC; vEnd++) { /* If we have two vertices with same position that connect to eachother try to find outside things that they connect to and splice out self connection. */ if(v[vStart].position == v[vEnd].position && em[vStart][vEnd] && v[vStart].type == ggSoftEnd && v[vEnd].type == ggSoftStart) { /* Got one - Try to find all upstream and downstream connections. */ boolean foundOne = FALSE; for(vUp = 0; vUp < vC; vUp++) { for(vDown = 0; vDown < vC; vDown++) { if(em[vUp][vStart] && em[vEnd][vDown]) { struct ggEvidence *newList = ggEvidenceClone(gg->evidence[vStart][vEnd]); moveEvidenceIfUnique(&gg->evidence[vUp][vDown], &newList); em[vUp][vDown] = TRUE; em[vUp][vStart] = FALSE; em[vEnd][vDown] = FALSE; foundOne = TRUE; } } } if(foundOne == TRUE) { em[vStart][vEnd] = FALSE; ggEvidenceFreeList(&gg->evidence[vStart][vEnd]); } } } } } static void updatePointsInEdgeWithMedianOfEvidence(struct geneGraph *gg, int v1, int v2) /* Replace point positions with median values of evidence for edge. */ { struct ggEvidence *evList = gg->evidence[v1][v2]; int evCount = slCount(evList); int sorter[evCount]; int i; struct ggEvidence *ev; for (ev = evList, i=0; ev != NULL; ev = ev->next, i++) sorter[i] = ev->start; gg->vertices[v1].position = intMedian(evCount, sorter); for (ev = evList, i=0; ev != NULL; ev = ev->next, i++) sorter[i] = ev->end; gg->vertices[v2].position = intMedian(evCount, sorter); } static void trimEmptyEdges(struct geneGraph *gg) /* Get rid of edges that are less than zero length. */ { int vertexCount = gg->vertexCount; int i,j; for(i=0; ivertices[i].position; for (j=0; jedgeMatrix[i][j]) { int pos2 = gg->vertices[j].position; int edgeSize = pos2-pos1; if (edgeSize <= 0) { gg->edgeMatrix[i][j] = FALSE; ggEvidenceFreeList(&gg->evidence[i][j]); } } } } } static void mergeDoubleSofts(struct geneGraph *gg) /* Merge together overlapping edges with soft ends. */ { struct mergedEdge /* Hold together info on a merged edge. */ { int vertex1, vertex2; struct ggEvidence *evidence; }; int i,j; /* Traverse graph and build up range tree */ struct rbTree *rangeTree = rangeTreeNew(); bool **edgeMatrix = gg->edgeMatrix; int vertexCount = gg->vertexCount; for (i=0; ivertices[i]; struct ggVertex *end = &gg->vertices[j]; if (start->position < end->position) if (start->type == ggSoftStart && end->type == ggSoftEnd) rangeTreeAdd(rangeTree, start->position, end->position); } } /* Traverse graph again merging edges */ struct ggEvidence ***evidence = gg->evidence; for (i=0; ivertices[i]; struct ggVertex *end = &gg->vertices[j]; if (start->type == ggSoftStart && end->type == ggSoftEnd) { if (start->position < end->position) { struct range *r = rangeTreeFindEnclosing(rangeTree, start->position, end->position); assert(r != NULL); struct mergedEdge *mergeEdge = r->val; if (mergeEdge == NULL) { lmAllocVar(rangeTree->lm, mergeEdge); r->val = mergeEdge; } if (start->position == r->start) mergeEdge->vertex1 = i; if (end->position == r->end) mergeEdge->vertex2 = j; moveEvidenceIfUnique(&mergeEdge->evidence, &evidence[i][j]); edgeMatrix[i][j] = FALSE; } } } } /* Traverse merged edge list */ struct range *r; for (r = rangeTreeList(rangeTree); r != NULL; r = r->next) { struct mergedEdge *mergeEdge = r->val; int v1 = mergeEdge->vertex1; int v2 = mergeEdge->vertex2; evidence[v1][v2] = mergeEdge->evidence; updatePointsInEdgeWithMedianOfEvidence(gg, v1, v2); edgeMatrix[v1][v2] = TRUE; } trimEmptyEdges(gg); rbTreeFree(&rangeTree); } int ggCountEdges(struct geneGraph *gg) /* Count number of edges. */ { int edgeCount = 0; int vertexCount = gg->vertexCount; int i,j; bool **edgeMatrix = gg->edgeMatrix; for(i=0; ivertexCount; int i,j; struct ggEvidence ***evidence = gg->evidence; for(i=0; ivertexCount, ggCountEdges(gg), ggCountAllEvidence(gg)); if (ci->genoSeq != NULL) { for(i=0; ivertexCount; i++) { if(gg->vertices[i].type == ggSoftStart) tryToFixSoftStartAlignment(gg, mc, aliHash, i, minOverlap); if(gg->vertices[i].type == ggSoftEnd) tryToFixSoftEndAlignment(gg, mc, aliHash, i, minOverlap); } } /* Try to fill in soft ends and starts with hard starts and ends. */ for(i=0; ivertexCount; i++) { if(gg->vertices[i].type == ggSoftStart) softStartOverlapConsensusArcs(gg, aliHash, i, ggHardStart, FALSE); else if(gg->vertices[i].type == ggSoftEnd) softEndOverlapConsensusArcs(gg, aliHash, i, ggHardEnd, FALSE); } /* Fill in soft ends and starts with the "best" soft end or start. */ softlyTrimConsensus(gg, aliHash); mergeDoubleSofts(gg); hideLittleOrphans(gg); fixLargeMrnaInserts(gg); if(fillInEvidence) { if(tissLibHash == NULL) conn = hAllocConn(genomeDb); ggFillInTissuesAndLibraries(gg, tissLibHash, conn); } else { AllocArray(gg->mrnaTissues, gg->mrnaRefCount); AllocArray(gg->mrnaLibs, gg->mrnaRefCount); } hFreeConn(&conn); hashFree(&aliHash); return gg; } struct geneGraph *ggGraphCluster(char *genomeDb, struct ggMrnaCluster *mc, struct ggMrnaInput *ci) /* Make up a gene transcript graph out of the ggMrnaCluster. */ { struct sqlConnection *conn = hAllocConn(genomeDb); struct geneGraph *gg = makeInitialGraph(mc, ci); arcsForOverlaps(gg); softlyTrim(gg); hideLittleOrphans(gg); ggFillInTissuesAndLibraries(gg, NULL, conn); hFreeConn(&conn); return gg; } void printEdgesWithVertex(struct geneGraph *gg, int queryIx) /* Print all of the edges that contain a given vertex. Useful for debugging. */ { int i = 0; int vC = gg->vertexCount; bool **em = gg->edgeMatrix; struct ggVertex *query = &gg->vertices[queryIx]; for(i = 0; i < vC; i++) { struct ggVertex *v = &gg->vertices[i]; if(em[i][queryIx]) printf("%d-%d %d-%d %d-%d\n", i, queryIx, v->position, query->position, v->type, query->type); if(em[queryIx][i]) printf("%d-%d %d-%d %d-%d\n", queryIx, i, query->position, v->position, query->type, v->type); } }