aba8125cb532df17beb7c7c9bc8467a43d09e3d6 braney Wed Feb 10 13:39:27 2016 -0800 changes to allow for GenBank metadata to be held in a common table. #16809 diff --git src/hg/lib/geneGraph.c src/hg/lib/geneGraph.c index 58bd12c..6ea785f 100644 --- src/hg/lib/geneGraph.c +++ src/hg/lib/geneGraph.c @@ -1,786 +1,786 @@ /***************************************************************************** * Copyright (C) 2000 Jim Kent. This source code may be freely used * * for personal, academic, and non-profit purposes. Commercial use * * permitted only by explicit agreement with Jim Kent (jim_kent@pacbell.net) * *****************************************************************************/ /* geneGraph - Relatively short, high level routines for using * (but not necessarily creating) a geneGraph. A geneGraph * is a way of representing possible alt-splicing patterns * of a gene. */ #include "common.h" #include "dnautil.h" #include "dnaseq.h" #include "geneGraph.h" #include "ggPrivate.h" #include "altGraphX.h" #include "geneGraph.h" #include "dystring.h" - +#include "genbank.h" void ggEvidenceFree(struct ggEvidence **pEl) /* Free a single dynamically allocated ggEvidence */ { struct ggEvidence *el; if ((el = *pEl) == NULL) return; freez(pEl); } void ggEvidenceFreeList(struct ggEvidence **pList) /* Free a list of dynamically allocated ggEvidence's */ { struct ggEvidence *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; ggEvidenceFree(&el); } *pList = NULL; } void freeGeneGraph(struct geneGraph **pGg) /* Free up a gene graph. */ { struct geneGraph *gg; if ((gg = *pGg) != NULL) { int i, j, vcount; bool **em = gg->edgeMatrix; vcount = gg->vertexCount; for (i=0; i<vcount; ++i) { freez(&em[i]); if(gg->evidence != NULL) { for(j=0; j<vcount; ++j) ggEvidenceFreeList(&gg->evidence[i][j]); freez(&gg->evidence[i]); } } if(gg->evidence != NULL) freez(&gg->evidence); for(i=0; i<gg->mrnaRefCount; i++) { freez(&gg->mrnaRefs[i]); } freez(&gg->mrnaRefs); freez(&gg->mrnaTypes); freez(&gg->tName); freez(&gg->mrnaTissues); freez(&gg->mrnaLibs); freez(&gg->evidence); freeMem(em); freeMem(gg->vertices); freez(pGg); } } /* Some variables and routines for managing the depth first * traversal of geneGraph - to generate individual * constraints from it. */ static GgTransOut rOutput; /* Output transcript. */ static struct ggVertex *rVertices; /* Vertex stack. */ static int rVertexAlloc = 0; /* Allocated size of stack. */ static int rVertexCount = 0; /* Count on stack. */ static int rStart, rEnd; /* Start/end of whole set in genomic coordinates*/ static void pushVertex(struct ggVertex *v) /* Add vertex to end of rVertices. */ { if (rVertexCount >= 1000) errAbort("Looks like a loop in graph!"); if (rVertexCount >= rVertexAlloc) { if (rVertexAlloc == 0) { rVertexAlloc = 256; AllocArray(rVertices, rVertexAlloc); } else { rVertexAlloc <<= 1; ExpandArray(rVertices, rVertexCount, rVertexAlloc); } } rVertices[rVertexCount++] = *v; } static void popVertex() /* Remove vertex from end of rVertices. */ { --rVertexCount; if (rVertexCount < 0) errAbort("ggVertex stack underflow"); } static void outputTranscript() /* Output one transcript. */ { struct ggAliInfo da; da.next = NULL; da.vertices = rVertices; da.vertexCount = rVertexCount; rOutput(&da, rStart, rEnd); } static void rTraverse(struct geneGraph *gg, int startVertex) /* Recursive traversal of gene graph - printing transcripts * when hit end nodes. */ { struct ggVertex *v = &gg->vertices[startVertex]; pushVertex(v); if (v->type == ggSoftEnd) { outputTranscript(); } else { int i; int vCount = gg->vertexCount; bool *waysOut = gg->edgeMatrix[startVertex]; for (i=0; i<vCount; ++i) { if (waysOut[i]) rTraverse(gg, i); } } popVertex(); } void traverseGeneGraph(struct geneGraph *gg, int cStart, int cEnd, GgTransOut output) /* Print out constraints for gene graph. */ { int vCount = gg->vertexCount; struct ggVertex *vertices = gg->vertices; int i; rStart = cStart; rEnd = cEnd; rOutput = output; for (i=0; i<vCount; ++i) { if (vertices[i].type == ggSoftStart) rTraverse(gg, i); } } boolean vertexUsed(struct geneGraph *gg, int vertexIx) /* Is the vertex attached to anything in the edgeMatrix? */ { bool **em = gg->edgeMatrix; int vC = gg->vertexCount; int i=0; /* First check the row. */ for(i=0; i<vC; i++) if(em[vertexIx][i]) return TRUE; /* Now check the column. */ for(i=0; i<vC; i++) if(em[i][vertexIx]) return TRUE; /* Guess it isn't used. */ return FALSE; } static int countUsed(struct geneGraph *gg, int vCount, int *translator) /* Count number of vertices actually used. */ { int i; int count = 0; struct ggVertex *v = gg->vertices; for (i=0; i<vCount; ++i) { translator[i] = count; if (v[i].type != ggUnused && vertexUsed(gg,i)) ++count; } return count; } struct altGraph *ggToAltGraph(struct geneGraph *gg) /* convert a gene graph to an altGraph data structure */ { struct altGraph *ag = NULL; bool **em = gg->edgeMatrix; int edgeCount = 0; int totalVertexCount = gg->vertexCount; int usedVertexCount; int *translator; /* Translates away unused vertices. */ struct ggVertex *vertices = gg->vertices; int i,j; UBYTE *vTypes; int *vPositions, *edgeStarts, *edgeEnds; AllocArray(translator, totalVertexCount); usedVertexCount = countUsed(gg, totalVertexCount, translator); for (i=0; i<totalVertexCount; ++i) { bool *waysOut = em[i]; for (j=0; j<totalVertexCount; ++j) if (waysOut[j]) ++edgeCount; } AllocVar(ag); safef(ag->strand, sizeof(ag->strand), "%s", gg->strand); ag->tName = cloneString(gg->tName); ag->tStart = gg->tStart; ag->tEnd = gg->tEnd; ag->vertexCount = usedVertexCount; ag->vTypes = AllocArray(vTypes, usedVertexCount); ag->vPositions = AllocArray(vPositions, usedVertexCount); ag->mrnaRefCount = gg->mrnaRefCount; AllocArray(ag->mrnaRefs, gg->mrnaRefCount); for(i=0; i < gg->mrnaRefCount; i++) { ag->mrnaRefs[i] = cloneString(gg->mrnaRefs[i]); } for (i=0,j=0; i<totalVertexCount; ++i) { struct ggVertex *v = vertices+i; if (v->type != ggUnused) { vTypes[j] = v->type; vPositions[j] = v->position; ++j; } } ag->edgeCount = edgeCount; ag->edgeStarts = AllocArray(edgeStarts, edgeCount); ag->edgeEnds = AllocArray(edgeEnds, edgeCount); edgeCount = 0; for (i=0; i<totalVertexCount; ++i) { bool *waysOut = em[i]; for (j=0; j<totalVertexCount; ++j) if (waysOut[j]) { edgeStarts[edgeCount] = translator[i] ; edgeEnds[edgeCount] = translator[j]; ++edgeCount; } } freeMem(translator); return ag; } void ggPrintHeader(int count) /* Print a header with numbers for vertices. Assumes less than 100 edges, * used for debugging output. */ { int i; printf(" "); for(i=0; i<count; i++) { if(i != 0 && i % 10 == 0) printf("%d", i/10); else printf(" "); } printf("\n "); for(i=0; i<count; i++) printf("%d", i % 10); printf("\n "); for(i=0; i<count; i++) printf("-"); printf("\n"); } void ggPrintStart(int count) /* print a number plus bar, i.e. ' 0|'. Used for debugging output. */ { if(count % 10 == 0 && count != 0) printf("%d", count/10); else printf(" "); printf("%d", count%10); printf("|"); } void ggEvidenceDebug(struct geneGraph *gg) /* Dump out the edge matrix and evidence matrix for a genegraph as 1 and 0's. */ { int i, j; int totalVertexCount = gg->vertexCount; printf("EdgeMatrix:\n"); ggPrintHeader(totalVertexCount); for(i=0; i<totalVertexCount; ++i) { ggPrintStart(i); for(j=0; j<totalVertexCount; ++j) { printf("%d",gg->edgeMatrix[i][j]); } printf("\n"); } printf("evidence matrix:\n"); ggPrintHeader(totalVertexCount); for(i=0; i<totalVertexCount; ++i) { ggPrintStart(i); for(j=0; j<totalVertexCount; ++j) { printf("%d", (gg->evidence[i][j] == NULL ? 0 : 1)); } printf("\n"); } printf("difference:\n"); ggPrintHeader(totalVertexCount); for(i=0; i<totalVertexCount; ++i) { ggPrintStart(i); for(j=0; j<totalVertexCount; ++j) { if((gg->evidence[i][j] == NULL && gg->edgeMatrix[i][j] == 1)) printf("m"); else if (gg->evidence[i][j] != NULL && gg->edgeMatrix[i][j] == 0) printf("e"); else printf("0"); } printf("\n"); } } boolean checkEvidenceMatrix(struct geneGraph *gg) /* Check to make sure that no edge has more weight than possible. * Used for testing. */ { boolean allOk = TRUE; int i,j,eCount; struct ggEvidence *ge = NULL; for(i =0; i<gg->vertexCount; i++) { for(j=0; j<gg->vertexCount; j++) { ge = gg->evidence[i][j]; eCount = slCount(ge); if(eCount > gg->mrnaRefCount) { warn("Too many rnas for edge at %d-%d. Max is %d, counted %d.", i, j, gg->mrnaRefCount, eCount ); allOk = FALSE; } } } return allOk; } boolean matricesInSync(struct geneGraph *gg) /* Return TRUE if edge and evidence matrices are in synch, FALSE otherwise. * Used for testing purposes. */ { boolean sync = TRUE; int i,j; int totalVertexCount = gg->vertexCount; for(i=0; i<totalVertexCount; ++i) { for(j=0; j<totalVertexCount; ++j) { if((gg->evidence[i][j] == NULL && gg->edgeMatrix[i][j] == 1)) sync = FALSE; else if (gg->evidence[i][j] != NULL && gg->edgeMatrix[i][j] == 0) sync = FALSE; } } return sync; } void ggFillInTissuesAndLibraries(struct geneGraph *gg, struct hash *tissLibHash, struct sqlConnection *conn) /* Load up the library and tissue information for mrnas from the mrna table. If the tissLibHash != NULL use it to find the library and tissue. They will be stored as an slInt list keyed by the accessions. */ { int i; int mrnaCount = gg->mrnaRefCount; AllocArray(gg->mrnaTissues, mrnaCount); gg->mrnaLibs = needMem(sizeof(int)*mrnaCount); //gg->mrnaLibs = AllocArray(gg->mrnaLibs, mrnaCount); for(i=0; i< mrnaCount; ++i) { /* Look in the hash if provided first. */ if(tissLibHash != NULL) { struct slInt *library = NULL, *tissue=NULL; library = hashMustFindVal(tissLibHash, gg->mrnaRefs[i]); gg->mrnaLibs[i] = library->val; tissue = library->next; assert(tissue); gg->mrnaTissues[i] = tissue->val; } else { struct sqlResult *sr = NULL; char **row = NULL; char query[256]; assert(gg->mrnaRefs[i]); - sqlSafef(query, sizeof(query), "select library, tissue from gbCdnaInfo where acc='%s'", gg->mrnaRefs[i]); + sqlSafef(query, sizeof(query), "select library, tissue from %s where acc='%s'", gbCdnaInfoTable, gg->mrnaRefs[i]); sr = sqlGetResult(conn, query); row = sqlNextRow(sr); if(row == NULL) errAbort("geneGraph.c::ggFillInTissuesAndLibraries() - Couldn't load library and tissue info for est: %s using query:\n%s", gg->mrnaRefs[i], query); gg->mrnaLibs[i] = sqlSigned(row[0]); gg->mrnaTissues[i] = sqlSigned(row[1]); sqlFreeResult(&sr); } } } boolean isSameGeneGraph(struct geneGraph *gg1, struct geneGraph *gg2) /* Returns true if the gene graphs are the same, otherwise returns false. */ { boolean allOk = TRUE; int i,j; if(gg1->vertexCount != gg2->vertexCount) return FALSE; if(gg1->mrnaRefCount != gg2->mrnaRefCount) return FALSE; if(differentString(gg1->tName, gg2->tName)) allOk = FALSE; if(gg1->tStart != gg2->tStart) allOk = FALSE; if(gg1->tEnd != gg2->tEnd) allOk = FALSE; if(differentString(gg1->strand,gg2->strand)) allOk = FALSE; for(i=0; i<gg1->vertexCount; i++) for(j=0; j<gg1->vertexCount; j++) { int ev1,ev2; if(gg1->edgeMatrix[i][j] != gg2->edgeMatrix[i][j]) allOk = FALSE; ev1 = slCount(gg1->evidence[i][j]); ev2 = slCount(gg2->evidence[i][j]); if(ev1 != ev2) allOk = FALSE; } for(i=0; i<gg1->vertexCount; i++) { if(gg1->vertices[i].type != gg2->vertices[i].type) allOk = FALSE; if(gg1->vertices[i].position != gg2->vertices[i].position) allOk = FALSE; } for(i=0; i<gg1->mrnaRefCount; i++) { if(differentString(gg1->mrnaRefs[i], gg2->mrnaRefs[i])) allOk = FALSE; if(gg1->mrnaTissues[i] != gg2->mrnaTissues[i]) allOk = FALSE; if(gg1->mrnaLibs[i] != gg2->mrnaLibs[i]) allOk = FALSE; } return allOk; } enum ggEdgeType ggClassifyEdge(struct geneGraph *gg, int v1, int v2) /* Classify and edge as ggExon or ggIntron. ggExon is not * necessarily coding. */ { struct ggVertex *vertices = gg->vertices; enum ggEdgeType et = 0; /* Make sure the indexes are in bounds and edge exists. */ assert(v1 < gg->vertexCount && v2 < gg->vertexCount && gg->edgeMatrix[v1][v2]); if( (vertices[v1].type == ggHardStart || vertices[v1].type == ggSoftStart) && (vertices[v2].type == ggHardEnd || vertices[v2].type == ggSoftEnd)) et = ggExon; else if((vertices[v1].type == ggHardEnd || vertices[v1].type == ggSoftEnd) && (vertices[v2].type == ggHardStart || vertices[v2].type == ggSoftStart)) et = ggIntron; else errAbort("geneGraph::ggClassifyEdge() - " "Edge from %d -> %d has types %d-%d which isn't recognized as intron or exon.\n", v1, v2, vertices[v1].type, vertices[v2].type); #ifdef NEVER // What the heck? Flipping inton/exon on strand???? */ if(sameString(gg->strand, "-")) { if(et == ggExon) et = ggIntron; else if(et == ggIntron) et = ggExon; } #endif /* NEVER */ return et; } struct altGraphX *ggToAltGraphX(struct geneGraph *gg) /* Convert a gene graph to an altGraphX data structure for storage in * database. */ { struct altGraphX *ag = NULL; bool **em = gg->edgeMatrix; int edgeCount = 0; int totalVertexCount = gg->vertexCount; int usedVertexCount; struct dyString *accessionList = NULL; int *translator; /* Translates away unused vertices. */ struct ggVertex *vertices = gg->vertices; struct ggEvidence *ev = NULL; int i,j; UBYTE *vTypes; int *vPositions; AllocArray(translator, totalVertexCount); usedVertexCount = countUsed(gg, totalVertexCount, translator); for (i=0; i<totalVertexCount; ++i) { bool *waysOut = em[i]; for (j=0; j<totalVertexCount; ++j) if (waysOut[j] && gg->vertices[j].type != ggUnused) ++edgeCount; } AllocVar(ag); safef(ag->strand, sizeof(ag->strand), "%s", gg->strand); ag->tName = cloneString(gg->tName); ag->tStart = gg->tStart; ag->tEnd = gg->tEnd; ag->name = cloneString("NA"); ag->vertexCount = usedVertexCount; ag->vTypes = AllocArray(vTypes, usedVertexCount); ag->vPositions = AllocArray(vPositions, usedVertexCount); ag->mrnaRefCount = gg->mrnaRefCount; accessionList = newDyString(10*gg->mrnaRefCount); /* Have to print the accessions all out in the same string to conform to how the memory is handled later. */ for(i=0; i<gg->mrnaRefCount; i++) dyStringPrintf(accessionList, "%s,", gg->mrnaRefs[i]); sqlStringDynamicArray(accessionList->string, &ag->mrnaRefs, &ag->mrnaRefCount); dyStringFree(&accessionList); if(gg->mrnaRefCount > 0) { ag->mrnaTissues = CloneArray(gg->mrnaTissues, gg->mrnaRefCount); ag->mrnaLibs = CloneArray(gg->mrnaLibs, gg->mrnaRefCount); } /* convert vertexes */ for (i=0,j=0; i<totalVertexCount; ++i) { struct ggVertex *v = vertices+i; if (v->type != ggUnused && vertexUsed(gg, i)) { vTypes[j] = v->type; vPositions[j] = v->position; ++j; } } /* convert edges */ ag->edgeCount = edgeCount; AllocArray(ag->edgeStarts, edgeCount); AllocArray(ag->edgeEnds, edgeCount); AllocArray(ag->edgeTypes, edgeCount); ag->evidence = NULL; edgeCount = 0; for (i=0; i<totalVertexCount; ++i) { bool *waysOut = em[i]; if(gg->vertices[i].type == ggUnused) { for (j=0; j<totalVertexCount; ++j) { if(gg->edgeMatrix[i][j]) errAbort("Shouldn't be any connections to an unused vertex %d-%d.", i, j); } continue; } for (j=0; j<totalVertexCount; ++j) if (waysOut[j] && gg->vertices[j].type != ggUnused) { struct evidence *e; int count =0; /* Store the mrna evidence. */ AllocVar(e); e->evCount = slCount(gg->evidence[i][j]); AllocArray(e->mrnaIds, e->evCount); for(ev = gg->evidence[i][j]; ev != NULL; ev = ev->next) { assert(count < e->evCount); e->mrnaIds[count++] = ev->id; } slAddHead(&ag->evidence, e); /* Store the edge information. */ ag->edgeStarts[edgeCount] = translator[i]; ag->edgeEnds[edgeCount] = translator[j]; ag->edgeTypes[edgeCount] = ggClassifyEdge(gg, i, j); ++edgeCount; } } slReverse(&ag->evidence); freeMem(translator); return ag; } struct geneGraph *altGraphXToGG(struct altGraphX *ag) /* Convert an altGraphX to a geneGraph. Free with freeGeneGraph */ { struct geneGraph *gg = NULL; int i,j; AllocVar(gg); gg->tName = cloneString(ag->tName); gg->tStart = ag->tStart; gg->tEnd = ag->tEnd; gg->vertexCount = ag->vertexCount; safef(gg->strand, sizeof(gg->strand), "%s", ag->strand); gg->mrnaRefCount = ag->mrnaRefCount; gg->mrnaTissues = CloneArray(ag->mrnaTissues, ag->mrnaRefCount); gg->mrnaLibs = CloneArray(ag->mrnaLibs, ag->mrnaRefCount); AllocArray(gg->mrnaRefs, gg->mrnaRefCount); for(i=0; i<gg->mrnaRefCount; i++) gg->mrnaRefs[i] = cloneString(ag->mrnaRefs[i]); gg->edgeMatrix = altGraphXCreateEdgeMatrix(ag); /* will be free'd when geneGraph free'd */ AllocArray(gg->vertices, gg->vertexCount); for(i=0; i<gg->vertexCount; i++) { gg->vertices[i].position = ag->vPositions[i]; gg->vertices[i].type = ag->vTypes[i]; } AllocArray(gg->evidence, gg->vertexCount); for(i=0; i<gg->vertexCount; i++) AllocArray(gg->evidence[i], gg->vertexCount); for(i=0; i < ag->edgeCount; i++) { int x = ag->edgeStarts[i]; int y = ag->edgeEnds[i]; struct evidence *e = slElementFromIx(ag->evidence, i); for(j=0; j<e->evCount; j++) { struct ggEvidence *ge = NULL; AllocVar(ge); ge->id = e->mrnaIds[j]; ge->start = gg->vertices[x].position; ge->end = gg->vertices[y].position; ggEvAddHeadWrapper(&gg->evidence[x][y], &ge); } slReverse(&gg->evidence[x][y]); } return gg; } boolean ggIsCassetteExonEdge(struct geneGraph *gg, int vertex1, int vertex2) /* Return TRUE if there is evidence that this exon is optional * or FALSE otherwise. */ { int i,j; struct ggVertex *vertices = NULL; /* shorthand for vertices */ int vCount = 9; /* shorthand for vertexCount */ bool **em = NULL; /* shorthand for edgeMatrix */ boolean cassette = FALSE; /* final result, is this a cassette exon? */ int *sCandidates = NULL; /* array of vertices that connect to vertex1 */ int sCount = 0; /* count of vertices that connect to vertex1 */ int *eCandidates = NULL; /* array of vertices that vertex2 connects to */ int eCount = 0; /* count of vertices that vertex2 connects to */ assert(gg); vertices = gg->vertices; vCount = gg->vertexCount; em = gg->edgeMatrix; AllocArray(sCandidates, vCount); AllocArray(eCandidates, vCount); if(vertices[vertex1].type != ggHardStart && vertices[vertex1].type != ggSoftStart) { warn("geneGraph::ggIsCassetteExonEdge() - Hard to be an exon when at %s:%d-%d vertex 1 isn't a start", gg->tName, vertices[vertex1].position, vertices[vertex2].position); return FALSE; } if(vertices[vertex2].type != ggHardEnd && vertices[vertex2].type != ggSoftEnd) { warn("geneGraph::ggIsCassetteExonEdge() - Hard to be an exon when at %s:%d-%d vertex 2 isn't an end", gg->tName, vertices[vertex2].position, vertices[vertex2].position); return FALSE; } /* find all of our candidates that link to the exon */ for(i=0; i<vCount; i++) { if(i != vertex1 && em[i][vertex1]) { assert(sCount < vCount); sCandidates[sCount++] = i; } } /* find coordinate of what the exon links to */ for(i=0; i<vCount; i++) { if(i != vertex2 && em[vertex2][i]) { assert(eCount < vCount); eCandidates[eCount++] = i; } } /* check to see if the starts connect to the ends */ for(i=0; i< sCount && !cassette; i++) { for(j=0; j<eCount && !cassette; j++) { if(em[sCandidates[i]][eCandidates[j]]) cassette=TRUE; } } freez(&eCandidates); freez(&sCandidates); return cassette; } struct ggEdge *ggFindCassetteExons(struct geneGraph *gg) /* Return a list of edges that appear to be cassette exons. */ { bool **em; int vCount = 0; struct ggVertex *vertices = NULL; struct ggEdge *edgeList = NULL, *edge = NULL; int i, j; assert(gg); vertices = gg->vertices; vCount = gg->vertexCount; em = gg->edgeMatrix; for(i=0; i<vCount; i++) { for(j=0; j<vCount; j++) { if(i != j && em[i][j] && (vertices[i].type == ggHardStart) && (vertices[j].type == ggHardEnd)) { if(ggIsCassetteExonEdge(gg, i, j)) { AllocVar(edge); edge->vertex1 = i; edge->vertex2 = j; edge->type = ggExon; slAddHead(&edgeList, edge); } } } } return edgeList; } struct ggEdge *ggCreateEdge(int v1, int v2, int type) /* create and return and graph edge, free with freez(). */ { struct ggEdge *edge; AllocVar(edge); edge->vertex1 = v1; edge->vertex2 = v2; edge->type = type; return edge; }