e70152e44cc66cc599ff6b699eb8adc07f3e656a kent Sat May 24 21:09:34 2014 -0700 Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment. diff --git src/hg/lib/geneSimilarities.c src/hg/lib/geneSimilarities.c index 9dd9f4e..0387db8 100644 --- src/hg/lib/geneSimilarities.c +++ src/hg/lib/geneSimilarities.c @@ -1,210 +1,213 @@ /* Objects to find genePreds given similarity in genome bases annotated. This * is used to relate genes between two gene sets. */ + +/* Copyright (C) 2014 The Regents of the University of California + * See README in this or parent directory for licensing information. */ #include "common.h" #include "geneSimilarities.h" #include "genePred.h" #include "genePredReader.h" #include "jksql.h" static struct geneSim *geneSimNew(struct genePred *gene, float sim) /* construct a new geneSim */ { struct geneSim *gs; AllocVar(gs); gs->gene = gene; gs->sim = sim; return gs; } static void geneSimFreeList(struct geneSim **gss) /* free list of geneSims and associated genePreds */ { struct geneSim *gs; while ((gs = slPopHead(gss)) != NULL) { genePredFree(&gs->gene); freeMem(gs); } } struct geneSimilarities *geneSimilaritiesNew(struct genePred *query, boolean cdsSim) /* construct a new geneSimilarities object. Ownership of query is passed. */ { struct geneSimilarities *gss; AllocVar(gss); gss->query = query; gss->cdsSim = cdsSim; return gss; } void geneSimilaritiesFree(struct geneSimilarities **gssPtr) /* free a geneSimilarities object and all contained genePreds */ { struct geneSimilarities *gss = *gssPtr; if (gss != NULL) { geneSimFreeList(&gss->genes); genePredFree(&gss->query); freez(gssPtr); } } void geneSimilaritiesFreeList(struct geneSimilarities **gssList) /* free a list of geneSimilarities objects and all contained genePreds */ { struct geneSimilarities *gss; while ((gss = slPopHead(gssList)) != NULL) geneSimilaritiesFree(&gss); } static int cntOverlapCdsBases(struct genePred *ga, struct genePred *gb) /* determine the number of CDS bases overlaping */ { int overBases = 0, ia, ib; int cdsStartA, cdsEndA, cdsStartB, cdsEndB; for (ia = 0; ia < ga->exonCount; ia++) { if (genePredCdsExon(ga, ia, &cdsStartA, &cdsEndA)) { for (ib = 0; ib < gb->exonCount; ib++) { if (genePredCdsExon(gb, ib, &cdsStartB, &cdsEndB)) { int o = min(cdsEndA, cdsEndB) - max(cdsStartA, cdsStartB); if (o > 0) overBases += o; } } } } return overBases; } static float geneCdsSimilarity(struct genePred *ga, struct genePred *gb) /* computer a CDS similary score for two genes, which is a fraction */ { int gaBases = genePredCodingBases(ga); int gbBases = genePredCodingBases(gb); return ((float)(2*cntOverlapCdsBases(ga, gb)))/(float)(gaBases+gbBases); } static int cntOverlapExonBases(struct genePred *ga, struct genePred *gb) /* determine the number of exon bases overlaping */ { int overBases = 0, ia, ib; for (ia = 0; ia < ga->exonCount; ia++) { for (ib = 0; ib < gb->exonCount; ib++) { int o = min(ga->exonEnds[ia], gb->exonEnds[ib]) - max(ga->exonStarts[ia], gb->exonStarts[ib]); if (o > 0) overBases += o; } } return overBases; } static float geneExonSimilarity(struct genePred *ga, struct genePred *gb) /* computer an exon similary score for two genes, which is a fraction */ { int gaBases = genePredBases(ga); int gbBases = genePredBases(gb); return ((float)(2*cntOverlapExonBases(ga, gb)))/(float)(gaBases+gbBases); } int geneSimCmp(const void *va, const void *vb) /* compare geneSim objects by descending similarity score. */ { struct geneSim *ga = *((struct geneSim **)va); struct geneSim *gb = *((struct geneSim **)vb); if (ga->sim > gb->sim) return -1; else if (ga->sim < gb->sim) return -1; else return 0; } static void findSimilarities(struct geneSimilarities *gss, struct sqlConnection *conn, char *targetGeneTbl) /* find genes similar to the query in gss and add them. */ { char where[64]; sqlSafefFrag(where, sizeof(where), "strand = \"%s\"", gss->query->strand); int start = gss->cdsSim ? gss->query->cdsStart :gss->query->txStart; int end = gss->cdsSim ? gss->query->cdsEnd :gss->query->txEnd; struct genePred *targetGenes = genePredReaderLoadRangeQuery(conn, targetGeneTbl, gss->query->chrom, start, end, where); struct genePred *gp; while ((gp = slPopHead(&targetGenes)) != NULL) { float sim = gss->cdsSim ? geneCdsSimilarity(gp, gss->query) : geneExonSimilarity(gp, gss->query); slSafeAddHead(&gss->genes, geneSimNew(gp, sim)); } } struct geneSimilarities *geneSimilaritiesBuild(struct sqlConnection *conn, boolean cdsSim, struct genePred *query, char *targetGeneTbl) /* Construct a geneSimilarities object by finding all similar genes in * the specified table. Ownership of query is passed. */ { struct geneSimilarities *gss = geneSimilaritiesNew(query, cdsSim); findSimilarities(gss, conn, targetGeneTbl); slSort(&gss->genes, geneSimCmp); return gss; } struct geneSimilarities *geneSimilaritiesBuildAt(struct sqlConnection *conn, boolean cdsSim, char *queryName, char *queryChrom, int queryStart, char *queryGeneTbl, char *targetGeneTbl) /* Construct a geneSimilarities object by finding the genePred named * queryName with txStart of queryStart in queryGeneTbl and * find similar genes in targetGeneTbl. */ { char where[256]; sqlSafefFrag(where, sizeof(where), "(chrom = \"%s\") and (txStart = %d)", queryChrom, queryStart); struct genePred *query = genePredReaderLoadQuery(conn, queryGeneTbl, where); if (query == NULL) errAbort("gene %s starting at %s:%d not found in %s", queryName, queryChrom, queryStart, queryGeneTbl); // only use the first if by some wierd reason multiple are found genePredFreeList(&query->next); return geneSimilaritiesBuild(conn, cdsSim, query, targetGeneTbl); } struct geneSimilarities *geneSimilaritiesBuildAll(struct sqlConnection *conn, boolean cdsSim, char *queryName, char *queryGeneTbl, char *targetGeneTbl) /* Construct a list of geneSimilarities objects by finding all entries * of queryName in queryGeneTbl and build a geneSimilarities object for each * one from targetGeneTbl. */ { struct geneSimilarities *gssList = NULL; char where[64]; sqlSafefFrag(where, sizeof(where), "name = \"%s\"", queryName); struct genePred *queries = genePredReaderLoadQuery(conn, queryGeneTbl, where); struct genePred *query; while ((query = slPopHead(&queries)) != NULL) slSafeAddHead(&gssList, geneSimilaritiesBuild(conn, cdsSim, query, targetGeneTbl)); return gssList; }