533112afe2a2005e80cdb1f82904ea65032d4302
braney
  Sat Oct 2 11:37:34 2021 -0700
split hg/lib into two separate libaries, one only used by the cgis

diff --git src/hg/cgilib/geneSimilarities.c src/hg/cgilib/geneSimilarities.c
new file mode 100644
index 0000000..0387db8
--- /dev/null
+++ src/hg/cgilib/geneSimilarities.c
@@ -0,0 +1,213 @@
+/* Objects to find genePreds given similarity in genome bases annotated.  This
+ * is used to relate genes between two gene sets. */
+
+/* Copyright (C) 2014 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+#include "common.h"
+#include "geneSimilarities.h"
+#include "genePred.h"
+#include "genePredReader.h"
+#include "jksql.h"
+
+static struct geneSim *geneSimNew(struct genePred *gene, float sim)
+/* construct a new geneSim */
+{
+struct geneSim *gs;
+AllocVar(gs);
+gs->gene = gene;
+gs->sim = sim;
+return gs;
+}
+
+static void geneSimFreeList(struct geneSim **gss)
+/* free list of geneSims and associated genePreds */
+{
+struct geneSim *gs;
+while ((gs = slPopHead(gss)) != NULL)
+    {
+    genePredFree(&gs->gene);
+    freeMem(gs);
+    }
+}
+
+struct geneSimilarities *geneSimilaritiesNew(struct genePred *query,
+                                             boolean cdsSim)
+/* construct a new geneSimilarities object. Ownership of query is passed. */
+{
+struct geneSimilarities *gss;
+AllocVar(gss);
+gss->query = query;
+gss->cdsSim = cdsSim;
+return gss;
+}
+
+void geneSimilaritiesFree(struct geneSimilarities **gssPtr)
+/* free a geneSimilarities object and all contained genePreds */
+{
+struct geneSimilarities *gss = *gssPtr;
+if (gss != NULL)
+    {
+    geneSimFreeList(&gss->genes);
+    genePredFree(&gss->query);
+    freez(gssPtr);
+    }
+}
+
+void geneSimilaritiesFreeList(struct geneSimilarities **gssList)
+/* free a list of geneSimilarities objects and all contained genePreds */
+{
+struct geneSimilarities *gss;
+while ((gss = slPopHead(gssList)) != NULL)
+    geneSimilaritiesFree(&gss);
+}
+                                             
+static int cntOverlapCdsBases(struct genePred *ga, struct genePred *gb)
+/* determine the number of CDS bases overlaping */
+{
+int overBases = 0, ia, ib;
+int cdsStartA, cdsEndA, cdsStartB, cdsEndB;
+
+for (ia = 0; ia < ga->exonCount; ia++)
+    {
+    if (genePredCdsExon(ga, ia, &cdsStartA, &cdsEndA))
+        {
+        for (ib = 0; ib < gb->exonCount; ib++)
+            {
+            if (genePredCdsExon(gb, ib, &cdsStartB, &cdsEndB))
+                {
+                int o = min(cdsEndA, cdsEndB) - max(cdsStartA, cdsStartB);
+                if (o > 0)
+                    overBases += o;
+                }
+            }
+        }
+    }
+return overBases;
+}
+
+static float geneCdsSimilarity(struct genePred *ga, struct genePred *gb)
+/* computer a CDS similary score for two genes, which is a fraction */
+{
+int gaBases = genePredCodingBases(ga);
+int gbBases = genePredCodingBases(gb);
+return ((float)(2*cntOverlapCdsBases(ga, gb)))/(float)(gaBases+gbBases);
+}
+
+static int cntOverlapExonBases(struct genePred *ga, struct genePred *gb)
+/* determine the number of exon bases overlaping */
+{
+int overBases = 0, ia, ib;
+for (ia = 0; ia < ga->exonCount; ia++)
+    {
+    for (ib = 0; ib < gb->exonCount; ib++)
+        {
+        int o = min(ga->exonEnds[ia], gb->exonEnds[ib])
+            - max(ga->exonStarts[ia], gb->exonStarts[ib]);
+        if (o > 0)
+            overBases += o;
+        }
+    }
+return overBases;
+}
+
+static float geneExonSimilarity(struct genePred *ga, struct genePred *gb)
+/* computer an exon similary score for two genes, which is a fraction */
+{
+int gaBases = genePredBases(ga);
+int gbBases = genePredBases(gb);
+return ((float)(2*cntOverlapExonBases(ga, gb)))/(float)(gaBases+gbBases);
+}
+
+int geneSimCmp(const void *va, const void *vb)
+/* compare geneSim objects by descending similarity score. */
+{
+struct geneSim *ga = *((struct geneSim **)va);
+struct geneSim *gb = *((struct geneSim **)vb);
+
+if (ga->sim > gb->sim)
+    return -1;
+else if (ga->sim < gb->sim)
+    return -1;
+else
+    return 0;
+}
+
+static void findSimilarities(struct geneSimilarities *gss,
+                             struct sqlConnection *conn,
+                             char *targetGeneTbl)
+/* find genes similar to the query in gss and add them. */
+{
+char where[64];
+sqlSafefFrag(where, sizeof(where), "strand = \"%s\"", gss->query->strand);
+int start = gss->cdsSim ? gss->query->cdsStart :gss->query->txStart;
+int end = gss->cdsSim ? gss->query->cdsEnd :gss->query->txEnd;
+
+struct genePred *targetGenes
+    = genePredReaderLoadRangeQuery(conn, targetGeneTbl, gss->query->chrom,
+                                   start, end, where);
+struct genePred *gp;
+while ((gp = slPopHead(&targetGenes)) != NULL)
+    {
+    float sim = gss->cdsSim
+        ? geneCdsSimilarity(gp, gss->query)
+        : geneExonSimilarity(gp, gss->query);
+    slSafeAddHead(&gss->genes, geneSimNew(gp, sim));
+    }
+}
+
+struct geneSimilarities *geneSimilaritiesBuild(struct sqlConnection *conn,
+                                               boolean cdsSim,
+                                               struct genePred *query,
+                                               char *targetGeneTbl)
+/* Construct a geneSimilarities object by finding all similar genes in
+ * the specified table. Ownership of query is passed. */
+{
+struct geneSimilarities *gss = geneSimilaritiesNew(query, cdsSim);
+findSimilarities(gss, conn, targetGeneTbl);
+slSort(&gss->genes, geneSimCmp);
+return gss;
+}
+
+struct geneSimilarities *geneSimilaritiesBuildAt(struct sqlConnection *conn,
+                                                 boolean cdsSim,
+                                                 char *queryName,
+                                                 char *queryChrom,
+                                                 int queryStart,
+                                                 char *queryGeneTbl,
+                                                 char *targetGeneTbl)
+/* Construct a geneSimilarities object by finding the genePred named
+ * queryName with txStart of queryStart in queryGeneTbl and
+ * find similar genes in targetGeneTbl. */
+{
+char where[256];
+sqlSafefFrag(where, sizeof(where), "(chrom = \"%s\") and (txStart = %d)", queryChrom, queryStart);
+struct genePred *query = genePredReaderLoadQuery(conn, queryGeneTbl, where);
+if (query == NULL)
+    errAbort("gene %s starting at %s:%d not found in %s", queryName,
+             queryChrom, queryStart, queryGeneTbl);
+
+// only use the first if by some wierd reason multiple are found
+genePredFreeList(&query->next);
+
+return geneSimilaritiesBuild(conn, cdsSim, query, targetGeneTbl);
+}
+
+struct geneSimilarities *geneSimilaritiesBuildAll(struct sqlConnection *conn,
+                                                  boolean cdsSim, 
+                                                  char *queryName,
+                                                  char *queryGeneTbl,
+                                                  char *targetGeneTbl)
+/* Construct a list of geneSimilarities objects by finding all entries
+ * of queryName in queryGeneTbl and build a geneSimilarities object for each
+ * one from targetGeneTbl. */
+{
+struct geneSimilarities *gssList = NULL;
+char where[64];
+sqlSafefFrag(where, sizeof(where), "name = \"%s\"", queryName);
+struct genePred *queries = genePredReaderLoadQuery(conn, queryGeneTbl, where);
+struct genePred *query;
+while ((query = slPopHead(&queries)) != NULL)
+    slSafeAddHead(&gssList, geneSimilaritiesBuild(conn, cdsSim, query, targetGeneTbl));
+return gssList;
+}
+