src/hg/near/hgClusterGenes/hgClusterGenes.c 1.13
1.13 2009/09/23 18:42:23 angie
Fixed compiler warnings from gcc 4.3.3, mostly about system calls whose return values weren't checked and non-literal format strings with no args.
Index: src/hg/near/hgClusterGenes/hgClusterGenes.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/near/hgClusterGenes/hgClusterGenes.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -b -B -U 1000000 -r1.12 -r1.13
--- src/hg/near/hgClusterGenes/hgClusterGenes.c 3 Sep 2008 19:20:41 -0000 1.12
+++ src/hg/near/hgClusterGenes/hgClusterGenes.c 23 Sep 2009 18:42:23 -0000 1.13
@@ -1,419 +1,419 @@
/* hgClusterGenes - Cluster overlapping gene predictions. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "dlist.h"
#include "jksql.h"
#include "genePred.h"
#include "hdb.h"
#include "hgRelate.h"
#include "binRange.h"
#include "rbTree.h"
static char const rcsid[] = "$Id$";
void usage()
/* Explain usage and exit. */
{
errAbort(
"hgClusterGenes - Cluster overlapping gene predictions\n"
"usage:\n"
" hgClusterGenes database geneTable clusterTable cannonicalTable\n"
"where clusterTable pairs clusterIds and geneIds, and cannonicalTable\n"
"pairs the longest protein isoform with the clusterId\n"
"options:\n"
" -chrom=chrN - Just work on one chromosome\n"
" -verbose=N - Print copious debugging info. 0 for none, 3 for loads\n"
" -noProt - Skip protein field\n"
" -sangerLinks - Use sangerLinks table for protein\n"
" -protAccQuery='query' - Use query to retrieve gene->protein map\n"
);
}
boolean noProt = FALSE;
boolean sangerLinks = FALSE;
char *protAccQuery = NULL;
static struct optionSpec options[] = {
{"chrom", OPTION_STRING},
{"noProt", OPTION_BOOLEAN},
{"sangerLinks", OPTION_BOOLEAN},
{"protAccQuery", OPTION_STRING},
{NULL, 0},
};
struct cluster
/* A cluster of overlapping genes. */
{
struct cluster *next; /* Next in list. */
struct hash *geneHash; /* Associated genes. */
int start,end; /* Range covered by cluster. */
};
void clusterDump(int verbosity, struct cluster *cluster)
/* Dump contents of cluster to log. */
{
struct hashEl *helList = hashElListHash(cluster->geneHash);
struct hashEl *hel;
verbose(verbosity, "%d-%d", cluster->start, cluster->end);
for (hel = helList; hel != NULL; hel = hel->next)
{
struct genePred *gp = hel->val;
verbose(verbosity, " %s", gp->name);
}
verbose(verbosity, "\n");
}
struct cluster *clusterNew()
/* Create new cluster. */
{
struct cluster *cluster;
AllocVar(cluster);
cluster->geneHash = hashNew(5);
return cluster;
}
void clusterFree(struct cluster **pCluster)
/* Free up a cluster. */
{
struct cluster *cluster = *pCluster;
if (cluster == NULL)
return;
hashFree(&cluster->geneHash);
freez(pCluster);
}
void clusterFreeList(struct cluster **pList)
/* Free a list of dynamically allocated cluster's */
{
struct cluster *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
clusterFree(&el);
}
*pList = NULL;
}
int clusterCmp(const void *va, const void *vb)
/* Compare to sort based on start. */
{
const struct cluster *a = *((struct cluster **)va);
const struct cluster *b = *((struct cluster **)vb);
return a->start - b->start;
}
void clusterAddExon(struct cluster *cluster,
int start, int end, struct genePred *gp)
/* Add exon to cluster. */
{
if (!hashLookup(cluster->geneHash, gp->name))
hashAdd(cluster->geneHash, gp->name, gp);
if (cluster->start == cluster->end)
{
cluster->start = start;
cluster->end = end;
}
else
{
if (start < cluster->start) cluster->start = start;
if (cluster->end < end) cluster->end = end;
}
}
void addExon(struct binKeeper *bk, struct dlNode *clusterNode,
int start, int end, struct genePred *gp)
/* Add exon to cluster and binKeeper. */
{
clusterAddExon(clusterNode->val, start, end, gp);
binKeeperAdd(bk, start, end, clusterNode);
}
void mergeClusters(struct binKeeper *bk, struct binElement *bkRest,
struct dlNode *aNode, struct dlNode *bNode)
/* Move bNode into aNode. */
{
struct cluster *aCluster = aNode->val;
struct cluster *bCluster = bNode->val;
struct binElement *bkEl;
struct hashEl *hEl, *hList;
verbose(3, " a: ");
clusterDump(3, aCluster);
verbose(3, " b: ");
clusterDump(3, bCluster);
/* First change references to bNode. */
binKeeperReplaceVal(bk, bCluster->start, bCluster->end, bNode, aNode);
for (bkEl = bkRest; bkEl != NULL; bkEl = bkEl->next)
if (bkEl->val == bNode)
bkEl->val = aNode;
/* Add b's genes to a. */
hList = hashElListHash(bCluster->geneHash);
for (hEl = hList; hEl != NULL; hEl = hEl->next)
hashAdd(aCluster->geneHash, hEl->name, hEl->val);
/* Adjust start/end. */
if (bCluster->start < aCluster->start)
aCluster->start = bCluster->start;
if (aCluster->end < bCluster->end)
aCluster->end = bCluster->end;
/* Remove all traces of bNode. */
dlRemove(bNode);
clusterFree(&bCluster);
verbose(3, " ab: ");
clusterDump(3, aCluster);
}
int totalGeneCount = 0;
int totalClusterCount = 0;
struct cluster *makeCluster(struct genePred *geneList, int chromSize)
/* Create clusters out of overlapping genes. */
{
struct binKeeper *bk = binKeeperNew(0, chromSize);
struct dlList *clusters = newDlList();
struct dlNode *node;
struct cluster *clusterList = NULL, *cluster;
struct genePred *gp;
/* Build up cluster list with aid of binKeeper.
* For each exon look to see if it overlaps an
* existing cluster. If so put it into existing
* cluster, otherwise make a new cluster. The hard
* case is where part of the gene is already in one
* cluster and the exon overlaps with a new
* cluster. In this case merge the new cluster into
* the old one. */
for (gp = geneList; gp != NULL; gp = gp->next)
{
int exonIx;
struct dlNode *oldNode = NULL;
verbose(2, "%s %d-%d\n", gp->name, gp->txStart, gp->txEnd);
for (exonIx = 0; exonIx < gp->exonCount; ++exonIx)
{
int exonStart = gp->exonStarts[exonIx];
int exonEnd = gp->exonEnds[exonIx];
struct binElement *bEl, *bList = binKeeperFind(bk, exonStart, exonEnd);
verbose(4, " %d %d\n", exonIx, exonStart);
if (bList == NULL)
{
if (oldNode == NULL)
{
struct cluster *cluster = clusterNew();
oldNode = dlAddValTail(clusters, cluster);
}
addExon(bk, oldNode, exonStart, exonEnd, gp);
}
else
{
for (bEl = bList; bEl != NULL; bEl = bEl->next)
{
struct dlNode *newNode = bEl->val;
if (newNode != oldNode)
{
if (oldNode == NULL)
{
/* Add to existing cluster. */
oldNode = newNode;
}
else
{
/* Merge new cluster into old one. */
verbose(3, "Merging %p %p\n", oldNode, newNode);
mergeClusters(bk, bEl->next, oldNode, newNode);
}
}
}
addExon(bk, oldNode, exonStart, exonEnd, gp);
slFreeList(&bList);
}
}
}
/* We build up the cluster list as a doubly-linked list
* to make it faster to remove clusters that get merged
* into another cluster. At the end though we make
* a singly-linked list out of it. */
for (node = clusters->tail; !dlStart(node); node=node->prev)
{
cluster = node->val;
slAddHead(&clusterList, cluster);
}
slSort(&clusterList, clusterCmp);
/* Clean up and go home. */
binKeeperFree(&bk);
dlListFree(&clusters);
return clusterList;
}
void createClusterTable(struct sqlConnection *conn,
char *tableName, int longestName)
/* Create cluster table. */
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy,
"CREATE TABLE %s (\n"
" clusterId int unsigned not null,\n"
" transcript varchar(255) not null,\n"
" INDEX(transcript(%d)),\n"
" INDEX(clusterId))\n"
, tableName, longestName);
sqlRemakeTable(conn, tableName, dy->string);
dyStringFree(&dy);
}
void createCannonicalTable(struct sqlConnection *conn,
char *tableName, int longestName)
/* Create cannonical representative of cluster table. */
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy,
"CREATE TABLE %s (\n"
" chrom varchar(255) not null,\n"
" chromStart int not null,\n"
" chromEnd int not null,\n"
" clusterId int unsigned not null,\n"
" transcript varchar(255) not null,\n"
" protein varchar(255) not null,\n"
" UNIQUE(clusterId),\n"
" INDEX(transcript(%d)),\n"
" INDEX(protein(8)))\n"
, tableName, longestName);
sqlRemakeTable(conn, tableName, dy->string);
dyStringFree(&dy);
}
int clusterId = 0; /* Assign a unique id to each cluster. */
int longestName = 1; /* Longest gene name. */
void clusterGenesOnStrand(char *database, struct sqlConnection *conn,
char *geneTable, char *chrom, char strand,
FILE *clusterFile, FILE *canFile)
/* Scan through genes on this strand, cluster, and write clusters to file. */
{
struct sqlResult *sr;
char **row;
int rowOffset;
struct genePred *gp, *gpList = NULL;
char extraWhere[64], query[256];
struct cluster *clusterList = NULL, *cluster;
int nameLen;
struct hash *protHash = NULL;
verbose(1, "%s %c\n", chrom, strand);
safef(extraWhere, sizeof(extraWhere), "strand = '%c'", strand);
sr = hChromQuery(conn, geneTable, chrom, extraWhere, &rowOffset);
while ((row = sqlNextRow(sr)) != NULL)
{
gp = genePredLoad(row + rowOffset);
nameLen = strlen(gp->name);
slAddHead(&gpList, gp);
if (nameLen > longestName)
longestName = nameLen;
++totalGeneCount;
}
sqlFreeResult(&sr);
/* Build hash to map between transcript names and protein IDs. */
if (!noProt)
{
protHash = newHash(16);
if (sangerLinks)
safef(query, sizeof(query), "select orfName,protName from sangerLinks");
else if (isNotEmpty(protAccQuery))
- safef(query, sizeof(query), protAccQuery);
+ safecpy(query, sizeof(query), protAccQuery);
else
safef(query, sizeof(query), "select name, proteinId from %s", geneTable);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
hashAdd(protHash, row[0], cloneString(row[1]));
sqlFreeResult(&sr);
}
slReverse(&gpList);
clusterList = makeCluster(gpList, hChromSize(database, chrom));
for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
{
struct hashEl *helList = hashElListHash(cluster->geneHash);
struct hashEl *hel;
struct genePred *cannonical = NULL;
char *protName;
int cannonicalSize = -1, size = 0;
++clusterId;
for (hel = helList; hel != NULL; hel = hel->next)
{
struct genePred *gp = hel->val;
fprintf(clusterFile, "%d\t%s\n", clusterId, gp->name);
size = genePredCodingBases(gp);
if (size > cannonicalSize)
{
cannonical = gp;
cannonicalSize = size;
}
}
protName = cannonical->name;
if (protHash != NULL)
{
char *newVal = hashFindVal(protHash, protName);
if (newVal != NULL)
protName = newVal;
}
fprintf(canFile, "%s\t%d\t%d\t%d\t%s\t%s\n",
chrom, cannonical->txStart, cannonical->txEnd, clusterId, cannonical->name,
protName);
++totalClusterCount;
}
genePredFreeList(&gpList);
freeHashAndVals(&protHash);
}
void hgClusterGenes(char *database, char *geneTable, char *clusterTable,
char *cannonicalTable)
/* hgClusterGenes - Cluster overlapping gene predictions. */
{
struct slName *chromList, *chrom;
FILE *clusterFile = hgCreateTabFile(".", clusterTable);
FILE *canFile = hgCreateTabFile(".", cannonicalTable);
struct sqlConnection *conn;
if (optionExists("chrom"))
chromList = slNameNew(optionVal("chrom", NULL));
else
chromList = hAllChromNames(database);
conn = hAllocConn(database);
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
{
clusterGenesOnStrand(database, conn, geneTable, chrom->name, '+',
clusterFile, canFile);
clusterGenesOnStrand(database, conn, geneTable, chrom->name, '-',
clusterFile, canFile);
}
createClusterTable(conn, clusterTable, longestName);
createCannonicalTable(conn, cannonicalTable, longestName);
verbose(1, "Loading database\n");
hgLoadTabFile(conn, ".", clusterTable, &clusterFile);
hgLoadTabFile(conn, ".", cannonicalTable, &canFile);
verbose(1, "Got %d clusters, from %d genes in %d chromosomes\n",
totalClusterCount, totalGeneCount, slCount(chromList));
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
noProt = optionExists("noProt");
sangerLinks = optionExists("sangerLinks");
protAccQuery = optionVal("protAccQuery", protAccQuery);
if (argc != 5)
usage();
hgClusterGenes(argv[1], argv[2], argv[3], argv[4]);
return 0;
}