src/hg/instinct/raToDb/raToDb.c 1.12
1.12 2010/05/26 21:27:23 cszeto
Added Microscope toggle support
Index: src/hg/instinct/raToDb/raToDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/raToDb/raToDb.c,v
retrieving revision 1.11
retrieving revision 1.12
diff -b -B -U 1000000 -r1.11 -r1.12
--- src/hg/instinct/raToDb/raToDb.c 23 Apr 2010 19:43:06 -0000 1.11
+++ src/hg/instinct/raToDb/raToDb.c 26 May 2010 21:27:23 -0000 1.12
@@ -1,695 +1,695 @@
/* raToDb - RA to database table converter. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "cheapcgi.h"
#include "localmem.h"
#include "dystring.h"
#include "obscure.h"
#include "hdb.h"
#include "sqlList.h"
#include "hPrint.h"
#include "hgConfig.h"
#include "ra.h"
#include "featuresLib.h"
#include "hgHeatmapLib.h"
#include "raDb.h"
/* Version-control identification string. NOTE(review): "$Id$" looks like an
 * RCS/CVS keyword placeholder; nothing in this file reads rcsid. */
static char const rcsid[] = "$Id$";
void usage()
/* Hand the command-line synopsis to errAbort(). */
{
static const char *synopsis =
    "raToDb - RA to database table converter RA to database converter\n"
    "usage:\n"
    " raToDb -alpha db file.ra\n"
    "\n"
    " db = database to put ra_username table\n"
    " file.ra = root ra file\n"
    "options:\n"
    " -alpha = Make alpha database table (raDb)\n";

/* The text contains no conversion specifiers, so routing it through "%s"
 * yields exactly the same message. */
errAbort("%s", synopsis);
}
/* Command-line options handed to optionInit() in main(); the {NULL, 0}
 * entry ends the table. */
static struct optionSpec options[] = {
{"alpha", OPTION_BOOLEAN},
{NULL, 0},
};
/* Set to TRUE in main() when -alpha is given; raToDb() then writes to the
 * table "raDb" instead of the name returned by raDbPath(). */
boolean alpha = FALSE;
/* Directory prepended to every .ra file name opened by readRaFile(). */
static char *rootDir = "../hgHeatmap2/hgHeatmapData";
/* Root directory passed to hgReadRa() by maGroupingsForRa(). */
static char *hgCgiDir = "../../makeDb/hgCgiData";
/* File name passed to hgReadRa() by maGroupingsForRa(). */
static char *maGroupsFile = "microarrayGroups.ra";
/* Connection profile used by raToDb() for the output table, and the default
 * value of the "profile" setting in validateRa(). */
static char *localDbProfile = "localDb";
/* Database in which validateRa() looks for each dataset's data table. */
static char *genomicDb = "hg18";
/********************* BEGIN HELPER FXNS ******************************/
void createRaDbTable(struct sqlConnection *conn, char *tableName)
/* Build a CREATE TABLE statement for tableName and run it on conn with
 * sqlUpdate(). The table has PRIMARY KEY(id) and KEY(name).
 * The column order matters: raDbSaveToDbWithNULL() inserts values
 * positionally ("insert into ... values (...)"), in this same order.
 * NOTE(review): the lines below that start with '+' or '-' are unified-diff
 * markers from the r1.11 -> r1.12 change (adds "microscope", removes
 * "local_url"); they are left exactly as found. */
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned,\n");
dyStringPrintf(dy, "name varchar(255),\n");
dyStringPrintf(dy, "accessTable varchar(255),\n");
dyStringPrintf(dy, "shortLabel varchar(255),\n");
dyStringPrintf(dy, "longLabel varchar(255),\n");
dyStringPrintf(dy, "expCount int unsigned,\n");
dyStringPrintf(dy, "height int unsigned,\n");
dyStringPrintf(dy, "groupName varchar(255),\n");
dyStringPrintf(dy, "raFile varchar(255),\n");
dyStringPrintf(dy, "patDb varchar(255),\n");
+dyStringPrintf(dy, "microscope varchar(255),\n");
dyStringPrintf(dy, "sampleField varchar(255),\n");
dyStringPrintf(dy, "patTable varchar(255),\n");
dyStringPrintf(dy, "patField varchar(255),\n");
dyStringPrintf(dy, "aliasTable varchar(255),\n");
dyStringPrintf(dy, "displayNameTable varchar(255),\n");
dyStringPrintf(dy, "dataType varchar(255),\n");
dyStringPrintf(dy, "platform varchar(255),\n");
dyStringPrintf(dy, "expScale float,\n");
dyStringPrintf(dy, "gainFull float,\n");
dyStringPrintf(dy, "gainSet float,\n");
dyStringPrintf(dy, "type varchar(255),\n");
dyStringPrintf(dy, "visibility varchar(255),\n");
dyStringPrintf(dy, "priority float,\n");
dyStringPrintf(dy, "url varchar(255),\n");
dyStringPrintf(dy, "security varchar(255),\n");
-dyStringPrintf(dy, "local_url varchar(255),\n");
dyStringPrintf(dy, "profile varchar(255),\n");
dyStringPrintf(dy, "wrangler varchar(255),\n");
dyStringPrintf(dy, "citation varchar(255),\n");
dyStringPrintf(dy, "article_title longblob,\n");
dyStringPrintf(dy, "author_list longblob,\n");
dyStringPrintf(dy, "wrangling_procedure longblob,\n");
dyStringPrintf(dy, "PRIMARY KEY(id),\n");
dyStringPrintf(dy, "KEY(name)\n");
dyStringPrintf(dy, ");");
sqlUpdate(conn, dy->string);
dyStringFree(&dy);
}
void dyAddStringNULL(struct dyString *dy, char *str, boolean addComma)
/* Append str to dy as a single-quoted SQL literal, or the bare word NULL
 * when str is NULL. A trailing comma follows when addComma is set.
 * str is written as-is; no escaping is done here. */
{
const char *separator = addComma ? "," : "";

if (str == NULL)
    dyStringPrintf(dy, "NULL%s", separator);
else
    dyStringPrintf(dy, "'%s'%s", str, separator);
}
void raDbSaveToDbWithNULL(struct sqlConnection *conn, struct raDb *el, char *tableName, int updateSize)
/* Save raDb as a row to the table specified by tableName.
 * Have to do it myself since AutoSQL doesn't properly handle NULL values.
 * Assumes already escaped strings.
 * String fields go through dyAddStringNULL(), so a NULL pointer is written
 * as SQL NULL. Numeric fields (id, expCount, height, expScale, gainFull,
 * gainSet, priority) are dereferenced unconditionally and must point at
 * valid storage. The insert is positional, so the value order here has to
 * match the column order in createRaDbTable(). updateSize is passed to
 * newDyString() for the statement buffer.
 * NOTE(review): the lines below that start with '+' or '-' are unified-diff
 * markers from the r1.11 -> r1.12 change (adds microscope, removes
 * local_url); they are left exactly as found. */
{
struct dyString *update = newDyString(updateSize);
dyStringPrintf(update, "insert into %s values (", tableName);
dyStringPrintf(update, "%u,", *(el->id));
dyAddStringNULL(update, el->name, TRUE);
dyAddStringNULL(update, el->accessTable, TRUE);
dyAddStringNULL(update, el->shortLabel, TRUE);
dyAddStringNULL(update, el->longLabel, TRUE);
dyStringPrintf(update, "%u,", *(el->expCount));
dyStringPrintf(update, "%u,", *(el->height));
dyAddStringNULL(update, el->groupName, TRUE);
dyAddStringNULL(update, el->raFile, TRUE);
dyAddStringNULL(update, el->patDb, TRUE);
+dyAddStringNULL(update, el->microscope, TRUE);
dyAddStringNULL(update, el->sampleField, TRUE);
dyAddStringNULL(update, el->patTable, TRUE);
dyAddStringNULL(update, el->patField, TRUE);
dyAddStringNULL(update, el->aliasTable, TRUE);
dyAddStringNULL(update, el->displayNameTable, TRUE);
dyAddStringNULL(update, el->dataType, TRUE);
dyAddStringNULL(update, el->platform, TRUE);
dyStringPrintf(update, "%g,", *(el->expScale));
dyStringPrintf(update, "%g,", *(el->gainFull));
dyStringPrintf(update, "%g,", *(el->gainSet));
dyAddStringNULL(update, el->type, TRUE);
dyAddStringNULL(update, el->visibility, TRUE);
dyStringPrintf(update, "%g,", *(el->priority));
dyAddStringNULL(update, el->url, TRUE);
dyAddStringNULL(update, el->security, TRUE);
-dyAddStringNULL(update, el->local_url, TRUE);
dyAddStringNULL(update, el->profile, TRUE);
dyAddStringNULL(update, el->wrangler, TRUE);
dyAddStringNULL(update, el->citation, TRUE);
dyAddStringNULL(update, el->article_title, TRUE);
dyAddStringNULL(update, el->author_list, TRUE);
/* Last value: no trailing comma before the closing parenthesis. */
dyAddStringNULL(update, el->wrangling_procedure, FALSE);
dyStringPrintf(update, ")");
sqlUpdate(conn, update->string);
freeDyString(&update);
}
struct slName *getRaIncludes(char *raFile)
/* Scan raFile for lines that, after leading white space, start with
 * "#include" or "include", and return the quoted word following each such
 * directive as an slName list. Entries are pushed on the head of the list,
 * so the result is in reverse file order. */
{
struct slName *includes = NULL;
struct lineFile *lf = lineFileOpen(raFile, TRUE);
char *text;

while (lineFileNext(lf, &text, NULL))
    {
    text = skipLeadingSpaces(text);
    if (!startsWith("#include", text) && !startsWith("include", text))
        continue;
    nextWord(&text);                    /* step over the directive word */
    char *target = nextQuotedWord(&text);
    struct slName *entry = slNameNew(target);
    slAddHead(&includes, entry);
    }
lineFileClose(&lf);
return includes;
}
static void foldRaFileOntoList(char *fileName, struct hash **pRaList)
/* Fold fileName with raFoldIn() and push each resulting stanza hash on the
 * head of *pRaList. */
{
struct hash *hashOfHash = newHash(10);
struct hashEl *helList, *hel;

raFoldIn(fileName, hashOfHash);
helList = hashElListHash(hashOfHash);
for (hel = helList; hel != NULL; hel = hel->next)
    {
    struct hash *ra = hel->val;
    slAddHead(pRaList, ra);
    hel->val = NULL;    /* kept from the original; presumably detaches the stanza from the element */
    }
hashElFreeList(&helList);
hashFree(&hashOfHash);
}

struct hash *readRaFile(char *rootName)
/* Read the .ra file rootDir/rootName plus every file it names on an
 * include line (as reported by getRaIncludes(); each is also resolved
 * relative to rootDir). Returns the stanzas as a list of hashes linked
 * through their next field, or NULL when no stanza was found. */
{
struct hash *raList = NULL;
char fileName[HDB_MAX_PATH_STRING];

safef(fileName, sizeof(fileName), "%s/%s", rootDir, rootName);
struct slName *sl, *slList = getRaIncludes(fileName);
foldRaFileOntoList(fileName, &raList);

for (sl = slList; sl; sl = sl->next)
    {
    safef(fileName, sizeof(fileName), "%s/%s", rootDir, sl->name);
    foldRaFileOntoList(fileName, &raList);
    }

/* The include names are only needed while reading; release them. */
slNameFreeList(&slList);
return raList;
}
static int countCommaItems(char *commaList)
/* Return the number of items slNameListFromComma() finds in commaList. */
{
struct slName *items = slNameListFromComma(commaList);
int count = slCount(items);
slNameFreeList(&items);
return count;
}

struct microarrayGroups *maGroupingsForRa(char *database, char *table)
/* Get the settings from the microarrayGroups.ra files and put them in a
 * convenient struct.
 * Looks up the stanza named "<table>Groups", follows its "all" setting to
 * the grouping stanza, and checks that expIds, names and groupSizes are
 * non-empty and that names and groupSizes have the same number of items.
 * Returns a newly allocated microarrayGroups with allArrays filled in, or
 * NULL when any lookup or check fails. The stanzas read from disk are freed
 * before returning on every path. */
{
struct microarrayGroups *ret = NULL;
struct hash *allGroups = NULL;
struct hash *hashList = NULL;
struct hash *mainGroup, *tmpGroup;
char *s;
int numExpIds, numNames, numGroups;
char groupings[strlen(table)+strlen("Groups")+1];

safef(groupings, sizeof(groupings), "%s%s", table, "Groups");

hashList = hgReadRa(hGenome(database), database, hgCgiDir,
                    maGroupsFile, &allGroups);
if (!allGroups)
    goto cleanup;

mainGroup = (struct hash *)(hashFindVal(allGroups, groupings));
if (!mainGroup)
    {
    printf("maGroups: %s not found\n", groupings);
    goto cleanup;
    }

s = (char *)(hashFindVal(mainGroup, "all"));
if (!s)
    goto cleanup;

tmpGroup = (struct hash *)(hashFindVal(allGroups, s));
if (!tmpGroup)
    goto cleanup;

numExpIds = countCommaItems(hashMustFindVal(tmpGroup, "expIds"));
if (numExpIds == 0)
    {
    printf("maGroups: expIds is empty\n");
    goto cleanup;
    }

numNames = countCommaItems(hashMustFindVal(tmpGroup, "names"));
if (numNames == 0)
    {
    printf("maGroups: names is empty\n");
    goto cleanup;
    }

numGroups = countCommaItems(hashMustFindVal(tmpGroup, "groupSizes"));
if (numGroups == 0)
    {
    printf("maGroups: groupSizes is empty\n");
    goto cleanup;
    }

if (numGroups != numNames)
    {
    printf("maGroups: Bad format (groupSizes size "
           "= %d, != to names size = %d)\n", numGroups, numNames);
    goto cleanup;
    }

AllocVar(ret);
ret->allArrays = maHashToMaGrouping(tmpGroup);

cleanup:
hashFreeList(&hashList);
return ret;
}
/********************* END HELPER FXNS ******************************/
char *getOptionalString(struct hash *hash, char *key, char *usual)
/* Look key up in hash, falling back to usual, and return the result of
 * sqlEscapeString() on it. Returns NULL when the lookup (including the
 * fallback) gives NULL. */
{
char *raw = (char *)(hashOptionalVal(hash, key, usual));

if (raw == NULL)
    return NULL;
return sqlEscapeString(raw);
}
char *mustGetString(struct hash *hash, char *key)
/* Fetch key from hash with hashMustFindVal() and return the result of
 * sqlEscapeString() on it. */
{
char *raw = (char *)(hashMustFindVal(hash, key));
return sqlEscapeString(raw);
}
struct raDb *initRA()
/* Allocate a raDb record with every string field set to NULL.
 * The numeric fields (id, expCount, height, expScale, gainFull, gainSet,
 * priority) are pointers in struct raDb, so each gets its own AllocA()
 * storage here; callers write through them with *(ra->field).
 * NOTE(review): whether AllocA() zero-fills is not visible in this file --
 * confirm before relying on the initial numeric values.
 * NOTE(review): the lines below that start with '+' or '-' are unified-diff
 * markers from the r1.11 -> r1.12 change (adds microscope, removes
 * local_url); they are left exactly as found. */
{
struct raDb *ra = AllocA(struct raDb);
ra->next = NULL;
ra->id = AllocA(uint);
ra->name = NULL;
ra->profile = NULL;
ra->dataType = NULL;
ra->expCount = AllocA(uint);
ra->height = AllocA(uint);
ra->accessTable = NULL;
ra->aliasTable = NULL;
ra->displayNameTable = NULL;
ra->shortLabel = NULL;
ra->longLabel = NULL;
-ra->local_url = NULL;
ra->groupName = NULL;
ra->raFile = NULL;
ra->patDb = NULL;
+ra->microscope = NULL;
ra->patTable = NULL;
ra->patField = NULL;
ra->sampleField = NULL;
ra->platform = NULL;
ra->expScale = AllocA(float);
ra->gainFull = AllocA(float);
ra->gainSet = AllocA(float);
ra->security = NULL;
ra->wrangler = NULL;
ra->citation = NULL;
ra->article_title = NULL;
ra->author_list = NULL;
ra->wrangling_procedure = NULL;
ra->type = NULL;
ra->visibility = NULL;
ra->priority = AllocA(float);
ra->url = NULL;
return ra;
}
int checkSampleCount(struct sqlConnection *conn, struct raDb *ra)
/* Compare the expCount value read from the data table ra->name (the value
 * from a "limit 1" query) with *(ra->expCount). Returns 1 when they agree;
 * otherwise prints a FAIL line and returns 0. */
{
char query[256];

safef(query, sizeof(query), "select expCount from %s limit 1", ra->name);
int count = sqlQuickNum(conn, query);

if (count == *(ra->expCount))
    return 1;

printf("* FAIL\t%s\tmaGroups count != count in data table (%d != %d)\n",
       ra->name, count, *(ra->expCount));
return 0;
}
int checkClinicalData(struct raDb *ra)
/* Validate the clinical settings of ra (patDb, patTable, patField,
 * sampleField, raFile) against the clinical database.
 * Returns 1 when the dataset can be kept and 0 when a "* FAIL" line was
 * printed. A missing patDb only produces a warning and returns 1.
 * Once patDb is set, patTable, patField, sampleField and raFile are all
 * required; a missing one is reported as a failure.
 * Checks performed:
 *   - the clinical db can be opened and patTable exists in it;
 *   - patField and sampleField are columns of patTable;
 *   - for every stanza of raFile whose type starts with "lookup", the
 *     referenced table and value field exist (warnings only) and the
 *     key/value query returns a row (failure otherwise).
 * All lists and the connection are released before returning. */
{
struct sqlConnection *conn = NULL;
struct slName *sl, *patFields = NULL, *missing = NULL;
struct hash *raHash, *raHashList = NULL;
boolean pfExists = FALSE;
boolean sfExists = FALSE;
char query[512];
int ok = 0;

if (!ra->patDb)
    {
    printf(" WARN\t%s\tMissing clinical db in RA file.\n", ra->name);
    return 1;
    }

conn = sqlMayConnectProfile(ra->profile, ra->patDb);
if (!conn)
    {
    printf("* FAIL\t%s\tClinical db '%s' doesn't exist.\n", ra->name, ra->patDb);
    return 0;
    }

/* Everything below dereferences these settings, so refuse to go on
 * when one of them was left out of the RA stanza. */
if (!ra->patTable || !ra->patField || !ra->sampleField || !ra->raFile)
    {
    printf("* FAIL\t%s\tpatTable, patField, sampleField and raFile are all "
           "required when patDb is set.\n", ra->name);
    goto cleanup;
    }

if (!sqlTableExists(conn, ra->patTable))
    {
    printf("* FAIL\t%s\tClinical table '%s' doesn't exist.\n", ra->name, ra->patTable);
    goto cleanup;
    }

patFields = sqlFieldNames(conn, ra->patTable);
for (sl = patFields; sl; sl = sl->next)
    {
    if (sameWord(sl->name, ra->patField))
        pfExists = TRUE;
    if (sameWord(sl->name, ra->sampleField))
        sfExists = TRUE;
    }
if (!pfExists)
    {
    printf("* FAIL\t%s\tpatField '%s' not in table '%s'.\n",
           ra->name, ra->patField, ra->patTable);
    goto cleanup;
    }
if (!sfExists)
    {
    printf("* FAIL\t%s\tsampleField '%s' not in table '%s'.\n",
           ra->name, ra->sampleField, ra->patTable);
    goto cleanup;
    }

raHashList = readRaFile(ra->raFile);
for (raHash = raHashList; raHash; raHash = raHash->next)
    {
    char *name = hashMustFindVal(raHash, "name");
    char *queryType = (char *)(hashOptionalVal(raHash, "type", NULL));
    if (!queryType)
        {
        printf(" WARN\t%s\tField 'type' is missing for clinical field '%s' in %s.\n",
               ra->name, name, ra->raFile);
        continue;
        }

    /* The words returned by nextWord() point into the stanza's own string,
     * which stays alive until raHashList is freed below, so they are used
     * directly instead of being cloned (the clones were never freed). */
    char *type = nextWord(&queryType);
    if (!type || !sameString(type, "lookup"))
        continue; // only checking lookup columns

    char *table = nextWord(&queryType);
    char *keyField = nextWord(&queryType);
    char *valField = nextWord(&queryType);
    if (!table || !keyField || !valField)
        {
        printf(" WARN\t%s\tIncomplete lookup type for clinical field '%s' in %s.\n",
               ra->name, name, ra->raFile);
        continue;
        }

    if (slNameInList(missing, table))
        continue;   /* already reported as missing */
    if (!sqlTableExists(conn, table))
        {
        printf(" WARN\t%s\tTable '%s' does not exist in clinical db '%s'.\n",
               ra->name, table, ra->patDb);
        slNameAddHead(&missing, table);
        continue;
        }

    struct slName *fields = sqlFieldNames(conn, table);
    boolean hasValField = slNameInList(fields, valField);
    slNameFreeList(&fields);
    if (!hasValField)
        {
        printf(" WARN\t%s\tTable '%s' does not contain field '%s'.\n",
               ra->name, table, valField);
        continue;
        }

    safef(query, sizeof(query), "select %s,%s from %s limit 1",
          keyField, valField, table);
    if (!sqlExists(conn, query))
        {
        printf("* FAIL\t%s\tvalField '%s' for feature '%s' doesn't exist.\n",
               ra->name, valField, name);
        goto cleanup;
        }
    }
ok = 1;

cleanup:
slNameFreeList(&patFields);
slNameFreeList(&missing);
hashFreeList(&raHashList);
sqlDisconnect(&conn);
return ok;
}
struct raDb *validateRa(struct hash *raHash)
/* Turn one RA stanza (raHash) into a raDb record, checking it against the
 * databases on the way. Prints one " pass" or "* FAIL" line per dataset
 * (plus " WARN" lines for recoverable problems) and returns the filled-in
 * record, or NULL when validation failed.
 * String settings are fetched through mustGetString()/getOptionalString()
 * and are therefore passed through sqlEscapeString().
 * NOTE(review): every "return NULL" path below leaves the record from
 * initRA() and the strings already stored in it unfreed.
 * NOTE(review): the lines below that start with '+' or '-' are unified-diff
 * markers from the r1.11 -> r1.12 change (adds microscope, removes
 * local_url); they are left exactly as found. */
{
struct raDb *ra = initRA();
ra->name = mustGetString(raHash, "name");
ra->profile = getOptionalString(raHash, "profile", localDbProfile);
/* The data table named after the dataset must exist in genomicDb. */
struct sqlConnection *conn = hAllocConnProfile(ra->profile, genomicDb);
if (!sqlTableExists(conn, ra->name))
{
printf("* FAIL\t%s\tData table not in hg18 db (%s).\n", ra->name, ra->profile);
hFreeConn(&conn); // No longer need connection
return NULL;
}
ra->dataType = mustGetString(raHash, "dataType");
*(ra->expCount) = 0;
/* microarray specific settings*/
if (sameWord(ra->dataType, "bed 15"))
{
struct microarrayGroups *maGs = maGroupingsForRa(genomicDb, ra->name);
if (!maGs)
{
printf("* FAIL\t%s\tBad maGroups.\n", ra->name);
hFreeConn(&conn); // No longer need connection
return NULL;
}
/* The sample count comes from the size of the "all arrays" grouping and is
 * cross-checked against the data table by checkSampleCount(). */
struct maGrouping *allA = maGs->allArrays;
*(ra->expCount) = allA->size;
if (!checkSampleCount(conn, ra))
{
hFreeConn(&conn); // No longer need connection
return NULL;
}
}
/* accessTable defaults to the dataset name; a table that does not exist is
 * only warned about and replaced by the dataset name.
 * NOTE(review): ra->name is already escaped, so when it is used as the
 * default here it goes through sqlEscapeString() a second time. */
ra->accessTable = getOptionalString(raHash, "accessTable", ra->name);
if (!sqlTableExists(conn, ra->accessTable))
{
printf(" WARN\t%s\taccessTable '%s' not in db.\n", ra->name, ra->accessTable);
ra->accessTable = cloneString(ra->name);
}
/* aliasTable and displayNameTable are optional; a named table that does not
 * exist is warned about and the setting is dropped (set back to NULL). */
ra->aliasTable = getOptionalString(raHash, "aliasTable", NULL);
if (ra->aliasTable && !sqlTableExists(conn, ra->aliasTable))
{
printf(" WARN\t%s\taliasTable '%s' not in db.\n", ra->name, ra->aliasTable);
ra->aliasTable = NULL;
}
ra->displayNameTable = getOptionalString(raHash, "displayNameTable", NULL);
if (ra->displayNameTable && !sqlTableExists(conn, ra->displayNameTable))
{
printf(" WARN\t%s\tdisplayNameTable '%s' not in db.\n", ra->name, ra->displayNameTable);
ra->displayNameTable = NULL;
}
hFreeConn(&conn); // No longer need connection
ra->shortLabel = mustGetString(raHash, "shortLabel");
ra->longLabel = mustGetString(raHash, "longLabel");
-ra->local_url = getOptionalString(raHash, "local_url", NULL);
/* The RA key is "group"; the struct field is groupName. */
ra->groupName = getOptionalString(raHash, "group", NULL);
/* Settings on patient information from datasets.ra file */
ra->raFile = getOptionalString(raHash, "raFile", NULL);
ra->patDb = getOptionalString(raHash, "patDb", NULL);
ra->patTable = getOptionalString(raHash, "patTable", NULL);
ra->patField = getOptionalString(raHash, "patField", NULL);
ra->sampleField = getOptionalString(raHash, "sampleField", NULL);
/* checkClinicalData() opens its own connection to the clinical db. */
if (!checkClinicalData(ra))
return NULL;
/* Platform setting, currently defaults to expression */
ra->platform = getOptionalString(raHash, "platform", "expression");
+ra->microscope = getOptionalString(raHash, "microscope", NULL);
ra->type = getOptionalString(raHash, "type", NULL);
ra->url = getOptionalString(raHash, "url", NULL);
ra->visibility = getOptionalString(raHash, "visibility", "off");
ra->security = getOptionalString(raHash, "security", "private");
ra->wrangler = getOptionalString(raHash, "wrangler", NULL);
ra->citation = getOptionalString(raHash, "citation", NULL);
ra->article_title = getOptionalString(raHash, "article_title", NULL);
ra->author_list = getOptionalString(raHash, "author_list", NULL);
ra->wrangling_procedure = getOptionalString(raHash, "wrangling_procedure", NULL);
/* Settings on graphic information from datasets.ra file */
/* Defaults when a key is absent: expScale 0.0, gainFull 1.0, gainSet 1.0,
 * priority 10000.0, height 0. Values are converted with atof()/atoi(), so
 * malformed numbers are not detected here. */
if (hashFindVal(raHash, "expScale"))
*(ra->expScale) = (float) atof(((char *) hashFindVal(raHash, "expScale")));
else
*(ra->expScale) = (float) 0.0;
if (hashFindVal(raHash, "gainFull"))
*(ra->gainFull) = atof(((char *) hashFindVal(raHash, "gainFull")));
else
*(ra->gainFull) = 1.0;
if (hashFindVal(raHash, "gainSet"))
*(ra->gainSet) = atof(((char *) hashFindVal(raHash, "gainSet")));
else
*(ra->gainSet) = 1.0;
if (hashFindVal(raHash, "priority"))
*(ra->priority) = atof(((char *) hashFindVal(raHash, "priority")));
else
*(ra->priority) = 10000.0; // default, put at bottom.
if (hashFindVal(raHash, "height"))
*(ra->height) = atoi(((char *) hashFindVal(raHash, "height")));
else
*(ra->height) = 0;
printf(" pass\t%s\n", ra->name);
return ra;
}
void putDataToTable(struct sqlConnection *conn, char *tableName, struct raDb *raList)
/* Write every record of raList to tableName on conn. An existing table of
 * that name is dropped and recreated first. Records are numbered from 0 in
 * list order; the number is stored in *(ra->id) before the row is saved.
 * Does nothing but print a message when raList is empty. */
{
if (raList == NULL)
    {
    printf("nothing to put in database.\n");
    return;
    }

if (sqlTableExists(conn, tableName))
    {
    printf("Table %s already exists in database, dropping...\n", tableName);
    sqlDropTable(conn, tableName);
    }
createRaDbTable(conn, tableName);
printf("writing raDb table to %s to database.\n", tableName);

uint nextId = 0;
struct raDb *cur = raList;
while (cur != NULL)
    {
    *(cur->id) = nextId;
    raDbSaveToDbWithNULL(conn, cur, tableName, 1024);
    nextId++;
    cur = cur->next;
    }
}
int raDbCmpPriority(const void *va, const void *vb)
/* Comparison callback ordering raDb records by ascending priority.
 * va and vb each point at a (struct raDb *). Returns -1, 0 or 1. */
{
const struct raDb *first = *((struct raDb **)va);
const struct raDb *second = *((struct raDb **)vb);
float delta = *(first->priority) - *(second->priority);

return (delta > 0) - (delta < 0);
}
void raToDb(char *db, char *raName)
/* raToDb - RA to database table converter.
 * Reads the stanzas of raName (and its includes) with readRaFile(),
 * validates each one with validateRa(), sorts the survivors by priority and
 * writes them to a table in db: "raDb" when the -alpha option was given,
 * otherwise the name returned by raDbPath(). Ends by printing how many
 * datasets passed and failed. Aborts when the RA file yields no stanzas. */
{
struct hash *raHash, *raHashList = readRaFile(raName);
if (raHashList == NULL)
    errAbort("Couldn't find anything from %s", raName);

int numPass = 0, numFail = 0;
struct raDb *ra, *raList = NULL;
for (raHash = raHashList; raHash; raHash = raHash->next)
    {
    if ((ra = validateRa(raHash)) == NULL)
        {
        numFail += 1;
        continue;
        }
    slAddHead(&raList, ra);
    numPass += 1;
    }

/* slSort() wants the address of the list head so it can store the new
 * first element. Passing the head itself left the first record unsorted
 * and dereferenced NULL when every dataset had failed validation. */
slSort(&raList, raDbCmpPriority);

struct sqlConnection *conn = hAllocConnProfile(localDbProfile, db);
char *raDbName;
if (alpha)
    raDbName = cloneString("raDb");
else
    raDbName = raDbPath();
putDataToTable(conn, raDbName, raList);

printf("\n");
printf("raToDb Validation:\n");
printf("\t%d passed\n\t%d failed\n", numPass, numFail);
printf("\n");
hFreeConn(&conn);
}
int main(int argc, char *argv[])
/* Process command line: expects exactly two positional arguments (database
 * and root .ra file) once optionInit() has processed the options. */
{
optionInit(&argc, argv, options);
if (argc != 3)
    usage();

char *db = argv[1];
char *raName = argv[2];

if (optionExists("alpha"))
    alpha = TRUE;

raToDb(db, raName);
return 0;
}