src/hg/instinct/raToDb/raToDb.c 1.3

1.3 2010/04/09 22:58:36 jsanborn
updated
Index: src/hg/instinct/raToDb/raToDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/raToDb/raToDb.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/instinct/raToDb/raToDb.c	9 Apr 2010 19:31:15 -0000	1.2
+++ src/hg/instinct/raToDb/raToDb.c	9 Apr 2010 22:58:36 -0000	1.3
@@ -1,53 +1,478 @@
 /* raToDb - RA to database table converter RA to database converter. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
+#include "cheapcgi.h"
+#include "localmem.h"
+#include "dystring.h"
+#include "obscure.h"
+#include "hdb.h"
+#include "sqlList.h"
+#include "hPrint.h"
 #include "ra.h"
 #include "hgHeatmapLib.h"
 #include "raDb.h"
 
 static char const rcsid[] = "$Id$";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "raToDb - RA to database table converter RA to database converter\n"
   "usage:\n"
-  "   raToDb db file.ra\n"
+  "   raToDb db tableName file.ra\n" /* NOTE(review): tableName has no description line in the argument list below */
   "\n"
   "    db = database to put ra_username table\n"
   "    file.ra = root ra file\n"
   );
 }
 
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
-void raToDb(char *db, char *raName)
+static char *rootDir        = "../hgHeatmap2/hgHeatmapData"; /* directory prefixed to every .ra file name in readRaFile() */
+static char *hgCgiDir       = "../../makeDb/hgCgiData"; /* directory argument passed to hgReadRa() in maGroupingsForRa() */
+static char *maGroupsFile   = "microarrayGroups.ra"; /* file name argument passed to hgReadRa() in maGroupingsForRa() */
+static char *localDbProfile = "localDb"; /* default "profile" value in validateRa(); also the profile used for the output connection in raToDb() */
+static char *genomicDb      = "hg18"; /* database validateRa() connects to when checking that tables exist */
+
+/*********************  BEGIN HELPER FXNS ******************************/
+
+void createRaDbTable(struct sqlConnection *conn, char *tableName) /* Build a CREATE TABLE statement for tableName and run it on conn. */
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName); /* tableName is formatted into the SQL text as-is */
+dyStringPrintf(dy, "id int unsigned,\n"); /* columns appear in the same order raDbSaveToDbWithNULL() writes its positional values */
+dyStringPrintf(dy, "name varchar(255),\n");
+dyStringPrintf(dy, "accessTable varchar(255),\n");
+dyStringPrintf(dy, "shortLabel varchar(255),\n");
+dyStringPrintf(dy, "longLabel varchar(255),\n");
+dyStringPrintf(dy, "expCount int unsigned,\n");
+dyStringPrintf(dy, "groupName varchar(255),\n");
+dyStringPrintf(dy, "raFile varchar(255),\n");
+dyStringPrintf(dy, "patDb varchar(255),\n");
+dyStringPrintf(dy, "sampleField varchar(255),\n");
+dyStringPrintf(dy, "patTable varchar(255),\n");
+dyStringPrintf(dy, "patField varchar(255),\n");
+dyStringPrintf(dy, "aliasTable varchar(255),\n");
+dyStringPrintf(dy, "displayNameTable varchar(255),\n");
+dyStringPrintf(dy, "dataType varchar(255),\n");
+dyStringPrintf(dy, "platform varchar(255),\n");
+dyStringPrintf(dy, "expScale float,\n");
+dyStringPrintf(dy, "gainFull float,\n");
+dyStringPrintf(dy, "gainSet float,\n");
+dyStringPrintf(dy, "type varchar(255),\n");
+dyStringPrintf(dy, "visibility varchar(255),\n");
+dyStringPrintf(dy, "priority float,\n");
+dyStringPrintf(dy, "url varchar(255),\n");
+dyStringPrintf(dy, "security varchar(255),\n");
+dyStringPrintf(dy, "local_url varchar(255),\n");
+dyStringPrintf(dy, "profile varchar(255),\n");
+dyStringPrintf(dy, "wrangler varchar(255),\n");
+dyStringPrintf(dy, "citation varchar(255),\n");
+dyStringPrintf(dy, "article_title longblob,\n"); /* the last three text columns are longblob rather than varchar(255) */
+dyStringPrintf(dy, "author_list longblob,\n");
+dyStringPrintf(dy, "wrangling_procedure longblob,\n");
+dyStringPrintf(dy, "PRIMARY KEY(id),\n"); 
+dyStringPrintf(dy, "KEY(name)\n");
+dyStringPrintf(dy, ");");
+sqlUpdate(conn, dy->string);
+dyStringFree(&dy);
+}
+
+void dyAddStringNULL(struct dyString *dy, char *str, boolean addComma) /* Append str to dy as a single-quoted SQL value, or the word NULL when str is NULL; then a comma if addComma. */
+{
+if (str)
+    dyStringPrintf(dy, "'%s'", str); /* no escaping is done here: str goes between the quotes verbatim */
+else
+    dyStringPrintf(dy, "NULL"); /* unquoted, so the statement carries SQL NULL rather than the text 'NULL' */
+
+if (addComma)
+    dyStringPrintf(dy, ",");
+}
+
+void raDbSaveToDbWithNULL(struct sqlConnection *conn, struct raDb *el, char *tableName, int updateSize)
+/* Insert el as one row of tableName; string fields that are NULL are written as SQL NULL.
+ * Have to do it myself since AutoSQL doesn't properly handle NULL values.
+ * Assumes string fields are already escaped; updateSize is passed to newDyString(). */
+{
+struct dyString *update = newDyString(updateSize);
+dyStringPrintf(update, "insert into %s values (", tableName); /* positional insert: value order below must match the table's column order */
+dyStringPrintf(update, "%u,", *(el->id)); /* numeric fields are held by pointer and dereferenced without a NULL check */
+dyAddStringNULL(update, el->name, TRUE);
+dyAddStringNULL(update, el->accessTable, TRUE);
+dyAddStringNULL(update, el->shortLabel, TRUE);
+dyAddStringNULL(update, el->longLabel, TRUE);
+dyStringPrintf(update, "%u,", *(el->expCount));
+dyAddStringNULL(update, el->groupName, TRUE);
+dyAddStringNULL(update, el->raFile, TRUE);
+dyAddStringNULL(update, el->patDb, TRUE);
+dyAddStringNULL(update, el->sampleField, TRUE);
+dyAddStringNULL(update, el->patTable, TRUE);
+dyAddStringNULL(update, el->patField, TRUE);
+dyAddStringNULL(update, el->aliasTable, TRUE);
+dyAddStringNULL(update, el->displayNameTable, TRUE);
+dyAddStringNULL(update, el->dataType, TRUE);
+dyAddStringNULL(update, el->platform, TRUE);
+dyStringPrintf(update, "%g,", *(el->expScale));
+dyStringPrintf(update, "%g,", *(el->gainFull));
+dyStringPrintf(update, "%g,", *(el->gainSet));
+dyAddStringNULL(update, el->type, TRUE);
+dyAddStringNULL(update, el->visibility, TRUE);
+dyStringPrintf(update, "%g,", *(el->priority));
+dyAddStringNULL(update, el->url, TRUE);
+dyAddStringNULL(update, el->security, TRUE);
+dyAddStringNULL(update, el->local_url, TRUE);
+dyAddStringNULL(update, el->profile, TRUE);
+dyAddStringNULL(update, el->wrangler, TRUE);
+dyAddStringNULL(update, el->citation, TRUE);
+dyAddStringNULL(update, el->article_title, TRUE);
+dyAddStringNULL(update, el->author_list, TRUE);
+dyAddStringNULL(update, el->wrangling_procedure, FALSE); /* last value: no trailing comma */
+dyStringPrintf(update, ")");
+sqlUpdate(conn, update->string);
+freeDyString(&update);
+}
+
+
+
+struct slName *getRaIncludes(char *raFile) /* Scan raFile and return the names found on its "#include" / "include" lines as an slName list. */
+{
+struct lineFile *lf = lineFileOpen(raFile, TRUE);
+char *line, *file;
+
+struct slName *sl, *slList = NULL;
+
+for (;;)
+    {
+    if (!lineFileNext(lf, &line, NULL))
+	break;
+
+    line = skipLeadingSpaces(line);
+
+    if (startsWith("#include", line) || startsWith("include", line)) /* prefix match only: any line beginning with these letters qualifies */
+	{
+	nextWord(&line); /* result discarded; presumably this steps line past the include keyword */
+	file = nextQuotedWord(&line); /* NOTE(review): result is not checked before slNameNew() - confirm it cannot be NULL here */
+	sl = slNameNew(file);
+	slAddHead(&slList, sl); /* head insertion: list ends up in reverse order of appearance in the file */
+	}
+    }
+
+lineFileClose(&lf);
+
+return slList;
+}
+
+struct hash *readRaFile(char *rootName)
+/* Read rootDir/rootName plus every file it includes (see getRaIncludes()) with raFoldIn(); return the collected hashes as a list. */
+{
+struct hash *hashOfHash = newHash(10);
+struct hashEl *helList, *hel;
+struct hash *raList = NULL, *ra;
+
+char fileName[HDB_MAX_PATH_STRING];
+safef(fileName, sizeof(fileName), "%s/%s", rootDir, rootName);
+struct slName *sl, *slList = getRaIncludes(fileName); /* only the root file is scanned for include lines */
+
+raFoldIn(fileName, hashOfHash);
+for (sl = slList; sl; sl = sl->next)
+    {
+    safef(fileName, sizeof(fileName), "%s/%s", rootDir, sl->name); /* included names are resolved against rootDir, like the root file */
+    raFoldIn(fileName, hashOfHash);
+    }
+
+/* Create list. */
+helList = hashElListHash(hashOfHash);
+for (hel = helList; hel != NULL; hel = hel->next)
+    {
+    ra = hel->val;
+    slAddHead(&raList, ra);
+    hel->val = NULL; /* presumably detaches the value so freeing helList leaves the ra hashes alone - TODO confirm */
+    }
+hashElFreeList(&helList);
+return raList; /* NOTE(review): slList and hashOfHash are not freed in this function */
+}
+
+struct microarrayGroups *maGroupingsForRa(char *database, char *table)
+/* Get the settings for table from the microarrayGroups.ra file and put them in a convenient struct; returns NULL if any lookup fails. */
+{
+struct microarrayGroups *ret;
+char groupings[strlen(table)+strlen("Groups")+1]; /* sized for table + "Groups" + terminating NUL */
+safef(groupings, sizeof(groupings), "%s%s", table,"Groups");
+if (!groupings) /* NOTE(review): groupings is an array, so this test is never true */
+    return NULL;
+
+struct hash *allGroups;
+struct hash *hashList = hgReadRa(hGenome(database), database, hgCgiDir,
+				 maGroupsFile, &allGroups);
+if (!allGroups)
+    return NULL;
+
+struct hash *mainGroup = (struct hash *)(hashFindVal(allGroups, groupings)); /* entry keyed "<table>Groups" */
+if (!mainGroup)
+    {
+    printf("%s not found\n", groupings);
+    return NULL;
+    }
+
+char *s = (char *)(hashFindVal(mainGroup, "all")); /* the "all" setting holds the key of another entry in allGroups */
+if (!s)
+    return NULL;
+
+struct hash *tmpGroup = (struct hash *)(hashFindVal(allGroups, s));
+if (!tmpGroup)
+    return NULL;
+
+AllocVar(ret);
+ret->allArrays = maHashToMaGrouping(tmpGroup); /* only allArrays is assigned here */
+
+hashFreeList(&hashList); /* NOTE(review): not reached on the early NULL returns above */
+return ret;
+}
+
+/*********************  END HELPER FXNS ******************************/
+
+char *getOptionalString(struct hash *hash, char *key, char *usual) /* Look up key with hashOptionalVal() (usual is the fallback argument) and return the sqlEscapeString() result, or NULL if the lookup gave NULL. */
+{
+char *ret = (char *)(hashOptionalVal(hash, key, usual));
+
+if (ret) /* a NULL lookup result is returned as-is, without calling sqlEscapeString() */
+    ret = sqlEscapeString(ret);
+
+return ret;
+}
+
+char *mustGetString(struct hash *hash, char *key) /* Look up key with hashMustFindVal() and return the sqlEscapeString() result for its value. */
+{
+char *ret = (char *)(hashMustFindVal(hash, key)); /* presumably aborts when key is missing - TODO confirm against hash.h */
+ret = sqlEscapeString(ret);
+return ret;
+}
+
+
+int checkSampleCount(struct sqlConnection *conn, struct raDb *ra) /* Placeholder check: conn and ra are unused and the result is always 1. */
+{
+return 1;
+}
+
+struct raDb *initRA() /* Allocate a raDb with every string field set to NULL and each numeric field pointing at its own AllocA() storage. */
+{
+struct raDb *ra = AllocA(struct raDb);
+ra->next = NULL;
+ra->id   = AllocA(uint); /* numeric fields (id, expCount, expScale, gainFull, gainSet, priority) are pointers, so each gets its own allocation */
+ra->name = NULL;
+ra->profile = NULL;
+ra->dataType = NULL;
+ra->expCount = AllocA(uint);
+ra->accessTable = NULL;
+ra->aliasTable  = NULL;
+ra->displayNameTable = NULL;
+ra->shortLabel  = NULL;
+ra->longLabel   = NULL;
+ra->local_url   = NULL;
+ra->groupName   = NULL;
+ra->raFile      = NULL;
+ra->patDb       = NULL;
+ra->patTable    = NULL;
+ra->patField    = NULL;
+ra->sampleField = NULL;
+ra->platform    = NULL;
+ra->expScale    = AllocA(float);
+ra->gainFull    = AllocA(float);
+ra->gainSet     = AllocA(float);
+ra->security    = NULL;
+ra->wrangler    = NULL;
+ra->citation    = NULL;
+ra->article_title = NULL;
+ra->author_list = NULL;
+ra->wrangling_procedure = NULL;
+ra->type        = NULL;
+ra->visibility  = NULL;
+ra->priority    = AllocA(float);
+ra->url         = NULL;
+
+return ra; /* numeric values themselves are assigned by the caller (see validateRa() and putDataToTable()) */
+}
+
+struct raDb *validateRa(struct hash *raHash) /* Fill a new raDb from raHash, checking that referenced tables exist; prints PASS/FAIL/WARN lines and returns NULL on failure. */
+{
+struct raDb *ra = initRA();
+
+ra->name = mustGetString(raHash, "name"); /* this is the sqlEscapeString() form of the name, also used for the table checks below */
+ra->profile = getOptionalString(raHash, "profile", localDbProfile);
+
+struct sqlConnection *conn = hAllocConnProfile(ra->profile, genomicDb);
+
+printf("validating %s :\n", ra->name);
+if (!sqlTableExists(conn, ra->name))
+    {
+    printf("\tFAIL\tTable '%s' not in hg18 (%s).\n", ra->name, ra->profile); /* NOTE(review): message hard-codes hg18 while the connection uses genomicDb */
+    hFreeConn(&conn);  // No longer need connection
+    return NULL; /* NOTE(review): ra is not released on this or the other NULL returns */
+    }
+
+ra->dataType = mustGetString(raHash, "dataType");
+
+*(ra->expCount) = 0; /* stays 0 unless dataType is "bed 15" */
+/* microarray specific settings*/
+if (sameWord(ra->dataType, "bed 15"))
+    {
+    struct microarrayGroups *maGs = maGroupingsForRa(genomicDb, ra->name);
+    if (!maGs)
+	{
+	printf("\tFAIL\tBad microarray groups for '%s'\n", ra->name);
+	hFreeConn(&conn);  // No longer need connection
+	return NULL;
+	}
+    struct maGrouping *allA = maGs->allArrays;
+    *(ra->expCount) = allA->size; /* count comes from the size of the grouping maGroupingsForRa() stored in allArrays */
+
+    if (!checkSampleCount(conn, ra)) /* currently never fails: checkSampleCount() always returns 1 */
+	{
+	hFreeConn(&conn);  // No longer need connection
+	return NULL;
+	}
+    }
+
+ra->accessTable = getOptionalString(raHash, "accessTable", ra->name); /* defaults to the main table name */
+if (!sqlTableExists(conn, ra->accessTable))
+    {
+    printf("\tWARN\tDown-sampled table '%s' not in database.\n", ra->accessTable);
+    ra->accessTable = cloneString(ra->name); /* warn only: fall back to the main table */
+    }
+
+ra->aliasTable  = getOptionalString(raHash, "aliasTable", NULL);
+if (ra->aliasTable && !sqlTableExists(conn, ra->aliasTable))
+    {
+    printf("\tWARN\tProbe->Gene Alias table '%s' not in database.\n", ra->aliasTable);
+    ra->aliasTable = NULL; /* warn only: a missing table is recorded as unset */
+    }
+
+ra->displayNameTable = getOptionalString(raHash, "displayNameTable", NULL);
+if (ra->displayNameTable && !sqlTableExists(conn, ra->displayNameTable))
+    {
+    printf("\tWARN\tDisplay Name table '%s' not in database.\n", ra->displayNameTable);
+    ra->displayNameTable = NULL; /* warn only: a missing table is recorded as unset */
+    }
+
+hFreeConn(&conn);  // No longer need connection
+
+ra->shortLabel  = mustGetString(raHash, "shortLabel");
+ra->longLabel   = mustGetString(raHash, "longLabel");
+ra->local_url   = getOptionalString(raHash, "local_url", NULL);
+ra->groupName   = getOptionalString(raHash, "group", NULL); /* hash key is "group"; the struct field is groupName */
+
+/* Settings on patient information from datasets.ra file */
+ra->raFile      = getOptionalString(raHash, "raFile", NULL);
+ra->patDb       = getOptionalString(raHash, "patDb", NULL);
+ra->patTable    = getOptionalString(raHash, "patTable", NULL);
+ra->patField    = getOptionalString(raHash, "patField", NULL);
+ra->sampleField = getOptionalString(raHash, "sampleField", NULL);
+
+/*** TODO CHECK PATIENT DB ***/
+
+/* Platform setting, currently defaults to expression */
+
+ra->platform = getOptionalString(raHash, "platform", "expression");
+
+ra->type = getOptionalString(raHash, "type", NULL);
+ra->url  = getOptionalString(raHash, "url", NULL);
+ra->visibility = getOptionalString(raHash, "visibility", "off");
+ra->security = getOptionalString(raHash, "security", "public");
+
+ra->wrangler            = getOptionalString(raHash, "wrangler", NULL);
+ra->citation            = getOptionalString(raHash, "citation", NULL);
+ra->article_title       = getOptionalString(raHash, "article_title", NULL);
+ra->author_list         = getOptionalString(raHash, "author_list", NULL);
+ra->wrangling_procedure = getOptionalString(raHash, "wrangling_procedure", NULL);
+
+/* Settings on graphic information from datasets.ra file */
+if (hashFindVal(raHash, "expScale")) /* numeric settings are parsed with atof(); defaults are expScale 0, gainFull 1, gainSet 1, priority 0 */
+    *(ra->expScale) = (float) atof(((char *) hashFindVal(raHash, "expScale")));
+else
+    *(ra->expScale) = (float) 0.0;
+
+if (hashFindVal(raHash, "gainFull"))
+    *(ra->gainFull) = atof(((char *) hashFindVal(raHash, "gainFull")));
+else
+    *(ra->gainFull) = 1.0;
+
+if (hashFindVal(raHash, "gainSet"))
+    *(ra->gainSet) = atof(((char *) hashFindVal(raHash, "gainSet")));
+else
+    *(ra->gainSet) = 1.0;
+
+if (hashFindVal(raHash, "priority"))
+    *(ra->priority) = atof(((char *) hashFindVal(raHash, "priority")));
+else
+    *(ra->priority) = 0.0;
+
+printf("\tPASS\n");
+return ra; /* ra->id is left for putDataToTable() to assign */
+}
+
+
+void putDataToTable(struct sqlConnection *conn, char *tableName, struct raDb *raList) /* Recreate tableName on conn and insert every element of raList, assigning ids 0,1,2,... in list order. */
+{
+if (!raList) /* empty list: return before dropping or creating anything */
+    {
+    printf("nothing to put in database.\n");
+    return;
+    }
+
+if (sqlTableExists(conn, tableName))
+    {
+    printf("Table %s already exists in database, dropping...\n", tableName);
+    sqlDropTable(conn, tableName);
+    }
+
+createRaDbTable(conn, tableName);
+
+printf("writing raDb table to %s to database.\n", tableName);
+struct raDb *ra;
+uint i = 0;
+for (ra = raList; ra; ra = ra->next)
+    {
+    *(ra->id) = i; /* id is the element's position in raList */
+    raDbSaveToDbWithNULL(conn, ra, tableName, 1024); /* 1024 is the updateSize argument */
+    i++;
+    }
+}
+
+void raToDb(char *db, char *tableName, char *raName)
 /* raToDb - RA to database table converter RA to database converter. */
 {
-struct hash *ra, *raList = readRa(raName);
+struct hash *raHash, *raHashList = readRaFile(raName); /* raName is read from under rootDir by readRaFile() */
 
-if (raList == NULL)
+if (raHashList == NULL) /* nothing was read: abort with the message below */
     errAbort("Couldn't find anything from %s", raName);
 
-for (ra = raList; ra != NULL; ra = ra->next)
+struct raDb *ra, *raList = NULL;
+for (raHash = raHashList; raHash; raHash = raHash->next)
     {
-    char *name = (char *)(hashMustFindVal(ra, "name"));
-    printf("name: %s\n", name); 
+    if ((ra = validateRa(raHash)) == NULL) /* entries that fail validation are skipped, not fatal */
+	continue;
+    slAddHead(&raList, ra); /* head insertion: raList is in reverse order of raHashList */
     }
+
+struct sqlConnection *conn = hAllocConnProfile(localDbProfile, db); /* output table is written to db through the localDbProfile connection */
+
+putDataToTable(conn, tableName, raList);
+
+hFreeConn(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
-if (argc != 3)
+if (argc != 4) /* program name plus three arguments: db, tableName, file.ra */
     usage();
-raToDb(argv[1], argv[2]);
+raToDb(argv[1], argv[2], argv[3]); /* passed as db, tableName, raName */
 return 0;
 }