src/hg/instinct/raToDb/raToDb.c 1.4

1.4 2010/04/09 23:56:03 jsanborn
updated
Index: src/hg/instinct/raToDb/raToDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/raToDb/raToDb.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/instinct/raToDb/raToDb.c	9 Apr 2010 22:58:36 -0000	1.3
+++ src/hg/instinct/raToDb/raToDb.c	9 Apr 2010 23:56:03 -0000	1.4
@@ -1,478 +1,492 @@
 /* raToDb - Convert RA dataset description files into a database table. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "cheapcgi.h"
 #include "localmem.h"
 #include "dystring.h"
 #include "obscure.h"
 #include "hdb.h"
 #include "sqlList.h"
 #include "hPrint.h"
 #include "ra.h"
 #include "hgHeatmapLib.h"
 #include "raDb.h"
 
 static char const rcsid[] = "$Id$";
 
 void usage()
 /* Explain usage and exit.  The message is handed to errAbort, so this is
  * the error path taken when the command line has the wrong argument count. */
 {
 errAbort(
   "raToDb - Convert RA dataset descriptions into a database table\n"
   "usage:\n"
   "   raToDb db tableName file.ra\n"
   "\n"
   "    db = database to put the table in\n"
   "    tableName = name of the table to create (dropped first if it already exists)\n"
   "    file.ra = root ra file\n"
   );
 }
 
 /* Command-line option table passed to optionInit in main.  It holds only the
  * terminating entry, so no named options are declared. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 /* Hard-coded locations and defaults used by the functions below. */
 static char *rootDir        = "../hgHeatmap2/hgHeatmapData";  /* prefixed to .ra file names in readRaFile */
 static char *hgCgiDir       = "../../makeDb/hgCgiData";       /* root dir handed to hgReadRa in maGroupingsForRa */
 static char *maGroupsFile   = "microarrayGroups.ra";          /* file name handed to hgReadRa in maGroupingsForRa */
 static char *localDbProfile = "localDb";                      /* default connection profile */
 static char *genomicDb      = "hg18";                         /* database holding the dataset tables being validated */
 
 /*********************  BEGIN HELPER FXNS ******************************/
 
 void createRaDbTable(struct sqlConnection *conn, char *tableName)
 /* Build and execute a CREATE TABLE statement for tableName with one column
  * per raDb field, a primary key on id and a secondary key on name.
  * raDbSaveToDbWithNULL inserts values positionally, so the column order here
  * has to stay in step with the order it writes them in. */
 {
 struct dyString *dy = newDyString(1024);
 dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
 dyStringPrintf(dy, "id int unsigned,\n");
 dyStringPrintf(dy, "name varchar(255),\n");
 dyStringPrintf(dy, "accessTable varchar(255),\n");
 dyStringPrintf(dy, "shortLabel varchar(255),\n");
 dyStringPrintf(dy, "longLabel varchar(255),\n");
 dyStringPrintf(dy, "expCount int unsigned,\n");
+dyStringPrintf(dy, "height int unsigned,\n");
 dyStringPrintf(dy, "groupName varchar(255),\n");
 dyStringPrintf(dy, "raFile varchar(255),\n");
 dyStringPrintf(dy, "patDb varchar(255),\n");
 dyStringPrintf(dy, "sampleField varchar(255),\n");
 dyStringPrintf(dy, "patTable varchar(255),\n");
 dyStringPrintf(dy, "patField varchar(255),\n");
 dyStringPrintf(dy, "aliasTable varchar(255),\n");
 dyStringPrintf(dy, "displayNameTable varchar(255),\n");
 dyStringPrintf(dy, "dataType varchar(255),\n");
 dyStringPrintf(dy, "platform varchar(255),\n");
 dyStringPrintf(dy, "expScale float,\n");
 dyStringPrintf(dy, "gainFull float,\n");
 dyStringPrintf(dy, "gainSet float,\n");
 dyStringPrintf(dy, "type varchar(255),\n");
 dyStringPrintf(dy, "visibility varchar(255),\n");
 dyStringPrintf(dy, "priority float,\n");
 dyStringPrintf(dy, "url varchar(255),\n");
 dyStringPrintf(dy, "security varchar(255),\n");
 dyStringPrintf(dy, "local_url varchar(255),\n");
 dyStringPrintf(dy, "profile varchar(255),\n");
 dyStringPrintf(dy, "wrangler varchar(255),\n");
 dyStringPrintf(dy, "citation varchar(255),\n");
 /* The three free-text fields below are stored as longblob rather than varchar(255). */
 dyStringPrintf(dy, "article_title longblob,\n");
 dyStringPrintf(dy, "author_list longblob,\n");
 dyStringPrintf(dy, "wrangling_procedure longblob,\n");
 dyStringPrintf(dy, "PRIMARY KEY(id),\n"); 
 dyStringPrintf(dy, "KEY(name)\n");
 dyStringPrintf(dy, ");");
 sqlUpdate(conn, dy->string);
 dyStringFree(&dy);
 }
 
 void dyAddStringNULL(struct dyString *dy, char *str, boolean addComma)
 /* Append one SQL value to dy: str wrapped in single quotes, or the bare
  * keyword NULL when str is NULL.  A trailing comma follows when addComma is
  * set.  str is inserted as-is, so it must already be escaped. */
 {
 if (str == NULL)
     dyStringPrintf(dy, "NULL");
 else
     dyStringPrintf(dy, "'%s'", str);

 if (!addComma)
     return;
 dyStringPrintf(dy, ",");
 }
 
 void raDbSaveToDbWithNULL(struct sqlConnection *conn, struct raDb *el, char *tableName, int updateSize)
 /* Save raDb as a row to the table specified by tableName.
  * Have to do it myself since AutoSQL doesn't properly handle NULL values.
  * Assumes already escaped strings.
  * updateSize is the initial size passed to newDyString for the statement buffer.
  * The insert names no columns, so values are written in the column order
  * used by createRaDbTable. */
 {
 struct dyString *update = newDyString(updateSize);
 dyStringPrintf(update, "insert into %s values (", tableName);
 /* Numeric fields are held behind pointers in raDb and are dereferenced
  * unconditionally; only string fields can be written as NULL. */
 dyStringPrintf(update, "%u,", *(el->id));
 dyAddStringNULL(update, el->name, TRUE);
 dyAddStringNULL(update, el->accessTable, TRUE);
 dyAddStringNULL(update, el->shortLabel, TRUE);
 dyAddStringNULL(update, el->longLabel, TRUE);
 dyStringPrintf(update, "%u,", *(el->expCount));
+dyStringPrintf(update, "%u,", *(el->height));
 dyAddStringNULL(update, el->groupName, TRUE);
 dyAddStringNULL(update, el->raFile, TRUE);
 dyAddStringNULL(update, el->patDb, TRUE);
 dyAddStringNULL(update, el->sampleField, TRUE);
 dyAddStringNULL(update, el->patTable, TRUE);
 dyAddStringNULL(update, el->patField, TRUE);
 dyAddStringNULL(update, el->aliasTable, TRUE);
 dyAddStringNULL(update, el->displayNameTable, TRUE);
 dyAddStringNULL(update, el->dataType, TRUE);
 dyAddStringNULL(update, el->platform, TRUE);
 dyStringPrintf(update, "%g,", *(el->expScale));
 dyStringPrintf(update, "%g,", *(el->gainFull));
 dyStringPrintf(update, "%g,", *(el->gainSet));
 dyAddStringNULL(update, el->type, TRUE);
 dyAddStringNULL(update, el->visibility, TRUE);
 dyStringPrintf(update, "%g,", *(el->priority));
 dyAddStringNULL(update, el->url, TRUE);
 dyAddStringNULL(update, el->security, TRUE);
 dyAddStringNULL(update, el->local_url, TRUE);
 dyAddStringNULL(update, el->profile, TRUE);
 dyAddStringNULL(update, el->wrangler, TRUE);
 dyAddStringNULL(update, el->citation, TRUE);
 dyAddStringNULL(update, el->article_title, TRUE);
 dyAddStringNULL(update, el->author_list, TRUE);
 /* Last value: no trailing comma before the closing parenthesis. */
 dyAddStringNULL(update, el->wrangling_procedure, FALSE);
 dyStringPrintf(update, ")");
 sqlUpdate(conn, update->string);
 freeDyString(&update);
 }
 
 
 
 struct slName *getRaIncludes(char *raFile)
 /* Scan raFile for include directives and return the file names they mention.
  * A directive is a line whose first non-blank text starts with "#include" or
  * "include"; the next (optionally quoted) word on that line is the file name.
  * Names are pushed on the head of the list, so the result is in reverse order
  * of appearance.  Returns NULL when the file has no usable directives.
  * The caller owns the returned list. */
 {
 struct lineFile *lf = lineFileOpen(raFile, TRUE);
 struct slName *includeList = NULL;
 char *line;
 
 while (lineFileNext(lf, &line, NULL))
     {
     line = skipLeadingSpaces(line);
     if (!startsWith("#include", line) && !startsWith("include", line))
 	continue;
 
     nextWord(&line);	/* step over the directive keyword itself */
     char *file = nextQuotedWord(&line);
     if (file == NULL)
 	continue;	/* directive without a file name: skip rather than hand NULL to slNameNew */
     slAddHead(&includeList, slNameNew(file));
     }
 
 lineFileClose(&lf);
 return includeList;
 }
 
 struct hash *readRaFile(char *rootName)
 /* Read the root .ra file (rootName, looked up under rootDir) together with
  * every file named by its include directives, and return the stanzas as a
  * list of hashes chained through their next fields.  Returns NULL when no
  * stanzas were read. */
 {
 struct hash *hashOfHash = newHash(10);
 struct hash *raList = NULL;
 char fileName[HDB_MAX_PATH_STRING];
 
 safef(fileName, sizeof(fileName), "%s/%s", rootDir, rootName);
 struct slName *sl, *slList = getRaIncludes(fileName);
 
 /* Fold in the root file first, then each included file. */
 raFoldIn(fileName, hashOfHash);
 for (sl = slList; sl; sl = sl->next)
     {
     safef(fileName, sizeof(fileName), "%s/%s", rootDir, sl->name);
     raFoldIn(fileName, hashOfHash);
     }
 slFreeList(&slList);	/* include names are not needed past this point */
 
 /* Move the stanza hashes out of the index and onto a plain list. */
 struct hashEl *hel, *helList = hashElListHash(hashOfHash);
 for (hel = helList; hel != NULL; hel = hel->next)
     {
     struct hash *ra = hel->val;
     slAddHead(&raList, ra);
     hel->val = NULL;
     }
 hashElFreeList(&helList);
 
 /* Release the index itself; the stanza hashes it pointed at live on in raList. */
 hashFree(&hashOfHash);
 return raList;
 }
 
 struct microarrayGroups *maGroupingsForRa(char *database, char *table)
 /* Get the settings from the microarrayGroups.ra file and put them in a convenient struct.
  * Looks up the stanza named "<table>Groups", follows its "all" setting to a
  * second stanza, and converts that one into ret->allArrays.
  * Returns NULL if any of those lookups fails; a missing "<table>Groups"
  * stanza is also reported on stdout.  The .ra hashes read here are released
  * on every path before returning. */
 {
 struct microarrayGroups *ret = NULL;
 char groupings[strlen(table)+strlen("Groups")+1];
 safef(groupings, sizeof(groupings), "%s%s", table, "Groups");
 
 struct hash *allGroups = NULL;
 struct hash *hashList = hgReadRa(hGenome(database), database, hgCgiDir,
 				 maGroupsFile, &allGroups);
 if (allGroups)
     {
     struct hash *mainGroup = hashFindVal(allGroups, groupings);
     if (!mainGroup)
 	printf("%s not found\n", groupings);
     else
 	{
 	/* The "all" setting names the stanza describing every array. */
 	char *s = hashFindVal(mainGroup, "all");
 	struct hash *tmpGroup = s ? hashFindVal(allGroups, s) : NULL;
 	if (tmpGroup)
 	    {
 	    AllocVar(ret);
 	    ret->allArrays = maHashToMaGrouping(tmpGroup);
 	    }
 	}
     }
 
 /* Single exit: free the stanza list and its index whether or not we succeeded. */
 hashFreeList(&hashList);
 hashFree(&allGroups);
 return ret;
 }
 
 /*********************  END HELPER FXNS ******************************/
 
 char *getOptionalString(struct hash *hash, char *key, char *usual)
 {
 char *ret = (char *)(hashOptionalVal(hash, key, usual));
 
 if (ret)
     ret = sqlEscapeString(ret);
 
 return ret;
 }
 
 char *mustGetString(struct hash *hash, char *key)
 /* Fetch a required setting with hashMustFindVal and return it passed
  * through sqlEscapeString. */
 {
 char *raw = (char *)(hashMustFindVal(hash, key));
 return sqlEscapeString(raw);
 }
 
-
-int checkSampleCount(struct sqlConnection *conn, struct raDb *ra)
-{
-return 1;
-}
-
 struct raDb *initRA()
 /* Allocate a raDb record.  The numeric fields (id, expCount, height, expScale,
  * gainFull, gainSet, priority) are pointers in raDb, so each gets its own
  * allocation here; every string field and next start out NULL.
  * NOTE(review): the numeric values are not explicitly assigned here; this
  * presumably relies on AllocA returning zeroed memory - confirm. */
 {
 struct raDb *ra = AllocA(struct raDb);
 ra->next = NULL;
 ra->id   = AllocA(uint);
 ra->name = NULL;
 ra->profile = NULL;
 ra->dataType = NULL;
 ra->expCount = AllocA(uint);
+ra->height      = AllocA(uint);
 ra->accessTable = NULL;
 ra->aliasTable  = NULL;
 ra->displayNameTable = NULL;
 ra->shortLabel  = NULL;
 ra->longLabel   = NULL;
 ra->local_url   = NULL;
 ra->groupName   = NULL;
 ra->raFile      = NULL;
 ra->patDb       = NULL;
 ra->patTable    = NULL;
 ra->patField    = NULL;
 ra->sampleField = NULL;
 ra->platform    = NULL;
 ra->expScale    = AllocA(float);
 ra->gainFull    = AllocA(float);
 ra->gainSet     = AllocA(float);
 ra->security    = NULL;
 ra->wrangler    = NULL;
 ra->citation    = NULL;
 ra->article_title = NULL;
 ra->author_list = NULL;
 ra->wrangling_procedure = NULL;
 ra->type        = NULL;
 ra->visibility  = NULL;
 ra->priority    = AllocA(float);
 ra->url         = NULL;
 
 return ra;
 }
 
+int checkSampleCount(struct sqlConnection *conn, struct raDb *ra)
 /* Compare the expCount value in the first row of the dataset table (named by
  * ra->name) with *(ra->expCount), which the caller sets from the microarray
  * groups.  Prints a FAIL line and returns 0 on a mismatch, returns 1 otherwise.
  * NOTE(review): count is an int while *(ra->expCount) is a uint, so the
  * comparison mixes signedness and the %d conversions below are handed an
  * unsigned value. */
+{
+char query[256];
+safef(query, sizeof(query), "select expCount from %s limit 1", ra->name);
+
+int count = sqlQuickNum(conn, query);
+
+if (count != *(ra->expCount))
+    {
+    printf("FAIL\t%s\tMicroarray groups count != count in dataset (%d != %d)\n", 
+	   ra->name, count, *(ra->expCount));
+    return 0;
+    }
+return 1;
+}
+
+
 struct raDb *validateRa(struct hash *raHash)
 /* Build a raDb record from one .ra stanza (raHash) and check it against the
  * database.  Findings are printed to stdout as FAIL, WARN or PASS lines.
  * Returns the filled-in record, or NULL when the dataset table does not exist,
  * the microarray groups for a "bed 15" dataset cannot be loaded, or the
  * sample-count check fails.  String fields are stored already SQL-escaped,
  * because they come from mustGetString/getOptionalString. */
 {
 struct raDb *ra = initRA();
 
 ra->name = mustGetString(raHash, "name");
 ra->profile = getOptionalString(raHash, "profile", localDbProfile);
 
 /* Connect to the genomic database using the stanza's profile (default localDbProfile). */
 struct sqlConnection *conn = hAllocConnProfile(ra->profile, genomicDb);
-
-printf("validating %s :\n", ra->name);
 if (!sqlTableExists(conn, ra->name))
     {
-    printf("\tFAIL\tTable '%s' not in hg18 (%s).\n", ra->name, ra->profile);
+    printf("FAIL\t%s\tGenomic data not in hg18 (%s).\n", ra->name, ra->profile);
     hFreeConn(&conn);  // No longer need connection
     return NULL;
     }
-
 ra->dataType = mustGetString(raHash, "dataType");
 
 /* expCount stays 0 unless the dataset is "bed 15" microarray data. */
 *(ra->expCount) = 0;
 /* microarray specific settings*/
 if (sameWord(ra->dataType, "bed 15"))
     {
     struct microarrayGroups *maGs = maGroupingsForRa(genomicDb, ra->name);
     if (!maGs)
 	{
-	printf("\tFAIL\tBad microarray groups for '%s'\n", ra->name);
+	printf("FAIL\t%s\tBad microarray groups\n", ra->name);
 	hFreeConn(&conn);  // No longer need connection
 	return NULL;
 	}
     /* NOTE(review): allArrays is dereferenced without a NULL check, and maGs
      * is not freed in this function - confirm both are intended. */
     struct maGrouping *allA = maGs->allArrays;
     *(ra->expCount) = allA->size;
 
     if (!checkSampleCount(conn, ra))
 	{
 	hFreeConn(&conn);  // No longer need connection
 	return NULL;
 	}
     }
 
 /* accessTable defaults to the dataset table and falls back to it when the
  * named table is missing. */
 ra->accessTable = getOptionalString(raHash, "accessTable", ra->name);
 if (!sqlTableExists(conn, ra->accessTable))
     {
-    printf("\tWARN\tDown-sampled table '%s' not in database.\n", ra->accessTable);
+    printf("WARN\t%s\tDown-sampled table '%s' not in database.\n", ra->name, ra->accessTable);
     ra->accessTable = cloneString(ra->name);
     }
 
 /* Optional helper tables: a missing table is only a warning, and the
  * setting is dropped (set to NULL). */
 ra->aliasTable  = getOptionalString(raHash, "aliasTable", NULL);
 if (ra->aliasTable && !sqlTableExists(conn, ra->aliasTable))
     {
-    printf("\tWARN\tProbe->Gene Alias table '%s' not in database.\n", ra->aliasTable);
+    printf("WARN\t%s\tProbe->Gene Alias table '%s' not in database.\n", ra->name, ra->aliasTable);
     ra->aliasTable = NULL;
     }
 
 ra->displayNameTable = getOptionalString(raHash, "displayNameTable", NULL);
 if (ra->displayNameTable && !sqlTableExists(conn, ra->displayNameTable))
     {
-    printf("\tWARN\tDisplay Name table '%s' not in database.\n", ra->displayNameTable);
+    printf("WARN\t%s\tDisplay Name table '%s' not in database.\n", ra->name, ra->displayNameTable);
     ra->displayNameTable = NULL;
     }
 
 hFreeConn(&conn);  // No longer need connection
 
 /* Everything below is copied from the stanza without any database checks. */
 ra->shortLabel  = mustGetString(raHash, "shortLabel");
 ra->longLabel   = mustGetString(raHash, "longLabel");
 ra->local_url   = getOptionalString(raHash, "local_url", NULL);
 ra->groupName   = getOptionalString(raHash, "group", NULL);
 
 /* Settings on patient information from datasets.ra file */
 ra->raFile      = getOptionalString(raHash, "raFile", NULL);
 ra->patDb       = getOptionalString(raHash, "patDb", NULL);
 ra->patTable    = getOptionalString(raHash, "patTable", NULL);
 ra->patField    = getOptionalString(raHash, "patField", NULL);
 ra->sampleField = getOptionalString(raHash, "sampleField", NULL);
-
 /*** TODO CHECK PATIENT DB ***/
 
 /* Platform setting, currently defaults to expression */
-
 ra->platform = getOptionalString(raHash, "platform", "expression");
 
 ra->type = getOptionalString(raHash, "type", NULL);
 ra->url  = getOptionalString(raHash, "url", NULL);
 ra->visibility = getOptionalString(raHash, "visibility", "off");
-ra->security = getOptionalString(raHash, "security", "public");
+ra->security   = getOptionalString(raHash, "security", "private");
 
 ra->wrangler            = getOptionalString(raHash, "wrangler", NULL);
 ra->citation            = getOptionalString(raHash, "citation", NULL);
 ra->article_title       = getOptionalString(raHash, "article_title", NULL);
 ra->author_list         = getOptionalString(raHash, "author_list", NULL);
 ra->wrangling_procedure = getOptionalString(raHash, "wrangling_procedure", NULL);
 
 /* Settings on graphic information from datasets.ra file */
 /* Numeric settings are parsed with atof/atoi; an absent key gets the
  * default assigned in the matching else branch. */
 if (hashFindVal(raHash, "expScale"))
     *(ra->expScale) = (float) atof(((char *) hashFindVal(raHash, "expScale")));
 else
     *(ra->expScale) = (float) 0.0;
 
 if (hashFindVal(raHash, "gainFull"))
     *(ra->gainFull) = atof(((char *) hashFindVal(raHash, "gainFull")));
 else
     *(ra->gainFull) = 1.0;
 
 if (hashFindVal(raHash, "gainSet"))
     *(ra->gainSet) = atof(((char *) hashFindVal(raHash, "gainSet")));
 else
     *(ra->gainSet) = 1.0;
 
 if (hashFindVal(raHash, "priority"))
     *(ra->priority) = atof(((char *) hashFindVal(raHash, "priority")));
 else
     *(ra->priority) = 0.0;
 
-printf("\tPASS\n");
+if (hashFindVal(raHash, "height"))
+    *(ra->height) = atoi(((char *) hashFindVal(raHash, "height")));
+else
+    *(ra->height) = 0;
+			     
+printf("PASS\t%s\n", ra->name);
 return ra;
 }
 
 
 void putDataToTable(struct sqlConnection *conn, char *tableName, struct raDb *raList)
 {
 if (!raList)
     {
     printf("nothing to put in database.\n");
     return;
     }
 
 if (sqlTableExists(conn, tableName))
     {
     printf("Table %s already exists in database, dropping...\n", tableName);
     sqlDropTable(conn, tableName);
     }
 
 createRaDbTable(conn, tableName);
 
 printf("writing raDb table to %s to database.\n", tableName);
 struct raDb *ra;
 uint i = 0;
 for (ra = raList; ra; ra = ra->next)
     {
     *(ra->id) = i;
     raDbSaveToDbWithNULL(conn, ra, tableName, 1024);
     i++;
     }
 }
 
 void raToDb(char *db, char *tableName, char *raName)
 /* raToDb - read the stanzas of raName, keep those that pass validateRa, and
  * store them as rows of tableName in database db.  Aborts when the .ra file
  * yields no stanzas at all. */
 {
 struct hash *stanzaList = readRaFile(raName);
 if (!stanzaList)
     errAbort("Couldn't find anything from %s", raName);
 
 /* Stanzas that fail validation are dropped; the rest are pushed on the head
  * of the list. */
 struct raDb *validList = NULL;
 struct hash *stanza;
 for (stanza = stanzaList; stanza != NULL; stanza = stanza->next)
     {
     struct raDb *ra = validateRa(stanza);
     if (ra != NULL)
 	slAddHead(&validList, ra);
     }
 
 struct sqlConnection *conn = hAllocConnProfile(localDbProfile, db);
 putDataToTable(conn, tableName, validList);
 hFreeConn(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Process command line: exactly three positional arguments are expected
  * (db, tableName, file.ra); anything else goes to usage(). */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 
 char *db = argv[1];
 char *tableName = argv[2];
 char *raName = argv[3];
 raToDb(db, tableName, raName);
 return 0;
 }