src/hg/instinct/bioInt2/populateDb.c 1.6

1.6 2009/04/04 00:39:22 jsanborn
added cohorts api
Index: src/hg/instinct/bioInt2/populateDb.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/populateDb.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 1000000 -r1.5 -r1.6
--- src/hg/instinct/bioInt2/populateDb.c	27 Mar 2009 21:56:27 -0000	1.5
+++ src/hg/instinct/bioInt2/populateDb.c	4 Apr 2009 00:39:22 -0000	1.6
@@ -1,1275 +1,1289 @@
 /* populateDb - Populate the bioInt database tables (datasets, samples, clinical data, probe data, probe-to-gene maps) from a bed 15 microarray table. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "bed.h"
 #include "genePred.h"
 #include "hPrint.h"
 #include "hdb.h"  
 #include "microarray.h"
 #include "ra.h"
 #include "featuresLib.h"
 #include "hgHeatmapLib.h"
 #include "bioIntDriver.h"
 #include "bioIntDb.h"
 
 char *hgDb = "hg18";
 char *genome = "Human";
 
 void usage()
 /* Explain usage and exit. */
 {
 /* The usage text lists the three positional arguments (db, table, tissue)
  * and the boolean options declared in the options[] table. */
 errAbort(
   "populateDb \n"
   "   populateDb [OPTIONS] db table tissue\n"
   "options:\n"
   "   -dropAll   Drop/recreate any table\n"
+  "   -tcga      handles TCGA ids\n"
   "\n"
   );
 }
 
 boolean dropTable = FALSE;   // If true, any table that should be dropped/recreated will be dropped and recreated
+boolean isTCGA    = FALSE;   // If true, specially handle TCGA ids 
 
 /* Command-line options understood by this program; the list ends with a
  * {NULL, 0} entry. */
 static struct optionSpec options[] = {
     {"dropAll", OPTION_BOOLEAN},
+    {"tcga", OPTION_BOOLEAN},
     {NULL, 0},
 };
 
 
 char *getId(struct sqlConnection *conn, char *table, char *key, char *sample, char *value)
 /* Select column 'key' from 'table' for the row whose 'value' column equals
  * 'sample', and return whatever sqlQuickString yields for that query. */
 {
 char sql[512];
 safef(sql, sizeof(sql), "select %s from %s where %s = \"%s\" ", key, table, value, sample);
 char *found = sqlQuickString(conn, sql);
 return found;
 }
 
 
 struct slName *getProbesFromTable(struct sqlConnection *hgConn, char *tableName)
 /* Return the distinct values of the "name" column of tableName as an slName
  * list, in the order the query returned them. */
 {
 char sql[512];
 safef(sql, sizeof(sql), "select DISTINCT %s from %s ", "name", tableName);
 struct sqlResult *result = sqlGetResult(hgConn, sql);
 
 /* Prepend each name, then flip the list once to restore query order. */
 struct slName *probes = NULL;
 char **fields;
 for (fields = sqlNextRow(result); fields != NULL; fields = sqlNextRow(result))
     slNameAddHead(&probes, fields[0]);
 slReverse(&probes);
 
 sqlFreeResult(&result);
 return probes;
 }
 
 
 struct maGrouping *getMaGrouping(struct sqlConnection *hgConn, char *tableName)
 /* Return the 'allArrays' grouping of the microarray groups for tableName,
  * or NULL when maGroupings() finds no groups. */
 {
 /*microarray specific settings*/
 /* NOTE(review): tdb is loaded and freed without being read here; presumably
  * the call is kept for a side effect of hMaybeTrackInfo - confirm. */
 struct trackDb *tdb = hMaybeTrackInfo(hgConn, tableName);  
 /* NOTE(review): the database name is hard-coded to "hg18" rather than taken
  * from the hgDb global - confirm this is intended. */
 struct microarrayGroups *maGs = maGroupings("hg18", tableName);
 trackDbFreeList(&tdb);
 if (!maGs)
     return NULL;
 return maGs->allArrays;
 }
 
 struct hash *getSettings(char *tableName)
 /* Find the first datasets.ra entry whose name equals tableName and return
  * its settings hash.  Aborts when no entry matches or the matching entry
  * has no settings. */
 {
 struct column *entry = getColumns(NULL, "datasets.ra", NULL);  
 
 /* Walk forward until the names match or the list runs out. */
 while (entry != NULL && !sameString(entry->name, tableName))
     entry = entry->next;
 
 struct hash *found = (entry != NULL) ? entry->settings : NULL;
 if (found == NULL)
     errAbort("Couldn't find datasets.ra listing for %s", tableName);
 
 return found;
 }
 
 /* One probe together with the gene names listed for it in an alias table. */
 struct geneAlias {
     struct geneAlias *next;   /* next element in singly linked list */
 
     char *probe;              /* probe name (first column of the alias table) */
     struct slName *genes;     /* gene names for this probe (second column) */
 }; 
 
  
 struct geneAlias *getAliases(struct sqlConnection *hgConn, char *tableName)
 /* Read every row of tableName (column 0 = probe, column 1 = gene) and group
  * the genes by probe.  Returns one geneAlias per distinct probe, or NULL when
  * either argument is NULL.  Probes appear in reverse order of first
  * appearance, and each probe's genes in reverse row order, because both
  * lists are built by prepending. */
 {
 if (!hgConn || !tableName)
     return NULL;
 
 char query[512];
 safef(query, sizeof(query), "select * from %s", tableName);
 struct sqlResult *sr = sqlGetResult(hgConn, query);
 
 struct geneAlias *ga, *gaList = NULL;
 struct hash *gaHash = hashNew(0);   /* probe name -> its geneAlias */
 char **row;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     /* The row strings are only borrowed: ga->probe, the hash key and the
      * slName element each keep their own copy, so cloning them here as
      * well would leak two strings per row. */
     char *probe = row[0];
     char *gene = row[1];
 
     struct hashEl *el = hashLookup(gaHash, probe);
     if (!el)
         {
         AllocVar(ga);
         ga->probe = cloneString(probe);
         ga->genes = NULL;
 
         slAddHead(&gaList, ga);
         hashAdd(gaHash, probe, ga);
         }
     else
         ga = el->val;
 
     slNameAddHead(&ga->genes, gene);
     }
 
 hashFree(&gaHash);
 sqlFreeResult(&sr);
 
 return gaList;
 }
 
 struct dataTypes *findDataType(struct sqlConnection *biConn, char *type, char *platform)
 /* Load the dataTypes rows whose format is "probeVals" and whose name equals
  * platform.  Aborts unless type is "bed 15". */
 {
 if (!sameString(type, "bed 15"))
     errAbort("populateDb only runs on bed 15 files.");
 
 char sql[256];
 safef(sql, sizeof(sql), 
       "select * from %s where format = \"%s\" and name = \"%s\"",
       DT_TABLE, "probeVals", platform);
 
 struct dataTypes *matches = dataTypesLoadByQuery(biConn, sql);
 return matches;
 }
 
 struct dataTypes *createDataType(struct sqlConnection *biConn, char *type, char *platform)
 /* Build a "probeVals" dataTypes record named after platform, using the
  * current row count of the dataTypes table as its id, save it and return it. */
 {
 struct dataTypes *entry;
 AllocVar(entry);
 entry->id     = sqlTableSize(biConn, DT_TABLE);
 entry->format = cloneString("probeVals");
 entry->name   = cloneString(platform);
 
 /* Persist the new record before handing it back. */
 dataTypesSaveToDb(biConn, entry, DT_TABLE, 100);
 return entry;
 } 
 
 void createDataTypesTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a dataTypes table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "id int unsigned not null,\n"
     "format varchar(255) not null,\n"
     "name varchar(255) not null,\n"
     "PRIMARY KEY(id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 struct dataTypes *setupDataType(struct sqlConnection *biConn, 
 				char *type, char *platform)
 {
 if (!sqlTableExists(biConn, DT_TABLE))
     {
     fprintf(stderr, "Tables dataTypes doesn't exist, creating...\n");
     createDataTypesTable(biConn, DT_TABLE);
     }
 
 struct dataTypes *dt = findDataType(biConn, type, platform);
 if (!dt)
     dt = createDataType(biConn, type, platform);
 
 return dt;
 }
 
 
 struct tissues *findTissue(struct sqlConnection *biConn, char *tissue)
 /* Load the tissues rows whose name equals tissue. */
 {
 char sql[256];
 safef(sql, sizeof(sql), "select * from %s where name = \"%s\";", TI_TABLE, tissue);
 
 struct tissues *matches = tissuesLoadByQuery(biConn, sql);
 return matches;
 }
 
 struct tissues *createTissue(struct sqlConnection *biConn, char *tissue)
 /* Build a tissues record named tissue, using the current row count of the
  * tissues table as its id, save it and return it. */
 {
 struct tissues *entry;
 AllocVar(entry);
 entry->id   = sqlTableSize(biConn, TI_TABLE);
 entry->name = cloneString(tissue);
 
 /* Persist the new record before handing it back. */
 tissuesSaveToDb(biConn, entry, TI_TABLE, 100);
 return entry;
 } 
 
 void createTissuesTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a tissues table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "id int unsigned not null,\n"
     "name varchar(255) not null,\n"
     "PRIMARY KEY(id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 struct tissues *setupTissue(struct sqlConnection *biConn, char *tissue)
 {
 if (!sqlTableExists(biConn, TI_TABLE))
     {
     fprintf(stderr, "Tables tissues doesn't exist, creating...\n");
     createTissuesTable(biConn,  TI_TABLE);
     }
 
 struct tissues *ti = findTissue(biConn, tissue);
 if (!ti)
     ti = createTissue(biConn, tissue);
 
 return ti;
 }
 
 
 struct datasets *findDataset(struct sqlConnection *biConn, char *name)
 /* Load the datasets rows whose data_table column equals name. */
 {
 char sql[256];
 safef(sql, sizeof(sql), "select * from %s where data_table = \"%s\";", DA_TABLE, name);
 
 struct datasets *matches = datasetsLoadByQuery(biConn, sql);
 return matches;
 }
 
 
 struct datasets *createDataset(struct sqlConnection *biConn, 
                                char *tableName, char *tissue, int numSamples)
 /* Build a datasets record for tableName from its datasets.ra settings, save
  * it to the datasets table and return it.
  * Required settings: shortLabel, name, dataType (aborts when one is missing).
  * Optional setting: platform (defaults to "Expression").
  * The data type and tissue records are looked up or created as needed, and
  * the new id is the current row count of the datasets table. */
 {
 struct hash *settings = getSettings(tableName);
 
 struct hashEl *el = hashLookup(settings, "shortLabel");
 if (!el)
     errAbort("No shortLabel");
 char *shortLabel = cloneString(el->val);   /* owned by the returned record */
 
 el = hashLookup(settings, "name");
 if (!el)
     errAbort("No name");
 char *dataTable = cloneString(el->val);    /* owned by the returned record */
 
 char probeTable[256];
 safef(probeTable, sizeof(probeTable), "%s_probeInfo", dataTable);
 
 char p2gTable[256];
 safef(p2gTable, sizeof(p2gTable), "%s_probeToGene", dataTable);
 
 /* platform and dataType are only read while resolving the data type, so
  * they are borrowed from the settings hash rather than cloned (the clones
  * were never freed). */
 el = hashLookup(settings, "platform");
 char *platform = el ? el->val : "Expression";
 
 el = hashLookup(settings, "dataType");
 if (!el)
     errAbort("No dataType");
 char *dataType = el->val;
 
 struct dataTypes *dt = setupDataType(biConn, dataType, platform);
 
 struct tissues *ti = setupTissue(biConn, tissue);
 
 int nextId = sqlTableSize(biConn, DA_TABLE);
 
 struct datasets *da;
 AllocVar(da);
 da->id = nextId;
 da->tissue_id = ti->id;
 da->type_id = dt->id;
 da->num_samples = numSamples;
 da->name = shortLabel;
 da->data_table = dataTable;
 da->probe_table = cloneString(probeTable);
 da->probe_to_gene_table = cloneString(p2gTable);
 
 dataTypesFree(&dt);
 tissuesFree(&ti);
 
 /* Write datasets */
 datasetsSaveToDbEscaped(biConn, da, DA_TABLE, 100); 
 
 return da;
 }
 
 void createDatasetsTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a datasets table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "id int unsigned not null,\n"
     "tissue_id int unsigned not null,\n"
     "type_id int unsigned not null,\n"
     "num_samples int unsigned not null,\n"
     "name varchar(255) not null,\n"
     "data_table varchar(255) not null,\n"
     "probe_table varchar(255) not null,\n"
     "probe_to_gene_table varchar(255) not null,\n"
     "PRIMARY KEY(id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 struct datasets *setupDataset(struct sqlConnection *biConn, 
                               char *tableName, char *tissue, int numSamples)
 /* Make sure the datasets table exists, then return the record whose
  * data_table is tableName, creating it from datasets.ra when the lookup
  * finds nothing. */
 {
 if (!sqlTableExists(biConn, DA_TABLE))
     {
     /* Newline-terminated like the messages of the sibling setup functions. */
     fprintf(stderr, "Tables datasets doesn't exist, creating...\n");
     createDatasetsTable(biConn, DA_TABLE);
     }
 
 struct datasets *da = findDataset(biConn, tableName);
 if (!da)
     da = createDataset(biConn, tableName, tissue, numSamples);
 
 return da;
 }
 
 
 char *findPatientName(struct sqlConnection *pdConn, char *pTable, 
                       char *pField, char *sField, char *sName)
 /* Select column pField from pTable for the row whose sField column equals
  * sName, and return whatever sqlQuickString yields for that query. */
 {
 char sql[256];
 safef(sql, sizeof(sql), "select %s from %s where %s = \"%s\"", pField, pTable, sField, sName);
 
 char *patient = sqlQuickString(pdConn, sql);
 return patient;
 }
 
 int findId(struct sqlConnection *biConn, char *idField, char *sField, char *name)
 /* Pick the numeric id (column idField of the samples table) to use for name:
  *   - 0 when the samples table is empty,
  *   - the id already stored for a row whose sField equals name, if any,
  *   - otherwise the current maximum of idField plus one. */
 {
 if (sqlTableSize(biConn, SA_TABLE) == 0)
     return 0;
 
 char lookup[256];
 safef(lookup, sizeof(lookup), 
       "select DISTINCT %s from %s where %s = \"%s\";",
       idField, SA_TABLE, sField, name);
 
 if (!sqlExists(biConn, lookup))
     {
     /* Name not seen before: hand out the next unused id. */
     char maxQuery[256];
     safef(maxQuery, sizeof(maxQuery), "select max(%s) from %s;", idField, SA_TABLE);
     return sqlQuickNum(biConn, maxQuery) + 1;
     }
 
 /* Name already present: reuse its id. */
 return sqlQuickNum(biConn, lookup);
 }
 
 boolean sampleExists(struct sqlConnection *biConn, struct samples *sa)
 /* Return TRUE when the samples table already holds a row matching every
  * field of sa. */
 {
 char sql[256];
 safef(sql, sizeof(sql),
       "select * from %s where id = %d " "and name = \"%s\" "
       "and patient_id = %d " "and patient_name = \"%s\" "
       "and dataset_id = %d " "and exp_id = %d " "and tissue_id = %d ",
       SA_TABLE,
       sa->id, sa->name,
       sa->patient_id, sa->patient_name,
       sa->dataset_id, sa->exp_id, sa->tissue_id);
 
 boolean present = sqlExists(biConn, sql);
 return present;
 }
 
 void createSamples(struct sqlConnection *biConn, struct datasets *da, struct maGrouping *allA)
 /* Add one samples row per array in allA for dataset da, skipping rows that
  * are already present.  The patient database/table/field names come from the
  * dataset's datasets.ra settings (patDb, patTable, patField, sampleField);
  * the function aborts when any of them is missing. */
 {
 int datasetId = da->id;
 int tissueId = da->tissue_id;
 
 struct hash *settings = getSettings(da->data_table);
 
 struct hashEl *el = hashLookup(settings, "patDb");
 if (!el)
     errAbort("No patDb!");
 char *patDb = cloneString(el->val);
 
 el = hashLookup(settings, "patTable");
 if (!el)
     errAbort("No patTable");
 char *patTable = cloneString(el->val);
 
 el = hashLookup(settings, "patField");
 if (!el)
     errAbort("No patField");
 char *patField = cloneString(el->val);
 
 el = hashLookup(settings, "sampleField");
 if (!el)
     errAbort("No sampleField");
 char *sampleField = cloneString(el->val);
 
 /* Connection to the patient database named in the settings. */
 struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb);
 
 int i;
 struct samples *sa;
 for (i = 0; i < allA->size; i++)
     {
-    char *sampleName = cloneString(allA->names[i]);
-    int expId = allA->expIds[i];
     /* With -tcga both names are prefixes of the array name (presumably the
      * first 16 and 12 characters - confirm cloneStringZ semantics); otherwise
      * the patient name is looked up in the patient table. */
     /* NOTE(review): findPatientName's result is used unchecked below;
      * presumably it can be NULL when the sample has no patient row - verify. */
+    char *sampleName, *patientName;
+    if (isTCGA)
+	{
+	sampleName = cloneStringZ(allA->names[i], 16);
+	patientName = cloneStringZ(allA->names[i], 12);
+	}
+    else
+	{
+	sampleName = cloneString(allA->names[i]);
+	patientName = findPatientName(pdConn, patTable, patField, sampleField, sampleName); 
+	}
 
     /* Reuse ids already assigned to this sample/patient name, if any. */
+    int expId = allA->expIds[i];
     int sampleId = findId(biConn, "id", "name", sampleName);
-    char *patientName = findPatientName(pdConn, patTable, patField, sampleField, sampleName); 
     int patientId = findId(biConn, "patient_id", "patient_name", patientName);
 
     AllocVar(sa);
     sa->id = sampleId;
     sa->name = sampleName;
     sa->patient_id = patientId;
     sa->patient_name = patientName;
     sa->dataset_id = datasetId;
     sa->exp_id = expId;
     sa->tissue_id = tissueId;
 
     /* Only insert when no identical row exists yet. */
     if (!sampleExists(biConn, sa))
 	samplesSaveToDb(biConn, sa, SA_TABLE, 100);
 
     samplesFree(&sa);
     }
 
 hFreeConn(&pdConn);
 }
 
 void createSamplesTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a samples table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "id int unsigned not null,\n"
     "name varchar(255) not null,\n"
     "patient_id int unsigned not null,\n"
     "patient_name varchar(255) not null,\n"
     "dataset_id int unsigned not null,\n"
     "exp_id int unsigned not null,\n"
     "tissue_id int unsigned not null,\n"
     "KEY(id),\n"
     "KEY(dataset_id),\n"
     "KEY(id, dataset_id),\n"
     "KEY(dataset_id,id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 struct samples *getSamples(struct sqlConnection *biConn, struct datasets *da)
 /* Load the samples rows belonging to dataset da, ordered by exp_id. */
 {
 char sql[256];
 safef(sql, sizeof(sql), "select * from %s where dataset_id = %d order by exp_id;", SA_TABLE, da->id);
 
 struct samples *rows = samplesLoadByQuery(biConn, sql);
 return rows;
 }
 
 
 struct samples *setupSamples(struct sqlConnection *biConn, struct datasets *da, 
                              struct maGrouping *allA)
 /* Make sure the samples table exists, add the samples of allA for dataset
  * da, and return the dataset's samples as stored.  Aborts when the stored
  * count differs from allA->size. */
 {
 if (!sqlTableExists(biConn, SA_TABLE))
     {
     fprintf(stderr, "Table samples doesn't exist, creating...\n");
     createSamplesTable(biConn, SA_TABLE);
     }
 
 createSamples(biConn, da, allA);
 
 struct samples *stored = getSamples(biConn, da);
 int storedCount = slCount(stored);
 if (storedCount != allA->size)
     errAbort("Sample count from microarrayGroups and database don't match!");
 
 return stored;
 }
 
 
 int getFeatureId(struct sqlConnection *biConn, char *name)
 /* Pick the id to use for feature name: 0 for an empty features table, the
  * stored id when the name is already present, otherwise the current row
  * count of the table. */
 {
 if (sqlTableSize(biConn, FE_TABLE) == 0)
     return 0;
 
 char sql[256];
 safef(sql, sizeof(sql), "select id from %s where name = \"%s\";", FE_TABLE, name);
 
 if (!sqlExists(biConn, sql))
     return sqlTableSize(biConn, FE_TABLE);   /* unseen name: next id */
 
 return sqlQuickNum(biConn, sql);             /* known name: reuse id */
 }
 
 struct features *getFeature(struct sqlConnection *biConn, char *name)
 /* Load the features rows whose name column equals name. */
 {
 char sql[256];
 safef(sql, sizeof(sql), "select * from %s where name = \"%s\";", FE_TABLE, name);
 
 struct features *rows = featuresLoadByQuery(biConn, sql);
 return rows;
 }
 
 boolean featureExists(struct sqlConnection *biConn, struct features *fs)
 /* Return TRUE when the features table has a row named fs->name. */
 {
 char sql[256];
 safef(sql, sizeof(sql), "select * from %s where name = \"%s\";", FE_TABLE, fs->name);
 
 boolean present = sqlExists(biConn, sql);
 return present;
 }
 
 boolean clinicalDataExists(struct sqlConnection *biConn, struct clinicalData *cd)
 /* Return TRUE when the clinicalData table already holds an entry for
  * (cd->sample_id, cd->feature_id), FALSE when it holds none.
  * Aborts when more than one such entry exists, or when the stored entry has
  * a different val or code than cd, since the pair is expected to be unique. */
 {
 char query[256];
 safef(query, sizeof(query),
       "select * from %s where sample_id = %d "
       "and feature_id = %d; ",
       CD_TABLE, cd->sample_id, cd->feature_id);
 
 /* A single load answers both "does it exist" and "what does it hold". */
 struct clinicalData *stored = clinicalDataLoadByQuery(biConn, query);
 if (!stored)  /* entry doesn't exist, report */
     return FALSE;
 
 if (slCount(stored) != 1)
     errAbort("clinicalData entries not unique, sample_id = %d, feature_id = %d",
              cd->sample_id, cd->feature_id);
 
 if (cd->val != stored->val)
     errAbort("clinicalData values don't match, sample_id = %d, feature_id = %d, "
              "%f != %f",
              cd->sample_id, cd->feature_id, cd->val, stored->val);
 
 if (cd->code && stored->code)
     if (!sameString(cd->code, stored->code))
         errAbort("clinicalData codes don't match, sample_id = %d, feature_id = %d",
                  cd->sample_id, cd->feature_id);
 
 /* Release the loaded copy; it was previously leaked on every call. */
 clinicalDataFreeList(&stored);
 return TRUE;
 }
 
 void createFeaturesTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a features table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "id int unsigned not null,\n"
     "name varchar(255) not null,\n"
     "shortLabel varchar(255) not null,\n"
     "longLabel varchar(255) not null,\n"
     "KEY(id),\n"
     "KEY(name),\n"
     "KEY(id,name)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 void createClinicalDataTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a clinicalData table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "sample_id int unsigned not null,\n"
     "feature_id int unsigned not null,\n"
     "val double not null,\n"
     "code varchar(255),\n"
     "KEY(sample_id),\n"
     "KEY(feature_id),\n"
     "KEY(sample_id,feature_id),\n"
     "KEY(feature_id,sample_id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 void setupClinicalInfo(struct sqlConnection *biConn, struct datasets *da, struct samples *saList)
 /* Fill the features and clinicalData tables for dataset da.
  * Does nothing when saList is empty.  Creates either table when missing.
  * The clinical columns come from the ra file named in the dataset's
  * datasets.ra settings (raFile, patDb, patTable, patField, sampleField; the
  * function aborts when one is missing).  Each column becomes a feature, and
  * for every sample that has a patient row and a value for that column one
  * clinicalData entry is stored, unless an identical entry already exists. */
 {
 if (!saList)
     return;
 
 if (!sqlTableExists(biConn, FE_TABLE))
     {
     fprintf(stderr, "Table features doesn't exist, creating...\n");
     createFeaturesTable(biConn, FE_TABLE);
     }
 
 if (!sqlTableExists(biConn, CD_TABLE))
     {
     fprintf(stderr, "Table clinicalData doesn't exist, creating...\n");
     createClinicalDataTable(biConn, CD_TABLE);
     }
 
 struct hash *settings = getSettings(da->data_table);
 
 struct hashEl *el = hashLookup(settings, "raFile");
 if (!el)
     errAbort("No raFile");
 char *raFile = cloneString(el->val);
 
 el = hashLookup(settings, "patDb");
 if (!el)
     errAbort("No patDb");
 char *patDb = cloneString(el->val);
 
 el = hashLookup(settings, "patTable");
 if (!el)
     errAbort("No patTable");
 char *patTable = cloneString(el->val);
 
 el = hashLookup(settings, "patField");
 if (!el)
     errAbort("No patField");
 char *patField = cloneString(el->val);
 
 el = hashLookup(settings, "sampleField");
 if (!el)
     errAbort("No sampleField");
 char *sampleField = cloneString(el->val);
 
 if (!raFile || !patDb || !patTable || !patField || !sampleField)
     errAbort("Incomplete ra entry for %s.", da->data_table);
 
 struct sqlConnection *pdConn = hAllocConnProfile("localDb", patDb); //connection to patient data
 
 if (DEBUG)
     fprintf(stderr, "Getting columns of clinical data...\n");
 struct column *col, *colList = getColumns(pdConn, raFile, patDb);
 
 /* Set up features */
 struct features *fs;
 for (col = colList; col; col = col->next)
     {
     char *name = col->name;
     char *shortLabel = col->shortLabel;
     char *longLabel = col->longLabel;
 
     int featureId = getFeatureId(biConn, name);
     AllocVar(fs);
     fs->id = featureId;
     fs->name = cloneString(name);
     fs->shortLabel = cloneString(shortLabel);
     fs->longLabel= cloneString(longLabel);
 
     if (!featureExists(biConn, fs))
         featuresSaveToDbEscaped(biConn, fs, FE_TABLE, 100);
     featuresFree(&fs);
 
     /* Re-read the feature so the id used below is the one actually stored. */
     fs = getFeature(biConn, name);
     if (!fs)
         errAbort("Could not find feature %s.", name);
     
     if (slCount(fs) != 1)
         errAbort("Could not find unique feature by name = %s.", name);
 
     /* Loop through all samples, putting data in database */
     struct samples *sa; 
     struct clinicalData *cd;
     for (sa = saList; sa; sa = sa->next)
         {
         /* A sample without a row in the patient table has no clinical
          * data; skip it instead of handing NULL to slNameNew. */
         char *patId = getId(pdConn, patTable, patField, sa->name, sampleField);
         if (!patId)
             continue;
         struct slName *idName = slNameNew(patId);
         freeMem(patId);   /* slNameNew keeps its own copy */
 
         char *cellVal = col->cellVal(col, idName, pdConn);
         if (cellVal)
             {
             AllocVar(cd);
             cd->sample_id = sa->id;
             cd->feature_id = fs->id;
 
             cd->val = atof(cellVal);
             cd->code = NULL;
             if (col->cellCoded(col, pdConn))
                 cd->code = cloneString(col->cellCodedVal(col, idName, pdConn));
 
             if (!clinicalDataExists(biConn, cd))
                 clinicalDataSaveToDb(biConn, cd, CD_TABLE, 100);
 
             clinicalDataFree(&cd);
             }
 
         /* Freed on every path, including the "no value" one. */
         slNameFree(&idName);
         }
     featuresFree(&fs);
     }
 
 hFreeConn(&pdConn);
 }
 
 void createProbeValsTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a probeVals table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "probe_id int unsigned not null,\n"
     "sample_count int unsigned not null,\n"
     "sample_data longblob not null,\n"
     "PRIMARY KEY(probe_id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 void createProbeInfoTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a probeInfo table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "id int unsigned not null,\n"
     "chrom varchar(255) not null,\n"
     "start int unsigned not null,\n"
     "stop int unsigned not null,\n"
     "name varchar(255) not null,\n"
     "PRIMARY KEY(id),\n"
     "KEY(name),\n"
     "KEY(id,name)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 void addProbeValsToDb(struct sqlConnection *biConn, char *tableName, 
                       int probe_id, int sample_count, char *dataString, int updateSize)
 /* Insert one (probe_id, sample_count, dataString) row into tableName.
  * The data string is written as given rather than being parsed into a float
  * array first; per the original author, parsing would turn "blank"
  * datapoints into 0.0 instead of leaving them blank. */
 {
 struct dyString *sql = newDyString(updateSize);
 dyStringPrintf(sql, "insert into %s values ( %u,%u,'%s')",
                tableName, probe_id, sample_count, dataString);
 sqlUpdate(biConn, sql->string);
 freeDyString(&sql);
 }
 
 
 void setupProbeData(struct sqlConnection *hgConn, struct sqlConnection *biConn, 
                     struct datasets *da)
 /* Copy the probe data of dataset da from the genome database (hgConn) into
  * the bioInt database (biConn).
  * The probeVals table (da->data_table) and the probeInfo table
  * (da->probe_table) are created when missing, and dropped first when
  * -dropAll is set.  Only tables created by this call are filled; existing
  * tables are left untouched.  Probe ids are assigned 0, 1, 2, ... in the
  * order the source rows are returned. */
 {
 char *dataTable = da->data_table;
 char *probeTable = da->probe_table;
 
 if (!dataTable || !probeTable)
     errAbort("datasets entry not complete, data_table or probe_table not set.");
 
 boolean inputProbeVals = FALSE;
 boolean inputProbeInfo = FALSE;
 
 if (sqlTableExists(biConn, dataTable) && dropTable)
     {
     fprintf(stderr, "probeVals table %s already exists in db, dropping...\n", dataTable);
     sqlDropTable(biConn, dataTable);
     }
 
 if (!sqlTableExists(biConn, dataTable))
     {
     fprintf(stderr, "Creating probeVals table %s...\n", dataTable);
     createProbeValsTable(biConn, dataTable);
     inputProbeVals = TRUE;  // empty table, input
     }
 
 if (sqlTableExists(biConn, probeTable) && dropTable)
     {
     fprintf(stderr, "probeInfo table %s already exists in db, dropping...\n", probeTable);
     sqlDropTable(biConn, probeTable);
     }
 
 if (!sqlTableExists(biConn, probeTable))
     {
     fprintf(stderr, "Creating probeInfo table %s...\n", probeTable);
     createProbeInfoTable(biConn, probeTable);
     inputProbeInfo = TRUE;   // empty table, input
     }
 
 /* Both tables already populated: nothing would be written, so skip the
  * scan of the source table entirely. */
 if (!inputProbeVals && !inputProbeInfo)
     return;
 
 char query[256];
 safef(query, sizeof(query), "select * from %s;", dataTable);
 
 /* Get bed15 data from the genome database */
 int id = 0;
 struct sqlResult *sr = sqlGetResult(hgConn, query);
 
 char **row = NULL;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     /* Column positions assume a bed 15 table with a leading bin column
      * (row[0]) - TODO confirm against the table schema. */
     char *chrom = row[1];
     unsigned chromStart = sqlUnsigned(row[2]);
     unsigned chromEnd = sqlUnsigned(row[3]);
     char *name = row[4];
     unsigned expCount = sqlUnsigned(row[13]);
     char *expScores = row[15];
 
     /* Make probeInfo entry and load into db*/
     if (inputProbeInfo)
         {
         struct probeInfo *pi;
         AllocVar(pi);
         pi->id = id;
         pi->chrom = cloneString(chrom);
         pi->start = chromStart;
         pi->stop  = chromEnd;
         pi->name  = cloneString(name);
         probeInfoSaveToDb(biConn, pi, probeTable, 100);
         probeInfoFree(&pi);
         }
 
     /* Make probeVals entry and load into db, straight-up copying longblob*/
     if (inputProbeVals)
         addProbeValsToDb(biConn, dataTable, id, expCount, expScores, 500);
 
     id++;
     }
 
 /* Release the result set; it was previously never freed. */
 sqlFreeResult(&sr);
 }
 
 void createGeneLookupTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a geneLookup table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "id int unsigned not null,\n"
     "kgId varchar(255) not null,\n"
     "PRIMARY KEY(id),\n"
     "KEY(kgId),\n"
     "KEY(id,kgId),\n"
     "KEY(kgId,id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 void createGeneLookup(struct sqlConnection *biConn)
 {
 if (!sqlTableExists(biConn, GL_TABLE))
     {
     fprintf(stderr, "geneLookup table doesn't exist in bioInt database, recreating it.\n");
     createGeneLookupTable(biConn, GL_TABLE);
     }
 
 if (sqlTableSize(biConn, GL_TABLE) > 0)
     {
     fprintf(stderr, "geneLookup table already has data in it, doing nothing.\n");
     return;
     }
 
 if (!sqlTableExists(biConn, "knownGene"))
     errAbort("Need knownGene table in bioInt database.");
 
 char query[256];
 safef(query, sizeof(query), "select name from knownGene;");
 
 struct slName *sl, *slList = sqlQuickList(biConn, query);
 
 int id = 0;
 struct geneLookup *gl;
 for (sl = slList; sl; sl = sl->next)
     {
     AllocVar(gl);
     gl->id = id;
     gl->kgId = cloneString(sl->name);
     
     id++;
     geneLookupSaveToDb(biConn, gl, GL_TABLE, 100);
     geneLookupFree(&gl);
     }
 
 slNameFreeList(&slList);
 }
 
 void createProbeToGeneTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a probeToGene table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "probe_id int unsigned not null,\n"
     "gene_id int unsigned not null,\n"
     "KEY(probe_id),\n"
     "KEY(gene_id),\n"
     "KEY(probe_id,gene_id),\n"
     "KEY(gene_id,probe_id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 int getProbeId(struct sqlConnection *biConn, char *tableName, char *name)
 /* Return the id stored in tableName for the probe called name, or -1 when
  * the table has no such probe. */
 {
 char sql[128];
 safef(sql, sizeof(sql), "select id from %s where name = \"%s\"", tableName, name);
 
 if (sqlExists(biConn, sql))
     return sqlQuickNum(biConn, sql);
 
 return -1;
 }
 
 struct slInt *getGeneIdsBySymbol(struct sqlConnection *biConn, 
 				 struct slName *slList)
 {
 if (!slList)
     return NULL;
 
 struct slName *sl;
 struct dyString *dy = newDyString(100);
 dyStringPrintf(dy, "select id from %s "
 	       "join kgXref on %s.kgId = kgXref.kgId "
 	       "where kgXref.geneSymbol in (", GL_TABLE, GL_TABLE);
 for (sl = slList; sl; sl = sl->next)
     {
     dyStringPrintf(dy, "\"%s\"", sl->name);
     if (sl->next)
 	dyStringPrintf(dy, ",");
     }
 dyStringPrintf(dy, ");");
 char *query = dyStringCannibalize(&dy);
 
 return sqlQuickNumList(biConn, query);
 }
 
 
 void setupProbeToGene(struct sqlConnection *hgConn, 
                       struct sqlConnection *biConn, struct datasets *da)
 /* Build the probeToGene table of dataset da from its alias table.
  * Does nothing when da has no probe_to_gene_table.  Aborts when the
  * aliasTable setting is missing, when the alias table is absent from hgConn,
  * or when kgXref is absent from biConn.  The table is dropped first when
  * -dropAll is set, and it is only filled when this call created it.  Each
  * alias probe that exists in the dataset's probe table gets one row per gene
  * id matching its gene symbols. */
 {
 char *p2gTable = da->probe_to_gene_table;
 if (!p2gTable)
     {
     fprintf(stderr, "probeToGene table not set, doing nothing.\n");
     return;
     }
 
 struct hash *settings = getSettings(da->data_table);
 struct hashEl *el = hashLookup(settings, "aliasTable");
 if (!el)
     errAbort("No aliasTable.\n");
 char *aliasTable = el->val;   /* borrowed from the settings hash */
 
 if (!sqlTableExists(hgConn, aliasTable))
     errAbort("Table %s not found in hg18 database.\n", aliasTable);
 
 if (!sqlTableExists(biConn, "kgXref"))
     errAbort("kgXref table not found in database. Cannot create probeToGene table.\n");
 
 if (sqlTableExists(biConn, p2gTable) && dropTable)
     {
     fprintf(stderr, "Table %s already exists, dropping and recreating.\n", p2gTable);
     sqlDropTable(biConn, p2gTable);
     }
 
 boolean inputProbeToGene = FALSE;
 if (!sqlTableExists(biConn, p2gTable))
     {
     fprintf(stderr, "Creating probeToGene table...\n");
     createProbeToGeneTable(biConn, p2gTable);
     inputProbeToGene = TRUE;
     }
 
 if (!inputProbeToGene)
     return;
 
 struct geneAlias *ga, *gaList = getAliases(hgConn, aliasTable);
 for (ga = gaList; ga; ga = ga->next)
     {
     int probeId = getProbeId(biConn, da->probe_table, ga->probe);
     if (probeId < 0)  // probe in alias table doesn't exist in dataset
         continue;
 
     struct slInt *si, *geneIds = getGeneIdsBySymbol(biConn, ga->genes);
 
     struct probeToGene *pg;
     AllocVar(pg);
     pg->probe_id = probeId;
     for (si = geneIds; si; si = si->next)
         {
         pg->gene_id = si->val;
         probeToGeneSaveToDb(biConn, pg, p2gTable, 10);
         }
     probeToGeneFree(&pg);
 
     /* The id list is rebuilt for every probe; release it each time. */
     slFreeList(&geneIds);
     }
 }
 
 void createPathwaysTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a pathways table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "id int unsigned not null,\n"
     "name varchar(255) not null,\n"
     "source varchar(255) not null,\n"
     "KEY(id),\n"
     "KEY(name),\n"
     "KEY(id,name),\n"
     "KEY(name,id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 void createPathwayGenesTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a pathwayGenes table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "id int unsigned not null,\n"
     "gene_id int unsigned not null,\n"
     "KEY(id),\n"
     "KEY(gene_id),\n"
     "KEY(id,gene_id),\n"
     "KEY(gene_id,id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 void createPathwayInfoTable(struct sqlConnection *biConn, char *tableName)
 /* Issue the CREATE TABLE statement for a pathwayInfo table called tableName. */
 {
 struct dyString *sql = newDyString(1024);
 dyStringPrintf(sql,
     "CREATE TABLE %s (\n"
     "id int unsigned not null,\n"
     "description longblob not null,\n"
     "KEY(id)\n"
     ")\n",
     tableName);
 sqlUpdate(biConn, sql->string);
 dyStringFree(&sql);
 }    
 
 char *getPathwayDescription(struct sqlConnection *pdConn, char *name)
 /* Return the description stored for pathway name in the descriptions table
  * of pdConn, or NULL when that table is missing or (per sqlQuickString) the
  * query yields nothing. */
 {
 /* Guard on the table the query actually reads.  The previous check tested
  * for a table named after the pathway itself, which does not protect the
  * query below. */
 if (!sqlTableExists(pdConn, "descriptions"))
     return NULL;
 
 /* Sized generously: safef aborts on overflow and pathway names can be long. */
 char query[512];
 safef(query, sizeof(query), 
       "select description from descriptions where name = \"%s\";",
       name);
 
 return sqlQuickString(pdConn, query);
 }
 
 void setupPathways(struct sqlConnection *biConn)
 /* Create and fill the pathway tables (PA_TABLE, PG_TABLE, PI_TABLE) on
  * biConn from the 'genesets' table of the local 'pathway' database.
  * A table is (re)filled only if it was missing, or was just dropped because
  * the global dropTable flag is set; tables that already exist are left
  * alone.  Pathway ids are assigned by counting genesets rows from 0 in the
  * order the query returns them.  Aborts if the pathway database cannot be
  * opened. */
 {
 boolean inputPathways = FALSE;
 boolean inputPathwayInfo = FALSE;
 boolean inputPathwayGenes = FALSE;
 
 struct sqlConnection *pdConn = hAllocConnProfile("localDb", "pathway");
 if (!pdConn)
     errAbort("Could not connect to pathways database.\n");
 
 if (sqlTableExists(biConn, PA_TABLE) && dropTable)
     {
     fprintf(stderr, "pathways table already exists, dropping and recreating.\n");
     sqlDropTable(biConn, PA_TABLE);
     }
 if (!sqlTableExists(biConn, PA_TABLE))
     {
     fprintf(stderr, "Creating pathways table.\n");
     createPathwaysTable(biConn, PA_TABLE);
     inputPathways = TRUE;
     }
 
 if (sqlTableExists(biConn, PG_TABLE) && dropTable)
     {
     fprintf(stderr, "pathwayGenes table already exists, dropping and recreating.\n");
     sqlDropTable(biConn, PG_TABLE);
     }
 
 if (!sqlTableExists(biConn, PG_TABLE))
     {
     fprintf(stderr, "Creating pathwayGenes table.\n");
     createPathwayGenesTable(biConn, PG_TABLE);
     inputPathwayGenes = TRUE;
     }
 
 if (sqlTableExists(biConn, PI_TABLE) && dropTable)
     {
     fprintf(stderr, "pathwayInfo table already exists, dropping and recreating.\n");
     sqlDropTable(biConn, PI_TABLE);
     }
 
 if (!sqlTableExists(biConn, PI_TABLE))
     {
     fprintf(stderr, "Creating pathwayInfo table.\n");
     createPathwayInfoTable(biConn, PI_TABLE);
     inputPathwayInfo = TRUE;
     }
 
 if (!inputPathways && !inputPathwayInfo && !inputPathwayGenes)
     {
     fprintf(stderr, "Nothing to do for pathway tables.\n");
     /* Release the pathway connection on this path too. */
     hFreeConn(&pdConn);
     return;
     }
 
 /* Setting up pathways table */
 struct sqlResult *sr = sqlGetResult(pdConn, "select * from genesets;");
 char **row = NULL;
 
 /* Save all data in lists to avoid "out of sync" error when attempting
  * query inside of a running query on same db.  Column 0 is used as the
  * pathway name and column 1 as a comma-separated list of gene symbols. */
 struct slName *na, *names = NULL;
 struct slName *ge, *genes = NULL;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     slNameAddHead(&names, row[0]);
     slNameAddHead(&genes, row[1]);
     }
 slReverse(&names);
 slReverse(&genes);
 sqlFreeResult(&sr);
 
 int id = 0;
 for (na = names, ge = genes; na && ge; na = na->next, ge = ge->next)
     {
     char *name = na->name;
     char *geneSymbols = ge->name;   /* distinct name: do not shadow the 'genes' list */
 
     struct slName *slList = slNameListFromComma(geneSymbols);
     struct slInt *si, *siList = getGeneIdsBySymbol(biConn, slList); 
 
     if (inputPathways)
         {
         struct pathways *ps;
         AllocVar(ps);
         ps->id = id;
         ps->name = cloneString(name);
         ps->source = cloneString("N/A");
         pathwaysSaveToDb(biConn, ps, PA_TABLE, 100);
         pathwaysFree(&ps);
         }
 
     if (inputPathwayGenes)
         {
         /* One record is reused for every gene of this pathway; only
          * gene_id changes between saves. */
         struct pathwayGenes *pg;
         AllocVar(pg);
         pg->id = id;
         for (si = siList; si; si = si->next)
             {
             pg->gene_id = si->val;
             pathwayGenesSaveToDb(biConn, pg, PG_TABLE, 100);
             }
         pathwayGenesFree(&pg);
         }
     
     if (inputPathwayInfo)
         {
         char *desc = getPathwayDescription(pdConn, name);
         if (desc)
             {
             struct pathwayInfo *pi;
             AllocVar(pi);
             pi->id = id;
             /* Ownership of desc moves into pi; presumably released by
              * pathwayInfoFree() -- confirm against bioIntDb. */
             pi->description = desc;
             pathwayInfoSaveToDbEscaped(biConn, pi, PI_TABLE, 200);
             pathwayInfoFree(&pi);
             }
         }
 
     /* Per-pathway work lists are no longer needed. */
     slFreeList(&slList);
     slFreeList(&siList);
     id++;
     }
 
 slFreeList(&names);
 slFreeList(&genes);
 hFreeConn(&pdConn);
 }
 
 
 void populateDb(char *db, char *tableName, char *tissue)
 /* Load one dataset into the bioInt database 'db'.
  *   db        - database opened (profile "localDb") for all bioInt tables
  *   tableName - dataset table; its maGrouping is looked up through the
  *               hgDb connection and the run aborts if none is found
  *   tissue    - tissue label, passed through strLower() before use
  * Runs, in order: geneLookup setup, pathway tables, the datasets entry,
  * samples, clinical data, probe data and probeToGene, printing the time
  * taken by each step via uglyTime(). */
 {
 tissue = strLower(tissue);
 struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
 struct sqlConnection *hgConn = hAllocConnProfile("localDb", hgDb);
 
 /* Create geneLookup table (if necessary) */
 uglyTime(NULL);
 fprintf(stderr, "Setting up geneLookup table...\n");
 createGeneLookup(biConn);
 uglyTime("Time");
 
 /* Set up pathways */
 uglyTime(NULL);
 fprintf(stderr, "Setting up pathways tables...\n");
 setupPathways(biConn);
 uglyTime("Time");
 
 /* Set up datasets entry */
 struct maGrouping *allA = getMaGrouping(hgConn, tableName);
 if (!allA)
     errAbort("Could not find maGrouping for %s!", tableName);
 
 uglyTime(NULL);
 int numSamples = allA->size;
 fprintf(stderr, "Adding datasets entry...\n");
 struct datasets *da = setupDataset(biConn, tableName, tissue, numSamples);
 uglyTime("Time");
 
 /* Set up samples entries */
 uglyTime(NULL);
 fprintf(stderr, "Adding to samples table...\n");
 struct samples *saList = setupSamples(biConn, da, allA);
 uglyTime("Time");
 
 /* Set up features and clinicalData */
 uglyTime(NULL);
 fprintf(stderr, "Setting up clinical data tables...\n");
 setupClinicalInfo(biConn, da, saList);
 uglyTime("Time");
 
 /* Set up probeInfo table (if necessary) and probeVals table */
 uglyTime(NULL);
 fprintf(stderr, "Setting up probe data tables (be patient!)...\n");
 setupProbeData(hgConn, biConn, da);
 uglyTime("Time");
 
 /* Set up probeToGene table (if necessary) */
 uglyTime(NULL);
 fprintf(stderr, "Setting up probeToGene table...\n");
 setupProbeToGene(hgConn, biConn, da);
 uglyTime("Time");
 
 /* Terminate the line so the follow-up hint starts on its own line. */
 fprintf(stderr, "Done!\n");
 fprintf(stderr, "Please run 'setCohort' to find datasets that have overlapping samples.\n");
 
 hFreeConn(&biConn);
 hFreeConn(&hgConn);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line: parse options, require exactly three positional
  * arguments (db, table, tissue per the usage text), set the global flags
  * from -dropAll and -tcga, then run populateDb(). */
 {
 optionInit(&argc, argv, options);
 /* Program name plus three arguments; presumably optionInit() has already
  * removed recognised options from argv -- confirm against options.h. */
 if (argc != 4)
     usage();
 
 /* Both flags default to FALSE and are only switched on by their option. */
 dropTable = FALSE;
 if (optionExists("dropAll"))
     dropTable = TRUE;
+if (optionExists("tcga"))
+    isTCGA = TRUE;
 
 populateDb(argv[1], argv[2], argv[3]);
 return 0;
 }