src/hg/lib/annoStreamDb.c 06845edb9038bb632170891cebcdc8f477f2ea8d

06845edb9038bb632170891cebcdc8f477f2ea8d
angie
  Fri Dec 4 08:59:38 2015 -0800
Add dbNSFP v3.1a (including VEST scores) to hg38 for hgVai.
Add hgVai options for adding in transcript status info: GENCODE tags
when applicable, knownCanonical for knownGene, refSeqStatus for refGene.
refs #16502, #16503

diff --git src/hg/lib/annoStreamDb.c src/hg/lib/annoStreamDb.c
index dad9640..f68f39d 100644
--- src/hg/lib/annoStreamDb.c
+++ src/hg/lib/annoStreamDb.c
@@ -52,50 +52,50 @@
     struct slName *queryChrom;		// most recently queried chrom for whole-genome (or NULL)
     boolean eof;			// TRUE when we are done (maxItems or no more items)
     boolean needQuery;			// TRUE when we haven't yet queried, or need to query again
     boolean doNextChunk;		// TRUE if rowBuf ends before end of chrom/region
     uint nextChunkStart;		// Start coord for next chunk of rows to query
 
     // Info for joining in related tables/fields
     struct joinerDtf *mainTableDtfList; // Fields from the main table to include in output
     struct joinerDtf *relatedDtfList;	// Fields from related tables to include in output
     struct joiner *joiner;		// Parsed all.joiner schema
     struct joinMixer *joinMixer;	// Plan for joining related tables using sql and/or hash
 					// (NULL if no joining is necessary)
     uint sqlRowSize;			// Number of columns from sql query (may include related)
     uint bigRowSize;			// Number of columns from sql + joinMixer->hashJoins
     boolean hasLeftJoin;		// If we have to use 'left join' we'll have to 'order by'.
+    boolean naForMissing;		// If true, insert "n/a" for missing related table values
+					// to match hgTables.
 
     struct rowBuf
     // Temporary storage for rows from chunked query
         {
 	struct lm *lm;			// storage for rows
 	char ***buf;			// array of pointers to rows
 	int size;			// number of rows
 	int ix;				// offset in buffer, [0..size]
         } rowBuf;
 
     char **(*nextRowRaw)(struct annoStreamDb *self);
     // Depending on query style, use either sqlNextRow or temporary row storage to get next row.
     // This may return NULL but set self->needQuery; asdNextRow watches for that.
 
     void (*doQuery)(struct annoStreamDb *self, char *minChrom, uint minEnd);
     // Depending on query style, perform either a single query or (series of) chunked query
     };
 
-static boolean naForMissing = TRUE;	// This should be made into a UI option.
-
 //#*** TODO: make a way to pass the filter with dtf into annoStreamDb.
 
 struct annoFilterDb
     // Filter record that pairs a struct annoFilter with a {database, table, field} locator.
     // annoFilter has columnIx which works fine for all fields of main table,
     // but for joining filters we will need dtf to say which related-table field is meant.
     {
     struct annoFilter filter;            // parent class
     struct joinerDtf *dtf;               // {database, table, field} in case this is from
                                          // some table to be joined with the main table
     };
 
 // Row limit applied to each query (the "chunk" size).
 // For performance reasons, even if !useMaxItems (no limit), we need to limit the
 // number of rows that are returned from a query, so we can slurp them into memory and
 // close the sqlResult before mysql gets unhappy about the result being open so long.
 #define ASD_CHUNK_SIZE 100000
@@ -246,31 +246,31 @@
 return listOut;
 }
 
 static void asdMakeBaselineQuery(struct annoStreamDb *self)
 /* Build a dy SQL query with no position constraints (select ... from ...)
  * possibly including joins and filters if specified (where ...). */
 {
 if (self->relatedDtfList)
     {
     struct joinerDtf *outputFieldList = slCat(joinerDtfCloneList(self->mainTableDtfList),
                                               joinerDtfCloneList(self->relatedDtfList));
     if (self->joiner == NULL)
         self->joiner = joinerRead(JOINER_FILE);
     int expectedRows = sqlRowCount(self->conn, self->table);
     self->joinMixer = joinMixerNew(self->joiner, self->db, self->table, outputFieldList,
-                                   expectedRows, naForMissing);
+                                   expectedRows, self->naForMissing);
     self->sqlRowSize = slCount(self->joinMixer->sqlFieldList);
     self->bigRowSize = self->joinMixer->bigRowSize;
     joinerDtfFreeList(&outputFieldList);
     }
 else
     {
     self->sqlRowSize = slCount(self->mainTableDtfList);
     self->bigRowSize = self->sqlRowSize;
     }
 struct dyString *query = sqlDyStringCreate("select ");
 appendFieldList(self, query);
 dyStringAppend(query, " from ");
 self->hasLeftJoin = appendTableList(self, query);
 boolean hasWhere = FALSE;
 self->baselineQuery = dyStringCannibalize(&query);
@@ -699,31 +699,31 @@
 
 static struct annoRow *rowToAnnoRow(struct annoStreamDb *self, char **row, boolean rightFail,
 				    struct lm *lm)
 /* Extract coords from row and return an annoRow including right-fail status.
  * Without a joinMixer, the output row is simply row offset by self->omitBin columns.
  * With a joinMixer, each of the streamer's numCols output columns is picked out of row
  * using joinMixer->outIxs, and a NULL value is replaced by "n/a" or "" according to
  * self->naForMissing.
  * chrom/start/end are read from the output row at self->chromIx/startIx/endIx, and the
  * annoRow is built by annoRowFromStringArray, which is handed lm. */
 {
 char **finalRow = row + self->omitBin;
 uint numCols = self->streamer.numCols;
 // Stack scratch array for the rearranged row.
 // NOTE(review): assumes annoRowFromStringArray copies what it needs before we return -- confirm.
 char *swizzleRow[numCols];
 if (self->joinMixer)
     {
     uint i;
     for (i = 0;  i < numCols;  i++)
         {
         // outIxs is indexed with the omitBin offset applied; outIx indexes row, not finalRow.
         uint outIx = self->joinMixer->outIxs[i+self->omitBin];
         if (row[outIx] == NULL)
-            swizzleRow[i] = naForMissing ? "n/a" : "";
+            swizzleRow[i] = self->naForMissing ? "n/a" : "";
         else
             swizzleRow[i] = row[outIx];
         }
     finalRow = swizzleRow;
     }
 char *chrom = finalRow[self->chromIx];
 uint chromStart = sqlUnsigned(finalRow[self->startIx]);
 uint chromEnd = sqlUnsigned(finalRow[self->endIx]);
 return annoRowFromStringArray(chrom, chromStart, chromEnd, rightFail, finalRow, numCols, lm);
 }
 
 static char **getFinestBinItem(struct annoStreamDb *self, char **row, boolean *pRightFail,
 			       char *minChrom, uint minEnd)
 /* If row is a coarse-bin item, add it to bigItemQueue, get the next row(s) and
  * add any subsequent coarse-bin items to bigItemQueue.  As soon as we get an item from a
@@ -898,76 +898,84 @@
     }
 else
     asObj = hel->val;
 return asObj;
 }
 
 static void makeDottedTriple(char *dtfString, size_t dtfStringSize,
                              char *db, char *table, char *field)
 /* In case we don't have a struct joinerDtf for a field that we want to look up,
  * but we do have the db, table and field name, concat with dots into dtfString.
  * Unlike joinerDtfToSqlFieldString, don't bother checking whether db is the main db.
  * dtfString is the output buffer of dtfStringSize bytes; it receives "db.table.field". */
 {
 safef(dtfString, dtfStringSize, "%s.%s.%s", db, table, field);
 }
 
+char *annoStreamDbColumnNameFromDtf(char *db, char *mainTable, struct joinerDtf *dtf)
+/* Return a string with the autoSql column name that would be assigned according to dtf's
+ * db, table and field.
+ * If dtf's table and database are the same as mainTable and db, the name is just dtf->field.
+ * Otherwise it is the string written by joinerDtfToSqlFieldString with every '.' replaced
+ * by '_'.
+ * The result is a cloneString() copy of a local buffer -- presumably heap-allocated, so the
+ * caller should free it when done (TODO confirm against cloneString's contract). */
+{
+char colName[PATH_LEN*2];
+if (differentString(dtf->table, mainTable) || differentString(dtf->database, db))
+    {
+    joinerDtfToSqlFieldString(dtf, db, colName, sizeof(colName));
+    // asParse rejects names that have '.' in them, which makes sense because it's for SQL,
+    // so replace the '.'s with '_'s.
+    subChar(colName, '.', '_');
+    }
+else
+    safecpy(colName, sizeof(colName), dtf->field);
+return cloneString(colName);
+}
+
 static void addOneColumn(struct dyString *dy, struct joinerDtf *dtf, char *db, char *mainTable,
                          struct asColumn *col, struct hash *dtfNames)
 /* Append an autoSql text line describing col to dy.
  * If col is an array whose size is some other column that has not yet been added,
  * coerce its type to lstring to avoid asParseText errAbort.
  * dtf gives col's {database, table, field}; db and mainTable decide how the column is
  * named (see annoStreamDbColumnNameFromDtf).  dtfNames collects the "db.table.field"
  * key of every column added so far, so a later array column can check whether its
  * linked size column is already present. */
 {
 // First see if this col depends on a linked size column that hasn't been added yet.
 boolean sizeColIsMissing = FALSE;
 if (col->isArray && !col->fixedSize && isNotEmpty(col->linkedSizeName))
     {
     // col's size comes from another column -- has that column already been added?
     char linkedDtfString[PATH_LEN];
     makeDottedTriple(linkedDtfString, sizeof(linkedDtfString),
                      dtf->database, dtf->table, col->linkedSizeName);
     if (!hashLookup(dtfNames, linkedDtfString))
         sizeColIsMissing = TRUE;
     }
 if (col->isArray && sizeColIsMissing)
     {
     // The size column is missing, so this can't be a valid array in autoSql --
     // ignore col->lowType and call it a (comma-separated) string.
-    dyStringAppend(dy, "    string");
+    dyStringAppend(dy, "    lstring");
     }
 else
     {
     dyStringPrintf(dy, "    %s", col->lowType->name);
     if (col->isArray)
         {
         dyStringAppendC(dy, '[');
         if (col->fixedSize)
             dyStringPrintf(dy, "%d", col->fixedSize);
         else
             dyStringAppend(dy, col->linkedSizeName);
         dyStringAppendC(dy, ']');
         }
     }
-char colName[PATH_LEN];
-if (differentString(dtf->table, mainTable) || differentString(dtf->database, db))
-    {
-    joinerDtfToSqlFieldString(dtf, db, colName, sizeof(colName));
-    // asParse rejects names that have '.' in them, which makes sense because it's for SQL,
-    // so replace the '.'s with '_'s.
-    subChar(colName, '.', '_');
-    }
-else
-    safecpy(colName, sizeof(colName), col->name);
+char *colName = annoStreamDbColumnNameFromDtf(db, mainTable, dtf);
 dyStringPrintf(dy, "  %s; \"%s\"\n", colName, col->comment);
+// colName is a cloneString() copy that we own; its text is now in dy, so release it
+// here instead of leaking one string per column.
+freeMem(colName);
 // Store plain old dotted triple in dtfNames in case we need to look it up later.
 char dtfString[PATH_LEN];
 makeDottedTriple(dtfString, sizeof(dtfString), dtf->database, dtf->table, dtf->field);
 hashAdd(dtfNames, dtfString, NULL);
 }
 
 static struct asObject *asdAutoSqlFromTableFields(struct annoStreamDb *self,
                                                   struct asObject *mainAsObj)
 /* Get autoSql for each table in self->relatedDtfList and append the columns
  * included in self->relatedDtfList to the main table asObj columns. */
 {
 struct dyString *newAsText = dyStringCreate("table %sCustom\n"
                                             "\"query based on %s with customized fields.\"\n"
                                             "    (",
@@ -1066,31 +1074,32 @@
 static boolean isPubsTable(char *table)
 // Return TRUE if table's name starts with "pubs".  This is only a name-based heuristic:
 // Not absolutely every pubs* table is unsorted, but most of them are.
 {
 return startsWith("pubs", table);
 }
 
 static struct asObject *asdParseConfig(struct annoStreamDb *self, struct jsonElement *configEl)
 /* Extract the autoSql for self->table from the database.
  * If configEl is not NULL, expect it to be a description of related tables and fields like this:
  * config = { "relatedTables": [ { "table": "hg19.kgXref",
  *                                 "fields": ["geneSymbol", "description"] },
  *                               { "table": "hg19.knownCanonical",
  *                                 "fields": ["clusterId"] }
  *                             ] }
  * If so, unpack the [db.]tables and fields into self->relatedDtfList and append autoSql
- * column descriptions for each field to the autoSql object that describes our output. */
+ * column descriptions for each field to the autoSql object that describes our output.
+ * It might also have "naForMissing": true/false; if so, set self->naForMissing. */
 {
 //#*** TODO: hAnnoGetAutoSqlForDbTable should do its own split-table checking
 char maybeSplitTable[HDB_MAX_TABLE_STRING];
 if (!hFindSplitTable(self->db, NULL, self->table, maybeSplitTable, NULL))
     errAbort("annoStreamDbNew: can't find table (or split table) for '%s.%s'",
              self->db, self->table);
 struct asObject *asObj = hAnnoGetAutoSqlForDbTable(self->db, maybeSplitTable, NULL, TRUE);
 makeMainTableDtfList(self, asObj);
 if (configEl != NULL)
     {
     struct hash *config = jsonObjectVal(configEl, "config");
     struct jsonElement *relatedTablesEl = hashFindVal(config, "relatedTables");
     if (relatedTablesEl)
         {
         // relatedTables is a list of objects whose keys are related [db.]table names
@@ -1112,30 +1121,33 @@
                 {
                 struct jsonElement *fieldListEl = hashMustFindVal(tfObj, "fields");
                 struct slRef *fieldList = jsonListVal(fieldListEl, "fieldList");
                 struct slRef *fieldRef;
                 for (fieldRef = fieldList;  fieldRef != NULL;  fieldRef = fieldRef->next)
                     {
                     struct jsonElement *fieldEl = fieldRef->val;
                     char *tfField = jsonStringVal(fieldEl, "field");
                     slAddHead(&self->relatedDtfList, joinerDtfNew(tfDb, tfTable, tfField));
                     }
                 }
             }
         slReverse(&self->relatedDtfList);
         asObj = asdAutoSqlFromTableFields(self, asObj);
         }
+    struct jsonElement *naForMissingEl = hashFindVal(config, "naForMissing");
+    if (naForMissingEl != NULL)
+        self->naForMissing = jsonBooleanVal(naForMissingEl, "naForMissing");
     }
 return asObj;
 }
 
 // Prefix (note the trailing space) that sqlExplain skips over at the start of a query
 // string before echoing the query.
 // Why isn't this in jksql.h?
 #define NOSQLINJ "NOSQLINJ "
 
 static char *sqlExplain(struct sqlConnection *conn, char *query)
 /* For now, just turn the values back into a multi-line "#"-comment string. */
 {
 char *trimmedQuery = query;
 if (startsWith(NOSQLINJ, trimmedQuery))
     trimmedQuery = trimmedQuery + strlen(NOSQLINJ);
 struct dyString *dy = dyStringCreate("# Output of 'explain %s':\n", trimmedQuery);
 char explainQuery[PATH_LEN*8];