9b1292002a4ede647cfea3d0a357c184d6fe557a
angie
  Wed Sep 14 08:31:30 2022 -0700
SARS-CoV-2 protobuf size is now larger than BIGNUM; use BIGLONGLONG to avoid cryptic error.  refs #29994

diff --git src/hg/hgPhyloPlace/parsimonyProto.c src/hg/hgPhyloPlace/parsimonyProto.c
index a55ab68..adaad5c 100644
--- src/hg/hgPhyloPlace/parsimonyProto.c
+++ src/hg/hgPhyloPlace/parsimonyProto.c
@@ -1,275 +1,276 @@
 /* Parse a protobuf-serialized file created by Yatish Turakhia's usher program
  * with the --save-mutation-annotated-tree into a phylogenetic tree annotated with SNVs.
  * https://github.com/yatisht/usher/ file parsimony.proto defines the file's data structure. */
 
 /* Copyright (C) 2020 The Regents of the University of California */
 
 #include "common.h"
 #include "hash.h"
 #include "obscure.h"
 #include "parsimonyProto.h"
 #include "protobuf.h"
 
 /* Data structures corresponding to Yatish's parsimony.proto spec */
 // struct singleNucChange defined in parsimonyProto.h corresponds to mutation message
 
 struct mutationList
 /* Data for one node in a phylogenetic tree: a list of single-nucleotide mutations
  * associated with this node by parsimonious assignment. */
     {
     struct mutationList *next;
     struct singleNucChange *muts;
     };
 
 struct parsimonyData
 /* A Newick format string encoding a tree, and an array of struct mutationList, one per node
  * in the tree (in order of left paren in Newick string). */
     {
     char *newick;                        // Tree in Newick/"New Hampshire" (.nh) format.
     struct hash *condensedNodes;         // Condensed node names --> list of sample names
     struct mutationList *nodeMutations;  // List (in DFS order) of per-node mutation lists.
     int nodeCount;                       // Number of nodes in tree
     };
 
 static char intToNt(int val)
 /* [0123] --> [ACGT] */
 {
 static char map[5] = "ACGT";
 if (val < 0 || val > 3)
     errAbort("intToNt: expected input to be between 0 and 3 but got %d", val);
 return map[val];
 }
 
 struct singleNucChange *sncNew(int chromStart, char refBase, char parBase, char newBase)
 /* Alloc and return a new singleNucChange. */
 {
 struct singleNucChange *snc;
 AllocVar(snc);
 snc->chromStart = chromStart;
 snc->refBase = toupper(refBase);
 snc->parBase = toupper(parBase);
 snc->newBase = toupper(newBase);
 return snc;
 }
 
 struct parsimonyData *parsimonyDataFromProtobuf(struct protobuf *data)
 /* Unpack the parsed protobuf into friendlier data structures (also converting [0123] to [ACGT]. */
 {
 struct parsimonyData *parData;
 AllocVar(parData);
 struct mutationList *nodeList = NULL;
 struct hash *condensedNodes = hashNew(0);
 struct protobufField *field;
 int nodeIx = 0;
 for (field = data->fields;  field != NULL;  field = field->next)
     {
     if (sameString(field->def->name, "newick"))
         parData->newick = field->value.vString;
     else if (sameString(field->def->name, "node_mutations"))
         {
         struct mutationList *mutList;
         AllocVar(mutList);
         int mutListIx;
         for (mutListIx = 0;  mutListIx < field->length;  mutListIx++)
             {
             struct protobuf *mutListMessage = field->value.vEmbedded[mutListIx];
             struct protobufField *mutListField;
             for (mutListField = mutListMessage->fields;  mutListField != NULL;
                  mutListField = mutListField->next)
                 {
                 if (differentString(mutListField->def->name, "mutation"))
                     errAbort("Expected field name mutation, got '%s'", mutListField->def->name);
                 int mutIx;
                 for (mutIx = 0;  mutIx < mutListField->length;  mutIx++)
                     {
                     struct singleNucChange *snc;
                     AllocVar(snc);
                     struct protobuf *mutMessage = mutListField->value.vEmbedded[mutIx];
                     struct protobufField *mutField;
                     for (mutField = mutMessage->fields;  mutField != NULL;
                          mutField = mutField->next)
                         {
                         char *mutFieldName = mutField->def->name;
                         if (sameString(mutFieldName, "position"))
                             snc->chromStart = mutField->value.vInt32 - 1;
                         else if (sameString(mutFieldName, "ref_nuc"))
                             snc->refBase = intToNt(mutField->value.vInt32);
                         else if (sameString(mutFieldName, "par_nuc"))
                             snc->parBase = intToNt(mutField->value.vInt32);
                         else if (sameString(mutFieldName, "mut_nuc"))
                             {
                             if (mutField->length != 1)
                                 warn("node %d, pos %d: expected exactly one mut_nuc but got %d "
                                      "(ref %c)\n",
                                      nodeIx, snc->chromStart+1, mutField->length, snc->refBase);
                             snc->newBase = intToNt(mutField->value.vPacked[0].vInt32);
                             }
                         else if (sameString(mutFieldName, "chromosome"))
                             {
                             // Ignore -- not applicable for SARS-CoV-2
                             }
                         else
                             errAbort("Unrecognized field name '%s' in mutation message",
                                      mutFieldName);
                         }
                     // Int fields whose value is 0 are omitted
                     if (snc->refBase == '\0')
                         snc->refBase = intToNt(0);
                     if (snc->parBase == '\0')
                         snc->parBase = intToNt(0);
                     if (snc->newBase == '\0')
                         warn("node %d, pos %d: How did we get a node with a mutation message "
                              "but no mut_nuc field?",
                              nodeIx, snc->chromStart+1);
                     slAddHead(&mutList->muts, snc);
                     }
                 }
             slReverse(&mutList->muts);
             }
         nodeIx++;
         slAddHead(&nodeList, mutList);
         }
     else if (sameString(field->def->name, "condensed_nodes"))
         {
         int cnIx;
         for (cnIx = 0;  cnIx < field->length;  cnIx++)
             {
             struct protobuf *condNodeMessage = field->value.vEmbedded[cnIx];
             char *cnName = NULL;
             struct protobufField *cnField;
             for (cnField = condNodeMessage->fields;  cnField != NULL;  cnField = cnField->next)
                 {
                 if (sameString(cnField->def->name, "node_name"))
                     {
                     if (cnName != NULL)
                         errAbort("Multiple node_name messages in condensed_node (%s, %s)",
                                  cnName, cnField->value.vString);
                     cnName = cnField->value.vString;
                     }
                 else if (sameString(cnField->def->name, "condensed_leaves"))
                     {
                     if (cnName == NULL)
                         errAbort("Need more complex parsing code: expected node_name field to "
                                  "precede condensed_leaves field, but it did not.");
                     struct hashEl *hel = hashLookup(condensedNodes, cnName);
                     if (hel)
                         {
                         struct slName *clList = hel->val;
                         slNameAddTail(&clList, cnField->value.vString);
                         }
                     else
                         hashAdd(condensedNodes, cnName, slNameNew(cnField->value.vString));
                     }
                 else
                     errAbort("Unexpected field name '%s' in condensed_nodes message",
                              cnField->def->name);
                 }
             }
         }
     else
         errAbort("Unexpected field name '%s' in top-level message", field->def->name);
     }
 slReverse(&nodeList);
 parData->condensedNodes = condensedNodes;
 // Make nodeMutations array from nodeList.
 parData->nodeCount = slCount(nodeList);
 AllocArray(parData->nodeMutations, parData->nodeCount);
 struct mutationList *nodeMuts;
 int i;
 for (i = 0, nodeMuts = nodeList;  i < parData->nodeCount;  i++, nodeMuts = nodeMuts->next)
     parData->nodeMutations[i] = *nodeMuts;
 return parData;
 }
 
 static void treeAddVariants(struct phyloTree *node, struct mutationList *nodeMutations,
                             int nodeCount, int *pNodeNum, int *pINodeNum, struct hash *nodeHash)
 /* Annotate the nodes of tree from the array of per-node mutations -- use node->priv
  * to store the variants.  Also, if a node has no name, add the internal node number as the name. */
 {
 if (*pNodeNum < 0 || *pNodeNum >= nodeCount)
     errAbort("treeAddVariants: bad nodeNum %d (must be between 0 and %d)", *pNodeNum, nodeCount-1);
 node->priv = nodeMutations[*pNodeNum].muts;
 // Add internal node number as node name -- helps match up with mutation paths from usher.
 if (isEmpty(node->ident->name) && node->numEdges > 0)
     {
     char iNodeName[16];
     safef(iNodeName, sizeof iNodeName, "%d", *pINodeNum);
     node->ident->name = cloneString(iNodeName);
     }
 (*pNodeNum)++;
 if (node->numEdges > 0)
     (*pINodeNum)++;
 int i;
 for (i = 0;  i < node->numEdges;  i++)
     treeAddVariants(node->edges[i], nodeMutations, nodeCount, pNodeNum, pINodeNum, nodeHash);
 if (isNotEmpty(node->ident->name))
     hashAdd(nodeHash, node->ident->name, node);
 }
 
 struct mutationAnnotatedTree *parseParsimonyProtobuf(char *savedAssignmentsFile)
 /* Return an annotated phylogenetic tree loaded from savedAssignments file.  Each node->priv
  * points to struct singleNucChange list (NULL if no change is associated with that node).
  * condensedNodes is a hash mapping names of condensed nodes to slName lists of
  * sample IDs that were condensed.  nodeHash is a hash mapping node names to nodes. */
 {
 FILE *f = mustOpen(savedAssignmentsFile, "r");
 
 // Hand-compiled from ~angie/github/yatish_strain_phylogenetics/parsimony.proto Aug. 10 2020...
 // message mut {
 //     int32 position = 1;
 //     int32 ref_nuc = 2;
 //     int32 par_nuc = 3;
 //     repeated int32 mut_nuc = 4;
 // }
 struct protobufFieldDef chromField = { NULL, "chromosome", 5, pbdtString, NULL, FALSE };
 struct protobufFieldDef mutNucField = { &chromField, "mut_nuc", 4, pbdtInt32, NULL, TRUE };
 struct protobufFieldDef parNucField = { &mutNucField, "par_nuc", 3, pbdtInt32, NULL, FALSE };
 struct protobufFieldDef refNucField = { &parNucField, "ref_nuc", 2, pbdtInt32, NULL, FALSE };
 struct protobufFieldDef positionField = { &refNucField, "position", 1, pbdtInt32, NULL, FALSE };
 struct protobufDef mutDef = { NULL, "mut", &positionField };
 // message mutation_list {
 //     repeated mut mutation = 1;
 // }
 struct protobufFieldDef mutationField = { NULL, "mutation", 1, pbdtEmbedded, &mutDef, TRUE };
 struct protobufDef mutListDef = { NULL, "mutation_list", &mutationField };
 // message condensed_node {
 //     string node_name = 1;
 //     repeated string condensed_leaves = 2;
 // }
 struct protobufFieldDef condLeavesField = { NULL, "condensed_leaves", 2, pbdtString, NULL, TRUE };
 struct protobufFieldDef nodeNameField = { &condLeavesField, "node_name", 1, pbdtString, NULL, FALSE};
 struct protobufDef condNodeDef = { NULL, "condensed_node", &nodeNameField };
 // message data {
 //     string newick = 1;
 //     repeated mutation_list node_mutations = 2; 
 //     repeated condensed_node condensed_nodes = 3;
 // }
 struct protobufFieldDef condNodeField = { NULL, "condensed_nodes", 3, pbdtEmbedded, &condNodeDef,
                                           TRUE };
 struct protobufFieldDef nodeMutsField = { &condNodeField, "node_mutations", 2, pbdtEmbedded,
                                           &mutListDef, TRUE };
 struct protobufFieldDef newickField = { &nodeMutsField, "newick", 1, pbdtString, NULL, FALSE };
 struct protobufDef def = { NULL, "data", &newickField };
 // Parse the protobuf file into protobuf data structures
-long long bytesLeft = BIGNUM;
+long long bytesLeft = BIGLONGLONG;
 struct protobuf *data = protobufParse(f, &def, &bytesLeft);
 if (fgetc(f) != EOF)
-    errAbort("Why did we not get EOF starting out with BIGNUM?");
+    errAbort("Protobuf size exceeded %lx bytes, please report this error to "
+             "genome-www@soe.ucsc.edu.", BIGLONGLONG);
 carefulClose(&f);
 // Convert the protobuf data structures into friendlier data structures
 struct parsimonyData *parData = parsimonyDataFromProtobuf(data);
 if (parData == NULL)
     return NULL;
 struct mutationAnnotatedTree *tree;
 AllocVar(tree);
 tree->tree = phyloParseString(parData->newick);
 // Annotate the tree (->priv) with parsimonious assignments and single-nucleotide changes.
 tree->nodeHash = hashNew(digitsBaseTwo(parData->nodeCount) + 1);
 int nodeNum = 0, iNodeNum = 1;
 treeAddVariants(tree->tree, parData->nodeMutations, parData->nodeCount, &nodeNum, &iNodeNum,
                 tree->nodeHash);
 tree->condensedNodes = parData->condensedNodes;
 // Mem leak since we don't free up protobuf data or packaging of singleNucChanges,
 // but we only do this once.
 return tree;
 }