21e42695bd70f58fcd48b6080764a5c2ddad907a
angie
  Wed Apr 7 19:50:25 2021 -0700
When a row has a million columns, throwing words[] on the stack just won't do.  Allocate the array once and reuse it for all rows.

diff --git src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c
index a43f84b..342b879 100644
--- src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c
+++ src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c
@@ -24,40 +24,43 @@
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 void vcfRenameAndPrune(char *vcfInFile, char *renamingFile, char *vcfOutFile)
 /* vcfRenameAndPrune - Rename or remove samples from VCF with genotypes. */
 {
 struct hash *renaming = hashTwoColumnFile(renamingFile);
 struct lineFile *lf = lineFileOpen(vcfInFile, TRUE);
 FILE *outF = mustOpen(vcfOutFile, "w");
 int headerColCount = 0;
 int keeperCountMax = hashNumEntries(renaming);
 int keeperColumns[keeperCountMax];
 int keeperCount = 0;
 int keeperIx = 0;
+// VCF with >1M samples (for SARS-CoV-2) causes stack problems / SEGV if we declare words on stack,
+// so allocate it once we know how many columns to expect:
+char **words = NULL;
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     if (startsWith("#CHROM", line))
         {
         // Parse & replace sample names, build array of genotype columns that we're keeping
         headerColCount = chopString(line, "\t", NULL, 0);
         lineFileExpectAtLeast(lf, VCF_NUM_COLS_BEFORE_GENOTYPES+1, headerColCount);
-        char *words[headerColCount];
-        chopTabs(line, words);
+        AllocArray(words, headerColCount+1);
+        chopByChar(line, '\t', words, headerColCount+1);
         fputs(words[0], outF);
         int i;
         for (i = 1;  i < VCF_NUM_COLS_BEFORE_GENOTYPES;  i++)
             fprintf(outF, "\t%s", words[i]);
         for (i = VCF_NUM_COLS_BEFORE_GENOTYPES;  i < headerColCount;  i++)
             {
             char *newName = hashFindVal(renaming, words[i]);
             if (newName)
                 {
                 fprintf(outF, "\t%s", newName);
                 if (keeperIx >= keeperCountMax)
                     lineFileAbort(lf, "Too many matching names in #CHROM line -- "
                                   "duplicate values in input? "
                                   "(%d values in renaming, too many matching names at column %d",
                                   keeperCountMax, i);
@@ -69,32 +72,31 @@
         verbose(2, "Found %d keepers (out of %d genotype columns and %d entries in %s)\n",
                 keeperCount, headerColCount - VCF_NUM_COLS_BEFORE_GENOTYPES, keeperCountMax,
                 renamingFile);
         }
     else if (line[0] == '#')
         {
         // Pass through other header lines
         fputs(line, outF);
         fputc('\n', outF);
         }
     else
         {
         // Data line: print out only the genotype columns that we're keeping
         if (headerColCount == 0)
             lineFileAbort(lf, "Missing #CHROM header line -- can't rename.");
-        char *words[headerColCount+1];
-        int wordCount = chopTabs(line, words);
+        int wordCount = chopByChar(line, '\t', words, headerColCount+1);
         lineFileExpectWords(lf, headerColCount, wordCount);
         // Recompute the counts of reference and alternate alleles in genotypes that we're keeping.
         // Keep only the alternate alleles that have a nonzero count.
         // Discard a row if there are no alternate alleles with nonzero count.
         char altCopy[strlen(words[4])+1];
         safecpy(altCopy, sizeof altCopy, words[4]);
         int altCount = chopString(altCopy, ",", NULL, 0);
         char *alts[altCount];
         chopCommas(altCopy, alts);
         int newAltCount = 0;
         char *newAlts[altCount];
         int newAltCounts[altCount];
         memset(newAltCounts, 0, sizeof newAltCounts);
         int altIxOldToNew[altCount];
         int i;