21e42695bd70f58fcd48b6080764a5c2ddad907a angie Wed Apr 7 19:50:25 2021 -0700 When a row has a million columns, throwing words[] on the stack just won't do. Alloc the array once and use for all rows. diff --git src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c index a43f84b..342b879 100644 --- src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c +++ src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c @@ -24,40 +24,43 @@ static struct optionSpec options[] = { {NULL, 0}, }; void vcfRenameAndPrune(char *vcfInFile, char *renamingFile, char *vcfOutFile) /* vcfRenameAndPrune - Rename or remove samples from VCF with genotypes. */ { struct hash *renaming = hashTwoColumnFile(renamingFile); struct lineFile *lf = lineFileOpen(vcfInFile, TRUE); FILE *outF = mustOpen(vcfOutFile, "w"); int headerColCount = 0; int keeperCountMax = hashNumEntries(renaming); int keeperColumns[keeperCountMax]; int keeperCount = 0; int keeperIx = 0; +// VCF with >1M samples (for SARS-CoV-2) causes stack problems / SEGV if we declare words on stack, +// so allocate it once we know how many columns to expect: +char **words = NULL; char *line; while (lineFileNext(lf, &line, NULL)) { if (startsWith("#CHROM", line)) { // Parse & replace sample names, build array of genotype columns that we're keeping headerColCount = chopString(line, "\t", NULL, 0); lineFileExpectAtLeast(lf, VCF_NUM_COLS_BEFORE_GENOTYPES+1, headerColCount); - char *words[headerColCount]; - chopTabs(line, words); + AllocArray(words, headerColCount+1); + chopByChar(line, '\t', words, headerColCount+1); fputs(words[0], outF); int i; for (i = 1; i < VCF_NUM_COLS_BEFORE_GENOTYPES; i++) fprintf(outF, "\t%s", words[i]); for (i = VCF_NUM_COLS_BEFORE_GENOTYPES; i < headerColCount; i++) { char *newName = hashFindVal(renaming, words[i]); if (newName) { fprintf(outF, "\t%s", newName); if (keeperIx >= keeperCountMax) lineFileAbort(lf, "Too many matching names in #CHROM line -- " "duplicate values in input? " "(%d values in renaming, too many matching names at column %d", keeperCountMax, i); @@ -69,32 +72,31 @@ verbose(2, "Found %d keepers (out of %d genotype columns and %d entries in %s)\n", keeperCount, headerColCount - VCF_NUM_COLS_BEFORE_GENOTYPES, keeperCountMax, renamingFile); } else if (line[0] == '#') { // Pass through other header lines fputs(line, outF); fputc('\n', outF); } else { // Data line: print out only the genotype columns that we're keeping if (headerColCount == 0) lineFileAbort(lf, "Missing #CHROM header line -- can't rename."); - char *words[headerColCount+1]; - int wordCount = chopTabs(line, words); + int wordCount = chopByChar(line, '\t', words, headerColCount+1); lineFileExpectWords(lf, headerColCount, wordCount); // Recompute the counts of reference and alternate alleles in genotypes that we're keeping. // Keep only the alternate alleles that have a nonzero count. // Discard a row if there are no alternate alleles with nonzero count. char altCopy[strlen(words[4])+1]; safecpy(altCopy, sizeof altCopy, words[4]); int altCount = chopString(altCopy, ",", NULL, 0); char *alts[altCount]; chopCommas(altCopy, alts); int newAltCount = 0; char *newAlts[altCount]; int newAltCounts[altCount]; memset(newAltCounts, 0, sizeof newAltCounts); int altIxOldToNew[altCount]; int i;