c69992a8a18bf93e03b4d57af505b05df2213f45 angie Wed Aug 17 16:43:41 2011 -0700 Feature #3711 (VCF haplotype sorting display): Performance improvements.1. lib/hacTree.c: instead of doing a zillion incremental memcpy's to drop nodes, build up a set of nodes to delete and then do the minimal memmove's afterwards. Use memmove per memcpy's man page, which says not to use memcpy if regions overlap! 2. lib/vcf.c: avoid unnecessary calls to vcfFilePooledStr from inner loops. Also, if ref is all nucleotides (not symbolic as for SV's), use it to set chromEnd. It may still be overwritten if the END keyword is used in the info column. diff --git src/lib/vcf.c src/lib/vcf.c index a5432fd..e88de56 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -418,40 +418,44 @@ vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0 or v4.1", vcff->majorVersion, vcff->minorVersion); // Next, one header line beginning with single "#" that names the columns: if (line == NULL) // EOF after metadata return vcff; parseColumnHeaderRow(vcff, line); return vcff; } #define VCF_MAX_INFO 512 static void parseRefAndAlt(struct vcfFile *vcff, struct vcfRecord *record, char *ref, char *alt) /* Make an array of alleles, ref first, from the REF and comma-sep'd ALT columns. + * Use the length of the reference sequence to set record->chromEnd. * Note: this trashes the alt argument, since this is expected to be its last use. */ { char *altAlleles[VCF_MAX_INFO]; int altCount = chopCommas(alt, altAlleles); record->alleleCount = 1 + altCount; record->alleles = vcfFileAlloc(vcff, record->alleleCount * sizeof(record->alleles[0])); record->alleles[0] = vcfFilePooledStr(vcff, ref); int i; for (i = 0; i < altCount; i++) record->alleles[1+i] = vcfFilePooledStr(vcff, altAlleles[i]); +int refLen = strlen(ref); +if (refLen == dnaFilteredSize(ref)) + record->chromEnd = record->chromStart + refLen; } static void parseFilterColumn(struct vcfFile *vcff, struct vcfRecord *record, char *filterStr) /* Transform ;-separated filter codes into count + string array. */ { // We don't want to modify something allocated with vcfFilePooledStr because that uses // hash element names for storage! So don't make a vcfFilePooledStr copy of filterStr and // chop that; instead, chop a temp string and pool the words separately. static struct dyString *tmp = NULL; if (tmp == NULL) tmp = dyStringNew(0); dyStringClear(tmp); dyStringAppend(tmp, filterStr); record->filterCount = countChars(filterStr, ';') + 1; record->filters = vcfFileAlloc(vcff, record->filterCount * sizeof(char **)); @@ -535,48 +539,49 @@ static void parseInfoColumn(struct vcfFile *vcff, struct vcfRecord *record, char *string) /* Translate string into array of vcfInfoElement. */ { if (sameString(string, ".")) { record->infoCount = 0; return; } char *elWords[VCF_MAX_INFO]; record->infoCount = chopByChar(string, ';', elWords, ArraySize(elWords)); if (record->infoCount >= VCF_MAX_INFO) vcfFileErr(vcff, "INFO column contains at least %d elements; " "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO); record->infoElements = vcfFileAlloc(vcff, record->infoCount * sizeof(struct vcfInfoElement)); +char *emptyString = vcfFilePooledStr(vcff, ""); int i; for (i = 0; i < record->infoCount; i++) { char *elStr = elWords[i]; char *eq = strchr(elStr, '='); struct vcfInfoElement *el = &(record->infoElements[i]); if (eq == NULL) { el->key = vcfFilePooledStr(vcff, elStr); enum vcfInfoType type = typeForInfoKey(vcff, el->key); if (type != vcfInfoFlag) { vcfFileErr(vcff, "Missing = after key in INFO element: \"%s\" (type=%d)", elStr, type); if (type == vcfInfoString) { el->values = vcfFileAlloc(vcff, sizeof(union vcfDatum)); - el->values[0].datString = vcfFilePooledStr(vcff, ""); + el->values[0].datString = emptyString; } } continue; } *eq = '\0'; el->key = vcfFilePooledStr(vcff, elStr); enum vcfInfoType type = typeForInfoKey(vcff, el->key); char *valStr = eq+1; el->count = parseInfoValue(record, el->key, type, valStr, &(el->values)); if (el->count >= VCF_MAX_INFO) vcfFileErr(vcff, "A single element of the INFO column has at least %d values; " "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO); } } @@ -587,31 +592,31 @@ if (vcff == NULL) return; int expected = 8; if (vcff->genotypeCount > 0) expected = 9 + vcff->genotypeCount; char *words[VCF_MAX_COLUMNS]; int wordCount; while ((wordCount = lineFileChop(vcff->lf, words)) > 0) { lineFileExpectWords(vcff->lf, expected, wordCount); struct vcfRecord *record; AllocVar(record); record->file = vcff; record->chrom = vcfFilePooledStr(vcff, words[0]); record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1; - // chromEnd may be modified by parseInfoColumn, if INFO column includes END. + // chromEnd may be overwritten by parseRefAndAlt and parseInfoColumn. record->chromEnd = record->chromStart + 1; record->name = vcfFilePooledStr(vcff, words[2]); parseRefAndAlt(vcff, record, words[3], words[4]); record->qual = vcfFilePooledStr(vcff, words[5]); parseFilterColumn(vcff, record, words[6]); parseInfoColumn(vcff, record, words[7]); if (vcff->genotypeCount > 0) { record->format = vcfFilePooledStr(vcff, words[8]); record->genotypeUnparsedStrings = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; // Don't bother actually parsing all these until & unless we need the info: for (i = 0; i < vcff->genotypeCount; i++) record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]); @@ -759,36 +764,46 @@ #define VCF_MAX_FORMAT_LEN (VCF_MAX_FORMAT * 4) void vcfParseGenotypes(struct vcfRecord *record) /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[]. * This destroys genotypesUnparsedStrings. */ { if (record->genotypeUnparsedStrings == NULL) return; struct vcfFile *vcff = record->file; record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype)); char format[VCF_MAX_FORMAT_LEN]; safecpy(format, sizeof(format), record->format); char *formatWords[VCF_MAX_FORMAT]; int formatWordCount = chopByChar(format, ':', formatWords, ArraySize(formatWords)); if (formatWordCount >= VCF_MAX_FORMAT) + { vcfFileErr(vcff, "The FORMAT column has at least %d words; " "VCF_MAX_FORMAT may need to be increased in vcf.c!", VCF_MAX_FORMAT); + formatWordCount = VCF_MAX_FORMAT; + } if (differentString(formatWords[0], vcfGtGenotype)) vcfFileErr(vcff, "FORMAT column should begin with \"%s\" but begins with \"%s\"", vcfGtGenotype, formatWords[0]); int i; +// Store the pooled format word pointers and associated types for use in inner loop below. +enum vcfInfoType formatTypes[VCF_MAX_FORMAT]; +for (i = 0; i < formatWordCount; i++) + { + formatTypes[i] = typeForGtFormat(vcff, formatWords[i]); + formatWords[i] = vcfFilePooledStr(vcff, formatWords[i]); + } for (i = 0; i < vcff->genotypeCount; i++) { char *string = record->genotypeUnparsedStrings[i]; struct vcfGenotype *gt = &(record->genotypes[i]); // Each genotype can have multiple :-separated info elements: char *gtWords[VCF_MAX_FORMAT]; int gtWordCount = chopByChar(string, ':', gtWords, ArraySize(gtWords)); if (gtWordCount != formatWordCount) vcfFileErr(vcff, "The FORMAT column has %d words but the genotype column for %s " "has %d words", formatWordCount, vcff->genotypeIds[i], gtWordCount); if (gtWordCount > formatWordCount) gtWordCount = formatWordCount; gt->id = vcff->genotypeIds[i]; gt->infoCount = gtWordCount; gt->infoElements = vcfFileAlloc(vcff, gtWordCount * sizeof(struct vcfInfoElement)); @@ -799,33 +814,33 @@ if (sameString(formatWords[j], vcfGtGenotype)) { char *genotype = gtWords[j]; char *sep = strchr(genotype, '|'); if (sep != NULL) gt->isPhased = TRUE; else sep = strchr(genotype, '/'); gt->hapIxA = atoi(genotype); if (sep == NULL) gt->isHaploid = TRUE; else gt->hapIxB = atoi(sep+1); } struct vcfInfoElement *el = &(gt->infoElements[j]); - el->key = vcfFilePooledStr(vcff, formatWords[j]); - enum vcfInfoType type = typeForGtFormat(vcff, formatWords[j]); - el->count = parseInfoValue(record, formatWords[j], type, gtWords[j], &(el->values)); + el->key = formatWords[j]; + el->count = parseInfoValue(record, formatWords[j], formatTypes[j], gtWords[j], + &(el->values)); if (el->count >= VCF_MAX_INFO) vcfFileErr(vcff, "A single element of the genotype column for \"%s\" " "has at least %d values; " "VCF_MAX_INFO may need to be increased in vcf.c!", gt->id, VCF_MAX_INFO); } } record->genotypeUnparsedStrings = NULL; } const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId) /* Find the genotype and associated info for the individual, or return NULL. * This calls vcfParseGenotypes if it has not already been called. */ {