2b5eb866f050d964d8964ec5a84f7b63889cc6b1 angie Mon Jun 3 14:39:36 2013 -0700 New CGI, hgVai (Variant Annotation Integrator): simple checklist-style UI by which user can select variants that they have uploaded; gene predictions to identify which part of a gene, if any, is hit by each variant; several additional sources of annotations/predictions e.g. dbNSFP scores and conserved elements/scores; and several filters to constrain output to the variants most likely to have a functional effect. Along with the new CGI, there are various lib bugfixes and improvements, a new hg/lib/tests/ testcase, and some test file changes to accommodate data updates to both knownGene and the pg* tables in knownGene. refs #6152 diff --git src/lib/vcf.c src/lib/vcf.c index 5f55c5d..3f3754a 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -1070,37 +1070,67 @@ " string info; \"Additional information encoded as a semicolon-separated series " "of short keys with optional comma-separated values\"" " string format; \"If genotype columns are specified in header, a " "semicolon-separated list of of short keys starting with GT\"" " string genotypes; \"If genotype columns are specified in header, a tab-separated " "set of genotype column values; each value is a colon-separated " "list of values corresponding to keys in the format column\"" " )"; struct asObject *vcfAsObj() // Return asObject describing fields of VCF { return asParseText(vcfDataLineAutoSqlString); } -char *vcfGetSlashSepAllelesFromWords(char **words, struct dyString *dy) -/* Overwrite dy with a /-separated allele string from VCF words; - * return dy->string for convenience. */ +char *vcfGetSlashSepAllelesFromWords(char **words, struct dyString *dy, + boolean *retSkippedFirstBase) +/* Overwrite dy with a /-separated allele string from VCF words, + * skipping the extra initial base that VCF requires for indel alleles if necessary. + * Return dy->string for convenience. 
*/ { dyStringClear(dy); // VCF reference allele gets its own column: -dyStringAppend(dy, words[3]); +char *refAllele = words[3]; +char *altAlleles = words[4]; +// First determine whether there is an extra initial base that we need to skip: +boolean allStartSame = TRUE; +char *p; +while ((p = strchr(altAlleles, ',')) != NULL) + { + if (altAlleles[0] != refAllele[0]) + allStartSame = FALSE; + altAlleles = p+1; + } +if (altAlleles[0] != refAllele[0]) + allStartSame = FALSE; +int offset = allStartSame ? 1 : 0; +if (refAllele[offset] == '\0') + dyStringAppendC(dy, '-'); +else + dyStringAppend(dy, refAllele+offset); // VCF alternate alleles are comma-separated, make them /-separated: -if (isNotEmpty(words[4])) +altAlleles = words[4]; +if (isNotEmpty(altAlleles) && differentString(altAlleles, ".")) { - char *altAlleles = words[4], *p; + // Now construct the string: while ((p = strchr(altAlleles, ',')) != NULL) { dyStringAppendC(dy, '/'); - dyStringAppendN(dy, altAlleles, p-altAlleles); + int len = p - altAlleles - offset; + if (len == 0) + dyStringAppendC(dy, '-'); + else + dyStringAppendN(dy, altAlleles+offset, len); altAlleles = p+1; } dyStringAppendC(dy, '/'); - dyStringAppend(dy, altAlleles); + int len = strlen(altAlleles) - offset; + if (len == 0) + dyStringAppendC(dy, '-'); + else + dyStringAppendN(dy, altAlleles+offset, len); } +if (retSkippedFirstBase) + *retSkippedFirstBase = offset; return dy->string; }