fffb0ceba7685b27ca84238e4bd3093c9fd669d3 angie Mon Nov 18 15:53:49 2013 -0800 Fixed vcfGetSlashSepAllelesFromWords to use same logic as vcfRecordTrimIndelLeftBase to detect the VCF left padding base. Made improvements to the display of symbolic alleles from dbSNP. Fixed strand bug in parseDbSnpAltAlleles. Thanks Jonathan! refs #11460 (hgVai: paste/upload variant input options) diff --git src/hg/hgVai/hgVai.c src/hg/hgVai/hgVai.c index f4a03ce..bb4ad05 100644 --- src/hg/hgVai/hgVai.c +++ src/hg/hgVai/hgVai.c @@ -1461,94 +1461,102 @@ struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), subDir, assembly->name, md5sum); return dyStringCannibalize(&dy); } static struct slName *hashListNames(struct hash *hash) /* Return a list of all element names in the hash (if any). */ { struct slName *list = NULL; struct hashCookie cookie = hashFirst(hash); struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) slAddHead(&list, slNameNew(hel->name)); return list; } -static char *encloseInAngleBrackets(char *stringIn) -/* If stringIn begins and ends with ()'s, replace them with <> and return stringIn. - * Otherwise, alloc a new string and surround stringIn with <>. */ +static char *encloseInAngleBracketsDbSnp(char *stringIn) +/* Return a string that has , with spaces replaced by '_'s. */ { -char *stringOut = stringIn; int stringInLen = strlen(stringIn); -if (stringIn[0] == '(' && stringIn[stringInLen-1] == ')') - { - stringIn[0] = '<'; - stringIn[stringInLen-1] = '>'; - } -else - { - int stringOutLen = stringInLen + 2 + 1; - stringOut = needMem(stringOutLen); - safef(stringOut, stringOutLen, "<%s>", stringIn); - } +int stringOutLen = stringInLen + strlen("") + 1; +char *stringOut = needMem(stringOutLen); +safef(stringOut, stringOutLen, "", stringIn); +subChar(stringOut, ' ', '_'); return stringOut; } +// dbSNP named alleles have many ways to describe a deletion from the reference, +// for example "LARGEDELETION", "LARGE DELETION", "... DELETED", "... 
DEL": +static const char *dbSnpDelRegex = "^\\(.*(DELET.*| DEL)\\)$"; static char **parseDbSnpAltAlleles(char *refAl, char *obsAls, boolean minusStrand, int *retAltAlCount, boolean *retNeedLeftBase) /* Given a non-symbolic reference allele and slash-sep observed alleles from dbSNP, * return an array of +-strand alleles that are not the same as the reference. * If any allele is "-" (deleted, zero-length), then set retNeedLeftBase to TRUE * because in this case VCF requires that the reference base to the left of the indel * must be added to all alleles, and the start coord also moves one base to the left. * Also, if any alt allele is symbolic, padding is required. * Note: this trashes obsAls. Resulting array can be freed but not its contents. */ { int obsCount = countChars(obsAls, '/') + 1; char *obsWords[obsCount]; chopByChar(obsAls, '/', obsWords, obsCount); +boolean obsHasDeletion = FALSE; +int i; +for (i = 0; i < obsCount; i++) + if (sameString(obsWords[i], "-")) + { + obsHasDeletion = TRUE; + break; + } char **altAls; AllocArray(altAls, obsCount); -int altCount = 0, i; +int altCount = 0; boolean needLeftBase = isEmpty(refAl) || sameString(refAl, "-"); for (i = 0; i < obsCount; i++) { char *altAl = obsWords[i]; + int altAlLen = strlen(altAl); + if (minusStrand && isAllNt(altAl, altAlLen)) + reverseComplement(altAl, altAlLen); if (differentString(altAl, refAl)) { if (sameString(altAl, "-")) { altAls[altCount] = ""; needLeftBase = TRUE; } else { // It would be nice to expand the "(CA)11/12/14/15/16/17/18/19/20" syntax of // some dbSNP observed's. What are these?: "(D1S243)", "(D1S2870)" // Unfortunately for observed="lengthTooLong" we just can't get the correct allele // sequence. (76,130 of those in snp138) // Hmmm, I guess we could at least stick in the right number of N's if we can // parse "(245 BP INSERTION)". 
(2403 rows rlike "[0-9]+ BP ?INSERTION" in snp138) - if (!isAllNt(altAl, strlen(altAl))) + if (!isAllNt(altAl, altAlLen)) { - // Symbolic allele: left base required, and enclose it in <>'s. + // Symbolic allele: left base required, and enclose it in 's. + // But if it's one of dbSNP's LARGEDELETION kind of alleles, that is redundant + // with the reference allele, so if we know there is already a "-" allele, + // skip it. + if (obsHasDeletion && regexMatch(altAl, dbSnpDelRegex)) + continue; needLeftBase = TRUE; - altAl = encloseInAngleBrackets(altAl); + altAl = encloseInAngleBracketsDbSnp(altAl); } - else if (minusStrand) - reverseComplement(altAl, strlen(altAl)); altAls[altCount] = altAl; } altCount++; } } *retAltAlCount = altCount; *retNeedLeftBase = needLeftBase; return altAls; } char *firstNCommaSep(struct slName *nameList, int n) /* Return a comma-separated string with the first n names in nameList. */ { struct dyString *dy = dyStringNew(0); int i;