cc6ca259385f385c252b6a675801025e96783818 angie Thu May 11 12:40:15 2017 -0700 Converting HGVS coords to 0-based needs to be done differently when the HGVS coords are negative because when negative they are already 0-based (but end is closed and needs to be converted to open). fixes #19396 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index ab6b363..be8e96d 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -656,62 +656,79 @@ dyStringFree(&nmTerm); freeMem(nmAcc); } } return hgvs; } static char refBaseFromNucSubst(char *change) /* If sequence change is a nucleotide substitution, return the reference base, else null char. */ { if (regexMatchNoCase(change, "^([ACGTU])>")) return toupper(change[0]); return '\0'; } +static void hgvsStartEndToZeroBasedHalfOpen(struct hgvsVariant *hgvs, int *retStart, int *retEnd) +/* Convert HGVS's fully closed, 1-based-unless-negative start and end to UCSC start and end. */ +{ +// If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive. +int start = hgvs->start1; +if (start > 0) + start--; +// If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end. +int end = hgvs->end; +if (end < 0) + end++; +if (retStart) + *retStart = start; +if (retEnd) + *retEnd = end; +} + static boolean hgvsValidateGenomic(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs->seqAcc can be mapped to a chrom in db and coords are legal for the chrom. * If retFoundAcc is non-NULL, set it to the versionless seqAcc if chrom is found, else NULL. * If retFoundVersion is non-NULL, set it to seqAcc's version if chrom is found, else NULL. * If retDiffRefAllele is non-NULL and hgvs specifies a reference allele then compare it to * the genomic sequence at that location; set *retDiffRefAllele to NULL if they match, or to * genomic sequence if they differ. */ { boolean coordsOK = FALSE; if (retDiffRefAllele) *retDiffRefAllele = NULL; if (retFoundAcc) *retFoundAcc = NULL; if (retFoundVersion) *retFoundVersion = 0; char *chrom = hgOfficialChromName(db, hgvs->seqAcc); if (isNotEmpty(chrom)) { struct chromInfo *ci = hGetChromInfo(db, chrom); - if ((hgvs->start1 >= 1 && hgvs->start1 <= ci->size) && - (hgvs->end >=1 && hgvs->end <= ci->size)) + int start, end; + hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end); + if (start >= 0 && start < ci->size && end > 0 && end < ci->size) { coordsOK = TRUE; if (retDiffRefAllele) { char hgvsBase = refBaseFromNucSubst(hgvs->changes); if (hgvsBase != '\0') { - struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom, - hgvs->start1-1, hgvs->start1); + struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom, start, end); touppers(refBase->dna); if (refBase->dna[0] != hgvsBase) *retDiffRefAllele = cloneString(refBase->dna); dnaSeqFree(&refBase); } } } // Since we found hgvs->seqAcc, split it into versionless and version parts. if (retFoundAcc) *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); if (retFoundVersion) { char *p = strchr(hgvs->seqAcc, '.'); if (p) *retFoundVersion = atoi(p+1); @@ -808,40 +825,40 @@ else return aaAbbrToLetter(match); } return '\0'; } static char *cloneStringFromChar(char c) /* Return a cloned string from a single character. */ { char str[2]; str[0] = c; str[1] = '\0'; return cloneString(str); } -static void checkRefAllele(struct hgvsVariant *hgvs, int start1, char *accSeq, +static void checkRefAllele(struct hgvsVariant *hgvs, int start, char *accSeq, char **retDiffRefAllele) -/* If hgvs change includes a reference allele, and if accSeq at start1 does not match, +/* If hgvs change includes a reference allele, and if accSeq at start does not match, * then set retDiffRefAllele to the accSeq version. retDiffRefAllele must be non-NULL. */ { char hgvsRefBase = (hgvs->type == hgvstProtein) ? refBaseFromProt(hgvs->changes) : refBaseFromNucSubst(hgvs->changes); if (hgvsRefBase != '\0') { - char seqRefBase = toupper(accSeq[start1-1]); + char seqRefBase = toupper(accSeq[start]); if (seqRefBase != hgvsRefBase) *retDiffRefAllele = cloneStringFromChar(seqRefBase); } if (hgvs->type == hgvstProtein && *retDiffRefAllele == NULL) { // Protein ranges have a second ref allele base to check char *p = strchr(hgvs->changes, '_'); if (p != NULL) { hgvsRefBase = refBaseFromProt(p+1); if (hgvsRefBase != '\0') { int end1 = atoi(skipToNumeric(p+1)); char seqRefBase = toupper(accSeq[end1-1]); if (seqRefBase != hgvsRefBase) @@ -922,52 +939,53 @@ * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ { char *acc = normalizeVersion(db, hgvs->seqAcc, retFoundVersion); if (isEmpty(acc)) return FALSE; boolean coordsOK = FALSE; char *accSeq = (hgvs->type == hgvstProtein ? getProteinSeq(db, acc) : getCdnaSeq(db, acc)); if (accSeq) { // By convention, foundAcc is always versionless because it's accompanied by foundVersion. if (retFoundAcc) *retFoundAcc = cloneFirstWordByDelimiter(acc, '.'); int seqLen = strlen(accSeq); + int start, end; + hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end); if (hgvs->type == hgvstCoding) { // Coding term coords can extend beyond the bounds of the transcript so // we can't check them without mapping to the genome. However, if the coords // are in bounds and a reference allele is provided, we can check that. struct genbankCds cds; coordsOK = getCds(db, acc, &cds); if (coordsOK && retDiffRefAllele) { - int start = hgvs->start1 + (hgvs->startIsUtr3 ? cds.end : cds.start); - if (hgvs->startOffset == 0 && start >= 1 && start <= seqLen) + start += (hgvs->startIsUtr3 ? cds.end : cds.start); + if (hgvs->startOffset == 0 && start >= 0 && start < seqLen) checkRefAllele(hgvs, start, accSeq, retDiffRefAllele); } } else { - if (hgvs->start1 >= 1 && hgvs->start1 <= seqLen && - hgvs->end >= 1 && hgvs->end <= seqLen) + if (start >= 0 && start < seqLen && end > 0 && end <= seqLen) { coordsOK = TRUE; if (retDiffRefAllele) - checkRefAllele(hgvs, hgvs->start1, accSeq, retDiffRefAllele); + checkRefAllele(hgvs, start, accSeq, retDiffRefAllele); } } } freeMem(accSeq); freeMem(acc); return coordsOK; } boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. @@ -1007,56 +1025,50 @@ /* HGVS insertion coordinates include the base before and the base after the insertion point, * while we treat it as a zero-length point. So if this is insertion, adjust the start and end. */ { if (hgvsIsInsertion(hgvs)) { *retStart0 = *retStart0 + 1; *retEnd = *retEnd - 1; } } static struct bed *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */ { struct bed *region = NULL; char *chrom = hgOfficialChromName(db, hgvs->seqAcc); -if (isNotEmpty(chrom)) +if (isNotEmpty(chrom) && hgvs->start1 > 0) { region = bed6New(chrom, hgvs->start1 - 1, hgvs->end, "", 0, '+'); adjustInsStartEnd(hgvs, ®ion->chromStart, ®ion->chromEnd); } if (retPslTable) *retPslTable = NULL; return region; } static void hgvsCodingToZeroBasedHalfOpen(struct hgvsVariant *hgvs, int maxCoord, struct genbankCds *cds, int *retStart, int *retEnd, int *retUpstreamBases, int *retDownstreamBases) /* Convert a coding HGVS's start1 and end into UCSC coords plus upstream and downstream lengths * for when the coding HGVS has coordinates that extend beyond its sequence boundaries. * ret* args must be non-NULL. */ { -// If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive. -int start = hgvs->start1; -if (start > 0) - start -= 1; -// If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end. -int end = hgvs->end; -if (end < 0) - end += 1; +int start, end; +hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end); // If the position follows '*' that means it's relative to cdsEnd; otherwise rel to cdsStart if (hgvs->startIsUtr3) *retStart = cds->end + start; else *retStart = cds->start + start; if (hgvs->endIsUtr3) *retEnd = cds->end + end; else *retEnd = cds->start + end; // Now check for coords that extend beyond the transcript('s alignment to the genome) if (*retStart < 0) { // hgvs->start1 is upstream of coding transcript. *retUpstreamBases = -*retStart; *retStart = 0; @@ -1099,32 +1111,31 @@ * NOTE: coding intron offsets are ignored; the PSL contains the exon anchor point * and the caller will have to map that to the genome and then apply the intron offset. */ { if (hgvs == NULL) return NULL; if (hgvs->type == hgvstProtein) errAbort("pslFromHgvsNuc must be called only on nucleotide HGVSs, not protein."); struct psl *psl; AllocVar(psl); psl->tName = cloneString(acc); safecpy(psl->strand, sizeof(psl->strand), "+"); psl->tSize = accSize; if (hgvs->type != hgvstCoding) { // Sane 1-based fully closed coords. - psl->tStart = hgvs->start1 - 1; - psl->tEnd = hgvs->end; + hgvsStartEndToZeroBasedHalfOpen(hgvs, &psl->tStart, &psl->tEnd); } else { // Simple or insanely complex CDS-relative coords. hgvsCodingToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd, retUpstreamBases, retDownstreamBases); } int refLen = psl->tEnd - psl->tStart; // Just use refLen for alt until we parse the sequence change portion of the term: int altLen = refLen; psl->qName = cloneString(hgvs->changes); psl->qSize = altLen; psl->qStart = 0; psl->qEnd = altLen; psl->blockCount = 1; @@ -1478,32 +1489,32 @@ seq = newDnaSeq(dna, size, name); } freez(&wholeSeq); } return seq; } static boolean hgvsToTxCoords(struct hgvsVariant *hgvs, char *db, uint *retStart, uint *retEnd) /* If hgvs is not coding then set *retStart to hgvs->start1-1 and *retEnd to hgvs->end & ret TRUE. * If coding and we have complete CDS info then apply cdsStart and/or cdsEnd offset to start & end. * If start or end is intronic, or start is negative (genomic upstream of tx) then return FALSE. * Note: at this point we don't know the length of the sequence so can't tell if start and/or end * fall past the end of the transcript (genomic downstream). */ { boolean coordsOk = TRUE; -int start = hgvs->start1 - 1; -int end = hgvs->end; +int start, end; +hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end); if (hgvs->type == hgvstCoding) { struct genbankCds cds; if (getCds(db, hgvs->seqAcc, &cds)) { // Determine whether to use cdsStart or cdsEnd for hgvs start and hgvs end. // Make sure the cds start and end are marked as complete. if (hgvs->startIsUtr3 && cds.endComplete) start += cds.end; else if (!hgvs->startIsUtr3 && cds.startComplete) start += cds.start; else coordsOk = FALSE; if (hgvs->endIsUtr3 && cds.endComplete) end += cds.end;