de29d745d32fb04a5187fb5998cc94bbb9bf41a4 angie Wed Aug 9 13:39:59 2017 -0700 Add optional HGVS output to annoGratorGpVar and hgVai. Since annoGratorGpVar is genePred-based, it can't yet take advantage of variantProjector's full PSL+CDS+sequence support, so when transcripts don't align cleanly to genome, HGVS c./n./p. output may be incorrect. refs #19968 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index c76b795..77fc0eb 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -1,25 +1,26 @@ /* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */ /* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "hgHgvs.h" #include "chromInfo.h" #include "genbank.h" #include "hdb.h" +#include "indelShift.h" #include "pslTransMap.h" #include "regexHelper.h" #include "trackHub.h" void hgvsVariantFree(struct hgvsVariant **pHgvs) // Free *pHgvs and its contents, and set *pHgvs to NULL. { if (pHgvs && *pHgvs) { struct hgvsVariant *hgvs = *pHgvs; freez(&hgvs->seqAcc); freez(&hgvs->seqGeneSymbol); freez(&hgvs->changes); freez(pHgvs); } @@ -474,96 +475,85 @@ struct hgvsVariant *hgvsParseTerm(char *term) /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions, coordinates or alleles. */ { struct hgvsVariant *hgvs = hgvsParseCNDotPos(term); if (hgvs == NULL) hgvs = hgvsParsePDotSubst(term); if (hgvs == NULL) hgvs = hgvsParsePDotRange(term); if (hgvs == NULL) hgvs = hgvsParseGDotPos(term); return hgvs; } -static boolean dbHasNcbiRefSeq(char *db) -/* Test whether NCBI's RefSeq alignments are available in db. */ -{ -// hTableExists() caches results so this shouldn't make for loads of new SQL queries if called -// more than once. -return (hTableExists(db, "ncbiRefSeq") && hTableExists(db, "ncbiRefSeqPsl") && - hTableExists(db, "ncbiRefSeqCds") && hTableExists(db, "ncbiRefSeqLink") && - hTableExists(db, "ncbiRefSeqPepTable") && - hTableExists(db, "seqNcbiRefSeq") && hTableExists(db, "extNcbiRefSeq")); -} - static char *npForGeneSymbol(char *db, char *geneSymbol) /* Given a gene symbol, look up and return its NP_ accession; if not found return NULL. */ { char query[2048]; -if (dbHasNcbiRefSeq(db)) +if (hDbHasNcbiRefSeq(db)) { sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where name = '%s' " "and protAcc != 'n/a' and protAcc != '' " "order by length(protAcc), protAcc", geneSymbol); } else if (hTableExists(db, "refGene")) { // Join with refGene to make sure it's an NP for this species. sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where l.name = '%s' and l.mrnaAcc = r.name " "and l.protAcc != '' order by length(l.protAcc), l.protAcc" , refLinkTable, geneSymbol); } else return NULL; struct sqlConnection *conn = hAllocConn(db); char *npAcc = sqlQuickString(conn, query); hFreeConn(&conn); return npAcc; } static char *nmForGeneSymbol(char *db, char *geneSymbol) /* Given a gene symbol, look up and return its NM_ accession; if not found return NULL. */ { if (trackHubDatabase(db)) return NULL; char query[2048]; char *geneTable = NULL; -if (dbHasNcbiRefSeq(db)) +if (hDbHasNcbiRefSeq(db)) geneTable = "ncbiRefSeq"; else if (hTableExists(db, "refGene")) geneTable = "refGene"; if (geneTable == NULL) return NULL; sqlSafef(query, sizeof(query), "select name from %s where name2 = '%s' " "order by length(name), name", geneTable, geneSymbol); struct sqlConnection *conn = hAllocConn(db); char *nmAcc = sqlQuickString(conn, query); hFreeConn(&conn); return nmAcc; } static char *npForNm(char *db, char *nmAcc) /* Given an NM_ accession, look up and return its NP_ accession; if not found return NULL. */ { if (trackHubDatabase(db)) return NULL; char query[2048]; -if (dbHasNcbiRefSeq(db)) +if (hDbHasNcbiRefSeq(db)) { // ncbiRefSeq tables use versioned NM_ accs, but the user might have passed in a // versionless NM_, so adjust query accordingly: if (strchr(nmAcc, '.')) sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id = '%s'", nmAcc); else sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id like '%s.%%'", nmAcc); } else if (hTableExists(db, "refGene")) { // Trim .version if present since our genbank tables don't use versioned names. char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.'); sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where r.name = '%s' and l.mrnaAcc = r.name " @@ -584,31 +574,31 @@ return NULL; char *seq = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgPep")) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } } else { - if (dbHasNcbiRefSeq(db)) + if (hDbHasNcbiRefSeq(db)) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable " "where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } else { aaSeq *aaSeq = hGenBankGetPep(db, acc, NULL); if (aaSeq) seq = aaSeq->dna; } } @@ -807,85 +797,71 @@ } // Since we found hgvs->seqAcc, split it into versionless and version parts. if (retFoundAcc) *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); if (retFoundVersion) { char *p = strchr(hgvs->seqAcc, '.'); if (p) *retFoundVersion = atoi(p+1); } // Don't free chromInfo, it's cached! } return coordsOK; } -static char *dnaSeqCannibalize(struct dnaSeq **pDnaSeq) -/* Return the already-allocated dna string and free the dnaSeq container. */ -{ -char *seq = NULL; -if (pDnaSeq && *pDnaSeq) - { - struct dnaSeq *dnaSeq = *pDnaSeq; - seq = dnaSeq->dna; - dnaSeq->dna = NULL; - dnaSeqFree(pDnaSeq); - } -return seq; -} - static char *getCdnaSeq(char *db, char *acc) /* Return cdna sequence for acc, or NULL if not found. */ { if (trackHubDatabase(db)) return NULL; char *seq = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgCdna")) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } } else { struct dnaSeq *cdnaSeq = NULL; - if (dbHasNcbiRefSeq(db)) + if (hDbHasNcbiRefSeq(db)) cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq"); else cdnaSeq = hGenBankGetMrna(db, acc, NULL); if (cdnaSeq) seq = dnaSeqCannibalize(&cdnaSeq); } return seq; } static boolean getCds(char *db, char *acc, struct genbankCds *retCds) /* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */ { if (trackHubDatabase(db)) return FALSE; boolean gotCds = FALSE; char query[1024]; if (startsWith("LRG_", acc)) sqlSafef(query, sizeof(query), "select cds from lrgCds where id = '%s'", acc); -else if (dbHasNcbiRefSeq(db) && +else if (hDbHasNcbiRefSeq(db) && // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete: strchr(acc, '.')) sqlSafef(query, sizeof(query), "select cds from ncbiRefSeqCds where id = '%s'", acc); else sqlSafef(query, sizeof(query), "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND " "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc); struct sqlConnection *conn = hAllocConn(db); char cdsBuf[2048]; char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf)); hFreeConn(&conn); if (isNotEmpty(cdsStr)) gotCds = (genbankCdsParse(cdsStr, retCds) && retCds->startComplete && retCds->start != retCds->end); @@ -966,31 +942,31 @@ /* LRG accessions don't have versions, so just return the same acc and set *retFoundVersion to 0. * The user may give us a RefSeq accession with or without a .version. * If ncbiRefSeq tables are present, return acc with version, looking up version if necessary. * If we look it up and can't find it, this returns NULL. * If instead we're using genbank tables, return acc with no version. * That ensures that acc will be found in our local tables. */ { if (trackHubDatabase(db)) return NULL; char *normalizedAcc = NULL; int foundVersion = 0; if (startsWith("LRG_", acc)) { normalizedAcc = cloneString(acc); } -else if (dbHasNcbiRefSeq(db)) +else if (hDbHasNcbiRefSeq(db)) { // ncbiRefSeq tables need versioned accessions. if (strchr(acc, '.')) normalizedAcc = cloneString(acc); else { char query[2048]; sqlSafef(query, sizeof(query), "select name from ncbiRefSeq where name like '%s.%%'", acc); struct sqlConnection *conn = hAllocConn(db); normalizedAcc = sqlQuickString(conn, query); hFreeConn(&conn); } if (isNotEmpty(normalizedAcc)) { char *p = strchr(normalizedAcc, '.'); @@ -1351,31 +1327,31 @@ sqlFreeResult(&sr); hFreeConn(&conn); return mappedToGenome; } static char *pslTableForAcc(char *db, char *acc) /* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where * acc should be found (table may or may not exist). Don't free the returned string. */ { char *pslTable = NULL; if (startsWith("LRG_", acc)) pslTable = "lrgTranscriptAli"; else if (startsWith("NM_", acc) || startsWith("NR_", acc)) { // Use NCBI's alignments if they are available - if (dbHasNcbiRefSeq(db)) + if (hDbHasNcbiRefSeq(db)) pslTable = "ncbiRefSeqPsl"; else pslTable = "refSeqAli"; } return pslTable; } #define limitToRange(val, min, max) { if (val < min) { val = min; } \ if (val > max) { val = max; } } static struct bed *pslAndFriendsToRegion(struct psl *psl, struct hgvsVariant *hgvs, int upstream, int downstream) /* If hgvs has any intron offsets and/or upstream/downstream offsets, add those to the * anchor coords in psl and return the variant's region of the genome. */ { @@ -1458,31 +1434,31 @@ } return region; } static struct bed *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { struct bed *region = NULL; char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); if (acc && startsWith("NP_", acc)) { // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome. struct sqlConnection *conn = hAllocConn(db); char query[2048]; - if (dbHasNcbiRefSeq(db)) + if (hDbHasNcbiRefSeq(db)) sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'", acc); else if (hTableExists(db, "refGene")) sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); else return NULL; char *nmAcc = sqlQuickString(conn, query); hFreeConn(&conn); if (nmAcc) { struct hgvsVariant cDot; zeroBytes(&cDot, sizeof(cDot)); cDot.type = hgvstCoding; cDot.seqAcc = nmAcc; @@ -1800,159 +1776,30 @@ mapping->chromEnd = mapping->chromStart; } else { // '+' strand and extra seq after dup ==> insertion at chromEnd, update leftBase & offset *pLeftShiftOffset += refLen; *pLeftBase = genomicRef[refLen-1]; mapping->chromStart = mapping->chromEnd; } memmove(hgvsSeqAlt, hgvsSeqAlt+refLen, altLen-refLen+1); genomicRef[0] = genomicRefFwd[0] = hgvsSeqRef[0] = '\0'; } return dupToIns; } -static void updateSeq(struct dnaSeq *seq, char *db, char *chrom, int start, int end) -/* Free and reallocate seq's contents with the new sequence range. */ -// BEWARE: newDnaSeq does not clone dna so original seq->dna was allocated elsewhere! -// In this case I'm assuming that seq comes from a prior call to hChromSeq, which -// passes control of seq over to the caller (there's no stale pointer to cause trouble). -{ -struct dnaSeq *newSeq = hChromSeq(db, chrom, start, end); -touppers(newSeq->dna); -freeMem(seq->dna); -freeMem(seq->name); -memcpy(seq, newSeq, sizeof(*seq)); -freeMem(newSeq); -} - -static boolean shiftAndFetchMoreSeq(char *db, int altLen, struct dnaSeq *genomicSeq, - int *pLeftShiftOffset, struct bed *mapping, - int *pTotalBasesShifted, int *pStart) -/* We need to fetch more genomic sequence to the left; update genomicSeq and remaining args - * with the distance that we have shifted so far, and fetch a larger block of sequence. - * Return TRUE if we have shifted all the way to the beginning of the chromosome. */ -{ -// Adjust mapping->chrom{Start,End} and totalBasesShifted -mapping->chromStart -= *pLeftShiftOffset; -mapping->chromEnd -= *pLeftShiftOffset; -*pTotalBasesShifted += *pLeftShiftOffset; -// Double the size of our left-shifting buffer (but don't underflow chrom) -*pLeftShiftOffset = min(mapping->chromStart, *pLeftShiftOffset * 2); -// Update genomicSeq and start offset -int genomicSeqStart = mapping->chromStart - *pLeftShiftOffset; -int genomicSeqEnd = mapping->chromEnd + altLen; -if (genomicSeqEnd > genomicSeqStart) - updateSeq(genomicSeq, db, mapping->chrom, genomicSeqStart, genomicSeqEnd); -*pStart = *pLeftShiftOffset; -return (mapping->chromStart == 0); -} - -static int leftShift(boolean isRc, char *db, struct dnaSeq *genomicSeq, int *pLeftShiftOffset, - char *genomicRef, char *genomicRefFwd, char *hgvsSeqRef, char *hgvsSeqAlt, - char pLeftBase[1], struct bed *mapping) -/* HGVS requires right-shifting ambiguous alignments while VCF requires left-shifting. - * If it's possible to left-shift this ref/alt, then change pLeftBase and mapping. - * Return */ -{ -int totalBasesShifted = 0; -if (sameString(genomicRef, hgvsSeqRef)) - { - int refLen = mapping->chromEnd - mapping->chromStart; - int altLen = strlen(hgvsSeqAlt); - if ((refLen == 0 && altLen > 0) || - (refLen > 0 && altLen == 0)) - { - char *genomeChunk = genomicSeq->dna; - int start = *pLeftShiftOffset; - boolean done = FALSE; - // If insertion, first compare inserted seq to genomic seq to the left of insertion point. - char hgvsSeqAltFwd[altLen+1]; - copyMaybeRc(hgvsSeqAltFwd, sizeof(hgvsSeqAltFwd), hgvsSeqAlt, isRc); - int ix; - for (ix = altLen-1; !done && ix >= 0 && start > 0; ix--) - { - if (genomeChunk[start-1] == hgvsSeqAltFwd[ix]) - { - start--; - if (start == 0) - { - // We have hit the beginning of genomeChunk and are not done; fetch more seq. - done = shiftAndFetchMoreSeq(db, altLen, genomicSeq, pLeftShiftOffset, - mapping, &totalBasesShifted, &start); - genomeChunk = genomicSeq->dna; - } - } - else - done = TRUE; - } - // If we're not done, keep trying to shift left. - while (!done && start > 0) - { - if (genomeChunk[start-1] == genomeChunk[start-1+refLen+altLen]) - { - start--; - if (start == 0) - { - // We have hit the beginning of genomeChunk and are not done; fetch more seq. - done = shiftAndFetchMoreSeq(db, altLen, genomicSeq, pLeftShiftOffset, - mapping, &totalBasesShifted, &start); - genomeChunk = genomicSeq->dna; - } - } - else - done = TRUE; - } - // Done shifting; update mapping coords and ref/alt sequences if we shifted. - int basesShifted = (*pLeftShiftOffset - start); - totalBasesShifted += basesShifted; - if (totalBasesShifted > 0) - { - mapping->chromStart -= basesShifted; - mapping->chromEnd -= basesShifted; - if (altLen > 0) - { - // Insertion: ref is still "", update hgvsSeqAlt. - if (totalBasesShifted < altLen) - { - // The beginning of hgvsSeqAlt is from genomic sequence. The remainder is a - // shifted portion of the original. Do the shifting first, then the copying - // from genome. - memmove(hgvsSeqAltFwd+totalBasesShifted, hgvsSeqAltFwd, - (altLen - totalBasesShifted)); - memcpy(hgvsSeqAltFwd, genomeChunk+start, totalBasesShifted); - } - else - safencpy(hgvsSeqAltFwd, sizeof(hgvsSeqAltFwd), genomeChunk+start, altLen); - copyMaybeRc(hgvsSeqAlt, altLen+1, hgvsSeqAltFwd, isRc); - *pLeftBase = (start > 0) ? genomeChunk[start-1] : genomeChunk[0]; - } - if (refLen > 0) - { - // Deletion: update ref from genome, alt is still "". - safencpy(genomicRefFwd, refLen+1, genomeChunk+start, refLen); - copyMaybeRc(genomicRef, refLen+1, genomicRefFwd, isRc); - safecpy(hgvsSeqRef, refLen+1, genomicRef); - *pLeftBase = (start > 0) ? genomeChunk[start-1] : genomeChunk[0]; - } - } - } - } -return totalBasesShifted; -} - static char *makeVcfAlt(char *genomicRef, char *hgvsSeqRef, char *hgvsSeqAlt, struct bed *mapping, boolean isRc, char leftBase, boolean leftBaseRight, boolean *retIsIndel) /* Based on comparing the three possibly differing alleles (genomic, hgvs ref, hgvs alt), * determine which sequence(s) will go in alt. Determine whether this variant is an indel * (alleles not all the same length), and if so prepend leftBase to alt allele(s). */ { int hgvsSeqRefLen = strlen(hgvsSeqRef); int hgvsSeqAltLen = strlen(hgvsSeqAlt); // VCF alleles are always on '+' (Fwd) strand of genome. char hgvsSeqRefFwd[hgvsSeqRefLen+1]; copyMaybeRc(hgvsSeqRefFwd, sizeof(hgvsSeqRefFwd), hgvsSeqRef, isRc); char hgvsSeqAltFwd[hgvsSeqAltLen+1]; copyMaybeRc(hgvsSeqAltFwd, sizeof(hgvsSeqAltFwd), hgvsSeqAlt, isRc); int genomicRefLen = strlen(genomicRef); // The alt allele string will contain 0 to 2 sequences, maybe a comma, and 0 to 2 leftBases: @@ -2088,69 +1935,87 @@ return dyStringCannibalize(&filter); } static struct vcfRow *vcfFromHgvs(char *db, char *term, struct bed *mapping, struct hgvsVariant *hgvs, struct hgvsChange *changeList, boolean doLeftShift, struct dyString *dyError) /* Determine ref and alt sequence, move start left one base if indel, and return vcfRow. * If doLeftShift, also shift items with ambiguous start as far left on the genome as * possible. That is the VCF convention but it is inconvenient for variant annotation; * the HGVS standard is to shift right as far as possible on the tx strand, so consequences * are reported at the point where the sequence has changed. */ { // Include some genomic sequence to the left (if possible) in case we need to left-shift // an ambiguous mapping and/or include the base to the left of an indel. int genomicRefLen = mapping->chromEnd - mapping->chromStart; -int leftShiftOffset = min(mapping->chromStart, - max(128, 4*genomicRefLen)); -int genomicSeqStart = mapping->chromStart - leftShiftOffset; -struct dnaSeq *genomicSeq = hChromSeq(db, mapping->chrom, genomicSeqStart, mapping->chromEnd); -touppers(genomicSeq->dna); +uint winStart, winEnd; +indelShiftRangeForVariant(mapping->chromStart, genomicRefLen, strlen(hgvs->changes), + &winStart, &winEnd); +struct seqWindow *seqWin = chromSeqWindowNew(db, mapping->chrom, winStart, winEnd); +int varWinStart = mapping->chromStart - seqWin->start; int indelOffset = (mapping->chromStart == 0) ? 0 : 1; -char leftBase = genomicSeq->dna[leftShiftOffset-indelOffset]; +char leftBase = seqWin->seq[varWinStart-indelOffset]; char genomicRefFwd[genomicRefLen+1]; -safencpy(genomicRefFwd, sizeof(genomicRefFwd), genomicSeq->dna + leftShiftOffset, genomicRefLen); +safencpy(genomicRefFwd, sizeof(genomicRefFwd), seqWin->seq + varWinStart, genomicRefLen); // VCF is always on '+' (Fwd) strand of reference. HGVS sequences may be on the '-' strand. // Make a version of the genome assembly allele for comparison to HGVS sequences. boolean isRc = (mapping->strand[0] == '-'); char genomicRef[genomicRefLen+1]; copyMaybeRc(genomicRef, sizeof(genomicRef), genomicRefFwd, isRc); // Get HGVS ref and alt alleles char *hgvsSeqRef = hgvsGetRef(hgvs, db); if (hgvsSeqRef == NULL) // hgvsSeqRef can be NULL when sequence coords are out of transcript bounds e.g. intron. // Use genomic reference sequence (rev-complemented if mapped to '-' strand) hgvsSeqRef = cloneString(genomicRef); char *hgvsTermRef = hgvsChangeGetAssertedRef(changeList); char *hgvsSeqAlt = hgvsChangeGetAlt(changeList, hgvsSeqRef, db); if (hgvsIsInsertion(hgvs)) // HGVS insertions are two bases (before and after), but ours are zero-length points, // so adjust hgvsSeqRef accordingly. hgvsSeqRef[0] = '\0'; if (hgvsSeqAlt == NULL) { dyStringAppend(dyError, "Unable to get alternate sequence"); return NULL; } char *filter = makeHgvsToVcfFilter(genomicRef, hgvsSeqRef, hgvsTermRef); -// These two VCF-normalization functions may modify the contents of their arguments: -boolean dupToIns = doDupToIns(isRc, &leftShiftOffset, genomicRef, genomicRefFwd, +// doDupToIns and indelShift may modify the contents of their arguments: +boolean dupToIns = doDupToIns(isRc, &varWinStart, genomicRef, genomicRefFwd, hgvsSeqRef, hgvsSeqAlt, &leftBase, mapping); +if (dupToIns) + genomicRefLen = strlen(genomicRef); int basesShifted = 0; -if (doLeftShift) - basesShifted = leftShift(isRc, db, genomicSeq, &leftShiftOffset, genomicRef, genomicRefFwd, - hgvsSeqRef, hgvsSeqAlt, &leftBase, mapping); +if (doLeftShift && + sameString(genomicRef, hgvsSeqRef)) + { + int altLen = strlen(hgvsSeqAlt); + char hgvsSeqAltFwd[altLen+1]; + copyMaybeRc(hgvsSeqAltFwd, sizeof(hgvsSeqAltFwd), hgvsSeqAlt, isRc); + basesShifted = indelShift(seqWin, &mapping->chromStart, &mapping->chromEnd, + hgvsSeqAltFwd, INDEL_SHIFT_NO_MAX, isdLeft); + if (basesShifted) + { + varWinStart = mapping->chromStart - seqWin->start; + int leftBaseOffset = max(0, varWinStart - 1); + leftBase = seqWin->seq[leftBaseOffset]; + copyMaybeRc(hgvsSeqAlt, altLen+1, hgvsSeqAltFwd, isRc); + safencpy(genomicRefFwd, sizeof(genomicRefFwd), seqWin->seq+varWinStart, genomicRefLen); + copyMaybeRc(genomicRef, sizeof(genomicRef), genomicRefFwd, isRc); + safecpy(hgvsSeqRef, genomicRefLen+1, genomicRef); + } + } // VCF special case for indel at beginning of chromosome: there is no left base, so instead // put the first base of the chromosome to the right! // See https://samtools.github.io/hts-specs/VCFv4.2.pdf 1.4.1.4 REF and discussion at // https://sourceforge.net/p/vcftools/mailman/vcftools-spec/thread/508CAD45.8010301%40broadinstitute.org/ boolean leftBaseRight = (mapping->chromStart == 0); // Make REF and ALT strings ('+' strand, leftBase for indels) and write VCF boolean isIndel = FALSE; char *vcfAlt = makeVcfAlt(genomicRef, hgvsSeqRef, hgvsSeqAlt, mapping, isRc, leftBase, leftBaseRight, &isIndel); int pos = mapping->chromStart + 1; if (isIndel && !leftBaseRight) pos--; char vcfRef[genomicRefLen+1+1]; if (isIndel) { @@ -2161,31 +2026,31 @@ } else safecpy(vcfRef, sizeof(vcfRef), genomicRefFwd); char *info = makeHgvsToVcfInfo(dupToIns, basesShifted, isIndel, mapping->chromEnd); struct vcfRow *row; AllocVar(row); row->chrom = cloneString(mapping->chrom); row->pos = pos; row->id = cloneString(term); row->ref = cloneString(vcfRef); row->alt = vcfAlt; row->filter = filter; row->info = info; freeMem(hgvsSeqRef); freeMem(hgvsSeqAlt); -dnaSeqFree(&genomicSeq); +chromSeqWindowFree(&seqWin); return row; } struct vcfRow *hgvsToVcfRow(char *db, char *term, boolean doLeftShift, struct dyString *dyError) /* Convert HGVS to a row of VCF suitable for sorting & printing. If unable, return NULL and * put the reason in dyError. Protein terms are ambiguous at the nucleotide level so they are * not supported at this point. */ { struct vcfRow *row = NULL; struct hgvsVariant *hgvs = hgvsParseTerm(term); if (!hgvs) { dyStringPrintf(dyError, "Unable to parse '%s' as HGVS", term); return NULL; } @@ -2234,15 +2099,512 @@ } else { row = vcfFromHgvs(db, term, mapping, hgvs, changeList, doLeftShift, dyWarn); if (dyStringIsNotEmpty(dyWarn)) { dyStringPrintf(dyError, "Can't convert '%s' to VCF: %s", term, dyStringContents(dyWarn)); } } } bedFree(&mapping); dyStringFree(&dyWarn); return row; } + +static int allNCount(char *seq) +/* If seq is entirely N's, return its length, otherwise 0. */ +{ +int nLen = 0; +for (nLen = 0; seq[nLen] != '\0'; nLen++) + if (seq[nLen] != 'N') + return 0; +return nLen; +} + +// HGVS allows terms to specify deleted or duplicated bases if there are "several bases". +// Search for "NOTE: it is allowed" here: +// http://varnomen.hgvs.org/recommendations/DNA/variant/deletion/ +// http://varnomen.hgvs.org/recommendations/DNA/variant/duplication/ +// It does not say how many "several" can be. It doesn't specify for inv, but common +// practice is to include bases and I don't see why it should be different from del or dup. +// If somebody complains about hardcoding then I guess we could make it a parameter of +// hgvsAppendChangesFromNucRefAlt. +#define HGVS_SEVERAL 30 + +static void hgvsAppendChangesFromNucRefAlt(struct dyString *dy, char *ref, char *alt, int dupLen, + boolean breakDelIns) +/* Translate reference allele and alternate allele into an HGVS change description, append to dy. + * If breakDelIns, then show deleted bases (e.g. show 'delAGinsTT' instead of 'delinsTT'). + * Note: this forces ref and alt to uppercase. + * No support for con or repeats at this point, just {=, >, del, dup, ins, inv}. */ +{ +touppers(ref); +touppers(alt); +if (sameString(ref, alt)) + dyStringPrintf(dy, "%s=", ref); +else + { + int refLen = strlen(ref); + int altLen = strlen(alt); + char refInv[refLen+1]; + safencpy(refInv, sizeof(refInv), ref, refLen); + reverseComplement(refInv, refLen); + if (refLen == 1 && altLen == 1) + dyStringPrintf(dy, "%c>%c", ref[0], alt[0]); + else if (dupLen > 0) + { + dyStringAppend(dy, "dup"); + if (dupLen <= HGVS_SEVERAL) + dyStringAppendN(dy, alt, dupLen); + // Could be a pure duplication followed by insertion: + if (altLen > dupLen) + dyStringPrintf(dy, "ins%s", alt+dupLen); + } + else if (refLen == 0) + { + int allNLen = allNCount(alt); + if (allNLen) + dyStringPrintf(dy, "ins%d", allNLen); + else + dyStringPrintf(dy, "ins%s", alt); + } + else if (altLen == 0) + { + dyStringAppend(dy, "del"); + if (refLen <= HGVS_SEVERAL) + dyStringAppendN(dy, ref, refLen); + } + else if (refLen == altLen && refLen > 1 && sameString(refInv, alt)) + { + dyStringAppend(dy, "inv"); + if (refLen <= HGVS_SEVERAL) + dyStringAppendN(dy, ref, refLen); + } + else + { + dyStringAppend(dy, "del"); + if (breakDelIns && refLen <= HGVS_SEVERAL) + dyStringAppendN(dy, ref, refLen); + int allNLen = allNCount(alt); + if (allNLen) + dyStringPrintf(dy, "ins%d", allNLen); + else + dyStringPrintf(dy, "ins%s", alt); + } + } +} + +void hgvsAppendChangesFromPepRefAlt(struct dyString *dy, char *ref, char *alt, int dupLen) +/* Translate reference allele and alternate allele into an HGVS change description, append to dy. + * ref and alt must be sequences of single-letter IUPAC amino acid codes. + * Note: this forces ref and alt to uppercase. */ +{ +touppers(ref); +touppers(alt); +if (sameString(ref, alt)) + dyStringAppendC(dy, '='); +else + { + int refLen = strlen(ref); + int altLen = strlen(alt); + char altAbbr[3*altLen+1]; + int ix; + for (ix = 0; ix < altLen; ix++) + aaToAbbr(alt[ix], altAbbr+3*ix, sizeof(altAbbr)-3*ix); + if (refLen == 1 && altLen == 1) + // Simple substitution + dyStringAppend(dy, altAbbr); + else if (dupLen > 0) + { + dyStringAppend(dy, "dup"); + // Could be a pure duplication followed by insertion: + if (altLen > dupLen) + dyStringPrintf(dy, "ins%s", altAbbr+(dupLen*3)); + } + else if (refLen == 0) + dyStringPrintf(dy, "ins%s", altAbbr); + else if (altLen == 0) + { + dyStringAppend(dy, "del"); + } + else + { + dyStringPrintf(dy, "delins%s", altAbbr); + } + } +} + +char *hgvsGFromVariant(struct seqWindow *gSeqWin, struct bed3 *variantBed, char *alt, char *acc, + boolean breakDelIns) +/* Return an HGVS g. string representing the genomic variant at the position of variantBed with + * reference allele from gSeqWin and alternate allele alt. 3'-shift indels if applicable. + * If acc is non-NULL it is used instead of variantBed->chrom. + * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */ +{ +struct dyString *dy = dyStringCreate("%s:g.", acc ? acc : variantBed->chrom); +uint vStart = variantBed->chromStart, vEnd = variantBed->chromEnd; +gSeqWin->fetch(gSeqWin, variantBed->chrom, vStart, vEnd); +int refLen = vEnd - vStart; +char ref[refLen+1]; +seqWindowCopy(gSeqWin, vStart, refLen, ref, sizeof(ref)); +int altLen = strlen(alt); +char altCpy[altLen+1]; +safecpy(altCpy, sizeof(altCpy), alt); +if (differentString(ref, altCpy)) + // Trim identical bases from start and end -- unless this is an assertion that there is + // no change, in which case it's good to keep the range on which that assertion was made. + trimRefAlt(ref, altCpy, &vStart, &vEnd, &refLen, &altLen); +if (indelShiftIsApplicable(refLen, altLen) && + indelShift(gSeqWin, &vStart, &vEnd, altCpy, INDEL_SHIFT_NO_MAX, isdRight)) + // update ref + seqWindowCopy(gSeqWin, vStart, refLen, ref, sizeof(ref)); +int dupLen = 0; +if (refLen == 1) + // Single base: single 1-based coordinate + dyStringPrintf(dy, "%d", vStart+1); +else if (refLen == 0) + { + // Insertion or duplication + if (altLen > 0 && altLen <= vStart) + { + // Detect duplicated sequence + uint seqStart = vStart - altLen; + char precedingRef[altLen+1]; + seqWindowCopy(gSeqWin, seqStart, altLen, precedingRef, sizeof(precedingRef)); + if (sameString(altCpy, precedingRef)) + dupLen = altLen; + } + if (dupLen > 0) + { + if (dupLen == 1) + // Single-base duplication + dyStringPrintf(dy, "%d", vStart); // - dupLen + 1 cancel each other out + else + // Multi-base duplication: range is the dupLen bases preceding vStart + dyStringPrintf(dy, "%d_%d", vStart - dupLen + 1, vStart); + } + else + // Insertion: two-base range enclosing zero-base insertion point + dyStringPrintf(dy, "%d_%d", vStart, vEnd+1); + } +else + // Deletion or MNV + dyStringPrintf(dy, "%d_%d", vStart+1, vEnd); +hgvsAppendChangesFromNucRefAlt(dy, ref, altCpy, dupLen, breakDelIns); +return dyStringCannibalize(&dy); +} + +static int findDup(char *alt, struct seqWindow *seqWin, uint refPoint, boolean isRc) +/* Given that the variant is an insertion of alt at refPoint (minding isRc), if alt looks + * like a duplicate of the sequence before refPoint, return the length of duplicated sequence. */ +{ +int altLen = strlen(alt); +// Don't underflow the sequence: +if (!isRc && altLen > refPoint) + return 0; +uint seqStart = refPoint - (isRc ? 0 : altLen); +uint minEnd = seqStart + altLen; +if (seqWin->start > seqStart || seqWin->end < minEnd) + seqWin->fetch(seqWin, seqWin->seqName, + min(seqStart, seqWin->start), + max(seqWin->end, minEnd)); +// Get sequence, reverse-complement if necessary +char precedingRef[altLen+1]; +seqWindowCopy(seqWin, seqStart, altLen, precedingRef, sizeof(precedingRef)); +if (isRc) + reverseComplement(precedingRef, altLen); +if (sameString(alt, precedingRef)) + return altLen; +else + { + // Detect an insertion plus a few slop bases at the end, like "dupinsTAT" where + // alt starts with precedingRef[3..] followed by new "TAT". Then dupLen is altLen-3. + // Unfortunately, indelShift can actually shift far enough to prevent a perfect match. :( + // Example: ClinVar NM_000179.2(MSH6):c.1573_3439-429dupinsTAT -- the insertion is right- + // shifted by 1 and then there's no longer a perfect dup. :b We would have to do dup-aware + // indel shifting!! + int searchLimit = 5; + int offset; + for (offset = 1; offset < searchLimit; offset++) + { + if (sameStringN(alt, precedingRef+offset, altLen-offset)) + return altLen-offset; + } + } +return 0; +} + +static int tweakInsDup(struct vpTxPosition *startPos, struct vpTxPosition *endPos, char *alt, + struct seqWindow *gSeqWin, struct psl *txAli, struct dnaSeq *txSeq) +/* If this variant is an insertion, HGVS needs the 2-base range surrounding the insertion point. + * However, if an inserted sequence happens to be identical to the preceding sequence then + * HGVS calls it a dup on the range of preceding sequence. If this is a dup, return the length of + * duplicated sequence (else return 0). + * This modifies startPos and endPos if the variant is an insertion/dup. + * For non-exonic insertions, the preceding reference sequence is from the genome not tx. + * Note: this doesn't yet detect "dupins", where most of the inserted sequence (starting from the + * beginning of the inserted sequence) matches the previous ref sequence */ +{ +int dupLen = 0; +int isIns = vpTxPosIsInsertion(startPos, endPos); +if (isIns) + { + // Yes, this is a zero-length insertion point -- see if it happens to be a duplication. + // "insTCA" is preferable to "dupTinsCA"... where to draw the line for when dup is worth it?? + // For now, just say half of altLen... but an argument could be made for some small constant too + int minDup = strlen(alt) / 2; + // Fetch preceding sequence from tx if exonic, otherwise from genome. + if (startPos->region == vpExon && endPos->region == vpExon) + { + struct seqWindow *txSeqWin = memSeqWindowNew(txSeq->name, txSeq->dna); + dupLen = findDup(alt, txSeqWin, startPos->txOffset, FALSE); + if (dupLen > minDup) + { + // Since txSeqWin was used to find dup, the new startPos must also be exonic. + // In case startPos was looking forward from the boundary between an exon and an intron + // or downstream, adjust its other fields to be exonic now: + startPos->region = vpExon; + startPos->txOffset -= dupLen; + startPos->gDistance = startPos->intron3TxOffset = startPos->intron3Distance = 0; + } + else + dupLen = 0; + memSeqWindowFree(&txSeqWin); + } + else + { + boolean isRc = (pslQStrand(txAli) == '-'); + dupLen = findDup(alt, gSeqWin, startPos->gOffset, isRc); + if (dupLen > minDup) + { + uint newGOffset = startPos->gOffset + (isRc ? dupLen : -dupLen); + vpPosGenoToTx(newGOffset, txAli, startPos, FALSE); + } + else + dupLen = 0; + } + } +if (dupLen == 0 && isIns) + { + // Expand insertion point to 2-base region around insertion point for HGVS. + // startPos = base to left of endPos whose region looks left/5'; + // endPos = base to right of startPos whose region looks right/3'. + struct vpTxPosition newStart = *endPos; + vpTxPosSlideInSameRegion(&newStart, -1); + struct vpTxPosition newEnd = *startPos; + vpTxPosSlideInSameRegion(&newEnd, 1); + *startPos = newStart; + *endPos = newEnd; + } +return dupLen; +} + +static uint hgvsTxToCds(uint txOffset, struct genbankCds *cds, boolean isStart, char pPrefix[2]) +/* Return the cds-relative HGVS coord and prefix corresponding to 0-based txOffset & cds. */ +{ +// Open end adjustment for determining CDS/UTR region: +int endCmp = isStart ? 0 : 1; +// For adjusting non-negative coords to HGVS's positive 1-based closed: +int oneBased = isStart ? 1 : 0; +// For adjusting negative coords to HGVS's negative 0-based closed: +int closedEnd = isStart ? 0 : 1; +pPrefix[1] = '\0'; +uint cdsCoord = 0; +if (txOffset - endCmp < cds->start) + { + // 5'UTR: coord is negative distance from cdsStart + pPrefix[0] = '-'; + cdsCoord = cds->start - txOffset + closedEnd; + } +else if (txOffset - endCmp < cds->end) + { + // CDS: coord is positive distance from cdsStart + pPrefix[0] = '\0'; + cdsCoord = txOffset - cds->start + oneBased; + } +else + { + // 3'UTR: coord is positive distance from cdsEnd + pPrefix[0] = '*'; + cdsCoord = txOffset - cds->end + oneBased; + } +return cdsCoord; +} + +static void appendHgvsNucPos(struct dyString *dy, struct vpTxPosition *txPos, boolean isStart, + struct genbankCds *cds) +/* Translate txPos (start or end) into an HGVS position (coding if cds is non-NULL). */ +{ +// Open end adjustment for determining region: +int endCmp = isStart ? 0 : 1; +// For adjusting non-negative coords to HGVS's positive 1-based closed: +int oneBased = isStart ? 1 : 0; +// For adjusting negative coords to HGVS's negative 0-based closed: +int closedEnd = isStart ? 0 : 1; +if (txPos->region == vpUpstream) + { + uint distance = txPos->gDistance; + if (cds) + distance += cds->start; + dyStringPrintf(dy, "-%u", distance + closedEnd); + } +else if (txPos->region == vpDownstream) + { + uint distance = txPos->txOffset + txPos->gDistance; + if (cds) + dyStringPrintf(dy, "*%u", distance - cds->end + oneBased); + else + dyStringPrintf(dy, "%u", distance + oneBased); + } +else if (txPos->region == vpExon) + { + char cdsPrefix[2]; + cdsPrefix[0] = '\0'; + uint hgvsCoord = cds ? hgvsTxToCds(txPos->txOffset, cds, isStart, cdsPrefix) : + txPos->txOffset + oneBased; + dyStringPrintf(dy, "%s%u", cdsPrefix, hgvsCoord); + } +else if (txPos->region == vpIntron) + { + // If intron length is odd, bias toward picking the 5' exon (middle base has positive offset). + char cdsPrefix[2]; + cdsPrefix[0] = '\0'; + char direction; + uint exonAnchor, intronOffset; + boolean anchorIsStart; + if (txPos->gDistance - endCmp < txPos->intron3Distance) + { + exonAnchor = txPos->txOffset; + direction = '+'; + intronOffset = txPos->gDistance + oneBased; + anchorIsStart = FALSE; + } + else + { + exonAnchor = txPos->intron3TxOffset; + direction = '-'; + intronOffset = txPos->intron3Distance + closedEnd; + anchorIsStart = TRUE; + } + exonAnchor = cds ? hgvsTxToCds(exonAnchor, cds, anchorIsStart, cdsPrefix) : + exonAnchor + (anchorIsStart ? oneBased : 0); + dyStringPrintf(dy, "%s%u%c%u", cdsPrefix, exonAnchor, direction, intronOffset); + } +else + errAbort("appendHgvsNucPos: unrecognized vpTxRegion value %d", txPos->region); +} + +char *hgvsNFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli, + struct dnaSeq *txSeq, boolean breakDelIns) +/* Return an HGVS n. (noncoding transcript) term for a variant projected onto a transcript. + * gSeqWin must already have at least the correct seqName if not the surrounding sequence. + * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */ +{ +struct dyString *dy = dyStringCreate("%s:n.", vpTx->txName); +// Make local copies of vpTx->{start,end} -- we may need to modify them for HGVS ins/dup. +struct vpTxPosition startPos = vpTx->start, endPos = vpTx->end; +int dupLen = tweakInsDup(&startPos, &endPos, vpTx->txAlt, gSeqWin, txAli, txSeq); +appendHgvsNucPos(dy, &startPos, TRUE, NULL); +if (!vpTxPosRangeIsSingleBase(&startPos, &endPos)) + { + dyStringAppendC(dy, '_'); + appendHgvsNucPos(dy, &endPos, FALSE, NULL); + } +char *ref = vpTx->txRef ? vpTx->txRef : vpTx->gRef; +hgvsAppendChangesFromNucRefAlt(dy, ref, vpTx->txAlt, dupLen, breakDelIns); +return dyStringCannibalize(&dy); +} + + +char *hgvsCFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli, + struct genbankCds *cds, struct dnaSeq *txSeq, boolean breakDelIns) +/* Return an HGVS c. (coding transcript) term for a variant projected onto a transcript w/cds. + * gSeqWin must already have at least the correct seqName if not the surrounding sequence. + * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */ +{ +struct dyString *dy = dyStringCreate("%s:c.", vpTx->txName); +// Make local copies of vpTx->{start,end} -- we may need to modify them for HGVS ins/dup. +struct vpTxPosition startPos = vpTx->start, endPos = vpTx->end; +int dupLen = tweakInsDup(&startPos, &endPos, vpTx->txAlt, gSeqWin, txAli, txSeq); +appendHgvsNucPos(dy, &startPos, TRUE, cds); +if (!vpTxPosRangeIsSingleBase(&startPos, &endPos)) + { + dyStringAppendC(dy, '_'); + appendHgvsNucPos(dy, &endPos, FALSE, cds); + } +char *ref = vpTx->txRef ? vpTx->txRef : vpTx->gRef; +hgvsAppendChangesFromNucRefAlt(dy, ref, vpTx->txAlt, dupLen, breakDelIns); +return dyStringCannibalize(&dy); +} + +char *hgvsPFromVpPep(struct vpPep *vpPep, struct dnaSeq *protSeq, boolean addParens) +/* Return an HGVS p. (protein) term for a variant projected into protein space. + * Strict HGVS compliance requires parentheses around predicted protein changes, but + * nobody seems to do that in practice. + * Return NULL if an input is NULL. */ +{ +if (vpPep == NULL || protSeq == NULL) + return NULL; +struct dyString *dy = dyStringCreate("%s:p.", vpPep->name); +if (addParens) + dyStringAppendC(dy, '('); +int refLen = vpPep->end - vpPep->start; +int altLen = vpPep->alt ? strlen(vpPep->alt) : 0; +char *pSeq = protSeq->dna; +char refStartAbbr[4]; +aaToAbbr(pSeq[vpPep->start], refStartAbbr, sizeof(refStartAbbr)); +if (vpPep->likelyNoChange) + dyStringAppend(dy, "="); +else if (vpPep->cantPredict || vpPep->spansUtrCds) + dyStringAppend(dy, "?"); +else if (vpPep->frameshift) + { + if (altLen == 1) + dyStringPrintf(dy, "%s%dTer", refStartAbbr, vpPep->start+1); + else + { + char altStartAbbr[4]; + aaToAbbr(vpPep->alt[0], altStartAbbr, sizeof(altStartAbbr)); + dyStringPrintf(dy, "%s%d%sfsTer%d", refStartAbbr, vpPep->start+1, altStartAbbr, altLen); + } + } +else + { + int dupLen = 0; + if (refLen == 0 && altLen > 0) + { + // It's an insertion; is it a duplication? + struct seqWindow *pSeqWin = memSeqWindowNew(protSeq->name, protSeq->dna); + dupLen = findDup(vpPep->alt, pSeqWin, vpPep->start, FALSE); + memSeqWindowFree(&pSeqWin); + } + if (refLen == 1) + dyStringPrintf(dy, "%s%d", refStartAbbr, vpPep->start+1); + else + { + int rangeStart = vpPep->start, rangeEnd = vpPep->end; + if (dupLen > 0) + { + // Duplication; position range changes to preceding bases. + rangeEnd = rangeStart; + rangeStart -= dupLen; + aaToAbbr(pSeq[rangeStart], refStartAbbr, sizeof(refStartAbbr)); + } + else if (refLen == 0) + { + // Insertion; expand to two-AA range around insertion point + rangeStart--; + aaToAbbr(pSeq[rangeStart], refStartAbbr, sizeof(refStartAbbr)); + rangeEnd++; + } + char refLastAbbr[4]; + aaToAbbr(pSeq[rangeEnd-1], refLastAbbr, sizeof(refLastAbbr)); + dyStringPrintf(dy, "%s%d_%s%d", refStartAbbr, rangeStart+1, refLastAbbr, rangeEnd); + } + hgvsAppendChangesFromPepRefAlt(dy, vpPep->ref, vpPep->alt, dupLen); + } +if (addParens) + dyStringAppendC(dy, ')'); +return dyStringCannibalize(&dy); +}