de29d745d32fb04a5187fb5998cc94bbb9bf41a4
angie
  Wed Aug 9 13:39:59 2017 -0700
Add optional HGVS output to annoGratorGpVar and hgVai. Since annoGratorGpVar is genePred-based, it can't yet take advantage of variantProjector's full PSL+CDS+sequence support, so when transcripts don't align cleanly to genome, HGVS c./n./p. output may be incorrect. refs #19968

diff --git src/hg/inc/hgHgvs.h src/hg/inc/hgHgvs.h
index 6be2bb1..070e0ae 100644
--- src/hg/inc/hgHgvs.h
+++ src/hg/inc/hgHgvs.h
@@ -1,26 +1,29 @@
 /* hgHgvs - support for a subset of the Human Genome Variation Society's (HGVS) nomenclature
  * for describing variants with respect to a specific gene sequence (or genome assembly).
  * See http://www.hgvs.org/mutnomen/ */
 
 /* Copyright (C) 2016 The Regents of the University of California
  * See README in this or parent directory for licensing information. */
 
 #ifndef HGHGVS_H
 #define HGHGVS_H
 
 #include "bed.h"
+#include "dnaseq.h"
+#include "seqWindow.h"
+#include "variantProjector.h"
 
 /* The full nomenclature is extremely complicated, able to encode things such as gene fusions and
  * advanced clinical info (e.g. "=/" for somatic mosaicism, "=//" for chimerism).  UCSC supports
  * substitutions, insertions, deletions, duplications and inversions.  Conversions are parsed out
  * of HGVS terms but not detected in genomic variants when generating HGVS terms.
  * UCSC does not fully support repeated sequences because in practice they seem to be frequently
  * incorrect and inherently error-prone.
  *
  * At the same time, since the spec has repeatedly changed, we will need to be flexible in our
  * parsing in order to support previously published HGVS (or HGVS-like) terms. */
 
 enum hgvsSeqType
     // HGVS describes changes relative to several types of sequence: genomic, coding, protein, etc
     {
     hgvstUndefined,  // the usual int default value means we haven't actually checked
@@ -141,30 +144,45 @@
     struct hgvsChangeFrameshift fs;  // Optional info about early termination of translation
     struct hgvsChangeRepeat repeat;  // Optional repeated sequence and min/max observed counts.
     int count;                       // Repeat unit count in alternate sequence (ref count is 1)
     };
 
 struct hgvsChange
     // Usually this contains a sequence of bases from the reference and a sequence of bases from
     // the alternate allele, but can contain nested HGVS terms.  For deletions, reference bases
     // are usually not specified, except for protein which includes the first and last amino acid.
     {
     struct hgvsChange *next;           // One HGVS term can specify a sequence of changes
     enum hgvsChangeType type;          // HGVS operator: >, del, ins, fs etc.
     union hgvsChangeValue value;       // the actual sequences changed (possibly complex)
     };
 
+//
+// HGVS output option bit flags
+//
+// Output an HGVS genomic (g.) term:
+#define HGVS_OUT_G  0x01
+// Output either an HGVS coding (c.) term if applicable, otherwise noncoding (n.) term:
+#define HGVS_OUT_CN 0x02
+// Output an HGVS protein (p.) term if applicable:
+#define HGVS_OUT_P  0x04
+// Add parentheses around predicted protein (p.) changes e.g. p.(Arg159del):
+#define HGVS_OUT_P_ADD_PARENS 0x10
+// Add deleted sequence to delins changes (e.g. show 'delAGinsTT' instead of 'delinsTT'):
+#define HGVS_OUT_BREAK_DELINS 0x20
+
+
 void hgvsVariantFree(struct hgvsVariant **pHgvs);
 // Free *pHgvs and its contents, and set *pHgvs to NULL.
 
 struct hgvsVariant *hgvsParseTerm(char *term);
 /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL.
  * This does not check validity of accessions or alleles. */
 
 struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term);
 /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS. */
 
 boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion,
                      char **retDiffRefAllele);
 /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
  * Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so
  * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found.
@@ -222,16 +240,41 @@
         "Description=\"Asserted reference sequence in HGVS term does not match actual " \
         "reference sequence\">\n" \
         "##FILTER=<ID=HgvsRefGenomicMismatch," \
         "Description=\"HGVS reference sequence does not match genomic sequence; " \
         "HGVS reference sequence is included in ALT\">\n" \
         "##INFO=<ID=DupToIns,Number=0,Type=Flag," \
         "Description=\"HGVS dup (duplication) was converted to insertion\">\n" \
         "##INFO=<ID=BasesShifted,Number=1,Type=Integer," \
         "Description=\"Position of HGVS variant was shifted this number of bases to the left\">\n"
 
 struct vcfRow *hgvsToVcfRow(char *db, char *term, boolean doLeftShift, struct dyString *dyError);
 /* Convert HGVS to a row of VCF suitable for sorting & printing.  If unable, return NULL and
  * put the reason in dyError.  Protein terms are ambiguous at the nucleotide level so they are
  * not supported at this point. */
 
+char *hgvsGFromVariant(struct seqWindow *gSeqWin, struct bed3 *variantBed, char *alt, char *acc,
+                       boolean breakDelIns);
+/* Return an HGVS g. string representing the genomic variant at the position of variantBed with
+ * reference allele from gSeqWin and alternate allele alt.  If acc is non-NULL it is used
+ * instead of variantBed->chrom.
+ * If breakDelIns, then show deleted bases (e.g. show 'delAGinsTT' instead of 'delinsTT'). */
+
+char *hgvsNFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli,
+                    struct dnaSeq *txSeq, boolean breakDelIns);
+/* Return an HGVS n. (noncoding transcript) term for a variant projected onto a transcript.
+ * gSeqWin must already have at least the correct seqName (chrom) if not the surrounding sequence.
+ * If breakDelIns, then show deleted bases (e.g. show 'delAGinsTT' instead of 'delinsTT'). */
+
+char *hgvsCFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli,
+                    struct genbankCds *cds,  struct dnaSeq *txSeq, boolean breakDelIns);
+/* Return an HGVS c. (coding transcript) term for a variant projected onto a transcript w/cds.
+ * gSeqWin must already have at least the correct seqName (chrom) if not the surrounding sequence.
+ * If breakDelIns, then show deleted bases (e.g. show 'delAGinsTT' instead of 'delinsTT'). */
+
+char *hgvsPFromVpPep(struct vpPep *vpPep, struct dnaSeq *protSeq, boolean addParens);
+/* Return an HGVS p. (protein) term for a variant projected into protein space.
+ * Strict HGVS compliance requires parentheses around predicted protein changes (addParens=TRUE),
+ * but nobody seems to do that in practice.
+ * Return NULL if an input is NULL. */
+
 #endif /* HGHGVS_H */