380a1b308bd3bb4f4e52d89ef9e1ccb962892bab angie Tue Oct 3 14:10:37 2017 -0700 Major changes to annoGratorGpVar, annoFormatVep and gpFx.c with the addition of functional effect prediction to variantProjector using PSL+CDS from annoStreamDbPslPlus, which enables accurate predictions even when the genome and transcript have indel differences. struct gpFx includes new members exonCount, txRef and txAlt so that gpFx and variantProjector can compute those and send them forward to annoFormatVep, instead of annoFormatVep computing them assuming that genome and transcript match perfectly. annoGratorGpVar passes forward the new gpFx members in output columns and, when input is PSL+CDS instead of genePred, uses variantProjector instead of gpFx to do functional predictions. diff --git src/hg/inc/gpFx.h src/hg/inc/gpFx.h index 68972dd..fc000d6 100644 --- src/hg/inc/gpFx.h +++ src/hg/inc/gpFx.h @@ -1,70 +1,87 @@ -/* gpFx --- routines to calculate the effect of variation on a genePred */ +/* gpFx --- routines to calculate the effect of variation on a genePred; variantProjector.c can + * also produce struct gpFx results using PSL+CDS+sequence, more accurate than genePred. */ /* Copyright (C) 2013 The Regents of the University of California * See README in this or parent directory for licensing information. 
*/ #ifndef GPFX_H #define GPFX_H #include "variant.h" #include "soTerm.h" // A single gpFx variant effect call; also a singly-linked list node (see 'next') struct gpFx { struct gpFx *next; - char *allele; // Allele sequence used to determine functional effect + char *gAllele; // Genomic allele sequence used to determine functional effect char *transcript; // ID of feature affected by this call uint soNumber; // Sequence Ontology Number of effect enum detailType // This tells which member of 'union details' below to use { unknown, // Catch uninitialized (except for needMem) use codingChange, // (non)synonymous variant, deletions in CDS nonCodingExon, // variant in non-coding gene or UTR of coding gene intron, // intron_variant none // variant for which soNumber is enough (e.g. up/downstream) } detailType; union details { struct codingChange // (non)synonymous variant, deletions in CDS { uint exonNumber; // 0-based exon number (from genePred, beware false "introns") + uint exonCount; // Total number of exons (sometimes fewer than aligned blocks) uint cDnaPosition; // offset of variant in transcript cDNA + char *txRef; // Transcript reference allele (usually == genomic ref, modulo strand) + char *txAlt; // Transcript alternate allele (usually == gAllele, modulo strand) uint cdsPosition; // offset of variant from transcript's cds start uint pepPosition; // offset of variant in translated product char *aaOld; // peptides, before change by variant (starting at pepPos) char *aaNew; // peptides, changed by variant char *codonOld; // codons, before change by variant (starting at cdsPos) char *codonNew; // codons, changed by variant } codingChange; struct nonCodingExon // variant in non-coding gene or UTR of coding gene { uint exonNumber; // 0-based exon number (from genePred, beware false "introns") + uint exonCount; // Total number of exons (sometimes fewer than aligned blocks) uint cDnaPosition; // offset of variant in transcript cDNA + char *txRef; // Transcript reference allele (usually == genomic ref, modulo strand) + char *txAlt; 
// Transcript alternate allele (usually == gAllele, modulo strand) } nonCodingExon; struct intron // intron_variant { uint intronNumber; // 0-based intron number (from genePred, beware false "introns") + uint intronCount; // Total number of introns (sometimes fewer than aligned blocks - 1) } intron; } details; }; struct gpFx *gpFxPredEffect(struct variant *variant, struct genePred *pred, struct dnaSeq *transcriptSequence, struct lm *lm); // return the predicted effect(s) of a variation list on a genePred // number of bases up or downstream that we flag #define GPRANGE 5000 boolean hasAltAllele(struct allele *alleles); /* Return TRUE if alleles include at least one non-reference allele. */ char *firstAltAllele(struct allele *alleles); /* Ensembl always reports an alternate allele, even if that allele is not being used * to calculate any consequence. When allele doesn't really matter, just use the * first alternate allele that is given. */ +struct gpFx *gpFxNew(char *gAllele, char *transcript, enum soTerm soNumber, + enum detailType detailType, struct lm *lm); +/* Fill in the common members of gpFx; leave soTerm-specific members for caller to fill in. */ + struct gpFx *gpFxNoVariation(struct variant *variant, struct lm *lm); /* Return a gpFx with SO term no_sequence_alteration, for VCF rows that aren't really variants. */ +void gpFxSetNoncodingInfo(struct gpFx *gpFx, int exonIx, int exonCount, int cdnaPos, + char *txRef, char *txAlt, struct lm *lm); +/* This gpFx is for a variant in exon of non-coding gene or UTR exon of coding gene; + * set details.nonCodingExon values. */ + #endif /* GPFX_H */