533112afe2a2005e80cdb1f82904ea65032d4302 braney Sat Oct 2 11:37:34 2021 -0700 split hg/lib into two separate libaries, one only used by the cgis diff --git src/hg/cgilib/annoFormatVep.c src/hg/cgilib/annoFormatVep.c new file mode 100644 index 0000000..4c54628 --- /dev/null +++ src/hg/cgilib/annoFormatVep.c @@ -0,0 +1,1625 @@ +/* annoFormatVep -- write functional predictions in the same format as Ensembl's + * Variant Effect Predictor to fileName, interpreting input rows according to config. + * See http://uswest.ensembl.org/info/docs/variation/vep/vep_formats.html */ + +/* Copyright (C) 2014 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#include "annoFormatVep.h" +#include "annoGratorGpVar.h" +#include "annoGratorQuery.h" +#include "annoStreamDbPslPlus.h" +#include "asParse.h" +#include "bigGenePred.h" +#include "dystring.h" +#include "genePred.h" +#include "gpFx.h" +#include "hdb.h" +#include "hgHgvs.h" +#include "htmshell.h" +#include "pgSnp.h" +#include "portable.h" +#include "vcf.h" +#include + +struct annoFormatVepExtraItem + // A single input column whose value should be placed in the Extras output column, + // identified by tag. + { + struct annoFormatVepExtraItem *next; + char *tag; // Keyword to use in extras column (tag=value;) + char *description; // Text description for output header + int rowIx; // Offset of column in row from data source + // (N/A for wig sources) + boolean isRegulatory; // TRUE if overlap implies regulatory_region + boolean isBoolean; // TRUE if we don't need to print out the value, + // only whether it is found + }; + +struct annoFormatVep; + +struct annoFormatVepExtraSource + // A streamer or grator that supplies at least one value for Extras output column. + { + struct annoFormatVepExtraSource *next; + struct annoStreamer *source; // streamer or grator: same pointers as below + struct annoFormatVepExtraItem *items; // one or more columns of source and their tags + + void (*printExtra)(struct annoFormatVepExtraSource *self, + struct annoFormatVep *afVep, struct annoFormatVepExtraItem *extraItem, + struct annoRow *extraRows, struct annoRow *gpvRow, boolean *pGotExtra); + /* Method for printing items from this source; pGotExtra is in/out to keep track of + * whether any call has actually printed something yet. */ + }; + +struct annoFormatVepConfig + // Describe the primary source and grators (whose rows must be delivered in this order) + // that provide data for VEP output columns. + { + struct annoStreamer *variantSource; // Primary source: variants + char *variantDescription; // Description to appear in output header + struct annoStreamer *gpVarSource; // annoGratorGpVar makes the core predictions + char *gpVarDescription; // Description to appear in output header + struct annoStreamer *snpSource; // Latest dbSNP provides IDs of known variants + char *snpDescription; // Description to appear in output header + struct annoFormatVepExtraSource *extraSources; // Everything else that may be tacked on + }; + +enum afVariantDataType + // What type of primary data (variants) are we integrating with? + { + afvdtInvalid, // uninitialized/unknown + afvdtVcf, // primary data source is VCF + afvdtPgSnpTable, // primary data source is pgSnp table with bin in .as + afvdtPgSnpFile // primary data source is pgSnp file, no bin + }; + +struct annoFormatVep +// Subclass of annoFormatter that writes VEP-equivalent output to a file. + { + struct annoFormatter formatter; // superclass / external interface + struct annoFormatVepConfig *config; // Description of input sources and values for Extras col + char *fileName; // Output filename + FILE *f; // Output file handle + struct lm *lm; // localmem for scratch storage + struct dyString *dyScratch; // dyString for local temporary use + struct annoAssembly *assembly; // Reference assembly (for sequence) + int lmRowCount; // counter for periodic localmem cleanup + int varNameIx; // Index of name column from variant source, or -1 if N/A + int varAllelesIx; // Index of alleles column from variant source, or -1 + int txNameIx; // Index of transcript name from (big)genePred/PSL + int geneNameIx; // Index of gene name from (big)genePred/PSL+ if included + int hgvsGIx; // Index of HGVS g. column from annoGratorGpVar + int hgvsCNIx; // Index of HGVS c./n. column from annoGratorGpVar + int hgvsPIx; // Index of HGVS p. column from annoGratorGpVar + int snpNameIx; // Index of name column from dbSNP source, or -1 + boolean needHeader; // TRUE if we should print out the header + enum afVariantDataType variantType; // Are variants VCF or a flavor of pgSnp? + boolean doHtml; // TRUE if we should include html tags & make a . + struct seqWindow *gSeqWin; // genomic sequence fetcher for HGVS term generation + boolean hgvsMakeG; // Generate genomic (g.) HGVS terms only if this is set + boolean hgvsBreakDelIns; // Include deleted sequence (not only ins) e.g. delGGinsAT + }; + + +INLINE void afVepLineBreak(FILE *f, boolean doHtml) +/* Depending on whether we're printing HTML, print either a newline or
. */ +{ +if (doHtml) + fputs("
", f); +fputc('\n', f); +} + +INLINE void afVepStartRow(FILE *f, boolean doHtml) +/* If we're printing HTML, print a , otherwise, just a newline. */ +{ +if (doHtml) + fputs("\n", f); +else + fputc('\n', f); +} + +INLINE void afVepPuts(char *text, struct annoFormatVep *self) +/* Write text to self->f. If we're printing HTML, write encoded text, otherwise just write text. */ +{ +if (self->doHtml) + htmTextOut(self->f, text); +else + fputs(text, self->f); +} + +static void afVepPrintf(struct annoFormatVep *self, char *format, ...) +/* Printf to self->f, but if self->doHtml then encode before writing out. */ +#ifdef __GNUC__ +__attribute__((format(printf, 2, 3))) +#endif + ; + +static void afVepPrintf(struct annoFormatVep *self, char *format, ...) +/* Printf to self->f, but if self->doHtml then encode before writing out. */ +{ +va_list args; +va_start(args, format); +if (self->doHtml) + vaHtmlFprintf(self->f, format, args); +else + vfprintf(self->f, format, args); +va_end(args); +} + +static void afVepPrintHeaderExtraTags(struct annoFormatVep *self, char *bStart, char *bEnd) +/* For each extra column described in config, write out its tag and a brief description. + * bStart and bEnd are for bold output in HTML, or if not HTML, starting the line with "## ". */ +{ +struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; +if (extras == NULL) + return; +fprintf(self->f, "%sKeys for Extra column items:%s", bStart, bEnd); +afVepLineBreak(self->f, self->doHtml); +for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) + { + struct annoFormatVepExtraItem *extraItem; + for (extraItem = extraSrc->items; extraItem != NULL; extraItem = extraItem->next) + if (isNotEmpty(extraItem->tag)) + { + fprintf(self->f, "%s%s%s: %s", bStart, extraItem->tag, bEnd, extraItem->description); + afVepLineBreak(self->f, self->doHtml); + } + } +} + +static void afVepPrintHeaderDate(FILE *f, boolean doHtml) +/* VEP header includes a date formatted like "2012-06-16 16:09:38" */ +{ +long now = clock1(); +struct tm *tm = localtime(&now); +fprintf(f, "## Output produced at %d-%02d-%02d %02d:%02d:%02d", + 1900 + tm->tm_year, 1 + tm->tm_mon, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec); +afVepLineBreak(f, doHtml); +} + +static char *columnLabels[] = { "Uploaded Variation", + "Location", + "Allele", + "Gene", + "Feature", + "Feature type", + "Consequence", + "Position in cDNA", + "Position in CDS", + "Position in protein", + "Amino acid change", + "Codon change", + "Co-located Variation", + "Extra", + NULL }; + +static void afVepPrintColumnLabels(struct annoFormatVep *self) +/* If we're printing HTML, begin a table and use TH for labels; otherwise just + * print out a tab-sep text line with labels. VEP doesn't begin this line with a #! */ +{ +char **pLabel = columnLabels; +if (self->doHtml) + { + fputs("
. */ +{ +if (doHtml) + fputs("
", f); +} + +INLINE void afVepNextColumn(FILE *f, boolean doHtml) +/* Depending on whether we're printing HTML, print either a tab or . */ +{ +if (doHtml) + fputs("", f); +else + fputc('\t', f); +} + +INLINE void afVepEndRow(FILE *f, boolean doHtml) +/* If we're printing HTML, print a
\n\n", self->f); + } +else + { + fputs(*pLabel++, self->f); + while (*pLabel != NULL) + { + fputc('\t', self->f); + fputs(*pLabel++, self->f); + } + fputc('\n', self->f); + } +} + +static void afVepPrintHeader(struct annoFormatVep *self, char *db) +/* Print a header that looks almost like a VEP header. */ +{ +FILE *f = self->f; +boolean doHtml = self->doHtml; +char *bStart = doHtml ? "" : "## "; +char *bEnd = doHtml ? "" : ""; +if (!doHtml) + { + // Suppress these lines from HTML output -- IMO they're better suited for a file header: + fputs("## ENSEMBL VARIANT EFFECT PREDICTOR format (UCSC Variant Annotation Integrator)", f); + afVepLineBreak(f, doHtml); + afVepPrintHeaderDate(f, doHtml); + fprintf(f, "## Connected to UCSC database %s", db); + afVepLineBreak(f, doHtml); + } +struct annoFormatVepConfig *config = self->config; +fprintf(f, "%sVariants:%s ", bStart, bEnd); +afVepPrintf(self, "%s", config->variantDescription); +if (! strstr(config->variantSource->name, "/trash/")) + fprintf(f, " (%s)", config->variantSource->name); +afVepLineBreak(f, doHtml); +fprintf(f, "%sTranscripts:%s %s (%s)", bStart, bEnd, + config->gpVarDescription, config->gpVarSource->name); +afVepLineBreak(f, doHtml); +if (config->snpSource != NULL) + { + fprintf(f, "%sdbSNP:%s %s (%s)", bStart, bEnd, + config->snpDescription, config->snpSource->name); + afVepLineBreak(f, doHtml); + } +afVepPrintHeaderExtraTags(self, bStart, bEnd); +afVepPrintColumnLabels(self); +self->needHeader = FALSE; +} + +static void afVepInitialize(struct annoFormatter *fSelf, struct annoStreamer *primarySource, + struct annoStreamer *integrators) +/* Print header, regardless of whether we get any data after this. */ +{ +struct annoFormatVep *self = (struct annoFormatVep *)fSelf; +if (self->needHeader) + afVepPrintHeader(self, primarySource->assembly->name); +} + +static void afVepComment(struct annoFormatter *fSelf, char *content) +/* Print out a comment, either starting with "# " or as a warnBox depending on doHtml. */ +{ +if (strchr(content, '\n')) + errAbort("afVepComment: no multi-line input"); +struct annoFormatVep *self = (struct annoFormatVep *)fSelf; +if (self->doHtml) + warn("%s", content); +else + fprintf(self->f, "# %s\n", content); +} + +static void compressDashes(char *string) +/* If string has a run of '-' characters, turn it into single '-'. */ +{ +char *p = string; +while ((p = strchr(p, '-')) != NULL) + { + char *end = p+1; + while (*end == '-') + end++; + if (end > p+1) + memmove(p+1, end, strlen(end)+1); + p++; + } +} + +static char *afVepGetAlleles(struct annoFormatVep *self, struct annoRow *varRow) +/* Return a string with slash-separated alleles. Result is alloc'd by self->lm. */ +{ +char **varWords = (char **)(varRow->data); +char *alleles = NULL; +if (self->variantType == afvdtVcf) + alleles = lmCloneString(self->lm, vcfGetSlashSepAllelesFromWords(varWords, self->dyScratch)); +else if (self->varAllelesIx >= 0) + alleles = lmCloneString(self->lm, varWords[self->varAllelesIx]); +else + errAbort("annoFormatVep: afVepSetConfig didn't specify how to get alleles"); +compressDashes(alleles); +return alleles; +} + +static void afVepPrintNameAndLoc(struct annoFormatVep *self, struct annoRow *varRow) +/* Print variant name and position in genome. */ +{ +char **varWords = (char **)(varRow->data); +uint start1Based = varRow->start + 1; +// Use variant name if available, otherwise construct an identifier: +if (self->varNameIx >= 0 && !sameString(varWords[self->varNameIx], ".")) + afVepPuts(varWords[self->varNameIx], self); +else + { + char *alleles = afVepGetAlleles(self, varRow); + afVepPrintf(self, "%s_%u_%s", varRow->chrom, start1Based, alleles); + } +afVepNextColumn(self->f, self->doHtml); +// Location is chr:start for single-base, chr:start-end for indels: +if (varRow->end == start1Based) + afVepPrintf(self, "%s:%u", varRow->chrom, start1Based); +else if (start1Based > varRow->end) + afVepPrintf(self, "%s:%u-%u", varRow->chrom, varRow->end, start1Based); +else + afVepPrintf(self, "%s:%u-%u", varRow->chrom, start1Based, varRow->end); +afVepNextColumn(self->f, self->doHtml); +} + +INLINE void afVepPrintPlaceholders(FILE *f, int count, boolean doHtml) +/* VEP uses "-" for N/A. Sometimes there are several consecutive N/A columns. + * Count = 0 means print "-" with no tab. Count > 0 means print that many "-" columns. */ +{ +if (count == 0) + fputc('-', f); +else + { + int i; + for (i = 0; i < count; i++) + { + fputc('-', f); + afVepNextColumn(f, doHtml); + } + } +} + +#define afVepPrintPlaceholder(f, doHtml) afVepPrintPlaceholders(f, 1, doHtml) + +INLINE char *placeholderForEmpty(char *val) +/* Return VEP placeholder "-" if val is empty, otherwise return val. */ +{ +return (isEmpty(val) ? "-" : val); +} + + +static void afVepPrintGene(struct annoFormatVep *self, struct annoRow *gpvRow) +/* If the genePred portion of gpvRow contains a gene name (in addition to transcript name), + * print it out; otherwise print a placeholder. */ +{ +if (self->geneNameIx >= 0) + { + char **words = (char **)(gpvRow->data); + afVepPuts(placeholderForEmpty(words[self->geneNameIx]), self); + afVepNextColumn(self->f, self->doHtml); + } +else + afVepPrintPlaceholder(self->f, self->doHtml); +} + +static void limitLength(char *seq, int baseCount, char *unit) +/* If seq is longer than an abbreviated version of itself, change it to the abbreviated version. */ +{ +if (isEmpty(seq)) + return; +int len = strlen(seq); +const int elipsLen = 3; +char lengthNote[512]; +safef(lengthNote, sizeof(lengthNote), "(%d %s)", len, unit); +if (len > 2*baseCount + elipsLen + strlen(lengthNote)) + { + // First baseCount bases, then "...": + int offset = baseCount; + safecpy(seq+offset, len+1-offset, "..."); + offset += elipsLen; + // then last baseCount bases: + safecpy(seq+offset, len+1-offset, seq+len-baseCount); + offset += baseCount; + // then lengthNote: + safecpy(seq+offset, len+1-offset, lengthNote); + } +} + +static void tweakStopCodonAndLimitLength(char *aaSeq, char *codonSeq) +/* If aa from gpFx has a stop 'Z', replace it with "*". + * If the strings are very long, truncate with a note about how long they are. */ +{ +char *earlyStop = strchr(aaSeq, 'Z'); +if (earlyStop) + { + earlyStop[0] = '*'; + earlyStop[1] = '\0'; + } +limitLength(aaSeq, 5, "aa"); +limitLength(codonSeq, 12, "nt"); +} + +INLINE char *dashForEmpty(char *s) +/* Represent empty alleles in insertions/deletions by the customary "-". */ +{ +if (isEmpty(s)) + return "-"; +else + return s; +} + +static void afVepPrintPredictions(struct annoFormatVep *self, struct annoRow *varRow, + struct annoRow *gpvRow, struct gpFx *gpFx) +/* Print VEP columns computed by annoGratorGpVar (or placeholders) */ +{ +// variant allele used to calculate the consequence (or first alternate allele) +char *abbrevAllele = cloneString(gpFx->gAllele); +limitLength(abbrevAllele, 24, "nt"); +afVepPuts(placeholderForEmpty(abbrevAllele), self); +afVepNextColumn(self->f, self->doHtml); +// ID of affected gene +afVepPrintGene(self, gpvRow); +// ID of feature +afVepPrintf(self, "%s", placeholderForEmpty(gpFx->transcript)); +afVepNextColumn(self->f, self->doHtml); +// type of feature {Transcript, RegulatoryFeature, MotifFeature} +if (gpFx->soNumber == intergenic_variant) + afVepPrintPlaceholder(self->f, self->doHtml); +else + { + fputs("Transcript", self->f); + afVepNextColumn(self->f, self->doHtml); + } +// consequence: SO term e.g. splice_region_variant +afVepPuts(soTermToString(gpFx->soNumber), self); +afVepNextColumn(self->f, self->doHtml); +if (gpFx->detailType == codingChange) + { + struct codingChange *change = &(gpFx->details.codingChange); + uint refLen = strlen(change->txRef), altLen = strlen(change->txAlt); + boolean isInsertion = (refLen == 0); + boolean isDeletion = altLen < refLen; + if (isInsertion) + { + fprintf(self->f, "%u-%u", change->cDnaPosition, change->cDnaPosition+1); + afVepNextColumn(self->f, self->doHtml); + fprintf(self->f, "%u-%u", change->cdsPosition, change->cdsPosition+1); + } + else if (isDeletion && refLen > 1) + { + fprintf(self->f, "%u-%u", change->cDnaPosition+1, change->cDnaPosition+refLen); + afVepNextColumn(self->f, self->doHtml); + fprintf(self->f, "%u-%u", change->cdsPosition+1, change->cdsPosition+refLen); + } + else + { + fprintf(self->f, "%u", change->cDnaPosition+1); + afVepNextColumn(self->f, self->doHtml); + fprintf(self->f, "%u", change->cdsPosition+1); + } + afVepNextColumn(self->f, self->doHtml); + fprintf(self->f, "%u", change->pepPosition+1); + afVepNextColumn(self->f, self->doHtml); + int variantFrame = change->cdsPosition % 3; + strLower(change->codonOld); + int upLen = min(strlen(change->codonOld+variantFrame), refLen); + toUpperN(change->codonOld+variantFrame, upLen); + strLower(change->codonNew); + int alleleLength = altLen; + // watch out for symbolic alleles [actually now that we're using txAlt we shouldn't see these]: + if (sameString(change->txAlt, "") || sameString(change->txAlt, "<*>")) + alleleLength = upLen; + else if (startsWith("<", change->txAlt)) + alleleLength = 0; + toUpperN(change->codonNew+variantFrame, alleleLength); + tweakStopCodonAndLimitLength(change->aaOld, change->codonOld); + tweakStopCodonAndLimitLength(change->aaNew, change->codonNew); + fprintf(self->f, "%s/%s", dashForEmpty(change->aaOld), dashForEmpty(change->aaNew)); + afVepNextColumn(self->f, self->doHtml); + fprintf(self->f, "%s/%s", dashForEmpty(change->codonOld), dashForEmpty(change->codonNew)); + afVepNextColumn(self->f, self->doHtml); + } +else if (gpFx->detailType == nonCodingExon) + { + struct nonCodingExon *change = &(gpFx->details.nonCodingExon); + uint refLen = strlen(change->txRef), altLen = strlen(change->txAlt); + boolean isInsertion = (refLen == 0); + boolean isDeletion = altLen < refLen; + int cDnaPosition = change->cDnaPosition; + if (isInsertion) + fprintf(self->f, "%u-%u", cDnaPosition, cDnaPosition+1); + else if (isDeletion) + fprintf(self->f, "%u-%u", cDnaPosition+1, cDnaPosition+refLen); + else + fprintf(self->f, "%u", cDnaPosition+1); + afVepNextColumn(self->f, self->doHtml); + // Coding effect columns (except for cDnaPosition) are N/A: + afVepPrintPlaceholders(self->f, 4, self->doHtml); + } +else + // Coding effect columns are N/A: + afVepPrintPlaceholders(self->f, 5, self->doHtml); +freez(&abbrevAllele); +} + +static void afVepPrintExistingVar(struct annoFormatVep *self, struct annoRow *varRow, + struct annoStreamRows gratorData[], int gratorCount) +/* Print existing variant ID (or placeholder) */ +{ +if (self->snpNameIx >= 0) + { + if (gratorCount < 2 || gratorData[1].streamer != self->config->snpSource) + errAbort("annoFormatVep: config error, snpSource is not where expected"); + struct annoRow *snpRows = gratorData[1].rowList, *row; + if (snpRows != NULL) + { + int count = 0; + for (row = snpRows; row != NULL; row = row->next) + { + char **snpWords = (char **)(row->data); + if (row->start == varRow->start && row->end == varRow->end) + { + if (count > 0) + fputc(',', self->f); + afVepPrintf(self, "%s", snpWords[self->snpNameIx]); + count++; + } + } + if (count == 0) + afVepPrintPlaceholder(self->f, self->doHtml); + else + afVepNextColumn(self->f, self->doHtml); + } + else + afVepPrintPlaceholder(self->f, self->doHtml); + } +else + afVepPrintPlaceholder(self->f, self->doHtml); +} + +static boolean isCodingSnv(struct annoRow *primaryRow, struct gpFx *gpFx) +/* Return TRUE if this is a single-nucleotide non-synonymous change. */ +{ +if (gpFx == NULL || + gpFx->detailType != codingChange || + gpFx->soNumber == synonymous_variant) + return FALSE; +char *txRef = gpFx->details.codingChange.txRef; +char *txAlt = gpFx->details.codingChange.txAlt; +if (txRef == NULL || txAlt == NULL || + strlen(txRef) != 1 || sameString(txRef, "-") || + strlen(txAlt) != 1 || sameString(txAlt, "-")) + return FALSE; +return TRUE; +} + +static int commaSepFindIx(char *item, char *s) +/* Treating comma-separated, non-NULL s as an array of words, + * return the index of item in the array, or -1 if item is not in array. */ +{ +int itemLen = strlen(item); +int ix = 0; +char *p = strchr(s, ','); +while (p != NULL) + { + int elLen = (p - s); + if (elLen == itemLen && strncmp(s, item, itemLen) == 0) + return ix; + s = p+1; + p = strchr(s, ','); + ix++; + } +if (strlen(s) == itemLen && strncmp(s, item, itemLen) == 0) + return ix; +return -1; +} + +static int commaSepFindIntIx(int item, char *s) +/* Treating comma-separated, non-NULL s as an array of words that encode integers, + * return the index of item in the array, or -1 if item is not in array. */ +{ +char itemString[64]; +safef(itemString, sizeof(itemString), "%d", item); +return commaSepFindIx(itemString, s); +} + +static char *commaSepWordFromIxOrWhole(int ix, char *s, struct lm *lm) +/* Treating comma-separated, non-NULL s as an array of words, + * return the word at ix in the array -- but if s has no commas, + * just return (lmCloned) s. errAborts if s is comma-sep but ix is out of bounds. */ +{ +int i = 0; +char *p = strchr(s, ','); +if (p == NULL) + return lmCloneString(lm, s); +while (p != NULL) + { + if (i == ix) + return lmCloneStringZ(lm, s, p-s); + s = p+1; + p = strchr(s, ','); + i++; + } +if (i == ix) + return lmCloneString(lm, s); +errAbort("commaSepWordFromIxOrWhole: Bad index %d for string '%s'", ix, s); +return NULL; +} + +INLINE void afVepNewExtra(struct annoFormatVep *self, boolean *pGotExtra) +/* If we already printed an extra item, print the extra column's separator; set pGotExtra. */ +{ +assert(pGotExtra); +if (*pGotExtra) + { + if (self->doHtml) + fputs("; ", self->f); + else + fputc(';', self->f); + } +*pGotExtra = TRUE; +} + +static void afVepPrintDbNsfpSift(struct annoFormatVep *self, + struct annoFormatVepExtraSource *extraSrc, + struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, + boolean *pGotExtra) +/* Match the allele from gpFx to the per-allele scores in row from dbNsfpSift. */ +{ +// Look up column indices only once: +static int ensTxIdIx=-1, altAl1Ix, score1Ix, altAl2Ix, score2Ix, altAl3Ix, score3Ix; +if (ensTxIdIx == -1) + { + struct asColumn *columns = extraSrc->source->asObj->columnList; + ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); + altAl1Ix = asColumnMustFindIx(columns, "altAl1"); + score1Ix = asColumnMustFindIx(columns, "score1"); + altAl2Ix = asColumnMustFindIx(columns, "altAl2"); + score2Ix = asColumnMustFindIx(columns, "score2"); + altAl3Ix = asColumnMustFindIx(columns, "altAl3"); + score3Ix = asColumnMustFindIx(columns, "score3"); + } + +struct annoRow *row; +for (row = extraRows; row != NULL; row = row->next) + { + // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx + // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame) + char **words = row->data; + if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && + commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) + continue; + struct annoFormatVepExtraItem *extraItem = extraSrc->items; + char *scoreStr = NULL; + if (sameString(gpFx->gAllele, words[altAl1Ix])) + scoreStr = words[score1Ix]; + else if (sameString(gpFx->gAllele, words[altAl2Ix])) + scoreStr = words[score2Ix]; + else if (sameString(gpFx->gAllele, words[altAl3Ix])) + scoreStr = words[score3Ix]; + double score = atof(scoreStr); + char prediction = '?'; + if (score < 0.05) + prediction = 'D'; + else + prediction = 'T'; + if (isNotEmpty(scoreStr) && differentString(scoreStr, ".")) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "%s=%c(%s)", extraItem->tag, prediction, scoreStr); + } + } +} + +static void afVepPrintDbNsfpPolyPhen2(struct annoFormatVep *self, + struct annoFormatVepExtraSource *extraSrc, + struct annoRow *extraRows, struct gpFx *gpFx, + boolean *pGotExtra) +/* Match the allele from gpFx to the per-allele scores in each row from dbNsfpPolyPhen2. */ +{ +// Look up column indices only once: +static int aaPosIx=-1, + altAl1Ix, hDivScore1Ix, hDivPred1Ix, hVarScore1Ix, hVarPred1Ix, + altAl2Ix, hDivScore2Ix, hDivPred2Ix, hVarScore2Ix, hVarPred2Ix, + altAl3Ix, hDivScore3Ix, hDivPred3Ix, hVarScore3Ix, hVarPred3Ix; +if (aaPosIx == -1) + { + struct asColumn *columns = extraSrc->source->asObj->columnList; + aaPosIx = asColumnMustFindIx(columns, "uniProtAaPos"); + altAl1Ix = asColumnMustFindIx(columns, "altAl1"); + hDivScore1Ix = asColumnMustFindIx(columns, "hDivScore1"); + hDivPred1Ix = asColumnMustFindIx(columns, "hDivPred1"); + hVarScore1Ix = asColumnMustFindIx(columns, "hVarScore1"); + hVarPred1Ix = asColumnMustFindIx(columns, "hVarPred1"); + altAl2Ix = asColumnMustFindIx(columns, "altAl2"); + hDivScore2Ix = asColumnMustFindIx(columns, "hDivScore2"); + hDivPred2Ix = asColumnMustFindIx(columns, "hDivPred2"); + hVarScore2Ix = asColumnMustFindIx(columns, "hVarScore2"); + hVarPred2Ix = asColumnMustFindIx(columns, "hVarPred2"); + altAl3Ix = asColumnMustFindIx(columns, "altAl3"); + hDivScore3Ix = asColumnMustFindIx(columns, "hDivScore3"); + hDivPred3Ix = asColumnMustFindIx(columns, "hDivPred3"); + hVarScore3Ix = asColumnMustFindIx(columns, "hVarScore3"); + hVarPred3Ix = asColumnMustFindIx(columns, "hVarPred3"); + } + +struct codingChange *cc = &(gpFx->details.codingChange); +int count = 0; +struct annoRow *row; +for (row = extraRows; row != NULL; row = row->next) + { + char **words = row->data; +//#*** polyphen2 can actually have multiple scores/preds for the same pepPosition... +//#*** so what we really should do is loop on comma-sep words[aaPosIx] and print scores/preds +//#*** whenever pepPosition matches. + int txIx = commaSepFindIntIx(cc->pepPosition+1, words[aaPosIx]); + if (txIx < 0) + continue; + struct annoFormatVepExtraItem *extraItem = extraSrc->items; + for (; extraItem != NULL; extraItem = extraItem->next) + { + boolean isHdiv = (stringIn("HDIV", extraItem->tag) != NULL); + int predIx = -1, scoreIx = -1; + if (sameString(gpFx->gAllele, words[altAl1Ix])) + { + predIx = isHdiv ? hDivPred1Ix : hVarPred1Ix; + scoreIx = isHdiv ? hDivScore1Ix : hVarScore1Ix; + } + else if (sameString(gpFx->gAllele, words[altAl2Ix])) + { + predIx = isHdiv ? hDivPred2Ix : hVarPred2Ix; + scoreIx = isHdiv ? hDivScore2Ix : hVarScore2Ix; + } + else if (sameString(gpFx->gAllele, words[altAl3Ix])) + { + predIx = isHdiv ? hDivPred3Ix : hVarPred3Ix; + scoreIx = isHdiv ? hDivScore3Ix : hVarScore3Ix; + } + char *pred = (predIx < 0) ? NULL : words[predIx]; + if (pred == NULL || sameString(pred, ".")) + continue; + pred = commaSepWordFromIxOrWhole(txIx, pred, self->lm); + if (isNotEmpty(pred)) + { + if (count == 0) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "%s=", extraItem->tag); + } + else + fputc(',', self->f); + char *score = (scoreIx < 0) ? "?" : commaSepWordFromIxOrWhole(txIx, words[scoreIx], self->lm); + fprintf(self->f, "%s(%s)", pred, score); + count++; + } + } + } +} + +static void afVepPrintDbNsfpMutationTA(struct annoFormatVep *self, + struct annoFormatVepExtraSource *extraSrc, + struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, + boolean *pGotExtra) +/* Match the allele from gpFx to the per-allele scores in row from dbNsfpMutationTaster + * or dbNsfpMutationAssessor -- they have identical column indices. */ +{ +// Look up column indices only once: +static int ensTxIdIx=-1, altAl1Ix, score1Ix, pred1Ix, + altAl2Ix, score2Ix, pred2Ix, altAl3Ix, score3Ix, pred3Ix; +if (ensTxIdIx == -1) + { + struct asColumn *columns = extraSrc->source->asObj->columnList; + ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); + altAl1Ix = asColumnMustFindIx(columns, "altAl1"); + score1Ix = asColumnMustFindIx(columns, "score1"); + pred1Ix = asColumnMustFindIx(columns, "pred1"); + altAl2Ix = asColumnMustFindIx(columns, "altAl2"); + score2Ix = asColumnMustFindIx(columns, "score2"); + pred2Ix = asColumnMustFindIx(columns, "pred2"); + altAl3Ix = asColumnMustFindIx(columns, "altAl3"); + score3Ix = asColumnMustFindIx(columns, "score3"); + pred3Ix = asColumnMustFindIx(columns, "pred3"); + } + +struct annoRow *row; +for (row = extraRows; row != NULL; row = row->next) + { + // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx + // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame) + char **words = row->data; + if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && + commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) + continue; + struct annoFormatVepExtraItem *extraItem = extraSrc->items; + char *score = NULL, *pred = NULL; + if (sameString(gpFx->gAllele, words[altAl1Ix])) + { + score = words[score1Ix]; + pred = words[pred1Ix]; + } + else if (sameString(gpFx->gAllele, words[altAl2Ix])) + { + score = words[score2Ix]; + pred = words[pred2Ix]; + } + else if (sameString(gpFx->gAllele, words[altAl3Ix])) + { + score = words[score3Ix]; + pred = words[pred3Ix]; + } + if (isNotEmpty(score) && differentString(score, ".")) + { + afVepNewExtra(self, pGotExtra); + if (isEmpty(pred)) + pred = "?"; + fprintf(self->f, "%s=%s(%s)", extraItem->tag, pred, score); + } + } +} + +static void afVepPrintDbNsfpLrt(struct annoFormatVep *self, + struct annoFormatVepExtraSource *extraSrc, + struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, + boolean *pGotExtra) +/* Match the allele from gpFx to the per-allele scores in row from dbNsfpLrt -- + * it also has omega{1,2,3} columns, but I'm not sure what those mean so am leaving out for now. */ +{ +// Look up column indices only once: +static int ensTxIdIx=-1, altAl1Ix, score1Ix, pred1Ix, + altAl2Ix, score2Ix, pred2Ix, altAl3Ix, score3Ix, pred3Ix; +if (ensTxIdIx == -1) + { + struct asColumn *columns = extraSrc->source->asObj->columnList; + ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); + altAl1Ix = asColumnMustFindIx(columns, "altAl1"); + score1Ix = asColumnMustFindIx(columns, "score1"); + pred1Ix = asColumnMustFindIx(columns, "pred1"); + altAl2Ix = asColumnMustFindIx(columns, "altAl2"); + score2Ix = asColumnMustFindIx(columns, "score2"); + pred2Ix = asColumnMustFindIx(columns, "pred2"); + altAl3Ix = asColumnMustFindIx(columns, "altAl3"); + score3Ix = asColumnMustFindIx(columns, "score3"); + pred3Ix = asColumnMustFindIx(columns, "pred3"); + } + +struct annoRow *row; +for (row = extraRows; row != NULL; row = row->next) + { + // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx + // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame) + char **words = row->data; + if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && + commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) + continue; + struct annoFormatVepExtraItem *extraItem = extraSrc->items; + char *score = NULL, *pred = NULL; + if (sameString(gpFx->gAllele, words[altAl1Ix])) + { + score = words[score1Ix]; + pred = words[pred1Ix]; + } + else if (sameString(gpFx->gAllele, words[altAl2Ix])) + { + score = words[score2Ix]; + pred = words[pred2Ix]; + } + else if (sameString(gpFx->gAllele, words[altAl3Ix])) + { + score = words[score3Ix]; + pred = words[pred3Ix]; + } + if (isNotEmpty(score) && differentString(score, ".")) + { + afVepNewExtra(self, pGotExtra); + if (isEmpty(pred)) + pred = "?"; + fprintf(self->f, "%s=%s(%s)", extraItem->tag, pred, score); + } + } +} + +static void afVepPrintDbNsfpVest(struct annoFormatVep *self, + struct annoFormatVepExtraSource *extraSrc, + struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, + boolean *pGotExtra) +/* Match the allele from gpFx to the per-allele scores in row from dbNsfpVest */ +{ +// TODO: also compare var[123] protein change columns to gpFx to make sure we have the right one +// Look up column indices only once: +static int ensTxIdIx=-1, altAl1Ix, score1Ix, altAl2Ix, score2Ix, altAl3Ix, score3Ix; +if (ensTxIdIx == -1) + { + struct asColumn *columns = extraSrc->source->asObj->columnList; + ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); + altAl1Ix = asColumnMustFindIx(columns, "altAl1"); + score1Ix = asColumnMustFindIx(columns, "score1"); + altAl2Ix = asColumnMustFindIx(columns, "altAl2"); + score2Ix = asColumnMustFindIx(columns, "score2"); + altAl3Ix = asColumnMustFindIx(columns, "altAl3"); + score3Ix = asColumnMustFindIx(columns, "score3"); + } + +struct annoRow *row; +for (row = extraRows; row != NULL; row = row->next) + { + // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx + char **words = row->data; + if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && + commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) + continue; + struct annoFormatVepExtraItem *extraItem = extraSrc->items; + char *score = NULL; + if (sameString(gpFx->gAllele, words[altAl1Ix])) + score = words[score1Ix]; + else if (sameString(gpFx->gAllele, words[altAl2Ix])) + score = words[score2Ix]; + else if (sameString(gpFx->gAllele, words[altAl3Ix])) + score = words[score3Ix]; + if (isNotEmpty(score) && differentString(score, ".")) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "%s=%s", extraItem->tag, score); + } + } +} + +static void afVepPrintDbNsfpInterPro(struct annoFormatVep *self, + struct annoFormatVepExtraSource *extraSrc, + struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, + boolean *pGotExtra) +/* Print out any overlapping (comma-sep list, HTML-encoded ',' and '=') InterPro domains. */ +{ +// Look up column indices only once: +static int domainsIx = -1; +if (domainsIx == -1) + { + struct asColumn *columns = extraSrc->source->asObj->columnList; + domainsIx = asColumnMustFindIx(columns, "domains"); + } + +struct annoRow *row; +for (row = extraRows; row != NULL; row = row->next) + { + char **words = row->data; + struct annoFormatVepExtraItem *extraItem = extraSrc->items; + char *domains = words[domainsIx]; + int lastC = strlen(domains) - 1; + if (domains[lastC] == ',') + domains[lastC] = '\0'; + if (isNotEmpty(domains) && differentString(domains, ".")) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "%s=%s", extraItem->tag, domains); + } + } +} + +static boolean allelesAgree(char altNt, char altAa, char **words) +/* Return TRUE if dbNsfpSeqChange words have altAa associated with altNt. */ +{ +//#*** TODO: handle stop codon representation +if ((altNt == words[11][0] && altAa == words[12][0]) || + (altNt == words[13][0] && altAa == words[14][0]) || + (altNt == words[15][0] && altAa == words[16][0])) + return TRUE; +return FALSE; +} + +static struct annoRow *getRowsFromSource(struct annoStreamer *src, + struct annoStreamRows gratorData[], int gratorCount) +/* Search gratorData for src, and return its rows when found. */ +{ +int i; +for (i = 0; i < gratorCount; i++) + { + if (gratorData[i].streamer == src) + return gratorData[i].rowList; + } +errAbort("annoFormatVep: Can't find source %s in gratorData", src->name); +return NULL; +} + +static char *getDbNsfpEnsTx(struct annoFormatVep *self, struct gpFx *gpFx, + struct annoStreamRows *gratorData, int gratorCount) +/* Find the Ensembl transcript ID, if any, for which dbNsfp has results consistent + * with gpFx. */ +{ +int i; +for (i = 0; i < gratorCount; i++) + { + struct annoStreamer *source = gratorData[i].streamer; + if (!sameString(source->asObj->name, "dbNsfpSeqChange")) //#*** need metadata! + continue; + struct codingChange *cc = &(gpFx->details.codingChange); + struct annoRow *extraRows = getRowsFromSource(source, gratorData, gratorCount); + struct annoRow *row; + for (row = extraRows; row != NULL; row = row->next) + { + char **words = row->data; + if (strcasecmp(cc->codonOld, words[7]) != 0) + continue; + if (!allelesAgree(gpFx->gAllele[0], cc->aaNew[0], words)) + continue; + int txIx = commaSepFindIntIx(cc->pepPosition+1, words[10]); + if (txIx >= 0) + { + if (sameString(words[4], ".")) + return "."; + char *ensTxId = commaSepWordFromIxOrWhole(txIx, words[4], self->lm); + return ensTxId; + } + } + break; + } +return NULL; +} + +static void afVepPrintExtrasDbNsfp(struct annoFormatVep *self, struct annoRow *varRow, + struct annoRow *gpvRow, struct gpFx *gpFx, + struct annoStreamRows gratorData[], int gratorCount, + boolean *pGotExtra) +/* Print the Extra column's tag=value; components from dbNSFP data, if we have any. */ +{ +// dbNSFP has data only for coding non-synonymous single-nucleotide changes: +if (!isCodingSnv(varRow, gpFx)) + return; +// Does dbNsfpSeqChange have a coding change consistent with gpFx?: +char *ensTxId = getDbNsfpEnsTx(self, gpFx, gratorData, gratorCount); +// Now cycle through selected dbNsfp* sources, printing out scores for ensTxId: +struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; +for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) + { + char *asObjName = extraSrc->source->asObj->name; + if (!startsWith("dbNsfp", asObjName)) + continue; + struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount); + if (sameString(asObjName, "dbNsfpPolyPhen2")) + { + // PolyPhen2 is based on UniProt proteins, not GENCODE/Ensembl transcripts, + // so ensTxId doesn't apply. + afVepPrintDbNsfpPolyPhen2(self, extraSrc, extraRows, gpFx, pGotExtra); + if (ensTxId == NULL) + break; // all done now, no need to keep looking + } + else if (ensTxId != NULL) + { + if (sameString(asObjName, "dbNsfpSift")) + afVepPrintDbNsfpSift(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); + else if (sameString(asObjName, "dbNsfpMutationTaster") || + sameString(asObjName, "dbNsfpMutationAssessor")) + afVepPrintDbNsfpMutationTA(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); + else if (sameString(asObjName, "dbNsfpLrt")) + afVepPrintDbNsfpLrt(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); + else if (sameString(asObjName, "dbNsfpVest")) + afVepPrintDbNsfpVest(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); + else if (sameString(asObjName, "dbNsfpInterPro")) + afVepPrintDbNsfpInterPro(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); + else if (!sameString(asObjName, "dbNsfpSeqChange")) + errAbort("Unrecognized asObj->name '%s' from dbNSFP source '%s'", + asObjName, extraSrc->source->name); + } + } +} + +static void afVepPrintExtrasHgvs(struct annoFormatVep *self, struct annoRow *gpvRow, + boolean *pGotExtra) +/* If not empty, print HGVS terms (last column of gpvRow). */ +{ +char **row = gpvRow->data; +if (isNotEmpty(row[self->hgvsGIx])) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "HGVSG=%s", row[self->hgvsGIx]); + } +if (isNotEmpty(row[self->hgvsCNIx])) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "HGVSCN=%s", row[self->hgvsCNIx]); + } +if (isNotEmpty(row[self->hgvsPIx])) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "HGVSP=%s", row[self->hgvsPIx]); + } +} + +static void afVepPrintExtraWigVec(struct annoFormatVepExtraSource *extraSrc, + struct annoFormatVep *self, + struct annoFormatVepExtraItem *extraItem, + struct annoRow *extraRows, struct annoRow *gpvRow, + boolean *pGotExtra) +/* Print values from a wig source with type arWigVec (per-base floats) */ +{ +int i; +struct annoRow *row; +for (i = 0, row = extraRows; row != NULL; i++, row = row->next) + { + float *vector = row->data; + int len = row->end - row->start; + int j; + for (j = 0; j < len; j++) + { + if (i+j == 0) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "%s=", extraItem->tag); + } + else + fputc(',', self->f); + fprintf(self->f, "%g", vector[j]); + } + } +} + +static void afVepPrintExtraWigSingle(struct annoFormatVepExtraSource *extraSrc, + struct annoFormatVep *self, + struct annoFormatVepExtraItem *extraItem, + struct annoRow *extraRows, struct annoRow *gpvRow, + boolean *pGotExtra) +/* Print values from a wig source with type arWigSingle (one double per row, e.g. an average) */ +{ +struct annoRow *row; +for (row = extraRows; row != NULL; row = row->next) + { + double *pValue = row->data; + if (row == extraRows) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "%s=", extraItem->tag); + } + else + fputc(',', self->f); + fprintf(self->f, "%g", *pValue); + } +} + +static void afVepPrintExtraWords(struct annoFormatVepExtraSource *extraSrc, + struct annoFormatVep *self, + struct annoFormatVepExtraItem *extraItem, + struct annoRow *extraRows, struct annoRow *gpvRow, + boolean *pGotExtra) +/* Print comma-separated values in the specified column from the usual array-of-words. */ +{ +if (extraItem->rowIx < 0) + errAbort("annoFormatVep: invalid rowIx for tag %s", extraItem->tag); +char **gpvWords = gpvRow ? gpvRow->data : NULL; +char *gpvTranscript = gpvWords ? gpvWords[self->txNameIx] : NULL; +boolean gotGpvTx = FALSE; +int i; +struct annoRow *row; +for (i = 0, row = extraRows; row != NULL; row = row->next) + { + char **words = row->data; + char *extraTranscript = words[0]; + // If extraSrc happens to be gpVarSource, use only the row that matches the current + // transcript, and use it only once. + if (extraSrc->source == self->config->gpVarSource) + { + extraTranscript = words[self->txNameIx]; + if (gpvTranscript == NULL || differentString(extraTranscript, gpvTranscript) || gotGpvTx) + continue; + gotGpvTx = TRUE; + } + char *val = words[extraItem->rowIx]; + if (isEmpty(val)) + continue; + // Strip trailing comma if found + int valLen = strlen(val); + if (val[valLen-1] == ',') + val[valLen-1] = '\0'; + if (i == 0) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "%s=", extraItem->tag); + if (extraItem->isBoolean) + fputs("YES", self->f); + } + else if (! extraItem->isBoolean) + fputc(',', self->f); + if (! extraItem->isBoolean) + { + // Watch out for characters that will mess up parsing of EXTRAS column: + subChar(val, '=', '_'); + subChar(val, ';', '_'); + afVepPuts(val, self); + } + i++; + } +} + +static void afVepPrintExtrasOther(struct annoFormatVep *self, struct annoRow *varRow, + struct annoRow *gpvRow, struct gpFx *gpFx, + struct annoStreamRows gratorData[], int gratorCount, + boolean includeRegulatory, boolean *pGotExtra) +/* Print the Extra column's tag=value; components (other than dbNSFP) if we have any. */ +{ +struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; +for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) + { + char *asObjName = extraSrc->source->asObj->name; + if (startsWith("dbNsfp", asObjName)) + continue; + struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount); + if (extraRows != NULL) + { + struct annoFormatVepExtraItem *extraItem; + for (extraItem = extraSrc->items; extraItem != NULL; extraItem = extraItem->next) + if (includeRegulatory || ! extraItem->isRegulatory) + extraSrc->printExtra(extraSrc, self, extraItem, extraRows, gpvRow, pGotExtra); + } + } +// VEP automatically adds DISTANCE for upstream/downstream variants +if (gpFx && (gpFx->soNumber == upstream_gene_variant || gpFx->soNumber == downstream_gene_variant)) + { + afVepNewExtra(self, pGotExtra); + // In order to really understand the boundary conditions of Ensembl's DISTANCE function, + // I'll have to read the code someday. Upstream deletions on + strand still not handled + // quite right. + int ensemblFudge = 0; + int distance = gpvRow->start - varRow->start + ensemblFudge; + if (distance < 0) + { + ensemblFudge = 1; + distance = varRow->start - gpvRow->end + ensemblFudge; + } + fprintf(self->f, "DISTANCE=%d", distance); + } +boolean includeExonNumber = TRUE; //#*** optional in VEP +if (includeExonNumber && gpFx) + { + // Add Exon or intron number if applicable + enum detailType deType = gpFx->detailType; + int num = -1, count = 0; + if (deType == codingChange) + { + num = gpFx->details.codingChange.exonNumber; + count = gpFx->details.codingChange.exonCount; + } + else if (deType == nonCodingExon) + { + num = gpFx->details.nonCodingExon.exonNumber; + count = gpFx->details.nonCodingExon.exonCount; + } + else if (deType == intron) + { + num = gpFx->details.intron.intronNumber; + count = gpFx->details.intron.intronCount; + } + if (num >= 0) + { + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "%s=%d/%d", (deType == intron) ? "INTRON" : "EXON", num+1, count); + } + } +if (!*pGotExtra) + afVepPrintPlaceholders(self->f, 0, self->doHtml); +} + +static void afVepLmCleanup(struct annoFormatVep *self) +/* Clean out localmem every 1k rows. */ +{ +self->lmRowCount++; +if (self->lmRowCount > 1024) + { + lmCleanup(&(self->lm)); + self->lm = lmInit(0); + self->lmRowCount = 0; + } +} + +static void afVepPrintOneLineGpVar(struct annoFormatVep *self, struct annoStreamRows *varData, + struct annoRow *gpvRow, + struct annoStreamRows gratorData[], int gratorCount) +/* Print one line of VEP with a coding gene consequence: a variant, an allele, + * functional consequences of that allele, and whatever else is included in the config. */ +{ +afVepLmCleanup(self); +struct annoRow *varRow = varData->rowList; +struct gpFx *gpFx = annoGratorGpVarGpFxFromRow(self->config->gpVarSource, gpvRow, self->lm); +afVepStartRow(self->f, self->doHtml); +afVepPrintNameAndLoc(self, varRow); +afVepPrintPredictions(self, varRow, gpvRow, gpFx); +afVepPrintExistingVar(self, varRow, gratorData, gratorCount); +boolean gotExtra = FALSE; +afVepPrintExtrasDbNsfp(self, varRow, gpvRow, gpFx, gratorData, gratorCount, &gotExtra); +afVepPrintExtrasHgvs(self, gpvRow, &gotExtra); +afVepPrintExtrasOther(self, varRow, gpvRow, gpFx, gratorData, gratorCount, FALSE, &gotExtra); +afVepEndRow(self->f, self->doHtml); +} + +static boolean afVepIsRegulatory(struct annoFormatVep *self, + struct annoStreamRows gratorData[], int gratorCount) +/* Return TRUE if one of the extraSource items is a regulatory region. */ +{ +struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; +for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) + { + struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount); + if (extraRows != NULL) + { + struct annoFormatVepExtraItem *extraItem; + for (extraItem = extraSrc->items; extraItem != NULL; extraItem = extraItem->next) + { + if (extraItem->isRegulatory) + return TRUE; + } + } + } +return FALSE; +} + +static char *afVepGetFirstAltAllele(struct annoFormatVep *self, struct annoRow *varRow) +/* For VEP we must print out some alternate allele, whether the alternate allele is + * related to any other info (like coding changes) or not. Pick the first non-reference + * allele. */ +{ +int refAlBufSize = varRow->end - varRow->start + 1; +char refAllele[refAlBufSize]; +annoAssemblyGetSeq(self->assembly, varRow->chrom, varRow->start, varRow->end, + refAllele, sizeof(refAllele)); +struct variant *variant = NULL; +if (self->variantType == afvdtVcf) + variant = variantFromVcfAnnoRow(varRow, refAllele, self->lm, self->dyScratch); +else if (self->variantType == afvdtPgSnpTable) + variant = variantFromPgSnpAnnoRow(varRow, refAllele, TRUE, self->lm); +else if (self->variantType == afvdtPgSnpFile) + variant = variantFromPgSnpAnnoRow(varRow, refAllele, FALSE, self->lm); +return firstAltAllele(variant->alleles); +} + +static void afVepPrintPredictionsReg(struct annoFormatVep *self, char *alt) +/* Print VEP allele, placeholder gene and item names, 'RegulatoryFeature', + * 'regulatory_region_variant', and placeholder coding effect columns. */ +{ +afVepPuts(alt, self); +afVepNextColumn(self->f, self->doHtml); +afVepPrintPlaceholders(self->f, 2, self->doHtml); +fputs("RegulatoryFeature", self->f); +afVepNextColumn(self->f, self->doHtml); +fputs(soTermToString(regulatory_region_variant), self->f); +afVepNextColumn(self->f, self->doHtml); +// Coding effect columns are N/A: +afVepPrintPlaceholders(self->f, 5, self->doHtml); +} + +static void afVepPrintExtrasHgvsGOnly(struct annoFormatVep *self, struct annoRow *varRow, char *alt, + boolean *pGotExtra) +/* Make our own HGVS g. term (when we aren't taking HGVS terms from annoGratorGpVar). */ +{ +if (self->hgvsMakeG) + { + struct bed3 *variantBed = (struct bed3 *)varRow; + char *chromAcc = hRefSeqAccForChrom(self->assembly->name, varRow->chrom); + char *hgvsG = hgvsGFromVariant(self->gSeqWin, variantBed, alt, chromAcc, self->hgvsBreakDelIns); + afVepNewExtra(self, pGotExtra); + fprintf(self->f, "HGVSG=%s", hgvsG); + freeMem(hgvsG); + } +} + +static void afVepPrintRegulatory(struct annoFormatVep *self, struct annoStreamRows *varData, + struct annoStreamRows gratorData[], int gratorCount) +/* If there are EXTRAS column sources that imply regulatory_region_variant, print out + * lines with that consequence (no gpFx/dbNSFP). */ +{ +if (afVepIsRegulatory(self, gratorData, gratorCount)) + { + afVepLmCleanup(self); + struct annoRow *varRow = varData->rowList; + afVepStartRow(self->f, self->doHtml); + afVepPrintNameAndLoc(self, varRow); + char *alt = afVepGetFirstAltAllele(self, varRow); + afVepPrintPredictionsReg(self, alt); + afVepPrintExistingVar(self, varRow, gratorData, gratorCount); + boolean gotExtra = FALSE; + afVepPrintExtrasHgvsGOnly(self, varRow, alt, &gotExtra); + afVepPrintExtrasOther(self, varRow, NULL, NULL, gratorData, gratorCount, TRUE, &gotExtra); + afVepEndRow(self->f, self->doHtml); + } +} + +static void afVepFormatOne(struct annoFormatter *fSelf, struct annoStreamRows *primaryData, + struct annoStreamRows gratorData[], int gratorCount) +/* Print one variant's VEP (possibly multiple lines) using collected rows. */ +{ +struct annoFormatVep *self = (struct annoFormatVep *)fSelf; +struct annoRow *gpVarRows = gratorData[0].rowList, *gpvRow; +for (gpvRow = gpVarRows; gpvRow != NULL; gpvRow = gpvRow->next) + afVepPrintOneLineGpVar(self, primaryData, gpvRow, gratorData, gratorCount); +afVepPrintRegulatory(self, primaryData, gratorData, gratorCount); +} + +static void afVepEndHtml(struct annoFormatVep *self) +{ +fputs("
", self->f); + fputs(*pLabel++, self->f); + while (*pLabel != NULL) + { + fputs("", self->f); + fputs(*pLabel++, self->f); + } + fputs("

\n", self->f); +} + +static void afVepClose(struct annoFormatter **pFSelf) +/* Close file handle, free self. */ +{ +if (pFSelf == NULL) + return; +struct annoFormatVep *self = *(struct annoFormatVep **)pFSelf; +if (self->doHtml) + afVepEndHtml(self); +freeMem(self->fileName); +carefulClose(&(self->f)); +lmCleanup(&(self->lm)); +dyStringFree(&(self->dyScratch)); +chromSeqWindowFree(&self->gSeqWin); +annoFormatterFree(pFSelf); +} + +static void afVepSetConfig(struct annoFormatVep *self, struct annoFormatVepConfig *config) +/* Check config and figure out where various output columns will come from. */ +{ +self->config = config; +struct asColumn *varAsColumns = config->variantSource->asObj->columnList; +self->varNameIx = -1; +self->varAllelesIx = -1; +if (asObjectsMatch(config->variantSource->asObj, pgSnpAsObj())) + self->variantType = afvdtPgSnpTable; +else if (asObjectsMatch(config->variantSource->asObj, pgSnpFileAsObj())) + self->variantType = afvdtPgSnpFile; +else if (asObjectsMatch(config->variantSource->asObj, vcfAsObj())) + { + self->variantType = afvdtVcf; + self->varNameIx = asColumnMustFindIx(varAsColumns, "id"); + } +else + errAbort("afVepSetConfig: variant source %s doesn't look like pgSnp or VCF", + config->variantSource->name); +if (self->variantType == afvdtPgSnpTable || self->variantType == afvdtPgSnpFile) + { + // pgSnp's "name" column actually contains slash-separated alleles + self->varAllelesIx = asColumnMustFindIx(varAsColumns, "name"); + } +if (config->gpVarSource == NULL) + errAbort("afVepSetConfig: config must have a gpVarSource"); +else if (! asColumnNamesMatchFirstN(config->gpVarSource->asObj, genePredAsObj(), 10) && + ! asColumnNamesMatchFirstN(config->gpVarSource->asObj, bigGenePredAsObj(), 16) && + ! asColumnNamesMatchFirstN(config->gpVarSource->asObj, annoStreamDbPslPlusAsObj(), + PSLPLUS_NUM_COLS)) + errAbort("afVepSetConfig: gpVarSource %s doesn't look like genePred or pslPlus", + config->gpVarSource->name); +// refGene and augmented knownGene have extra name fields that have the HGNC gene symbol: +struct asColumn *gpvAsColumns = config->gpVarSource->asObj->columnList; +self->txNameIx = asColumnFindIx(gpvAsColumns, "name"); +if (self->txNameIx < 0) + self->txNameIx = asColumnFindIx(gpvAsColumns, "qName"); +if (self->txNameIx < 0) + errAbort("afVepSetConfig: can't find name or qName column in gpVarSource %s", + config->gpVarSource->name); +self->geneNameIx = asColumnFindIx(gpvAsColumns, "geneSymbol"); +if (self->geneNameIx < 0) + self->geneNameIx = asColumnFindIx(gpvAsColumns, "kgXref_geneSymbol"); +if (self->geneNameIx < 0) + self->geneNameIx = asColumnFindIx(gpvAsColumns, "name2"); +if (self->geneNameIx < 0) + self->geneNameIx = asColumnFindIx(gpvAsColumns, "proteinID"); +self->hgvsGIx = asColumnMustFindIx(gpvAsColumns, "hgvsG"); +self->hgvsCNIx = asColumnMustFindIx(gpvAsColumns, "hgvsCN"); +self->hgvsPIx = asColumnMustFindIx(gpvAsColumns, "hgvsP"); +if (config->snpSource != NULL) + { + struct asColumn *snpAsColumns = config->snpSource->asObj->columnList; + self->snpNameIx = asColumnMustFindIx(snpAsColumns, "name"); + } +else + self->snpNameIx = -1; +} + +struct annoFormatVepConfig *annoFormatVepConfigNew(struct annoStreamer *variantSource, + char *variantDescription, + struct annoStreamer *gpVarSource, + char *gpVarDescription, + struct annoStreamer *snpSource, + char *snpDescription) +/* Return a basic configuration for VEP output. variantSource and gpVarSource must be + * provided; snpSource can be NULL. */ +{ +struct annoFormatVepConfig *config; +AllocVar(config); +config->variantSource = variantSource; +config->variantDescription = cloneString(variantDescription); +config->gpVarSource = gpVarSource; +config->gpVarDescription = cloneString(gpVarDescription); +config->snpSource = snpSource; +config->snpDescription = cloneString(snpDescription); +return config; +} + +struct annoFormatter *annoFormatVepNew(char *fileName, boolean doHtml, + struct annoStreamer *variantSource, + char *variantDescription, + struct annoStreamer *gpVarSource, + char *gpVarDescription, + struct annoStreamer *snpSource, + char *snpDescription, + struct annoAssembly *assembly) +/* Return a formatter that will write functional predictions in the same format as Ensembl's + * Variant Effect Predictor to fileName (can be "stdout"). + * variantSource and gpVarSource must be provided; snpSource can be NULL. */ +{ +struct annoFormatVep *self; +AllocVar(self); +struct annoFormatter *fSelf = &(self->formatter); +fSelf->getOptions = annoFormatterGetOptions; +fSelf->setOptions = annoFormatterSetOptions; +fSelf->initialize = afVepInitialize; +fSelf->comment = afVepComment; +fSelf->formatOne = afVepFormatOne; +fSelf->close = afVepClose; +self->fileName = cloneString(fileName); +self->f = mustOpen(fileName, "w"); +self->lm = lmInit(0); +self->dyScratch = dyStringNew(0); +self->needHeader = TRUE; +self->doHtml = doHtml; +self->assembly = assembly; +struct annoFormatVepConfig *config = annoFormatVepConfigNew(variantSource, variantDescription, + gpVarSource, gpVarDescription, + snpSource, snpDescription); +afVepSetConfig(self, config); +return (struct annoFormatter *)self; +} + +static void afvAddExtraItemMaybeReg(struct annoFormatter *fSelf, struct annoStreamer *extraSource, + char *tag, char *description, char *column, boolean isBoolean, + boolean isReg) +/* Tell annoFormatVep that it should include the given column of extraSource + * in the EXTRAS column with tag. The VEP header will include tag's description. + * If isReg, overlap implies that the variant has consequence regulatory_region_variant. + * For some special-cased sources e.g. dbNsfp files, column may be NULL/ignored. */ +{ +struct annoFormatVep *self = (struct annoFormatVep *)fSelf; +struct annoFormatVepExtraSource *src; +for (src = self->config->extraSources; src != NULL; src = src->next) + if (src->source == extraSource) + break; +if (src == NULL) + { + AllocVar(src); + src->source = extraSource; + if (extraSource->rowType == arWigVec) + src->printExtra = afVepPrintExtraWigVec; + else if (extraSource->rowType == arWigSingle) + src->printExtra = afVepPrintExtraWigSingle; + else + src->printExtra = afVepPrintExtraWords; + slAddTail(&(self->config->extraSources), src); + } +struct annoFormatVepExtraItem *item; +AllocVar(item); +item->tag = cloneString(tag); +item->description = cloneString(description); +item->rowIx = -1; +if (isNotEmpty(column)) + item->rowIx = asColumnMustFindIx(extraSource->asObj->columnList, column); +item->isRegulatory = isReg; +item->isBoolean = isBoolean; +slAddTail(&(src->items), item); +} + +void annoFormatVepAddExtraItem(struct annoFormatter *fSelf, struct annoStreamer *extraSource, + char *tag, char *description, char *column, boolean isBoolean) +/* Tell annoFormatVep that it should include the given column of extraSource + * in the EXTRAS column with tag. The VEP header will include tag's description. + * For some special-cased sources e.g. dbNsfp files, column may be NULL/ignored. + * If isBoolean, the column's description won't be output, only whether a match was found. */ +{ +afvAddExtraItemMaybeReg(fSelf, extraSource, tag, description, column, isBoolean, FALSE); +} + +void annoFormatVepAddRegulatory(struct annoFormatter *fSelf, struct annoStreamer *regSource, + char *tag, char *description, char *column) +/* Tell annoFormatVep to use the regulatory_region_variant consequence if there is overlap + * with regSource, and to include the given column of regSource in the EXTRAS column. + * The VEP header will include tag's description. */ +{ +afvAddExtraItemMaybeReg(fSelf, regSource, tag, description, column, FALSE, TRUE); +struct annoFormatVep *self = (struct annoFormatVep *)fSelf; +if (self->gSeqWin == NULL) + { + char *db = regSource->assembly->name; + self->gSeqWin = chromSeqWindowNew(db, NULL, 0, 0); + } +} + +void annoFormatVepSetHgvsOutOptions(struct annoFormatter *fSelf, uint hgvsOutOptions) +/* Import the HGVS output options described in hgHgvs.h */ +{ +struct annoFormatVep *self = (struct annoFormatVep *)fSelf; +// We only do g. terms, for regulatory region variants, so no need for transcript/protein opts. +if (hgvsOutOptions & HGVS_OUT_G) + self->hgvsMakeG = TRUE; +if (hgvsOutOptions & HGVS_OUT_BREAK_DELINS) + self->hgvsBreakDelIns = TRUE; +}