f3d3d03be73576bd1011a6b34e604c9c6566751b angie Fri Jul 21 20:55:57 2023 -0700 Move up NULL check of scoreStr before use (doh). refs MLQ #31782 diff --git src/hg/cgilib/annoFormatVep.c src/hg/cgilib/annoFormatVep.c index e741f9e..9941e60 100644 --- src/hg/cgilib/annoFormatVep.c +++ src/hg/cgilib/annoFormatVep.c @@ -1,1625 +1,1625 @@ /* annoFormatVep -- write functional predictions in the same format as Ensembl's * Variant Effect Predictor to fileName, interpreting input rows according to config. * See http://uswest.ensembl.org/info/docs/variation/vep/vep_formats.html */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "annoFormatVep.h" #include "annoGratorGpVar.h" #include "annoGratorQuery.h" #include "annoStreamDbPslPlus.h" #include "asParse.h" #include "bigGenePred.h" #include "dystring.h" #include "genePred.h" #include "gpFx.h" #include "hdb.h" #include "hgHgvs.h" #include "htmshell.h" #include "pgSnp.h" #include "portable.h" #include "vcf.h" #include <time.h> struct annoFormatVepExtraItem // A single input column whose value should be placed in the Extras output column, // identified by tag. { struct annoFormatVepExtraItem *next; char *tag; // Keyword to use in extras column (tag=value;) char *description; // Text description for output header int rowIx; // Offset of column in row from data source // (N/A for wig sources) boolean isRegulatory; // TRUE if overlap implies regulatory_region boolean isBoolean; // TRUE if we don't need to print out the value, // only whether it is found }; struct annoFormatVep; struct annoFormatVepExtraSource // A streamer or grator that supplies at least one value for Extras output column. { struct annoFormatVepExtraSource *next; struct annoStreamer *source; // streamer or grator: same pointers as below struct annoFormatVepExtraItem *items; // one or more columns of source and their tags void (*printExtra)(struct annoFormatVepExtraSource *self, struct annoFormatVep *afVep, struct annoFormatVepExtraItem *extraItem, struct annoRow *extraRows, struct annoRow *gpvRow, boolean *pGotExtra); /* Method for printing items from this source; pGotExtra is in/out to keep track of * whether any call has actually printed something yet. */ }; struct annoFormatVepConfig // Describe the primary source and grators (whose rows must be delivered in this order) // that provide data for VEP output columns. { struct annoStreamer *variantSource; // Primary source: variants char *variantDescription; // Description to appear in output header struct annoStreamer *gpVarSource; // annoGratorGpVar makes the core predictions char *gpVarDescription; // Description to appear in output header struct annoStreamer *snpSource; // Latest dbSNP provides IDs of known variants char *snpDescription; // Description to appear in output header struct annoFormatVepExtraSource *extraSources; // Everything else that may be tacked on }; enum afVariantDataType // What type of primary data (variants) are we integrating with? { afvdtInvalid, // uninitialized/unknown afvdtVcf, // primary data source is VCF afvdtPgSnpTable, // primary data source is pgSnp table with bin in .as afvdtPgSnpFile // primary data source is pgSnp file, no bin }; struct annoFormatVep // Subclass of annoFormatter that writes VEP-equivalent output to a file. { struct annoFormatter formatter; // superclass / external interface struct annoFormatVepConfig *config; // Description of input sources and values for Extras col char *fileName; // Output filename FILE *f; // Output file handle struct lm *lm; // localmem for scratch storage struct dyString *dyScratch; // dyString for local temporary use struct annoAssembly *assembly; // Reference assembly (for sequence) int lmRowCount; // counter for periodic localmem cleanup int varNameIx; // Index of name column from variant source, or -1 if N/A int varAllelesIx; // Index of alleles column from variant source, or -1 int txNameIx; // Index of transcript name from (big)genePred/PSL int geneNameIx; // Index of gene name from (big)genePred/PSL+ if included int hgvsGIx; // Index of HGVS g. column from annoGratorGpVar int hgvsCNIx; // Index of HGVS c./n. column from annoGratorGpVar int hgvsPIx; // Index of HGVS p. column from annoGratorGpVar int snpNameIx; // Index of name column from dbSNP source, or -1 boolean needHeader; // TRUE if we should print out the header enum afVariantDataType variantType; // Are variants VCF or a flavor of pgSnp? boolean doHtml; // TRUE if we should include html tags & make a <table>. struct seqWindow *gSeqWin; // genomic sequence fetcher for HGVS term generation boolean hgvsMakeG; // Generate genomic (g.) HGVS terms only if this is set boolean hgvsBreakDelIns; // Include deleted sequence (not only ins) e.g. delGGinsAT }; INLINE void afVepLineBreak(FILE *f, boolean doHtml) /* Depending on whether we're printing HTML, print either a newline or <BR>. */ { if (doHtml) fputs("<BR>", f); fputc('\n', f); } INLINE void afVepStartRow(FILE *f, boolean doHtml) /* If we're printing HTML, print a <TR><TD>. */ { if (doHtml) fputs("<TR><TD>", f); } INLINE void afVepNextColumn(FILE *f, boolean doHtml) /* Depending on whether we're printing HTML, print either a tab or </TD><TD>. */ { if (doHtml) fputs("</TD><TD>", f); else fputc('\t', f); } INLINE void afVepEndRow(FILE *f, boolean doHtml) /* If we're printing HTML, print a </TD></TR>, otherwise, just a newline. */ { if (doHtml) fputs("</TD></TR>\n", f); else fputc('\n', f); } INLINE void afVepPuts(char *text, struct annoFormatVep *self) /* Write text to self->f. If we're printing HTML, write encoded text, otherwise just write text. */ { if (self->doHtml) htmTextOut(self->f, text); else fputs(text, self->f); } static void afVepPrintf(struct annoFormatVep *self, char *format, ...) /* Printf to self->f, but if self->doHtml then encode before writing out. */ #ifdef __GNUC__ __attribute__((format(printf, 2, 3))) #endif ; static void afVepPrintf(struct annoFormatVep *self, char *format, ...) /* Printf to self->f, but if self->doHtml then encode before writing out. */ { va_list args; va_start(args, format); if (self->doHtml) vaHtmlFprintf(self->f, format, args); else vfprintf(self->f, format, args); va_end(args); } static void afVepPrintHeaderExtraTags(struct annoFormatVep *self, char *bStart, char *bEnd) /* For each extra column described in config, write out its tag and a brief description. * bStart and bEnd are for bold output in HTML, or if not HTML, starting the line with "## ". */ { struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; if (extras == NULL) return; fprintf(self->f, "%sKeys for Extra column items:%s", bStart, bEnd); afVepLineBreak(self->f, self->doHtml); for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) { struct annoFormatVepExtraItem *extraItem; for (extraItem = extraSrc->items; extraItem != NULL; extraItem = extraItem->next) if (isNotEmpty(extraItem->tag)) { fprintf(self->f, "%s%s%s: %s", bStart, extraItem->tag, bEnd, extraItem->description); afVepLineBreak(self->f, self->doHtml); } } } static void afVepPrintHeaderDate(FILE *f, boolean doHtml) /* VEP header includes a date formatted like "2012-06-16 16:09:38" */ { long now = clock1(); struct tm *tm = localtime(&now); fprintf(f, "## Output produced at %d-%02d-%02d %02d:%02d:%02d", 1900 + tm->tm_year, 1 + tm->tm_mon, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec); afVepLineBreak(f, doHtml); } static char *columnLabels[] = { "Uploaded Variation", "Location", "Allele", "Gene", "Feature", "Feature type", "Consequence", "Position in cDNA", "Position in CDS", "Position in protein", "Amino acid change", "Codon change", "Co-located Variation", "Extra", NULL }; static void afVepPrintColumnLabels(struct annoFormatVep *self) /* If we're printing HTML, begin a table and use TH for labels; otherwise just * print out a tab-sep text line with labels. VEP doesn't begin this line with a #! */ { char **pLabel = columnLabels; if (self->doHtml) { fputs("<BR><TABLE class='stdTbl'>\n<TR><TH>", self->f); fputs(*pLabel++, self->f); while (*pLabel != NULL) { fputs("</TH><TH>", self->f); fputs(*pLabel++, self->f); } fputs("</TH></TR>\n", self->f); } else { fputs(*pLabel++, self->f); while (*pLabel != NULL) { fputc('\t', self->f); fputs(*pLabel++, self->f); } fputc('\n', self->f); } } static void afVepPrintHeader(struct annoFormatVep *self, char *db) /* Print a header that looks almost like a VEP header. */ { FILE *f = self->f; boolean doHtml = self->doHtml; char *bStart = doHtml ? "<B>" : "## "; char *bEnd = doHtml ? "</B>" : ""; if (!doHtml) { // Suppress these lines from HTML output -- IMO they're better suited for a file header: fputs("## ENSEMBL VARIANT EFFECT PREDICTOR format (UCSC Variant Annotation Integrator)", f); afVepLineBreak(f, doHtml); afVepPrintHeaderDate(f, doHtml); fprintf(f, "## Connected to UCSC database %s", db); afVepLineBreak(f, doHtml); } struct annoFormatVepConfig *config = self->config; fprintf(f, "%sVariants:%s ", bStart, bEnd); afVepPrintf(self, "%s", config->variantDescription); if (! strstr(config->variantSource->name, "/trash/")) fprintf(f, " (%s)", config->variantSource->name); afVepLineBreak(f, doHtml); fprintf(f, "%sTranscripts:%s %s (%s)", bStart, bEnd, config->gpVarDescription, config->gpVarSource->name); afVepLineBreak(f, doHtml); if (config->snpSource != NULL) { fprintf(f, "%sdbSNP:%s %s (%s)", bStart, bEnd, config->snpDescription, config->snpSource->name); afVepLineBreak(f, doHtml); } afVepPrintHeaderExtraTags(self, bStart, bEnd); afVepPrintColumnLabels(self); self->needHeader = FALSE; } static void afVepInitialize(struct annoFormatter *fSelf, struct annoStreamer *primarySource, struct annoStreamer *integrators) /* Print header, regardless of whether we get any data after this. */ { struct annoFormatVep *self = (struct annoFormatVep *)fSelf; if (self->needHeader) afVepPrintHeader(self, primarySource->assembly->name); } static void afVepComment(struct annoFormatter *fSelf, char *content) /* Print out a comment, either starting with "# " or as a warnBox depending on doHtml. */ { if (strchr(content, '\n')) errAbort("afVepComment: no multi-line input"); struct annoFormatVep *self = (struct annoFormatVep *)fSelf; if (self->doHtml) warn("%s", content); else fprintf(self->f, "# %s\n", content); } static void compressDashes(char *string) /* If string has a run of '-' characters, turn it into single '-'. */ { char *p = string; while ((p = strchr(p, '-')) != NULL) { char *end = p+1; while (*end == '-') end++; if (end > p+1) memmove(p+1, end, strlen(end)+1); p++; } } static char *afVepGetAlleles(struct annoFormatVep *self, struct annoRow *varRow) /* Return a string with slash-separated alleles. Result is alloc'd by self->lm. */ { char **varWords = (char **)(varRow->data); char *alleles = NULL; if (self->variantType == afvdtVcf) alleles = lmCloneString(self->lm, vcfGetSlashSepAllelesFromWords(varWords, self->dyScratch)); else if (self->varAllelesIx >= 0) alleles = lmCloneString(self->lm, varWords[self->varAllelesIx]); else errAbort("annoFormatVep: afVepSetConfig didn't specify how to get alleles"); compressDashes(alleles); return alleles; } static void afVepPrintNameAndLoc(struct annoFormatVep *self, struct annoRow *varRow) /* Print variant name and position in genome. */ { char **varWords = (char **)(varRow->data); uint start1Based = varRow->start + 1; // Use variant name if available, otherwise construct an identifier: if (self->varNameIx >= 0 && !sameString(varWords[self->varNameIx], ".")) afVepPuts(varWords[self->varNameIx], self); else { char *alleles = afVepGetAlleles(self, varRow); afVepPrintf(self, "%s_%u_%s", varRow->chrom, start1Based, alleles); } afVepNextColumn(self->f, self->doHtml); // Location is chr:start for single-base, chr:start-end for indels: if (varRow->end == start1Based) afVepPrintf(self, "%s:%u", varRow->chrom, start1Based); else if (start1Based > varRow->end) afVepPrintf(self, "%s:%u-%u", varRow->chrom, varRow->end, start1Based); else afVepPrintf(self, "%s:%u-%u", varRow->chrom, start1Based, varRow->end); afVepNextColumn(self->f, self->doHtml); } INLINE void afVepPrintPlaceholders(FILE *f, int count, boolean doHtml) /* VEP uses "-" for N/A. Sometimes there are several consecutive N/A columns. * Count = 0 means print "-" with no tab. Count > 0 means print that many "-" columns. */ { if (count == 0) fputc('-', f); else { int i; for (i = 0; i < count; i++) { fputc('-', f); afVepNextColumn(f, doHtml); } } } #define afVepPrintPlaceholder(f, doHtml) afVepPrintPlaceholders(f, 1, doHtml) INLINE char *placeholderForEmpty(char *val) /* Return VEP placeholder "-" if val is empty, otherwise return val. */ { return (isEmpty(val) ? "-" : val); } static void afVepPrintGene(struct annoFormatVep *self, struct annoRow *gpvRow) /* If the genePred portion of gpvRow contains a gene name (in addition to transcript name), * print it out; otherwise print a placeholder. */ { if (self->geneNameIx >= 0) { char **words = (char **)(gpvRow->data); afVepPuts(placeholderForEmpty(words[self->geneNameIx]), self); afVepNextColumn(self->f, self->doHtml); } else afVepPrintPlaceholder(self->f, self->doHtml); } static void limitLength(char *seq, int baseCount, char *unit) /* If seq is longer than an abbreviated version of itself, change it to the abbreviated version. */ { if (isEmpty(seq)) return; int len = strlen(seq); const int elipsLen = 3; char lengthNote[512]; safef(lengthNote, sizeof(lengthNote), "(%d %s)", len, unit); if (len > 2*baseCount + elipsLen + strlen(lengthNote)) { // First baseCount bases, then "...": int offset = baseCount; safecpy(seq+offset, len+1-offset, "..."); offset += elipsLen; // then last baseCount bases: safecpy(seq+offset, len+1-offset, seq+len-baseCount); offset += baseCount; // then lengthNote: safecpy(seq+offset, len+1-offset, lengthNote); } } static void tweakStopCodonAndLimitLength(char *aaSeq, char *codonSeq) /* If aa from gpFx has a stop 'Z', replace it with "*". * If the strings are very long, truncate with a note about how long they are. */ { char *earlyStop = strchr(aaSeq, 'Z'); if (earlyStop) { earlyStop[0] = '*'; earlyStop[1] = '\0'; } limitLength(aaSeq, 5, "aa"); limitLength(codonSeq, 12, "nt"); } INLINE char *dashForEmpty(char *s) /* Represent empty alleles in insertions/deletions by the customary "-". */ { if (isEmpty(s)) return "-"; else return s; } static void afVepPrintPredictions(struct annoFormatVep *self, struct annoRow *varRow, struct annoRow *gpvRow, struct gpFx *gpFx) /* Print VEP columns computed by annoGratorGpVar (or placeholders) */ { // variant allele used to calculate the consequence (or first alternate allele) char *abbrevAllele = cloneString(gpFx->gAllele); limitLength(abbrevAllele, 24, "nt"); afVepPuts(placeholderForEmpty(abbrevAllele), self); afVepNextColumn(self->f, self->doHtml); // ID of affected gene afVepPrintGene(self, gpvRow); // ID of feature afVepPrintf(self, "%s", placeholderForEmpty(gpFx->transcript)); afVepNextColumn(self->f, self->doHtml); // type of feature {Transcript, RegulatoryFeature, MotifFeature} if (gpFx->soNumber == intergenic_variant) afVepPrintPlaceholder(self->f, self->doHtml); else { fputs("Transcript", self->f); afVepNextColumn(self->f, self->doHtml); } // consequence: SO term e.g. splice_region_variant afVepPuts(soTermToString(gpFx->soNumber), self); afVepNextColumn(self->f, self->doHtml); if (gpFx->detailType == codingChange) { struct codingChange *change = &(gpFx->details.codingChange); uint refLen = strlen(change->txRef), altLen = strlen(change->txAlt); boolean isInsertion = (refLen == 0); boolean isDeletion = altLen < refLen; if (isInsertion) { fprintf(self->f, "%u-%u", change->cDnaPosition, change->cDnaPosition+1); afVepNextColumn(self->f, self->doHtml); fprintf(self->f, "%u-%u", change->cdsPosition, change->cdsPosition+1); } else if (isDeletion && refLen > 1) { fprintf(self->f, "%u-%u", change->cDnaPosition+1, change->cDnaPosition+refLen); afVepNextColumn(self->f, self->doHtml); fprintf(self->f, "%u-%u", change->cdsPosition+1, change->cdsPosition+refLen); } else { fprintf(self->f, "%u", change->cDnaPosition+1); afVepNextColumn(self->f, self->doHtml); fprintf(self->f, "%u", change->cdsPosition+1); } afVepNextColumn(self->f, self->doHtml); fprintf(self->f, "%u", change->pepPosition+1); afVepNextColumn(self->f, self->doHtml); int variantFrame = change->cdsPosition % 3; strLower(change->codonOld); int upLen = min(strlen(change->codonOld+variantFrame), refLen); toUpperN(change->codonOld+variantFrame, upLen); strLower(change->codonNew); int alleleLength = altLen; // watch out for symbolic alleles [actually now that we're using txAlt we shouldn't see these]: if (sameString(change->txAlt, "<X>") || sameString(change->txAlt, "<*>")) alleleLength = upLen; else if (startsWith("<", change->txAlt)) alleleLength = 0; toUpperN(change->codonNew+variantFrame, alleleLength); tweakStopCodonAndLimitLength(change->aaOld, change->codonOld); tweakStopCodonAndLimitLength(change->aaNew, change->codonNew); fprintf(self->f, "%s/%s", dashForEmpty(change->aaOld), dashForEmpty(change->aaNew)); afVepNextColumn(self->f, self->doHtml); fprintf(self->f, "%s/%s", dashForEmpty(change->codonOld), dashForEmpty(change->codonNew)); afVepNextColumn(self->f, self->doHtml); } else if (gpFx->detailType == nonCodingExon) { struct nonCodingExon *change = &(gpFx->details.nonCodingExon); uint refLen = strlen(change->txRef), altLen = strlen(change->txAlt); boolean isInsertion = (refLen == 0); boolean isDeletion = altLen < refLen; int cDnaPosition = change->cDnaPosition; if (isInsertion) fprintf(self->f, "%u-%u", cDnaPosition, cDnaPosition+1); else if (isDeletion) fprintf(self->f, "%u-%u", cDnaPosition+1, cDnaPosition+refLen); else fprintf(self->f, "%u", cDnaPosition+1); afVepNextColumn(self->f, self->doHtml); // Coding effect columns (except for cDnaPosition) are N/A: afVepPrintPlaceholders(self->f, 4, self->doHtml); } else // Coding effect columns are N/A: afVepPrintPlaceholders(self->f, 5, self->doHtml); freez(&abbrevAllele); } static void afVepPrintExistingVar(struct annoFormatVep *self, struct annoRow *varRow, struct annoStreamRows gratorData[], int gratorCount) /* Print existing variant ID (or placeholder) */ { if (self->snpNameIx >= 0) { if (gratorCount < 2 || gratorData[1].streamer != self->config->snpSource) errAbort("annoFormatVep: config error, snpSource is not where expected"); struct annoRow *snpRows = gratorData[1].rowList, *row; if (snpRows != NULL) { int count = 0; for (row = snpRows; row != NULL; row = row->next) { char **snpWords = (char **)(row->data); if (row->start == varRow->start && row->end == varRow->end) { if (count > 0) fputc(',', self->f); afVepPrintf(self, "%s", snpWords[self->snpNameIx]); count++; } } if (count == 0) afVepPrintPlaceholder(self->f, self->doHtml); else afVepNextColumn(self->f, self->doHtml); } else afVepPrintPlaceholder(self->f, self->doHtml); } else afVepPrintPlaceholder(self->f, self->doHtml); } static boolean isCodingSnv(struct annoRow *primaryRow, struct gpFx *gpFx) /* Return TRUE if this is a single-nucleotide non-synonymous change. */ { if (gpFx == NULL || gpFx->detailType != codingChange || gpFx->soNumber == synonymous_variant) return FALSE; char *txRef = gpFx->details.codingChange.txRef; char *txAlt = gpFx->details.codingChange.txAlt; if (txRef == NULL || txAlt == NULL || strlen(txRef) != 1 || sameString(txRef, "-") || strlen(txAlt) != 1 || sameString(txAlt, "-")) return FALSE; return TRUE; } static int commaSepFindIx(char *item, char *s) /* Treating comma-separated, non-NULL s as an array of words, * return the index of item in the array, or -1 if item is not in array. */ { int itemLen = strlen(item); int ix = 0; char *p = strchr(s, ','); while (p != NULL) { int elLen = (p - s); if (elLen == itemLen && strncmp(s, item, itemLen) == 0) return ix; s = p+1; p = strchr(s, ','); ix++; } if (strlen(s) == itemLen && strncmp(s, item, itemLen) == 0) return ix; return -1; } static int commaSepFindIntIx(int item, char *s) /* Treating comma-separated, non-NULL s as an array of words that encode integers, * return the index of item in the array, or -1 if item is not in array. */ { char itemString[64]; safef(itemString, sizeof(itemString), "%d", item); return commaSepFindIx(itemString, s); } static char *commaSepWordFromIxOrWhole(int ix, char *s, struct lm *lm) /* Treating comma-separated, non-NULL s as an array of words, * return the word at ix in the array -- but if s has no commas, * just return (lmCloned) s. errAborts if s is comma-sep but ix is out of bounds. */ { int i = 0; char *p = strchr(s, ','); if (p == NULL) return lmCloneString(lm, s); while (p != NULL) { if (i == ix) return lmCloneStringZ(lm, s, p-s); s = p+1; p = strchr(s, ','); i++; } if (i == ix) return lmCloneString(lm, s); errAbort("commaSepWordFromIxOrWhole: Bad index %d for string '%s'", ix, s); return NULL; } INLINE void afVepNewExtra(struct annoFormatVep *self, boolean *pGotExtra) /* If we already printed an extra item, print the extra column's separator; set pGotExtra. */ { assert(pGotExtra); if (*pGotExtra) { if (self->doHtml) fputs("; ", self->f); else fputc(';', self->f); } *pGotExtra = TRUE; } static void afVepPrintDbNsfpSift(struct annoFormatVep *self, struct annoFormatVepExtraSource *extraSrc, struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, boolean *pGotExtra) /* Match the allele from gpFx to the per-allele scores in row from dbNsfpSift. */ { // Look up column indices only once: static int ensTxIdIx=-1, altAl1Ix, score1Ix, altAl2Ix, score2Ix, altAl3Ix, score3Ix; if (ensTxIdIx == -1) { struct asColumn *columns = extraSrc->source->asObj->columnList; ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); altAl1Ix = asColumnMustFindIx(columns, "altAl1"); score1Ix = asColumnMustFindIx(columns, "score1"); altAl2Ix = asColumnMustFindIx(columns, "altAl2"); score2Ix = asColumnMustFindIx(columns, "score2"); altAl3Ix = asColumnMustFindIx(columns, "altAl3"); score3Ix = asColumnMustFindIx(columns, "score3"); } struct annoRow *row; for (row = extraRows; row != NULL; row = row->next) { // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame) char **words = row->data; if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) continue; struct annoFormatVepExtraItem *extraItem = extraSrc->items; char *scoreStr = NULL; if (sameString(gpFx->gAllele, words[altAl1Ix])) scoreStr = words[score1Ix]; else if (sameString(gpFx->gAllele, words[altAl2Ix])) scoreStr = words[score2Ix]; else if (sameString(gpFx->gAllele, words[altAl3Ix])) scoreStr = words[score3Ix]; + if (isNotEmpty(scoreStr) && differentString(scoreStr, ".")) + { double score = atof(scoreStr); char prediction = '?'; if (score < 0.05) prediction = 'D'; else prediction = 'T'; - if (isNotEmpty(scoreStr) && differentString(scoreStr, ".")) - { afVepNewExtra(self, pGotExtra); fprintf(self->f, "%s=%c(%s)", extraItem->tag, prediction, scoreStr); } } } static void afVepPrintDbNsfpPolyPhen2(struct annoFormatVep *self, struct annoFormatVepExtraSource *extraSrc, struct annoRow *extraRows, struct gpFx *gpFx, boolean *pGotExtra) /* Match the allele from gpFx to the per-allele scores in each row from dbNsfpPolyPhen2. */ { // Look up column indices only once: static int aaPosIx=-1, altAl1Ix, hDivScore1Ix, hDivPred1Ix, hVarScore1Ix, hVarPred1Ix, altAl2Ix, hDivScore2Ix, hDivPred2Ix, hVarScore2Ix, hVarPred2Ix, altAl3Ix, hDivScore3Ix, hDivPred3Ix, hVarScore3Ix, hVarPred3Ix; if (aaPosIx == -1) { struct asColumn *columns = extraSrc->source->asObj->columnList; aaPosIx = asColumnMustFindIx(columns, "uniProtAaPos"); altAl1Ix = asColumnMustFindIx(columns, "altAl1"); hDivScore1Ix = asColumnMustFindIx(columns, "hDivScore1"); hDivPred1Ix = asColumnMustFindIx(columns, "hDivPred1"); hVarScore1Ix = asColumnMustFindIx(columns, "hVarScore1"); hVarPred1Ix = asColumnMustFindIx(columns, "hVarPred1"); altAl2Ix = asColumnMustFindIx(columns, "altAl2"); hDivScore2Ix = asColumnMustFindIx(columns, "hDivScore2"); hDivPred2Ix = asColumnMustFindIx(columns, "hDivPred2"); hVarScore2Ix = asColumnMustFindIx(columns, "hVarScore2"); hVarPred2Ix = asColumnMustFindIx(columns, "hVarPred2"); altAl3Ix = asColumnMustFindIx(columns, "altAl3"); hDivScore3Ix = asColumnMustFindIx(columns, "hDivScore3"); hDivPred3Ix = asColumnMustFindIx(columns, "hDivPred3"); hVarScore3Ix = asColumnMustFindIx(columns, "hVarScore3"); hVarPred3Ix = asColumnMustFindIx(columns, "hVarPred3"); } struct codingChange *cc = &(gpFx->details.codingChange); int count = 0; struct annoRow *row; for (row = extraRows; row != NULL; row = row->next) { char **words = row->data; //#*** polyphen2 can actually have multiple scores/preds for the same pepPosition... //#*** so what we really should do is loop on comma-sep words[aaPosIx] and print scores/preds //#*** whenever pepPosition matches. int txIx = commaSepFindIntIx(cc->pepPosition+1, words[aaPosIx]); if (txIx < 0) continue; struct annoFormatVepExtraItem *extraItem = extraSrc->items; for (; extraItem != NULL; extraItem = extraItem->next) { boolean isHdiv = (stringIn("HDIV", extraItem->tag) != NULL); int predIx = -1, scoreIx = -1; if (sameString(gpFx->gAllele, words[altAl1Ix])) { predIx = isHdiv ? hDivPred1Ix : hVarPred1Ix; scoreIx = isHdiv ? hDivScore1Ix : hVarScore1Ix; } else if (sameString(gpFx->gAllele, words[altAl2Ix])) { predIx = isHdiv ? hDivPred2Ix : hVarPred2Ix; scoreIx = isHdiv ? hDivScore2Ix : hVarScore2Ix; } else if (sameString(gpFx->gAllele, words[altAl3Ix])) { predIx = isHdiv ? hDivPred3Ix : hVarPred3Ix; scoreIx = isHdiv ? hDivScore3Ix : hVarScore3Ix; } char *pred = (predIx < 0) ? NULL : words[predIx]; if (pred == NULL || sameString(pred, ".")) continue; pred = commaSepWordFromIxOrWhole(txIx, pred, self->lm); if (isNotEmpty(pred)) { if (count == 0) { afVepNewExtra(self, pGotExtra); fprintf(self->f, "%s=", extraItem->tag); } else fputc(',', self->f); char *score = (scoreIx < 0) ? "?" : commaSepWordFromIxOrWhole(txIx, words[scoreIx], self->lm); fprintf(self->f, "%s(%s)", pred, score); count++; } } } } static void afVepPrintDbNsfpMutationTA(struct annoFormatVep *self, struct annoFormatVepExtraSource *extraSrc, struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, boolean *pGotExtra) /* Match the allele from gpFx to the per-allele scores in row from dbNsfpMutationTaster * or dbNsfpMutationAssessor -- they have identical column indices. */ { // Look up column indices only once: static int ensTxIdIx=-1, altAl1Ix, score1Ix, pred1Ix, altAl2Ix, score2Ix, pred2Ix, altAl3Ix, score3Ix, pred3Ix; if (ensTxIdIx == -1) { struct asColumn *columns = extraSrc->source->asObj->columnList; ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); altAl1Ix = asColumnMustFindIx(columns, "altAl1"); score1Ix = asColumnMustFindIx(columns, "score1"); pred1Ix = asColumnMustFindIx(columns, "pred1"); altAl2Ix = asColumnMustFindIx(columns, "altAl2"); score2Ix = asColumnMustFindIx(columns, "score2"); pred2Ix = asColumnMustFindIx(columns, "pred2"); altAl3Ix = asColumnMustFindIx(columns, "altAl3"); score3Ix = asColumnMustFindIx(columns, "score3"); pred3Ix = asColumnMustFindIx(columns, "pred3"); } struct annoRow *row; for (row = extraRows; row != NULL; row = row->next) { // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame) char **words = row->data; if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) continue; struct annoFormatVepExtraItem *extraItem = extraSrc->items; char *score = NULL, *pred = NULL; if (sameString(gpFx->gAllele, words[altAl1Ix])) { score = words[score1Ix]; pred = words[pred1Ix]; } else if (sameString(gpFx->gAllele, words[altAl2Ix])) { score = words[score2Ix]; pred = words[pred2Ix]; } else if (sameString(gpFx->gAllele, words[altAl3Ix])) { score = words[score3Ix]; pred = words[pred3Ix]; } if (isNotEmpty(score) && differentString(score, ".")) { afVepNewExtra(self, pGotExtra); if (isEmpty(pred)) pred = "?"; fprintf(self->f, "%s=%s(%s)", extraItem->tag, pred, score); } } } static void afVepPrintDbNsfpLrt(struct annoFormatVep *self, struct annoFormatVepExtraSource *extraSrc, struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, boolean *pGotExtra) /* Match the allele from gpFx to the per-allele scores in row from dbNsfpLrt -- * it also has omega{1,2,3} columns, but I'm not sure what those mean so am leaving out for now. */ { // Look up column indices only once: static int ensTxIdIx=-1, altAl1Ix, score1Ix, pred1Ix, altAl2Ix, score2Ix, pred2Ix, altAl3Ix, score3Ix, pred3Ix; if (ensTxIdIx == -1) { struct asColumn *columns = extraSrc->source->asObj->columnList; ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); altAl1Ix = asColumnMustFindIx(columns, "altAl1"); score1Ix = asColumnMustFindIx(columns, "score1"); pred1Ix = asColumnMustFindIx(columns, "pred1"); altAl2Ix = asColumnMustFindIx(columns, "altAl2"); score2Ix = asColumnMustFindIx(columns, "score2"); pred2Ix = asColumnMustFindIx(columns, "pred2"); altAl3Ix = asColumnMustFindIx(columns, "altAl3"); score3Ix = asColumnMustFindIx(columns, "score3"); pred3Ix = asColumnMustFindIx(columns, "pred3"); } struct annoRow *row; for (row = extraRows; row != NULL; row = row->next) { // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame) char **words = row->data; if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) continue; struct annoFormatVepExtraItem *extraItem = extraSrc->items; char *score = NULL, *pred = NULL; if (sameString(gpFx->gAllele, words[altAl1Ix])) { score = words[score1Ix]; pred = words[pred1Ix]; } else if (sameString(gpFx->gAllele, words[altAl2Ix])) { score = words[score2Ix]; pred = words[pred2Ix]; } else if (sameString(gpFx->gAllele, words[altAl3Ix])) { score = words[score3Ix]; pred = words[pred3Ix]; } if (isNotEmpty(score) && differentString(score, ".")) { afVepNewExtra(self, pGotExtra); if (isEmpty(pred)) pred = "?"; fprintf(self->f, "%s=%s(%s)", extraItem->tag, pred, score); } } } static void afVepPrintDbNsfpVest(struct annoFormatVep *self, struct annoFormatVepExtraSource *extraSrc, struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, boolean *pGotExtra) /* Match the allele from gpFx to the per-allele scores in row from dbNsfpVest */ { // TODO: also compare var[123] protein change columns to gpFx to make sure we have the right one // Look up column indices only once: static int ensTxIdIx=-1, altAl1Ix, score1Ix, altAl2Ix, score2Ix, altAl3Ix, score3Ix; if (ensTxIdIx == -1) { struct asColumn *columns = extraSrc->source->asObj->columnList; ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); altAl1Ix = asColumnMustFindIx(columns, "altAl1"); score1Ix = asColumnMustFindIx(columns, "score1"); altAl2Ix = asColumnMustFindIx(columns, "altAl2"); score2Ix = asColumnMustFindIx(columns, "score2"); altAl3Ix = asColumnMustFindIx(columns, "altAl3"); score3Ix = asColumnMustFindIx(columns, "score3"); } struct annoRow *row; for (row = extraRows; row != NULL; row = row->next) { // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx char **words = row->data; if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) continue; struct annoFormatVepExtraItem *extraItem = extraSrc->items; char *score = NULL; if (sameString(gpFx->gAllele, words[altAl1Ix])) score = words[score1Ix]; else if (sameString(gpFx->gAllele, words[altAl2Ix])) score = words[score2Ix]; else if (sameString(gpFx->gAllele, words[altAl3Ix])) score = words[score3Ix]; if (isNotEmpty(score) && differentString(score, ".")) { afVepNewExtra(self, pGotExtra); fprintf(self->f, "%s=%s", extraItem->tag, score); } } } static void afVepPrintDbNsfpInterPro(struct annoFormatVep *self, struct annoFormatVepExtraSource *extraSrc, struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, boolean *pGotExtra) /* Print out any overlapping (comma-sep list, HTML-encoded ',' and '=') InterPro domains. */ { // Look up column indices only once: static int domainsIx = -1; if (domainsIx == -1) { struct asColumn *columns = extraSrc->source->asObj->columnList; domainsIx = asColumnMustFindIx(columns, "domains"); } struct annoRow *row; for (row = extraRows; row != NULL; row = row->next) { char **words = row->data; struct annoFormatVepExtraItem *extraItem = extraSrc->items; char *domains = words[domainsIx]; int lastC = strlen(domains) - 1; if (domains[lastC] == ',') domains[lastC] = '\0'; if (isNotEmpty(domains) && differentString(domains, ".")) { afVepNewExtra(self, pGotExtra); fprintf(self->f, "%s=%s", extraItem->tag, domains); } } } static boolean allelesAgree(char altNt, char altAa, char **words) /* Return TRUE if dbNsfpSeqChange words have altAa associated with altNt. */ { //#*** TODO: handle stop codon representation if ((altNt == words[11][0] && altAa == words[12][0]) || (altNt == words[13][0] && altAa == words[14][0]) || (altNt == words[15][0] && altAa == words[16][0])) return TRUE; return FALSE; } static struct annoRow *getRowsFromSource(struct annoStreamer *src, struct annoStreamRows gratorData[], int gratorCount) /* Search gratorData for src, and return its rows when found. */ { int i; for (i = 0; i < gratorCount; i++) { if (gratorData[i].streamer == src) return gratorData[i].rowList; } errAbort("annoFormatVep: Can't find source %s in gratorData", src->name); return NULL; } static char *getDbNsfpEnsTx(struct annoFormatVep *self, struct gpFx *gpFx, struct annoStreamRows *gratorData, int gratorCount) /* Find the Ensembl transcript ID, if any, for which dbNsfp has results consistent * with gpFx. */ { int i; for (i = 0; i < gratorCount; i++) { struct annoStreamer *source = gratorData[i].streamer; if (!sameString(source->asObj->name, "dbNsfpSeqChange")) //#*** need metadata! continue; struct codingChange *cc = &(gpFx->details.codingChange); struct annoRow *extraRows = getRowsFromSource(source, gratorData, gratorCount); struct annoRow *row; for (row = extraRows; row != NULL; row = row->next) { char **words = row->data; if (strcasecmp(cc->codonOld, words[7]) != 0) continue; if (!allelesAgree(gpFx->gAllele[0], cc->aaNew[0], words)) continue; int txIx = commaSepFindIntIx(cc->pepPosition+1, words[10]); if (txIx >= 0) { if (sameString(words[4], ".")) return "."; char *ensTxId = commaSepWordFromIxOrWhole(txIx, words[4], self->lm); return ensTxId; } } break; } return NULL; } static void afVepPrintExtrasDbNsfp(struct annoFormatVep *self, struct annoRow *varRow, struct annoRow *gpvRow, struct gpFx *gpFx, struct annoStreamRows gratorData[], int gratorCount, boolean *pGotExtra) /* Print the Extra column's tag=value; components from dbNSFP data, if we have any. */ { // dbNSFP has data only for coding non-synonymous single-nucleotide changes: if (!isCodingSnv(varRow, gpFx)) return; // Does dbNsfpSeqChange have a coding change consistent with gpFx?: char *ensTxId = getDbNsfpEnsTx(self, gpFx, gratorData, gratorCount); // Now cycle through selected dbNsfp* sources, printing out scores for ensTxId: struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) { char *asObjName = extraSrc->source->asObj->name; if (!startsWith("dbNsfp", asObjName)) continue; struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount); if (sameString(asObjName, "dbNsfpPolyPhen2")) { // PolyPhen2 is based on UniProt proteins, not GENCODE/Ensembl transcripts, // so ensTxId doesn't apply. afVepPrintDbNsfpPolyPhen2(self, extraSrc, extraRows, gpFx, pGotExtra); if (ensTxId == NULL) break; // all done now, no need to keep looking } else if (ensTxId != NULL) { if (sameString(asObjName, "dbNsfpSift")) afVepPrintDbNsfpSift(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); else if (sameString(asObjName, "dbNsfpMutationTaster") || sameString(asObjName, "dbNsfpMutationAssessor")) afVepPrintDbNsfpMutationTA(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); else if (sameString(asObjName, "dbNsfpLrt")) afVepPrintDbNsfpLrt(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); else if (sameString(asObjName, "dbNsfpVest")) afVepPrintDbNsfpVest(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); else if (sameString(asObjName, "dbNsfpInterPro")) afVepPrintDbNsfpInterPro(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); else if (!sameString(asObjName, "dbNsfpSeqChange")) errAbort("Unrecognized asObj->name '%s' from dbNSFP source '%s'", asObjName, extraSrc->source->name); } } } static void afVepPrintExtrasHgvs(struct annoFormatVep *self, struct annoRow *gpvRow, boolean *pGotExtra) /* If not empty, print HGVS terms (last column of gpvRow). */ { char **row = gpvRow->data; if (isNotEmpty(row[self->hgvsGIx])) { afVepNewExtra(self, pGotExtra); fprintf(self->f, "HGVSG=%s", row[self->hgvsGIx]); } if (isNotEmpty(row[self->hgvsCNIx])) { afVepNewExtra(self, pGotExtra); fprintf(self->f, "HGVSCN=%s", row[self->hgvsCNIx]); } if (isNotEmpty(row[self->hgvsPIx])) { afVepNewExtra(self, pGotExtra); fprintf(self->f, "HGVSP=%s", row[self->hgvsPIx]); } } static void afVepPrintExtraWigVec(struct annoFormatVepExtraSource *extraSrc, struct annoFormatVep *self, struct annoFormatVepExtraItem *extraItem, struct annoRow *extraRows, struct annoRow *gpvRow, boolean *pGotExtra) /* Print values from a wig source with type arWigVec (per-base floats) */ { int i; struct annoRow *row; for (i = 0, row = extraRows; row != NULL; i++, row = row->next) { float *vector = row->data; int len = row->end - row->start; int j; for (j = 0; j < len; j++) { if (i+j == 0) { afVepNewExtra(self, pGotExtra); fprintf(self->f, "%s=", extraItem->tag); } else fputc(',', self->f); fprintf(self->f, "%g", vector[j]); } } } static void afVepPrintExtraWigSingle(struct annoFormatVepExtraSource *extraSrc, struct annoFormatVep *self, struct annoFormatVepExtraItem *extraItem, struct annoRow *extraRows, struct annoRow *gpvRow, boolean *pGotExtra) /* Print values from a wig source with type arWigSingle (one double per row, e.g. an average) */ { struct annoRow *row; for (row = extraRows; row != NULL; row = row->next) { double *pValue = row->data; if (row == extraRows) { afVepNewExtra(self, pGotExtra); fprintf(self->f, "%s=", extraItem->tag); } else fputc(',', self->f); fprintf(self->f, "%g", *pValue); } } static void afVepPrintExtraWords(struct annoFormatVepExtraSource *extraSrc, struct annoFormatVep *self, struct annoFormatVepExtraItem *extraItem, struct annoRow *extraRows, struct annoRow *gpvRow, boolean *pGotExtra) /* Print comma-separated values in the specified column from the usual array-of-words. */ { if (extraItem->rowIx < 0) errAbort("annoFormatVep: invalid rowIx for tag %s", extraItem->tag); char **gpvWords = gpvRow ? gpvRow->data : NULL; char *gpvTranscript = gpvWords ? gpvWords[self->txNameIx] : NULL; boolean gotGpvTx = FALSE; int i; struct annoRow *row; for (i = 0, row = extraRows; row != NULL; row = row->next) { char **words = row->data; char *extraTranscript = words[0]; // If extraSrc happens to be gpVarSource, use only the row that matches the current // transcript, and use it only once. if (extraSrc->source == self->config->gpVarSource) { extraTranscript = words[self->txNameIx]; if (gpvTranscript == NULL || differentString(extraTranscript, gpvTranscript) || gotGpvTx) continue; gotGpvTx = TRUE; } char *val = words[extraItem->rowIx]; if (isEmpty(val)) continue; // Strip trailing comma if found int valLen = strlen(val); if (val[valLen-1] == ',') val[valLen-1] = '\0'; if (i == 0) { afVepNewExtra(self, pGotExtra); fprintf(self->f, "%s=", extraItem->tag); if (extraItem->isBoolean) fputs("YES", self->f); } else if (! extraItem->isBoolean) fputc(',', self->f); if (! extraItem->isBoolean) { // Watch out for characters that will mess up parsing of EXTRAS column: subChar(val, '=', '_'); subChar(val, ';', '_'); afVepPuts(val, self); } i++; } } static void afVepPrintExtrasOther(struct annoFormatVep *self, struct annoRow *varRow, struct annoRow *gpvRow, struct gpFx *gpFx, struct annoStreamRows gratorData[], int gratorCount, boolean includeRegulatory, boolean *pGotExtra) /* Print the Extra column's tag=value; components (other than dbNSFP) if we have any. */ { struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) { char *asObjName = extraSrc->source->asObj->name; if (startsWith("dbNsfp", asObjName)) continue; struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount); if (extraRows != NULL) { struct annoFormatVepExtraItem *extraItem; for (extraItem = extraSrc->items; extraItem != NULL; extraItem = extraItem->next) if (includeRegulatory || ! extraItem->isRegulatory) extraSrc->printExtra(extraSrc, self, extraItem, extraRows, gpvRow, pGotExtra); } } // VEP automatically adds DISTANCE for upstream/downstream variants if (gpFx && (gpFx->soNumber == upstream_gene_variant || gpFx->soNumber == downstream_gene_variant)) { afVepNewExtra(self, pGotExtra); // In order to really understand the boundary conditions of Ensembl's DISTANCE function, // I'll have to read the code someday. Upstream deletions on + strand still not handled // quite right. int ensemblFudge = 0; int distance = gpvRow->start - varRow->start + ensemblFudge; if (distance < 0) { ensemblFudge = 1; distance = varRow->start - gpvRow->end + ensemblFudge; } fprintf(self->f, "DISTANCE=%d", distance); } boolean includeExonNumber = TRUE; //#*** optional in VEP if (includeExonNumber && gpFx) { // Add Exon or intron number if applicable enum detailType deType = gpFx->detailType; int num = -1, count = 0; if (deType == codingChange) { num = gpFx->details.codingChange.exonNumber; count = gpFx->details.codingChange.exonCount; } else if (deType == nonCodingExon) { num = gpFx->details.nonCodingExon.exonNumber; count = gpFx->details.nonCodingExon.exonCount; } else if (deType == intron) { num = gpFx->details.intron.intronNumber; count = gpFx->details.intron.intronCount; } if (num >= 0) { afVepNewExtra(self, pGotExtra); fprintf(self->f, "%s=%d/%d", (deType == intron) ? "INTRON" : "EXON", num+1, count); } } if (!*pGotExtra) afVepPrintPlaceholders(self->f, 0, self->doHtml); } static void afVepLmCleanup(struct annoFormatVep *self) /* Clean out localmem every 1k rows. */ { self->lmRowCount++; if (self->lmRowCount > 1024) { lmCleanup(&(self->lm)); self->lm = lmInit(0); self->lmRowCount = 0; } } static void afVepPrintOneLineGpVar(struct annoFormatVep *self, struct annoStreamRows *varData, struct annoRow *gpvRow, struct annoStreamRows gratorData[], int gratorCount) /* Print one line of VEP with a coding gene consequence: a variant, an allele, * functional consequences of that allele, and whatever else is included in the config. */ { afVepLmCleanup(self); struct annoRow *varRow = varData->rowList; struct gpFx *gpFx = annoGratorGpVarGpFxFromRow(self->config->gpVarSource, gpvRow, self->lm); afVepStartRow(self->f, self->doHtml); afVepPrintNameAndLoc(self, varRow); afVepPrintPredictions(self, varRow, gpvRow, gpFx); afVepPrintExistingVar(self, varRow, gratorData, gratorCount); boolean gotExtra = FALSE; afVepPrintExtrasDbNsfp(self, varRow, gpvRow, gpFx, gratorData, gratorCount, &gotExtra); afVepPrintExtrasHgvs(self, gpvRow, &gotExtra); afVepPrintExtrasOther(self, varRow, gpvRow, gpFx, gratorData, gratorCount, FALSE, &gotExtra); afVepEndRow(self->f, self->doHtml); } static boolean afVepIsRegulatory(struct annoFormatVep *self, struct annoStreamRows gratorData[], int gratorCount) /* Return TRUE if one of the extraSource items is a regulatory region. */ { struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) { struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount); if (extraRows != NULL) { struct annoFormatVepExtraItem *extraItem; for (extraItem = extraSrc->items; extraItem != NULL; extraItem = extraItem->next) { if (extraItem->isRegulatory) return TRUE; } } } return FALSE; } static char *afVepGetFirstAltAllele(struct annoFormatVep *self, struct annoRow *varRow) /* For VEP we must print out some alternate allele, whether the alternate allele is * related to any other info (like coding changes) or not. Pick the first non-reference * allele. */ { int refAlBufSize = varRow->end - varRow->start + 1; char refAllele[refAlBufSize]; annoAssemblyGetSeq(self->assembly, varRow->chrom, varRow->start, varRow->end, refAllele, sizeof(refAllele)); struct variant *variant = NULL; if (self->variantType == afvdtVcf) variant = variantFromVcfAnnoRow(varRow, refAllele, self->lm, self->dyScratch); else if (self->variantType == afvdtPgSnpTable) variant = variantFromPgSnpAnnoRow(varRow, refAllele, TRUE, self->lm); else if (self->variantType == afvdtPgSnpFile) variant = variantFromPgSnpAnnoRow(varRow, refAllele, FALSE, self->lm); return firstAltAllele(variant->alleles); } static void afVepPrintPredictionsReg(struct annoFormatVep *self, char *alt) /* Print VEP allele, placeholder gene and item names, 'RegulatoryFeature', * 'regulatory_region_variant', and placeholder coding effect columns. */ { afVepPuts(alt, self); afVepNextColumn(self->f, self->doHtml); afVepPrintPlaceholders(self->f, 2, self->doHtml); fputs("RegulatoryFeature", self->f); afVepNextColumn(self->f, self->doHtml); fputs(soTermToString(regulatory_region_variant), self->f); afVepNextColumn(self->f, self->doHtml); // Coding effect columns are N/A: afVepPrintPlaceholders(self->f, 5, self->doHtml); } static void afVepPrintExtrasHgvsGOnly(struct annoFormatVep *self, struct annoRow *varRow, char *alt, boolean *pGotExtra) /* Make our own HGVS g. term (when we aren't taking HGVS terms from annoGratorGpVar). */ { if (self->hgvsMakeG) { struct bed3 *variantBed = (struct bed3 *)varRow; char *chromAcc = hRefSeqAccForChrom(self->assembly->name, varRow->chrom); char *hgvsG = hgvsGFromVariant(self->gSeqWin, variantBed, alt, chromAcc, self->hgvsBreakDelIns); afVepNewExtra(self, pGotExtra); fprintf(self->f, "HGVSG=%s", hgvsG); freeMem(hgvsG); } } static void afVepPrintRegulatory(struct annoFormatVep *self, struct annoStreamRows *varData, struct annoStreamRows gratorData[], int gratorCount) /* If there are EXTRAS column sources that imply regulatory_region_variant, print out * lines with that consequence (no gpFx/dbNSFP). */ { if (afVepIsRegulatory(self, gratorData, gratorCount)) { afVepLmCleanup(self); struct annoRow *varRow = varData->rowList; afVepStartRow(self->f, self->doHtml); afVepPrintNameAndLoc(self, varRow); char *alt = afVepGetFirstAltAllele(self, varRow); afVepPrintPredictionsReg(self, alt); afVepPrintExistingVar(self, varRow, gratorData, gratorCount); boolean gotExtra = FALSE; afVepPrintExtrasHgvsGOnly(self, varRow, alt, &gotExtra); afVepPrintExtrasOther(self, varRow, NULL, NULL, gratorData, gratorCount, TRUE, &gotExtra); afVepEndRow(self->f, self->doHtml); } } static void afVepFormatOne(struct annoFormatter *fSelf, struct annoStreamRows *primaryData, struct annoStreamRows gratorData[], int gratorCount) /* Print one variant's VEP (possibly multiple lines) using collected rows. */ { struct annoFormatVep *self = (struct annoFormatVep *)fSelf; struct annoRow *gpVarRows = gratorData[0].rowList, *gpvRow; for (gpvRow = gpVarRows; gpvRow != NULL; gpvRow = gpvRow->next) afVepPrintOneLineGpVar(self, primaryData, gpvRow, gratorData, gratorCount); afVepPrintRegulatory(self, primaryData, gratorData, gratorCount); } static void afVepEndHtml(struct annoFormatVep *self) { fputs("</TABLE><BR>\n", self->f); } static void afVepClose(struct annoFormatter **pFSelf) /* Close file handle, free self. */ { if (pFSelf == NULL) return; struct annoFormatVep *self = *(struct annoFormatVep **)pFSelf; if (self->doHtml) afVepEndHtml(self); freeMem(self->fileName); carefulClose(&(self->f)); lmCleanup(&(self->lm)); dyStringFree(&(self->dyScratch)); chromSeqWindowFree(&self->gSeqWin); annoFormatterFree(pFSelf); } static void afVepSetConfig(struct annoFormatVep *self, struct annoFormatVepConfig *config) /* Check config and figure out where various output columns will come from. */ { self->config = config; struct asColumn *varAsColumns = config->variantSource->asObj->columnList; self->varNameIx = -1; self->varAllelesIx = -1; if (asObjectsMatch(config->variantSource->asObj, pgSnpAsObj())) self->variantType = afvdtPgSnpTable; else if (asObjectsMatch(config->variantSource->asObj, pgSnpFileAsObj())) self->variantType = afvdtPgSnpFile; else if (asObjectsMatch(config->variantSource->asObj, vcfAsObj())) { self->variantType = afvdtVcf; self->varNameIx = asColumnMustFindIx(varAsColumns, "id"); } else errAbort("afVepSetConfig: variant source %s doesn't look like pgSnp or VCF", config->variantSource->name); if (self->variantType == afvdtPgSnpTable || self->variantType == afvdtPgSnpFile) { // pgSnp's "name" column actually contains slash-separated alleles self->varAllelesIx = asColumnMustFindIx(varAsColumns, "name"); } if (config->gpVarSource == NULL) errAbort("afVepSetConfig: config must have a gpVarSource"); else if (! asColumnNamesMatchFirstN(config->gpVarSource->asObj, genePredAsObj(), 10) && ! asColumnNamesMatchFirstN(config->gpVarSource->asObj, bigGenePredAsObj(), 16) && ! asColumnNamesMatchFirstN(config->gpVarSource->asObj, annoStreamDbPslPlusAsObj(), PSLPLUS_NUM_COLS)) errAbort("afVepSetConfig: gpVarSource %s doesn't look like genePred or pslPlus", config->gpVarSource->name); // refGene and augmented knownGene have extra name fields that have the HGNC gene symbol: struct asColumn *gpvAsColumns = config->gpVarSource->asObj->columnList; self->txNameIx = asColumnFindIx(gpvAsColumns, "name"); if (self->txNameIx < 0) self->txNameIx = asColumnFindIx(gpvAsColumns, "qName"); if (self->txNameIx < 0) errAbort("afVepSetConfig: can't find name or qName column in gpVarSource %s", config->gpVarSource->name); self->geneNameIx = asColumnFindIx(gpvAsColumns, "geneSymbol"); if (self->geneNameIx < 0) self->geneNameIx = asColumnFindIx(gpvAsColumns, "kgXref_geneSymbol"); if (self->geneNameIx < 0) self->geneNameIx = asColumnFindIx(gpvAsColumns, "name2"); if (self->geneNameIx < 0) self->geneNameIx = asColumnFindIx(gpvAsColumns, "proteinID"); self->hgvsGIx = asColumnMustFindIx(gpvAsColumns, "hgvsG"); self->hgvsCNIx = asColumnMustFindIx(gpvAsColumns, "hgvsCN"); self->hgvsPIx = asColumnMustFindIx(gpvAsColumns, "hgvsP"); if (config->snpSource != NULL) { struct asColumn *snpAsColumns = config->snpSource->asObj->columnList; self->snpNameIx = asColumnMustFindIx(snpAsColumns, "name"); } else self->snpNameIx = -1; } struct annoFormatVepConfig *annoFormatVepConfigNew(struct annoStreamer *variantSource, char *variantDescription, struct annoStreamer *gpVarSource, char *gpVarDescription, struct annoStreamer *snpSource, char *snpDescription) /* Return a basic configuration for VEP output. variantSource and gpVarSource must be * provided; snpSource can be NULL. */ { struct annoFormatVepConfig *config; AllocVar(config); config->variantSource = variantSource; config->variantDescription = cloneString(variantDescription); config->gpVarSource = gpVarSource; config->gpVarDescription = cloneString(gpVarDescription); config->snpSource = snpSource; config->snpDescription = cloneString(snpDescription); return config; } struct annoFormatter *annoFormatVepNew(char *fileName, boolean doHtml, struct annoStreamer *variantSource, char *variantDescription, struct annoStreamer *gpVarSource, char *gpVarDescription, struct annoStreamer *snpSource, char *snpDescription, struct annoAssembly *assembly) /* Return a formatter that will write functional predictions in the same format as Ensembl's * Variant Effect Predictor to fileName (can be "stdout"). * variantSource and gpVarSource must be provided; snpSource can be NULL. */ { struct annoFormatVep *self; AllocVar(self); struct annoFormatter *fSelf = &(self->formatter); fSelf->getOptions = annoFormatterGetOptions; fSelf->setOptions = annoFormatterSetOptions; fSelf->initialize = afVepInitialize; fSelf->comment = afVepComment; fSelf->formatOne = afVepFormatOne; fSelf->close = afVepClose; self->fileName = cloneString(fileName); self->f = mustOpen(fileName, "w"); self->lm = lmInit(0); self->dyScratch = dyStringNew(0); self->needHeader = TRUE; self->doHtml = doHtml; self->assembly = assembly; struct annoFormatVepConfig *config = annoFormatVepConfigNew(variantSource, variantDescription, gpVarSource, gpVarDescription, snpSource, snpDescription); afVepSetConfig(self, config); return (struct annoFormatter *)self; } static void afvAddExtraItemMaybeReg(struct annoFormatter *fSelf, struct annoStreamer *extraSource, char *tag, char *description, char *column, boolean isBoolean, boolean isReg) /* Tell annoFormatVep that it should include the given column of extraSource * in the EXTRAS column with tag. The VEP header will include tag's description. * If isReg, overlap implies that the variant has consequence regulatory_region_variant. * For some special-cased sources e.g. dbNsfp files, column may be NULL/ignored. */ { struct annoFormatVep *self = (struct annoFormatVep *)fSelf; struct annoFormatVepExtraSource *src; for (src = self->config->extraSources; src != NULL; src = src->next) if (src->source == extraSource) break; if (src == NULL) { AllocVar(src); src->source = extraSource; if (extraSource->rowType == arWigVec) src->printExtra = afVepPrintExtraWigVec; else if (extraSource->rowType == arWigSingle) src->printExtra = afVepPrintExtraWigSingle; else src->printExtra = afVepPrintExtraWords; slAddTail(&(self->config->extraSources), src); } struct annoFormatVepExtraItem *item; AllocVar(item); item->tag = cloneString(tag); item->description = cloneString(description); item->rowIx = -1; if (isNotEmpty(column)) item->rowIx = asColumnMustFindIx(extraSource->asObj->columnList, column); item->isRegulatory = isReg; item->isBoolean = isBoolean; slAddTail(&(src->items), item); } void annoFormatVepAddExtraItem(struct annoFormatter *fSelf, struct annoStreamer *extraSource, char *tag, char *description, char *column, boolean isBoolean) /* Tell annoFormatVep that it should include the given column of extraSource * in the EXTRAS column with tag. The VEP header will include tag's description. * For some special-cased sources e.g. dbNsfp files, column may be NULL/ignored. * If isBoolean, the column's description won't be output, only whether a match was found. */ { afvAddExtraItemMaybeReg(fSelf, extraSource, tag, description, column, isBoolean, FALSE); } void annoFormatVepAddRegulatory(struct annoFormatter *fSelf, struct annoStreamer *regSource, char *tag, char *description, char *column) /* Tell annoFormatVep to use the regulatory_region_variant consequence if there is overlap * with regSource, and to include the given column of regSource in the EXTRAS column. * The VEP header will include tag's description. */ { afvAddExtraItemMaybeReg(fSelf, regSource, tag, description, column, FALSE, TRUE); struct annoFormatVep *self = (struct annoFormatVep *)fSelf; if (self->gSeqWin == NULL) { char *db = regSource->assembly->name; self->gSeqWin = chromSeqWindowNew(db, NULL, 0, 0); } } void annoFormatVepSetHgvsOutOptions(struct annoFormatter *fSelf, uint hgvsOutOptions) /* Import the HGVS output options described in hgHgvs.h */ { struct annoFormatVep *self = (struct annoFormatVep *)fSelf; // We only do g. terms, for regulatory region variants, so no need for transcript/protein opts. if (hgvsOutOptions & HGVS_OUT_G) self->hgvsMakeG = TRUE; if (hgvsOutOptions & HGVS_OUT_BREAK_DELINS) self->hgvsBreakDelIns = TRUE; }