533112afe2a2005e80cdb1f82904ea65032d4302 braney Sat Oct 2 11:37:34 2021 -0700 split hg/lib into two separate libaries, one only used by the cgis diff --git src/hg/lib/annoFormatVep.c src/hg/lib/annoFormatVep.c deleted file mode 100644 index 4c54628..0000000 --- src/hg/lib/annoFormatVep.c +++ /dev/null @@ -1,1625 +0,0 @@ -/* annoFormatVep -- write functional predictions in the same format as Ensembl's - * Variant Effect Predictor to fileName, interpreting input rows according to config. - * See http://uswest.ensembl.org/info/docs/variation/vep/vep_formats.html */ - -/* Copyright (C) 2014 The Regents of the University of California - * See README in this or parent directory for licensing information. */ - -#include "annoFormatVep.h" -#include "annoGratorGpVar.h" -#include "annoGratorQuery.h" -#include "annoStreamDbPslPlus.h" -#include "asParse.h" -#include "bigGenePred.h" -#include "dystring.h" -#include "genePred.h" -#include "gpFx.h" -#include "hdb.h" -#include "hgHgvs.h" -#include "htmshell.h" -#include "pgSnp.h" -#include "portable.h" -#include "vcf.h" -#include - -struct annoFormatVepExtraItem - // A single input column whose value should be placed in the Extras output column, - // identified by tag. - { - struct annoFormatVepExtraItem *next; - char *tag; // Keyword to use in extras column (tag=value;) - char *description; // Text description for output header - int rowIx; // Offset of column in row from data source - // (N/A for wig sources) - boolean isRegulatory; // TRUE if overlap implies regulatory_region - boolean isBoolean; // TRUE if we don't need to print out the value, - // only whether it is found - }; - -struct annoFormatVep; - -struct annoFormatVepExtraSource - // A streamer or grator that supplies at least one value for Extras output column. - { - struct annoFormatVepExtraSource *next; - struct annoStreamer *source; // streamer or grator: same pointers as below - struct annoFormatVepExtraItem *items; // one or more columns of source and their tags - - void (*printExtra)(struct annoFormatVepExtraSource *self, - struct annoFormatVep *afVep, struct annoFormatVepExtraItem *extraItem, - struct annoRow *extraRows, struct annoRow *gpvRow, boolean *pGotExtra); - /* Method for printing items from this source; pGotExtra is in/out to keep track of - * whether any call has actually printed something yet. */ - }; - -struct annoFormatVepConfig - // Describe the primary source and grators (whose rows must be delivered in this order) - // that provide data for VEP output columns. - { - struct annoStreamer *variantSource; // Primary source: variants - char *variantDescription; // Description to appear in output header - struct annoStreamer *gpVarSource; // annoGratorGpVar makes the core predictions - char *gpVarDescription; // Description to appear in output header - struct annoStreamer *snpSource; // Latest dbSNP provides IDs of known variants - char *snpDescription; // Description to appear in output header - struct annoFormatVepExtraSource *extraSources; // Everything else that may be tacked on - }; - -enum afVariantDataType - // What type of primary data (variants) are we integrating with? - { - afvdtInvalid, // uninitialized/unknown - afvdtVcf, // primary data source is VCF - afvdtPgSnpTable, // primary data source is pgSnp table with bin in .as - afvdtPgSnpFile // primary data source is pgSnp file, no bin - }; - -struct annoFormatVep -// Subclass of annoFormatter that writes VEP-equivalent output to a file. - { - struct annoFormatter formatter; // superclass / external interface - struct annoFormatVepConfig *config; // Description of input sources and values for Extras col - char *fileName; // Output filename - FILE *f; // Output file handle - struct lm *lm; // localmem for scratch storage - struct dyString *dyScratch; // dyString for local temporary use - struct annoAssembly *assembly; // Reference assembly (for sequence) - int lmRowCount; // counter for periodic localmem cleanup - int varNameIx; // Index of name column from variant source, or -1 if N/A - int varAllelesIx; // Index of alleles column from variant source, or -1 - int txNameIx; // Index of transcript name from (big)genePred/PSL - int geneNameIx; // Index of gene name from (big)genePred/PSL+ if included - int hgvsGIx; // Index of HGVS g. column from annoGratorGpVar - int hgvsCNIx; // Index of HGVS c./n. column from annoGratorGpVar - int hgvsPIx; // Index of HGVS p. column from annoGratorGpVar - int snpNameIx; // Index of name column from dbSNP source, or -1 - boolean needHeader; // TRUE if we should print out the header - enum afVariantDataType variantType; // Are variants VCF or a flavor of pgSnp? - boolean doHtml; // TRUE if we should include html tags & make a . - struct seqWindow *gSeqWin; // genomic sequence fetcher for HGVS term generation - boolean hgvsMakeG; // Generate genomic (g.) HGVS terms only if this is set - boolean hgvsBreakDelIns; // Include deleted sequence (not only ins) e.g. delGGinsAT - }; - - -INLINE void afVepLineBreak(FILE *f, boolean doHtml) -/* Depending on whether we're printing HTML, print either a newline or
. */ -{ -if (doHtml) - fputs("
", f); -fputc('\n', f); -} - -INLINE void afVepStartRow(FILE *f, boolean doHtml) -/* If we're printing HTML, print a , otherwise, just a newline. */ -{ -if (doHtml) - fputs("\n", f); -else - fputc('\n', f); -} - -INLINE void afVepPuts(char *text, struct annoFormatVep *self) -/* Write text to self->f. If we're printing HTML, write encoded text, otherwise just write text. */ -{ -if (self->doHtml) - htmTextOut(self->f, text); -else - fputs(text, self->f); -} - -static void afVepPrintf(struct annoFormatVep *self, char *format, ...) -/* Printf to self->f, but if self->doHtml then encode before writing out. */ -#ifdef __GNUC__ -__attribute__((format(printf, 2, 3))) -#endif - ; - -static void afVepPrintf(struct annoFormatVep *self, char *format, ...) -/* Printf to self->f, but if self->doHtml then encode before writing out. */ -{ -va_list args; -va_start(args, format); -if (self->doHtml) - vaHtmlFprintf(self->f, format, args); -else - vfprintf(self->f, format, args); -va_end(args); -} - -static void afVepPrintHeaderExtraTags(struct annoFormatVep *self, char *bStart, char *bEnd) -/* For each extra column described in config, write out its tag and a brief description. - * bStart and bEnd are for bold output in HTML, or if not HTML, starting the line with "## ". */ -{ -struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; -if (extras == NULL) - return; -fprintf(self->f, "%sKeys for Extra column items:%s", bStart, bEnd); -afVepLineBreak(self->f, self->doHtml); -for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) - { - struct annoFormatVepExtraItem *extraItem; - for (extraItem = extraSrc->items; extraItem != NULL; extraItem = extraItem->next) - if (isNotEmpty(extraItem->tag)) - { - fprintf(self->f, "%s%s%s: %s", bStart, extraItem->tag, bEnd, extraItem->description); - afVepLineBreak(self->f, self->doHtml); - } - } -} - -static void afVepPrintHeaderDate(FILE *f, boolean doHtml) -/* VEP header includes a date formatted like "2012-06-16 16:09:38" */ -{ -long now = clock1(); -struct tm *tm = localtime(&now); -fprintf(f, "## Output produced at %d-%02d-%02d %02d:%02d:%02d", - 1900 + tm->tm_year, 1 + tm->tm_mon, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec); -afVepLineBreak(f, doHtml); -} - -static char *columnLabels[] = { "Uploaded Variation", - "Location", - "Allele", - "Gene", - "Feature", - "Feature type", - "Consequence", - "Position in cDNA", - "Position in CDS", - "Position in protein", - "Amino acid change", - "Codon change", - "Co-located Variation", - "Extra", - NULL }; - -static void afVepPrintColumnLabels(struct annoFormatVep *self) -/* If we're printing HTML, begin a table and use TH for labels; otherwise just - * print out a tab-sep text line with labels. VEP doesn't begin this line with a #! */ -{ -char **pLabel = columnLabels; -if (self->doHtml) - { - fputs("
. */ -{ -if (doHtml) - fputs("
", f); -} - -INLINE void afVepNextColumn(FILE *f, boolean doHtml) -/* Depending on whether we're printing HTML, print either a tab or . */ -{ -if (doHtml) - fputs("", f); -else - fputc('\t', f); -} - -INLINE void afVepEndRow(FILE *f, boolean doHtml) -/* If we're printing HTML, print a
\n\n", self->f); - } -else - { - fputs(*pLabel++, self->f); - while (*pLabel != NULL) - { - fputc('\t', self->f); - fputs(*pLabel++, self->f); - } - fputc('\n', self->f); - } -} - -static void afVepPrintHeader(struct annoFormatVep *self, char *db) -/* Print a header that looks almost like a VEP header. */ -{ -FILE *f = self->f; -boolean doHtml = self->doHtml; -char *bStart = doHtml ? "" : "## "; -char *bEnd = doHtml ? "" : ""; -if (!doHtml) - { - // Suppress these lines from HTML output -- IMO they're better suited for a file header: - fputs("## ENSEMBL VARIANT EFFECT PREDICTOR format (UCSC Variant Annotation Integrator)", f); - afVepLineBreak(f, doHtml); - afVepPrintHeaderDate(f, doHtml); - fprintf(f, "## Connected to UCSC database %s", db); - afVepLineBreak(f, doHtml); - } -struct annoFormatVepConfig *config = self->config; -fprintf(f, "%sVariants:%s ", bStart, bEnd); -afVepPrintf(self, "%s", config->variantDescription); -if (! strstr(config->variantSource->name, "/trash/")) - fprintf(f, " (%s)", config->variantSource->name); -afVepLineBreak(f, doHtml); -fprintf(f, "%sTranscripts:%s %s (%s)", bStart, bEnd, - config->gpVarDescription, config->gpVarSource->name); -afVepLineBreak(f, doHtml); -if (config->snpSource != NULL) - { - fprintf(f, "%sdbSNP:%s %s (%s)", bStart, bEnd, - config->snpDescription, config->snpSource->name); - afVepLineBreak(f, doHtml); - } -afVepPrintHeaderExtraTags(self, bStart, bEnd); -afVepPrintColumnLabels(self); -self->needHeader = FALSE; -} - -static void afVepInitialize(struct annoFormatter *fSelf, struct annoStreamer *primarySource, - struct annoStreamer *integrators) -/* Print header, regardless of whether we get any data after this. */ -{ -struct annoFormatVep *self = (struct annoFormatVep *)fSelf; -if (self->needHeader) - afVepPrintHeader(self, primarySource->assembly->name); -} - -static void afVepComment(struct annoFormatter *fSelf, char *content) -/* Print out a comment, either starting with "# " or as a warnBox depending on doHtml. */ -{ -if (strchr(content, '\n')) - errAbort("afVepComment: no multi-line input"); -struct annoFormatVep *self = (struct annoFormatVep *)fSelf; -if (self->doHtml) - warn("%s", content); -else - fprintf(self->f, "# %s\n", content); -} - -static void compressDashes(char *string) -/* If string has a run of '-' characters, turn it into single '-'. */ -{ -char *p = string; -while ((p = strchr(p, '-')) != NULL) - { - char *end = p+1; - while (*end == '-') - end++; - if (end > p+1) - memmove(p+1, end, strlen(end)+1); - p++; - } -} - -static char *afVepGetAlleles(struct annoFormatVep *self, struct annoRow *varRow) -/* Return a string with slash-separated alleles. Result is alloc'd by self->lm. */ -{ -char **varWords = (char **)(varRow->data); -char *alleles = NULL; -if (self->variantType == afvdtVcf) - alleles = lmCloneString(self->lm, vcfGetSlashSepAllelesFromWords(varWords, self->dyScratch)); -else if (self->varAllelesIx >= 0) - alleles = lmCloneString(self->lm, varWords[self->varAllelesIx]); -else - errAbort("annoFormatVep: afVepSetConfig didn't specify how to get alleles"); -compressDashes(alleles); -return alleles; -} - -static void afVepPrintNameAndLoc(struct annoFormatVep *self, struct annoRow *varRow) -/* Print variant name and position in genome. */ -{ -char **varWords = (char **)(varRow->data); -uint start1Based = varRow->start + 1; -// Use variant name if available, otherwise construct an identifier: -if (self->varNameIx >= 0 && !sameString(varWords[self->varNameIx], ".")) - afVepPuts(varWords[self->varNameIx], self); -else - { - char *alleles = afVepGetAlleles(self, varRow); - afVepPrintf(self, "%s_%u_%s", varRow->chrom, start1Based, alleles); - } -afVepNextColumn(self->f, self->doHtml); -// Location is chr:start for single-base, chr:start-end for indels: -if (varRow->end == start1Based) - afVepPrintf(self, "%s:%u", varRow->chrom, start1Based); -else if (start1Based > varRow->end) - afVepPrintf(self, "%s:%u-%u", varRow->chrom, varRow->end, start1Based); -else - afVepPrintf(self, "%s:%u-%u", varRow->chrom, start1Based, varRow->end); -afVepNextColumn(self->f, self->doHtml); -} - -INLINE void afVepPrintPlaceholders(FILE *f, int count, boolean doHtml) -/* VEP uses "-" for N/A. Sometimes there are several consecutive N/A columns. - * Count = 0 means print "-" with no tab. Count > 0 means print that many "-" columns. */ -{ -if (count == 0) - fputc('-', f); -else - { - int i; - for (i = 0; i < count; i++) - { - fputc('-', f); - afVepNextColumn(f, doHtml); - } - } -} - -#define afVepPrintPlaceholder(f, doHtml) afVepPrintPlaceholders(f, 1, doHtml) - -INLINE char *placeholderForEmpty(char *val) -/* Return VEP placeholder "-" if val is empty, otherwise return val. */ -{ -return (isEmpty(val) ? "-" : val); -} - - -static void afVepPrintGene(struct annoFormatVep *self, struct annoRow *gpvRow) -/* If the genePred portion of gpvRow contains a gene name (in addition to transcript name), - * print it out; otherwise print a placeholder. */ -{ -if (self->geneNameIx >= 0) - { - char **words = (char **)(gpvRow->data); - afVepPuts(placeholderForEmpty(words[self->geneNameIx]), self); - afVepNextColumn(self->f, self->doHtml); - } -else - afVepPrintPlaceholder(self->f, self->doHtml); -} - -static void limitLength(char *seq, int baseCount, char *unit) -/* If seq is longer than an abbreviated version of itself, change it to the abbreviated version. */ -{ -if (isEmpty(seq)) - return; -int len = strlen(seq); -const int elipsLen = 3; -char lengthNote[512]; -safef(lengthNote, sizeof(lengthNote), "(%d %s)", len, unit); -if (len > 2*baseCount + elipsLen + strlen(lengthNote)) - { - // First baseCount bases, then "...": - int offset = baseCount; - safecpy(seq+offset, len+1-offset, "..."); - offset += elipsLen; - // then last baseCount bases: - safecpy(seq+offset, len+1-offset, seq+len-baseCount); - offset += baseCount; - // then lengthNote: - safecpy(seq+offset, len+1-offset, lengthNote); - } -} - -static void tweakStopCodonAndLimitLength(char *aaSeq, char *codonSeq) -/* If aa from gpFx has a stop 'Z', replace it with "*". - * If the strings are very long, truncate with a note about how long they are. */ -{ -char *earlyStop = strchr(aaSeq, 'Z'); -if (earlyStop) - { - earlyStop[0] = '*'; - earlyStop[1] = '\0'; - } -limitLength(aaSeq, 5, "aa"); -limitLength(codonSeq, 12, "nt"); -} - -INLINE char *dashForEmpty(char *s) -/* Represent empty alleles in insertions/deletions by the customary "-". */ -{ -if (isEmpty(s)) - return "-"; -else - return s; -} - -static void afVepPrintPredictions(struct annoFormatVep *self, struct annoRow *varRow, - struct annoRow *gpvRow, struct gpFx *gpFx) -/* Print VEP columns computed by annoGratorGpVar (or placeholders) */ -{ -// variant allele used to calculate the consequence (or first alternate allele) -char *abbrevAllele = cloneString(gpFx->gAllele); -limitLength(abbrevAllele, 24, "nt"); -afVepPuts(placeholderForEmpty(abbrevAllele), self); -afVepNextColumn(self->f, self->doHtml); -// ID of affected gene -afVepPrintGene(self, gpvRow); -// ID of feature -afVepPrintf(self, "%s", placeholderForEmpty(gpFx->transcript)); -afVepNextColumn(self->f, self->doHtml); -// type of feature {Transcript, RegulatoryFeature, MotifFeature} -if (gpFx->soNumber == intergenic_variant) - afVepPrintPlaceholder(self->f, self->doHtml); -else - { - fputs("Transcript", self->f); - afVepNextColumn(self->f, self->doHtml); - } -// consequence: SO term e.g. splice_region_variant -afVepPuts(soTermToString(gpFx->soNumber), self); -afVepNextColumn(self->f, self->doHtml); -if (gpFx->detailType == codingChange) - { - struct codingChange *change = &(gpFx->details.codingChange); - uint refLen = strlen(change->txRef), altLen = strlen(change->txAlt); - boolean isInsertion = (refLen == 0); - boolean isDeletion = altLen < refLen; - if (isInsertion) - { - fprintf(self->f, "%u-%u", change->cDnaPosition, change->cDnaPosition+1); - afVepNextColumn(self->f, self->doHtml); - fprintf(self->f, "%u-%u", change->cdsPosition, change->cdsPosition+1); - } - else if (isDeletion && refLen > 1) - { - fprintf(self->f, "%u-%u", change->cDnaPosition+1, change->cDnaPosition+refLen); - afVepNextColumn(self->f, self->doHtml); - fprintf(self->f, "%u-%u", change->cdsPosition+1, change->cdsPosition+refLen); - } - else - { - fprintf(self->f, "%u", change->cDnaPosition+1); - afVepNextColumn(self->f, self->doHtml); - fprintf(self->f, "%u", change->cdsPosition+1); - } - afVepNextColumn(self->f, self->doHtml); - fprintf(self->f, "%u", change->pepPosition+1); - afVepNextColumn(self->f, self->doHtml); - int variantFrame = change->cdsPosition % 3; - strLower(change->codonOld); - int upLen = min(strlen(change->codonOld+variantFrame), refLen); - toUpperN(change->codonOld+variantFrame, upLen); - strLower(change->codonNew); - int alleleLength = altLen; - // watch out for symbolic alleles [actually now that we're using txAlt we shouldn't see these]: - if (sameString(change->txAlt, "") || sameString(change->txAlt, "<*>")) - alleleLength = upLen; - else if (startsWith("<", change->txAlt)) - alleleLength = 0; - toUpperN(change->codonNew+variantFrame, alleleLength); - tweakStopCodonAndLimitLength(change->aaOld, change->codonOld); - tweakStopCodonAndLimitLength(change->aaNew, change->codonNew); - fprintf(self->f, "%s/%s", dashForEmpty(change->aaOld), dashForEmpty(change->aaNew)); - afVepNextColumn(self->f, self->doHtml); - fprintf(self->f, "%s/%s", dashForEmpty(change->codonOld), dashForEmpty(change->codonNew)); - afVepNextColumn(self->f, self->doHtml); - } -else if (gpFx->detailType == nonCodingExon) - { - struct nonCodingExon *change = &(gpFx->details.nonCodingExon); - uint refLen = strlen(change->txRef), altLen = strlen(change->txAlt); - boolean isInsertion = (refLen == 0); - boolean isDeletion = altLen < refLen; - int cDnaPosition = change->cDnaPosition; - if (isInsertion) - fprintf(self->f, "%u-%u", cDnaPosition, cDnaPosition+1); - else if (isDeletion) - fprintf(self->f, "%u-%u", cDnaPosition+1, cDnaPosition+refLen); - else - fprintf(self->f, "%u", cDnaPosition+1); - afVepNextColumn(self->f, self->doHtml); - // Coding effect columns (except for cDnaPosition) are N/A: - afVepPrintPlaceholders(self->f, 4, self->doHtml); - } -else - // Coding effect columns are N/A: - afVepPrintPlaceholders(self->f, 5, self->doHtml); -freez(&abbrevAllele); -} - -static void afVepPrintExistingVar(struct annoFormatVep *self, struct annoRow *varRow, - struct annoStreamRows gratorData[], int gratorCount) -/* Print existing variant ID (or placeholder) */ -{ -if (self->snpNameIx >= 0) - { - if (gratorCount < 2 || gratorData[1].streamer != self->config->snpSource) - errAbort("annoFormatVep: config error, snpSource is not where expected"); - struct annoRow *snpRows = gratorData[1].rowList, *row; - if (snpRows != NULL) - { - int count = 0; - for (row = snpRows; row != NULL; row = row->next) - { - char **snpWords = (char **)(row->data); - if (row->start == varRow->start && row->end == varRow->end) - { - if (count > 0) - fputc(',', self->f); - afVepPrintf(self, "%s", snpWords[self->snpNameIx]); - count++; - } - } - if (count == 0) - afVepPrintPlaceholder(self->f, self->doHtml); - else - afVepNextColumn(self->f, self->doHtml); - } - else - afVepPrintPlaceholder(self->f, self->doHtml); - } -else - afVepPrintPlaceholder(self->f, self->doHtml); -} - -static boolean isCodingSnv(struct annoRow *primaryRow, struct gpFx *gpFx) -/* Return TRUE if this is a single-nucleotide non-synonymous change. */ -{ -if (gpFx == NULL || - gpFx->detailType != codingChange || - gpFx->soNumber == synonymous_variant) - return FALSE; -char *txRef = gpFx->details.codingChange.txRef; -char *txAlt = gpFx->details.codingChange.txAlt; -if (txRef == NULL || txAlt == NULL || - strlen(txRef) != 1 || sameString(txRef, "-") || - strlen(txAlt) != 1 || sameString(txAlt, "-")) - return FALSE; -return TRUE; -} - -static int commaSepFindIx(char *item, char *s) -/* Treating comma-separated, non-NULL s as an array of words, - * return the index of item in the array, or -1 if item is not in array. */ -{ -int itemLen = strlen(item); -int ix = 0; -char *p = strchr(s, ','); -while (p != NULL) - { - int elLen = (p - s); - if (elLen == itemLen && strncmp(s, item, itemLen) == 0) - return ix; - s = p+1; - p = strchr(s, ','); - ix++; - } -if (strlen(s) == itemLen && strncmp(s, item, itemLen) == 0) - return ix; -return -1; -} - -static int commaSepFindIntIx(int item, char *s) -/* Treating comma-separated, non-NULL s as an array of words that encode integers, - * return the index of item in the array, or -1 if item is not in array. */ -{ -char itemString[64]; -safef(itemString, sizeof(itemString), "%d", item); -return commaSepFindIx(itemString, s); -} - -static char *commaSepWordFromIxOrWhole(int ix, char *s, struct lm *lm) -/* Treating comma-separated, non-NULL s as an array of words, - * return the word at ix in the array -- but if s has no commas, - * just return (lmCloned) s. errAborts if s is comma-sep but ix is out of bounds. */ -{ -int i = 0; -char *p = strchr(s, ','); -if (p == NULL) - return lmCloneString(lm, s); -while (p != NULL) - { - if (i == ix) - return lmCloneStringZ(lm, s, p-s); - s = p+1; - p = strchr(s, ','); - i++; - } -if (i == ix) - return lmCloneString(lm, s); -errAbort("commaSepWordFromIxOrWhole: Bad index %d for string '%s'", ix, s); -return NULL; -} - -INLINE void afVepNewExtra(struct annoFormatVep *self, boolean *pGotExtra) -/* If we already printed an extra item, print the extra column's separator; set pGotExtra. */ -{ -assert(pGotExtra); -if (*pGotExtra) - { - if (self->doHtml) - fputs("; ", self->f); - else - fputc(';', self->f); - } -*pGotExtra = TRUE; -} - -static void afVepPrintDbNsfpSift(struct annoFormatVep *self, - struct annoFormatVepExtraSource *extraSrc, - struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, - boolean *pGotExtra) -/* Match the allele from gpFx to the per-allele scores in row from dbNsfpSift. */ -{ -// Look up column indices only once: -static int ensTxIdIx=-1, altAl1Ix, score1Ix, altAl2Ix, score2Ix, altAl3Ix, score3Ix; -if (ensTxIdIx == -1) - { - struct asColumn *columns = extraSrc->source->asObj->columnList; - ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); - altAl1Ix = asColumnMustFindIx(columns, "altAl1"); - score1Ix = asColumnMustFindIx(columns, "score1"); - altAl2Ix = asColumnMustFindIx(columns, "altAl2"); - score2Ix = asColumnMustFindIx(columns, "score2"); - altAl3Ix = asColumnMustFindIx(columns, "altAl3"); - score3Ix = asColumnMustFindIx(columns, "score3"); - } - -struct annoRow *row; -for (row = extraRows; row != NULL; row = row->next) - { - // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx - // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame) - char **words = row->data; - if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && - commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) - continue; - struct annoFormatVepExtraItem *extraItem = extraSrc->items; - char *scoreStr = NULL; - if (sameString(gpFx->gAllele, words[altAl1Ix])) - scoreStr = words[score1Ix]; - else if (sameString(gpFx->gAllele, words[altAl2Ix])) - scoreStr = words[score2Ix]; - else if (sameString(gpFx->gAllele, words[altAl3Ix])) - scoreStr = words[score3Ix]; - double score = atof(scoreStr); - char prediction = '?'; - if (score < 0.05) - prediction = 'D'; - else - prediction = 'T'; - if (isNotEmpty(scoreStr) && differentString(scoreStr, ".")) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "%s=%c(%s)", extraItem->tag, prediction, scoreStr); - } - } -} - -static void afVepPrintDbNsfpPolyPhen2(struct annoFormatVep *self, - struct annoFormatVepExtraSource *extraSrc, - struct annoRow *extraRows, struct gpFx *gpFx, - boolean *pGotExtra) -/* Match the allele from gpFx to the per-allele scores in each row from dbNsfpPolyPhen2. */ -{ -// Look up column indices only once: -static int aaPosIx=-1, - altAl1Ix, hDivScore1Ix, hDivPred1Ix, hVarScore1Ix, hVarPred1Ix, - altAl2Ix, hDivScore2Ix, hDivPred2Ix, hVarScore2Ix, hVarPred2Ix, - altAl3Ix, hDivScore3Ix, hDivPred3Ix, hVarScore3Ix, hVarPred3Ix; -if (aaPosIx == -1) - { - struct asColumn *columns = extraSrc->source->asObj->columnList; - aaPosIx = asColumnMustFindIx(columns, "uniProtAaPos"); - altAl1Ix = asColumnMustFindIx(columns, "altAl1"); - hDivScore1Ix = asColumnMustFindIx(columns, "hDivScore1"); - hDivPred1Ix = asColumnMustFindIx(columns, "hDivPred1"); - hVarScore1Ix = asColumnMustFindIx(columns, "hVarScore1"); - hVarPred1Ix = asColumnMustFindIx(columns, "hVarPred1"); - altAl2Ix = asColumnMustFindIx(columns, "altAl2"); - hDivScore2Ix = asColumnMustFindIx(columns, "hDivScore2"); - hDivPred2Ix = asColumnMustFindIx(columns, "hDivPred2"); - hVarScore2Ix = asColumnMustFindIx(columns, "hVarScore2"); - hVarPred2Ix = asColumnMustFindIx(columns, "hVarPred2"); - altAl3Ix = asColumnMustFindIx(columns, "altAl3"); - hDivScore3Ix = asColumnMustFindIx(columns, "hDivScore3"); - hDivPred3Ix = asColumnMustFindIx(columns, "hDivPred3"); - hVarScore3Ix = asColumnMustFindIx(columns, "hVarScore3"); - hVarPred3Ix = asColumnMustFindIx(columns, "hVarPred3"); - } - -struct codingChange *cc = &(gpFx->details.codingChange); -int count = 0; -struct annoRow *row; -for (row = extraRows; row != NULL; row = row->next) - { - char **words = row->data; -//#*** polyphen2 can actually have multiple scores/preds for the same pepPosition... -//#*** so what we really should do is loop on comma-sep words[aaPosIx] and print scores/preds -//#*** whenever pepPosition matches. - int txIx = commaSepFindIntIx(cc->pepPosition+1, words[aaPosIx]); - if (txIx < 0) - continue; - struct annoFormatVepExtraItem *extraItem = extraSrc->items; - for (; extraItem != NULL; extraItem = extraItem->next) - { - boolean isHdiv = (stringIn("HDIV", extraItem->tag) != NULL); - int predIx = -1, scoreIx = -1; - if (sameString(gpFx->gAllele, words[altAl1Ix])) - { - predIx = isHdiv ? hDivPred1Ix : hVarPred1Ix; - scoreIx = isHdiv ? hDivScore1Ix : hVarScore1Ix; - } - else if (sameString(gpFx->gAllele, words[altAl2Ix])) - { - predIx = isHdiv ? hDivPred2Ix : hVarPred2Ix; - scoreIx = isHdiv ? hDivScore2Ix : hVarScore2Ix; - } - else if (sameString(gpFx->gAllele, words[altAl3Ix])) - { - predIx = isHdiv ? hDivPred3Ix : hVarPred3Ix; - scoreIx = isHdiv ? hDivScore3Ix : hVarScore3Ix; - } - char *pred = (predIx < 0) ? NULL : words[predIx]; - if (pred == NULL || sameString(pred, ".")) - continue; - pred = commaSepWordFromIxOrWhole(txIx, pred, self->lm); - if (isNotEmpty(pred)) - { - if (count == 0) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "%s=", extraItem->tag); - } - else - fputc(',', self->f); - char *score = (scoreIx < 0) ? "?" : commaSepWordFromIxOrWhole(txIx, words[scoreIx], self->lm); - fprintf(self->f, "%s(%s)", pred, score); - count++; - } - } - } -} - -static void afVepPrintDbNsfpMutationTA(struct annoFormatVep *self, - struct annoFormatVepExtraSource *extraSrc, - struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, - boolean *pGotExtra) -/* Match the allele from gpFx to the per-allele scores in row from dbNsfpMutationTaster - * or dbNsfpMutationAssessor -- they have identical column indices. */ -{ -// Look up column indices only once: -static int ensTxIdIx=-1, altAl1Ix, score1Ix, pred1Ix, - altAl2Ix, score2Ix, pred2Ix, altAl3Ix, score3Ix, pred3Ix; -if (ensTxIdIx == -1) - { - struct asColumn *columns = extraSrc->source->asObj->columnList; - ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); - altAl1Ix = asColumnMustFindIx(columns, "altAl1"); - score1Ix = asColumnMustFindIx(columns, "score1"); - pred1Ix = asColumnMustFindIx(columns, "pred1"); - altAl2Ix = asColumnMustFindIx(columns, "altAl2"); - score2Ix = asColumnMustFindIx(columns, "score2"); - pred2Ix = asColumnMustFindIx(columns, "pred2"); - altAl3Ix = asColumnMustFindIx(columns, "altAl3"); - score3Ix = asColumnMustFindIx(columns, "score3"); - pred3Ix = asColumnMustFindIx(columns, "pred3"); - } - -struct annoRow *row; -for (row = extraRows; row != NULL; row = row->next) - { - // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx - // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame) - char **words = row->data; - if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && - commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) - continue; - struct annoFormatVepExtraItem *extraItem = extraSrc->items; - char *score = NULL, *pred = NULL; - if (sameString(gpFx->gAllele, words[altAl1Ix])) - { - score = words[score1Ix]; - pred = words[pred1Ix]; - } - else if (sameString(gpFx->gAllele, words[altAl2Ix])) - { - score = words[score2Ix]; - pred = words[pred2Ix]; - } - else if (sameString(gpFx->gAllele, words[altAl3Ix])) - { - score = words[score3Ix]; - pred = words[pred3Ix]; - } - if (isNotEmpty(score) && differentString(score, ".")) - { - afVepNewExtra(self, pGotExtra); - if (isEmpty(pred)) - pred = "?"; - fprintf(self->f, "%s=%s(%s)", extraItem->tag, pred, score); - } - } -} - -static void afVepPrintDbNsfpLrt(struct annoFormatVep *self, - struct annoFormatVepExtraSource *extraSrc, - struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, - boolean *pGotExtra) -/* Match the allele from gpFx to the per-allele scores in row from dbNsfpLrt -- - * it also has omega{1,2,3} columns, but I'm not sure what those mean so am leaving out for now. */ -{ -// Look up column indices only once: -static int ensTxIdIx=-1, altAl1Ix, score1Ix, pred1Ix, - altAl2Ix, score2Ix, pred2Ix, altAl3Ix, score3Ix, pred3Ix; -if (ensTxIdIx == -1) - { - struct asColumn *columns = extraSrc->source->asObj->columnList; - ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); - altAl1Ix = asColumnMustFindIx(columns, "altAl1"); - score1Ix = asColumnMustFindIx(columns, "score1"); - pred1Ix = asColumnMustFindIx(columns, "pred1"); - altAl2Ix = asColumnMustFindIx(columns, "altAl2"); - score2Ix = asColumnMustFindIx(columns, "score2"); - pred2Ix = asColumnMustFindIx(columns, "pred2"); - altAl3Ix = asColumnMustFindIx(columns, "altAl3"); - score3Ix = asColumnMustFindIx(columns, "score3"); - pred3Ix = asColumnMustFindIx(columns, "pred3"); - } - -struct annoRow *row; -for (row = extraRows; row != NULL; row = row->next) - { - // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx - // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame) - char **words = row->data; - if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && - commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) - continue; - struct annoFormatVepExtraItem *extraItem = extraSrc->items; - char *score = NULL, *pred = NULL; - if (sameString(gpFx->gAllele, words[altAl1Ix])) - { - score = words[score1Ix]; - pred = words[pred1Ix]; - } - else if (sameString(gpFx->gAllele, words[altAl2Ix])) - { - score = words[score2Ix]; - pred = words[pred2Ix]; - } - else if (sameString(gpFx->gAllele, words[altAl3Ix])) - { - score = words[score3Ix]; - pred = words[pred3Ix]; - } - if (isNotEmpty(score) && differentString(score, ".")) - { - afVepNewExtra(self, pGotExtra); - if (isEmpty(pred)) - pred = "?"; - fprintf(self->f, "%s=%s(%s)", extraItem->tag, pred, score); - } - } -} - -static void afVepPrintDbNsfpVest(struct annoFormatVep *self, - struct annoFormatVepExtraSource *extraSrc, - struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, - boolean *pGotExtra) -/* Match the allele from gpFx to the per-allele scores in row from dbNsfpVest */ -{ -// TODO: also compare var[123] protein change columns to gpFx to make sure we have the right one -// Look up column indices only once: -static int ensTxIdIx=-1, altAl1Ix, score1Ix, altAl2Ix, score2Ix, altAl3Ix, score3Ix; -if (ensTxIdIx == -1) - { - struct asColumn *columns = extraSrc->source->asObj->columnList; - ensTxIdIx = asColumnMustFindIx(columns, "ensTxId"); - altAl1Ix = asColumnMustFindIx(columns, "altAl1"); - score1Ix = asColumnMustFindIx(columns, "score1"); - altAl2Ix = asColumnMustFindIx(columns, "altAl2"); - score2Ix = asColumnMustFindIx(columns, "score2"); - altAl3Ix = asColumnMustFindIx(columns, "altAl3"); - score3Ix = asColumnMustFindIx(columns, "score3"); - } - -struct annoRow *row; -for (row = extraRows; row != NULL; row = row->next) - { - // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx - char **words = row->data; - if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") && - commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0) - continue; - struct annoFormatVepExtraItem *extraItem = extraSrc->items; - char *score = NULL; - if (sameString(gpFx->gAllele, words[altAl1Ix])) - score = words[score1Ix]; - else if (sameString(gpFx->gAllele, words[altAl2Ix])) - score = words[score2Ix]; - else if (sameString(gpFx->gAllele, words[altAl3Ix])) - score = words[score3Ix]; - if (isNotEmpty(score) && differentString(score, ".")) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "%s=%s", extraItem->tag, score); - } - } -} - -static void afVepPrintDbNsfpInterPro(struct annoFormatVep *self, - struct annoFormatVepExtraSource *extraSrc, - struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId, - boolean *pGotExtra) -/* Print out any overlapping (comma-sep list, HTML-encoded ',' and '=') InterPro domains. */ -{ -// Look up column indices only once: -static int domainsIx = -1; -if (domainsIx == -1) - { - struct asColumn *columns = extraSrc->source->asObj->columnList; - domainsIx = asColumnMustFindIx(columns, "domains"); - } - -struct annoRow *row; -for (row = extraRows; row != NULL; row = row->next) - { - char **words = row->data; - struct annoFormatVepExtraItem *extraItem = extraSrc->items; - char *domains = words[domainsIx]; - int lastC = strlen(domains) - 1; - if (domains[lastC] == ',') - domains[lastC] = '\0'; - if (isNotEmpty(domains) && differentString(domains, ".")) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "%s=%s", extraItem->tag, domains); - } - } -} - -static boolean allelesAgree(char altNt, char altAa, char **words) -/* Return TRUE if dbNsfpSeqChange words have altAa associated with altNt. */ -{ -//#*** TODO: handle stop codon representation -if ((altNt == words[11][0] && altAa == words[12][0]) || - (altNt == words[13][0] && altAa == words[14][0]) || - (altNt == words[15][0] && altAa == words[16][0])) - return TRUE; -return FALSE; -} - -static struct annoRow *getRowsFromSource(struct annoStreamer *src, - struct annoStreamRows gratorData[], int gratorCount) -/* Search gratorData for src, and return its rows when found. */ -{ -int i; -for (i = 0; i < gratorCount; i++) - { - if (gratorData[i].streamer == src) - return gratorData[i].rowList; - } -errAbort("annoFormatVep: Can't find source %s in gratorData", src->name); -return NULL; -} - -static char *getDbNsfpEnsTx(struct annoFormatVep *self, struct gpFx *gpFx, - struct annoStreamRows *gratorData, int gratorCount) -/* Find the Ensembl transcript ID, if any, for which dbNsfp has results consistent - * with gpFx. */ -{ -int i; -for (i = 0; i < gratorCount; i++) - { - struct annoStreamer *source = gratorData[i].streamer; - if (!sameString(source->asObj->name, "dbNsfpSeqChange")) //#*** need metadata! - continue; - struct codingChange *cc = &(gpFx->details.codingChange); - struct annoRow *extraRows = getRowsFromSource(source, gratorData, gratorCount); - struct annoRow *row; - for (row = extraRows; row != NULL; row = row->next) - { - char **words = row->data; - if (strcasecmp(cc->codonOld, words[7]) != 0) - continue; - if (!allelesAgree(gpFx->gAllele[0], cc->aaNew[0], words)) - continue; - int txIx = commaSepFindIntIx(cc->pepPosition+1, words[10]); - if (txIx >= 0) - { - if (sameString(words[4], ".")) - return "."; - char *ensTxId = commaSepWordFromIxOrWhole(txIx, words[4], self->lm); - return ensTxId; - } - } - break; - } -return NULL; -} - -static void afVepPrintExtrasDbNsfp(struct annoFormatVep *self, struct annoRow *varRow, - struct annoRow *gpvRow, struct gpFx *gpFx, - struct annoStreamRows gratorData[], int gratorCount, - boolean *pGotExtra) -/* Print the Extra column's tag=value; components from dbNSFP data, if we have any. */ -{ -// dbNSFP has data only for coding non-synonymous single-nucleotide changes: -if (!isCodingSnv(varRow, gpFx)) - return; -// Does dbNsfpSeqChange have a coding change consistent with gpFx?: -char *ensTxId = getDbNsfpEnsTx(self, gpFx, gratorData, gratorCount); -// Now cycle through selected dbNsfp* sources, printing out scores for ensTxId: -struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; -for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) - { - char *asObjName = extraSrc->source->asObj->name; - if (!startsWith("dbNsfp", asObjName)) - continue; - struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount); - if (sameString(asObjName, "dbNsfpPolyPhen2")) - { - // PolyPhen2 is based on UniProt proteins, not GENCODE/Ensembl transcripts, - // so ensTxId doesn't apply. - afVepPrintDbNsfpPolyPhen2(self, extraSrc, extraRows, gpFx, pGotExtra); - if (ensTxId == NULL) - break; // all done now, no need to keep looking - } - else if (ensTxId != NULL) - { - if (sameString(asObjName, "dbNsfpSift")) - afVepPrintDbNsfpSift(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); - else if (sameString(asObjName, "dbNsfpMutationTaster") || - sameString(asObjName, "dbNsfpMutationAssessor")) - afVepPrintDbNsfpMutationTA(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); - else if (sameString(asObjName, "dbNsfpLrt")) - afVepPrintDbNsfpLrt(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); - else if (sameString(asObjName, "dbNsfpVest")) - afVepPrintDbNsfpVest(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); - else if (sameString(asObjName, "dbNsfpInterPro")) - afVepPrintDbNsfpInterPro(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra); - else if (!sameString(asObjName, "dbNsfpSeqChange")) - errAbort("Unrecognized asObj->name '%s' from dbNSFP source '%s'", - asObjName, extraSrc->source->name); - } - } -} - -static void afVepPrintExtrasHgvs(struct annoFormatVep *self, struct annoRow *gpvRow, - boolean *pGotExtra) -/* If not empty, print HGVS terms (last column of gpvRow). */ -{ -char **row = gpvRow->data; -if (isNotEmpty(row[self->hgvsGIx])) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "HGVSG=%s", row[self->hgvsGIx]); - } -if (isNotEmpty(row[self->hgvsCNIx])) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "HGVSCN=%s", row[self->hgvsCNIx]); - } -if (isNotEmpty(row[self->hgvsPIx])) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "HGVSP=%s", row[self->hgvsPIx]); - } -} - -static void afVepPrintExtraWigVec(struct annoFormatVepExtraSource *extraSrc, - struct annoFormatVep *self, - struct annoFormatVepExtraItem *extraItem, - struct annoRow *extraRows, struct annoRow *gpvRow, - boolean *pGotExtra) -/* Print values from a wig source with type arWigVec (per-base floats) */ -{ -int i; -struct annoRow *row; -for (i = 0, row = extraRows; row != NULL; i++, row = row->next) - { - float *vector = row->data; - int len = row->end - row->start; - int j; - for (j = 0; j < len; j++) - { - if (i+j == 0) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "%s=", extraItem->tag); - } - else - fputc(',', self->f); - fprintf(self->f, "%g", vector[j]); - } - } -} - -static void afVepPrintExtraWigSingle(struct annoFormatVepExtraSource *extraSrc, - struct annoFormatVep *self, - struct annoFormatVepExtraItem *extraItem, - struct annoRow *extraRows, struct annoRow *gpvRow, - boolean *pGotExtra) -/* Print values from a wig source with type arWigSingle (one double per row, e.g. an average) */ -{ -struct annoRow *row; -for (row = extraRows; row != NULL; row = row->next) - { - double *pValue = row->data; - if (row == extraRows) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "%s=", extraItem->tag); - } - else - fputc(',', self->f); - fprintf(self->f, "%g", *pValue); - } -} - -static void afVepPrintExtraWords(struct annoFormatVepExtraSource *extraSrc, - struct annoFormatVep *self, - struct annoFormatVepExtraItem *extraItem, - struct annoRow *extraRows, struct annoRow *gpvRow, - boolean *pGotExtra) -/* Print comma-separated values in the specified column from the usual array-of-words. */ -{ -if (extraItem->rowIx < 0) - errAbort("annoFormatVep: invalid rowIx for tag %s", extraItem->tag); -char **gpvWords = gpvRow ? gpvRow->data : NULL; -char *gpvTranscript = gpvWords ? gpvWords[self->txNameIx] : NULL; -boolean gotGpvTx = FALSE; -int i; -struct annoRow *row; -for (i = 0, row = extraRows; row != NULL; row = row->next) - { - char **words = row->data; - char *extraTranscript = words[0]; - // If extraSrc happens to be gpVarSource, use only the row that matches the current - // transcript, and use it only once. - if (extraSrc->source == self->config->gpVarSource) - { - extraTranscript = words[self->txNameIx]; - if (gpvTranscript == NULL || differentString(extraTranscript, gpvTranscript) || gotGpvTx) - continue; - gotGpvTx = TRUE; - } - char *val = words[extraItem->rowIx]; - if (isEmpty(val)) - continue; - // Strip trailing comma if found - int valLen = strlen(val); - if (val[valLen-1] == ',') - val[valLen-1] = '\0'; - if (i == 0) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "%s=", extraItem->tag); - if (extraItem->isBoolean) - fputs("YES", self->f); - } - else if (! extraItem->isBoolean) - fputc(',', self->f); - if (! extraItem->isBoolean) - { - // Watch out for characters that will mess up parsing of EXTRAS column: - subChar(val, '=', '_'); - subChar(val, ';', '_'); - afVepPuts(val, self); - } - i++; - } -} - -static void afVepPrintExtrasOther(struct annoFormatVep *self, struct annoRow *varRow, - struct annoRow *gpvRow, struct gpFx *gpFx, - struct annoStreamRows gratorData[], int gratorCount, - boolean includeRegulatory, boolean *pGotExtra) -/* Print the Extra column's tag=value; components (other than dbNSFP) if we have any. */ -{ -struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; -for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) - { - char *asObjName = extraSrc->source->asObj->name; - if (startsWith("dbNsfp", asObjName)) - continue; - struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount); - if (extraRows != NULL) - { - struct annoFormatVepExtraItem *extraItem; - for (extraItem = extraSrc->items; extraItem != NULL; extraItem = extraItem->next) - if (includeRegulatory || ! extraItem->isRegulatory) - extraSrc->printExtra(extraSrc, self, extraItem, extraRows, gpvRow, pGotExtra); - } - } -// VEP automatically adds DISTANCE for upstream/downstream variants -if (gpFx && (gpFx->soNumber == upstream_gene_variant || gpFx->soNumber == downstream_gene_variant)) - { - afVepNewExtra(self, pGotExtra); - // In order to really understand the boundary conditions of Ensembl's DISTANCE function, - // I'll have to read the code someday. Upstream deletions on + strand still not handled - // quite right. - int ensemblFudge = 0; - int distance = gpvRow->start - varRow->start + ensemblFudge; - if (distance < 0) - { - ensemblFudge = 1; - distance = varRow->start - gpvRow->end + ensemblFudge; - } - fprintf(self->f, "DISTANCE=%d", distance); - } -boolean includeExonNumber = TRUE; //#*** optional in VEP -if (includeExonNumber && gpFx) - { - // Add Exon or intron number if applicable - enum detailType deType = gpFx->detailType; - int num = -1, count = 0; - if (deType == codingChange) - { - num = gpFx->details.codingChange.exonNumber; - count = gpFx->details.codingChange.exonCount; - } - else if (deType == nonCodingExon) - { - num = gpFx->details.nonCodingExon.exonNumber; - count = gpFx->details.nonCodingExon.exonCount; - } - else if (deType == intron) - { - num = gpFx->details.intron.intronNumber; - count = gpFx->details.intron.intronCount; - } - if (num >= 0) - { - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "%s=%d/%d", (deType == intron) ? "INTRON" : "EXON", num+1, count); - } - } -if (!*pGotExtra) - afVepPrintPlaceholders(self->f, 0, self->doHtml); -} - -static void afVepLmCleanup(struct annoFormatVep *self) -/* Clean out localmem every 1k rows. */ -{ -self->lmRowCount++; -if (self->lmRowCount > 1024) - { - lmCleanup(&(self->lm)); - self->lm = lmInit(0); - self->lmRowCount = 0; - } -} - -static void afVepPrintOneLineGpVar(struct annoFormatVep *self, struct annoStreamRows *varData, - struct annoRow *gpvRow, - struct annoStreamRows gratorData[], int gratorCount) -/* Print one line of VEP with a coding gene consequence: a variant, an allele, - * functional consequences of that allele, and whatever else is included in the config. */ -{ -afVepLmCleanup(self); -struct annoRow *varRow = varData->rowList; -struct gpFx *gpFx = annoGratorGpVarGpFxFromRow(self->config->gpVarSource, gpvRow, self->lm); -afVepStartRow(self->f, self->doHtml); -afVepPrintNameAndLoc(self, varRow); -afVepPrintPredictions(self, varRow, gpvRow, gpFx); -afVepPrintExistingVar(self, varRow, gratorData, gratorCount); -boolean gotExtra = FALSE; -afVepPrintExtrasDbNsfp(self, varRow, gpvRow, gpFx, gratorData, gratorCount, &gotExtra); -afVepPrintExtrasHgvs(self, gpvRow, &gotExtra); -afVepPrintExtrasOther(self, varRow, gpvRow, gpFx, gratorData, gratorCount, FALSE, &gotExtra); -afVepEndRow(self->f, self->doHtml); -} - -static boolean afVepIsRegulatory(struct annoFormatVep *self, - struct annoStreamRows gratorData[], int gratorCount) -/* Return TRUE if one of the extraSource items is a regulatory region. */ -{ -struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc; -for (extraSrc = extras; extraSrc != NULL; extraSrc = extraSrc->next) - { - struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount); - if (extraRows != NULL) - { - struct annoFormatVepExtraItem *extraItem; - for (extraItem = extraSrc->items; extraItem != NULL; extraItem = extraItem->next) - { - if (extraItem->isRegulatory) - return TRUE; - } - } - } -return FALSE; -} - -static char *afVepGetFirstAltAllele(struct annoFormatVep *self, struct annoRow *varRow) -/* For VEP we must print out some alternate allele, whether the alternate allele is - * related to any other info (like coding changes) or not. Pick the first non-reference - * allele. */ -{ -int refAlBufSize = varRow->end - varRow->start + 1; -char refAllele[refAlBufSize]; -annoAssemblyGetSeq(self->assembly, varRow->chrom, varRow->start, varRow->end, - refAllele, sizeof(refAllele)); -struct variant *variant = NULL; -if (self->variantType == afvdtVcf) - variant = variantFromVcfAnnoRow(varRow, refAllele, self->lm, self->dyScratch); -else if (self->variantType == afvdtPgSnpTable) - variant = variantFromPgSnpAnnoRow(varRow, refAllele, TRUE, self->lm); -else if (self->variantType == afvdtPgSnpFile) - variant = variantFromPgSnpAnnoRow(varRow, refAllele, FALSE, self->lm); -return firstAltAllele(variant->alleles); -} - -static void afVepPrintPredictionsReg(struct annoFormatVep *self, char *alt) -/* Print VEP allele, placeholder gene and item names, 'RegulatoryFeature', - * 'regulatory_region_variant', and placeholder coding effect columns. */ -{ -afVepPuts(alt, self); -afVepNextColumn(self->f, self->doHtml); -afVepPrintPlaceholders(self->f, 2, self->doHtml); -fputs("RegulatoryFeature", self->f); -afVepNextColumn(self->f, self->doHtml); -fputs(soTermToString(regulatory_region_variant), self->f); -afVepNextColumn(self->f, self->doHtml); -// Coding effect columns are N/A: -afVepPrintPlaceholders(self->f, 5, self->doHtml); -} - -static void afVepPrintExtrasHgvsGOnly(struct annoFormatVep *self, struct annoRow *varRow, char *alt, - boolean *pGotExtra) -/* Make our own HGVS g. term (when we aren't taking HGVS terms from annoGratorGpVar). */ -{ -if (self->hgvsMakeG) - { - struct bed3 *variantBed = (struct bed3 *)varRow; - char *chromAcc = hRefSeqAccForChrom(self->assembly->name, varRow->chrom); - char *hgvsG = hgvsGFromVariant(self->gSeqWin, variantBed, alt, chromAcc, self->hgvsBreakDelIns); - afVepNewExtra(self, pGotExtra); - fprintf(self->f, "HGVSG=%s", hgvsG); - freeMem(hgvsG); - } -} - -static void afVepPrintRegulatory(struct annoFormatVep *self, struct annoStreamRows *varData, - struct annoStreamRows gratorData[], int gratorCount) -/* If there are EXTRAS column sources that imply regulatory_region_variant, print out - * lines with that consequence (no gpFx/dbNSFP). */ -{ -if (afVepIsRegulatory(self, gratorData, gratorCount)) - { - afVepLmCleanup(self); - struct annoRow *varRow = varData->rowList; - afVepStartRow(self->f, self->doHtml); - afVepPrintNameAndLoc(self, varRow); - char *alt = afVepGetFirstAltAllele(self, varRow); - afVepPrintPredictionsReg(self, alt); - afVepPrintExistingVar(self, varRow, gratorData, gratorCount); - boolean gotExtra = FALSE; - afVepPrintExtrasHgvsGOnly(self, varRow, alt, &gotExtra); - afVepPrintExtrasOther(self, varRow, NULL, NULL, gratorData, gratorCount, TRUE, &gotExtra); - afVepEndRow(self->f, self->doHtml); - } -} - -static void afVepFormatOne(struct annoFormatter *fSelf, struct annoStreamRows *primaryData, - struct annoStreamRows gratorData[], int gratorCount) -/* Print one variant's VEP (possibly multiple lines) using collected rows. */ -{ -struct annoFormatVep *self = (struct annoFormatVep *)fSelf; -struct annoRow *gpVarRows = gratorData[0].rowList, *gpvRow; -for (gpvRow = gpVarRows; gpvRow != NULL; gpvRow = gpvRow->next) - afVepPrintOneLineGpVar(self, primaryData, gpvRow, gratorData, gratorCount); -afVepPrintRegulatory(self, primaryData, gratorData, gratorCount); -} - -static void afVepEndHtml(struct annoFormatVep *self) -{ -fputs("
", self->f); - fputs(*pLabel++, self->f); - while (*pLabel != NULL) - { - fputs("", self->f); - fputs(*pLabel++, self->f); - } - fputs("

\n", self->f); -} - -static void afVepClose(struct annoFormatter **pFSelf) -/* Close file handle, free self. */ -{ -if (pFSelf == NULL) - return; -struct annoFormatVep *self = *(struct annoFormatVep **)pFSelf; -if (self->doHtml) - afVepEndHtml(self); -freeMem(self->fileName); -carefulClose(&(self->f)); -lmCleanup(&(self->lm)); -dyStringFree(&(self->dyScratch)); -chromSeqWindowFree(&self->gSeqWin); -annoFormatterFree(pFSelf); -} - -static void afVepSetConfig(struct annoFormatVep *self, struct annoFormatVepConfig *config) -/* Check config and figure out where various output columns will come from. */ -{ -self->config = config; -struct asColumn *varAsColumns = config->variantSource->asObj->columnList; -self->varNameIx = -1; -self->varAllelesIx = -1; -if (asObjectsMatch(config->variantSource->asObj, pgSnpAsObj())) - self->variantType = afvdtPgSnpTable; -else if (asObjectsMatch(config->variantSource->asObj, pgSnpFileAsObj())) - self->variantType = afvdtPgSnpFile; -else if (asObjectsMatch(config->variantSource->asObj, vcfAsObj())) - { - self->variantType = afvdtVcf; - self->varNameIx = asColumnMustFindIx(varAsColumns, "id"); - } -else - errAbort("afVepSetConfig: variant source %s doesn't look like pgSnp or VCF", - config->variantSource->name); -if (self->variantType == afvdtPgSnpTable || self->variantType == afvdtPgSnpFile) - { - // pgSnp's "name" column actually contains slash-separated alleles - self->varAllelesIx = asColumnMustFindIx(varAsColumns, "name"); - } -if (config->gpVarSource == NULL) - errAbort("afVepSetConfig: config must have a gpVarSource"); -else if (! asColumnNamesMatchFirstN(config->gpVarSource->asObj, genePredAsObj(), 10) && - ! asColumnNamesMatchFirstN(config->gpVarSource->asObj, bigGenePredAsObj(), 16) && - ! asColumnNamesMatchFirstN(config->gpVarSource->asObj, annoStreamDbPslPlusAsObj(), - PSLPLUS_NUM_COLS)) - errAbort("afVepSetConfig: gpVarSource %s doesn't look like genePred or pslPlus", - config->gpVarSource->name); -// refGene and augmented knownGene have extra name fields that have the HGNC gene symbol: -struct asColumn *gpvAsColumns = config->gpVarSource->asObj->columnList; -self->txNameIx = asColumnFindIx(gpvAsColumns, "name"); -if (self->txNameIx < 0) - self->txNameIx = asColumnFindIx(gpvAsColumns, "qName"); -if (self->txNameIx < 0) - errAbort("afVepSetConfig: can't find name or qName column in gpVarSource %s", - config->gpVarSource->name); -self->geneNameIx = asColumnFindIx(gpvAsColumns, "geneSymbol"); -if (self->geneNameIx < 0) - self->geneNameIx = asColumnFindIx(gpvAsColumns, "kgXref_geneSymbol"); -if (self->geneNameIx < 0) - self->geneNameIx = asColumnFindIx(gpvAsColumns, "name2"); -if (self->geneNameIx < 0) - self->geneNameIx = asColumnFindIx(gpvAsColumns, "proteinID"); -self->hgvsGIx = asColumnMustFindIx(gpvAsColumns, "hgvsG"); -self->hgvsCNIx = asColumnMustFindIx(gpvAsColumns, "hgvsCN"); -self->hgvsPIx = asColumnMustFindIx(gpvAsColumns, "hgvsP"); -if (config->snpSource != NULL) - { - struct asColumn *snpAsColumns = config->snpSource->asObj->columnList; - self->snpNameIx = asColumnMustFindIx(snpAsColumns, "name"); - } -else - self->snpNameIx = -1; -} - -struct annoFormatVepConfig *annoFormatVepConfigNew(struct annoStreamer *variantSource, - char *variantDescription, - struct annoStreamer *gpVarSource, - char *gpVarDescription, - struct annoStreamer *snpSource, - char *snpDescription) -/* Return a basic configuration for VEP output. variantSource and gpVarSource must be - * provided; snpSource can be NULL. */ -{ -struct annoFormatVepConfig *config; -AllocVar(config); -config->variantSource = variantSource; -config->variantDescription = cloneString(variantDescription); -config->gpVarSource = gpVarSource; -config->gpVarDescription = cloneString(gpVarDescription); -config->snpSource = snpSource; -config->snpDescription = cloneString(snpDescription); -return config; -} - -struct annoFormatter *annoFormatVepNew(char *fileName, boolean doHtml, - struct annoStreamer *variantSource, - char *variantDescription, - struct annoStreamer *gpVarSource, - char *gpVarDescription, - struct annoStreamer *snpSource, - char *snpDescription, - struct annoAssembly *assembly) -/* Return a formatter that will write functional predictions in the same format as Ensembl's - * Variant Effect Predictor to fileName (can be "stdout"). - * variantSource and gpVarSource must be provided; snpSource can be NULL. */ -{ -struct annoFormatVep *self; -AllocVar(self); -struct annoFormatter *fSelf = &(self->formatter); -fSelf->getOptions = annoFormatterGetOptions; -fSelf->setOptions = annoFormatterSetOptions; -fSelf->initialize = afVepInitialize; -fSelf->comment = afVepComment; -fSelf->formatOne = afVepFormatOne; -fSelf->close = afVepClose; -self->fileName = cloneString(fileName); -self->f = mustOpen(fileName, "w"); -self->lm = lmInit(0); -self->dyScratch = dyStringNew(0); -self->needHeader = TRUE; -self->doHtml = doHtml; -self->assembly = assembly; -struct annoFormatVepConfig *config = annoFormatVepConfigNew(variantSource, variantDescription, - gpVarSource, gpVarDescription, - snpSource, snpDescription); -afVepSetConfig(self, config); -return (struct annoFormatter *)self; -} - -static void afvAddExtraItemMaybeReg(struct annoFormatter *fSelf, struct annoStreamer *extraSource, - char *tag, char *description, char *column, boolean isBoolean, - boolean isReg) -/* Tell annoFormatVep that it should include the given column of extraSource - * in the EXTRAS column with tag. The VEP header will include tag's description. - * If isReg, overlap implies that the variant has consequence regulatory_region_variant. - * For some special-cased sources e.g. dbNsfp files, column may be NULL/ignored. */ -{ -struct annoFormatVep *self = (struct annoFormatVep *)fSelf; -struct annoFormatVepExtraSource *src; -for (src = self->config->extraSources; src != NULL; src = src->next) - if (src->source == extraSource) - break; -if (src == NULL) - { - AllocVar(src); - src->source = extraSource; - if (extraSource->rowType == arWigVec) - src->printExtra = afVepPrintExtraWigVec; - else if (extraSource->rowType == arWigSingle) - src->printExtra = afVepPrintExtraWigSingle; - else - src->printExtra = afVepPrintExtraWords; - slAddTail(&(self->config->extraSources), src); - } -struct annoFormatVepExtraItem *item; -AllocVar(item); -item->tag = cloneString(tag); -item->description = cloneString(description); -item->rowIx = -1; -if (isNotEmpty(column)) - item->rowIx = asColumnMustFindIx(extraSource->asObj->columnList, column); -item->isRegulatory = isReg; -item->isBoolean = isBoolean; -slAddTail(&(src->items), item); -} - -void annoFormatVepAddExtraItem(struct annoFormatter *fSelf, struct annoStreamer *extraSource, - char *tag, char *description, char *column, boolean isBoolean) -/* Tell annoFormatVep that it should include the given column of extraSource - * in the EXTRAS column with tag. The VEP header will include tag's description. - * For some special-cased sources e.g. dbNsfp files, column may be NULL/ignored. - * If isBoolean, the column's description won't be output, only whether a match was found. */ -{ -afvAddExtraItemMaybeReg(fSelf, extraSource, tag, description, column, isBoolean, FALSE); -} - -void annoFormatVepAddRegulatory(struct annoFormatter *fSelf, struct annoStreamer *regSource, - char *tag, char *description, char *column) -/* Tell annoFormatVep to use the regulatory_region_variant consequence if there is overlap - * with regSource, and to include the given column of regSource in the EXTRAS column. - * The VEP header will include tag's description. */ -{ -afvAddExtraItemMaybeReg(fSelf, regSource, tag, description, column, FALSE, TRUE); -struct annoFormatVep *self = (struct annoFormatVep *)fSelf; -if (self->gSeqWin == NULL) - { - char *db = regSource->assembly->name; - self->gSeqWin = chromSeqWindowNew(db, NULL, 0, 0); - } -} - -void annoFormatVepSetHgvsOutOptions(struct annoFormatter *fSelf, uint hgvsOutOptions) -/* Import the HGVS output options described in hgHgvs.h */ -{ -struct annoFormatVep *self = (struct annoFormatVep *)fSelf; -// We only do g. terms, for regulatory region variants, so no need for transcript/protein opts. -if (hgvsOutOptions & HGVS_OUT_G) - self->hgvsMakeG = TRUE; -if (hgvsOutOptions & HGVS_OUT_BREAK_DELINS) - self->hgvsBreakDelIns = TRUE; -}