533112afe2a2005e80cdb1f82904ea65032d4302 braney Sat Oct 2 11:37:34 2021 -0700 split hg/lib into two separate libaries, one only used by the cgis diff --git src/hg/cgilib/annoGratorGpVar.c src/hg/cgilib/annoGratorGpVar.c new file mode 100644 index 0000000..ab8f230 --- /dev/null +++ src/hg/cgilib/annoGratorGpVar.c @@ -0,0 +1,861 @@ +/* annoGratorGpVar -- integrate pgSNP or VCF with gene pred and make gpFx predictions */ + +/* Copyright (C) 2014 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#include "annoGratorGpVar.h" +#include "annoStreamDbPslPlus.h" +#include "fa.h" +#include "genbank.h" +#include "genePred.h" +#include "hdb.h" +#include "hgHgvs.h" +#include "pgSnp.h" +#include "vcf.h" +#include "variant.h" +#include "gpFx.h" +#include "seqWindow.h" +#include "trackHub.h" +#include "twoBit.h" +#include "variantProjector.h" +#include "annoGratorQuery.h" + +struct annoGratorGpVar +{ + struct annoGrator grator; // external annoGrator/annoStreamer interface + struct lm *lm; // localmem scratch storage + struct dyString *dyScratch; // dyString for local temporary use + struct annoGratorGpVarFuncFilter *funcFilter; // Which categories of effect should we output? + enum annoGratorOverlap gpVarOverlapRule; // Should we set RJFail if no overlap? + struct seqWindow *gSeqWin; // means of fetching genomic sequence for HGVS term generation + boolean hgvsMakeG; // Generate genomic (g.) HGVS terms only if this is set + boolean hgvsMakeCN; // Generate transcript (n. / c.) HGVS terms only if this is set + boolean hgvsMakeP; // Generate protein (p.) HGVS terms only if this is set + boolean hgvsAddParensToP; // Add parentheses around predicted protein changes (strict HGVS) + boolean hgvsBreakDelIns; // Include deleted sequence (not only ins) e.g. delGGinsAT + boolean sourceHasFrames; // True if transcript annotations include exonFrames column + boolean sourceIsBigGenePred;// True if transcript annotations are from bigGenePred + boolean sourceIsPslPlus; // True if transcript annotations are PSL+CDS+seq info + boolean protLookupInited; // For looking up protein accessions in genePred-derived HGVS p. + char *protLookupTable; // " + struct hash *protLookupHash;// " + + struct variant *(*variantFromRow)(struct annoGratorGpVar *self, struct annoRow *row, + char *refAllele); + // Translate row from whatever format it is (pgSnp or VCF) into generic variant. + }; + + +static char *aggvAutoSqlStringStart = +"table genePredWithSO" +"\"genePred with Sequence Ontology annotation\"" +"("; + +static char *aggvAutoSqlStringEnd = +"string allele; \"Allele used to predict functional effect\"" +"string transcript; \"ID of affected transcript\"" +"uint soNumber; \"Sequence Ontology Number \"" +"uint detailType; \"gpFx detail type (1=codingChange, 2=nonCodingExon, 3=intron)\"" +"string detail0; \"detail column (per detailType) 0\"" +"string detail1; \"detail column (per detailType) 1\"" +"string detail2; \"detail column (per detailType) 2\"" +"string detail3; \"detail column (per detailType) 3\"" +"string detail4; \"detail column (per detailType) 4\"" +"string detail5; \"detail column (per detailType) 5\"" +"string detail6; \"detail column (per detailType) 6\"" +"string detail7; \"detail column (per detailType) 7\"" +"string detail8; \"detail column (per detailType) 8\"" +"string detail9; \"detail column (per detailType) 9\"" +"string detail10; \"detail column (per detailType) 10\"" +"string hgvsG; \"HGVS g. term\"" +"string hgvsCN; \"HGVS c. or n. term\"" +"string hgvsP; \"HGVS p. term\"" +")"; + +struct asObject *annoGpVarAsObj(struct asObject *sourceAsObj) +// Return asObject describing fields of internal source plus the fields we add here. +{ +struct dyString *gpPlusGpFx = dyStringCreate("%s", aggvAutoSqlStringStart); +// Translate each column back into autoSql text for combining with new output fields: +struct asColumn *col; +for (col = sourceAsObj->columnList; col != NULL; col = col->next) + { + if (col->fixedSize) + dyStringPrintf(gpPlusGpFx, "%s[%d]\t%s;\t\"%s\"", + col->lowType->name, col->fixedSize, col->name, col->comment); + else if (col->isArray || col->isList) + { + if (col->linkedSizeName) + dyStringPrintf(gpPlusGpFx, "%s[%s]\t%s;\t\"%s\"", + col->lowType->name, col->linkedSizeName, col->name, col->comment); + else + errAbort("Neither col->fixedSize nor col->linkedSizeName given for " + "asObj %s column '%s'", + sourceAsObj->name, col->name); + } + else + dyStringPrintf(gpPlusGpFx, "%s\t%s;\t\"%s\"", col->lowType->name, col->name, col->comment); + } +dyStringAppend(gpPlusGpFx, aggvAutoSqlStringEnd); +struct asObject *asObj = asParseText(gpPlusGpFx->string); +dyStringFree(&gpPlusGpFx); +return asObj; +} + +static boolean passesFilter(struct annoGratorGpVar *self, struct gpFx *gpFx) +/* Based on type of effect, should we print this one? */ +{ +struct annoGratorGpVarFuncFilter *filt = self->funcFilter; +enum soTerm term = gpFx->soNumber; +if (term == NMD_transcript_variant) + // This one gets special treatment because gpFx->detailType might still be codingChange: + return filt->nmdTranscript; +if (filt->intron && (term == intron_variant || term == complex_transcript_variant)) + return TRUE; +if (filt->upDownstream && (term == upstream_gene_variant || term == downstream_gene_variant)) + return TRUE; +if (filt->exonLoss && (term == exon_loss_variant)) + return TRUE; +if ((filt->exonLoss || filt->cdsNonSyn) && term == transcript_ablation) + return TRUE; +if (filt->utr && (term == _5_prime_UTR_variant || term == _3_prime_UTR_variant)) + return TRUE; +if (filt->cdsSyn && term == synonymous_variant) + return TRUE; +if (filt->cdsNonSyn && term != synonymous_variant + && (gpFx->detailType == codingChange || term == complex_transcript_variant)) + return TRUE; +if (filt->nonCodingExon && term == non_coding_transcript_exon_variant) + return TRUE; +if (filt->splice && (term == splice_donor_variant || term == splice_acceptor_variant || + term == splice_region_variant)) + return TRUE; +if (filt->noVariation && term == no_sequence_alteration) + return TRUE; +return FALSE; +} + +static char *blankIfNull(char *input) +{ +if (input == NULL) + return ""; + +return input; +} + +static char *uintToString(struct lm *lm, uint num) +{ +char buffer[10]; + +safef(buffer,sizeof buffer, "%d", num); +return lmCloneString(lm, buffer); +} + +static void aggvStringifyGpFx(char **words, struct gpFx *effect, struct lm *lm) +// turn gpFx structure into array of words +{ +int count = 0; + +words[count++] = lmCloneString(lm, effect->gAllele); +words[count++] = lmCloneString(lm, blankIfNull(effect->transcript)); +words[count++] = uintToString(lm, effect->soNumber); +words[count++] = uintToString(lm, effect->detailType); +int gpFxNumCols = 4; + +if (effect->detailType == intron) + { + words[count++] = uintToString(lm, effect->details.intron.intronNumber); + words[count++] = uintToString(lm, effect->details.intron.intronCount); + } +else if (effect->detailType == nonCodingExon) + { + words[count++] = uintToString(lm, effect->details.nonCodingExon.exonNumber); + words[count++] = uintToString(lm, effect->details.nonCodingExon.exonCount); + words[count++] = uintToString(lm, effect->details.nonCodingExon.cDnaPosition); + words[count++] = lmCloneString(lm, effect->details.nonCodingExon.txRef); + words[count++] = lmCloneString(lm, effect->details.nonCodingExon.txAlt); + } +else if (effect->detailType == codingChange) + { + struct codingChange *cc = &(effect->details.codingChange); + words[count++] = uintToString(lm, cc->exonNumber); + words[count++] = uintToString(lm, cc->exonCount); + words[count++] = uintToString(lm, cc->cDnaPosition); + words[count++] = lmCloneString(lm, cc->txRef); + words[count++] = lmCloneString(lm, cc->txAlt); + words[count++] = uintToString(lm, cc->cdsPosition); + words[count++] = uintToString(lm, cc->pepPosition); + words[count++] = strUpper(lmCloneString(lm, blankIfNull(cc->aaOld))); + words[count++] = strUpper(lmCloneString(lm, blankIfNull(cc->aaNew))); + words[count++] = strUpper(lmCloneString(lm, blankIfNull(cc->codonOld))); + words[count++] = strUpper(lmCloneString(lm, blankIfNull(cc->codonNew))); + } +else if (effect->detailType != none) + errAbort("annoGratorGpVar: unknown effect type %d", effect->detailType); + +// Add max number of columns added in any if clause above +gpFxNumCols += 9; + +while (count < gpFxNumCols) + words[count++] = ""; +} + +struct gpFx *annoGratorGpVarGpFxFromRow(struct annoStreamer *sSelf, struct annoRow *row, + struct lm *lm) +// Turn the string array representation back into a real gpFx. +// I know this is inefficient and am thinking about a better way. +{ +if (row == NULL) + return NULL; +struct gpFx *effect; +lmAllocVar(lm, effect); +struct annoGrator *gSelf = (struct annoGrator *)sSelf; +// get gpFx words which follow internal source's words: +char **words = (char **)(row->data); +int count = gSelf->mySource->numCols; + +effect->gAllele = lmCloneString(lm, words[count++]); +effect->transcript = lmCloneString(lm, words[count++]); +effect->soNumber = atol(words[count++]); +effect->detailType = atol(words[count++]); + +if (effect->detailType == intron) + { + effect->details.intron.intronNumber = atol(words[count++]); + effect->details.intron.intronCount = atol(words[count++]); + } +else if (effect->detailType == nonCodingExon) + { + effect->details.nonCodingExon.exonNumber = atol(words[count++]); + effect->details.nonCodingExon.exonCount = atol(words[count++]); + effect->details.nonCodingExon.cDnaPosition = atol(words[count++]); + effect->details.nonCodingExon.txRef = lmCloneString(lm, words[count++]); + effect->details.nonCodingExon.txAlt = lmCloneString(lm, words[count++]); + } +else if (effect->detailType == codingChange) + { + effect->details.codingChange.exonNumber = atol(words[count++]); + effect->details.codingChange.exonCount = atol(words[count++]); + effect->details.codingChange.cDnaPosition = atol(words[count++]); + effect->details.codingChange.txRef = lmCloneString(lm, words[count++]); + effect->details.codingChange.txAlt = lmCloneString(lm, words[count++]); + effect->details.codingChange.cdsPosition = atol(words[count++]); + effect->details.codingChange.pepPosition = atol(words[count++]); + effect->details.codingChange.aaOld = lmCloneString(lm, words[count++]); + effect->details.codingChange.aaNew = lmCloneString(lm, words[count++]); + effect->details.codingChange.codonOld = lmCloneString(lm, words[count++]); + effect->details.codingChange.codonNew = lmCloneString(lm, words[count++]); + } +else if (effect->detailType != none) + errAbort("annoGratorGpVar: unknown effect type %d", effect->detailType); +return effect; +} + +// Container for optional HGVS terms +struct hgvsTerms + { + char *hgvsG; // HGVS g. term or NULL + char *hgvsCN; // HGVS c. or n. term or NULL + char *hgvsP; // HGVS p. term or NULL + }; + +static void hgvsTermsFree(struct hgvsTerms *hgvs) +/* Free struct hgvsTerms and its members. */ +{ +if (hgvs) + { + freez(&hgvs->hgvsG); + freez(&hgvs->hgvsCN); + freez(&hgvs->hgvsP); + freeMem(hgvs); + } +} + +static struct annoRow *aggvEffectToRow(struct annoGratorGpVar *self, struct gpFx *effect, + struct annoRow *rowIn, struct hgvsTerms *hgvs, + struct lm *callerLm) +// convert a single genePred annoRow and gpFx record to an augmented genePred annoRow; +{ +struct annoGrator *gSelf = &(self->grator); +struct annoStreamer *sSelf = &(gSelf->streamer); +assert(sSelf->numCols > gSelf->mySource->numCols); + +char **wordsOut; +lmAllocArray(self->lm, wordsOut, sSelf->numCols); + +// copy the genePred fields over +int gpColCount = gSelf->mySource->numCols; +char **wordsIn = (char **)rowIn->data; +memcpy(wordsOut, wordsIn, sizeof(char *) * gpColCount); + +// stringify the gpFx structure +aggvStringifyGpFx(&wordsOut[gpColCount], effect, callerLm); + +// Final columns: optional HGVS terms +wordsOut[sSelf->numCols-3] = lmCloneString(callerLm, hgvs && hgvs->hgvsG ? hgvs->hgvsG : ""); +wordsOut[sSelf->numCols-2] = lmCloneString(callerLm, hgvs && hgvs->hgvsCN ? hgvs->hgvsCN : ""); +wordsOut[sSelf->numCols-1] = lmCloneString(callerLm, hgvs && hgvs->hgvsP ? hgvs->hgvsP : ""); + +return annoRowFromStringArray(rowIn->chrom, rowIn->start, rowIn->end, rowIn->rightJoinFail, + wordsOut, sSelf->numCols, callerLm); +} + +struct dnaSeq *genePredToGenomicSequence(struct genePred *pred, struct annoAssembly *aa, + struct lm *lm) +/* Return concatenated genomic sequence of exons of pred. */ +{ +int txLen = 0; +int i; +for (i=0; i < pred->exonCount; i++) + txLen += (pred->exonEnds[i] - pred->exonStarts[i]); +char *seq = lmAlloc(lm, txLen + 1); +int offset = 0; +for (i=0; i < pred->exonCount; i++) + { + int exonStart = pred->exonStarts[i]; + int exonEnd = pred->exonEnds[i]; + annoAssemblyGetSeq(aa, pred->chrom, exonStart, exonEnd, seq+offset, txLen+1-offset); + offset += (exonEnd - exonStart); + } +if(pred->strand[0] == '-') + reverseComplement(seq, txLen); +struct dnaSeq *txSeq = NULL; +lmAllocVar(lm, txSeq); +txSeq->name = lmCloneString(lm, pred->name); +txSeq->dna = seq; +txSeq->size = txLen; +return txSeq; +} + +struct dnaSeq *translateTx(struct dnaSeq *txSeq, struct genbankCds *cds) +/* Translate txSeq into protein sequence, including 'X' for stop codon if present. */ +{ +struct dnaSeq *protSeq = translateSeq(txSeq, cds->start, FALSE); +aaSeqZToX(protSeq); +return protSeq; +} + +static struct udcFile *getCachedUdcFile(char *fileOrUrl) +/* Return an open UDC file handle for fileOrUrl; cache open connections. errAbort if can't open. */ +{ +static struct hash *hash = NULL; +if (hash == NULL) + hash = hashNew(0); +struct udcFile *udcf = hashFindVal(hash, fileOrUrl); +if (udcf == NULL) + { + char *realFileOrUrl = hReplaceGbdb(fileOrUrl); + udcf = udcFileOpen(realFileOrUrl, NULL); + hashAdd(hash, fileOrUrl, udcf); + freeMem(realFileOrUrl); + } +return udcf; +} + +static struct dnaSeq *getCachedSeq(char *fileOrUrl, off_t offset, size_t size, boolean isDna) +/* Parse FASTA in file fileOrUrl at offset and return a cached dnaSeq. */ +{ +static struct hash *hash = NULL; +if (hash == NULL) + hash = hashNew(0); +char key[4096]; +safef(key, sizeof(key), "%s_%lld_%lld", fileOrUrl, (long long)offset, (long long)size); +struct dnaSeq *seq = hashFindVal(hash, key); +if (seq == NULL) + { + char *buf = needMem(size+1); + struct udcFile *udcf = getCachedUdcFile(fileOrUrl); + udcSeek(udcf, offset); + udcRead(udcf, buf, size); + seq = faSeqFromMemText(buf, isDna); + toUpperN(seq->dna, seq->size); + hashAdd(hash, key, seq); + } +return seq; +} + +struct dnaSeq *getProtSeq(struct annoGratorGpVar *self, char *protAcc, + struct dnaSeq *txSeq, struct genbankCds *cds) +/* If protAcc is NULL, translate txSeq+cds; else look up protAcc. */ +{ +static struct hash *hash = NULL; +if (hash == NULL) + hash = hashNew(0); +struct dnaSeq *accSeq = NULL; +if (isEmpty(protAcc)) + { + accSeq = hashFindVal(hash, txSeq->name); + if (accSeq == NULL) + { + accSeq = translateTx(txSeq, cds); + hashAdd(hash, txSeq->name, accSeq); + } + } +else + { + accSeq = hashFindVal(hash, protAcc); + char *db = self->grator.streamer.assembly->name; + char query[1024]; + struct sqlConnection *conn = hAllocConn(db); + //#*** Using presence or absence of dot-version is an ugly hack, but it will do for now; + //#*** otherwise there's new config to pass in & store... + if (strchr(protAcc, '.')) + { + // Use ncbiRefSeqPepTable + sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable where name = '%s'", + protAcc); + char *seq = sqlQuickString(conn, query); + if (seq) + accSeq = newDnaSeq(seq, strlen(seq), protAcc); + } + else + { + // Get GenBank versioned acc and seq + sqlSafef(query, sizeof(query), + "select path,file_offset,file_size from %s,%s " + "where acc = '%s' and gbExtFile.id = gbExtFile", + gbSeqTable, gbExtFileTable, protAcc); + char **row; + struct sqlResult *sr = sqlGetResult(conn, query); + if ((row = sqlNextRow(sr)) != NULL) + accSeq = getCachedSeq(row[0], atoll(row[1]), atoll(row[2]), FALSE); + sqlFreeResult(&sr); + } + if (accSeq == NULL) + // Store a dnaSeq with NULL name and seq so we don't waste time sql querying this again. + accSeq = newDnaSeq(NULL, 0, NULL); + hashAdd(hash, protAcc, accSeq); + hFreeConn(&conn); + } +return (accSeq->name == NULL) ? NULL : accSeq; +} + +static struct hgvsTerms *getHgvsTerms(struct annoGratorGpVar *self, char *chromAcc, + struct psl *psl, struct genbankCds *cds, + struct dnaSeq *txSeq, struct dnaSeq *protSeq, + struct vpTx *vpTx, struct vpPep *vpPep) +/* Return HGVS terms for a variant allele projected onto the genome. */ +{ +struct hgvsTerms *hgvs; +AllocVar(hgvs); +struct bed3 variantBed; +variantBed.chrom = psl->tName; +variantBed.chromStart = min(vpTx->start.gOffset, vpTx->end.gOffset); +variantBed.chromEnd = max(vpTx->start.gOffset, vpTx->end.gOffset); +if (self->hgvsMakeG) + { + int gAltLen = strlen(vpTx->gAlt); + char gAlt[gAltLen+1]; + safecpy(gAlt, sizeof(gAlt), vpTx->gAlt); + if (pslQStrand(psl) == '-') + reverseComplement(gAlt, gAltLen); + hgvs->hgvsG = hgvsGFromVariant(self->gSeqWin, &variantBed, gAlt, chromAcc, + self->hgvsBreakDelIns); + } +if ((self->hgvsMakeCN || self->hgvsMakeP) && txSeq) + { + if (cds->start != cds->end && cds->start >= 0) + { + if (self->hgvsMakeCN) + hgvs->hgvsCN = hgvsCFromVpTx(vpTx, self->gSeqWin, psl, cds, txSeq, + self->hgvsBreakDelIns); + if (self->hgvsMakeP) + { + hgvs->hgvsP = hgvsPFromVpPep(vpPep, protSeq, self->hgvsAddParensToP); + } + } + else if (self->hgvsMakeCN) + hgvs->hgvsCN = hgvsNFromVpTx(vpTx, self->gSeqWin, psl, txSeq, + self->hgvsBreakDelIns); + } +return hgvs; +} + + +static struct annoRow *aggvPredict(struct annoGratorGpVar *self, struct variant *variant, + struct psl *psl, struct genbankCds *cds, + struct dnaSeq *txSeq, struct dnaSeq *protSeq, + struct annoRow *inRow, struct lm *callerLm) +// Project variant through psl onto transcript and predict functional effects. +{ +struct annoRow *rowList = NULL; +char *db = self->grator.streamer.assembly->name; +char *chromAcc = self->hgvsMakeG ? hRefSeqAccForChrom(db, variant->chrom) : NULL; +struct bed3 *variantBed = (struct bed3 *)variant; +vpExpandIndelGaps(psl, self->gSeqWin, txSeq); +struct allele *allele; +for (allele = variant->alleles; allele != NULL; allele = allele->next) + { + char *alt = allele->sequence; + struct vpTx *vpTx = vpGenomicToTranscript(self->gSeqWin, variantBed, alt, psl, txSeq); + if (!allele->isReference || vpTx->genomeMismatch) + { + struct vpPep *vpPep = vpTranscriptToProtein(vpTx, cds, txSeq, protSeq); + struct hgvsTerms *hgvs = getHgvsTerms(self, chromAcc, psl, cds, txSeq, protSeq, + vpTx, vpPep); + struct gpFx *fxList = vpTranscriptToGpFx(vpTx, psl, cds, txSeq, vpPep, protSeq, self->lm); + struct annoRow *rows = NULL; + struct gpFx *fx; + for(fx = fxList; fx != NULL; fx = fx->next) + { + // Intergenic variants never pass through here so we don't have to filter them out + // here if self->funcFilter is null. + if (self->funcFilter == NULL || passesFilter(self, fx)) + { + // Restore the original variant's alt allele + fx->gAllele = lmCloneString(self->lm, allele->sequence); + slAddHead(&rows, aggvEffectToRow(self, fx, inRow, hgvs, callerLm)); + } + } + slReverse(&rows); + rowList = slCat(rows, rowList); + hgvsTermsFree(hgvs); + vpPepFree(&vpPep); + } + vpTxFree(&vpTx); + } +freeMem(chromAcc); +return rowList; +} + +static void lookupProtAcc(struct annoGratorGpVar *self, struct dnaSeq *protSeq) +/* For Gencode, try to find the ENSP* ID associated with the ENST* ID. */ +{ +char *db = self->grator.streamer.assembly->name; +if (! self->protLookupInited) + { + char *sourceName = self->grator.streamer.name; + if (strstr(sourceName, "wgEncodeGencode")) + { + char *version = strrchr(sourceName, 'V'); + if (version) + { + if (!trackHubDatabase(db)) + { + char attrsTable[512]; + safef(attrsTable, sizeof(attrsTable), "wgEncodeGencodeAttrs%s", version); + if (hTableExists(db, attrsTable) && hHasField(db, attrsTable, "proteinId")) + { + self->protLookupHash = hashNew(0); + self->protLookupTable = cloneString(attrsTable); + } + } + } + } + self->protLookupInited = TRUE; + } +if (self->protLookupHash != NULL) + { + char *txAcc = protSeq->name; + struct hashEl *hel = hashLookup(self->protLookupHash, txAcc); + if (hel == NULL) + { + struct sqlConnection *conn = hAllocConn(db); + char query[2048]; + sqlSafef(query, sizeof(query), "select proteinId from %s where transcriptId = '%s'", + self->protLookupTable, txAcc); + char *protAcc = sqlQuickString(conn, query); + hel = hashAdd(self->protLookupHash, txAcc, protAcc); + hFreeConn(&conn); + } + if (hel->val != NULL) + { + freeMem(protSeq->name); + protSeq->name = cloneString(hel->val); + } + } +} + +static struct annoRow *aggvGenRowsGp(struct annoGratorGpVar *self, struct variant *variant, + struct genePred *pred, struct annoRow *inRow, + struct lm *callerLm) +// put out annoRows for all the gpFx that arise from variant and pred +{ +struct annoStreamer *sSelf = &(self->grator.streamer); +struct genbankCds cds; +genePredToCds(pred, &cds); +struct dnaSeq *txSeq = genePredToGenomicSequence(pred, sSelf->assembly, self->lm); +int chromSize = 0; // unused +struct psl *psl = genePredToPsl(pred, chromSize, txSeq->size); +pslRemoveFrameShifts(psl); +vpExpandIndelGaps(psl, self->gSeqWin, txSeq); +struct dnaSeq *protSeq = NULL; +if (cds.end > cds.start) + { + //#*** what if cds.startComplete is not true?? + protSeq = translateTx(txSeq, &cds); + lookupProtAcc(self, protSeq); + } +struct annoRow *rowList = aggvPredict(self, variant, psl, &cds, txSeq, protSeq, inRow, callerLm); +pslFree(&psl); +dnaSeqFree(&protSeq); +return rowList; +} + +static struct annoRow *aggvGenRowsPsl(struct annoGratorGpVar *self, struct variant *variant, + struct annoRow *inRow, struct lm *callerLm) +// put out annoRows for all the gpFx that arise from variant and transcript psl+ +{ +char **ppWords = inRow->data; +struct psl *psl = pslLoad(ppWords); +struct genbankCds cds; +genbankCdsParse(ppWords[PSLPLUS_CDS_IX], &cds); +struct dnaSeq *txSeq = getCachedSeq(ppWords[PSLPLUS_PATH_IX], atoll(ppWords[PSLPLUS_FILEOFFSET_IX]), + atoll(ppWords[PSLPLUS_FILESIZE_IX]), TRUE); +struct dnaSeq *protSeq = getProtSeq(self, ppWords[PSLPLUS_PROTACC_IX], txSeq, &cds); +struct annoRow *rowList = aggvPredict(self, variant, psl, &cds, txSeq, protSeq, inRow, callerLm); +pslFree(&psl); +return rowList; +} + +struct annoRow *aggvGenelessRow(struct annoGratorGpVar *self, struct variant *variant, + boolean rjFail, struct lm *callerLm) +/* If intergenic variants (no overlapping or nearby genes) are to be included in output, + * make an output row with empty genePred and a gpFx that is empty except for soNumber. */ +{ +struct annoGrator *gSelf = &(self->grator); +struct annoStreamer *sSelf = &(gSelf->streamer); +char **wordsOut; +lmAllocArray(self->lm, wordsOut, sSelf->numCols); +// Add empty strings for genePred string columns: +int gpColCount = gSelf->mySource->numCols; +int i; +for (i = 0; i < gpColCount; i++) + wordsOut[i] = ""; +struct gpFx *gpFx; +lmAllocVar(self->lm, gpFx); +enum soTerm term = hasAltAllele(variant->alleles) ? intergenic_variant : no_sequence_alteration; +if (term == no_sequence_alteration) + gpFx->gAllele = variant->alleles->sequence; +else + gpFx->gAllele = firstAltAllele(variant->alleles); +if (isAllNt(gpFx->gAllele, strlen(gpFx->gAllele))) + touppers(gpFx->gAllele); +gpFx->soNumber = term; +gpFx->detailType = none; +aggvStringifyGpFx(&wordsOut[gpColCount], gpFx, self->lm); +if (self->hgvsMakeG) + { + // Add HGVS genomic term + struct bed3 *variantBed = (struct bed3 *)variant; + char *chromAcc = hRefSeqAccForChrom(sSelf->assembly->name, variant->chrom); + char *hgvsG = hgvsGFromVariant(self->gSeqWin, variantBed, gpFx->gAllele, chromAcc, + self->hgvsBreakDelIns); + wordsOut[sSelf->numCols-3] = lmCloneString(callerLm, hgvsG); + freeMem(chromAcc); + freeMem(hgvsG); + } +return annoRowFromStringArray(variant->chrom, variant->chromStart, variant->chromEnd, rjFail, + wordsOut, sSelf->numCols, callerLm); +} + +static struct variant *variantFromPgSnpTableRow(struct annoGratorGpVar *self, struct annoRow *row, + char *refAllele) +/* Translate pgSnp array of words into variant. */ +{ +return variantFromPgSnpAnnoRow(row, refAllele, TRUE, self->lm); +} + +static struct variant *variantFromPgSnpFileRow(struct annoGratorGpVar *self, struct annoRow *row, + char *refAllele) +/* Translate pgSnp array of words into variant. */ +{ +return variantFromPgSnpAnnoRow(row, refAllele, FALSE, self->lm); +} + +static struct variant *variantFromVcfRow(struct annoGratorGpVar *self, struct annoRow *row, + char *refAllele) +/* Translate vcf array of words into variant. */ +{ +return variantFromVcfAnnoRow(row, refAllele, self->lm, self->dyScratch); +} + +static void setVariantFromRow(struct annoGratorGpVar *self, struct annoStreamRows *primaryData) +/* Depending on autoSql definition of primary source, choose a function to translate + * incoming rows into generic variants. */ +{ +if (asObjectsMatch(primaryData->streamer->asObj, pgSnpAsObj())) + self->variantFromRow = variantFromPgSnpTableRow; +else if (asObjectsMatch(primaryData->streamer->asObj, pgSnpFileAsObj())) + self->variantFromRow = variantFromPgSnpFileRow; +else if (asObjectsMatch(primaryData->streamer->asObj, vcfAsObj())) + self->variantFromRow = variantFromVcfRow; +} + +struct annoRow *annoGratorGpVarIntegrate(struct annoGrator *gSelf, + struct annoStreamRows *primaryData, + boolean *retRJFilterFailed, struct lm *callerLm) +// integrate a variant and a genePred, generate as many rows as +// needed to capture all the changes +{ +struct annoGratorGpVar *self = (struct annoGratorGpVar *)gSelf; +struct annoStreamer *sSelf = &(gSelf->streamer); +lmCleanup(&(self->lm)); +self->lm = lmInit(0); +if (self->variantFromRow == NULL) + setVariantFromRow(self, primaryData); +// TODO Performance improvement: instead of creating the transcript sequence for each +// variant that intersects the transcript, cache transcript sequence; possibly +// an slPair with a concatenation of {chrom, txStart, txEnd, cdsStart, cdsEnd, +// exonStarts, exonEnds} as the name, and sequence as the val. When something in +// the list is no longer in the list of rows from the internal annoGratorIntegrate call, +// drop it. +// BETTER YET: make a callback for gpFx to get CDS sequence only when it needs it. +struct annoRow *primaryRow = primaryData->rowList; +int refAlBufSize = primaryRow->end - primaryRow->start + 1; +char refAllele[refAlBufSize]; +annoAssemblyGetSeq(sSelf->assembly, primaryRow->chrom, primaryRow->start, primaryRow->end, + refAllele, sizeof(refAllele)); +struct variant *variant = self->variantFromRow(self, primaryRow, refAllele); + +// Temporarily tweak primaryRow's start and end to find upstream/downstream overlap: +int pStart = primaryRow->start, pEnd = primaryRow->end; +if (primaryRow->start <= GPRANGE) + primaryRow->start = 0; +else + primaryRow->start -= GPRANGE; +primaryRow->end += GPRANGE; +struct annoRow *rows = annoGratorIntegrate(gSelf, primaryData, retRJFilterFailed, self->lm); +primaryRow->start = pStart; +primaryRow->end = pEnd; + +if (rows == NULL) + { + // No genePreds means that the primary variant is intergenic. + if ((self->funcFilter == NULL || self->funcFilter->intergenic)) + return aggvGenelessRow(self, variant, *retRJFilterFailed, callerLm); + else if (retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap) + *retRJFilterFailed = TRUE; + return NULL; + } +if (retRJFilterFailed && *retRJFilterFailed) + return NULL; + +struct annoRow *outRows = NULL; + +for(; rows; rows = rows->next) + { + struct annoRow *outRow = NULL; + if (self->sourceIsPslPlus) + { + outRow = aggvGenRowsPsl(self, variant, rows, callerLm); + } + else + { + struct genePred *gp; + if (self->sourceIsBigGenePred) + gp = (struct genePred *)genePredFromBigGenePredRow(rows->data); + else + { + // work around genePredLoad's trashing its input + char **inWords = rows->data; + char *saveExonStarts = lmCloneString(self->lm, inWords[8]); + char *saveExonEnds = lmCloneString(self->lm, inWords[9]); + gp = self->sourceHasFrames ? genePredExtLoad(inWords, GENEPREDX_NUM_COLS) : + genePredLoad(inWords); + inWords[8] = saveExonStarts; + inWords[9] = saveExonEnds; + } + outRow = aggvGenRowsGp(self, variant, gp, rows, callerLm); + genePredFree(&gp); + } + if (outRow != NULL) + { + slReverse(&outRow); + outRows = slCat(outRow, outRows); + } + } +slReverse(&outRows); +// If all rows failed the filter, and we must overlap, set *retRJFilterFailed. +if (outRows == NULL && retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap) + *retRJFilterFailed = TRUE; +return outRows; +} + +static void aggvSetOverlapRule(struct annoGrator *gSelf, enum annoGratorOverlap rule) +/* We need an overlap rule that is independent of the genePred integration because + * if we're including intergenic variants, then we return a row even though no genePred + * rows overlap. */ +{ +struct annoGratorGpVar *self = (struct annoGratorGpVar *)gSelf; +self->gpVarOverlapRule = rule; +// For mustOverlap, if we're including intergenic variants in output, don't let simple +// integration set retRjFilterFailed: +if (rule == agoMustOverlap && self->funcFilter != NULL && self->funcFilter->intergenic) + gSelf->overlapRule = agoNoConstraint; +else + gSelf->overlapRule = rule; +} + +void aggvClose(struct annoStreamer **pSSelf) +/* Close out localmem and all the usual annoGrator stuff. */ +{ +if (*pSSelf != NULL) + { + struct annoGratorGpVar *self = (struct annoGratorGpVar *)(*pSSelf); + lmCleanup(&(self->lm)); + dyStringFree(&(self->dyScratch)); + freez(&self->protLookupTable); + hashFree(&self->protLookupHash); + annoGratorClose(pSSelf); + } +} + +struct annoGrator *annoGratorGpVarNew(struct annoStreamer *mySource) +/* Make a subclass of annoGrator that combines genePreds from mySource with + * pgSnp rows from primary source to predict functional effects of variants + * on genes. + * mySource becomes property of the new annoGrator (don't close it, close annoGrator). */ +{ +struct annoGratorGpVar *self; +AllocVar(self); +struct annoGrator *gSelf = &(self->grator); +annoGratorInit(gSelf, mySource); +struct annoStreamer *sSelf = &(gSelf->streamer); +// We add columns beyond what comes from mySource, so update our public-facing asObject: +annoGratorSetAutoSqlObject(sSelf, annoGpVarAsObj(mySource->asObj)); +gSelf->setOverlapRule = aggvSetOverlapRule; +sSelf->close = aggvClose; +// integrate by adding gpFx fields +gSelf->integrate = annoGratorGpVarIntegrate; +self->dyScratch = dyStringNew(0); +self->sourceHasFrames = (asColumnFindIx(mySource->asObj->columnList, "exonFrames") >= 0); +self->sourceIsBigGenePred = (asColumnFindIx(mySource->asObj->columnList, "chromStart") >= 0); +self->sourceIsPslPlus = (asColumnFindIx(mySource->asObj->columnList, "tStart") >= 0); +char *db = sSelf->assembly->name; +self->gSeqWin = chromSeqWindowNew(db, NULL, 0, 0); + +return gSelf; +} + +void annoGratorGpVarSetFuncFilter(struct annoGrator *gSelf, + struct annoGratorGpVarFuncFilter *funcFilter) +/* If funcFilter is non-NULL, it specifies which functional categories + * to include in output; if NULL, by default intergenic variants are excluded and + * all other categories are included. + * NOTE: This calls gSelf->setOverlapRule() with the currently set overlap rule because + * overlapRule is affected by filter settings. */ +{ +struct annoGratorGpVar *self = (struct annoGratorGpVar *)gSelf; +freez(&self->funcFilter); +if (funcFilter != NULL) + self->funcFilter = CloneVar(funcFilter); +// Since our overlapRule behavior depends on filter settings, reevaluate: +gSelf->setOverlapRule(gSelf, self->gpVarOverlapRule); +} + +void annoGratorGpVarSetHgvsOutOptions(struct annoGrator *gSelf, uint hgvsOutOptions) +/* Import the HGVS output options described in hgHgvs.h */ +{ +struct annoGratorGpVar *self = (struct annoGratorGpVar *)gSelf; +if (hgvsOutOptions & HGVS_OUT_G) + self->hgvsMakeG = TRUE; +if (hgvsOutOptions & HGVS_OUT_CN) + self->hgvsMakeCN = TRUE; +if (hgvsOutOptions & HGVS_OUT_P) + self->hgvsMakeP = TRUE; +if (hgvsOutOptions & HGVS_OUT_P_ADD_PARENS) + self->hgvsAddParensToP = TRUE; +if (hgvsOutOptions & HGVS_OUT_BREAK_DELINS) + self->hgvsBreakDelIns = TRUE; +}