0c56fe6958f347b5ab7070725114c7d4c216bc87 braney Mon Nov 13 15:19:40 2017 -0800 catch weird strings in position boxes diff --git src/hg/hgVai/hgVai.c src/hg/hgVai/hgVai.c index e908f95..4bfda99 100644 --- src/hg/hgVai/hgVai.c +++ src/hg/hgVai/hgVai.c @@ -1,2971 +1,2972 @@ /* hgVai - Variant Annotation Integrator. */ /* Copyright (C) 2014 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "jksql.h" #include "htmshell.h" #include "web.h" #include "cheapcgi.h" #include "cart.h" #include "cartTrackDb.h" #include "genbank.h" #include "hgConfig.h" #include "hgHgvs.h" #include "hui.h" #include "grp.h" #include "hCommon.h" #include "hgFind.h" #include "hPrint.h" #include "jsHelper.h" #include "memalloc.h" #include "textOut.h" #include "trackHub.h" #include "hubConnect.h" #include "twoBit.h" #include "gpFx.h" #include "bigGenePred.h" #include "udc.h" #include "knetUdc.h" #include "md5.h" #include "regexHelper.h" #include "hAnno.h" #include "annoGratorQuery.h" #include "annoGratorGpVar.h" #include "annoFormatVep.h" #include "annoStreamBigBed.h" #include "annoStreamDb.h" +#include "windowsToAscii.h" #include "libifyMe.h" #define GENCODE_TAG_DOC_URL "\"http://www.gencodegenes.org/gencode_tags.html\"" #define GENCODE_BASIC_DOC_URL "\"http://www.gencodegenes.org/faq.html\"" #define REFSEQ_STATUS_DOC_URL "\"https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_status_codes\"" #define APPRIS_DOC_URL "\"http://appris.bioinfo.cnio.es/#/help/database\"" /* Global Variables */ struct cart *cart; /* CGI and other variables */ struct hash *oldVars = NULL; /* The cart before new cgi stuff added. */ char *genome = NULL; /* Name of genome - mouse, human, etc. */ char *database = NULL; /* Current genome database - hg17, mm5, etc. */ char *regionType = NULL; /* genome, ENCODE pilot regions, or specific position range. */ struct grp *fullGroupList = NULL; /* List of all groups. */ struct trackDb *fullTrackList = NULL; /* List of all tracks in database. */ static struct pipeline *compressPipeline = (struct pipeline *)NULL; // Null terminated list of CGI Variables we don't want to save permanently: char *excludeVars[] = {"Submit", "submit", "hgva_startQuery", NULL,}; #define hgvaRange "position" #define hgvaRegionType "hgva_regionType" #define hgvaRegionTypeEncode "encode" #define hgvaRegionTypeGenome "genome" #define hgvaRegionTypeRange "range" #define hgvaPositionContainer "positionContainer" #define hgvsTrashSubDir "hgv" // auto-generated variants as input #define hgvaSampleVariants "hgva_internally_generated_sample_variants" #define hgvaSampleVariantsLabel "Artificial Example Variants" // dbSNP rsIDs as variant inputs: #define hgvaUseVariantIds "hgva_useVariantIds" #define hgvaVariantIdsLabel "Variant Identifiers" #define hgvaVariantIds "hgva_variantIds" #define hgvaUseVariantFileOrUrl "hgva_useVariantFileOrUrl" #define hgvaVariantPasteContainer "variantPasteContainer" // A local file or URL as variant input: #define hgvaVariantFileOrUrlLabel "from file or URL" #define hgvaVariantFileOrUrl "hgva_variantFileOrUrl" #define hgvaVariantFileOrUrlType "hgva_variantFileOrUrlType" // HGVS terms as variant inputs: #define hgvaUseHgvs "hgva_useHgvs" #define hgvaHgvsLabel "HGVS terms" #define hgvaHgvs "hgva_hgvs" #define hgvaHgvsPasteContainer "hgvsPasteContainer" void addSomeCss() /*#*** This should go in a .css file of course. */ { printf("<style>\n" "div.sectionLite { border-width: 1px; border-color: #"HG_COL_BORDER"; border-style: solid;" " background-color: #"HG_COL_INSIDE"; padding-left: 10px; padding-right: 10px; " " padding-top: 8px; padding-bottom: 5px; margin-top: 5px; margin-bottom: 5px }\n" ".sectionLiteHeader { font-weight: bold; font-size:larger; color:#000000;" " text-align:left; vertical-align:bottom; white-space:nowrap; }\n" "div.sectionLiteHeader.noReorderRemove { padding-bottom:5px; }\n" "div.sourceFilter { padding-top: 5px; padding-bottom: 5px }\n" "</style>\n"); } INLINE void startCollapsibleSection(char *sectionSuffix, char *title, boolean onByDefault) // Wrap shared args to jsBeginCollapsibleSectionFontSize { jsBeginCollapsibleSectionFontSize(cart, "hgva", sectionSuffix, title, onByDefault, "1.1em"); } INLINE void startSmallCollapsibleSection(char *sectionSuffix, char *title, boolean onByDefault) // Wrap shared args to jsBeginCollapsibleSectionFontSize { jsBeginCollapsibleSectionFontSize(cart, "hgva", sectionSuffix, title, onByDefault, "0.9em"); } #define endCollapsibleSection jsEndCollapsibleSection static struct dyString *onChangeStart() /* Start up a javascript onChange command */ { struct dyString *dy = jsOnChangeStart(); jsTextCarryOver(dy, hgvaRegionType); jsTextCarryOver(dy, hgvaRange); return dy; } static char *onChangeClade() /* Return javascript executed when they change clade. */ { struct dyString *dy = onChangeStart(); jsDropDownCarryOver(dy, "clade"); dyStringAppend(dy, " document.hiddenForm.org.value=0;"); dyStringAppend(dy, " document.hiddenForm.db.value=0;"); dyStringAppend(dy, " document.hiddenForm." hgvaRange ".value='';"); return jsOnChangeEnd(&dy); } static char *onChangeOrg() /* Return javascript executed when they change organism. */ { struct dyString *dy = onChangeStart(); jsDropDownCarryOver(dy, "clade"); jsDropDownCarryOver(dy, "org"); dyStringAppend(dy, " document.hiddenForm.db.value=0;"); dyStringAppend(dy, " document.hiddenForm." hgvaRange ".value='';"); return jsOnChangeEnd(&dy); } static char *onChangeDb() /* Return javascript executed when they change database. */ { struct dyString *dy = onChangeStart(); jsDropDownCarryOver(dy, "clade"); jsDropDownCarryOver(dy, "db"); dyStringAppend(dy, " document.hiddenForm." hgvaRange ".value='';"); return jsOnChangeEnd(&dy); } INLINE void printOption(char *val, char *selectedVal, char *label) /* For rolling our own select without having to build conditional arrays/lists. */ { printf("<OPTION VALUE='%s'%s>%s\n", val, (sameString(selectedVal, val) ? " SELECTED" : ""), label); } void printRegionListHtml(char *db) /* Make a dropdown choice of region type, with position input box that appears if needed. * Unlike hgTables, don't bother with ENCODE pilot regions -- unless someone misses it. * Return the selected region type. */ { printf("<SELECT ID='"hgvaRegionType"' NAME='"hgvaRegionType"'>\n"); jsOnEventById("change", hgvaRegionType, "hgva.changeRegion();"); printOption(hgvaRegionTypeGenome, regionType, "genome"); printOption(hgvaRegionTypeRange, regionType, "position or search term"); printf("</SELECT>"); } void topLabelSpansStart(char *label) { printf("<span style='display: inline-block; padding-right: 5px;'>" "<span style='display: block;'>%s</span>\n" "<span style='display: block; padding-bottom: 5px;'>\n", label); } void topLabelSpansEnd() { printf("</span></span>"); } char *makePositionInput() /* Return HTML for the position input. */ { struct dyString *dy = dyStringCreate("<INPUT TYPE=TEXT NAME=\"%s\" SIZE=%d VALUE=\"%s\">", hgvaRange, 45, addCommasToPos(NULL, cartString(cart, hgvaRange))); return dyStringCannibalize(&dy); } void printCtAndHubButtons() /* Print a div with buttons for hgCustom and hgHubConnect */ { boolean hasCustomTracks = customTracksExist(cart, NULL); puts("<div style='padding-top: 5px; padding-bottom: 5px'>"); hOnClickButton("prtCtHub_CtBut", "document.customTrackForm.submit(); return false;", hasCustomTracks ? CT_MANAGE_BUTTON_LABEL : CT_ADD_BUTTON_LABEL); printf(" "); if (hubConnectTableExists()) hOnClickButton("prtCtHub_TrkHub", "document.trackHubForm.submit(); return false;", "track hubs"); nbSpaces(3); printf("To reset <B>all</B> user cart settings (including custom tracks), \n" "<A HREF=\"cartReset?destination=%s\">click here</A>.\n", cgiScriptName()); puts("</div>"); } void hgGatewayCladeGenomeDb() /* Make a row of labels and row of buttons like hgGateway, but not using tables. */ { boolean gotClade = hGotClade(); if (gotClade) { topLabelSpansStart("clade"); printCladeListHtml(genome, "change", onChangeClade()); topLabelSpansEnd(); } topLabelSpansStart("genome"); if (gotClade) printGenomeListForCladeHtml(database, "change", onChangeOrg()); else printGenomeListHtml(database, "change", onChangeOrg()); topLabelSpansEnd(); topLabelSpansStart("assembly"); printAssemblyListHtml(database, "change", onChangeDb()); topLabelSpansEnd(); puts("<BR>"); topLabelSpansStart("region to annotate"); printRegionListHtml(database); topLabelSpansEnd(); topLabelSpansStart(""); // Yet another span, for hiding/showing position input and lookup button: printf("<span id='"hgvaPositionContainer"'%s>\n", differentString(regionType, hgvaRegionTypeRange) ? " style='display: none;'" : ""); puts(makePositionInput()); printf("</span>\n"); topLabelSpansEnd(); puts("<BR>"); } void printAssemblySection() /* Print assembly-selection stuff. * Redrawing the whole page when the assembly changes seems fine to me. */ { //#*** ---------- More copied verbatim, from hgTables/mainPage.c: ----------- /* Hidden form - for benefit of javascript. */ { static char *saveVars[] = { "clade", "org", "db", hgvaRange, hgvaRegionType }; jsCreateHiddenForm(cart, cgiScriptName(), saveVars, ArraySize(saveVars)); } /* Hidden form for jumping to custom tracks CGI. */ printf("<FORM ACTION='%s' NAME='customTrackForm' ID='customTrackForm'>", hgCustomName()); cartSaveSession(cart); printf("</FORM>\n"); /* Hidden form for jumping to track hub manager CGI. */ printf("<FORM ACTION='%s' NAME='trackHubForm'>", hgHubConnectName()); //#*** well, almost verbatim... lib version should use cgiScriptName. cartSaveSession(cart); printf("</FORM>\n"); printf("<FORM ACTION=\"%s\" NAME=\"mainForm\" ID=\"mainForm\" METHOD=%s>\n", cgiScriptName(), cartUsualString(cart, "formMethod", "GET")); cartSaveSession(cart); //#*** ------------------ end verbatim --------------- printf("<div class='sectionLiteHeader noReorderRemove'>" "Select Genome Assembly and Region</div>\n"); /* Print clade, genome and assembly line. */ hgGatewayCladeGenomeDb(); //#*** No longer ending form here... } void factorSourceListInputProperties(struct trackDb *tdb, struct slName **retFactorList, struct slName **retCellTypeList, struct slName **retTreatmentList) /* Get a list of factor names used in factorSource track. */ { char *inputsTable = trackDbSetting(tdb, "inputTrackTable"); if (isEmpty(inputsTable)) errAbort("track %s does not have an inputTrackTable setting", tdb->track); char query[2048]; struct sqlConnection *conn = hAllocConn(database); if (retFactorList && retCellTypeList && retTreatmentList) { sqlSafef(query, sizeof(query), "select distinct(factor) from %s order by factor", inputsTable); *retFactorList = sqlQuickList(conn, query); sqlSafef(query, sizeof(query), "select distinct(cellType) from %s order by cellType", inputsTable); *retCellTypeList = sqlQuickList(conn, query); sqlSafef(query, sizeof(query), "select distinct(treatment) from %s order by treatment", inputsTable); *retTreatmentList = sqlQuickList(conn, query); } else errAbort("factorSourceListInputProperties: ret args must be non-NULL."); hFreeConn(&conn); } void printMultiselect(char *track, char *label, char *var, struct slName *optionList) /* Print a label and multi-select, with hgva_track_var as cart var, with options in optionList, * marking as selected any options stored in cart. */ { printf("%s: ", label); char cartVar[1024]; safef(cartVar, sizeof(cartVar), "hgva_filter_%s_%s", track, var); struct hash *selHash = NULL; if (cartListVarExists(cart, cartVar)) selHash = hashFromSlNameList(cartOptionalSlNameList(cart, cartVar)); printf("<select id=\"%s\" name=\"%s\" style=\"display: none; font-size:.9em;\" " "class=\"filterBy\" multiple>\n", cartVar, cartVar); char *selected = ""; if (!selHash || hashLookup(selHash, "All")) selected = " selected"; printf("<option%s>All</option>", selected); struct slName *option; for (option = optionList; option != NULL; option = option->next) { selected = ""; if (selHash && hashLookup(selHash, option->name)) selected = " selected"; printf("<option%s>%s</option>", selected, option->name); } printf("</select>\n"); char shadowVar[1024]; safef(shadowVar, sizeof(shadowVar), "%s%s", cgiMultListShadowPrefix(), cartVar); cgiMakeHiddenVar(shadowVar, "1"); jsInlineF("$(document).ready(function(){ ddcl.setup($('#%s')[0]); });\n", cartVar); } void printFilterOptions(struct trackDb *tdb) /* Print a collapsible filter section for tdb, with controls depending on tdb->type. */ { char sectionName[512], cartVar[512]; safef(sectionName, sizeof(sectionName), "%s_filter", tdb->track); if (sameString(tdb->type, "factorSource")) { puts("<TABLE>"); startSmallCollapsibleSection(sectionName, "filter items", FALSE); struct slName *factorOptions = NULL, *cellTypeOptions = NULL, *treatmentOptions = NULL; factorSourceListInputProperties(tdb, &factorOptions, &cellTypeOptions, &treatmentOptions); printMultiselect(tdb->track, "factor", "name", factorOptions); printMultiselect(tdb->track, "cell type", "cellType", cellTypeOptions); printMultiselect(tdb->track, "treatment", "treatment", treatmentOptions); puts("<BR>"); puts("minimum peak score [0-1000]: "); safef(cartVar, sizeof(cartVar), "hgva_filter_%s_score", tdb->track); char *defaultScore = cartUsualString(cart, cartVar, "0"); printf("<input type=text name=\"%s\" size=12 value=\"%s\"><BR>", cartVar, defaultScore); // The dimensions of ui-dropdownchecklist multiselects are not correct when // the item is hidden. So, when this filter section is made visible, reinit them. jsInlineF( "$(function(){\n" "$('tr[id^=\"%s-\"]').bind('show',\n" " function(jqev) { \n" " var $multisels = $(jqev.target).find('.filterBy');\n" " var multiselElList = $multisels.each(function(ix, el){ return el; });\n" " ddcl.reinit(multiselElList);" " });\n" "});\n" , sectionName); puts("</TABLE>"); endCollapsibleSection(); } if (startsWith("bed 5", tdb->type)) //#*** TODO: detect bed# properly { puts("<TABLE>"); startSmallCollapsibleSection(sectionName, "filter items", FALSE); //#*** Also watch out for noScoreFilter or whatever it's called puts("minimum peak score [0-1000]: "); safef(cartVar, sizeof(cartVar), "hgva_filter_%s_score", tdb->track); char *defaultScore = cartUsualString(cart, cartVar, "0"); printf("<input type=text name=\"%s\" size=12 value=\"%s\"><BR>", cartVar, defaultScore); puts("</TABLE>"); endCollapsibleSection(); } } typedef boolean TdbFilterFunction(struct trackDb *tdb, void *filterData); /* Return TRUE if tdb passes filter criteria. */ void rFilterTrackList(struct trackDb *trackList, struct slRef **pPassingRefs, TdbFilterFunction *filterFunc, void *filterData) /* Recursively apply filterFunc and filterData to all tracks in trackList and * their subtracks. Add an slRef to pPassingRefs for each track/subtrack that passes. * Caller should slReverse(pPassingRefs) when recursion is all done. */ { struct trackDb *tdb; for (tdb = trackList; tdb != NULL; tdb = tdb->next) { if (tdb->subtracks != NULL) rFilterTrackList(tdb->subtracks, pPassingRefs, filterFunc, filterData); if (filterFunc(tdb, filterData)) slAddHead(pPassingRefs, slRefNew(tdb)); } } void tdbFilterGroupTrack(struct trackDb *fullTrackList, struct grp *fullGroupList, TdbFilterFunction *filterFunc, void *filterData, struct slRef **retGroupRefList, struct slRef **retTrackRefList) /* Apply filterFunc and filterData to each track in fullTrackList and all subtracks; * make an slRef list of tracks that pass the filter (retTrackRefList) and an slRef * list of groups to which the passing tracks belong (retGroupRefList). */ { struct slRef *trackRefList = NULL; rFilterTrackList(fullTrackList, &trackRefList, filterFunc, filterData); slReverse(&trackRefList); if (retTrackRefList != NULL) *retTrackRefList = trackRefList; if (retGroupRefList != NULL) { // Which groups have tracks that passed the filter? struct hash *hash = hashNew(0); struct slRef *trackRef; for (trackRef = trackRefList; trackRef != NULL; trackRef = trackRef->next) { struct trackDb *tdb = trackRef->val; hashAddInt(hash, tdb->grp, TRUE); } struct slRef *groupRefList = NULL; struct grp *grp; for (grp = fullGroupList; grp != NULL; grp = grp->next) { if (hashIntValDefault(hash, grp->name, FALSE)) slAddHead(&groupRefList, slRefNew(grp)); } hashFree(&hash); slReverse(&groupRefList); *retGroupRefList = groupRefList; } } boolean isVariantCustomTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get custom or hub tracks with type pgSnp or vcf(Tabix). */ { return ((sameString(tdb->grp, "user") || isHubTrack(tdb->track)) && (sameString(tdb->type, "pgSnp") || startsWith("vcf", tdb->type))); } void selectVariants() /* Offer selection of user's variant custom tracks, example variants, pasted input etc. */ { #define PGSNP_OR_VCF "<A HREF='../FAQ/FAQformat.html#format10' TARGET=_BLANK>pgSnp</A> or " \ "<A HREF='../goldenPath/help/vcf.html' TARGET=_BLANK>VCF</A>" printf("<div class='sectionLiteHeader'>Select Variants</div>\n"); /* Check for variant custom tracks. If there are none, tell the user that they should add one. */ struct slRef *varTrackList = NULL, *varGroupList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isVariantCustomTrack, NULL, &varGroupList, &varTrackList); if (varTrackList == NULL) { printf("Your session doesn't have any custom tracks or hub tracks in " PGSNP_OR_VCF " format.\n"); hOnClickButton("selVar_AddPgpVcfCt", "return hgva.goToAddCustomTrack();", "add pgSnp or VCF custom track"); if (hubConnectTableExists()) { nbSpaces(2); hOnClickButton("selVar_AddTrkHub", "document.trackHubForm.submit(); return false;", "add track hub"); } puts("<BR>"); } else if (slCount(varTrackList) > 1) { printf("If you have more than one custom track or hub track in " PGSNP_OR_VCF " format, please select the one you wish to annotate:<BR>\n"); } printf("<B>variants: </B>"); printf("<SELECT ID='hgva_variantTrack' NAME='hgva_variantTrack'>\n"); jsOnEventById("change", "hgva_variantTrack", "hgva.changeVariantSource();"); char *selected = cartUsualString(cart, "hgva_variantTrack", ""); struct slRef *ref; for (ref = varTrackList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; printOption(tdb->track, selected, tdb->longLabel); } printOption(hgvaSampleVariants, selected, hgvaSampleVariantsLabel); printOption(hgvaUseHgvs, selected, hgvaHgvsLabel); struct trackDb *snpTdb = hFindLatestSnpTrack(database, NULL, &fullTrackList); // We can convert rsIds from snpNNN tables (but not ancient "snp" - no refUCSC column) into VCF boolean hasSnps = (snpTdb != NULL && differentString(snpTdb->table, "snp")); if (hasSnps) printOption(hgvaUseVariantIds, selected, hgvaVariantIdsLabel); printf("</SELECT><BR>\n"); printf("<div id='"hgvaHgvsPasteContainer"'%s>\n", differentString(selected, hgvaUseHgvs) ? " style='display: none;'" : ""); printf("Enter HGVS terms: one term per line; blank lines and comment lines beginning with '#' " "are ignored.<BR>\n"); char *oldPasted = cartUsualString(cart, hgvaHgvs, ""); cgiMakeTextArea(hgvaHgvs, oldPasted, 10, 70); puts("</div>"); if (hasSnps) { printf("<div id='"hgvaVariantPasteContainer"'%s>\n", differentString(selected, hgvaUseVariantIds) ? " style='display: none;'" : ""); printf("Enter dbSNP rs# identifiers separated by whitespace or commas:<BR>\n"); char *oldPasted = cartUsualString(cart, hgvaVariantIds, ""); cgiMakeTextArea(hgvaVariantIds, oldPasted, 10, 70); puts("</div>"); } printf("<B>maximum number of variants to be processed:</B>\n"); char *curLimit = cartUsualString(cart, "hgva_variantLimit", "10000"); char *limitLabels[] = { "10", "100", "1,000", "10,000", "100,000" }; char *limitValues[] = { "10", "100", "1000", "10000", "100000" }; cgiMakeDropListWithVals("hgva_variantLimit", limitLabels, limitValues, ArraySize(limitLabels), curLimit); printCtAndHubButtons(); puts("<BR>"); } boolean isGeneTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get genePred tracks. */ { return (startsWith("genePred", tdb->type) || sameString("bigGenePred", tdb->type)); } char *selectGenes() /* Let user select a gene predictions track; return NULL if there are no genePred tracks. */ { struct slRef *trackRefList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isGeneTrack, NULL, NULL, &trackRefList); boolean gotGP = (trackRefList != NULL); if (!gotGP) warn("This assembly (%s) has no gene prediction tracks, " "so the VAI will not be able to annotate it.", database); printf("<div class='sectionLiteHeader'>Select Genes</div>\n"); if (gotGP) printf("The gene predictions selected here will be used "); else printf("Gene predictions are required in order "); printf("to determine the effect of " "each variant on genes, for example intronic, missense, splice site, intergenic etc."); if (!gotGP) printf(" Since this assembly has no gene prediction tracks, " "the VAI can't provide functional annotations. " "Please select a different genome.<BR>"); printf("<BR>\n"); if (! gotGP) return NULL; char *firstTrack = ((struct trackDb *)(trackRefList->val))->track; char *cartGeneTrack = cartOptionalString(cart, "hgva_geneTrack"); if (isNotEmpty(cartGeneTrack)) { // Make sure it's actually in trackRefList (might have been carried over from other db) boolean exists = FALSE; struct slRef *ref; for (ref = trackRefList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; if (sameString(cartGeneTrack, tdb->track)) { exists = TRUE; break; } } if (!exists) cartGeneTrack = NULL; } char *selected = isNotEmpty(cartGeneTrack) ? cartGeneTrack : firstTrack; //#*** should show more info about each track... button to pop up track desc? if (gotGP) { printf("<SELECT ID='hgva_geneTrack' NAME='hgva_geneTrack'>\n"); jsOnEventById("change", "hgva_geneTrack", "hgva.changeGeneSource();"); struct slRef *ref; for (ref = trackRefList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; if (tdb->subtracks == NULL) printOption(tdb->track, selected, tdb->longLabel); } puts("</SELECT><BR>"); } return selected; } //#*** We really need a dbNsfp.[ch]: enum PolyPhen2Subset { noSubset, HDIV, HVAR }; char *formatDesc(char *url, char *name, char *details, boolean doHtml) /* Return a description with URL for name plus extra details. If doHtml, * wrap URL in <A ...>...</A>. */ { char desc[1024]; if (doHtml) safef(desc, sizeof(desc), "<A HREF='%s' TARGET=_BLANK>%s</A> %s", url, name, details); else safef(desc, sizeof(desc), "(%s) %s %s", url, name, details); return cloneString(desc); } char *dbNsfpDescFromTableName(char *tableName, enum PolyPhen2Subset subset, boolean doHtml) /* Return a description for table name... if these become tracks, use tdb of course. */ { if (sameString(tableName, "dbNsfpSift")) return formatDesc("http://sift.bii.a-star.edu.sg/", "SIFT", "(D = damaging, T = tolerated)", doHtml); else if (sameString(tableName, "dbNsfpPolyPhen2")) { if (subset == HDIV) return formatDesc("http://genetics.bwh.harvard.edu/pph2/", "PolyPhen-2", "with HumDiv training set " "(D = probably damaging, P = possibly damaging, B = benign)", doHtml); else if (subset == HVAR) return formatDesc("http://genetics.bwh.harvard.edu/pph2/", "PolyPhen-2", "with HumVar training set " "(D = probably damaging, P = possibly damaging, B = benign)", doHtml); else errAbort("dbNsfpDescFromTableName: invalid PolyPhen2 subset type (%d)", subset); } else if (sameString(tableName, "dbNsfpMutationTaster")) return formatDesc("http://www.mutationtaster.org/", "MutationTaster", "(A = disease causing automatic, D = disease causing, " "N = polymorphism, P = polymorphism automatic)", doHtml); else if (sameString(tableName, "dbNsfpMutationAssessor")) return formatDesc("http://mutationassessor.org/", "MutationAssessor", "(high or medium: predicted functional; " "low or neutral: predicted non-functional)", doHtml); else if (sameString(tableName, "dbNsfpLrt")) return formatDesc("http://www.genetics.wustl.edu/jflab/lrt_query.html", "Likelihood ratio test (LRT)", "(D = deleterious, N = Neutral, U = unknown)", doHtml); else if (sameString(tableName, "dbNsfpVest")) return formatDesc("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3665549/", "Variant Effect Scoring Tool (VEST)", "(scores [0-1] predict confidence that a change is deleterious", doHtml); else if (sameString(tableName, "dbNsfpGerpNr")) return formatDesc("http://mendel.stanford.edu/SidowLab/downloads/gerp/index.html", "GERP++", "Neutral Rate (NR)", doHtml); else if (sameString(tableName, "dbNsfpGerpRs")) return formatDesc("http://mendel.stanford.edu/SidowLab/downloads/gerp/index.html", "GERP++", "Rejected Substitutions (RS)", doHtml); else if (sameString(tableName, "dbNsfpInterPro")) return formatDesc("http://www.ebi.ac.uk/interpro/", "InterPro", "protein domains", doHtml); return NULL; } struct slName *findDbNsfpTables() /* See if this database contains dbNSFP tables. */ { if (startsWith(hubTrackPrefix, database)) return NULL; struct sqlConnection *conn = hAllocConn(database); struct slName *dbNsfpTables = sqlListTablesLike(conn, "LIKE 'dbNsfp%'"); hFreeConn(&conn); return dbNsfpTables; } void printDbNsfpSource(char *table, enum PolyPhen2Subset subset) /* If we know what to do with table, make a checkbox with descriptive label. */ { char *description = dbNsfpDescFromTableName(table, subset, TRUE); if (description != NULL) { char cartVar[512]; if (subset == HDIV) safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s:HDIV", database, table); else if (subset == HVAR) safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s:HVAR", database, table); else safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s", database, table); boolean defaultChecked = (sameString("dbNsfpSift", table) || sameString("dbNsfpPolyPhen2", table)); cartMakeCheckBox(cart, cartVar, defaultChecked); printf("%s<BR>\n", description); } } void selectDbNsfp(struct slName *dbNsfpTables) /* Let user select scores/predicitions from various tools collected by dbNSFP. */ { if (dbNsfpTables == NULL) return; startCollapsibleSection("dbNsfp", "Database of Non-synonymous Functional Predictions (dbNSFP)", TRUE); //#*** hardcoded version info... we need metadata (#11462) char *dbNsfpVersion = "3.1a"; char *txVersion = "Gencode release 22 (Ensembl 79, Mar. 2015)"; char *refYear = "2015"; char *refUrl = "http://onlinelibrary.wiley.com/doi/10.1002/humu.22932/abstract"; // For the time being hg19 is still on version 2.0: if (sameString(database, "hg19")) { dbNsfpVersion = "2.0"; txVersion = "Gencode release 9 (Ensembl 64, Dec. 2011)"; refYear = "2013"; refUrl = "http://onlinelibrary.wiley.com/doi/10.1002/humu.22376/abstract"; } printf("<A HREF='https://sites.google.com/site/jpopgen/dbNSFP' TARGET=_BLANK>dbNSFP</A> " "(<A HREF='%s' " "TARGET=_BLANK>Liu <em>et al.</em> %s</A>) " "release %s " "provides pre-computed scores and predictions of functional significance " "from a variety of tools. Every possible coding change to transcripts in " "%s gene predictions " "has been evaluated. " "<em>Note: This may not encompass all transcripts in your " "selected gene set.</em><BR>\n", refUrl, refYear, dbNsfpVersion, txVersion); //#*** Another cheap hack: reverse alph order happens to be what we want (until VEST??), //#*** but priorities would be cleaner: slReverse(&dbNsfpTables); jsMakeSetClearContainer(); struct slName *table; for (table = dbNsfpTables; table != NULL; table = table->next) { if (sameString(table->name, "dbNsfpPolyPhen2")) { printDbNsfpSource(table->name, HDIV); printDbNsfpSource(table->name, HVAR); } else printDbNsfpSource(table->name, 0); } jsEndContainer(); puts("<BR>"); endCollapsibleSection(); } boolean findSnpBed4(char *suffix, char **retFileName, struct trackDb **retTdb) /* If we can find the latest snpNNNsuffix table, or better yet a bigBed file for it (faster), * set the appropriate ret* and return TRUE, otherwise return FALSE. */ { struct trackDb *tdb = hFindLatestSnpTrack(database, suffix, &fullTrackList); if (tdb == NULL) return FALSE; boolean foundIt = FALSE; // Do we happen to have a bigBed version? Better yet, bed4 only for current uses: char origFileName[HDB_MAX_PATH_STRING]; safef(origFileName, sizeof(origFileName), "/gbdb/%s/vai/%s.bed4.bb", database, tdb->table); char* fileName = hReplaceGbdb(origFileName); if (fileExists(fileName)) { if (retFileName != NULL) *retFileName = fileName; foundIt = TRUE; } else { // Not bed4; try just .bb: freez(&fileName); safef(origFileName, sizeof(origFileName), "/gbdb/%s/vai/%s.bb", database, tdb->table); fileName = hReplaceGbdb(origFileName); if (fileExists(fileName)) { if (retFileName != NULL) *retFileName = fileName; foundIt = TRUE; } } if (retTdb == NULL) return foundIt; else *retTdb = tdb; return TRUE; } void selectDbSnp(boolean gotSnp) /* Offer to include rsID (and other fields, or leave that for advanced output??) if available */ { if (!gotSnp) return; startCollapsibleSection("dbSnp", "Known variation", TRUE); cartMakeCheckBox(cart, "hgva_rsId", TRUE); printf("Include <A HREF='https://www.ncbi.nlm.nih.gov/projects/SNP/' TARGET=_BLANK>dbSNP</A> " "rs# ID if one exists<BR>\n"); puts("<BR>"); endCollapsibleSection(); } #define GENCODE_PREFIX "wgEncodeGencode" struct slName *getGencodeTagVersions() /* Return a list of version strings from the ends of wgEncodeGencodeTag% tables. */ { static struct slName *tagVersions = NULL; if (tagVersions == NULL && !startsWith(hubTrackPrefix, database)) { struct sqlConnection *conn = hAllocConn(database); struct slName *tagTables = sqlQuickList(conn, NOSQLINJ "show tables like '"GENCODE_PREFIX"Tag%'"); int offset = strlen(GENCODE_PREFIX"Tag"); struct slName *tt; for (tt = tagTables; tt != NULL; tt = tt->next) slAddHead(&tagVersions, slNameNew(tt->name + offset)); hFreeConn(&conn); } return slNameCloneList(tagVersions); } boolean knownGeneHasGencodeTags() /* Return TRUE if this database has knownToTag for knownGene. */ { return hTableExists(database, "knownGene") && hTableExists(database, "knownToTag"); } boolean hasGencodeTags() /* Return TRUE if GENCODE tags can be associated with some gene track in database. */ { return knownGeneHasGencodeTags() || (getGencodeTagVersions() != NULL); } void genbankGetDbTable(char *db, char *gbTable, char **retGenbankDb, char **retTableName) /* Genbank tables (e.g. gbCdnaInfoTable) may now include a 'db.' prefix, e.g. * hgFixed.gbCdnaInfo. If so, separate out the database and table name. If gbTable * does not have a 'db.' prefix then use the configured genbankDb if given, else db. */ { char *genbankDb = cfgOptionEnv("GENBANKDB", "genbankDb"); char *tableName = gbTable; char tableCopy[strlen(gbTable)+1]; safecpy(tableCopy, sizeof(tableCopy), gbTable); char *words[2]; int wordCount = chopByChar(tableCopy, '.', words, ArraySize(words)); if (wordCount == 2) { genbankDb = words[0]; tableName = words[1]; } else if (isEmpty(genbankDb)) genbankDb = db; if (retGenbankDb) *retGenbankDb = cloneString(genbankDb); if (retTableName) *retTableName = cloneString(tableName); } boolean genbankTableExists(char *db, char *gbTable) /* Return TRUE if table (which may or may not be prefixed by genbankDb) exists in * genbankDb or db. */ { char *genbankDb=NULL, *tableName=NULL; genbankGetDbTable(db, gbTable, &genbankDb, &tableName); return hTableExists(genbankDb, tableName); } boolean hasTxStatus() /* Return TRUE if any gene track in database has some kind of transcript status info * like knownCanonical, GENCODE tags and/or RefSeq status. */ { if (hasGencodeTags()) return TRUE; if (hTableExists(database, "knownGene") && hTableExists(database, "knownCanonical")) return TRUE; if (hTableExists(database, "refGene") && genbankTableExists(database, refSeqStatusTable)) return TRUE; return FALSE; } char *getLatestGencodeVersion(struct slName *versionList) /* Return the numerically largest version found in versionList. */ { int maxVersionNum = -1; char *maxVersion = NULL; struct slName *version; for (version = versionList; version != NULL; version = version->next) { int versionNum = atoi(skipToNumeric(version->name)); if (versionNum > maxVersionNum) { maxVersionNum = versionNum; maxVersion = version->name; } } return cloneString(maxVersion); } INLINE char *gencodeTableName(char *suffix, char *version, char *buf, size_t bufSize) /* Write wgEncodeGencode<suffix><version> into buf. Return buf for convenience. */ { safef(buf, bufSize, GENCODE_PREFIX"%s%s", suffix, version); return buf; } boolean refGeneHasGencodeTags(struct slName *gencodeVersionList) /* Return TRUE if this database has a wgEncodeGencodeRefSeq table as well as a ...Tag table. */ { char *version = getLatestGencodeVersion(gencodeVersionList); char table[PATH_LEN]; return hTableExists(database, gencodeTableName("RefSeq", version, table, sizeof(table))); } boolean startsWithGencodeGene(char *geneTrack) /* Return TRUE if geneTrack starts with wgEncodeGencode{Basic,Comp,PseudoGene}. * (There are other GENCODE genepred tracks that don't have tags associated with them.) */ { return (startsWith(GENCODE_PREFIX"Basic", geneTrack) || startsWith(GENCODE_PREFIX"Comp", geneTrack) || startsWith(GENCODE_PREFIX"PseudoGene", geneTrack)); } boolean isGencodeWithVersion(char *geneTrack, struct slName *versionList) /* Return TRUE if geneTrack looks like a Gencode gene track for a supported version. */ { if (! startsWithGencodeGene(geneTrack)) return FALSE; struct slName *v; for (v = versionList; v != NULL; v = v->next) { if (endsWith(geneTrack, v->name)) return TRUE; } return FALSE; } void selectTxStatus(boolean hasTxStatus, char *geneTrack) /* Offer to include transcript status, e.g. whether it is in knownCanonical or has GENCODE tags. * This makes one div per category of txStatus info; each div is visible only if its info is * applicable to the selected gene track. If no divs are visible, display a message that * there's nothing for the currently selected gene track. */ { if (! hasTxStatus) return; startCollapsibleSection("txStatus", "Transcript status", FALSE); boolean somethingIsVisible = FALSE; if (hasGencodeTags()) { struct slName *versionList = getGencodeTagVersions(); char *maybeKnownGene = knownGeneHasGencodeTags() ? "knownGene" : ""; char *maybeRefGene = refGeneHasGencodeTags(versionList) ? "refGene" : ""; char *maybeEnsGene = ""; char *versions = ""; if (versionList != NULL) { if (hTableExists(database, "ensGene")) maybeEnsGene = "ensGene"; versions = slNameListToString(versionList, ' '); } boolean isVisible = (sameString(geneTrack, maybeKnownGene) || sameString(geneTrack, maybeEnsGene) || sameString(geneTrack, maybeRefGene) || isGencodeWithVersion(geneTrack, versionList)); somethingIsVisible |= isVisible; printf("<div class=\"txStatus %s %s %s %s\" style=\"display: %s;\">", maybeKnownGene, maybeRefGene, maybeEnsGene, versions, isVisible ? "block" : "none"); cartMakeCheckBox(cart, "hgva_txStatus_gencode", FALSE); puts("Include the <A HREF=" GENCODE_TAG_DOC_URL " TARGET=_BLANK>GENCODE tags</A> (if any) " "associated with each transcript.<BR>"); puts("</div>"); } if (hTableExists(database, "knownGene") && hTableExists(database, "knownCanonical")) { boolean isVisible = sameString(geneTrack, "knownGene"); somethingIsVisible |= isVisible; printf("<div class=\"txStatus knownGene\" style=\"display: %s;\">", isVisible ? "block" : "none"); cartMakeCheckBox(cart, "hgva_txStatus_knownCanonical", FALSE); char *desc = hTableExists(database, "knownToTag") ? "based on <A HREF=" APPRIS_DOC_URL " TARGET=_BLANK>" "APPRIS</A> status or inclusion in " "<A HREF=" GENCODE_BASIC_DOC_URL " TARGET=_BLANK>GENCODE Basic</A> subset: " "principal > alternative > basic > longest isoform" : "generally the longest isoform of a gene"; printf("Indicate whether each transcript is 'canonical' (%s).<BR>\n", desc); puts("</div>"); } if (hTableExists(database, "refGene") && genbankTableExists(database, refSeqStatusTable)) { boolean isVisible = sameString(geneTrack, "refGene"); somethingIsVisible |= isVisible; printf("<div class=\"txStatus refGene\" style=\"display: %s;\">", isVisible ? "block" : "none"); cartMakeCheckBox(cart, "hgva_txStatus_refSeqStatus", FALSE); puts("Include the " "<A HREF=" REFSEQ_STATUS_DOC_URL " " "TARGET=_BLANK>RefSeq status</A> of each transcript.<BR>"); puts("</div>"); } printf("<div class=\"noTxStatus\" style=\"display: %s;\">", somethingIsVisible ? "none" : "block"); puts("No transcript status data are available for the selected gene track."); puts("</div>"); puts("<BR>"); endCollapsibleSection(); } static boolean canDoHgvsOut(char *geneTrack) /* Return TRUE if we're able to make HGVS output terms for transcripts in geneTrack. */ { return sameString(geneTrack, "refGene") || startsWith("ncbiRefSeq", geneTrack); } static void selectHgvsOut(char *geneTrack) /* Offer HGVS output choices if RefSeq Genes are selected */ { startCollapsibleSection("hgvsOut", "HGVS variant nomenclature", TRUE); printf("The <a href='http://www.hgvs.org/' target=_blank>Human Genome Variation Society (HGVS)</a> " "has established a " "<a href='http://varnomen.hgvs.org/' target=_blank>sequence variant nomenclature</a>, " "an international standard used to report variation in " "genomic, transcript and protein sequences.<br>\n"); boolean hgvsOk = canDoHgvsOut(geneTrack); printf("<div id=\"hgvsOptions\" style=\"display: %s;\">", hgvsOk ? "block" : "none"); cartMakeCheckBox(cart, "hgva_hgvsG", FALSE); printf("Include HGVS genomic (g.) terms in output<br>\n"); cartMakeCheckBox(cart, "hgva_hgvsCN", FALSE); printf("Include HGVS coding (c.) terms if applicable, otherwise noncoding (n.) terms, in output" "<br>\n"); cartMakeCheckBox(cart, "hgva_hgvsP", FALSE); printf("Include HGVS protein (p.) terms (if applicable) in output<br>\n"); cartMakeCheckBox(cart, "hgva_hgvsPAddParens", FALSE); printf("When including HGVS protein (p.) terms, add parentheses around changes to emphasize " "that they are predictions<br>\n"); cartMakeCheckBox(cart, "hgva_hgvsBreakDelIns", FALSE); printf("For variants that involve both a deletion and insertion, " "including multi-nucleotide variants, " "include the deleted sequence (e.g. show \"delAGinsTT\" instead of only \"delinsTT\")" "<br>\n"); puts("</div>"); printf("<div id=\"noHgvs\" style=\"display: %s;\">", hgvsOk ? "none" : "block"); printf("Select RefSeq Genes in the \"Select Genes\" section above " "in order to make options appear.\n"); puts("</div>"); puts("<br>"); endCollapsibleSection(); } boolean isHg19RegulatoryTrack(struct trackDb *tdb, void *filterData) /* For now, just look for a couple specific tracks by tableName. */ { //#*** NEED METADATA return (sameString("wgEncodeRegDnaseClusteredV3", tdb->table) || sameString("wgEncodeRegTfbsClusteredV3", tdb->table)); } boolean isHg38RegulatoryTrack(struct trackDb *tdb, void *filterData) /* For now, just look for a couple specific tracks by tableName. */ { //#*** NEED METADATA return (sameString("wgEncodeRegDnaseClustered", tdb->table) || sameString("wgEncodeRegTfbsClusteredV3", tdb->table)); } struct slRef *findRegulatoryTracks() /* Look for the very limited set of Regulation tracks that hgVai offers. */ { struct slRef *regTrackRefList = NULL; if (sameString(database, "hg19")) tdbFilterGroupTrack(fullTrackList, fullGroupList, isHg19RegulatoryTrack, NULL, NULL, ®TrackRefList); else if (sameString(database, "hg38")) tdbFilterGroupTrack(fullTrackList, fullGroupList, isHg38RegulatoryTrack, NULL, NULL, ®TrackRefList); return regTrackRefList; } void selectRegulatory() /* If trackRefList is non-NULL, make a section with a checkbox for each track, * labelled with longLabel, and optionally some filtering options. */ { struct slRef *trackRefList = findRegulatoryTracks(); if (trackRefList != NULL) { puts("<BR>"); printf("<div class='sectionLiteHeader'>Select Regulatory Annotations</div>\n"); printf("The annotations in this section provide predicted regulatory regions " "based on various experimental data. " "When a variant overlaps an annotation selected here, the consequence term " "<A HREF=\"http://sequenceontology.org/browser/current_release/term/SO:0001566\" " "TARGET=_BLANK>regulatory_region_variant</A> will be assigned. " "Follow the links to description pages that explain how each dataset was " "constructed. " "Some datasets cover a significant portion of the genome and it may be " "desirable to filter these annotations by cell type and/or score in order " "to avoid an overabundance of hits. "); // Use a table with checkboxes in one column and label/other stuff that depends on // checkbox in other column. puts("<TABLE>"); struct slRef *ref; for (ref = trackRefList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; char cartVar[512]; safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s", database, tdb->track); puts("<TR><TD valign=top>"); cartMakeCheckBox(cart, cartVar, FALSE); puts("</TD><TD>"); struct trackDb *topTdb = trackDbTopLevelSelfOrParent(tdb); printf("<A HREF=\"%s?%s&g=%s\">%s</A><BR>\n", hgTrackUiName(), cartSidUrlString(cart), topTdb->track, tdb->longLabel); printFilterOptions(tdb); puts("</TD></TR>"); } puts("</TABLE>"); } } boolean isConsElTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get "phastConsNwayElements" tracks. */ { return (startsWith("phastCons", tdb->table) && stringIn("Elements", tdb->table)); } boolean isConsScoreTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get tracks that start with "phastCons" (no Elements) * or "phyloP". */ { return ((startsWith("phastCons", tdb->table) && !stringIn("Elements", tdb->table)) || startsWith("phyloP", tdb->table)); } void findCons(struct slRef **retElTrackRefList, struct slRef **retScoreTrackRefList) /* See if this database has Conserved Elements and/or Conservation Scores */ { tdbFilterGroupTrack(fullTrackList, fullGroupList, isConsElTrack, NULL, NULL, retElTrackRefList); tdbFilterGroupTrack(fullTrackList, fullGroupList, isConsScoreTrack, NULL, NULL, retScoreTrackRefList); } boolean trackNameMatches(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get track(s) whose tdb->track matches name (filterData). */ { char *name = filterData; return sameString(tdb->track, name); } struct slRef *findTrackRefByName(char *name) /* Return a reference to the named track, if found. */ { struct slRef *trackRefList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, trackNameMatches, name, NULL, &trackRefList); return trackRefList; } void trackCheckBoxSection(char *sectionSuffix, char *title, struct slRef *trackRefList) /* If trackRefList is non-NULL, make a collapsible section with a checkbox for each track, * labelled with longLabel. */ { if (trackRefList != NULL) { startCollapsibleSection(sectionSuffix, title, FALSE); struct slRef *ref; for (ref = trackRefList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; char cartVar[512]; safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s", database, tdb->track); cartMakeCheckBox(cart, cartVar, FALSE); struct trackDb *topTdb = trackDbTopLevelSelfOrParent(tdb); printf("<A HREF=\"%s?%s&g=%s\">%s</A><BR>\n", hgTrackUiName(), cartSidUrlString(cart), topTdb->track, tdb->longLabel); } puts("<BR>"); endCollapsibleSection(); } } void selectAnnotations(char *geneTrack) /* Beyond predictions of protein-coding effect, what other basic data can we integrate? */ { struct slName *dbNsfpTables = findDbNsfpTables(); boolean gotSnp = findSnpBed4("", NULL, NULL); struct slRef *elTrackRefList = NULL, *scoreTrackRefList = NULL; findCons(&elTrackRefList, &scoreTrackRefList); struct slRef *cosmicTrackRefList = findTrackRefByName("cosmic"); boolean hasTxStat = hasTxStatus(); if (dbNsfpTables == NULL && !gotSnp && elTrackRefList == NULL && scoreTrackRefList == NULL && cosmicTrackRefList == NULL && !hasTxStat) return; puts("<BR>"); printf("<div class='sectionLiteHeader'>Select More Annotations (optional)</div>\n"); // Make wrapper table for collapsible sections: puts("<TABLE border=0 cellspacing=5 cellpadding=0 style='padding-left: 10px;'>"); selectDbNsfp(dbNsfpTables); selectTxStatus(hasTxStat, geneTrack); selectHgvsOut(geneTrack); selectDbSnp(gotSnp); trackCheckBoxSection("Cosmic", "COSMIC", cosmicTrackRefList); trackCheckBoxSection("ConsEl", "Conserved elements", elTrackRefList); trackCheckBoxSection("ConsScore", "Conservation scores", scoreTrackRefList); puts("</TABLE>"); } void selectFiltersFunc() /* Options to restrict variants based on gene region/soTerm from gpFx */ { startCollapsibleSection("filtersFunc", "Functional role", FALSE); printf("Include variants annotated as<BR>\n"); jsMakeSetClearContainer(); cartMakeCheckBox(cart, "hgva_include_intergenic", TRUE); printf("intergenic<BR>\n"); cartMakeCheckBox(cart, "hgva_include_upDownstream", TRUE); printf("upstream/downstream of gene<BR>\n"); cartMakeCheckBox(cart, "hgva_include_nmdTranscript", TRUE); printf("in transcript already subject to nonsense-mediated decay (NMD)<BR>\n"); cartMakeCheckBox(cart, "hgva_include_exonLoss", TRUE); printf("exon loss caused by deletion<BR>\n"); cartMakeCheckBox(cart, "hgva_include_utr", TRUE); printf("5' or 3' UTR<BR>\n"); cartMakeCheckBox(cart, "hgva_include_cdsSyn", TRUE); printf("CDS - synonymous coding change<BR>\n"); cartMakeCheckBox(cart, "hgva_include_cdsNonSyn", TRUE); printf("CDS - non-synonymous (missense, stop gain/loss, frameshift etc)<BR>\n"); cartMakeCheckBox(cart, "hgva_include_intron", TRUE); printf("intron<BR>\n"); cartMakeCheckBox(cart, "hgva_include_splice", TRUE); printf("splice site or splice region<BR>\n"); cartMakeCheckBox(cart, "hgva_include_nonCodingExon", TRUE); printf("exon of non-coding gene<BR>\n"); cartMakeCheckBox(cart, "hgva_include_noVariation", TRUE); printf("\"variants\" for which no alternate allele has been observed<BR>\n"); struct slRef *regTrackRefList = findRegulatoryTracks(); if (regTrackRefList != NULL) { cartMakeCheckBox(cart, "hgva_include_regulatory", TRUE); printf("regulatory element (note: these are detected only if one or more tracks " "are selected in Regulatory regions above)<BR>\n"); } jsEndContainer(); puts("<BR>"); endCollapsibleSection(); } void selectFiltersKnownVar() /* Options to restrict output based on overlap with known variants. */ { boolean gotCommon = findSnpBed4("Common", NULL, NULL); boolean gotMult = findSnpBed4("Mult", NULL, NULL); if (!gotCommon && !gotMult) return; startCollapsibleSection("filtersVar", "Known variation", FALSE); if (gotMult) { cartMakeCheckBox(cart, "hgva_include_snpMult", TRUE); printf("Include variants even if they are co-located with variants that have been mapped to " "multiple genomic locations by dbSNP<BR>\n"); } if (gotCommon) { cartMakeCheckBox(cart, "hgva_include_snpCommon", TRUE); printf("Include variants even if they are co-located with \"common\" variants " "(uniquely mapped variants with global minor allele frequency (MAF) " "of at least 1%% according to dbSNP)<BR>\n"); } puts("<BR>"); endCollapsibleSection(); } void selectFiltersCons() /* Options to restrict output based on overlap with conserved elements. */ { struct slRef *elTrackRefList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isConsElTrack, NULL, NULL, &elTrackRefList); if (elTrackRefList == NULL) return; startCollapsibleSection("filtersCons", "Conservation", FALSE); // Use a little table to indent radio buttons (if we do those) past checkbox: puts("<TABLE><TR><TD>"); cartMakeCheckBox(cart, "hgva_require_consEl", FALSE); printf("</TD><TD>Include only the variants that overlap:"); if (slCount(elTrackRefList) == 1) { struct trackDb *tdb = elTrackRefList->val; printf(" %s<BR>\n", tdb->longLabel); cgiMakeHiddenVar("hgva_require_consEl_track", tdb->track); puts("</TD></TR></TABLE>"); } else { puts("</TD></TR>"); char *selected = cartUsualString(cart, "hgva_require_consEl_track", ""); struct slRef *ref; for (ref = elTrackRefList; ref != NULL; ref = ref->next) { printf("<TR><TD></TD><TD>"); struct trackDb *tdb = ref->val; cgiMakeOnEventRadioButtonWithClass("hgva_require_consEl_track", tdb->track, sameString(tdb->track, selected), NULL, "click", "setCheckboxList('hgva_require_consEl', true);"); printf("%s</TD></TR>\n", tdb->longLabel); } puts("</TABLE>"); } endCollapsibleSection(); } void selectFilters() /* Options to restrict output to variants with specific properties */ { puts("<BR>"); printf("<div class='sectionLiteHeader'>Define Filters</div>\n"); puts("<TABLE border=0 cellspacing=5 cellpadding=0 style='padding-left: 10px;'>"); selectFiltersFunc(); selectFiltersKnownVar(); selectFiltersCons(); puts("</TABLE>"); } void selectOutput() /* Just VEP text for now... should also have VEP HTML with limited # lines too. * Next: custom track with same subset of info as we would stuff in VEP */ { puts("<BR>"); printf("<div class='sectionLiteHeader'>Configure Output</div>\n"); printf("<B>output format: </B>"); char *selected = cartUsualString(cart, "hgva_outFormat", "vepTab"); printf("<SELECT ID='hgva_outFormat' NAME='hgva_outFormat'>\n"); printOption("vepTab", selected, "Variant Effect Predictor (tab-separated text)"); printOption("vepHtml", selected, "Variant Effect Predictor (HTML)"); printf("</SELECT><BR>\n"); char *compressType = cartUsualString(cart, "hgva_compressType", textOutCompressNone); char *fileName = cartUsualString(cart, "hgva_outFile", ""); printf("<B>output file:</B> "); cgiMakeTextVar("hgva_outFile", fileName, 29); printf(" (leave blank to keep output in browser)<BR>\n"); printf("<B>file type returned: </B>"); cgiMakeRadioButton("hgva_compressType", textOutCompressNone, sameWord(textOutCompressNone, compressType)); printf(" plain text  "); cgiMakeRadioButton("hgva_compressType", textOutCompressGzip, sameWord(textOutCompressGzip, compressType)); printf(" gzip compressed (ignored if output file is blank)"); puts("<BR>"); } void submitAndDisclaimer() { puts("<div id=disclaimerDialog title='NOTE'>"); puts("This tool is for research use only. While this tool is open to the " "public, users seeking information about a personal medical or genetic " "condition are urged to consult with a qualified physician for " "diagnosis and for answers to personal questions."); puts("</div><BR>"); printf("<div><img id='loadingImg' src='../images/loading.gif' />\n"); printf("<span id='loadingMsg'></span></div>\n"); cgiMakeOnClickButton("subDisclmAgrd","hgva.submitQueryIfDisclaimerAgreed();", "Get results"); puts("<BR><BR>"); } /* * When user clicks submit, we need to roll a JSON querySpec from form selections, * and show data from a submission to hgAnnoGrator. redirect from this CGI? * or have javascript submit directly? * * primary: variants, from custom track * * if there are genes, those w/annoGratorGpVar * * if there are {dbSNP, dbNsfp, regulatory, cons} selections, grator for each of those * * vep output config * * If we get bold & offer 1000Genomes VCF, will def. need handling of split chroms. * Are we really going to offer genome-wide in hgVai? * Up-front limit on #rows of input ? * * Eventually, we might want a FormatVep that produces structs that are passed * forward to multiple output writers... I would want to send it lots of gratorData * like a formatter, but it would produce rows like an annoGrator. * Maybe annoGrators should accept a bunch of input rows like formatters? * or would this grator wrap all the input grators inside? */ void doMainPage() /* Print out initial HTML of control page. */ { jsInit(); webIncludeResourceFile("jquery-ui.css"); webIncludeResourceFile("ui.dropdownchecklist.css"); boolean alreadyAgreed = cartUsualBoolean(cart, "hgva_agreedToDisclaimer", FALSE); jsInlineF( "$(document).ready(function() { hgva.disclaimer.init(%s, hgva.userClickedAgree); });\n" , alreadyAgreed ? "true" : "false"); addSomeCss(); printAssemblySection(); puts("<BR>"); // Make wrapper table for collapsible sections: selectVariants(); char *geneTrack = selectGenes(); if (geneTrack != NULL) { selectRegulatory(); selectAnnotations(geneTrack); selectFilters(); selectOutput(); submitAndDisclaimer(); } printf("</FORM>"); jsReloadOnBackButton(cart); webNewSection("Using the Variant Annotation Integrator"); webIncludeHelpFileSubst("hgVaiHelpText", cart, FALSE); jsIncludeFile("jquery-ui.js", NULL); jsIncludeFile("hgVai.js", NULL); jsIncludeFile("ui.dropdownchecklist.js", NULL); jsIncludeFile("ddcl.js", NULL); } void doUi() /* Set up globals and make web page */ { cartWebStart(cart, database, "Variant Annotation Integrator"); doMainPage(); cartWebEnd(); /* Save variables. */ cartCheckout(&cart); } void checkVariantTrack(struct trackDb *tdb) /* variantTrack should be either pgSnp or VCF. */ { if (! sameString(tdb->type, "pgSnp") && ! startsWith("vcf", tdb->type)) { errAbort("Expected variant track '%s' to be type pgSnp, vcf or vcfTabix, but it's '%s'", tdb->track, tdb->type); } } char *fileNameFromTable(char *table) /* Get fileName from a bigData table (for when we don't have a trackDb, just table). */ { struct sqlConnection *conn = hAllocConn(database); char query[512]; sqlSafef(query, sizeof(query), "select fileName from %s", table); char *fileName = sqlQuickString(conn, query); hFreeConn(&conn); char *fileNameRewrite = hReplaceGbdb(fileName); freez(&fileName); return fileNameRewrite; } void textOpen() /* Start serving up plain text, possibly via a pipeline to gzip. */ { char *fileName = textOutSanitizeHttpFileName(cartUsualString(cart, "hgva_outFile", "")); char *compressType = cartUsualString(cart, "hgva_compressType", textOutCompressGzip); compressPipeline = textOutInit(fileName, compressType, NULL); } void setGpVarFuncFilter(struct annoGrator *gpVarGrator) /* Use cart variables to configure gpVarGrator's filtering by functional category. */ { struct annoGratorGpVarFuncFilter aggvFuncFilter; ZeroVar(&aggvFuncFilter); aggvFuncFilter.intergenic = cartUsualBoolean(cart, "hgva_include_intergenic", TRUE); aggvFuncFilter.upDownstream = cartUsualBoolean(cart, "hgva_include_upDownstream", TRUE); aggvFuncFilter.nmdTranscript = cartUsualBoolean(cart, "hgva_include_nmdTranscript", TRUE); aggvFuncFilter.exonLoss = cartUsualBoolean(cart, "hgva_include_exonLoss", TRUE); aggvFuncFilter.utr = cartUsualBoolean(cart, "hgva_include_utr", TRUE); aggvFuncFilter.cdsSyn = cartUsualBoolean(cart, "hgva_include_cdsSyn", TRUE); aggvFuncFilter.cdsNonSyn = cartUsualBoolean(cart, "hgva_include_cdsNonSyn", TRUE); aggvFuncFilter.intron = cartUsualBoolean(cart, "hgva_include_intron", TRUE); aggvFuncFilter.splice = cartUsualBoolean(cart, "hgva_include_splice", TRUE); aggvFuncFilter.nonCodingExon = cartUsualBoolean(cart, "hgva_include_nonCodingExon", TRUE); aggvFuncFilter.noVariation = cartUsualBoolean(cart, "hgva_include_noVariation", TRUE); annoGratorGpVarSetFuncFilter(gpVarGrator, &aggvFuncFilter); } static void setHgvsOutOptions(struct annoGrator *gpVarGrator, char *geneTrack, struct annoFormatter *vepOut) /* Use cart variables to configure gpVarGrator's HGVS output. */ { uint hgvsOutOptions = 0; if (canDoHgvsOut(geneTrack)) { if (cartUsualBoolean(cart, "hgva_hgvsG", FALSE)) hgvsOutOptions |= HGVS_OUT_G; if (cartUsualBoolean(cart, "hgva_hgvsCN", FALSE)) hgvsOutOptions |= HGVS_OUT_CN; if (cartUsualBoolean(cart, "hgva_hgvsP", FALSE)) hgvsOutOptions |= HGVS_OUT_P; if (cartUsualBoolean(cart, "hgva_hgvsPAddParens", FALSE)) hgvsOutOptions |= HGVS_OUT_P_ADD_PARENS; if (cartUsualBoolean(cart, "hgva_hgvsBreakDelIns", FALSE)) hgvsOutOptions |= HGVS_OUT_BREAK_DELINS; } annoGratorGpVarSetHgvsOutOptions(gpVarGrator, hgvsOutOptions); annoFormatVepSetHgvsOutOptions(vepOut, hgvsOutOptions); } struct annoGrator *gratorForSnpBed4(struct hash *gratorsByName, char *suffix, struct annoAssembly *assembly, char *chrom, enum annoGratorOverlap overlapRule, char **retDescription) /* Look up snpNNNsuffix; if we find it, return a grator (possibly for a bigBed 4 file), * otherwise return NULL. */ { char *fileName = NULL; struct trackDb *tdb = NULL; if (! findSnpBed4(suffix, &fileName, &tdb)) return NULL; struct annoGrator *grator = NULL; // First look in gratorsByName to see if this grator has already been constructed: if (tdb != NULL) { grator = hashFindVal(gratorsByName, tdb->table); if (retDescription) *retDescription = cloneString(tdb->longLabel); } if (grator != NULL) { // Set its overlap behavior and we're done. grator->setOverlapRule(grator, overlapRule); return grator; } // If not in gratorsByName, then attempt to construct it here: if (fileName != NULL) grator = hAnnoGratorFromBigFileUrl(fileName, NULL, assembly, ANNO_NO_LIMIT, overlapRule); else grator = hAnnoGratorFromTrackDb(assembly, tdb->table, tdb, chrom, ANNO_NO_LIMIT, NULL, overlapRule, NULL); if (grator != NULL) hashAdd(gratorsByName, tdb->table, grator); return grator; } char *tableNameFromSourceName(char *sourceName) /* Strip sourceName (file path or db.table) to just the basename or table name. */ { char *tableName = cloneString(sourceName); char *p = strrchr(tableName, '/'); if (p != NULL) { // file path; strip to basename char dir[PATH_LEN], name[FILENAME_LEN], extension[FILEEXT_LEN]; splitPath(tableName, dir, name, extension); safecpy(tableName, strlen(tableName)+1, name); } else { // database.table -- skip database. p = strchr(tableName, '.'); if (p != NULL) tableName = p+1; } return tableName; } char *tagFromTableName(char *tableName, char *suffix) /* Generate a tag for VEP's extras column or VCF's info column. */ { char *p = strstr(tableName, "dbNsfp"); if (p != NULL) tableName = p + strlen("dbNsfp"); int suffixLen = (suffix == NULL) ? 0 : strlen(suffix); int tagSize = strlen(tableName) + suffixLen + 1; char *tag = cloneStringZ(tableName, tagSize); if (isNotEmpty(suffix)) safecat(tag, tagSize, suffix); touppers(tag); // Some custom shortenings, to avoid very long tag names: (void)strSwapStrs(tag, tagSize, "POLYPHEN", "PP"); (void)strSwapStrs(tag, tagSize, "MUTATION", "MUT"); (void)strSwapStrs(tag, tagSize, "PHYLOP", "PHP"); (void)strSwapStrs(tag, tagSize, "PHASTCONS", "PHC"); (void)strSwapStrs(tag, tagSize, "ELEMENTS", "EL"); (void)strSwapStrs(tag, tagSize, "PRIMATES", "PRIM"); (void)strSwapStrs(tag, tagSize, "PLACENTAL", "PLAC"); if (regexMatch(tag, "^PH.*[0-9]WAY")) (void)strSwapStrs(tag, tagSize, "WAY", "W"); (void)strSwapStrs(tag, tagSize, "WGENCODEREGDNASECLUSTERED", "DNASE"); (void)strSwapStrs(tag, tagSize, "WGENCODEREGTFBSCLUSTERED", "TFBS"); return tag; } enum PolyPhen2Subset stripSubsetFromTrackName(char *trackName) /* trackName may have a _suffix for a subset of PolyPhen2; convert that to enum * and zero out the suffix so we have the real trackName. */ { enum PolyPhen2Subset subset = noSubset; char *p = strchr(trackName, ':'); if (p != NULL) { if (sameString(p+1, "HDIV")) subset = HDIV; else if (sameString(p+1, "HVAR")) subset = HVAR; else errAbort("unrecognized suffix in track_suffix '%s'", trackName); *p = '\0'; } return subset; } void updateGratorListAndVepExtra(struct annoGrator *grator, struct annoGrator **pGratorList, struct annoFormatter *vepOut, enum PolyPhen2Subset subset, char *column, char *description, boolean isReg) /* If grator is non-NULL, add it to gratorList and vepOut's list of items for EXTRAs column. */ { if (grator == NULL) return; slAddHead(pGratorList, grator); if (vepOut != NULL) { char *tableName = tableNameFromSourceName(grator->streamer.name); char *suffix = NULL; if (subset == HDIV) suffix = "HDIV"; else if (subset == HVAR) suffix = "HVAR"; char *tag = tagFromTableName(tableName, suffix); if (isEmpty(description)) description = grator->streamer.name; if (isReg) annoFormatVepAddRegulatory(vepOut, (struct annoStreamer *)grator, tag, description, column); else annoFormatVepAddExtraItem(vepOut, (struct annoStreamer *)grator, tag, description, column, FALSE); } } INLINE void updateGratorList(struct annoGrator *grator, struct annoGrator **pGratorList) /* If grator is non-NULL, add it to gratorList. */ { updateGratorListAndVepExtra(grator, pGratorList, NULL, 0, NULL, NULL, FALSE); } void addDbNsfpSeqChange(char *trackName, struct annoAssembly *assembly, struct hash *gratorsByName, struct annoGrator **pGratorList) // If the user has selected dbNsfp* data, we also need the underlying dbNsfpSeqChange // data, so annoFormatVep can tell whether the variant and gpFx are consistent with the // variant and transcript that dbNsfp used to calculate scores. { //#*** Yet another place where we need metadata: char *seqChangeTable = "dbNsfpSeqChange"; if (hashFindVal(gratorsByName, seqChangeTable) == NULL) { char *fileName = fileNameFromTable(seqChangeTable); if (fileName == NULL) errAbort("'%s' requested, but I can't find fileName for %s", trackName, seqChangeTable); struct annoGrator *grator = hAnnoGratorFromBigFileUrl(fileName, NULL, assembly, ANNO_NO_LIMIT, agoNoConstraint); updateGratorList(grator, pGratorList); hashAdd(gratorsByName, seqChangeTable, grator); } } static struct dyString *dyInfo = NULL; struct hash *getTrackFilterVars(char *track) /* Return a hash of filter variable names (cart variable suffixes) to slName lists of values. */ { char filterPrefix[512]; safef(filterPrefix, sizeof(filterPrefix), "hgva_filter_%s_", track); struct slPair *filterVars = cartVarsWithPrefix(cart, filterPrefix), *var; int prefixLen = strlen(filterPrefix); struct hash *varHash = hashNew(0); for (var = filterVars; var != NULL; var = var->next) { char *varName = var->name+prefixLen; char *val = var->val; struct hashEl *hel = hashLookup(varHash, varName); if (hel != NULL) slNameAddHead((struct slName **)(&hel->val), val); else hashAdd(varHash, varName, slNameNew(val)); } return varHash; } INLINE boolean isNotAll(struct slName *valList) /* Return TRUE unless valList has one element with name "All" (for multiselects). */ { if (slCount(valList) == 1 && sameString(valList->name, "All")) return FALSE; return TRUE; } void factorSourceGratorAddFilter(struct annoGrator *grator, char *name, struct slName *valList) /* Add filter to factorSource grator. */ //#*** Do these smarts belong here in hgVai? Probably not -- should be an hg/lib module with //#*** UI/metadata smarts. { struct annoStreamer *gStreamer = (struct annoStreamer *)grator; struct annoFilter *filter = NULL; if (sameString(name, "name") || sameString(name, "cellType") || sameString(name, "treatment")) { if (valList && isNotAll(valList)) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afMatch, valList); } else if (sameString(name, "score")) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afGTE, valList); else errAbort("Unrecognized filter name '%s' for %s, type=factorSource", name, gStreamer->name); if (filter) gStreamer->addFilters(gStreamer, filter); } void bed5AddFilter(struct annoGrator *grator, char *name, struct slName *valList) /* Add filter to bed 5 grator. */ { struct annoStreamer *gStreamer = (struct annoStreamer *)grator; struct annoFilter *filter = NULL; if (sameString(name, "name")) { if (valList && isNotAll(valList)) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afMatch, valList); } else if (sameString(name, "score")) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afGTE, valList); else errAbort("Unrecognized filter name '%s' for %s, type=bed 5", name, gStreamer->name); if (filter) gStreamer->addFilters(gStreamer, filter); } void addFiltersToGrator(struct annoGrator *grator, struct trackDb *tdb) /* Look for filter variables in the cart and add filters to grator accordingly. */ { struct hash *varHash = getTrackFilterVars(tdb->track); struct hashEl *hel, *helList = hashElListHash(varHash); for (hel = helList; hel != NULL; hel = hel->next) { char *filterName = hel->name; struct slName *valList = hel->val; //#*** Need a much better way to dispatch... if (sameString("factorSource", tdb->type)) factorSourceGratorAddFilter(grator, filterName, valList); else if (startsWith("bed 5", tdb->type)) bed5AddFilter(grator, filterName, valList); else dyStringPrintf(dyInfo, "Ignoring %s filter %s\n", tdb->track, filterName); } hashFree(&varHash); } void addOutputTracks(struct annoGrator **pGratorList, struct hash *gratorsByName, struct annoFormatter *vepOut, struct annoAssembly *assembly, char *chrom, boolean doHtml, boolean *retHasRegulatory) // Construct grators for tracks selected to appear in EXTRAS column { boolean includeReg = cartUsualBoolean(cart, "hgva_include_regulatory", TRUE); boolean haveReg = FALSE; char trackPrefix[128]; safef(trackPrefix, sizeof(trackPrefix), "hgva_track_%s_", database); int trackPrefixLen = strlen(trackPrefix); struct slPair *trackVar, *trackVars = cartVarsWithPrefix(cart, trackPrefix); for (trackVar = trackVars; trackVar != NULL; trackVar = trackVar->next) { char *val = trackVar->val; if (! (sameWord(val, "on") || atoi(val) > 0)) continue; char *trackName = trackVar->name + trackPrefixLen; if (sameString(trackName, "dbNsfpPolyPhen2")) // PolyPhen2 must have a suffix now -- skip obsolete cartVar from existing carts continue; struct annoGrator *grator = hashFindVal(gratorsByName, trackName); if (grator != NULL) // We already have this as a grator: continue; enum PolyPhen2Subset subset = noSubset; char *description = NULL; char *column = NULL; boolean isReg = FALSE; if (startsWith("dbNsfp", trackName)) { // trackName for PolyPhen2 has a suffix for subset -- strip it if we find it: subset = stripSubsetFromTrackName(trackName); description = dbNsfpDescFromTableName(trackName, subset, doHtml); addDbNsfpSeqChange(trackName, assembly, gratorsByName, pGratorList); char *fileName = fileNameFromTable(trackName); if (fileName != NULL) grator = hAnnoGratorFromBigFileUrl(fileName, NULL, assembly, ANNO_NO_LIMIT, agoNoConstraint); } else { struct trackDb *tdb = tdbForTrack(database, trackName, &fullTrackList); if (tdb != NULL) { grator = hAnnoGratorFromTrackDb(assembly, tdb->table, tdb, chrom, ANNO_NO_LIMIT, NULL, agoNoConstraint, NULL); if (grator != NULL) { //#*** Need something more sophisticated but this works for our //#*** limited selection of extra tracks: if (asColumnFind(grator->streamer.asObj, "name") != NULL) column = "name"; addFiltersToGrator(grator, tdb); } description = tdb->longLabel; isReg = (includeReg && ((sameString(database, "hg19") && isHg19RegulatoryTrack(tdb, NULL)) || (sameString(database, "hg38") && isHg38RegulatoryTrack(tdb, NULL)))); } } haveReg |= isReg; updateGratorListAndVepExtra(grator, pGratorList, vepOut, subset, column, description, isReg); if (grator != NULL) hashAdd(gratorsByName, trackName, grator); } if (retHasRegulatory) *retHasRegulatory = haveReg; } void addFilterTracks(struct annoGrator **pGratorList, struct hash *gratorsByName, struct annoAssembly *assembly, char *chrom) // Add grators for filters (not added to vepOut): { if (!cartUsualBoolean(cart, "hgva_include_snpCommon", TRUE)) { struct annoGrator *grator = gratorForSnpBed4(gratorsByName, "Common", assembly, chrom, agoMustNotOverlap, NULL); updateGratorList(grator, pGratorList); } if (!cartUsualBoolean(cart, "hgva_include_snpMult", TRUE)) { struct annoGrator *grator = gratorForSnpBed4(gratorsByName, "Mult", assembly, chrom, agoMustNotOverlap, NULL); updateGratorList(grator, pGratorList); } if (cartUsualBoolean(cart, "hgva_require_consEl", FALSE)) { char *consElTrack = cartString(cart, "hgva_require_consEl_track"); struct annoGrator *grator = hashFindVal(gratorsByName, consElTrack); if (grator == NULL) { struct trackDb *tdb = tdbForTrack(database, consElTrack, &fullTrackList); if (tdb != NULL) grator = hAnnoGratorFromTrackDb(assembly, tdb->table, tdb, chrom, ANNO_NO_LIMIT, NULL, agoMustOverlap, NULL); updateGratorList(grator, pGratorList); } else grator->setOverlapRule(grator, agoMustOverlap); } } static void getCartPosOrDie(char **retChrom, uint *retStart, uint *retEnd) /* Get chrom:start-end from cart, errAbort if any problems. */ { -char *position = cartString(cart, hgvaRange); +char *position = windowsToAscii(cartString(cart, hgvaRange)); if (! parsePosition(position, retChrom, retStart, retEnd)) errAbort("Expected position to be chrom:start-end but got '%s'", position); } static char *sampleVariantsPath(struct annoAssembly *assembly, char *geneTrack) /* Construct a path for a trash file of contrived example variants for this assembly * and gene track. */ { char *chrom = NULL; uint start = 0, end = 0; getCartPosOrDie(&chrom, &start, &end); mkdirTrashDirectory(hgvsTrashSubDir); struct dyString *dy = dyStringCreate("%s/%s/%s_%s_%s_%u-%u.vcf", trashDir(), hgvsTrashSubDir, assembly->name, geneTrack, chrom, start, end); return dyStringCannibalize(&dy); } static char *makeMinimalVcfHeader(char *db, char *headerDefs) /* Return a string containing a simple no-genotypes VCF header. db must be non-NULL. * headerDefs can be NULL or empty or line(s) starting with '##' and ending w/'\n' */ { struct dyString *dy = dyStringCreate("##fileformat=VCFv4.2\n" "##reference=%s\n" "%s" "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", db, (headerDefs ? headerDefs : "")); return dyStringCannibalize(&dy); } static void writeMinimalVcfHeader(FILE *f, char *db, char *headerDefs) /* Write header for VCF with no genotype data. db must be non-NULL. * headerDefs can be NULL or empty or line(s) starting with '##' and ending w/'\n' */ { char *headerString = makeMinimalVcfHeader(db, headerDefs); fputs(headerString, f); freeMem(headerString); } static void mutateBase(char *pBase) /* Change *pBase into a different base. */ { static char *bases = "ACGT"; char c; // In real life the probabilities aren't even, but this is easier. while ((c = bases[(uint)(floor(drand48() * 4.0))]) == *pBase) ; *pBase = c; } static void writeMinimalVcfRow(FILE *f, struct vcfRecord *rec) /* Write a minimalist VCF row (coords, name, alleles and placeholders). */ { // VCF columns: #CHROM POS ID REF ALT QUAL FILTER INFO fprintf(f, "%s\t%u\t%s\t%s\t", rec->chrom, rec->chromStart+1, rec->name, rec->alleles[0]); // Comma-separated alt alleles: int i; for (i = 1; i < rec->alleleCount; i++) fprintf(f, "%s%s", (i > 1 ? "," : ""), rec->alleles[i]); // Placeholder qual, filter, info. fprintf(f, "\t.\t.\t.\n"); } static void writeMinimalVcfRowFromBed(FILE *f, struct bed4 *bed, struct dnaSeq *seq, uint seqOffset) /* Write a minimalist VCF row with a mutation of the reference sequence at the given * position. */ { uint indelBase = 0, start = bed->chromStart, end = bed->chromEnd; if (end != start+1) { // VCF treats indels in a special way: pos is the base to the left of the indel, // and both ref and alt alleles include the base to the left of the indel. indelBase = 1; } // Get reference allele sequence: uint refAlLen = end - start + indelBase; char ref[refAlLen+1]; uint seqStart = start - indelBase - seqOffset; safencpy(ref, sizeof(ref), seq->dna+seqStart, refAlLen); touppers(ref); // Alternate allele seq needs extra room in case of single-base insertion: char alt[refAlLen+2]; alt[0] = ref[0]; alt[1] = '\0'; if (start == end) // insertion: alt gets extra base safecpy(alt+1, sizeof(alt)-1, "A"); else if (end == start+1) // single nucleotide variant mutateBase(alt); // else deletion: leave alt truncated after the shared base to left of indel // VCF columns: #CHROM POS ID REF ALT QUAL FILTER INFO fprintf(f, "%s\t%u\t%s\t%s\t%s\t.\t.\t.\n", bed->chrom, start+1-indelBase, bed->name, ref, alt); } static void writeArbitraryVariants(FILE *f, struct annoAssembly *assembly) /* Concoct some variants at an arbitrary position on some assembly sequence. * We shouldn't ever get here unless the selected geneTrack is empty. */ { char *chrom = "NULL"; uint start = 0, end = 0; getCartPosOrDie(&chrom, &start, &end); struct bed4 *bed = bed4New(chrom, start, start, "ex_ins"); struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, start-1, start+100); writeMinimalVcfRowFromBed(f, bed, seq, start-1); dnaSeqFree(&seq); } static void addGpFromRow(struct genePred **pGpList, struct annoRow *row, boolean *pNeedCoding, boolean *pNeedNonCoding, boolean isBig) /* If row is coding and we need a coding gp, add it to pGpList and update pNeedCoding; * likewise for noncoding. */ { struct genePred *gp = isBig ? (struct genePred *)genePredFromBigGenePredRow(row->data) : genePredLoad(row->data); if (gp->cdsStart != gp->cdsEnd && *pNeedCoding) { slAddHead(pGpList, gp); *pNeedCoding = FALSE; } else if (gp->cdsStart == gp->cdsEnd && *pNeedNonCoding) { slAddHead(pGpList, gp); *pNeedNonCoding = FALSE; } } static void addGpFromPos(struct annoStreamer *geneStream, char *chrom, uint start, uint end, struct genePred **pGpList, boolean *pNeedCoding, boolean *pNeedNonCoding, struct lm *lm, boolean isBig) /* Get up to 10 rows from position; if we find one coding and one non-coding gene, * add them to pGpList and update pNeed* accordingly. */ { geneStream->setRegion(geneStream, chrom, start, end); int rowCount = 0; struct annoRow *row = NULL; while (rowCount < 10 && (*pNeedCoding || *pNeedNonCoding) && (row = geneStream->nextRow(geneStream, NULL, 0, lm)) != NULL) addGpFromRow(pGpList, row, pNeedCoding, pNeedNonCoding, isBig); } static struct genePred *genesFromPosition(struct annoStreamer *geneStream, boolean isBig, boolean *retGotCoding, boolean *retGotNonCoding) /* Try to get a coding and non-coding gene from geneStream at cart position. * If there are none, try whole chrom; if none there, first in assembly. */ { struct genePred *gpList = NULL; struct lm *lm = lmInit(0); char *chrom = NULL; uint start = 0, end = 0; getCartPosOrDie(&chrom, &start, &end); boolean needCoding = TRUE, needNonCoding = TRUE; // First, look for both coding and noncoding genes at cart position: addGpFromPos(geneStream, chrom, start, end, &gpList, &needCoding, &needNonCoding, lm, isBig); // If that didn't do it, try querying whole chrom if (needCoding || needNonCoding) addGpFromPos(geneStream, chrom, 0, 0, &gpList, &needCoding, &needNonCoding, lm, isBig); // If we still haven't found either coding or non-coding on the cart's current chrom, // try whole genome: if (needCoding && needNonCoding) addGpFromPos(geneStream, NULL, 0, 0, &gpList, &needCoding, &needNonCoding, lm, isBig); slSort(&gpList, genePredCmp); lmCleanup(&lm); if (retGotCoding) *retGotCoding = !needCoding; if (retGotNonCoding) *retGotNonCoding = !needNonCoding; return gpList; } static void addSnv(struct bed4 **pBedList, char *chrom, uint start, char *name) /* Add a single-nucleotide bed4 item. */ { slAddHead(pBedList, bed4New(chrom, start, start+1, name)); } // Stuff within this range (see gpFx.h) is considered upstream/downstream of a transcript: #define UPSTREAMFUDGE GPRANGE static struct bed4 *sampleVarBedFromGenePred(struct genePred *gp, struct annoAssembly *assembly, uint txSeqStart, uint txSeqEnd, boolean exonOnly) /* Construct imaginary variants that hit various parts of the transcript * described in gp, using reference base values from assembly. */ { uint basesLeft = gp->txStart - txSeqStart, basesRight = txSeqEnd - gp->txEnd; struct bed4 *bedList = NULL; if (!exonOnly && basesLeft > UPSTREAMFUDGE) // SNV, intergenic to the left addSnv(&bedList, gp->chrom, txSeqStart, "ex_igLeft"); if (!exonOnly && basesLeft > 0) // SNV, up/downstream to the left addSnv(&bedList, gp->chrom, gp->txStart - 1, "ex_updownLeft"); if (!exonOnly && gp->exonCount > 1) { // SNV, intron uint start = (gp->exonEnds[0] + gp->exonStarts[1]) / 2; addSnv(&bedList, gp->chrom, start, "ex_intron"); // SNV, splice addSnv(&bedList, gp->chrom, gp->exonStarts[1] - 2, "ex_splice"); } boolean isCoding = (gp->cdsStart != gp->cdsEnd); if (isCoding) { if (gp->txStart < gp->cdsStart) // SNV, UTR to the left addSnv(&bedList, gp->chrom, gp->txStart + 1, "ex_utrLeft"); int cdsExon = 0; while (gp->cdsStart > gp->exonEnds[cdsExon] && cdsExon < gp->exonCount) cdsExon++; uint start = gp->cdsStart + 2; // single-base insertion, leftmost codon slAddHead(&bedList, bed4New(gp->chrom, start, start, "ex_codonLeftIns")); // 3-base deletion leftmost codon slAddHead(&bedList, bed4New(gp->chrom, start, start+3, "ex_codonLeftDel")); // SNV, leftmost codon addSnv(&bedList, gp->chrom, start+1, "ex_codonLeft"); if (gp->txEnd > gp->cdsEnd) // SNV, UTR to the right addSnv(&bedList, gp->chrom, gp->txEnd - 1, "ex_utrRight"); } else { // noncoding exon variant uint start = (gp->exonStarts[0] + gp->exonEnds[0]) / 2; addSnv(&bedList, gp->chrom, start, "ex_nce"); } if (!exonOnly && basesRight > 0) // SNV, up/downstream to the left addSnv(&bedList, gp->chrom, gp->txEnd + 1, "ex_updownRight"); return bedList; } // margin for intergenic variants around transcript: #define IGFUDGE 5 static struct annoStreamer *makeSampleVariantsStreamer(struct annoAssembly *assembly, struct trackDb *geneTdb, int maxOutRows) /* Construct a VCF file of example variants for db (unless it already exists) * and return an annoStreamer for that file. If possible, make the variants hit a gene. */ { char *sampleFile = sampleVariantsPath(assembly, geneTdb->track); boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildSampleVariants", FALSE); if (! fileExists(sampleFile) || forceRebuild) { struct annoStreamer *geneStream = hAnnoStreamerFromTrackDb(assembly, geneTdb->table, geneTdb, NULL, ANNO_NO_LIMIT, NULL); boolean isBig = sameString(geneTdb->type, "bigGenePred"); boolean gotCoding = FALSE, gotNonCoding = FALSE; struct genePred *gpList = genesFromPosition(geneStream, isBig, &gotCoding, &gotNonCoding); FILE *f = mustOpen(sampleFile, "w"); writeMinimalVcfHeader(f, assembly->name, NULL); if (gpList == NULL) { warn("Unable to find any gene transcripts in '%s' (%s)", geneTdb->shortLabel, geneTdb->track); writeArbitraryVariants(f, assembly); } else { struct bed4 *bedList = NULL; uint chromSize = annoAssemblySeqSize(assembly, gpList->chrom); uint minSeqStart = chromSize, maxSeqEnd = 0; struct genePred *gp; for (gp = gpList; gp != NULL; gp = gp->next) { int txSeqStart = gp->txStart - (UPSTREAMFUDGE + IGFUDGE); uint txSeqEnd = gp->txEnd + UPSTREAMFUDGE + IGFUDGE; if (txSeqStart < 0) txSeqStart = 0; if (txSeqEnd > chromSize) txSeqEnd = chromSize; if (txSeqStart < minSeqStart) minSeqStart = txSeqStart; if (txSeqEnd > maxSeqEnd) maxSeqEnd = txSeqEnd; // If we got a coding gp, but this gp is non-coding, just do its exon variant: boolean exonOnly = gotCoding && (gp->cdsStart == gp->cdsEnd); slCat(&bedList, sampleVarBedFromGenePred(gp, assembly, txSeqStart, txSeqEnd, exonOnly)); } slSort(&bedList, bedCmp); struct dnaSeq *txSeq = twoBitReadSeqFragLower(assembly->tbf, gpList->chrom, minSeqStart, maxSeqEnd); struct bed4 *bed; for (bed = bedList; bed != NULL; bed = bed->next) { writeMinimalVcfRowFromBed(f, bed, txSeq, minSeqStart); } dnaSeqFree(&txSeq); bed4FreeList(&bedList); } geneStream->close(&geneStream); carefulClose(&f); } return annoStreamVcfNew(sampleFile, NULL, FALSE, assembly, maxOutRows); } static char *md5TrashPath(struct annoAssembly *assembly, char *string) /* Use the md5sum of a string to make a hopefully unique trash filename. */ { char *md5sum = md5HexForString(string); mkdirTrashDirectory(hgvsTrashSubDir); struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), hgvsTrashSubDir, assembly->name, md5sum); return dyStringCannibalize(&dy); } static struct slName *hashListNames(struct hash *hash) /* Return a list of all element names in the hash (if any). */ { struct slName *list = NULL; struct hashCookie cookie = hashFirst(hash); struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) slAddHead(&list, slNameNew(hel->name)); return list; } static char *encloseInAngleBracketsDbSnp(char *stringIn) /* Return a string that has <dbSNP: stringIn>, with spaces replaced by '_'s. */ { int stringInLen = strlen(stringIn); int stringOutLen = stringInLen + strlen("<dbSNP:>") + 1; char *stringOut = needMem(stringOutLen); safef(stringOut, stringOutLen, "<dbSNP:%s>", stringIn); subChar(stringOut, ' ', '_'); return stringOut; } // dbSNP named alleles have many ways to describe a deletion from the reference, // for example "LARGEDELETION", "LARGE DELETION", "... DELETED", "... DEL": static const char *dbSnpDelRegex = "^\\(.*(DELET.*| DEL)\\)$"; static char **parseDbSnpAltAlleles(char *refAl, char *obsAls, boolean minusStrand, int *retAltAlCount, boolean *retNeedLeftBase) /* Given a non-symbolic reference allele and slash-sep observed alleles from dbSNP, * return an array of +-strand alleles that are not the same as the reference. * If any allele is "-" (deleted, zero-length), then set retNeedLeftBase to TRUE * because in this case VCF requires that the reference base to the left of the indel * must be added to all alleles, and the start coord also moves one base to the left. * Also, if any alt allele is symbolic, padding is required. * Note: this trashes obsAls. Resulting array can be freed but not its contents. */ { int obsCount = countChars(obsAls, '/') + 1; char *obsWords[obsCount]; chopByChar(obsAls, '/', obsWords, obsCount); boolean obsHasDeletion = FALSE; int i; for (i = 0; i < obsCount; i++) if (sameString(obsWords[i], "-")) { obsHasDeletion = TRUE; break; } char **altAls; AllocArray(altAls, obsCount); int altCount = 0; boolean needLeftBase = isEmpty(refAl) || sameString(refAl, "-"); for (i = 0; i < obsCount; i++) { char *altAl = obsWords[i]; int altAlLen = strlen(altAl); if (minusStrand && isAllNt(altAl, altAlLen)) reverseComplement(altAl, altAlLen); if (differentString(altAl, refAl)) { if (sameString(altAl, "-")) { altAls[altCount] = ""; needLeftBase = TRUE; } else { // It would be nice to expand the "(CA)11/12/14/15/16/17/18/19/20" syntax of // some dbSNP observed's. What are these?: "(D1S243)", "(D1S2870)" // Unfortunately for observed="lengthTooLong" we just can't get the correct allele // sequence. (76,130 of those in snp138) // Hmmm, I guess we could at least stick in the right number of N's if we can // parse "(245 BP INSERTION)". (2403 rows rlike "[0-9]+ BP ?INSERTION" in snp138) if (!isAllNt(altAl, altAlLen)) { // Symbolic allele: left base required, and enclose it in <dbSNP:>'s. // But if it's one of dbSNP's LARGEDELETION kind of alleles, that is redundant // with the reference allele, so if we know there is already a "-" allele, // skip it. if (obsHasDeletion && regexMatch(altAl, dbSnpDelRegex)) continue; needLeftBase = TRUE; altAl = encloseInAngleBracketsDbSnp(altAl); } altAls[altCount] = altAl; } altCount++; } } *retAltAlCount = altCount; *retNeedLeftBase = needLeftBase; return altAls; } char *firstNCommaSep(struct slName *nameList, int n) /* Return a comma-separated string with the first n names in nameList. */ { struct dyString *dy = dyStringNew(0); int i; struct slName *el; for (i=0, el=nameList; i < 5 && el != NULL; i++, el = el->next) { if (i > 0) dyStringAppend(dy, ", "); dyStringPrintf(dy, "'%s'", el->name); } return dyStringCannibalize(&dy); } //#*** Variant ID-matching should be metadata-driven too. termRegex -> data source. static const char *rsIdRegex = "^rs[0-9]+$"; static void rsIdsToVcfRecords(struct annoAssembly *assembly, struct slName *rsIds, struct vcfFile *vcff, struct vcfRecord **pRecList, struct slName **pCommentList) /* If possible, look up coordinates and alleles of dbSNP rs IDs. */ { if (rsIds == NULL) return; struct trackDb *tdb = hFindLatestSnpTrack(database, NULL, &fullTrackList); if (tdb == NULL) return; struct sqlConnection *conn = hAllocConn(assembly->name); // Build a 'name in (...)' query, and build a hash of IDs so we can test whether all were found struct dyString *dq = sqlDyStringCreate("select chrom, chromStart, chromEnd, name, strand, " "refUCSC, observed from %s where name in (", tdb->table); struct hash *idHash = hashNew(0); struct slName *id; for (id = rsIds; id != NULL; id = id->next) { tolowers(id->name); dyStringPrintf(dq, "%s'%s'", (id != rsIds ? "," : ""), id->name); hashStoreName(idHash, id->name); } dyStringAppend(dq, ");"); struct sqlResult *sr = sqlGetResult(conn, dq->string); // Construct a minimal VCF row to make a vcfRecord for each variant. char *vcfRow[9]; vcfRow[5] = vcfRow[6] = vcfRow[7] = "."; // placeholder for qual, filter, info // It would be cool to someday add snpNNN's exceptions column to the filter or info. struct dyString *dyAltAlStr = dyStringNew(0); int i; char **row; while ((row = sqlNextRow(sr)) != NULL) { char *chrom = row[0]; uint chromStart = atoll(row[1]); uint chromEnd = atoll(row[2]); char *name = row[3]; char *strand = row[4]; char refAl[max(strlen(row[5]), chromEnd-chromStart) + 2]; safecpy(refAl, sizeof(refAl), row[5]); if (sameString(refAl, "-")) refAl[0] = '\0'; else if (! isAllNt(refAl, strlen(refAl))) { // refUCSC is symbolic like "( 2548bp insertion )" -- just use the actual reference // sequence for now, to keep things simple downstream. //#*** Need something more efficient, like a sequence cache inside assembly! struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, chromStart, chromEnd); safecpy(refAl, sizeof(refAl), seq->dna); toUpperN(refAl, seq->size); dnaSeqFree(&seq); } char *obsAls = row[6]; int altAlCount = 0; boolean needLeftBase = FALSE; char **altAls = parseDbSnpAltAlleles(refAl, obsAls, sameString(strand, "-"), &altAlCount, &needLeftBase); needLeftBase |= (chromStart == chromEnd); // should be redundant, but just in case. hashRemove(idHash, name); uint vcfStart = chromStart + 1; dyStringClear(dyAltAlStr); // If this is a non-symbolic indel, we'll have to move start one base to the left // and add the reference base to the left of the actual alleles if (needLeftBase) { vcfStart--; //#*** Need something more efficient, like a sequence cache inside assembly! struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, chromStart-1, chromStart); toUpperN(seq->dna, 1); char leftBase = seq->dna[0]; dnaSeqFree(&seq); char refAlPlus[strlen(refAl)+2]; safef(refAlPlus, sizeof(refAlPlus), "%c%s", leftBase, refAl); safecpy(refAl, sizeof(refAl), refAlPlus); for (i = 0; i < altAlCount; i++) { if (i > 0) dyStringAppendC(dyAltAlStr, ','); if (isAllNt(altAls[i], strlen(altAls[i]))) dyStringPrintf(dyAltAlStr, "%c%s", leftBase, altAls[i]); else dyStringAppend(dyAltAlStr, altAls[i]); } } else { // Not an indel, just make comma-sep string of alt alleles. for (i = 0; i < altAlCount; i++) { if (i > 0) dyStringAppendC(dyAltAlStr, ','); dyStringAppend(dyAltAlStr, altAls[i]); } } char vcfStartStr[64]; safef(vcfStartStr, sizeof(vcfStartStr), "%d", vcfStart); vcfRow[0] = chrom; vcfRow[1] = vcfStartStr; vcfRow[2] = name; vcfRow[3] = refAl; vcfRow[4] = dyAltAlStr->string; struct vcfRecord *rec = vcfRecordFromRow(vcff, vcfRow); slAddHead(pRecList, rec); } dyStringFree(&dyAltAlStr); // Check for IDs not found struct slName *notFoundIds = hashListNames(idHash); if (notFoundIds != NULL) { char *namesNotFound = firstNCommaSep(notFoundIds, 5); struct dyString *dy = dyStringCreate("%d rs# IDs not found, e.g. %s", slCount(notFoundIds), namesNotFound); slAddTail(pCommentList, slNameNew(dy->string)); freeMem(namesNotFound); dyStringFree(&dy); } slNameFreeList(¬FoundIds); } static struct vcfRecord *parseVariantIds(struct annoAssembly *assembly, char *variantIdText, struct slName **pCommentList) /* Return a sorted list of minimal vcfRecords (coords, name, ref and alt alleles) * corresponding to each pasted variant. */ { struct vcfRecord *recList = NULL; char *p = cloneString(variantIdText), *id; struct slName *rsIds = NULL, *unknownIds = NULL; subChar(p, ',', ' '); while ((id = nextWord(&p)) != NULL) { if (regexMatchNoCase(id, rsIdRegex)) slNameAddHead(&rsIds, id); else slNameAddHead(&unknownIds, id); } if (unknownIds != NULL) { slReverse(&unknownIds); char *firstUnknownIds = firstNCommaSep(unknownIds, 5); struct dyString *dy = dyStringCreate("%d variant identifiers are unrecognized, e.g. %s", slCount(unknownIds), firstUnknownIds); slAddTail(pCommentList, slNameNew(dy->string)); freeMem(firstUnknownIds); dyStringFree(&dy); } struct vcfFile *vcff = vcfFileNew(); rsIdsToVcfRecords(assembly, rsIds, vcff, &recList, pCommentList); slSort(&recList, vcfRecordCmp); slNameFreeList(&rsIds); slNameFreeList(&unknownIds); return recList; } static void adjustRangeForVariants(struct vcfRecord *recList, char **pChrom, uint *pStart, uint *pEnd) /* Given a *sorted* list of VCF records, look at the first and last records and if * they fall outside of the range, expand the range to include them. */ { if (pChrom != NULL && *pChrom != NULL && recList != NULL) { // We have a non-empty sorted list of records, and search is not genome-wide; // look at first and last variants to see if they're not already included in search range. struct vcfRecord *first = recList, *last = recList; while (last->next) last = last->next; if (differentString(*pChrom, first->chrom) || differentString(*pChrom, last->chrom)) { // Variants span multiple chroms; force genome-wide search. *pChrom = NULL; *pStart = *pEnd = 0; } else { // Variant(s) are on the same chrom as search range; check starts and ends. if (*pStart > first->chromEnd) *pStart = first->chromEnd; if (*pEnd < last->chromStart) *pEnd = last->chromStart; } } } static struct annoStreamer *makeVariantIdStreamer(struct annoAssembly *assembly, int maxOutRows, char **pChrom, uint *pStart, uint *pEnd, struct slName **pCommentList) /* Translate user's pasted/uploaded variant IDs into minimal VCF if possible. * Return a VCF streamer for those, and if the current search position is too narrow * to include all of the variants, widen it as necessary. */ { // Hash variant text to get trash filename. Use if exists, otherwise build it. char *variantIds = cartString(cart, hgvaVariantIds); char *varFile = md5TrashPath(assembly, variantIds); boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildVariantIds", FALSE); if (! fileExists(varFile) || forceRebuild) { struct vcfRecord *varList = parseVariantIds(assembly, variantIds, pCommentList); //#*** If no variants were recognized, we should probably show main page with a warning. adjustRangeForVariants(varList, pChrom, pStart, pEnd); FILE *f = mustOpen(varFile, "w"); writeMinimalVcfHeader(f, assembly->name, NULL); struct vcfRecord *var; for (var = varList; var != NULL; var = var->next) writeMinimalVcfRow(f, var); carefulClose(&f); } return annoStreamVcfNew(varFile, NULL, FALSE, assembly, maxOutRows); } static struct vcfRecord *parseHgvs(struct vcfFile *vcff, struct annoAssembly *assembly, char *hgvsTerms, struct slName **pCommentList) /* Return a sorted list of vcfRecords . */ { struct vcfRecord *recList = NULL; struct slName *failedTerms = NULL; struct dyString *dyError = dyStringNew(0); struct lineFile *lf = lineFileOnString("user-provided HGVS terms", TRUE, cloneString(hgvsTerms)); char *term = NULL; while (lineFileNextReal(lf, &term)) { eraseTrailingSpaces(term); dyStringClear(dyError); struct vcfRow *vcfRow = hgvsToVcfRow(assembly->name, term, FALSE, dyError); if (vcfRow) { char *row[8]; row[0] = vcfRow->chrom; char posStr[64]; safef(posStr, sizeof(posStr), "%d", vcfRow->pos); row[1] = posStr; row[2] = vcfRow->id; row[3] = vcfRow->ref; row[4] = vcfRow->alt; row[5] = "."; row[6] = vcfRow->filter; row[7] = vcfRow->info; slAddHead(&recList, vcfRecordFromRow(vcff, row)); } else slNameAddHead(&failedTerms, dyStringContents(dyError)); } if (failedTerms != NULL) { slReverse(&failedTerms); char *firstUnknownIds = firstNCommaSep(failedTerms, 5); struct dyString *dy = dyStringCreate("%d HGVS terms could not be parsed and/or mapped to %s, " "e.g. %s", slCount(failedTerms), assembly->name, firstUnknownIds); slAddTail(pCommentList, slNameNew(dy->string)); freeMem(firstUnknownIds); dyStringFree(&dy); } slSort(&recList, vcfRecordCmp); slNameFreeList(&failedTerms); dyStringFree(&dyError); lineFileClose(&lf); return recList; } static struct annoStreamer *makeHgvsStreamer(struct annoAssembly *assembly, int maxOutRows, char **pChrom, uint *pStart, uint *pEnd, struct slName **pCommentList) /* Translate user's pasted/uploaded variant IDs into minimal VCF if possible. * Return a VCF streamer for those., and if the current search position is too narrow * to include all of the variants, widen it as necessary. */ { // Hash HGVS input to get trash filename. Use if exists, otherwise build it. char *hgvsTerms = cartString(cart, hgvaHgvs); char *varFile = md5TrashPath(assembly, hgvsTerms); boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildHgvs", FALSE); if (! fileExists(varFile) || forceRebuild) { char *headerString = makeMinimalVcfHeader(assembly->name, HGVS_VCF_HEADER_DEFS); struct vcfFile *vcff = vcfFileFromHeader("hgVaiHgvs", headerString, VCF_IGNORE_ERRS); struct vcfRecord *varList = parseHgvs(vcff, assembly, hgvsTerms, pCommentList); //#*** If no HGVS terms could be mapped, we should probably show main page with a warning. adjustRangeForVariants(varList, pChrom, pStart, pEnd); FILE *f = mustOpen(varFile, "w"); writeMinimalVcfHeader(f, assembly->name, HGVS_VCF_HEADER_DEFS); struct vcfRecord *var; for (var = varList; var != NULL; var = var->next) writeMinimalVcfRow(f, var); carefulClose(&f); vcfFileFree(&vcff); } return annoStreamVcfNew(varFile, NULL, FALSE, assembly, maxOutRows); } static struct trackDb *getVariantTrackDb(char *variantTrack) /* Get a tdb for the user's selected variant track, or warn and return NULL * (main page will be displayed) */ { struct trackDb *varTdb = tdbForTrack(database, variantTrack, &fullTrackList); if (varTdb == NULL) { if (isHubTrack(variantTrack)) warn("Can't find hub track '%s'", variantTrack); else warn("Can't find tdb for variant track '%s'", variantTrack); } else checkVariantTrack(varTdb); return varTdb; } static char *gencodeVersionFromTrack(char *track) /* If track is a GENCODE table, find and return a pointer to the version at the end; * otherwise return NULL. */ { if (startsWithGencodeGene(track)) { char *v = strrchr(track, 'V'); return v; } return NULL; } static char *gencodeTagTableForTrack(char *db, char *track) /* If there is a wgEncodeGencodeTag<version> table that can be associated with track, * return it; otherwise return NULL. */ { struct slName *versionList = getGencodeTagVersions(); if (startsWithGencodeGene(track)) { char *version = gencodeVersionFromTrack(track); if (version != NULL) { char table[PATH_LEN]; return cloneString(gencodeTableName("Tag", version, table, sizeof(table))); } } else if (sameString(track, "refGene") && refGeneHasGencodeTags(versionList)) { char *version = getLatestGencodeVersion(versionList); char table[PATH_LEN]; if (hTableExists(db, gencodeTableName("RefSeq", version, table, sizeof(table)))) return cloneString(gencodeTableName("Tag", version, table, sizeof(table))); } else if (sameString(track, "knownGene") && knownGeneHasGencodeTags()) { if (hTableExists(db, "knownToTag")) return cloneString("knownToTag"); } return NULL; } static struct joinerDtf *getTxStatusExtras(char *db, char *track) // Several kinds of transcript status may be enabled in the cart; if any are enabled, // and if they apply to track, return the tables & fields to be joined with the track. { struct joinerDtf *txStatusExtras = NULL; if (cartUsualBoolean(cart, "hgva_txStatus_gencode", FALSE)) { char *gencodeTagTable = gencodeTagTableForTrack(db, track); if (gencodeTagTable != NULL) { char *field = "tag"; if (sameString("knownToTag", gencodeTagTable)) field = "value"; slAddHead(&txStatusExtras, joinerDtfNew(db, gencodeTagTable, field)); } } if (cartUsualBoolean(cart, "hgva_txStatus_knownCanonical", FALSE) && sameString(track, "knownGene") && hTableExists(db, "knownCanonical")) { slAddHead(&txStatusExtras, joinerDtfNew(db, "knownCanonical", "transcript")); } if (cartUsualBoolean(cart, "hgva_txStatus_refSeqStatus", FALSE) && sameString(track, "refGene") && genbankTableExists(db, refSeqStatusTable)) { char *genbankDb=NULL, *tableName=NULL; genbankGetDbTable(db, refSeqStatusTable, &genbankDb, &tableName); slAddHead(&txStatusExtras, joinerDtfNew(genbankDb, tableName, "status")); } return txStatusExtras; } static void configAddDtf(struct dyString *dy, struct joinerDtf *dtf, boolean *pIsFirst) /* Add a JSON object with [db].table and (list of one) field. */ { if (! *pIsFirst) dyStringAppend(dy, ", "); // The convention for related tables in same db as track is to prepend "." instead of "<db>.": char *db = (sameString(dtf->database, database) ? "" : dtf->database); dyStringPrintf(dy, "{ \"table\": \"%s.%s\", \"fields\": [\"%s\"] }", db, dtf->table, dtf->field); *pIsFirst = FALSE; } static void configAddTableField(struct dyString *dy, char *table, char *field, boolean *pIsFirst) /* Add a JSON object with current database, table and (list of one) field. */ { struct joinerDtf dtf; dtf.database = database; dtf.table = table; dtf.field = field; configAddDtf(dy, &dtf, pIsFirst); } static struct jsonElement *configForStreamer(char *db, struct trackDb *tdb, struct joinerDtf *txStatusExtras) /* Add VAI-specific config options, if applicable. */ { struct jsonElement *config = NULL; char *track = tdb->track; struct dyString *dyConfig = dyStringCreate("{ \"naForMissing\": false," " \"relatedTables\": [ "); boolean isFirst = TRUE; // If track is sql-based knownGene and we have kgXref, then add kgXref.geneSymbol after // the columns of knownGene. if (sameString(track, "knownGene") && !isCustomTrack(track) && !isHubTrack(track) && !trackDbSetting(tdb, "bigDataUrl") && hTableExists(db, "kgXref")) { configAddTableField(dyConfig, "kgXref", "geneSymbol", &isFirst); } struct joinerDtf *txStatDtf; for (txStatDtf = txStatusExtras; txStatDtf != NULL; txStatDtf = txStatDtf->next) configAddDtf(dyConfig, txStatDtf, &isFirst); // If any of the above apply, close the relatedTables list and config object // and parse into jsonElements. if (! isFirst) { dyStringAppend(dyConfig, " ] }"); config = jsonParse(dyConfig->string); dyStringFree(&dyConfig); } return config; } static void adjustGpVarOverlapRule(struct annoGrator *gpVarGrator, boolean haveRegulatory) /* If we're able to detect regulatory elements, and want to keep those annotations, loosen up * gpVarGrator's overlap rule from the default (must overlap). */ { if (haveRegulatory && cartUsualBoolean(cart, "hgva_include_regulatory", TRUE)) gpVarGrator->setOverlapRule(gpVarGrator, agoNoConstraint); } static void addTxStatusExtras(struct annoFormatter *vepOut, char *geneTrack, struct annoGrator *gpVarGrator, struct joinerDtf *txStatusExtras) /* Given a list of tables and fields that will be joined with geneTrack to provide transcript * status info, configure vepOut to put them in the EXTRAs column. */ { struct joinerDtf *txStatDtf; for (txStatDtf = txStatusExtras; txStatDtf != NULL; txStatDtf = txStatDtf->next) { char *tag = NULL, *description = NULL; boolean isBoolean = FALSE; // Double-check that we're not joining in some table from a different assembly database: if (differentString(txStatDtf->database, database) && differentString(txStatDtf->database, "hgFixed")) errAbort("addTxStatusExtras: Expected db=%s or hgFixed in txStatDtf but got %s", database, txStatDtf->database); if ((startsWith(GENCODE_PREFIX"Tag", txStatDtf->table) && sameString(txStatDtf->field, "tag")) || (sameString(txStatDtf->table, "knownToTag") && sameString(txStatDtf->field, "value"))) { tag = "GENCODE_TAG"; description = "<A HREF=" GENCODE_TAG_DOC_URL " " "TARGET=_BLANK>GENCODE tags</A> for the transcript"; } else if (sameString(txStatDtf->table, "knownCanonical") && sameString(txStatDtf->field, "transcript")) { tag = "CANONICAL"; description = "If present, the transcript is the 'canonical' transcript of the gene " "(generally the longest isoform of the gene)"; isBoolean = TRUE; } else if (sameString(txStatDtf->table, "refSeqStatus") && sameString(txStatDtf->field, "status")) { tag = "REFSEQ_STATUS"; description = "<A HREF=" REFSEQ_STATUS_DOC_URL " " "TARGET=_BLANK>RefSeq status</A> of the transcript"; } else { errAbort("addTxStatusExtras: Unrecognized {table,field}: {%s,%s}", txStatDtf->table, txStatDtf->field); } char *column = annoStreamDbColumnNameFromDtf(database, geneTrack, txStatDtf); annoFormatVepAddExtraItem(vepOut, (struct annoStreamer *)gpVarGrator, tag, description, column, isBoolean); } } void doQuery() /* Translate simple form inputs into anno* components and execute query. */ { dyInfo = dyStringNew(0); char *chrom = NULL; uint start = 0, end = 0; if (sameString(regionType, hgvaRegionTypeRange)) getCartPosOrDie(&chrom, &start, &end); struct annoAssembly *assembly = hAnnoGetAssembly(database); boolean isCommandLine = (cgiOptionalString("cgiSpoof") != NULL); char *geneTrack = cartString(cart, "hgva_geneTrack"); struct trackDb *geneTdb = tdbForTrack(database, geneTrack, &fullTrackList); if (geneTdb == NULL) { warn("Can't find tdb for gene track %s", geneTrack); if (! isCommandLine) doUi(); return; } int maxVarRows = cartUsualInt(cart, "hgva_variantLimit", 10); struct annoStreamer *primary = NULL; char *primaryLongLabel = NULL; char *variantTrack = cartString(cart, "hgva_variantTrack"); struct slName *commentList = NULL; if (sameString(variantTrack, hgvaSampleVariants)) { primary = makeSampleVariantsStreamer(assembly, geneTdb, maxVarRows); primaryLongLabel = hgvaSampleVariantsLabel; // Sample variants can't always be made within the currently selected position range, // so just for these, force search to be genome-wide. chrom = NULL; start = 0; end = 0; } else if (sameString(variantTrack, hgvaUseVariantIds)) { // Override search position if cart position doesn't include all variants: primary = makeVariantIdStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList); primaryLongLabel = hgvaVariantIdsLabel; } else if (sameString(variantTrack, hgvaUseHgvs)) { primary = makeHgvsStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList); primaryLongLabel = hgvaHgvsLabel; } else if (sameString(variantTrack, hgvaUseVariantFileOrUrl)) { char *fileOrUrl = cartString(cart, hgvaVariantFileOrUrl); char *type = cartOptionalString(cart, hgvaVariantFileOrUrlType); primary = hAnnoStreamerFromBigFileUrl(fileOrUrl, NULL, assembly, maxVarRows, type); primaryLongLabel = hgvaVariantFileOrUrlLabel; } else { struct trackDb *varTdb = getVariantTrackDb(variantTrack); if (varTdb == NULL) { if (! isCommandLine) doUi(); return; } primary = hAnnoStreamerFromTrackDb(assembly, varTdb->table, varTdb, chrom, maxVarRows, NULL); primaryLongLabel = varTdb->longLabel; } enum annoGratorOverlap geneOverlapRule = agoMustOverlap; struct joinerDtf *txStatusExtras = getTxStatusExtras(database, geneTrack); struct jsonElement *gpConfig = configForStreamer(database, geneTdb, txStatusExtras); struct annoGrator *gpVarGrator = hAnnoGratorFromTrackDb(assembly, geneTdb->table, geneTdb, chrom, ANNO_NO_LIMIT, primary->asObj, geneOverlapRule, gpConfig); setGpVarFuncFilter(gpVarGrator); // Some grators may be used as both filters and output values. To avoid making // multiple grators for the same source, hash them by trackName: struct hash *gratorsByName = hashNew(8); struct annoGrator *snpGrator = NULL; char *snpDesc = NULL; if (cartUsualBoolean(cart, "hgva_rsId", TRUE)) snpGrator = gratorForSnpBed4(gratorsByName, "", assembly, chrom, agoNoConstraint, &snpDesc); // Now construct gratorList in the order in which annoFormatVep wants to see them, // i.e. first the gpVar, then the snpNNN, then whatever else: struct annoGrator *gratorList = NULL; slAddHead(&gratorList, gpVarGrator); if (snpGrator != NULL) slAddHead(&gratorList, snpGrator); // Text or HTML output? char *outFormat = cartUsualString(cart, "hgva_outFormat", "vepTab"); boolean doHtml = sameString(outFormat, "vepHtml"); // Initialize VEP formatter: struct annoFormatter *vepOut = annoFormatVepNew("stdout", doHtml, primary, primaryLongLabel, (struct annoStreamer *)gpVarGrator, geneTdb->longLabel, (struct annoStreamer *)snpGrator, snpDesc, assembly); addTxStatusExtras(vepOut, geneTrack, gpVarGrator, txStatusExtras); boolean haveRegulatory = FALSE; addOutputTracks(&gratorList, gratorsByName, vepOut, assembly, chrom, doHtml, &haveRegulatory); adjustGpVarOverlapRule(gpVarGrator, haveRegulatory); setHgvsOutOptions(gpVarGrator, geneTdb->track, vepOut); addFilterTracks(&gratorList, gratorsByName, assembly, chrom); slReverse(&gratorList); if (doHtml) { webStart(cart, database, "Annotated Variants in VEP/HTML format"); } else if (! isCommandLine) { // Undo the htmlPushEarlyHandlers() because after this point they make ugly text: popWarnHandler(); popAbortHandler(); textOpen(); webStartText(); } if (isNotEmpty(dyInfo->string)) puts(dyInfo->string); struct annoGratorQuery *query = annoGratorQueryNew(assembly, primary, gratorList, vepOut); struct slName *comment; for (comment = commentList; comment != NULL; comment = comment->next) vepOut->comment(vepOut, comment->name); if (chrom != NULL) annoGratorQuerySetRegion(query, chrom, start, end); annoGratorQueryExecute(query); annoGratorQueryFree(&query); if (doHtml) webEnd(); else if (! isCommandLine) textOutClose(&compressPipeline, NULL); } int main(int argc, char *argv[]) /* Process command line. */ { long enteredMainTime = clock1000(); if (hIsPrivateHost()) pushCarefulMemHandler(LIMIT_2or6GB); cgiSpoof(&argc, argv); boolean isCommandLine = (cgiOptionalString("cgiSpoof") != NULL); if (!isCommandLine) htmlPushEarlyHandlers(); /* Make errors legible during initialization. */ oldVars = hashNew(10); boolean startQuery = (cgiUsualString("hgva_startQuery", NULL) != NULL); if (startQuery) { if (isCommandLine) // No HTTP header for command-line use. cart = cartForSession(hUserCookie(), excludeVars, oldVars); else cart = cartAndCookieNoContent(hUserCookie(), excludeVars, oldVars); } else cart = cartAndCookie(hUserCookie(), excludeVars, oldVars); // Try to deal with virt chrom position used by hgTracks. if (startsWith("virt:", cartUsualString(cart, "position", ""))) cartSetString(cart, "position", cartUsualString(cart, "nonVirtPosition", "")); /* Set up global variables. */ getDbAndGenome(cart, &database, &genome, oldVars); initGenbankTableNames(database); regionType = cartUsualString(cart, hgvaRegionType, hgvaRegionTypeGenome); if (isEmpty(cartOptionalString(cart, hgvaRange))) cartSetString(cart, hgvaRange, hDefaultPos(database)); int timeout = cartUsualInt(cart, "udcTimeout", 300); if (udcCacheTimeout() < timeout) udcSetCacheTimeout(timeout); knetUdcInstall(); cartTrackDbInit(cart, &fullTrackList, &fullGroupList, TRUE); if (lookupPosition(cart, hgvaRange)) { if (startQuery) doQuery(); else if (! isCommandLine) doUi(); } else { // Revert to lastPosition if we have multiple matches or warnings, // especially in case user manually edits browser location as in #13009: char *position = cartUsualString(cart, "lastPosition", hDefaultPos(database)); cartSetString(cart, hgvaRange, position); if (webGotWarnings()) { // We land here when lookupPosition pops up a warning box. // Reset the problematic position and show the main page. doMainPage(); } // If lookupPosition returned FALSE and didn't report warnings, // then it wrote HTML showing multiple position matches & links. } cartCheckout(&cart); if (! isCommandLine) cgiExitTime("hgVai", enteredMainTime); return 0; }