b7b15a261ae3e28941a2891ee568c9b472fcda7e angie Wed Jul 30 14:44:14 2014 -0700 Added jsMakeSetClearContainer / jsEndContainer to jsHelper.c, and apppliedthem to a couple long lists of checkboxes in hgVai. While at it, I moved jsHelper.c's javascript-inlined-in-HTML out to a new .js file, jsHelper.js. diff --git src/hg/hgVai/hgVai.c src/hg/hgVai/hgVai.c index c0a6e81..b5c68d0 100644 --- src/hg/hgVai/hgVai.c +++ src/hg/hgVai/hgVai.c @@ -1,2317 +1,2321 @@ /* hgVai - Variant Annotation Integrator. */ /* Copyright (C) 2014 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "jksql.h" #include "htmshell.h" #include "web.h" #include "cheapcgi.h" #include "cart.h" #include "hui.h" #include "grp.h" #include "hCommon.h" #include "hgFind.h" #include "hPrint.h" #include "jsHelper.h" #include "memalloc.h" #include "textOut.h" #include "trackHub.h" #include "hubConnect.h" #include "twoBit.h" #include "gpFx.h" #include "udc.h" #include "knetUdc.h" #include "md5.h" #include "regexHelper.h" #include "annoGratorQuery.h" #include "annoGratorGpVar.h" #include "annoFormatVep.h" #include "annoStreamBigBed.h" #include "libifyMe.h" /* Global Variables */ struct cart *cart; /* CGI and other variables */ struct hash *oldVars = NULL; /* The cart before new cgi stuff added. */ char *genome = NULL; /* Name of genome - mouse, human, etc. */ char *database = NULL; /* Current genome database - hg17, mm5, etc. */ char *regionType = NULL; /* genome, ENCODE pilot regions, or specific position range. */ struct grp *fullGroupList = NULL; /* List of all groups. */ struct trackDb *fullTrackList = NULL; /* List of all tracks in database. */ static struct pipeline *compressPipeline = (struct pipeline *)NULL; // Null terminated list of CGI Variables we don't want to save permanently: char *excludeVars[] = {"Submit", "submit", "hgva_startQuery", NULL,}; #define hgvaRange "position" #define hgvaRegionType "hgva_regionType" #define hgvaRegionTypeEncode "encode" #define hgvaRegionTypeGenome "genome" #define hgvaRegionTypeRange "range" #define hgvaPositionContainer "positionContainer" #define hgvaSampleVariants "hgva_internally_generated_sample_variants" #define hgvaSampleVariantsLabel "Artificial Example Variants" #define hgvaUseVariantIds "hgva_useVariantIds" #define hgvaVariantIdsLabel "Variant Identifiers" #define hgvaVariantIds "hgva_variantIds" #define hgvaVariantPasteContainer "variantPasteContainer" void addSomeCss() /*#*** This should go in a .css file of course. */ { printf("<style>\n" "div.sectionLite { border-width: 1px; border-color: #"HG_COL_BORDER"; border-style: solid;" " background-color: #"HG_COL_INSIDE"; padding-left: 10px; padding-right: 10px; " " padding-top: 8px; padding-bottom: 5px; margin-top: 5px; margin-bottom: 5px }\n" ".sectionLiteHeader { font-weight: bold; font-size:larger; color:#000000;" " text-align:left; vertical-align:bottom; white-space:nowrap; }\n" "div.sectionLiteHeader.noReorderRemove { padding-bottom:5px; }\n" "div.sourceFilter { padding-top: 5px; padding-bottom: 5px }\n" "</style>\n"); } INLINE void startCollapsibleSection(char *sectionSuffix, char *title, boolean onByDefault) // Wrap shared args to jsBeginCollapsibleSectionFontSize { jsBeginCollapsibleSectionFontSize(cart, "hgva", sectionSuffix, title, onByDefault, "1.1em"); } INLINE void startSmallCollapsibleSection(char *sectionSuffix, char *title, boolean onByDefault) // Wrap shared args to jsBeginCollapsibleSectionFontSize { jsBeginCollapsibleSectionFontSize(cart, "hgva", sectionSuffix, title, onByDefault, "0.9em"); } #define endCollapsibleSection jsEndCollapsibleSection static struct dyString *onChangeStart() /* Start up a javascript onChange command */ { struct dyString *dy = jsOnChangeStart(); jsTextCarryOver(dy, hgvaRegionType); jsTextCarryOver(dy, hgvaRange); return dy; } static char *onChangeClade() /* Return javascript executed when they change clade. */ { struct dyString *dy = onChangeStart(); jsDropDownCarryOver(dy, "clade"); dyStringAppend(dy, " document.hiddenForm.org.value=0;"); dyStringAppend(dy, " document.hiddenForm.db.value=0;"); dyStringAppend(dy, " document.hiddenForm." hgvaRange ".value='';"); return jsOnChangeEnd(&dy); } static char *onChangeOrg() /* Return javascript executed when they change organism. */ { struct dyString *dy = onChangeStart(); jsDropDownCarryOver(dy, "clade"); jsDropDownCarryOver(dy, "org"); dyStringAppend(dy, " document.hiddenForm.db.value=0;"); dyStringAppend(dy, " document.hiddenForm." hgvaRange ".value='';"); return jsOnChangeEnd(&dy); } static char *onChangeDb() /* Return javascript executed when they change database. */ { struct dyString *dy = onChangeStart(); jsDropDownCarryOver(dy, "clade"); jsDropDownCarryOver(dy, "db"); dyStringAppend(dy, " document.hiddenForm." hgvaRange ".value='';"); return jsOnChangeEnd(&dy); } INLINE void printOption(char *val, char *selectedVal, char *label) /* For rolling our own select without having to build conditional arrays/lists. */ { printf("<OPTION VALUE='%s'%s>%s\n", val, (sameString(selectedVal, val) ? " SELECTED" : ""), label); } void printRegionListHtml(char *db) /* Make a dropdown choice of region type, with position input box that appears if needed. * Unlike hgTables, don't bother with ENCODE pilot regions -- unless someone misses it. * Return the selected region type. */ { printf("<SELECT ID='"hgvaRegionType"' NAME='"hgvaRegionType"' " "onchange=\"hgva.changeRegion();\">\n"); printOption(hgvaRegionTypeGenome, regionType, "genome"); printOption(hgvaRegionTypeRange, regionType, "position or search term"); printf("</SELECT>"); } void topLabelSpansStart(char *label) { printf("<span style='display: inline-block; padding-right: 5px;'>" "<span style='display: block;'>%s</span>\n" "<span style='display: block; padding-bottom: 5px;'>\n", label); } void topLabelSpansEnd() { printf("</span></span>"); } char *makePositionInput() /* Return HTML for the position input. */ { struct dyString *dy = dyStringCreate("<INPUT TYPE=TEXT NAME=\"%s\" SIZE=%d VALUE=\"%s\">", hgvaRange, 45, addCommasToPos(NULL, cartString(cart, hgvaRange))); return dyStringCannibalize(&dy); } void printCtAndHubButtons() /* Print a div with buttons for hgCustom and hgHubConnect */ { puts("<div style='padding-top: 5px; padding-bottom: 5px'>"); hOnClickButton("document.customTrackForm.submit(); return false;", hasCustomTracks(cart) ? CT_MANAGE_BUTTON_LABEL : CT_ADD_BUTTON_LABEL); printf(" "); if (hubConnectTableExists()) hOnClickButton("document.trackHubForm.submit(); return false;", "track hubs"); nbSpaces(3); printf("To reset <B>all</B> user cart settings (including custom tracks), \n" "<A HREF=\"/cgi-bin/cartReset?destination=%s\">click here</A>.\n", cgiScriptName()); puts("</div>"); } void hgGatewayCladeGenomeDb() /* Make a row of labels and row of buttons like hgGateway, but not using tables. */ { boolean gotClade = hGotClade(); if (gotClade) { topLabelSpansStart("clade"); printCladeListHtml(genome, onChangeClade()); topLabelSpansEnd(); } topLabelSpansStart("genome"); if (gotClade) printGenomeListForCladeHtml(database, onChangeOrg()); else printGenomeListHtml(database, onChangeOrg()); topLabelSpansEnd(); topLabelSpansStart("assembly"); printAssemblyListHtml(database, onChangeDb()); topLabelSpansEnd(); puts("<BR>"); topLabelSpansStart("region to annotate"); printRegionListHtml(database); topLabelSpansEnd(); topLabelSpansStart(""); // Yet another span, for hiding/showing position input and lookup button: printf("<span id='"hgvaPositionContainer"'%s>\n", differentString(regionType, hgvaRegionTypeRange) ? " style='display: none;'" : ""); puts(makePositionInput()); printf("</span>\n"); topLabelSpansEnd(); puts("<BR>"); } void printAssemblySection() /* Print assembly-selection stuff. * Redrawing the whole page when the assembly changes seems fine to me. */ { //#*** ---------- More copied verbatim, from hgTables/mainPage.c: ----------- /* Hidden form - for benefit of javascript. */ { static char *saveVars[] = { "clade", "org", "db", hgvaRange, hgvaRegionType }; jsCreateHiddenForm(cart, cgiScriptName(), saveVars, ArraySize(saveVars)); } /* Hidden form for jumping to custom tracks CGI. */ printf("<FORM ACTION='%s' NAME='customTrackForm'>", hgCustomName()); cartSaveSession(cart); printf("</FORM>\n"); /* Hidden form for jumping to track hub manager CGI. */ printf("<FORM ACTION='%s' NAME='trackHubForm'>", hgHubConnectName()); //#*** well, almost verbatim... lib version should use cgiScriptName. cartSaveSession(cart); printf("</FORM>\n"); printf("<FORM ACTION=\"%s\" NAME=\"mainForm\" ID=\"mainForm\" METHOD=%s>\n", cgiScriptName(), cartUsualString(cart, "formMethod", "GET")); cartSaveSession(cart); //#*** ------------------ end verbatim --------------- printf("<div class='sectionLiteHeader noReorderRemove'>" "Select Genome Assembly and Region</div>\n"); /* Print clade, genome and assembly line. */ hgGatewayCladeGenomeDb(); //#*** No longer ending form here... } void factorSourceListInputProperties(struct trackDb *tdb, struct slName **retFactorList, struct slName **retCellTypeList, struct slName **retTreatmentList) /* Get a list of factor names used in factorSource track. */ { char *inputsTable = trackDbSetting(tdb, "inputTrackTable"); if (isEmpty(inputsTable)) errAbort("track %s does not have an inputTrackTable setting", tdb->track); char query[2048]; struct sqlConnection *conn = hAllocConn(database); if (retFactorList && retCellTypeList && retTreatmentList) { sqlSafef(query, sizeof(query), "select distinct(factor) from %s order by factor", inputsTable); *retFactorList = sqlQuickList(conn, query); sqlSafef(query, sizeof(query), "select distinct(cellType) from %s order by cellType", inputsTable); *retCellTypeList = sqlQuickList(conn, query); sqlSafef(query, sizeof(query), "select distinct(treatment) from %s order by treatment", inputsTable); *retTreatmentList = sqlQuickList(conn, query); } else errAbort("factorSourceListInputProperties: ret args must be non-NULL."); hFreeConn(&conn); } void printMultiselect(char *track, char *label, char *var, struct slName *optionList) /* Print a label and multi-select, with hgva_track_var as cart var, with options in optionList, * marking as selected any options stored in cart. */ { printf("%s: ", label); char cartVar[1024]; safef(cartVar, sizeof(cartVar), "hgva_filter_%s_%s", track, var); struct hash *selHash = NULL; if (cartListVarExists(cart, cartVar)) selHash = hashFromSlNameList(cartOptionalSlNameList(cart, cartVar)); printf("<select id=\"%s\" name=\"%s\" style=\"display: none; font-size:.9em;\" " "class=\"filterBy\" multiple>\n", cartVar, cartVar); char *selected = ""; if (!selHash || hashLookup(selHash, "All")) selected = " selected"; printf("<option%s>All</option>", selected); struct slName *option; for (option = optionList; option != NULL; option = option->next) { selected = ""; if (selHash && hashLookup(selHash, option->name)) selected = " selected"; printf("<option%s>%s</option>", selected, option->name); } printf("</select>\n"); char shadowVar[1024]; safef(shadowVar, sizeof(shadowVar), "%s%s", cgiMultListShadowPrefix(), cartVar); cgiMakeHiddenVar(shadowVar, "1"); //printf("<script>$(document).ready(function(){ ddcl.setup($('#%s')[0]); });</script>\n", cartVar); } void printFilterOptions(struct trackDb *tdb) /* Print a collapsible filter section for tdb, with controls depending on tdb->type. */ { char sectionName[512], cartVar[512]; safef(sectionName, sizeof(sectionName), "%s_filter", tdb->track); if (sameString(tdb->type, "factorSource")) { puts("<TABLE>"); startSmallCollapsibleSection(sectionName, "filter items", FALSE); struct slName *factorOptions = NULL, *cellTypeOptions = NULL, *treatmentOptions = NULL; factorSourceListInputProperties(tdb, &factorOptions, &cellTypeOptions, &treatmentOptions); printMultiselect(tdb->track, "factor", "name", factorOptions); printMultiselect(tdb->track, "cell type", "cellType", cellTypeOptions); printMultiselect(tdb->track, "treatment", "treatment", treatmentOptions); puts("<BR>"); puts("minimum peak score [0-1000]: "); safef(cartVar, sizeof(cartVar), "hgva_filter_%s_score", tdb->track); char *defaultScore = cartUsualString(cart, cartVar, "0"); printf("<input type=text name=\"%s\" size=12 value=\"%s\"><BR>", cartVar, defaultScore); // The dimensions of ui-dropdownchecklist multiselects are not correct when // the item is hidden. So, when this filter section is made visible, reinit them. printf("<script>\n" "$(function(){" "$('tr[id^=\"%s-\"]').bind('show'," " function(jqev) { \n" " var $multisels = $(jqev.target).find('.filterBy');\n" " var multiselElList = $multisels.each(function(ix, el){ return el; });\n" " ddcl.reinit(multiselElList);" " });\n" "});" "</script>\n", sectionName); puts("</TABLE>"); endCollapsibleSection(); } if (startsWith("bed 5", tdb->type)) //#*** TODO: detect bed# properly { puts("<TABLE>"); startSmallCollapsibleSection(sectionName, "filter items", FALSE); //#*** Also watch out for noScoreFilter or whatever it's called puts("minimum peak score [0-1000]: "); safef(cartVar, sizeof(cartVar), "hgva_filter_%s_score", tdb->track); char *defaultScore = cartUsualString(cart, cartVar, "0"); printf("<input type=text name=\"%s\" size=12 value=\"%s\"><BR>", cartVar, defaultScore); puts("</TABLE>"); endCollapsibleSection(); } } typedef boolean TdbFilterFunction(struct trackDb *tdb, void *filterData); /* Return TRUE if tdb passes filter criteria. */ void rFilterTrackList(struct trackDb *trackList, struct slRef **pPassingRefs, TdbFilterFunction *filterFunc, void *filterData) /* Recursively apply filterFunc and filterData to all tracks in trackList and * their subtracks. Add an slRef to pPassingRefs for each track/subtrack that passes. * Caller should slReverse(pPassingRefs) when recursion is all done. */ { struct trackDb *tdb; for (tdb = trackList; tdb != NULL; tdb = tdb->next) { if (tdb->subtracks != NULL) rFilterTrackList(tdb->subtracks, pPassingRefs, filterFunc, filterData); if (filterFunc(tdb, filterData)) slAddHead(pPassingRefs, slRefNew(tdb)); } } void tdbFilterGroupTrack(struct trackDb *fullTrackList, struct grp *fullGroupList, TdbFilterFunction *filterFunc, void *filterData, struct slRef **retGroupRefList, struct slRef **retTrackRefList) /* Apply filterFunc and filterData to each track in fullTrackList and all subtracks; * make an slRef list of tracks that pass the filter (retTrackRefList) and an slRef * list of groups to which the passing tracks belong (retGroupRefList). */ { struct slRef *trackRefList = NULL; rFilterTrackList(fullTrackList, &trackRefList, filterFunc, filterData); slReverse(&trackRefList); if (retTrackRefList != NULL) *retTrackRefList = trackRefList; if (retGroupRefList != NULL) { // Which groups have tracks that passed the filter? struct hash *hash = hashNew(0); struct slRef *trackRef; for (trackRef = trackRefList; trackRef != NULL; trackRef = trackRef->next) { struct trackDb *tdb = trackRef->val; hashAddInt(hash, tdb->grp, TRUE); } struct slRef *groupRefList = NULL; struct grp *grp; for (grp = fullGroupList; grp != NULL; grp = grp->next) { if (hashIntValDefault(hash, grp->name, FALSE)) slAddHead(&groupRefList, slRefNew(grp)); } hashFree(&hash); slReverse(&groupRefList); *retGroupRefList = groupRefList; } } boolean isVariantCustomTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get custom or hub tracks with type pgSnp or vcf(Tabix). */ { return ((sameString(tdb->grp, "user") || isHubTrack(tdb->track)) && (sameString(tdb->type, "pgSnp") || startsWith("vcf", tdb->type))); } // Function prototype to avoid shuffling code around: char *findLatestSnpTable(char *suffix); /* Return the name of the 'snp1__<suffix>' table with the highest build number, if any. */ void selectVariants(struct slRef *varGroupList, struct slRef *varTrackList) /* Offer selection of user's variant custom tracks, example variants, pasted input etc. */ { printf("<div class='sectionLiteHeader'>Select Variants</div>\n"); printf("If you have more than one custom track or hub track in " "<A HREF='../FAQ/FAQformat.html#format10' TARGET=_BLANK>pgSnp</A> or " "<A HREF='../goldenPath/help/vcf.html' TARGET=_BLANK>VCF</A> format, " "please select the one you wish to annotate:<BR>\n"); printf("<B>variants: </B>"); printf("<SELECT ID='hgva_variantTrack' NAME='hgva_variantTrack' " "onchange=\"hgva.changeVariantSource();\">\n"); char *selected = cartUsualString(cart, "hgva_variantTrack", ""); struct slRef *ref; for (ref = varTrackList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; printOption(tdb->track, selected, tdb->longLabel); } printOption(hgvaSampleVariants, selected, hgvaSampleVariantsLabel); boolean hasSnps = (findLatestSnpTable(NULL) != NULL); if (hasSnps) printOption(hgvaUseVariantIds, selected, hgvaVariantIdsLabel); printf("</SELECT><BR>\n"); if (hasSnps) { printf("<div id='"hgvaVariantPasteContainer"'%s>\n", differentString(selected, hgvaUseVariantIds) ? " style='display: none;'" : ""); printf("Enter dbSNP rs# identifiers separated by whitespace or commas:<BR>\n"); char *oldPasted = cartUsualString(cart, hgvaVariantIds, ""); cgiMakeTextArea(hgvaVariantIds, oldPasted, 10, 70); puts("</div>"); } printf("<B>maximum number of variants to be processed:</B>\n"); char *curLimit = cartUsualString(cart, "hgva_variantLimit", "10000"); char *limitLabels[] = { "10", "100", "1,000", "10,000", "100,000" }; char *limitValues[] = { "10", "100", "1000", "10000", "100000" }; cgiMakeDropListWithVals("hgva_variantLimit", limitLabels, limitValues, ArraySize(limitLabels), curLimit); printCtAndHubButtons(); puts("<BR>"); } boolean isGeneTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get genePred tracks. */ { return (startsWith("genePred", tdb->type)); } boolean selectGenes() /* Let user select a gene predictions track; return FALSE if there are no genePred tracks. */ { struct slRef *trackRefList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isGeneTrack, NULL, NULL, &trackRefList); boolean gotGP = (trackRefList != NULL); if (!gotGP) warn("This assembly (%s) has no gene prediction tracks, " "so the VAI will not be able to annotate it.", database); printf("<div class='sectionLiteHeader'>Select Genes</div>\n"); if (gotGP) printf("The gene predictions selected here will be used "); else printf("Gene predictions are required in order "); printf("to determine the effect of " "each variant on genes, for example intronic, missense, splice site, intergenic etc."); if (!gotGP) printf(" Since this assembly has no gene prediction tracks, " "the VAI can't provide functional annotations. " "Please select a different genome.<BR>"); printf("<BR>\n"); char *selected = cartUsualString(cart, "hgva_geneTrack", ""); //#*** per-db cart vars?? //#*** should show more info about each track... button to pop up track desc? if (gotGP) { printf("<SELECT ID='hgva_geneTrack' NAME='hgva_geneTrack'>\n"); struct slRef *ref; for (ref = trackRefList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; if (tdb->subtracks == NULL) printOption(tdb->track, selected, tdb->longLabel); } puts("</SELECT><BR>"); } return gotGP; } //#*** We really need a dbNsfp.[ch]: enum PolyPhen2Subset { noSubset, HDIV, HVAR }; char *formatDesc(char *url, char *name, char *details, boolean doHtml) /* Return a description with URL for name plus extra details. If doHtml, * wrap URL in <A ...>...</A>. */ { char desc[1024]; if (doHtml) safef(desc, sizeof(desc), "<A HREF='%s' TARGET=_BLANK>%s</A> %s", url, name, details); else safef(desc, sizeof(desc), "(%s) %s %s", url, name, details); return cloneString(desc); } char *dbNsfpDescFromTableName(char *tableName, enum PolyPhen2Subset subset, boolean doHtml) /* Return a description for table name... if these become tracks, use tdb of course. */ { if (sameString(tableName, "dbNsfpSift")) return formatDesc("http://sift.bii.a-star.edu.sg/", "SIFT", "(D = damaging, T = tolerated)", doHtml); else if (sameString(tableName, "dbNsfpPolyPhen2")) { if (subset == HDIV) return formatDesc("http://genetics.bwh.harvard.edu/pph2/", "PolyPhen-2", "with HumDiv training set " "(D = probably damaging, P = possibly damaging, B = benign)", doHtml); else if (subset == HVAR) return formatDesc("http://genetics.bwh.harvard.edu/pph2/", "PolyPhen-2", "with HumVar training set " "(D = probably damaging, P = possibly damaging, B = benign)", doHtml); else errAbort("dbNsfpDescFromTableName: invalid PolyPhen2 subset type (%d)", subset); } else if (sameString(tableName, "dbNsfpMutationTaster")) return formatDesc("http://www.mutationtaster.org/", "MutationTaster", "(A = disease causing automatic, D = disease causing, " "N = polymorphism, P = polymorphism automatic)", doHtml); else if (sameString(tableName, "dbNsfpMutationAssessor")) return formatDesc("http://mutationassessor.org/", "MutationAssessor", "(high or medium: predicted functional; " "low or neutral: predicted non-functional)", doHtml); else if (sameString(tableName, "dbNsfpLrt")) return formatDesc("http://www.genetics.wustl.edu/jflab/lrt_query.html", "Likelihood ratio test (LRT)", "(D = deleterious, N = Neutral, U = unknown)", doHtml); else if (sameString(tableName, "dbNsfpGerpNr")) return formatDesc("http://mendel.stanford.edu/SidowLab/downloads/gerp/index.html", "GERP++", "Neutral Rate (NR)", doHtml); else if (sameString(tableName, "dbNsfpGerpRs")) return formatDesc("http://mendel.stanford.edu/SidowLab/downloads/gerp/index.html", "GERP++", "Rejected Substitutions (RS)", doHtml); else if (sameString(tableName, "dbNsfpInterPro")) return formatDesc("http://www.ebi.ac.uk/interpro/", "InterPro", "protein domains", doHtml); return NULL; } struct slName *findDbNsfpTables() /* See if this database contains dbNSFP tables. */ { if (startsWith(hubTrackPrefix, database)) return NULL; struct sqlConnection *conn = hAllocConn(database); struct slName *dbNsfpTables = sqlListTablesLike(conn, "LIKE 'dbNsfp%'"); hFreeConn(&conn); return dbNsfpTables; } void printDbNsfpSource(char *table, enum PolyPhen2Subset subset) /* If we know what to do with table, make a checkbox with descriptive label. */ { char *description = dbNsfpDescFromTableName(table, subset, TRUE); if (description != NULL) { char cartVar[512]; if (subset == HDIV) safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s:HDIV", database, table); else if (subset == HVAR) safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s:HVAR", database, table); else safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s", database, table); boolean defaultChecked = (sameString("dbNsfpSift", table) || sameString("dbNsfpPolyPhen2", table)); cartMakeCheckBox(cart, cartVar, defaultChecked); printf("%s<BR>\n", description); } } void selectDbNsfp(struct slName *dbNsfpTables) /* Let user select scores/predicitions from various tools collected by dbNSFP. */ { if (dbNsfpTables == NULL) return; startCollapsibleSection("dbNsfp", "Database of Non-synonymous Functional Predictions (dbNSFP)", TRUE); printf("<A HREF='https://sites.google.com/site/jpopgen/dbNSFP' TARGET=_BLANK>dbNSFP</A> " "(<A HREF='http://onlinelibrary.wiley.com/doi/10.1002/humu.22376/abstract' " "TARGET=_BLANK>Liu <em>et al.</em> 2013</A>) " "release 2.0 " "provides pre-computed scores and predictions of functional significance " "from a variety of tools. Every possible coding change to transcripts in " //#*** hardcoded version info... sigh, we need trackDb... or metaDb?? "Gencode release 9 (Ensembl 64, Dec. 2011) gene predictions " "has been evaluated. " "<em>Note: This may not encompass all transcripts in your " "selected gene set.</em><BR>\n"); //#*** Another cheap hack: reverse alph order happens to be what we want, //#*** but priorities would be cleaner: slReverse(&dbNsfpTables); +jsMakeSetClearContainer(); struct slName *table; for (table = dbNsfpTables; table != NULL; table = table->next) { if (sameString(table->name, "dbNsfpPolyPhen2")) { printDbNsfpSource(table->name, HDIV); printDbNsfpSource(table->name, HVAR); } else printDbNsfpSource(table->name, 0); } +jsEndContainer(); puts("<BR>"); endCollapsibleSection(); } char *findLatestSnpTable(char *suffix) /* Return the name of the 'snp1__<suffix>' table with the highest build number, if any. */ { if (startsWith(hubTrackPrefix, database)) return NULL; if (suffix == NULL) suffix = ""; char likeExpr[64]; safef(likeExpr, sizeof(likeExpr), "LIKE 'snp1__%s'", suffix); struct sqlConnection *conn = hAllocConn(database); struct slName *snpNNNTables = sqlListTablesLike(conn, likeExpr); hFreeConn(&conn); if ((snpNNNTables == NULL) || (slCount(snpNNNTables)==0)) return NULL; // Skip to last in list -- highest number (show tables can't use rlike or 'order by'): struct slName *table = snpNNNTables; while (table->next != NULL && isdigit(table->next->name[4]) && isdigit(table->next->name[5])) table = table->next; char *tableName = NULL; if (table != NULL) tableName = cloneString(table->name); slNameFreeList(&snpNNNTables); return tableName; } boolean findSnpBed4(char *suffix, char **retFileName, struct trackDb **retTdb) /* If we can find the latest snpNNNsuffix table, or better yet a bigBed file for it (faster), * set the appropriate ret* and return TRUE, otherwise return FALSE. */ { char *table = findLatestSnpTable(suffix); if (table == NULL) return FALSE; boolean foundIt = FALSE; // Do we happen to have a bigBed version? Better yet, bed4 only for current uses: char origFileName[HDB_MAX_PATH_STRING]; safef(origFileName, sizeof(origFileName), "/gbdb/%s/vai/%s.bed4.bb", database, table); char* fileName = hReplaceGbdb(origFileName); if (fileExists(fileName)) { if (retFileName != NULL) *retFileName = fileName; foundIt = TRUE; } else { // Not bed4; try just .bb: freez(&fileName); safef(origFileName, sizeof(origFileName), "/gbdb/%s/vai/%s.bb", database, table); fileName = hReplaceGbdb(origFileName); if (fileExists(fileName)) { if (retFileName != NULL) *retFileName = fileName; foundIt = TRUE; } } if (foundIt && retTdb == NULL) return TRUE; struct trackDb *tdb = tdbForTrack(database, table, &fullTrackList); if (tdb != NULL) { if (retTdb != NULL) *retTdb = tdb; return TRUE; } return foundIt; } void selectDbSnp(boolean gotSnp) /* Offer to include rsID (and other fields, or leave that for advanced output??) if available */ { if (!gotSnp) return; startCollapsibleSection("dbSnp", "Known variation", TRUE); cartMakeCheckBox(cart, "hgva_rsId", TRUE); printf("Include <A HREF='http://www.ncbi.nlm.nih.gov/projects/SNP/' TARGET=_BLANK>dbSNP</A> " "rs# ID if one exists<BR>\n"); puts("<BR>"); endCollapsibleSection(); } boolean isRegulatoryTrack(struct trackDb *tdb, void *filterData) /* For now, just look for a couple specific tracks by tableName. */ { //#*** NEED METADATA return (sameString("wgEncodeRegDnaseClusteredV2", tdb->table) || sameString("wgEncodeRegTfbsClusteredV3", tdb->table)); } struct slRef *findRegulatoryTracks() /* Look for the very limited set of Regulation tracks that hgVai offers. */ { struct slRef *regTrackRefList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isRegulatoryTrack, NULL, NULL, ®TrackRefList); return regTrackRefList; } void selectRegulatory() /* If trackRefList is non-NULL, make a section with a checkbox for each track, * labelled with longLabel, and optionally some filtering options. */ { struct slRef *trackRefList = findRegulatoryTracks(); if (trackRefList != NULL) { puts("<BR>"); printf("<div class='sectionLiteHeader'>Select Regulatory Annotations</div>\n"); printf("The annotations in this section provide predicted regulatory regions " "based on various experimental data. " "When a variant overlaps an annotation selected here, the consequence term " "<A HREF=\"http://sequenceontology.org/browser/current_release/term/SO:0001566\" " "TARGET=_BLANK>regulatory_region_variant</A> will be assigned. " "Follow the links to description pages that explain how each dataset was " "constructed. " "Some datasets cover a significant portion of the genome and it may be " "desirable to filter these annotations by cell type and/or score in order " "to avoid an overabundance of hits. "); // Use a table with checkboxes in one column and label/other stuff that depends on // checkbox in other column. puts("<TABLE>"); struct slRef *ref; for (ref = trackRefList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; char cartVar[512]; safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s", database, tdb->track); puts("<TR><TD valign=top>"); cartMakeCheckBox(cart, cartVar, FALSE); puts("</TD><TD>"); struct trackDb *topTdb = trackDbTopLevelSelfOrParent(tdb); printf("<A HREF=\"%s?%s&g=%s\">%s</A><BR>\n", hgTrackUiName(), cartSidUrlString(cart), topTdb->track, tdb->longLabel); printFilterOptions(tdb); puts("</TD></TR>"); } puts("</TABLE>"); } } boolean isConsElTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get "phastConsNwayElements" tracks. */ { return (startsWith("phastCons", tdb->table) && stringIn("Elements", tdb->table)); } boolean isConsScoreTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get tracks that start with "phastCons" (no Elements) * or "phyloP". */ { return ((startsWith("phastCons", tdb->table) && !stringIn("Elements", tdb->table)) || startsWith("phyloP", tdb->table)); } void findCons(struct slRef **retElTrackRefList, struct slRef **retScoreTrackRefList) /* See if this database has Conserved Elements and/or Conservation Scores */ { tdbFilterGroupTrack(fullTrackList, fullGroupList, isConsElTrack, NULL, NULL, retElTrackRefList); tdbFilterGroupTrack(fullTrackList, fullGroupList, isConsScoreTrack, NULL, NULL, retScoreTrackRefList); } boolean trackNameMatches(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get track(s) whose tdb->track matches name (filterData). */ { char *name = filterData; return sameString(tdb->track, name); } struct slRef *findTrackRefByName(char *name) /* Return a reference to the named track, if found. */ { struct slRef *trackRefList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, trackNameMatches, name, NULL, &trackRefList); return trackRefList; } void trackCheckBoxSection(char *sectionSuffix, char *title, struct slRef *trackRefList) /* If trackRefList is non-NULL, make a collapsible section with a checkbox for each track, * labelled with longLabel. */ { if (trackRefList != NULL) { startCollapsibleSection(sectionSuffix, title, FALSE); struct slRef *ref; for (ref = trackRefList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; char cartVar[512]; safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s", database, tdb->track); cartMakeCheckBox(cart, cartVar, FALSE); struct trackDb *topTdb = trackDbTopLevelSelfOrParent(tdb); printf("<A HREF=\"%s?%s&g=%s\">%s</A><BR>\n", hgTrackUiName(), cartSidUrlString(cart), topTdb->track, tdb->longLabel); } puts("<BR>"); endCollapsibleSection(); } } void selectAnnotations() /* Beyond predictions of protein-coding effect, what other basic data can we integrate? */ { struct slName *dbNsfpTables = findDbNsfpTables(); boolean gotSnp = findSnpBed4("", NULL, NULL); struct slRef *elTrackRefList = NULL, *scoreTrackRefList = NULL; findCons(&elTrackRefList, &scoreTrackRefList); struct slRef *cosmicTrackRefList = findTrackRefByName("cosmic"); if (dbNsfpTables == NULL && !gotSnp && elTrackRefList == NULL && scoreTrackRefList == NULL && cosmicTrackRefList == NULL) return; puts("<BR>"); printf("<div class='sectionLiteHeader'>Select More Annotations (optional)</div>\n"); // Make wrapper table for collapsible sections: puts("<TABLE border=0 cellspacing=5 cellpadding=0 style='padding-left: 10px;'>"); selectDbNsfp(dbNsfpTables); selectDbSnp(gotSnp); trackCheckBoxSection("Cosmic", "COSMIC", cosmicTrackRefList); trackCheckBoxSection("ConsEl", "Conserved elements", elTrackRefList); trackCheckBoxSection("ConsScore", "Conservation scores", scoreTrackRefList); puts("</TABLE>"); } void selectFiltersFunc() /* Options to restrict variants based on gene region/soTerm from gpFx */ { startCollapsibleSection("filtersFunc", "Functional role", FALSE); printf("Include variants annotated as<BR>\n"); +jsMakeSetClearContainer(); cartMakeCheckBox(cart, "hgva_include_intergenic", TRUE); printf("intergenic<BR>\n"); cartMakeCheckBox(cart, "hgva_include_upDownstream", TRUE); printf("upstream/downstream of gene<BR>\n"); cartMakeCheckBox(cart, "hgva_include_nmdTranscript", TRUE); printf("in transcript already subject to nonsense-mediated decay (NMD)<BR>\n"); cartMakeCheckBox(cart, "hgva_include_exonLoss", TRUE); printf("exon loss caused by deletion<BR>\n"); cartMakeCheckBox(cart, "hgva_include_utr", TRUE); printf("5' or 3' UTR<BR>\n"); cartMakeCheckBox(cart, "hgva_include_cdsSyn", TRUE); printf("CDS - synonymous coding change<BR>\n"); cartMakeCheckBox(cart, "hgva_include_cdsNonSyn", TRUE); printf("CDS - non-synonymous (missense, stop gain/loss, frameshift etc)<BR>\n"); cartMakeCheckBox(cart, "hgva_include_intron", TRUE); printf("intron<BR>\n"); cartMakeCheckBox(cart, "hgva_include_splice", TRUE); printf("splice site or splice region<BR>\n"); cartMakeCheckBox(cart, "hgva_include_nonCodingExon", TRUE); printf("exon of non-coding gene<BR>\n"); struct slRef *regTrackRefList = findRegulatoryTracks(); if (regTrackRefList != NULL) { cartMakeCheckBox(cart, "hgva_include_regulatory", TRUE); printf("regulatory element (note: these are detected only if one or more tracks " "are selected in Regulatory regions above)<BR>\n"); } +jsEndContainer(); puts("<BR>"); endCollapsibleSection(); } void selectFiltersKnownVar() /* Options to restrict output based on overlap with known variants. */ { boolean gotCommon = findSnpBed4("Common", NULL, NULL); boolean gotMult = findSnpBed4("Mult", NULL, NULL); if (!gotCommon && !gotMult) return; startCollapsibleSection("filtersVar", "Known variation", FALSE); if (gotMult) { cartMakeCheckBox(cart, "hgva_include_snpMult", TRUE); printf("Include variants even if they are co-located with variants that have been mapped to " "multiple genomic locations by dbSNP<BR>\n"); } if (gotCommon) { cartMakeCheckBox(cart, "hgva_include_snpCommon", TRUE); printf("Include variants even if they are co-located with \"common\" variants " "(uniquely mapped variants with global minor allele frequency (MAF) " "of at least 1%% according to dbSNP)<BR>\n"); } puts("<BR>"); endCollapsibleSection(); } void selectFiltersCons() /* Options to restrict output based on overlap with conserved elements. */ { struct slRef *elTrackRefList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isConsElTrack, NULL, NULL, &elTrackRefList); if (elTrackRefList == NULL) return; startCollapsibleSection("filtersCons", "Conservation", FALSE); // Use a little table to indent radio buttons (if we do those) past checkbox: puts("<TABLE><TR><TD>"); cartMakeCheckBox(cart, "hgva_require_consEl", FALSE); printf("</TD><TD>Include only the variants that overlap:"); if (slCount(elTrackRefList) == 1) { struct trackDb *tdb = elTrackRefList->val; printf(" %s<BR>\n", tdb->longLabel); cgiMakeHiddenVar("hgva_require_consEl_track", tdb->track); puts("</TD></TR></TABLE>"); } else { puts("</TD></TR>"); char *selected = cartUsualString(cart, "hgva_require_consEl_track", ""); struct slRef *ref; for (ref = elTrackRefList; ref != NULL; ref = ref->next) { printf("<TR><TD></TD><TD>"); struct trackDb *tdb = ref->val; cgiMakeOnClickRadioButton("hgva_require_consEl_track", tdb->track, sameString(tdb->track, selected), "onclick=\"setCheckboxList('hgva_require_consEl', true);\""); printf("%s</TD></TR>\n", tdb->longLabel); } puts("</TABLE>"); } endCollapsibleSection(); } void selectFilters() /* Options to restrict output to variants with specific properties */ { puts("<BR>"); printf("<div class='sectionLiteHeader'>Define Filters</div>\n"); puts("<TABLE border=0 cellspacing=5 cellpadding=0 style='padding-left: 10px;'>"); selectFiltersFunc(); selectFiltersKnownVar(); selectFiltersCons(); puts("</TABLE>"); } void selectOutput() /* Just VEP text for now... should also have VEP HTML with limited # lines too. * Next: custom track with same subset of info as we would stuff in VEP */ { puts("<BR>"); printf("<div class='sectionLiteHeader'>Configure Output</div>\n"); printf("<B>output format: </B>"); char *selected = cartUsualString(cart, "hgva_outFormat", "vepTab"); printf("<SELECT ID='hgva_outFormat' NAME='hgva_outFormat'>\n"); printOption("vepTab", selected, "Variant Effect Predictor (tab-separated text)"); printOption("vepHtml", selected, "Variant Effect Predictor (HTML)"); printf("</SELECT><BR>\n"); char *compressType = cartUsualString(cart, "hgva_compressType", textOutCompressNone); char *fileName = cartUsualString(cart, "hgva_outFile", ""); printf("<B>output file:</B> "); cgiMakeTextVar("hgva_outFile", fileName, 29); printf(" (leave blank to keep output in browser)<BR>\n"); printf("<B>file type returned: </B>"); cgiMakeRadioButton("hgva_compressType", textOutCompressNone, sameWord(textOutCompressNone, compressType)); printf(" plain text  "); cgiMakeRadioButton("hgva_compressType", textOutCompressGzip, sameWord(textOutCompressGzip, compressType)); printf(" gzip compressed (ignored if output file is blank)"); puts("<BR>"); } void submitAndDisclaimer() { puts("<div id=disclaimerDialog title='NOTE'>"); puts("This tool is for research use only. While this tool is open to the " "public, users seeking information about a personal medical or genetic " "condition are urged to consult with a qualified physician for " "diagnosis and for answers to personal questions."); puts("</div><BR>"); printf("<div><img id='loadingImg' src='../images/loading.gif' />\n"); printf("<span id='loadingMsg'></span></div>\n"); cgiMakeOnClickButton("hgva.submitQueryIfDisclaimerAgreed();", "Get results"); puts("<BR><BR>"); } /* * When user clicks submit, we need to roll a JSON querySpec from form selections, * and show data from a submission to hgAnnoGrator. redirect from this CGI? * or have javascript submit directly? * * primary: variants, from custom track * * if there are genes, those w/annoGratorGpVar * * if there are {dbSNP, dbNsfp, regulatory, cons} selections, grator for each of those * * vep output config * * If we get bold & offer 1000Genomes VCF, will def. need handling of split chroms. * Are we really going to offer genome-wide in hgVai? * Up-front limit on #rows of input ? * * Eventually, we might want a FormatVep that produces structs that are passed * forward to multiple output writers... I would want to send it lots of gratorData * like a formatter, but it would produce rows like an annoGrator. * Maybe annoGrators should accept a bunch of input rows like formatters? * or would this grator wrap all the input grators inside? */ void doMainPage() /* Print out initial HTML of control page. */ { jsInit(); webIncludeResourceFile("jquery-ui.css"); webIncludeResourceFile("ui.dropdownchecklist.css"); boolean alreadyAgreed = cartUsualBoolean(cart, "hgva_agreedToDisclaimer", FALSE); printf("<script>\n" "$(document).ready(function() { hgva.disclaimer.init(%s, hgva.userClickedAgree); });\n" "</script>\n", alreadyAgreed ? "true" : "false"); addSomeCss(); printAssemblySection(); /* Check for variant custom tracks. If there are none, tell user they need to * upload at least one. */ struct slRef *varTrackList = NULL, *varGroupList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isVariantCustomTrack, NULL, &varGroupList, &varTrackList); puts("<BR>"); // Make wrapper table for collapsible sections: selectVariants(varGroupList, varTrackList); boolean gotGP = selectGenes(); if (gotGP) { selectRegulatory(); selectAnnotations(); selectFilters(); selectOutput(); submitAndDisclaimer(); } printf("</FORM>"); jsReloadOnBackButton(cart); webNewSection("Using the Variant Annotation Integrator"); webIncludeHelpFile("hgVaiHelpText", FALSE); jsIncludeFile("jquery-ui.js", NULL); jsIncludeFile("hgVarAnnogrator.js", NULL); jsIncludeFile("ui.dropdownchecklist.js", NULL); jsIncludeFile("ddcl.js", NULL); } void doUi() /* Set up globals and make web page */ { cartWebStart(cart, database, "Variant Annotation Integrator"); doMainPage(); cartWebEnd(); /* Save variables. */ cartCheckout(&cart); } void checkVariantTrack(struct trackDb *tdb) /* variantTrack should be either pgSnp or VCF. */ { if (! sameString(tdb->type, "pgSnp") && ! startsWith("vcf", tdb->type)) { errAbort("Expected variant track '%s' to be type pgSnp, vcf or vcfTabix, but it's '%s'", tdb->track, tdb->type); } } char *fileNameFromTable(char *table) /* Get fileName from a bigData table (for when we don't have a trackDb, just table). */ { struct sqlConnection *conn = hAllocConn(database); char query[512]; sqlSafef(query, sizeof(query), "select fileName from %s", table); char *fileName = sqlQuickString(conn, query); hFreeConn(&conn); char *fileNameRewrite = hReplaceGbdb(fileName); freez(&fileName); return fileNameRewrite; } void textOpen() /* Start serving up plain text, possibly via a pipeline to gzip. */ { char *fileName = cartUsualString(cart, "hgva_outFile", ""); char *compressType = cartUsualString(cart, "hgva_compressType", textOutCompressGzip); compressPipeline = textOutInit(fileName, compressType); } void setGpVarFuncFilter(struct annoGrator *gpVarGrator) /* Use cart variables to configure gpVarGrator's filtering by functional category. */ { struct annoGratorGpVarFuncFilter aggvFuncFilter; ZeroVar(&aggvFuncFilter); aggvFuncFilter.intergenic = cartUsualBoolean(cart, "hgva_include_intergenic", FALSE); aggvFuncFilter.upDownstream = cartUsualBoolean(cart, "hgva_include_upDownstream", TRUE); aggvFuncFilter.nmdTranscript = cartUsualBoolean(cart, "hgva_include_nmdTranscript", TRUE); aggvFuncFilter.exonLoss = cartUsualBoolean(cart, "hgva_include_exonLoss", TRUE); aggvFuncFilter.utr = cartUsualBoolean(cart, "hgva_include_utr", TRUE); aggvFuncFilter.cdsSyn = cartUsualBoolean(cart, "hgva_include_cdsSyn", TRUE); aggvFuncFilter.cdsNonSyn = cartUsualBoolean(cart, "hgva_include_cdsNonSyn", TRUE); aggvFuncFilter.intron = cartUsualBoolean(cart, "hgva_include_intron", TRUE); aggvFuncFilter.splice = cartUsualBoolean(cart, "hgva_include_splice", TRUE); aggvFuncFilter.nonCodingExon = cartUsualBoolean(cart, "hgva_include_nonCodingExon", TRUE); annoGratorGpVarSetFuncFilter(gpVarGrator, &aggvFuncFilter); } #define NO_MAXROWS 0 struct annoGrator *gratorForSnpBed4(struct hash *gratorsByName, char *suffix, struct annoAssembly *assembly, char *chrom, enum annoGratorOverlap overlapRule, char **retDescription) /* Look up snpNNNsuffix; if we find it, return a grator (possibly for a bigBed 4 file), * otherwise return NULL. */ { char *fileName = NULL; struct trackDb *tdb = NULL; if (! findSnpBed4(suffix, &fileName, &tdb)) return NULL; struct annoGrator *grator = NULL; // First look in gratorsByName to see if this grator has already been constructed: if (tdb != NULL) { grator = hashFindVal(gratorsByName, tdb->table); if (retDescription) *retDescription = cloneString(tdb->longLabel); } if (grator != NULL) { // Set its overlap behavior and we're done. grator->setOverlapRule(grator, overlapRule); return grator; } // If not in gratorsByName, then attempt to construct it here: if (fileName != NULL) grator = gratorFromBigDataFileOrUrl(fileName, assembly, NO_MAXROWS, overlapRule); else grator = gratorFromTrackDb(assembly, tdb->table, tdb, chrom, NO_MAXROWS, NULL, overlapRule); if (grator != NULL) hashAdd(gratorsByName, tdb->table, grator); return grator; } char *tableNameFromSourceName(char *sourceName) /* Strip sourceName (file path or db.table) to just the basename or table name. */ { char *tableName = cloneString(sourceName); char *p = strrchr(tableName, '/'); if (p != NULL) { // file path; strip to basename char dir[PATH_LEN], name[FILENAME_LEN], extension[FILEEXT_LEN]; splitPath(tableName, dir, name, extension); safecpy(tableName, strlen(tableName)+1, name); } else { // database.table -- skip database. p = strchr(tableName, '.'); if (p != NULL) tableName = p+1; } return tableName; } char *tagFromTableName(char *tableName, char *suffix) /* Generate a tag for VEP's extras column or VCF's info column. */ { char *p = strstr(tableName, "dbNsfp"); if (p != NULL) tableName = p + strlen("dbNsfp"); int suffixLen = (suffix == NULL) ? 0 : strlen(suffix); int tagSize = strlen(tableName) + suffixLen + 1; char *tag = cloneStringZ(tableName, tagSize); if (isNotEmpty(suffix)) safecat(tag, tagSize, suffix); touppers(tag); // Some custom shortenings, to avoid very long tag names: (void)strSwapStrs(tag, tagSize, "POLYPHEN", "PP"); (void)strSwapStrs(tag, tagSize, "MUTATION", "MUT"); (void)strSwapStrs(tag, tagSize, "PHYLOP", "PHP"); (void)strSwapStrs(tag, tagSize, "PHASTCONS", "PHC"); (void)strSwapStrs(tag, tagSize, "ELEMENTS", "EL"); (void)strSwapStrs(tag, tagSize, "PRIMATES", "PRIM"); (void)strSwapStrs(tag, tagSize, "PLACENTAL", "PLAC"); if (regexMatch(tag, "^PH.*[0-9]WAY")) (void)strSwapStrs(tag, tagSize, "WAY", "W"); (void)strSwapStrs(tag, tagSize, "WGENCODEREGDNASECLUSTERED", "DNASE"); (void)strSwapStrs(tag, tagSize, "WGENCODEREGTFBSCLUSTERED", "TFBS"); return tag; } enum PolyPhen2Subset stripSubsetFromTrackName(char *trackName) /* trackName may have a _suffix for a subset of PolyPhen2; convert that to enum * and zero out the suffix so we have the real trackName. */ { enum PolyPhen2Subset subset = noSubset; char *p = strchr(trackName, ':'); if (p != NULL) { if (sameString(p+1, "HDIV")) subset = HDIV; else if (sameString(p+1, "HVAR")) subset = HVAR; else errAbort("unrecognized suffix in track_suffix '%s'", trackName); *p = '\0'; } return subset; } void updateGratorListAndVepExtra(struct annoGrator *grator, struct annoGrator **pGratorList, struct annoFormatter *vepOut, enum PolyPhen2Subset subset, char *column, char *description, boolean isReg) /* If grator is non-NULL, add it to gratorList and vepOut's list of items for EXTRAs column. */ { if (grator == NULL) return; slAddHead(pGratorList, grator); if (vepOut != NULL) { char *tableName = tableNameFromSourceName(grator->streamer.name); char *suffix = NULL; if (subset == HDIV) suffix = "HDIV"; else if (subset == HVAR) suffix = "HVAR"; char *tag = tagFromTableName(tableName, suffix); if (isEmpty(description)) description = grator->streamer.name; if (isReg) annoFormatVepAddRegulatory(vepOut, (struct annoStreamer *)grator, tag, description, column); else annoFormatVepAddExtraItem(vepOut, (struct annoStreamer *)grator, tag, description, column); } } INLINE void updateGratorList(struct annoGrator *grator, struct annoGrator **pGratorList) /* If grator is non-NULL, add it to gratorList. */ { updateGratorListAndVepExtra(grator, pGratorList, NULL, 0, NULL, NULL, FALSE); } void addDbNsfpSeqChange(char *trackName, struct annoAssembly *assembly, struct hash *gratorsByName, struct annoGrator **pGratorList) // If the user has selected dbNsfp* data, we also need the underlying dbNsfpSeqChange // data, so annoFormatVep can tell whether the variant and gpFx are consistent with the // variant and transcript that dbNsfp used to calculate scores. { //#*** Yet another place where we need metadata: char *seqChangeTable = "dbNsfpSeqChange"; if (hashFindVal(gratorsByName, seqChangeTable) == NULL) { char *fileName = fileNameFromTable(seqChangeTable); if (fileName == NULL) errAbort("'%s' requested, but I can't find fileName for %s", trackName, seqChangeTable); struct annoGrator *grator = gratorFromBigDataFileOrUrl(fileName, assembly, NO_MAXROWS, agoNoConstraint); updateGratorList(grator, pGratorList); hashAdd(gratorsByName, seqChangeTable, grator); } } static struct dyString *dyInfo = NULL; struct hash *getTrackFilterVars(char *track) /* Return a hash of filter variable names (cart variable suffixes) to slName lists of values. */ { char filterPrefix[512]; safef(filterPrefix, sizeof(filterPrefix), "hgva_filter_%s_", track); struct slPair *filterVars = cartVarsWithPrefix(cart, filterPrefix), *var; int prefixLen = strlen(filterPrefix); struct hash *varHash = hashNew(0); for (var = filterVars; var != NULL; var = var->next) { char *varName = var->name+prefixLen; char *val = var->val; struct hashEl *hel = hashLookup(varHash, varName); if (hel != NULL) slNameAddHead((struct slName **)(&hel->val), val); else hashAdd(varHash, varName, slNameNew(val)); } return varHash; } INLINE boolean isNotAll(struct slName *valList) /* Return TRUE unless valList has one element with name "All" (for multiselects). */ { if (slCount(valList) == 1 && sameString(valList->name, "All")) return FALSE; return TRUE; } void factorSourceGratorAddFilter(struct annoGrator *grator, char *name, struct slName *valList) /* Add filter to factorSource grator. */ //#*** Do these smarts belong here in hgVai? Probably not -- should be an hg/lib module with //#*** UI/metadata smarts. { struct annoStreamer *gStreamer = (struct annoStreamer *)grator; struct annoFilter *filter = NULL; if (sameString(name, "name") || sameString(name, "cellType") || sameString(name, "treatment")) { if (valList && isNotAll(valList)) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afMatch, valList); } else if (sameString(name, "score")) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afGTE, valList); else errAbort("Unrecognized filter name '%s' for %s, type=factorSource", name, gStreamer->name); if (filter) gStreamer->addFilters(gStreamer, filter); } void bed5AddFilter(struct annoGrator *grator, char *name, struct slName *valList) /* Add filter to bed 5 grator. */ { struct annoStreamer *gStreamer = (struct annoStreamer *)grator; struct annoFilter *filter = NULL; if (sameString(name, "name")) { if (valList && isNotAll(valList)) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afMatch, valList); } else if (sameString(name, "score")) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afGTE, valList); else errAbort("Unrecognized filter name '%s' for %s, type=bed 5", name, gStreamer->name); if (filter) gStreamer->addFilters(gStreamer, filter); } void addFiltersToGrator(struct annoGrator *grator, struct trackDb *tdb) /* Look for filter variables in the cart and add filters to grator accordingly. */ { struct hash *varHash = getTrackFilterVars(tdb->track); struct hashEl *hel, *helList = hashElListHash(varHash); for (hel = helList; hel != NULL; hel = hel->next) { char *filterName = hel->name; struct slName *valList = hel->val; //#*** Need a much better way to dispatch... if (sameString("factorSource", tdb->type)) factorSourceGratorAddFilter(grator, filterName, valList); else if (startsWith("bed 5", tdb->type)) bed5AddFilter(grator, filterName, valList); else dyStringPrintf(dyInfo, "Ignoring %s filter %s\n", tdb->track, filterName); } hashFree(&varHash); } void addOutputTracks(struct annoGrator **pGratorList, struct hash *gratorsByName, struct annoFormatter *vepOut, struct annoAssembly *assembly, char *chrom, boolean doHtml, boolean *retHasRegulatory) // Construct grators for tracks selected to appear in EXTRAS column { boolean includeReg = cartUsualBoolean(cart, "hgva_include_regulatory", TRUE); boolean haveReg = FALSE; char trackPrefix[128]; safef(trackPrefix, sizeof(trackPrefix), "hgva_track_%s_", database); int trackPrefixLen = strlen(trackPrefix); struct slPair *trackVar, *trackVars = cartVarsWithPrefix(cart, trackPrefix); for (trackVar = trackVars; trackVar != NULL; trackVar = trackVar->next) { char *val = trackVar->val; if (! (sameWord(val, "on") || atoi(val) > 0)) continue; char *trackName = trackVar->name + trackPrefixLen; if (sameString(trackName, "dbNsfpPolyPhen2")) // PolyPhen2 must have a suffix now -- skip obsolete cartVar from existing carts continue; struct annoGrator *grator = hashFindVal(gratorsByName, trackName); if (grator != NULL) // We already have this as a grator: continue; enum PolyPhen2Subset subset = noSubset; char *description = NULL; char *column = NULL; boolean isReg = FALSE; if (startsWith("dbNsfp", trackName)) { // trackName for PolyPhen2 has a suffix for subset -- strip it if we find it: subset = stripSubsetFromTrackName(trackName); description = dbNsfpDescFromTableName(trackName, subset, doHtml); addDbNsfpSeqChange(trackName, assembly, gratorsByName, pGratorList); char *fileName = fileNameFromTable(trackName); if (fileName != NULL) grator = gratorFromBigDataFileOrUrl(fileName, assembly, NO_MAXROWS, agoNoConstraint); } else { struct trackDb *tdb = tdbForTrack(database, trackName, &fullTrackList); if (tdb != NULL) { grator = gratorFromTrackDb(assembly, tdb->table, tdb, chrom, NO_MAXROWS, NULL, agoNoConstraint); if (grator != NULL) { //#*** Need something more sophisticated but this works for our //#*** limited selection of extra tracks: if (asColumnFind(grator->streamer.asObj, "name") != NULL) column = "name"; addFiltersToGrator(grator, tdb); } description = tdb->longLabel; isReg = includeReg && isRegulatoryTrack(tdb, NULL); } } haveReg |= isReg; updateGratorListAndVepExtra(grator, pGratorList, vepOut, subset, column, description, isReg); if (grator != NULL) hashAdd(gratorsByName, trackName, grator); } if (retHasRegulatory) *retHasRegulatory = haveReg; } void addFilterTracks(struct annoGrator **pGratorList, struct hash *gratorsByName, struct annoAssembly *assembly, char *chrom) // Add grators for filters (not added to vepOut): { if (!cartUsualBoolean(cart, "hgva_include_snpCommon", TRUE)) { struct annoGrator *grator = gratorForSnpBed4(gratorsByName, "Common", assembly, chrom, agoMustNotOverlap, NULL); updateGratorList(grator, pGratorList); } if (!cartUsualBoolean(cart, "hgva_include_snpMult", TRUE)) { struct annoGrator *grator = gratorForSnpBed4(gratorsByName, "Mult", assembly, chrom, agoMustNotOverlap, NULL); updateGratorList(grator, pGratorList); } if (cartUsualBoolean(cart, "hgva_require_consEl", FALSE)) { char *consElTrack = cartString(cart, "hgva_require_consEl_track"); struct annoGrator *grator = hashFindVal(gratorsByName, consElTrack); if (grator == NULL) { struct trackDb *tdb = tdbForTrack(database, consElTrack, &fullTrackList); if (tdb != NULL) grator = gratorFromTrackDb(assembly, tdb->table, tdb, chrom, NO_MAXROWS, NULL, agoMustOverlap); updateGratorList(grator, pGratorList); } else grator->setOverlapRule(grator, agoMustOverlap); } } static void getCartPosOrDie(char **retChrom, uint *retStart, uint *retEnd) /* Get chrom:start-end from cart, errAbort if any problems. */ { char *position = cartString(cart, hgvaRange); if (! parsePosition(position, retChrom, retStart, retEnd)) errAbort("Expected position to be chrom:start-end but got '%s'", position); } static char *sampleVariantsPath(struct annoAssembly *assembly, char *geneTrack) /* Construct a path for a trash file of contrived example variants for this assembly * and gene track. */ { char *chrom = NULL; uint start = 0, end = 0; getCartPosOrDie(&chrom, &start, &end); char *subDir = "hgv"; mkdirTrashDirectory(subDir); struct dyString *dy = dyStringCreate("%s/%s/%s_%s_%s_%u-%u.vcf", trashDir(), subDir, assembly->name, geneTrack, chrom, start, end); return dyStringCannibalize(&dy); } static void writeMinimalVcfHeader(FILE *f, char *db) /* Write header for VCF with no meaningful qual, filter, info or genotype data. */ { fputs("##fileformat=VCFv4.1\n", f); fprintf(f, "##reference=%s\n", db); fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", f); } static void mutateBase(char *pBase) /* Change *pBase into a different base. */ { static char *bases = "ACGT"; char c; // In real life the probabilities aren't even, but this is easier. while ((c = bases[(uint)(floor(drand48() * 4.0))]) == *pBase) ; *pBase = c; } static void writeMinimalVcfRow(FILE *f, struct vcfRecord *rec) /* Write a minimalist VCF row (coords, name, alleles and placeholders). */ { // VCF columns: #CHROM POS ID REF ALT QUAL FILTER INFO fprintf(f, "%s\t%u\t%s\t%s\t", rec->chrom, rec->chromStart+1, rec->name, rec->alleles[0]); // Comma-separated alt alleles: int i; for (i = 1; i < rec->alleleCount; i++) fprintf(f, "%s%s", (i > 1 ? "," : ""), rec->alleles[i]); // Placeholder qual, filter, info. fprintf(f, "\t.\t.\t.\n"); } static void writeMinimalVcfRowFromBed(FILE *f, struct bed4 *bed, struct dnaSeq *seq, uint seqOffset) /* Write a minimalist VCF row with a mutation of the reference sequence at the given * position. */ { uint indelBase = 0, start = bed->chromStart, end = bed->chromEnd; if (end != start+1) { // VCF treats indels in a special way: pos is the base to the left of the indel, // and both ref and alt alleles include the base to the left of the indel. indelBase = 1; } // Get reference allele sequence: uint refAlLen = end - start + indelBase; char ref[refAlLen+1]; uint seqStart = start - indelBase - seqOffset; safencpy(ref, sizeof(ref), seq->dna+seqStart, refAlLen); touppers(ref); // Alternate allele seq needs extra room in case of single-base insertion: char alt[refAlLen+2]; alt[0] = ref[0]; alt[1] = '\0'; if (start == end) // insertion: alt gets extra base safecpy(alt+1, sizeof(alt)-1, "A"); else if (end == start+1) // single nucleotide variant mutateBase(alt); // else deletion: leave alt truncated after the shared base to left of indel // VCF columns: #CHROM POS ID REF ALT QUAL FILTER INFO fprintf(f, "%s\t%u\t%s\t%s\t%s\t.\t.\t.\n", bed->chrom, start+1-indelBase, bed->name, ref, alt); } static void writeArbitraryVariants(FILE *f, struct annoAssembly *assembly) /* Concoct some variants at an arbitrary position on some assembly sequence. * We shouldn't ever get here unless the selected geneTrack is empty. */ { char *chrom = "NULL"; uint start = 0, end = 0; getCartPosOrDie(&chrom, &start, &end); struct bed4 *bed = bed4New(chrom, start, start, "ex_ins"); struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, start-1, start+100); writeMinimalVcfRowFromBed(f, bed, seq, start-1); dnaSeqFree(&seq); } static void addGpFromRow(struct genePred **pGpList, struct annoRow *row, boolean *pNeedCoding, boolean *pNeedNonCoding) /* If row is coding and we need a coding gp, add it to pGpList and update pNeedCoding; * likewise for noncoding. */ { struct genePred *gp = genePredLoad(row->data); if (gp->cdsStart != gp->cdsEnd && *pNeedCoding) { slAddHead(pGpList, gp); *pNeedCoding = FALSE; } else if (gp->cdsStart == gp->cdsEnd && *pNeedNonCoding) { slAddHead(pGpList, gp); *pNeedNonCoding = FALSE; } } static void addGpFromPos(struct annoStreamer *geneStream, char *chrom, uint start, uint end, struct genePred **pGpList, boolean *pNeedCoding, boolean *pNeedNonCoding, struct lm *lm) /* Get up to 10 rows from position; if we find one coding and one non-coding gene, * add them to pGpList and update pNeed* accordingly. */ { geneStream->setRegion(geneStream, chrom, start, end); int rowCount = 0; struct annoRow *row = NULL; while (rowCount < 10 && (*pNeedCoding || *pNeedNonCoding) && (row = geneStream->nextRow(geneStream, NULL, 0, lm)) != NULL) addGpFromRow(pGpList, row, pNeedCoding, pNeedNonCoding); } static struct genePred *genesFromPosition(struct annoStreamer *geneStream, boolean *retGotCoding, boolean *retGotNonCoding) /* Try to get a coding and non-coding gene from geneStream at cart position. * If there are none, try whole chrom; if none there, first in assembly. */ { struct genePred *gpList = NULL; struct lm *lm = lmInit(0); char *chrom = NULL; uint start = 0, end = 0; getCartPosOrDie(&chrom, &start, &end); boolean needCoding = TRUE, needNonCoding = TRUE; // First, look for both coding and noncoding genes at cart position: addGpFromPos(geneStream, chrom, start, end, &gpList, &needCoding, &needNonCoding, lm); // If that didn't do it, try querying whole chrom if (needCoding || needNonCoding) addGpFromPos(geneStream, chrom, 0, 0, &gpList, &needCoding, &needNonCoding, lm); // If we still haven't found either coding or non-coding on the cart's current chrom, // try whole genome: if (needCoding && needNonCoding) addGpFromPos(geneStream, NULL, 0, 0, &gpList, &needCoding, &needNonCoding, lm); slSort(&gpList, genePredCmp); lmCleanup(&lm); if (retGotCoding) *retGotCoding = !needCoding; if (retGotNonCoding) *retGotNonCoding = !needNonCoding; return gpList; } static void addSnv(struct bed4 **pBedList, char *chrom, uint start, char *name) /* Add a single-nucleotide bed4 item. */ { slAddHead(pBedList, bed4New(chrom, start, start+1, name)); } // Stuff within this range (see gpFx.h) is considered upstream/downstream of a transcript: #define UPSTREAMFUDGE GPRANGE static struct bed4 *sampleVarBedFromGenePred(struct genePred *gp, struct annoAssembly *assembly, uint txSeqStart, uint txSeqEnd, boolean exonOnly) /* Construct imaginary variants that hit various parts of the transcript * described in gp, using reference base values from assembly. */ { uint basesLeft = gp->txStart - txSeqStart, basesRight = txSeqEnd - gp->txEnd; struct bed4 *bedList = NULL; if (!exonOnly && basesLeft > UPSTREAMFUDGE) // SNV, intergenic to the left addSnv(&bedList, gp->chrom, txSeqStart, "ex_igLeft"); if (!exonOnly && basesLeft > 0) // SNV, up/downstream to the left addSnv(&bedList, gp->chrom, gp->txStart - 1, "ex_updownLeft"); if (!exonOnly && gp->exonCount > 1) { // SNV, intron uint start = (gp->exonEnds[0] + gp->exonStarts[1]) / 2; addSnv(&bedList, gp->chrom, start, "ex_intron"); // SNV, splice addSnv(&bedList, gp->chrom, gp->exonStarts[1] - 2, "ex_splice"); } boolean isCoding = (gp->cdsStart != gp->cdsEnd); if (isCoding) { if (gp->txStart < gp->cdsStart) // SNV, UTR to the left addSnv(&bedList, gp->chrom, gp->txStart + 1, "ex_utrLeft"); int cdsExon = 0; while (gp->cdsStart > gp->exonEnds[cdsExon] && cdsExon < gp->exonCount) cdsExon++; uint start = gp->cdsStart + 2; // single-base insertion, leftmost codon slAddHead(&bedList, bed4New(gp->chrom, start, start, "ex_codonLeftIns")); // 3-base deletion leftmost codon slAddHead(&bedList, bed4New(gp->chrom, start, start+3, "ex_codonLeftDel")); // SNV, leftmost codon addSnv(&bedList, gp->chrom, start+1, "ex_codonLeft"); if (gp->txEnd > gp->cdsEnd) // SNV, UTR to the right addSnv(&bedList, gp->chrom, gp->txEnd - 1, "ex_utrRight"); } else { // noncoding exon variant uint start = (gp->exonStarts[0] + gp->exonEnds[0]) / 2; addSnv(&bedList, gp->chrom, start, "ex_nce"); } if (!exonOnly && basesRight > 0) // SNV, up/downstream to the left addSnv(&bedList, gp->chrom, gp->txEnd + 1, "ex_updownRight"); return bedList; } // margin for intergenic variants around transcript: #define IGFUDGE 5 static struct annoStreamer *makeSampleVariantsStreamer(struct annoAssembly *assembly, struct trackDb *geneTdb, int maxOutRows) /* Construct a VCF file of example variants for db (unless it already exists) * and return an annoStreamer for that file. If possible, make the variants hit a gene. */ { char *sampleFile = sampleVariantsPath(assembly, geneTdb->track); boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildSampleVariants", FALSE); if (! fileExists(sampleFile) || forceRebuild) { struct annoStreamer *geneStream = streamerFromTrack(assembly, geneTdb->table, geneTdb, NULL, NO_MAXROWS); boolean gotCoding = FALSE, gotNonCoding = FALSE; struct genePred *gpList = genesFromPosition(geneStream, &gotCoding, &gotNonCoding); FILE *f = mustOpen(sampleFile, "w"); writeMinimalVcfHeader(f, assembly->name); if (gpList == NULL) { warn("Unable to find any gene transcripts in '%s' (%s)", geneTdb->shortLabel, geneTdb->track); writeArbitraryVariants(f, assembly); } else { struct bed4 *bedList = NULL; uint chromSize = annoAssemblySeqSize(assembly, gpList->chrom); uint minSeqStart = chromSize, maxSeqEnd = 0; struct genePred *gp; for (gp = gpList; gp != NULL; gp = gp->next) { int txSeqStart = gp->txStart - (UPSTREAMFUDGE + IGFUDGE); uint txSeqEnd = gp->txEnd + UPSTREAMFUDGE + IGFUDGE; if (txSeqStart < 0) txSeqStart = 0; if (txSeqEnd > chromSize) txSeqEnd = chromSize; if (txSeqStart < minSeqStart) minSeqStart = txSeqStart; if (txSeqEnd > maxSeqEnd) maxSeqEnd = txSeqEnd; // If we got a coding gp, but this gp is non-coding, just do its exon variant: boolean exonOnly = gotCoding && (gp->cdsStart == gp->cdsEnd); slCat(&bedList, sampleVarBedFromGenePred(gp, assembly, txSeqStart, txSeqEnd, exonOnly)); } slSort(&bedList, bedCmp); struct dnaSeq *txSeq = twoBitReadSeqFragLower(assembly->tbf, gpList->chrom, minSeqStart, maxSeqEnd); struct bed4 *bed; for (bed = bedList; bed != NULL; bed = bed->next) { writeMinimalVcfRowFromBed(f, bed, txSeq, minSeqStart); } dnaSeqFree(&txSeq); bed4FreeList(&bedList); } geneStream->close(&geneStream); carefulClose(&f); } return annoStreamVcfNew(sampleFile, FALSE, assembly, maxOutRows); } static char *variantIdPath(struct annoAssembly *assembly, char *variantIds) /* Use the md5sum of the user's pasted/uploaded variants to make a hopefully * unique trash filename. */ { char *md5sum = md5HexForString(variantIds); char *subDir = "hgv"; mkdirTrashDirectory(subDir); struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), subDir, assembly->name, md5sum); return dyStringCannibalize(&dy); } static struct slName *hashListNames(struct hash *hash) /* Return a list of all element names in the hash (if any). */ { struct slName *list = NULL; struct hashCookie cookie = hashFirst(hash); struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) slAddHead(&list, slNameNew(hel->name)); return list; } static char *encloseInAngleBracketsDbSnp(char *stringIn) /* Return a string that has <dbSNP: stringIn>, with spaces replaced by '_'s. */ { int stringInLen = strlen(stringIn); int stringOutLen = stringInLen + strlen("<dbSNP:>") + 1; char *stringOut = needMem(stringOutLen); safef(stringOut, stringOutLen, "<dbSNP:%s>", stringIn); subChar(stringOut, ' ', '_'); return stringOut; } // dbSNP named alleles have many ways to describe a deletion from the reference, // for example "LARGEDELETION", "LARGE DELETION", "... DELETED", "... DEL": static const char *dbSnpDelRegex = "^\\(.*(DELET.*| DEL)\\)$"; static char **parseDbSnpAltAlleles(char *refAl, char *obsAls, boolean minusStrand, int *retAltAlCount, boolean *retNeedLeftBase) /* Given a non-symbolic reference allele and slash-sep observed alleles from dbSNP, * return an array of +-strand alleles that are not the same as the reference. * If any allele is "-" (deleted, zero-length), then set retNeedLeftBase to TRUE * because in this case VCF requires that the reference base to the left of the indel * must be added to all alleles, and the start coord also moves one base to the left. * Also, if any alt allele is symbolic, padding is required. * Note: this trashes obsAls. Resulting array can be freed but not its contents. */ { int obsCount = countChars(obsAls, '/') + 1; char *obsWords[obsCount]; chopByChar(obsAls, '/', obsWords, obsCount); boolean obsHasDeletion = FALSE; int i; for (i = 0; i < obsCount; i++) if (sameString(obsWords[i], "-")) { obsHasDeletion = TRUE; break; } char **altAls; AllocArray(altAls, obsCount); int altCount = 0; boolean needLeftBase = isEmpty(refAl) || sameString(refAl, "-"); for (i = 0; i < obsCount; i++) { char *altAl = obsWords[i]; int altAlLen = strlen(altAl); if (minusStrand && isAllNt(altAl, altAlLen)) reverseComplement(altAl, altAlLen); if (differentString(altAl, refAl)) { if (sameString(altAl, "-")) { altAls[altCount] = ""; needLeftBase = TRUE; } else { // It would be nice to expand the "(CA)11/12/14/15/16/17/18/19/20" syntax of // some dbSNP observed's. What are these?: "(D1S243)", "(D1S2870)" // Unfortunately for observed="lengthTooLong" we just can't get the correct allele // sequence. (76,130 of those in snp138) // Hmmm, I guess we could at least stick in the right number of N's if we can // parse "(245 BP INSERTION)". (2403 rows rlike "[0-9]+ BP ?INSERTION" in snp138) if (!isAllNt(altAl, altAlLen)) { // Symbolic allele: left base required, and enclose it in <dbSNP:>'s. // But if it's one of dbSNP's LARGEDELETION kind of alleles, that is redundant // with the reference allele, so if we know there is already a "-" allele, // skip it. if (obsHasDeletion && regexMatch(altAl, dbSnpDelRegex)) continue; needLeftBase = TRUE; altAl = encloseInAngleBracketsDbSnp(altAl); } altAls[altCount] = altAl; } altCount++; } } *retAltAlCount = altCount; *retNeedLeftBase = needLeftBase; return altAls; } char *firstNCommaSep(struct slName *nameList, int n) /* Return a comma-separated string with the first n names in nameList. */ { struct dyString *dy = dyStringNew(0); int i; struct slName *el; for (i=0, el=nameList; i < 5 && el != NULL; i++, el = el->next) { if (i > 0) dyStringAppend(dy, ", "); dyStringPrintf(dy, "'%s'", el->name); } return dyStringCannibalize(&dy); } //#*** Variant ID-matching should be metadata-driven too. termRegex -> data source. static const char *rsIdRegex = "^rs[0-9]+$"; static void rsIdsToVcfRecords(struct annoAssembly *assembly, struct slName *rsIds, struct vcfFile *vcff, struct vcfRecord **pRecList, struct slName **pCommentList) /* If possible, look up coordinates and alleles of dbSNP rs IDs. */ { if (rsIds == NULL) return; char *table = findLatestSnpTable(NULL); if (table == NULL) return; struct sqlConnection *conn = hAllocConn(assembly->name); // Build a 'name in (...)' query, and build a hash of IDs so we can test whether all were found struct dyString *dq = sqlDyStringCreate("select chrom, chromStart, chromEnd, name, strand, " "refUCSC, observed from %s where name in (", table); struct hash *idHash = hashNew(0); struct slName *id; for (id = rsIds; id != NULL; id = id->next) { tolowers(id->name); dyStringPrintf(dq, "%s'%s'", (id != rsIds ? "," : ""), id->name); hashStoreName(idHash, id->name); } dyStringAppend(dq, ");"); struct sqlResult *sr = sqlGetResult(conn, dq->string); // Construct a minimal VCF row to make a vcfRecord for each variant. char *vcfRow[9]; vcfRow[5] = vcfRow[6] = vcfRow[7] = "."; // placeholder for qual, filter, info // It would be cool to someday add snpNNN's exceptions column to the filter or info. struct dyString *dyAltAlStr = dyStringNew(0); int i; char **row; while ((row = sqlNextRow(sr)) != NULL) { char *chrom = row[0]; uint chromStart = atoll(row[1]); uint chromEnd = atoll(row[2]); char *name = row[3]; char *strand = row[4]; char refAl[max(strlen(row[5]), chromEnd-chromStart) + 2]; safecpy(refAl, sizeof(refAl), row[5]); if (sameString(refAl, "-")) refAl[0] = '\0'; else if (! isAllNt(refAl, strlen(refAl))) { // refUCSC is symbolic like "( 2548bp insertion )" -- just use the actual reference // sequence for now, to keep things simple downstream. //#*** Need something more efficient, like a sequence cache inside assembly! struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, chromStart, chromEnd); safecpy(refAl, sizeof(refAl), seq->dna); toUpperN(refAl, seq->size); dnaSeqFree(&seq); } char *obsAls = row[6]; int altAlCount = 0; boolean needLeftBase = FALSE; char **altAls = parseDbSnpAltAlleles(refAl, obsAls, sameString(strand, "-"), &altAlCount, &needLeftBase); needLeftBase |= (chromStart == chromEnd); // should be redundant, but just in case. hashRemove(idHash, name); uint vcfStart = chromStart + 1; dyStringClear(dyAltAlStr); // If this is a non-symbolic indel, we'll have to move start one base to the left // and add the reference base to the left of the actual alleles if (needLeftBase) { vcfStart--; //#*** Need something more efficient, like a sequence cache inside assembly! struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, chromStart-1, chromStart); toUpperN(seq->dna, 1); char leftBase = seq->dna[0]; dnaSeqFree(&seq); char refAlPlus[strlen(refAl)+2]; safef(refAlPlus, sizeof(refAlPlus), "%c%s", leftBase, refAl); safecpy(refAl, sizeof(refAl), refAlPlus); for (i = 0; i < altAlCount; i++) { if (i > 0) dyStringAppendC(dyAltAlStr, ','); if (isAllNt(altAls[i], strlen(altAls[i]))) dyStringPrintf(dyAltAlStr, "%c%s", leftBase, altAls[i]); else dyStringAppend(dyAltAlStr, altAls[i]); } } else { // Not an indel, just make comma-sep string of alt alleles. for (i = 0; i < altAlCount; i++) { if (i > 0) dyStringAppendC(dyAltAlStr, ','); dyStringAppend(dyAltAlStr, altAls[i]); } } char vcfStartStr[64]; safef(vcfStartStr, sizeof(vcfStartStr), "%d", vcfStart); vcfRow[0] = chrom; vcfRow[1] = vcfStartStr; vcfRow[2] = name; vcfRow[3] = refAl; vcfRow[4] = dyAltAlStr->string; struct vcfRecord *rec = vcfRecordFromRow(vcff, vcfRow); slAddHead(pRecList, rec); } dyStringFree(&dyAltAlStr); // Check for IDs not found struct slName *notFoundIds = hashListNames(idHash); if (notFoundIds != NULL) { char *namesNotFound = firstNCommaSep(notFoundIds, 5); struct dyString *dy = dyStringCreate("%d rs# IDs not found, e.g. %s", slCount(notFoundIds), namesNotFound); slAddTail(pCommentList, slNameNew(dy->string)); freeMem(namesNotFound); dyStringFree(&dy); } slNameFreeList(¬FoundIds); } static struct vcfRecord *parseVariantIds(struct annoAssembly *assembly, char *variantIdText, struct slName **pCommentList) /* Return a sorted list of minimal vcfRecords (coords, name, ref and alt alleles) * corresponding to each pasted variant. */ { struct vcfRecord *recList = NULL; char *p = cloneString(variantIdText), *id; struct slName *rsIds = NULL, *unknownIds = NULL; subChar(p, ',', ' '); while ((id = nextWord(&p)) != NULL) { if (regexMatchNoCase(id, rsIdRegex)) slNameAddHead(&rsIds, id); else slNameAddHead(&unknownIds, id); } if (unknownIds != NULL) { slReverse(&unknownIds); char *firstUnknownIds = firstNCommaSep(unknownIds, 5); struct dyString *dy = dyStringCreate("%d variant identifiers are unrecognized, e.g. %s", slCount(unknownIds), firstUnknownIds); slAddTail(pCommentList, slNameNew(dy->string)); freeMem(firstUnknownIds); dyStringFree(&dy); } struct vcfFile *vcff = vcfFileNew(); rsIdsToVcfRecords(assembly, rsIds, vcff, &recList, pCommentList); slSort(&recList, vcfRecordCmp); slNameFreeList(&rsIds); slNameFreeList(&unknownIds); return recList; } static void adjustRangeForVariants(struct vcfRecord *recList, char **pChrom, uint *pStart, uint *pEnd) /* Given a *sorted* list of VCF records, look at the first and last records and if * they fall outside of the range, expand the range to include them. */ { if (pChrom != NULL && *pChrom != NULL && recList != NULL) { // We have a non-empty sorted list of records, and search is not genome-wide; // look at first and last variants to see if they're not already included in search range. struct vcfRecord *first = recList, *last = recList; while (last->next) last = last->next; if (differentString(*pChrom, first->chrom) || differentString(*pChrom, last->chrom)) { // Variants span multiple chroms; force genome-wide search. *pChrom = NULL; *pStart = *pEnd = 0; } else { // Variant(s) are on the same chrom as search range; check starts and ends. if (*pStart > first->chromEnd) *pStart = first->chromEnd; if (*pEnd < last->chromStart) *pEnd = last->chromStart; } } } static struct annoStreamer *makeVariantIdStreamer(struct annoAssembly *assembly, int maxOutRows, char **pChrom, uint *pStart, uint *pEnd, struct slName **pCommentList) /* Translate user's pasted/uploaded variant IDs into minimal VCF if possible. * Return a VCF streamer for those, and if the current search position is too narrow * to include all of the variants, widen it as necessary. */ { // Hash variant text to get trash filename. Use if exists, otherwise build it. char *variantIds = cartString(cart, hgvaVariantIds); char *varFile = variantIdPath(assembly, variantIds); boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildVariantIds", FALSE); if (! fileExists(varFile) || forceRebuild) { struct vcfRecord *varList = parseVariantIds(assembly, variantIds, pCommentList); //#*** If no variants were recognized, we should probably show main page with a warning. adjustRangeForVariants(varList, pChrom, pStart, pEnd); FILE *f = mustOpen(varFile, "w"); writeMinimalVcfHeader(f, assembly->name); struct vcfRecord *var; for (var = varList; var != NULL; var = var->next) writeMinimalVcfRow(f, var); carefulClose(&f); } return annoStreamVcfNew(varFile, FALSE, assembly, maxOutRows); } static struct trackDb *getVariantTrackDb(char *variantTrack) /* Get a tdb for the user's selected variant track, or warn and return NULL * (main page will be displayed) */ { struct trackDb *varTdb = tdbForTrack(database, variantTrack, &fullTrackList); if (varTdb == NULL) { if (isHubTrack(variantTrack)) warn("Can't find hub track '%s'", variantTrack); else warn("Can't find tdb for variant track '%s'", variantTrack); } else checkVariantTrack(varTdb); return varTdb; } static void adjustGpVarOverlapRule(struct annoGrator *gpVarGrator, boolean haveRegulatory) /* If we're able to detect regulatory elements, and want to keep those annotations, loosen up * gpVarGrator's overlap rule from the default (must overlap). */ { if (haveRegulatory && cartUsualBoolean(cart, "hgva_include_regulatory", TRUE)) gpVarGrator->setOverlapRule(gpVarGrator, agoNoConstraint); } void doQuery() /* Translate simple form inputs into anno* components and execute query. */ { dyInfo = dyStringNew(0); char *chrom = NULL; uint start = 0, end = 0; if (sameString(regionType, hgvaRegionTypeRange)) getCartPosOrDie(&chrom, &start, &end); struct annoAssembly *assembly = getAnnoAssembly(database); char *geneTrack = cartString(cart, "hgva_geneTrack"); struct trackDb *geneTdb = tdbForTrack(database, geneTrack, &fullTrackList); if (geneTdb == NULL) { warn("Can't find tdb for gene track %s", geneTrack); doUi(); return; } int maxVarRows = cartUsualInt(cart, "hgva_variantLimit", 10); struct annoStreamer *primary = NULL; char *primaryLongLabel = NULL; char *variantTrack = cartString(cart, "hgva_variantTrack"); struct slName *commentList = NULL; if (sameString(variantTrack, hgvaSampleVariants)) { primary = makeSampleVariantsStreamer(assembly, geneTdb, maxVarRows); primaryLongLabel = hgvaSampleVariantsLabel; // Sample variants can't always be made within the currently selected position range, // so just for these, force search to be genome-wide. chrom = NULL; start = 0; end = 0; } else if (sameString(variantTrack, hgvaUseVariantIds)) { // Override search position if cart position doesn't include all variants: primary = makeVariantIdStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList); primaryLongLabel = hgvaVariantIdsLabel; } else { struct trackDb *varTdb = getVariantTrackDb(variantTrack); if (varTdb == NULL) { doUi(); return; } primary = streamerFromTrack(assembly, varTdb->table, varTdb, chrom, maxVarRows); primaryLongLabel = varTdb->longLabel; } enum annoGratorOverlap geneOverlapRule = agoMustOverlap; struct annoGrator *gpVarGrator = gratorFromTrackDb(assembly, geneTdb->table, geneTdb, chrom, NO_MAXROWS, primary->asObj, geneOverlapRule); setGpVarFuncFilter(gpVarGrator); // Some grators may be used as both filters and output values. To avoid making // multiple grators for the same source, hash them by trackName: struct hash *gratorsByName = hashNew(8); struct annoGrator *snpGrator = NULL; char *snpDesc = NULL; if (cartUsualBoolean(cart, "hgva_rsId", FALSE)) snpGrator = gratorForSnpBed4(gratorsByName, "", assembly, chrom, agoNoConstraint, &snpDesc); // Now construct gratorList in the order in which annoFormatVep wants to see them, // i.e. first the gpVar, then the snpNNN, then whatever else: struct annoGrator *gratorList = NULL; slAddHead(&gratorList, gpVarGrator); if (snpGrator != NULL) slAddHead(&gratorList, snpGrator); // Text or HTML output? char *outFormat = cartUsualString(cart, "hgva_outFormat", "vepTab"); boolean doHtml = sameString(outFormat, "vepHtml"); // Initialize VEP formatter: struct annoFormatter *vepOut = annoFormatVepNew("stdout", doHtml, primary, primaryLongLabel, (struct annoStreamer *)gpVarGrator, geneTdb->longLabel, (struct annoStreamer *)snpGrator, snpDesc, assembly); boolean haveRegulatory = FALSE; addOutputTracks(&gratorList, gratorsByName, vepOut, assembly, chrom, doHtml, &haveRegulatory); adjustGpVarOverlapRule(gpVarGrator, haveRegulatory); addFilterTracks(&gratorList, gratorsByName, assembly, chrom); slReverse(&gratorList); if (doHtml) { webStart(cart, database, "Annotated Variants in VEP/HTML format"); } else { // Undo the htmlPushEarlyHandlers() because after this point they make ugly text: popWarnHandler(); popAbortHandler(); textOpen(); webStartText(); } if (isNotEmpty(dyInfo->string)) puts(dyInfo->string); struct annoGratorQuery *query = annoGratorQueryNew(assembly, primary, gratorList, vepOut); struct slName *comment; for (comment = commentList; comment != NULL; comment = comment->next) vepOut->comment(vepOut, comment->name); if (chrom != NULL) annoGratorQuerySetRegion(query, chrom, start, end); annoGratorQueryExecute(query); annoGratorQueryFree(&query); if (doHtml) webEnd(); else textOutClose(&compressPipeline); } int main(int argc, char *argv[]) /* Process command line. */ { long enteredMainTime = clock1000(); if (hIsPrivateHost()) pushCarefulMemHandler(LIMIT_2or6GB); htmlPushEarlyHandlers(); /* Make errors legible during initialization. */ cgiSpoof(&argc, argv); oldVars = hashNew(10); setUdcCacheDir(); boolean startQuery = (cgiUsualString("hgva_startQuery", NULL) != NULL); if (startQuery) cart = cartAndCookieNoContent(hUserCookie(), excludeVars, oldVars); else cart = cartAndCookie(hUserCookie(), excludeVars, oldVars); /* Set up global variables. */ getDbAndGenome(cart, &database, &genome, oldVars); regionType = cartUsualString(cart, hgvaRegionType, hgvaRegionTypeGenome); if (isEmpty(cartOptionalString(cart, hgvaRange))) cartSetString(cart, hgvaRange, hDefaultPos(database)); int timeout = cartUsualInt(cart, "udcTimeout", 300); if (udcCacheTimeout() < timeout) udcSetCacheTimeout(timeout); knetUdcInstall(); initGroupsTracksTables(cart, &fullTrackList, &fullGroupList); if (lookupPosition(cart, hgvaRange)) { if (startQuery) doQuery(); else doUi(); } else { // Revert to lastPosition if we have multiple matches or warnings, // especially in case user manually edits browser location as in #13009: char *position = cartUsualString(cart, "lastPosition", hDefaultPos(database)); cartSetString(cart, hgvaRange, position); if (webGotWarnings()) { // We land here when lookupPosition pops up a warning box. // Reset the problematic position and show the main page. doMainPage(); } // If lookupPosition returned FALSE and didn't report warnings, // then it wrote HTML showing multiple position matches & links. } cartCheckout(&cart); cgiExitTime("hgVai", enteredMainTime); return 0; }