a21b8d5bb7d8ffa63069c412ad71fbaeeb9f4e55 angie Wed Jun 25 11:00:52 2014 -0700 Fixing bug found by Jonathan: with clean cart, when showing contents offilter section for Regulatory Elements, the multiselects didn't appear. Turns out that the 'force' argument of ddcl.reinit actually makes it do nothing! To avoid disrupting anything else, for now I'll just forgo force. refs #11461 diff --git src/hg/hgVai/hgVai.c src/hg/hgVai/hgVai.c index 884a60a..75401da 100644 --- src/hg/hgVai/hgVai.c +++ src/hg/hgVai/hgVai.c @@ -1,2267 +1,2267 @@ /* hgVai - Variant Annotation Integrator. */ /* Copyright (C) 2014 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "jksql.h" #include "htmshell.h" #include "web.h" #include "cheapcgi.h" #include "cart.h" #include "hui.h" #include "grp.h" #include "hCommon.h" #include "hgFind.h" #include "hPrint.h" #include "jsHelper.h" #include "memalloc.h" #include "textOut.h" #include "trackHub.h" #include "hubConnect.h" #include "twoBit.h" #include "gpFx.h" #include "udc.h" #include "knetUdc.h" #include "md5.h" #include "regexHelper.h" #include "annoGratorQuery.h" #include "annoGratorGpVar.h" #include "annoFormatVep.h" #include "annoStreamBigBed.h" #include "libifyMe.h" /* Global Variables */ struct cart *cart; /* CGI and other variables */ struct hash *oldVars = NULL; /* The cart before new cgi stuff added. */ char *genome = NULL; /* Name of genome - mouse, human, etc. */ char *database = NULL; /* Current genome database - hg17, mm5, etc. */ char *regionType = NULL; /* genome, ENCODE pilot regions, or specific position range. */ struct grp *fullGroupList = NULL; /* List of all groups. */ struct trackDb *fullTrackList = NULL; /* List of all tracks in database. */ static struct pipeline *compressPipeline = (struct pipeline *)NULL; // Null terminated list of CGI Variables we don't want to save permanently: char *excludeVars[] = {"Submit", "submit", "hgva_startQuery", NULL,}; #define hgvaRange "position" #define hgvaRegionType "hgva_regionType" #define hgvaRegionTypeEncode "encode" #define hgvaRegionTypeGenome "genome" #define hgvaRegionTypeRange "range" #define hgvaPositionContainer "positionContainer" #define hgvaSampleVariants "hgva_internally_generated_sample_variants" #define hgvaSampleVariantsLabel "Artificial Example Variants" #define hgvaUseVariantIds "hgva_useVariantIds" #define hgvaVariantIdsLabel "Variant Identifiers" #define hgvaVariantIds "hgva_variantIds" #define hgvaVariantPasteContainer "variantPasteContainer" void addSomeCss() /*#*** This should go in a .css file of course. */ { printf("<style>\n" "div.sectionLite { border-width: 1px; border-color: #"HG_COL_BORDER"; border-style: solid;" " background-color: #"HG_COL_INSIDE"; padding-left: 10px; padding-right: 10px; " " padding-top: 8px; padding-bottom: 5px; margin-top: 5px; margin-bottom: 5px }\n" ".sectionLiteHeader { font-weight: bold; font-size:larger; color:#000000;" " text-align:left; vertical-align:bottom; white-space:nowrap; }\n" "div.sectionLiteHeader.noReorderRemove { padding-bottom:5px; }\n" "div.sourceFilter { padding-top: 5px; padding-bottom: 5px }\n" "</style>\n"); } INLINE void startCollapsibleSection(char *sectionSuffix, char *title, boolean onByDefault) // Wrap shared args to jsBeginCollapsibleSectionFontSize { jsBeginCollapsibleSectionFontSize(cart, "hgva", sectionSuffix, title, onByDefault, "1.1em"); } INLINE void startSmallCollapsibleSection(char *sectionSuffix, char *title, boolean onByDefault) // Wrap shared args to jsBeginCollapsibleSectionFontSize { jsBeginCollapsibleSectionFontSize(cart, "hgva", sectionSuffix, title, onByDefault, "0.9em"); } #define endCollapsibleSection jsEndCollapsibleSection static struct dyString *onChangeStart() /* Start up a javascript onChange command */ { struct dyString *dy = jsOnChangeStart(); jsTextCarryOver(dy, hgvaRegionType); jsTextCarryOver(dy, hgvaRange); return dy; } static char *onChangeClade() /* Return javascript executed when they change clade. */ { struct dyString *dy = onChangeStart(); jsDropDownCarryOver(dy, "clade"); dyStringAppend(dy, " document.hiddenForm.org.value=0;"); dyStringAppend(dy, " document.hiddenForm.db.value=0;"); dyStringAppend(dy, " document.hiddenForm." hgvaRange ".value='';"); return jsOnChangeEnd(&dy); } static char *onChangeOrg() /* Return javascript executed when they change organism. */ { struct dyString *dy = onChangeStart(); jsDropDownCarryOver(dy, "clade"); jsDropDownCarryOver(dy, "org"); dyStringAppend(dy, " document.hiddenForm.db.value=0;"); dyStringAppend(dy, " document.hiddenForm." hgvaRange ".value='';"); return jsOnChangeEnd(&dy); } static char *onChangeDb() /* Return javascript executed when they change database. */ { struct dyString *dy = onChangeStart(); jsDropDownCarryOver(dy, "clade"); jsDropDownCarryOver(dy, "db"); dyStringAppend(dy, " document.hiddenForm." hgvaRange ".value='';"); return jsOnChangeEnd(&dy); } INLINE void printOption(char *val, char *selectedVal, char *label) /* For rolling our own select without having to build conditional arrays/lists. */ { printf("<OPTION VALUE='%s'%s>%s\n", val, (sameString(selectedVal, val) ? " SELECTED" : ""), label); } void printRegionListHtml(char *db) /* Make a dropdown choice of region type, with position input box that appears if needed. * Unlike hgTables, don't bother with ENCODE pilot regions -- unless someone misses it. * Return the selected region type. */ { printf("<SELECT ID='"hgvaRegionType"' NAME='"hgvaRegionType"' " "onchange=\"hgva.changeRegion();\">\n"); printOption(hgvaRegionTypeGenome, regionType, "genome"); printOption(hgvaRegionTypeRange, regionType, "position or search term"); printf("</SELECT>"); } void topLabelSpansStart(char *label) { printf("<span style='display: inline-block; padding-right: 5px;'>" "<span style='display: block;'>%s</span>\n" "<span style='display: block; padding-bottom: 5px;'>\n", label); } void topLabelSpansEnd() { printf("</span></span>"); } char *makePositionInput() /* Return HTML for the position input. */ { struct dyString *dy = dyStringCreate("<INPUT TYPE=TEXT NAME=\"%s\" SIZE=%d VALUE=\"%s\">", hgvaRange, 45, addCommasToPos(NULL, cartString(cart, hgvaRange))); return dyStringCannibalize(&dy); } void printCtAndHubButtons() /* Print a div with buttons for hgCustom and hgHubConnect */ { puts("<div style='padding-top: 5px; padding-bottom: 5px'>"); hOnClickButton("document.customTrackForm.submit(); return false;", hasCustomTracks(cart) ? CT_MANAGE_BUTTON_LABEL : CT_ADD_BUTTON_LABEL); printf(" "); if (hubConnectTableExists()) hOnClickButton("document.trackHubForm.submit(); return false;", "track hubs"); nbSpaces(3); printf("To reset <B>all</B> user cart settings (including custom tracks), \n" "<A HREF=\"/cgi-bin/cartReset?destination=%s\">click here</A>.\n", cgiScriptName()); puts("</div>"); } void hgGatewayCladeGenomeDb() /* Make a row of labels and row of buttons like hgGateway, but not using tables. */ { boolean gotClade = hGotClade(); if (gotClade) { topLabelSpansStart("clade"); printCladeListHtml(genome, onChangeClade()); topLabelSpansEnd(); } topLabelSpansStart("genome"); if (gotClade) printGenomeListForCladeHtml(database, onChangeOrg()); else printGenomeListHtml(database, onChangeOrg()); topLabelSpansEnd(); topLabelSpansStart("assembly"); printAssemblyListHtml(database, onChangeDb()); topLabelSpansEnd(); puts("<BR>"); topLabelSpansStart("region to annotate"); printRegionListHtml(database); topLabelSpansEnd(); topLabelSpansStart(""); // Yet another span, for hiding/showing position input and lookup button: printf("<span id='"hgvaPositionContainer"'%s>\n", differentString(regionType, hgvaRegionTypeRange) ? " style='display: none;'" : ""); puts(makePositionInput()); printf("</span>\n"); topLabelSpansEnd(); puts("<BR>"); } void printAssemblySection() /* Print assembly-selection stuff. * Redrawing the whole page when the assembly changes seems fine to me. */ { //#*** ---------- More copied verbatim, from hgTables/mainPage.c: ----------- /* Hidden form - for benefit of javascript. */ { static char *saveVars[] = { "clade", "org", "db", hgvaRange, hgvaRegionType }; jsCreateHiddenForm(cart, cgiScriptName(), saveVars, ArraySize(saveVars)); } /* Hidden form for jumping to custom tracks CGI. */ printf("<FORM ACTION='%s' NAME='customTrackForm'>", hgCustomName()); cartSaveSession(cart); printf("</FORM>\n"); /* Hidden form for jumping to track hub manager CGI. */ printf("<FORM ACTION='%s' NAME='trackHubForm'>", hgHubConnectName()); //#*** well, almost verbatim... lib version should use cgiScriptName. cartSaveSession(cart); printf("</FORM>\n"); printf("<FORM ACTION=\"%s\" NAME=\"mainForm\" ID=\"mainForm\" METHOD=%s>\n", cgiScriptName(), cartUsualString(cart, "formMethod", "GET")); cartSaveSession(cart); //#*** ------------------ end verbatim --------------- printf("<div class='sectionLiteHeader noReorderRemove'>" "Select Genome Assembly and Region</div>\n"); /* Print clade, genome and assembly line. */ hgGatewayCladeGenomeDb(); //#*** No longer ending form here... } void factorSourceListInputProperties(struct trackDb *tdb, struct slName **retFactorList, struct slName **retCellTypeList, struct slName **retTreatmentList) /* Get a list of factor names used in factorSource track. */ { char *inputsTable = trackDbSetting(tdb, "inputTrackTable"); if (isEmpty(inputsTable)) errAbort("track %s does not have an inputTrackTable setting", tdb->track); char query[2048]; struct sqlConnection *conn = hAllocConn(database); if (retFactorList && retCellTypeList && retTreatmentList) { sqlSafef(query, sizeof(query), "select distinct(factor) from %s order by factor", inputsTable); *retFactorList = sqlQuickList(conn, query); sqlSafef(query, sizeof(query), "select distinct(cellType) from %s order by cellType", inputsTable); *retCellTypeList = sqlQuickList(conn, query); sqlSafef(query, sizeof(query), "select distinct(treatment) from %s order by treatment", inputsTable); *retTreatmentList = sqlQuickList(conn, query); } else errAbort("factorSourceListInputProperties: ret args must be non-NULL."); hFreeConn(&conn); } void printMultiselect(char *track, char *label, char *var, struct slName *optionList) /* Print a label and multi-select, with hgva_track_var as cart var, with options in optionList, * marking as selected any options stored in cart. */ { printf("%s: ", label); char cartVar[1024]; safef(cartVar, sizeof(cartVar), "hgva_filter_%s_%s", track, var); struct hash *selHash = NULL; if (cartListVarExists(cart, cartVar)) selHash = hashFromSlNameList(cartOptionalSlNameList(cart, cartVar)); printf("<select id=\"%s\" name=\"%s\" style=\"display: none; font-size:.9em;\" " "class=\"filterBy\" multiple>\n", cartVar, cartVar); char *selected = ""; if (!selHash || hashLookup(selHash, "All")) selected = " selected"; printf("<option%s>All</option>", selected); struct slName *option; for (option = optionList; option != NULL; option = option->next) { selected = ""; if (selHash && hashLookup(selHash, option->name)) selected = " selected"; printf("<option%s>%s</option>", selected, option->name); } printf("</select>\n"); char shadowVar[1024]; safef(shadowVar, sizeof(shadowVar), "%s%s", cgiMultListShadowPrefix(), cartVar); cgiMakeHiddenVar(shadowVar, "1"); //printf("<script>$(document).ready(function(){ ddcl.setup($('#%s')[0]); });</script>\n", cartVar); } void printFilterOptions(struct trackDb *tdb) /* Print a collapsible filter section for tdb, with controls depending on tdb->type. */ { char sectionName[512], cartVar[512]; safef(sectionName, sizeof(sectionName), "%s_filter", tdb->track); if (sameString(tdb->type, "factorSource")) { puts("<TABLE>"); startSmallCollapsibleSection(sectionName, "filter items", FALSE); struct slName *factorOptions = NULL, *cellTypeOptions = NULL, *treatmentOptions = NULL; factorSourceListInputProperties(tdb, &factorOptions, &cellTypeOptions, &treatmentOptions); printMultiselect(tdb->track, "factor", "name", factorOptions); printMultiselect(tdb->track, "cell type", "cellType", cellTypeOptions); printMultiselect(tdb->track, "treatment", "treatment", treatmentOptions); puts("<BR>"); puts("minimum peak score [0-1000]: "); safef(cartVar, sizeof(cartVar), "hgva_filter_%s_score", tdb->track); char *defaultScore = cartUsualString(cart, cartVar, "0"); printf("<input type=text name=\"%s\" size=12 value=\"%s\"><BR>", cartVar, defaultScore); // The dimensions of ui-dropdownchecklist multiselects are not correct when // the item is hidden. So, when this filter section is made visible, reinit them. printf("<script>\n" "$(function(){" "$('tr[id^=\"%s-\"]').bind('show'," " function(jqev) { \n" " var $multisels = $(jqev.target).find('.filterBy');\n" " var multiselElList = $multisels.each(function(ix, el){ return el; });\n" - " ddcl.reinit(multiselElList, true);" + " ddcl.reinit(multiselElList);" " });\n" "});" "</script>\n", sectionName); puts("</TABLE>"); endCollapsibleSection(); } if (startsWith("bed 5", tdb->type)) //#*** TODO: detect bed# properly { puts("<TABLE>"); startSmallCollapsibleSection(sectionName, "filter items", FALSE); //#*** Also watch out for noScoreFilter or whatever it's called puts("minimum peak score [0-1000]: "); safef(cartVar, sizeof(cartVar), "hgva_filter_%s_score", tdb->track); char *defaultScore = cartUsualString(cart, cartVar, "0"); printf("<input type=text name=\"%s\" size=12 value=\"%s\"><BR>", cartVar, defaultScore); puts("</TABLE>"); endCollapsibleSection(); } } typedef boolean TdbFilterFunction(struct trackDb *tdb, void *filterData); /* Return TRUE if tdb passes filter criteria. */ void rFilterTrackList(struct trackDb *trackList, struct slRef **pPassingRefs, TdbFilterFunction *filterFunc, void *filterData) /* Recursively apply filterFunc and filterData to all tracks in trackList and * their subtracks. Add an slRef to pPassingRefs for each track/subtrack that passes. * Caller should slReverse(pPassingRefs) when recursion is all done. */ { struct trackDb *tdb; for (tdb = trackList; tdb != NULL; tdb = tdb->next) { if (tdb->subtracks != NULL) rFilterTrackList(tdb->subtracks, pPassingRefs, filterFunc, filterData); if (filterFunc(tdb, filterData)) slAddHead(pPassingRefs, slRefNew(tdb)); } } void tdbFilterGroupTrack(struct trackDb *fullTrackList, struct grp *fullGroupList, TdbFilterFunction *filterFunc, void *filterData, struct slRef **retGroupRefList, struct slRef **retTrackRefList) /* Apply filterFunc and filterData to each track in fullTrackList and all subtracks; * make an slRef list of tracks that pass the filter (retTrackRefList) and an slRef * list of groups to which the passing tracks belong (retGroupRefList). */ { struct slRef *trackRefList = NULL; rFilterTrackList(fullTrackList, &trackRefList, filterFunc, filterData); slReverse(&trackRefList); if (retTrackRefList != NULL) *retTrackRefList = trackRefList; if (retGroupRefList != NULL) { // Which groups have tracks that passed the filter? struct hash *hash = hashNew(0); struct slRef *trackRef; for (trackRef = trackRefList; trackRef != NULL; trackRef = trackRef->next) { struct trackDb *tdb = trackRef->val; hashAddInt(hash, tdb->grp, TRUE); } struct slRef *groupRefList = NULL; struct grp *grp; for (grp = fullGroupList; grp != NULL; grp = grp->next) { if (hashIntValDefault(hash, grp->name, FALSE)) slAddHead(&groupRefList, slRefNew(grp)); } hashFree(&hash); slReverse(&groupRefList); *retGroupRefList = groupRefList; } } boolean isVariantCustomTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get custom or hub tracks with type pgSnp or vcf(Tabix). */ { return ((sameString(tdb->grp, "user") || isHubTrack(tdb->track)) && (sameString(tdb->type, "pgSnp") || startsWith("vcf", tdb->type))); } // Function prototype to avoid shuffling code around: char *findLatestSnpTable(char *suffix); /* Return the name of the 'snp1__<suffix>' table with the highest build number, if any. */ void selectVariants(struct slRef *varGroupList, struct slRef *varTrackList) /* Offer selection of user's variant custom tracks, example variants, pasted input etc. */ { printf("<div class='sectionLiteHeader'>Select Variants</div>\n"); printf("If you have more than one custom track or hub track in " "<A HREF='../FAQ/FAQformat.html#format10' TARGET=_BLANK>pgSnp</A> or " "<A HREF='../goldenPath/help/vcf.html' TARGET=_BLANK>VCF</A> format, " "please select the one you wish to annotate:<BR>\n"); printf("<B>variants: </B>"); printf("<SELECT ID='hgva_variantTrack' NAME='hgva_variantTrack' " "onchange=\"hgva.changeVariantSource();\">\n"); char *selected = cartUsualString(cart, "hgva_variantTrack", ""); struct slRef *ref; for (ref = varTrackList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; printOption(tdb->track, selected, tdb->longLabel); } printOption(hgvaSampleVariants, selected, hgvaSampleVariantsLabel); boolean hasSnps = (findLatestSnpTable(NULL) != NULL); if (hasSnps) printOption(hgvaUseVariantIds, selected, hgvaVariantIdsLabel); printf("</SELECT><BR>\n"); if (hasSnps) { printf("<div id='"hgvaVariantPasteContainer"'%s>\n", differentString(selected, hgvaUseVariantIds) ? " style='display: none;'" : ""); printf("Enter dbSNP rs# identifiers separated by whitespace or commas:<BR>\n"); char *oldPasted = cartUsualString(cart, hgvaVariantIds, ""); cgiMakeTextArea(hgvaVariantIds, oldPasted, 10, 70); puts("</div>"); } printf("<B>maximum number of variants to be processed:</B>\n"); char *curLimit = cartUsualString(cart, "hgva_variantLimit", "10000"); char *limitLabels[] = { "10", "100", "1,000", "10,000", "100,000" }; char *limitValues[] = { "10", "100", "1000", "10000", "100000" }; cgiMakeDropListWithVals("hgva_variantLimit", limitLabels, limitValues, ArraySize(limitLabels), curLimit); printCtAndHubButtons(); puts("<BR>"); } boolean isGeneTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get genePred tracks. */ { return (startsWith("genePred", tdb->type)); } boolean selectGenes() /* Let user select a gene predictions track; return FALSE if there are no genePred tracks. */ { struct slRef *trackRefList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isGeneTrack, NULL, NULL, &trackRefList); boolean gotGP = (trackRefList != NULL); if (!gotGP) warn("This assembly (%s) has no gene prediction tracks, " "so the VAI will not be able to annotate it.", database); printf("<div class='sectionLiteHeader'>Select Genes</div>\n"); if (gotGP) printf("The gene predictions selected here will be used "); else printf("Gene predictions are required in order "); printf("to determine the effect of " "each variant on genes, for example intronic, missense, splice site, intergenic etc."); if (!gotGP) printf(" Since this assembly has no gene prediction tracks, " "the VAI can't provide functional annotations. " "Please select a different genome.<BR>"); printf("<BR>\n"); char *selected = cartUsualString(cart, "hgva_geneTrack", ""); //#*** per-db cart vars?? //#*** should show more info about each track... button to pop up track desc? if (gotGP) { printf("<SELECT ID='hgva_geneTrack' NAME='hgva_geneTrack'>\n"); struct slRef *ref; for (ref = trackRefList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; if (tdb->subtracks == NULL) printOption(tdb->track, selected, tdb->longLabel); } puts("</SELECT><BR>"); } return gotGP; } //#*** We really need a dbNsfp.[ch]: enum PolyPhen2Subset { noSubset, HDIV, HVAR }; char *formatDesc(char *url, char *name, char *details, boolean doHtml) /* Return a description with URL for name plus extra details. If doHtml, * wrap URL in <A ...>...</A>. */ { char desc[1024]; if (doHtml) safef(desc, sizeof(desc), "<A HREF='%s' TARGET=_BLANK>%s</A> %s", url, name, details); else safef(desc, sizeof(desc), "(%s) %s %s", url, name, details); return cloneString(desc); } char *dbNsfpDescFromTableName(char *tableName, enum PolyPhen2Subset subset, boolean doHtml) /* Return a description for table name... if these become tracks, use tdb of course. */ { if (sameString(tableName, "dbNsfpSift")) return formatDesc("http://sift.bii.a-star.edu.sg/", "SIFT", "(D = damaging, T = tolerated)", doHtml); else if (sameString(tableName, "dbNsfpPolyPhen2")) { if (subset == HDIV) return formatDesc("http://genetics.bwh.harvard.edu/pph2/", "PolyPhen-2", "with HumDiv training set " "(D = probably damaging, P = possibly damaging, B = benign)", doHtml); else if (subset == HVAR) return formatDesc("http://genetics.bwh.harvard.edu/pph2/", "PolyPhen-2", "with HumVar training set " "(D = probably damaging, P = possibly damaging, B = benign)", doHtml); else errAbort("dbNsfpDescFromTableName: invalid PolyPhen2 subset type (%d)", subset); } else if (sameString(tableName, "dbNsfpMutationTaster")) return formatDesc("http://www.mutationtaster.org/", "MutationTaster", "(A = disease causing automatic, D = disease causing, " "N = polymorphism, P = polymorphism automatic)", doHtml); else if (sameString(tableName, "dbNsfpMutationAssessor")) return formatDesc("http://mutationassessor.org/", "MutationAssessor", "(high or medium: predicted functional; " "low or neutral: predicted non-functional)", doHtml); else if (sameString(tableName, "dbNsfpLrt")) return formatDesc("http://www.genetics.wustl.edu/jflab/lrt_query.html", "Likelihood ratio test (LRT)", "(D = deleterious, N = Neutral, U = unknown)", doHtml); else if (sameString(tableName, "dbNsfpGerpNr")) return formatDesc("http://mendel.stanford.edu/SidowLab/downloads/gerp/index.html", "GERP++", "Neutral Rate (NR)", doHtml); else if (sameString(tableName, "dbNsfpGerpRs")) return formatDesc("http://mendel.stanford.edu/SidowLab/downloads/gerp/index.html", "GERP++", "Rejected Substitutions (RS)", doHtml); else if (sameString(tableName, "dbNsfpInterPro")) return formatDesc("http://www.ebi.ac.uk/interpro/", "InterPro", "protein domains", doHtml); return NULL; } struct slName *findDbNsfpTables() /* See if this database contains dbNSFP tables. */ { if (startsWith(hubTrackPrefix, database)) return NULL; struct sqlConnection *conn = hAllocConn(database); struct slName *dbNsfpTables = sqlListTablesLike(conn, "LIKE 'dbNsfp%'"); hFreeConn(&conn); return dbNsfpTables; } void printDbNsfpSource(char *table, enum PolyPhen2Subset subset) /* If we know what to do with table, make a checkbox with descriptive label. */ { char *description = dbNsfpDescFromTableName(table, subset, TRUE); if (description != NULL) { char cartVar[512]; if (subset == HDIV) safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s:HDIV", database, table); else if (subset == HVAR) safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s:HVAR", database, table); else safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s", database, table); boolean defaultChecked = (sameString("dbNsfpSift", table) || sameString("dbNsfpPolyPhen2", table)); cartMakeCheckBox(cart, cartVar, defaultChecked); printf("%s<BR>\n", description); } } void selectDbNsfp(struct slName *dbNsfpTables) /* Let user select scores/predicitions from various tools collected by dbNSFP. */ { if (dbNsfpTables == NULL) return; startCollapsibleSection("dbNsfp", "Database of Non-synonymous Functional Predictions (dbNSFP)", TRUE); printf("<A HREF='https://sites.google.com/site/jpopgen/dbNSFP' TARGET=_BLANK>dbNSFP</A> " "(<A HREF='http://onlinelibrary.wiley.com/doi/10.1002/humu.22376/abstract' " "TARGET=_BLANK>Liu <em>et al.</em> 2013</A>) " "release 2.0 " "provides pre-computed scores and predictions of functional significance " "from a variety of tools. Every possible coding change to transcripts in " //#*** hardcoded version info... sigh, we need trackDb... or metaDb?? "Gencode release 9 (Ensembl 64, Dec. 2011) gene predictions " "has been evaluated. " "<em>Note: This may not encompass all transcripts in your " "selected gene set.</em><BR>\n"); //#*** Another cheap hack: reverse alph order happens to be what we want, //#*** but priorities would be cleaner: slReverse(&dbNsfpTables); struct slName *table; for (table = dbNsfpTables; table != NULL; table = table->next) { if (sameString(table->name, "dbNsfpPolyPhen2")) { printDbNsfpSource(table->name, HDIV); printDbNsfpSource(table->name, HVAR); } else printDbNsfpSource(table->name, 0); } puts("<BR>"); endCollapsibleSection(); } char *findLatestSnpTable(char *suffix) /* Return the name of the 'snp1__<suffix>' table with the highest build number, if any. */ { if (startsWith(hubTrackPrefix, database)) return NULL; if (suffix == NULL) suffix = ""; char likeExpr[64]; safef(likeExpr, sizeof(likeExpr), "LIKE 'snp1__%s'", suffix); struct sqlConnection *conn = hAllocConn(database); struct slName *snpNNNTables = sqlListTablesLike(conn, likeExpr); hFreeConn(&conn); if ((snpNNNTables == NULL) || (slCount(snpNNNTables)==0)) return NULL; // Skip to last in list -- highest number (show tables can't use rlike or 'order by'): struct slName *table = snpNNNTables; while (table->next != NULL && isdigit(table->next->name[4]) && isdigit(table->next->name[5])) table = table->next; char *tableName = NULL; if (table != NULL) tableName = cloneString(table->name); slNameFreeList(&snpNNNTables); return tableName; } boolean findSnpBed4(char *suffix, char **retFileName, struct trackDb **retTdb) /* If we can find the latest snpNNNsuffix table, or better yet a bigBed file for it (faster), * set the appropriate ret* and return TRUE, otherwise return FALSE. */ { char *table = findLatestSnpTable(suffix); if (table == NULL) return FALSE; boolean foundIt = FALSE; // Do we happen to have a bigBed version? Better yet, bed4 only for current uses: char origFileName[HDB_MAX_PATH_STRING]; safef(origFileName, sizeof(origFileName), "/gbdb/%s/vai/%s.bed4.bb", database, table); char* fileName = hReplaceGbdb(origFileName); if (fileExists(fileName)) { if (retFileName != NULL) *retFileName = fileName; foundIt = TRUE; } else { // Not bed4; try just .bb: freez(&fileName); safef(origFileName, sizeof(origFileName), "/gbdb/%s/vai/%s.bb", database, table); fileName = hReplaceGbdb(origFileName); if (fileExists(fileName)) { if (retFileName != NULL) *retFileName = fileName; foundIt = TRUE; } } if (foundIt && retTdb == NULL) return TRUE; struct trackDb *tdb = tdbForTrack(database, table, &fullTrackList); if (tdb != NULL) { if (retTdb != NULL) *retTdb = tdb; return TRUE; } return foundIt; } void selectDbSnp(boolean gotSnp) /* Offer to include rsID (and other fields, or leave that for advanced output??) if available */ { if (!gotSnp) return; startCollapsibleSection("dbSnp", "Known variation", TRUE); cartMakeCheckBox(cart, "hgva_rsId", TRUE); printf("Include <A HREF='http://www.ncbi.nlm.nih.gov/projects/SNP/' TARGET=_BLANK>dbSNP</A> " "rs# ID if one exists<BR>\n"); puts("<BR>"); endCollapsibleSection(); } boolean isRegulatoryTrack(struct trackDb *tdb, void *filterData) /* For now, just look for a couple specific tracks by tableName. */ { //#*** NEED METADATA return (sameString("wgEncodeRegDnaseClusteredV2", tdb->table) || sameString("wgEncodeRegTfbsClusteredV3", tdb->table)); } struct slRef *findRegulatoryTracks() /* Look for the very limited set of Regulation tracks that hgVai offers. */ { struct slRef *regTrackRefList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isRegulatoryTrack, NULL, NULL, ®TrackRefList); return regTrackRefList; } void selectRegulatory(struct slRef *trackRefList) /* If trackRefList is non-NULL, make a collapsible section with a checkbox for each track, * labelled with longLabel, and optionally some filtering options. */ { if (trackRefList != NULL) { startCollapsibleSection("Regulation", "Regulatory regions", FALSE); // Use a table with checkboxes in one column and label/other stuff that depends on // checkbox in other column. puts("<TABLE>"); struct slRef *ref; for (ref = trackRefList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; char cartVar[512]; safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s", database, tdb->track); puts("<TR><TD valign=top>"); cartMakeCheckBox(cart, cartVar, FALSE); puts("</TD><TD>"); struct trackDb *topTdb = trackDbTopLevelSelfOrParent(tdb); printf("<A HREF=\"%s?%s&g=%s\">%s</A><BR>\n", hgTrackUiName(), cartSidUrlString(cart), topTdb->track, tdb->longLabel); printFilterOptions(tdb); puts("</TD></TR>"); } puts("</TABLE><BR>"); endCollapsibleSection(); } } boolean isConsElTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get "phastConsNwayElements" tracks. */ { return (startsWith("phastCons", tdb->table) && stringIn("Elements", tdb->table)); } boolean isConsScoreTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get tracks that start with "phastCons" (no Elements) * or "phyloP". */ { return ((startsWith("phastCons", tdb->table) && !stringIn("Elements", tdb->table)) || startsWith("phyloP", tdb->table)); } void findCons(struct slRef **retElTrackRefList, struct slRef **retScoreTrackRefList) /* See if this database has Conserved Elements and/or Conservation Scores */ { tdbFilterGroupTrack(fullTrackList, fullGroupList, isConsElTrack, NULL, NULL, retElTrackRefList); tdbFilterGroupTrack(fullTrackList, fullGroupList, isConsScoreTrack, NULL, NULL, retScoreTrackRefList); } void trackCheckBoxSection(char *sectionSuffix, char *title, struct slRef *trackRefList) /* If trackRefList is non-NULL, make a collapsible section with a checkbox for each track, * labelled with longLabel. */ { if (trackRefList != NULL) { startCollapsibleSection(sectionSuffix, title, FALSE); struct slRef *ref; for (ref = trackRefList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; char cartVar[512]; safef(cartVar, sizeof(cartVar), "hgva_track_%s_%s", database, tdb->track); cartMakeCheckBox(cart, cartVar, FALSE); struct trackDb *topTdb = trackDbTopLevelSelfOrParent(tdb); printf("<A HREF=\"%s?%s&g=%s\">%s</A><BR>\n", hgTrackUiName(), cartSidUrlString(cart), topTdb->track, tdb->longLabel); } puts("<BR>"); endCollapsibleSection(); } } void selectAnnotations() /* Beyond predictions of protein-coding effect, what other basic data can we integrate? */ { struct slName *dbNsfpTables = findDbNsfpTables(); boolean gotSnp = findSnpBed4("", NULL, NULL); struct slRef *regTrackRefList = findRegulatoryTracks(); struct slRef *elTrackRefList = NULL, *scoreTrackRefList = NULL; findCons(&elTrackRefList, &scoreTrackRefList); if (dbNsfpTables == NULL && !gotSnp && elTrackRefList == NULL && scoreTrackRefList == NULL && regTrackRefList == NULL) return; puts("<BR>"); printf("<div class='sectionLiteHeader'>Select More Annotations (optional)</div>\n"); // Make wrapper table for collapsible sections: puts("<TABLE border=0 cellspacing=5 cellpadding=0 style='padding-left: 10px;'>"); selectDbNsfp(dbNsfpTables); selectDbSnp(gotSnp); selectRegulatory(regTrackRefList); trackCheckBoxSection("ConsEl", "Conserved elements", elTrackRefList); trackCheckBoxSection("ConsScore", "Conservation scores", scoreTrackRefList); puts("</TABLE>"); } void selectFiltersFunc() /* Options to restrict variants based on gene region/soTerm from gpFx */ { startCollapsibleSection("filtersFunc", "Functional role", FALSE); printf("Include variants annotated as<BR>\n"); cartMakeCheckBox(cart, "hgva_include_intergenic", TRUE); printf("intergenic<BR>\n"); cartMakeCheckBox(cart, "hgva_include_upDownstream", TRUE); printf("upstream/downstream of gene<BR>\n"); cartMakeCheckBox(cart, "hgva_include_nmdTranscript", TRUE); printf("in transcript already subject to nonsense-mediated decay (NMD)<BR>\n"); cartMakeCheckBox(cart, "hgva_include_exonLoss", TRUE); printf("exon loss caused by deletion<BR>\n"); cartMakeCheckBox(cart, "hgva_include_utr", TRUE); printf("5' or 3' UTR<BR>\n"); cartMakeCheckBox(cart, "hgva_include_cdsSyn", TRUE); printf("CDS - synonymous coding change<BR>\n"); cartMakeCheckBox(cart, "hgva_include_cdsNonSyn", TRUE); printf("CDS - non-synonymous (missense, stop gain/loss, frameshift etc)<BR>\n"); cartMakeCheckBox(cart, "hgva_include_intron", TRUE); printf("intron<BR>\n"); cartMakeCheckBox(cart, "hgva_include_splice", TRUE); printf("splice site or splice region<BR>\n"); cartMakeCheckBox(cart, "hgva_include_nonCodingExon", TRUE); printf("exon of non-coding gene<BR>\n"); puts("<BR>"); endCollapsibleSection(); } void selectFiltersKnownVar() /* Options to restrict output based on overlap with known variants. */ { boolean gotCommon = findSnpBed4("Common", NULL, NULL); boolean gotMult = findSnpBed4("Mult", NULL, NULL); if (!gotCommon && !gotMult) return; startCollapsibleSection("filtersVar", "Known variation", FALSE); if (gotMult) { cartMakeCheckBox(cart, "hgva_include_snpMult", TRUE); printf("Include variants even if they are co-located with variants that have been mapped to " "multiple genomic locations by dbSNP<BR>\n"); } if (gotCommon) { cartMakeCheckBox(cart, "hgva_include_snpCommon", TRUE); printf("Include variants even if they are co-located with \"common\" variants " "(uniquely mapped variants with global minor allele frequency (MAF) " "of at least 1%% according to dbSNP)<BR>\n"); } puts("<BR>"); endCollapsibleSection(); } void selectFiltersCons() /* Options to restrict output based on overlap with conserved elements. */ { struct slRef *elTrackRefList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isConsElTrack, NULL, NULL, &elTrackRefList); if (elTrackRefList == NULL) return; startCollapsibleSection("filtersCons", "Conservation", FALSE); // Use a little table to indent radio buttons (if we do those) past checkbox: puts("<TABLE><TR><TD>"); cartMakeCheckBox(cart, "hgva_require_consEl", FALSE); printf("</TD><TD>Include only the variants that overlap:"); if (slCount(elTrackRefList) == 1) { struct trackDb *tdb = elTrackRefList->val; printf(" %s<BR>\n", tdb->longLabel); cgiMakeHiddenVar("hgva_require_consEl_track", tdb->track); puts("</TD></TR></TABLE>"); } else { puts("</TD></TR>"); char *selected = cartUsualString(cart, "hgva_require_consEl_track", ""); struct slRef *ref; for (ref = elTrackRefList; ref != NULL; ref = ref->next) { printf("<TR><TD></TD><TD>"); struct trackDb *tdb = ref->val; cgiMakeOnClickRadioButton("hgva_require_consEl_track", tdb->track, sameString(tdb->track, selected), "onclick=\"setCheckboxList('hgva_require_consEl', true);\""); printf("%s</TD></TR>\n", tdb->longLabel); } puts("</TABLE>"); } endCollapsibleSection(); } void selectFilters() /* Options to restrict output to variants with specific properties */ { puts("<BR>"); printf("<div class='sectionLiteHeader'>Define Filters</div>\n"); puts("<TABLE border=0 cellspacing=5 cellpadding=0 style='padding-left: 10px;'>"); selectFiltersFunc(); selectFiltersKnownVar(); selectFiltersCons(); puts("</TABLE>"); } void selectOutput() /* Just VEP text for now... should also have VEP HTML with limited # lines too. * Next: custom track with same subset of info as we would stuff in VEP */ { puts("<BR>"); printf("<div class='sectionLiteHeader'>Configure Output</div>\n"); printf("<B>output format: </B>"); char *selected = cartUsualString(cart, "hgva_outFormat", "vepTab"); printf("<SELECT ID='hgva_outFormat' NAME='hgva_outFormat'>\n"); printOption("vepTab", selected, "Variant Effect Predictor (tab-separated text)"); printOption("vepHtml", selected, "Variant Effect Predictor (HTML)"); printf("</SELECT><BR>\n"); char *compressType = cartUsualString(cart, "hgva_compressType", textOutCompressNone); char *fileName = cartUsualString(cart, "hgva_outFile", ""); printf("<B>output file:</B> "); cgiMakeTextVar("hgva_outFile", fileName, 29); printf(" (leave blank to keep output in browser)<BR>\n"); printf("<B>file type returned: </B>"); cgiMakeRadioButton("hgva_compressType", textOutCompressNone, sameWord(textOutCompressNone, compressType)); printf(" plain text  "); cgiMakeRadioButton("hgva_compressType", textOutCompressGzip, sameWord(textOutCompressGzip, compressType)); printf(" gzip compressed (ignored if output file is blank)"); puts("<BR>"); } void submitAndDisclaimer() { puts("<div id=disclaimerDialog title='NOTE'>"); puts("This tool is for research use only. While this tool is open to the " "public, users seeking information about a personal medical or genetic " "condition are urged to consult with a qualified physician for " "diagnosis and for answers to personal questions."); puts("</div><BR>"); printf("<div><img id='loadingImg' src='../images/loading.gif' />\n"); printf("<span id='loadingMsg'></span></div>\n"); cgiMakeOnClickButton("hgva.submitQueryIfDisclaimerAgreed();", "Get results"); puts("<BR><BR>"); } /* * When user clicks submit, we need to roll a JSON querySpec from form selections, * and show data from a submission to hgAnnoGrator. redirect from this CGI? * or have javascript submit directly? * * primary: variants, from custom track * * if there are genes, those w/annoGratorGpVar * * if there are {dbSNP, dbNsfp, regulatory, cons} selections, grator for each of those * * vep output config * * If we get bold & offer 1000Genomes VCF, will def. need handling of split chroms. * Are we really going to offer genome-wide in hgVai? * Up-front limit on #rows of input ? * * Eventually, we might want a FormatVep that produces structs that are passed * forward to multiple output writers... I would want to send it lots of gratorData * like a formatter, but it would produce rows like an annoGrator. * Maybe annoGrators should accept a bunch of input rows like formatters? * or would this grator wrap all the input grators inside? */ void doMainPage() /* Print out initial HTML of control page. */ { jsInit(); webIncludeResourceFile("jquery-ui.css"); webIncludeResourceFile("ui.dropdownchecklist.css"); boolean alreadyAgreed = cartUsualBoolean(cart, "hgva_agreedToDisclaimer", FALSE); printf("<script>\n" "$(document).ready(function() { hgva.disclaimer.init(%s, hgva.userClickedAgree); });\n" "</script>\n", alreadyAgreed ? "true" : "false"); addSomeCss(); printAssemblySection(); /* Check for variant custom tracks. If there are none, tell user they need to * upload at least one. */ struct slRef *varTrackList = NULL, *varGroupList = NULL; tdbFilterGroupTrack(fullTrackList, fullGroupList, isVariantCustomTrack, NULL, &varGroupList, &varTrackList); puts("<BR>"); // Make wrapper table for collapsible sections: selectVariants(varGroupList, varTrackList); boolean gotGP = selectGenes(); if (gotGP) { selectAnnotations(); selectFilters(); selectOutput(); submitAndDisclaimer(); } printf("</FORM>"); jsReloadOnBackButton(cart); webNewSection("Using the Variant Annotation Integrator"); webIncludeHelpFile("hgVaiHelpText", FALSE); jsIncludeFile("jquery-ui.js", NULL); jsIncludeFile("hgVarAnnogrator.js", NULL); jsIncludeFile("ui.dropdownchecklist.js", NULL); jsIncludeFile("ddcl.js", NULL); } void doUi() /* Set up globals and make web page */ { cartWebStart(cart, database, "Variant Annotation Integrator"); doMainPage(); cartWebEnd(); /* Save variables. */ cartCheckout(&cart); } void checkVariantTrack(struct trackDb *tdb) /* variantTrack should be either pgSnp or VCF. */ { if (! sameString(tdb->type, "pgSnp") && ! startsWith("vcf", tdb->type)) { errAbort("Expected variant track '%s' to be type pgSnp, vcf or vcfTabix, but it's '%s'", tdb->track, tdb->type); } } char *fileNameFromTable(char *table) /* Get fileName from a bigData table (for when we don't have a trackDb, just table). */ { struct sqlConnection *conn = hAllocConn(database); char query[512]; sqlSafef(query, sizeof(query), "select fileName from %s", table); char *fileName = sqlQuickString(conn, query); hFreeConn(&conn); char *fileNameRewrite = hReplaceGbdb(fileName); freez(&fileName); return fileNameRewrite; } void textOpen() /* Start serving up plain text, possibly via a pipeline to gzip. */ { char *fileName = cartUsualString(cart, "hgva_outFile", ""); char *compressType = cartUsualString(cart, "hgva_compressType", textOutCompressGzip); compressPipeline = textOutInit(fileName, compressType); } void setGpVarFuncFilter(struct annoGrator *gpVarGrator) /* Use cart variables to configure gpVarGrator's filtering by functional category. */ { struct annoGratorGpVarFuncFilter aggvFuncFilter; ZeroVar(&aggvFuncFilter); aggvFuncFilter.intergenic = cartUsualBoolean(cart, "hgva_include_intergenic", FALSE); aggvFuncFilter.upDownstream = cartUsualBoolean(cart, "hgva_include_upDownstream", TRUE); aggvFuncFilter.nmdTranscript = cartUsualBoolean(cart, "hgva_include_nmdTranscript", TRUE); aggvFuncFilter.exonLoss = cartUsualBoolean(cart, "hgva_include_exonLoss", TRUE); aggvFuncFilter.utr = cartUsualBoolean(cart, "hgva_include_utr", TRUE); aggvFuncFilter.cdsSyn = cartUsualBoolean(cart, "hgva_include_cdsSyn", TRUE); aggvFuncFilter.cdsNonSyn = cartUsualBoolean(cart, "hgva_include_cdsNonSyn", TRUE); aggvFuncFilter.intron = cartUsualBoolean(cart, "hgva_include_intron", TRUE); aggvFuncFilter.splice = cartUsualBoolean(cart, "hgva_include_splice", TRUE); aggvFuncFilter.nonCodingExon = cartUsualBoolean(cart, "hgva_include_nonCodingExon", TRUE); annoGratorGpVarSetFuncFilter(gpVarGrator, &aggvFuncFilter); } #define NO_MAXROWS 0 struct annoGrator *gratorForSnpBed4(struct hash *gratorsByName, char *suffix, struct annoAssembly *assembly, char *chrom, enum annoGratorOverlap overlapRule, char **retDescription) /* Look up snpNNNsuffix; if we find it, return a grator (possibly for a bigBed 4 file), * otherwise return NULL. */ { char *fileName = NULL; struct trackDb *tdb = NULL; if (! findSnpBed4(suffix, &fileName, &tdb)) return NULL; struct annoGrator *grator = NULL; // First look in gratorsByName to see if this grator has already been constructed: if (tdb != NULL) { grator = hashFindVal(gratorsByName, tdb->table); if (retDescription) *retDescription = cloneString(tdb->longLabel); } if (grator != NULL) { // Set its overlap behavior and we're done. grator->setOverlapRule(grator, overlapRule); return grator; } // If not in gratorsByName, then attempt to construct it here: if (fileName != NULL) grator = gratorFromBigDataFileOrUrl(fileName, assembly, NO_MAXROWS, overlapRule); else grator = gratorFromTrackDb(assembly, tdb->table, tdb, chrom, NO_MAXROWS, NULL, overlapRule); if (grator != NULL) hashAdd(gratorsByName, tdb->table, grator); return grator; } char *tableNameFromSourceName(char *sourceName) /* Strip sourceName (file path or db.table) to just the basename or table name. */ { char *tableName = cloneString(sourceName); char *p = strrchr(tableName, '/'); if (p != NULL) { // file path; strip to basename char dir[PATH_LEN], name[FILENAME_LEN], extension[FILEEXT_LEN]; splitPath(tableName, dir, name, extension); safecpy(tableName, strlen(tableName)+1, name); } else { // database.table -- skip database. p = strchr(tableName, '.'); if (p != NULL) tableName = p+1; } return tableName; } char *tagFromTableName(char *tableName, char *suffix) /* Generate a tag for VEP's extras column or VCF's info column. */ { char *p = strstr(tableName, "dbNsfp"); if (p != NULL) tableName = p + strlen("dbNsfp"); int suffixLen = (suffix == NULL) ? 0 : strlen(suffix); int tagSize = strlen(tableName) + suffixLen + 1; char *tag = cloneStringZ(tableName, tagSize); if (isNotEmpty(suffix)) safecat(tag, tagSize, suffix); touppers(tag); // Some custom shortenings, to avoid very long tag names: (void)strSwapStrs(tag, tagSize, "POLYPHEN", "PP"); (void)strSwapStrs(tag, tagSize, "MUTATION", "MUT"); (void)strSwapStrs(tag, tagSize, "PHYLOP", "PHP"); (void)strSwapStrs(tag, tagSize, "PHASTCONS", "PHC"); (void)strSwapStrs(tag, tagSize, "ELEMENTS", "EL"); (void)strSwapStrs(tag, tagSize, "PRIMATES", "PRIM"); (void)strSwapStrs(tag, tagSize, "PLACENTAL", "PLAC"); if (regexMatch(tag, "^PH.*[0-9]WAY")) (void)strSwapStrs(tag, tagSize, "WAY", "W"); (void)strSwapStrs(tag, tagSize, "WGENCODEREGDNASECLUSTERED", "DNASE"); (void)strSwapStrs(tag, tagSize, "WGENCODEREGTFBSCLUSTERED", "TFBS"); return tag; } enum PolyPhen2Subset stripSubsetFromTrackName(char *trackName) /* trackName may have a _suffix for a subset of PolyPhen2; convert that to enum * and zero out the suffix so we have the real trackName. */ { enum PolyPhen2Subset subset = noSubset; char *p = strchr(trackName, ':'); if (p != NULL) { if (sameString(p+1, "HDIV")) subset = HDIV; else if (sameString(p+1, "HVAR")) subset = HVAR; else errAbort("unrecognized suffix in track_suffix '%s'", trackName); *p = '\0'; } return subset; } void updateGratorListAndVepExtra(struct annoGrator *grator, struct annoGrator **pGratorList, struct annoFormatter *vepOut, enum PolyPhen2Subset subset, char *column, char *description, boolean isReg) /* If grator is non-NULL, add it to gratorList and vepOut's list of items for EXTRAs column. */ { if (grator == NULL) return; slAddHead(pGratorList, grator); if (vepOut != NULL) { char *tableName = tableNameFromSourceName(grator->streamer.name); char *suffix = NULL; if (subset == HDIV) suffix = "HDIV"; else if (subset == HVAR) suffix = "HVAR"; char *tag = tagFromTableName(tableName, suffix); if (isEmpty(description)) description = grator->streamer.name; if (isReg) annoFormatVepAddRegulatory(vepOut, (struct annoStreamer *)grator, tag, description, column); else annoFormatVepAddExtraItem(vepOut, (struct annoStreamer *)grator, tag, description, column); } } INLINE void updateGratorList(struct annoGrator *grator, struct annoGrator **pGratorList) /* If grator is non-NULL, add it to gratorList. */ { updateGratorListAndVepExtra(grator, pGratorList, NULL, 0, NULL, NULL, FALSE); } void addDbNsfpSeqChange(char *trackName, struct annoAssembly *assembly, struct hash *gratorsByName, struct annoGrator **pGratorList) // If the user has selected dbNsfp* data, we also need the underlying dbNsfpSeqChange // data, so annoFormatVep can tell whether the variant and gpFx are consistent with the // variant and transcript that dbNsfp used to calculate scores. { //#*** Yet another place where we need metadata: char *seqChangeTable = "dbNsfpSeqChange"; if (hashFindVal(gratorsByName, seqChangeTable) == NULL) { char *fileName = fileNameFromTable(seqChangeTable); if (fileName == NULL) errAbort("'%s' requested, but I can't find fileName for %s", trackName, seqChangeTable); struct annoGrator *grator = gratorFromBigDataFileOrUrl(fileName, assembly, NO_MAXROWS, agoNoConstraint); updateGratorList(grator, pGratorList); hashAdd(gratorsByName, seqChangeTable, grator); } } static struct dyString *dyInfo = NULL; struct hash *getTrackFilterVars(char *track) /* Return a hash of filter variable names (cart variable suffixes) to slName lists of values. */ { char filterPrefix[512]; safef(filterPrefix, sizeof(filterPrefix), "hgva_filter_%s_", track); struct slPair *filterVars = cartVarsWithPrefix(cart, filterPrefix), *var; int prefixLen = strlen(filterPrefix); struct hash *varHash = hashNew(0); for (var = filterVars; var != NULL; var = var->next) { char *varName = var->name+prefixLen; char *val = var->val; struct hashEl *hel = hashLookup(varHash, varName); if (hel != NULL) slNameAddHead((struct slName **)(&hel->val), val); else hashAdd(varHash, varName, slNameNew(val)); } return varHash; } INLINE boolean isNotAll(struct slName *valList) /* Return TRUE unless valList has one element with name "All" (for multiselects). */ { if (slCount(valList) == 1 && sameString(valList->name, "All")) return FALSE; return TRUE; } void factorSourceGratorAddFilter(struct annoGrator *grator, char *name, struct slName *valList) /* Add filter to factorSource grator. */ //#*** Do these smarts belong here in hgVai? Probably not -- should be an hg/lib module with //#*** UI/metadata smarts. { struct annoStreamer *gStreamer = (struct annoStreamer *)grator; struct annoFilter *filter = NULL; if (sameString(name, "name") || sameString(name, "cellType") || sameString(name, "treatment")) { if (valList && isNotAll(valList)) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afMatch, valList); } else if (sameString(name, "score")) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afGTE, valList); else errAbort("Unrecognized filter name '%s' for %s, type=factorSource", name, gStreamer->name); if (filter) gStreamer->addFilters(gStreamer, filter); } void bed5AddFilter(struct annoGrator *grator, char *name, struct slName *valList) /* Add filter to bed 5 grator. */ { struct annoStreamer *gStreamer = (struct annoStreamer *)grator; struct annoFilter *filter = NULL; if (sameString(name, "name")) { if (valList && isNotAll(valList)) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afMatch, valList); } else if (sameString(name, "score")) filter = annoFilterFromAsColumn(gStreamer->asObj, name, afGTE, valList); else errAbort("Unrecognized filter name '%s' for %s, type=bed 5", name, gStreamer->name); if (filter) gStreamer->addFilters(gStreamer, filter); } void addFiltersToGrator(struct annoGrator *grator, struct trackDb *tdb) /* Look for filter variables in the cart and add filters to grator accordingly. */ { struct hash *varHash = getTrackFilterVars(tdb->track); struct hashEl *hel, *helList = hashElListHash(varHash); for (hel = helList; hel != NULL; hel = hel->next) { char *filterName = hel->name; struct slName *valList = hel->val; //#*** Need a much better way to dispatch... if (sameString("factorSource", tdb->type)) factorSourceGratorAddFilter(grator, filterName, valList); else if (startsWith("bed 5", tdb->type)) bed5AddFilter(grator, filterName, valList); else dyStringPrintf(dyInfo, "Ignoring %s filter %s\n", tdb->track, filterName); } hashFree(&varHash); } void addOutputTracks(struct annoGrator **pGratorList, struct hash *gratorsByName, struct annoFormatter *vepOut, struct annoAssembly *assembly, char *chrom, boolean doHtml) // Construct grators for tracks selected to appear in EXTRAS column { char trackPrefix[128]; safef(trackPrefix, sizeof(trackPrefix), "hgva_track_%s_", database); int trackPrefixLen = strlen(trackPrefix); struct slPair *trackVar, *trackVars = cartVarsWithPrefix(cart, trackPrefix); for (trackVar = trackVars; trackVar != NULL; trackVar = trackVar->next) { char *val = trackVar->val; if (! (sameWord(val, "on") || atoi(val) > 0)) continue; char *trackName = trackVar->name + trackPrefixLen; if (sameString(trackName, "dbNsfpPolyPhen2")) // PolyPhen2 must have a suffix now -- skip obsolete cartVar from existing carts continue; struct annoGrator *grator = hashFindVal(gratorsByName, trackName); if (grator != NULL) // We already have this as a grator: continue; enum PolyPhen2Subset subset = noSubset; char *description = NULL; char *column = NULL; boolean isReg = FALSE; if (startsWith("dbNsfp", trackName)) { // trackName for PolyPhen2 has a suffix for subset -- strip it if we find it: subset = stripSubsetFromTrackName(trackName); description = dbNsfpDescFromTableName(trackName, subset, doHtml); addDbNsfpSeqChange(trackName, assembly, gratorsByName, pGratorList); char *fileName = fileNameFromTable(trackName); if (fileName != NULL) grator = gratorFromBigDataFileOrUrl(fileName, assembly, NO_MAXROWS, agoNoConstraint); } else { struct trackDb *tdb = tdbForTrack(database, trackName, &fullTrackList); if (tdb != NULL) { grator = gratorFromTrackDb(assembly, tdb->table, tdb, chrom, NO_MAXROWS, NULL, agoNoConstraint); if (grator != NULL) { //#*** Need something more sophisticated but this works for our //#*** limited selection of extra tracks: if (asColumnFind(grator->streamer.asObj, "name") != NULL) column = "name"; addFiltersToGrator(grator, tdb); } description = tdb->longLabel; isReg = isRegulatoryTrack(tdb, NULL); } } updateGratorListAndVepExtra(grator, pGratorList, vepOut, subset, column, description, isReg); if (grator != NULL) hashAdd(gratorsByName, trackName, grator); } } void addFilterTracks(struct annoGrator **pGratorList, struct hash *gratorsByName, struct annoAssembly *assembly, char *chrom) // Add grators for filters (not added to vepOut): { if (!cartUsualBoolean(cart, "hgva_include_snpCommon", TRUE)) { struct annoGrator *grator = gratorForSnpBed4(gratorsByName, "Common", assembly, chrom, agoMustNotOverlap, NULL); updateGratorList(grator, pGratorList); } if (!cartUsualBoolean(cart, "hgva_include_snpMult", TRUE)) { struct annoGrator *grator = gratorForSnpBed4(gratorsByName, "Mult", assembly, chrom, agoMustNotOverlap, NULL); updateGratorList(grator, pGratorList); } if (cartUsualBoolean(cart, "hgva_require_consEl", FALSE)) { char *consElTrack = cartString(cart, "hgva_require_consEl_track"); struct annoGrator *grator = hashFindVal(gratorsByName, consElTrack); if (grator == NULL) { struct trackDb *tdb = tdbForTrack(database, consElTrack, &fullTrackList); if (tdb != NULL) grator = gratorFromTrackDb(assembly, tdb->table, tdb, chrom, NO_MAXROWS, NULL, agoMustOverlap); updateGratorList(grator, pGratorList); } else grator->setOverlapRule(grator, agoMustOverlap); } } static void getCartPosOrDie(char **retChrom, uint *retStart, uint *retEnd) /* Get chrom:start-end from cart, errAbort if any problems. */ { char *position = cartString(cart, hgvaRange); if (! parsePosition(position, retChrom, retStart, retEnd)) errAbort("Expected position to be chrom:start-end but got '%s'", position); } static char *sampleVariantsPath(struct annoAssembly *assembly, char *geneTrack) /* Construct a path for a trash file of contrived example variants for this assembly * and gene track. */ { char *chrom = NULL; uint start = 0, end = 0; getCartPosOrDie(&chrom, &start, &end); char *subDir = "hgv"; mkdirTrashDirectory(subDir); struct dyString *dy = dyStringCreate("%s/%s/%s_%s_%s_%u-%u.vcf", trashDir(), subDir, assembly->name, geneTrack, chrom, start, end); return dyStringCannibalize(&dy); } static void writeMinimalVcfHeader(FILE *f, char *db) /* Write header for VCF with no meaningful qual, filter, info or genotype data. */ { fputs("##fileformat=VCFv4.1\n", f); fprintf(f, "##reference=%s\n", db); fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", f); } static void mutateBase(char *pBase) /* Change *pBase into a different base. */ { static char *bases = "ACGT"; char c; // In real life the probabilities aren't even, but this is easier. while ((c = bases[(uint)(floor(drand48() * 4.0))]) == *pBase) ; *pBase = c; } static void writeMinimalVcfRow(FILE *f, struct vcfRecord *rec) /* Write a minimalist VCF row (coords, name, alleles and placeholders). */ { // VCF columns: #CHROM POS ID REF ALT QUAL FILTER INFO fprintf(f, "%s\t%u\t%s\t%s\t", rec->chrom, rec->chromStart+1, rec->name, rec->alleles[0]); // Comma-separated alt alleles: int i; for (i = 1; i < rec->alleleCount; i++) fprintf(f, "%s%s", (i > 1 ? "," : ""), rec->alleles[i]); // Placeholder qual, filter, info. fprintf(f, "\t.\t.\t.\n"); } static void writeMinimalVcfRowFromBed(FILE *f, struct bed4 *bed, struct dnaSeq *seq, uint seqOffset) /* Write a minimalist VCF row with a mutation of the reference sequence at the given * position. */ { uint indelBase = 0, start = bed->chromStart, end = bed->chromEnd; if (end != start+1) { // VCF treats indels in a special way: pos is the base to the left of the indel, // and both ref and alt alleles include the base to the left of the indel. indelBase = 1; } // Get reference allele sequence: uint refAlLen = end - start + indelBase; char ref[refAlLen+1]; uint seqStart = start - indelBase - seqOffset; safencpy(ref, sizeof(ref), seq->dna+seqStart, refAlLen); touppers(ref); // Alternate allele seq needs extra room in case of single-base insertion: char alt[refAlLen+2]; alt[0] = ref[0]; alt[1] = '\0'; if (start == end) // insertion: alt gets extra base safecpy(alt+1, sizeof(alt)-1, "A"); else if (end == start+1) // single nucleotide variant mutateBase(alt); // else deletion: leave alt truncated after the shared base to left of indel // VCF columns: #CHROM POS ID REF ALT QUAL FILTER INFO fprintf(f, "%s\t%u\t%s\t%s\t%s\t.\t.\t.\n", bed->chrom, start+1-indelBase, bed->name, ref, alt); } static void writeArbitraryVariants(FILE *f, struct annoAssembly *assembly) /* Concoct some variants at an arbitrary position on some assembly sequence. * We shouldn't ever get here unless the selected geneTrack is empty. */ { char *chrom = "NULL"; uint start = 0, end = 0; getCartPosOrDie(&chrom, &start, &end); struct bed4 *bed = bed4New(chrom, start, start, "ex_ins"); struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, start-1, start+100); writeMinimalVcfRowFromBed(f, bed, seq, start-1); dnaSeqFree(&seq); } static void addGpFromRow(struct genePred **pGpList, struct annoRow *row, boolean *pNeedCoding, boolean *pNeedNonCoding) /* If row is coding and we need a coding gp, add it to pGpList and update pNeedCoding; * likewise for noncoding. */ { struct genePred *gp = genePredLoad(row->data); if (gp->cdsStart != gp->cdsEnd && *pNeedCoding) { slAddHead(pGpList, gp); *pNeedCoding = FALSE; } else if (gp->cdsStart == gp->cdsEnd && *pNeedNonCoding) { slAddHead(pGpList, gp); *pNeedNonCoding = FALSE; } } static void addGpFromPos(struct annoStreamer *geneStream, char *chrom, uint start, uint end, struct genePred **pGpList, boolean *pNeedCoding, boolean *pNeedNonCoding, struct lm *lm) /* Get up to 10 rows from position; if we find one coding and one non-coding gene, * add them to pGpList and update pNeed* accordingly. */ { geneStream->setRegion(geneStream, chrom, start, end); int rowCount = 0; struct annoRow *row = NULL; while (rowCount < 10 && (*pNeedCoding || *pNeedNonCoding) && (row = geneStream->nextRow(geneStream, NULL, 0, lm)) != NULL) addGpFromRow(pGpList, row, pNeedCoding, pNeedNonCoding); } static struct genePred *genesFromPosition(struct annoStreamer *geneStream, boolean *retGotCoding, boolean *retGotNonCoding) /* Try to get a coding and non-coding gene from geneStream at cart position. * If there are none, try whole chrom; if none there, first in assembly. */ { struct genePred *gpList = NULL; struct lm *lm = lmInit(0); char *chrom = NULL; uint start = 0, end = 0; getCartPosOrDie(&chrom, &start, &end); boolean needCoding = TRUE, needNonCoding = TRUE; // First, look for both coding and noncoding genes at cart position: addGpFromPos(geneStream, chrom, start, end, &gpList, &needCoding, &needNonCoding, lm); // If that didn't do it, try querying whole chrom if (needCoding || needNonCoding) addGpFromPos(geneStream, chrom, 0, 0, &gpList, &needCoding, &needNonCoding, lm); // If we still haven't found either coding or non-coding on the cart's current chrom, // try whole genome: if (needCoding && needNonCoding) addGpFromPos(geneStream, NULL, 0, 0, &gpList, &needCoding, &needNonCoding, lm); slSort(&gpList, genePredCmp); lmCleanup(&lm); if (retGotCoding) *retGotCoding = !needCoding; if (retGotNonCoding) *retGotNonCoding = !needNonCoding; return gpList; } static void addSnv(struct bed4 **pBedList, char *chrom, uint start, char *name) /* Add a single-nucleotide bed4 item. */ { slAddHead(pBedList, bed4New(chrom, start, start+1, name)); } // Stuff within this range (see gpFx.h) is considered upstream/downstream of a transcript: #define UPSTREAMFUDGE GPRANGE static struct bed4 *sampleVarBedFromGenePred(struct genePred *gp, struct annoAssembly *assembly, uint txSeqStart, uint txSeqEnd, boolean exonOnly) /* Construct imaginary variants that hit various parts of the transcript * described in gp, using reference base values from assembly. */ { uint basesLeft = gp->txStart - txSeqStart, basesRight = txSeqEnd - gp->txEnd; struct bed4 *bedList = NULL; if (!exonOnly && basesLeft > UPSTREAMFUDGE) // SNV, intergenic to the left addSnv(&bedList, gp->chrom, txSeqStart, "ex_igLeft"); if (!exonOnly && basesLeft > 0) // SNV, up/downstream to the left addSnv(&bedList, gp->chrom, gp->txStart - 1, "ex_updownLeft"); if (!exonOnly && gp->exonCount > 1) { // SNV, intron uint start = (gp->exonEnds[0] + gp->exonStarts[1]) / 2; addSnv(&bedList, gp->chrom, start, "ex_intron"); // SNV, splice addSnv(&bedList, gp->chrom, gp->exonStarts[1] - 2, "ex_splice"); } boolean isCoding = (gp->cdsStart != gp->cdsEnd); if (isCoding) { if (gp->txStart < gp->cdsStart) // SNV, UTR to the left addSnv(&bedList, gp->chrom, gp->txStart + 1, "ex_utrLeft"); int cdsExon = 0; while (gp->cdsStart > gp->exonEnds[cdsExon] && cdsExon < gp->exonCount) cdsExon++; uint start = gp->cdsStart + 2; // single-base insertion, leftmost codon slAddHead(&bedList, bed4New(gp->chrom, start, start, "ex_codonLeftIns")); // 3-base deletion leftmost codon slAddHead(&bedList, bed4New(gp->chrom, start, start+3, "ex_codonLeftDel")); // SNV, leftmost codon addSnv(&bedList, gp->chrom, start+1, "ex_codonLeft"); if (gp->txEnd > gp->cdsEnd) // SNV, UTR to the right addSnv(&bedList, gp->chrom, gp->txEnd - 1, "ex_utrRight"); } else { // noncoding exon variant uint start = (gp->exonStarts[0] + gp->exonEnds[0]) / 2; addSnv(&bedList, gp->chrom, start, "ex_nce"); } if (!exonOnly && basesRight > 0) // SNV, up/downstream to the left addSnv(&bedList, gp->chrom, gp->txEnd + 1, "ex_updownRight"); return bedList; } // margin for intergenic variants around transcript: #define IGFUDGE 5 static struct annoStreamer *makeSampleVariantsStreamer(struct annoAssembly *assembly, struct trackDb *geneTdb, int maxOutRows) /* Construct a VCF file of example variants for db (unless it already exists) * and return an annoStreamer for that file. If possible, make the variants hit a gene. */ { char *sampleFile = sampleVariantsPath(assembly, geneTdb->track); boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildSampleVariants", FALSE); if (! fileExists(sampleFile) || forceRebuild) { struct annoStreamer *geneStream = streamerFromTrack(assembly, geneTdb->table, geneTdb, NULL, NO_MAXROWS); boolean gotCoding = FALSE, gotNonCoding = FALSE; struct genePred *gpList = genesFromPosition(geneStream, &gotCoding, &gotNonCoding); FILE *f = mustOpen(sampleFile, "w"); writeMinimalVcfHeader(f, assembly->name); if (gpList == NULL) { warn("Unable to find any gene transcripts in '%s' (%s)", geneTdb->shortLabel, geneTdb->track); writeArbitraryVariants(f, assembly); } else { struct bed4 *bedList = NULL; uint chromSize = annoAssemblySeqSize(assembly, gpList->chrom); uint minSeqStart = chromSize, maxSeqEnd = 0; struct genePred *gp; for (gp = gpList; gp != NULL; gp = gp->next) { int txSeqStart = gp->txStart - (UPSTREAMFUDGE + IGFUDGE); uint txSeqEnd = gp->txEnd + UPSTREAMFUDGE + IGFUDGE; if (txSeqStart < 0) txSeqStart = 0; if (txSeqEnd > chromSize) txSeqEnd = chromSize; if (txSeqStart < minSeqStart) minSeqStart = txSeqStart; if (txSeqEnd > maxSeqEnd) maxSeqEnd = txSeqEnd; // If we got a coding gp, but this gp is non-coding, just do its exon variant: boolean exonOnly = gotCoding && (gp->cdsStart == gp->cdsEnd); slCat(&bedList, sampleVarBedFromGenePred(gp, assembly, txSeqStart, txSeqEnd, exonOnly)); } slSort(&bedList, bedCmp); struct dnaSeq *txSeq = twoBitReadSeqFragLower(assembly->tbf, gpList->chrom, minSeqStart, maxSeqEnd); struct bed4 *bed; for (bed = bedList; bed != NULL; bed = bed->next) { writeMinimalVcfRowFromBed(f, bed, txSeq, minSeqStart); } dnaSeqFree(&txSeq); bed4FreeList(&bedList); } geneStream->close(&geneStream); carefulClose(&f); } return annoStreamVcfNew(sampleFile, FALSE, assembly, maxOutRows); } static char *variantIdPath(struct annoAssembly *assembly, char *variantIds) /* Use the md5sum of the user's pasted/uploaded variants to make a hopefully * unique trash filename. */ { char *md5sum = md5HexForString(variantIds); char *subDir = "hgv"; mkdirTrashDirectory(subDir); struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), subDir, assembly->name, md5sum); return dyStringCannibalize(&dy); } static struct slName *hashListNames(struct hash *hash) /* Return a list of all element names in the hash (if any). */ { struct slName *list = NULL; struct hashCookie cookie = hashFirst(hash); struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) slAddHead(&list, slNameNew(hel->name)); return list; } static char *encloseInAngleBracketsDbSnp(char *stringIn) /* Return a string that has <dbSNP: stringIn>, with spaces replaced by '_'s. */ { int stringInLen = strlen(stringIn); int stringOutLen = stringInLen + strlen("<dbSNP:>") + 1; char *stringOut = needMem(stringOutLen); safef(stringOut, stringOutLen, "<dbSNP:%s>", stringIn); subChar(stringOut, ' ', '_'); return stringOut; } // dbSNP named alleles have many ways to describe a deletion from the reference, // for example "LARGEDELETION", "LARGE DELETION", "... DELETED", "... DEL": static const char *dbSnpDelRegex = "^\\(.*(DELET.*| DEL)\\)$"; static char **parseDbSnpAltAlleles(char *refAl, char *obsAls, boolean minusStrand, int *retAltAlCount, boolean *retNeedLeftBase) /* Given a non-symbolic reference allele and slash-sep observed alleles from dbSNP, * return an array of +-strand alleles that are not the same as the reference. * If any allele is "-" (deleted, zero-length), then set retNeedLeftBase to TRUE * because in this case VCF requires that the reference base to the left of the indel * must be added to all alleles, and the start coord also moves one base to the left. * Also, if any alt allele is symbolic, padding is required. * Note: this trashes obsAls. Resulting array can be freed but not its contents. */ { int obsCount = countChars(obsAls, '/') + 1; char *obsWords[obsCount]; chopByChar(obsAls, '/', obsWords, obsCount); boolean obsHasDeletion = FALSE; int i; for (i = 0; i < obsCount; i++) if (sameString(obsWords[i], "-")) { obsHasDeletion = TRUE; break; } char **altAls; AllocArray(altAls, obsCount); int altCount = 0; boolean needLeftBase = isEmpty(refAl) || sameString(refAl, "-"); for (i = 0; i < obsCount; i++) { char *altAl = obsWords[i]; int altAlLen = strlen(altAl); if (minusStrand && isAllNt(altAl, altAlLen)) reverseComplement(altAl, altAlLen); if (differentString(altAl, refAl)) { if (sameString(altAl, "-")) { altAls[altCount] = ""; needLeftBase = TRUE; } else { // It would be nice to expand the "(CA)11/12/14/15/16/17/18/19/20" syntax of // some dbSNP observed's. What are these?: "(D1S243)", "(D1S2870)" // Unfortunately for observed="lengthTooLong" we just can't get the correct allele // sequence. (76,130 of those in snp138) // Hmmm, I guess we could at least stick in the right number of N's if we can // parse "(245 BP INSERTION)". (2403 rows rlike "[0-9]+ BP ?INSERTION" in snp138) if (!isAllNt(altAl, altAlLen)) { // Symbolic allele: left base required, and enclose it in <dbSNP:>'s. // But if it's one of dbSNP's LARGEDELETION kind of alleles, that is redundant // with the reference allele, so if we know there is already a "-" allele, // skip it. if (obsHasDeletion && regexMatch(altAl, dbSnpDelRegex)) continue; needLeftBase = TRUE; altAl = encloseInAngleBracketsDbSnp(altAl); } altAls[altCount] = altAl; } altCount++; } } *retAltAlCount = altCount; *retNeedLeftBase = needLeftBase; return altAls; } char *firstNCommaSep(struct slName *nameList, int n) /* Return a comma-separated string with the first n names in nameList. */ { struct dyString *dy = dyStringNew(0); int i; struct slName *el; for (i=0, el=nameList; i < 5 && el != NULL; i++, el = el->next) { if (i > 0) dyStringAppend(dy, ", "); dyStringPrintf(dy, "'%s'", el->name); } return dyStringCannibalize(&dy); } //#*** Variant ID-matching should be metadata-driven too. termRegex -> data source. static const char *rsIdRegex = "^rs[0-9]+$"; static void rsIdsToVcfRecords(struct annoAssembly *assembly, struct slName *rsIds, struct vcfFile *vcff, struct vcfRecord **pRecList, struct slName **pCommentList) /* If possible, look up coordinates and alleles of dbSNP rs IDs. */ { if (rsIds == NULL) return; char *table = findLatestSnpTable(NULL); if (table == NULL) return; struct sqlConnection *conn = hAllocConn(assembly->name); // Build a 'name in (...)' query, and build a hash of IDs so we can test whether all were found struct dyString *dq = sqlDyStringCreate("select chrom, chromStart, chromEnd, name, strand, " "refUCSC, observed from %s where name in (", table); struct hash *idHash = hashNew(0); struct slName *id; for (id = rsIds; id != NULL; id = id->next) { tolowers(id->name); dyStringPrintf(dq, "%s'%s'", (id != rsIds ? "," : ""), id->name); hashStoreName(idHash, id->name); } dyStringAppend(dq, ");"); struct sqlResult *sr = sqlGetResult(conn, dq->string); // Construct a minimal VCF row to make a vcfRecord for each variant. char *vcfRow[9]; vcfRow[5] = vcfRow[6] = vcfRow[7] = "."; // placeholder for qual, filter, info // It would be cool to someday add snpNNN's exceptions column to the filter or info. struct dyString *dyAltAlStr = dyStringNew(0); int i; char **row; while ((row = sqlNextRow(sr)) != NULL) { char *chrom = row[0]; uint chromStart = atoll(row[1]); uint chromEnd = atoll(row[2]); char *name = row[3]; char *strand = row[4]; char refAl[max(strlen(row[5]), chromEnd-chromStart) + 2]; safecpy(refAl, sizeof(refAl), row[5]); if (sameString(refAl, "-")) refAl[0] = '\0'; else if (! isAllNt(refAl, strlen(refAl))) { // refUCSC is symbolic like "( 2548bp insertion )" -- just use the actual reference // sequence for now, to keep things simple downstream. //#*** Need something more efficient, like a sequence cache inside assembly! struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, chromStart, chromEnd); safecpy(refAl, sizeof(refAl), seq->dna); toUpperN(refAl, seq->size); dnaSeqFree(&seq); } char *obsAls = row[6]; int altAlCount = 0; boolean needLeftBase = FALSE; char **altAls = parseDbSnpAltAlleles(refAl, obsAls, sameString(strand, "-"), &altAlCount, &needLeftBase); needLeftBase |= (chromStart == chromEnd); // should be redundant, but just in case. hashRemove(idHash, name); uint vcfStart = chromStart + 1; dyStringClear(dyAltAlStr); // If this is a non-symbolic indel, we'll have to move start one base to the left // and add the reference base to the left of the actual alleles if (needLeftBase) { vcfStart--; //#*** Need something more efficient, like a sequence cache inside assembly! struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, chromStart-1, chromStart); toUpperN(seq->dna, 1); char leftBase = seq->dna[0]; dnaSeqFree(&seq); char refAlPlus[strlen(refAl)+2]; safef(refAlPlus, sizeof(refAlPlus), "%c%s", leftBase, refAl); safecpy(refAl, sizeof(refAl), refAlPlus); for (i = 0; i < altAlCount; i++) { if (i > 0) dyStringAppendC(dyAltAlStr, ','); if (isAllNt(altAls[i], strlen(altAls[i]))) dyStringPrintf(dyAltAlStr, "%c%s", leftBase, altAls[i]); else dyStringAppend(dyAltAlStr, altAls[i]); } } else { // Not an indel, just make comma-sep string of alt alleles. for (i = 0; i < altAlCount; i++) { if (i > 0) dyStringAppendC(dyAltAlStr, ','); dyStringAppend(dyAltAlStr, altAls[i]); } } char vcfStartStr[64]; safef(vcfStartStr, sizeof(vcfStartStr), "%d", vcfStart); vcfRow[0] = chrom; vcfRow[1] = vcfStartStr; vcfRow[2] = name; vcfRow[3] = refAl; vcfRow[4] = dyAltAlStr->string; struct vcfRecord *rec = vcfRecordFromRow(vcff, vcfRow); slAddHead(pRecList, rec); } dyStringFree(&dyAltAlStr); // Check for IDs not found struct slName *notFoundIds = hashListNames(idHash); if (notFoundIds != NULL) { char *namesNotFound = firstNCommaSep(notFoundIds, 5); struct dyString *dy = dyStringCreate("%d rs# IDs not found, e.g. %s", slCount(notFoundIds), namesNotFound); slAddTail(pCommentList, slNameNew(dy->string)); freeMem(namesNotFound); dyStringFree(&dy); } slNameFreeList(¬FoundIds); } static struct vcfRecord *parseVariantIds(struct annoAssembly *assembly, char *variantIdText, struct slName **pCommentList) /* Return a sorted list of minimal vcfRecords (coords, name, ref and alt alleles) * corresponding to each pasted variant. */ { struct vcfRecord *recList = NULL; char *p = cloneString(variantIdText), *id; struct slName *rsIds = NULL, *unknownIds = NULL; subChar(p, ',', ' '); while ((id = nextWord(&p)) != NULL) { if (regexMatchNoCase(id, rsIdRegex)) slNameAddHead(&rsIds, id); else slNameAddHead(&unknownIds, id); } if (unknownIds != NULL) { slReverse(&unknownIds); char *firstUnknownIds = firstNCommaSep(unknownIds, 5); struct dyString *dy = dyStringCreate("%d variant identifiers are unrecognized, e.g. %s", slCount(unknownIds), firstUnknownIds); slAddTail(pCommentList, slNameNew(dy->string)); freeMem(firstUnknownIds); dyStringFree(&dy); } struct vcfFile *vcff = vcfFileNew(); rsIdsToVcfRecords(assembly, rsIds, vcff, &recList, pCommentList); slSort(&recList, vcfRecordCmp); slNameFreeList(&rsIds); slNameFreeList(&unknownIds); return recList; } static void adjustRangeForVariants(struct vcfRecord *recList, char **pChrom, uint *pStart, uint *pEnd) /* Given a *sorted* list of VCF records, look at the first and last records and if * they fall outside of the range, expand the range to include them. */ { if (pChrom != NULL && *pChrom != NULL && recList != NULL) { // We have a non-empty sorted list of records, and search is not genome-wide; // look at first and last variants to see if they're not already included in search range. struct vcfRecord *first = recList, *last = recList; while (last->next) last = last->next; if (differentString(*pChrom, first->chrom) || differentString(*pChrom, last->chrom)) { // Variants span multiple chroms; force genome-wide search. *pChrom = NULL; *pStart = *pEnd = 0; } else { // Variant(s) are on the same chrom as search range; check starts and ends. if (*pStart > first->chromEnd) *pStart = first->chromEnd; if (*pEnd < last->chromStart) *pEnd = last->chromStart; } } } static struct annoStreamer *makeVariantIdStreamer(struct annoAssembly *assembly, int maxOutRows, char **pChrom, uint *pStart, uint *pEnd, struct slName **pCommentList) /* Translate user's pasted/uploaded variant IDs into minimal VCF if possible. * Return a VCF streamer for those, and if the current search position is too narrow * to include all of the variants, widen it as necessary. */ { // Hash variant text to get trash filename. Use if exists, otherwise build it. char *variantIds = cartString(cart, hgvaVariantIds); char *varFile = variantIdPath(assembly, variantIds); boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildVariantIds", FALSE); if (! fileExists(varFile) || forceRebuild) { struct vcfRecord *varList = parseVariantIds(assembly, variantIds, pCommentList); //#*** If no variants were recognized, we should probably show main page with a warning. adjustRangeForVariants(varList, pChrom, pStart, pEnd); FILE *f = mustOpen(varFile, "w"); writeMinimalVcfHeader(f, assembly->name); struct vcfRecord *var; for (var = varList; var != NULL; var = var->next) writeMinimalVcfRow(f, var); carefulClose(&f); } return annoStreamVcfNew(varFile, FALSE, assembly, maxOutRows); } static struct trackDb *getVariantTrackDb(char *variantTrack) /* Get a tdb for the user's selected variant track, or warn and return NULL * (main page will be displayed) */ { struct trackDb *varTdb = tdbForTrack(database, variantTrack, &fullTrackList); if (varTdb == NULL) { if (isHubTrack(variantTrack)) warn("Can't find hub track '%s'", variantTrack); else warn("Can't find tdb for variant track '%s'", variantTrack); } else checkVariantTrack(varTdb); return varTdb; } void doQuery() /* Translate simple form inputs into anno* components and execute query. */ { dyInfo = dyStringNew(0); char *chrom = NULL; uint start = 0, end = 0; if (sameString(regionType, hgvaRegionTypeRange)) getCartPosOrDie(&chrom, &start, &end); struct annoAssembly *assembly = getAnnoAssembly(database); char *geneTrack = cartString(cart, "hgva_geneTrack"); struct trackDb *geneTdb = tdbForTrack(database, geneTrack, &fullTrackList); if (geneTdb == NULL) { warn("Can't find tdb for gene track %s", geneTrack); doUi(); return; } int maxVarRows = cartUsualInt(cart, "hgva_variantLimit", 10); struct annoStreamer *primary = NULL; char *primaryLongLabel = NULL; char *variantTrack = cartString(cart, "hgva_variantTrack"); struct slName *commentList = NULL; if (sameString(variantTrack, hgvaSampleVariants)) { primary = makeSampleVariantsStreamer(assembly, geneTdb, maxVarRows); primaryLongLabel = hgvaSampleVariantsLabel; // Sample variants can't always be made within the currently selected position range, // so just for these, force search to be genome-wide. chrom = NULL; start = 0; end = 0; } else if (sameString(variantTrack, hgvaUseVariantIds)) { // Override search position if cart position doesn't include all variants: primary = makeVariantIdStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList); primaryLongLabel = hgvaVariantIdsLabel; } else { struct trackDb *varTdb = getVariantTrackDb(variantTrack); if (varTdb == NULL) { doUi(); return; } primary = streamerFromTrack(assembly, varTdb->table, varTdb, chrom, maxVarRows); primaryLongLabel = varTdb->longLabel; } enum annoGratorOverlap geneOverlapRule = agoMustOverlap; struct annoGrator *gpVarGrator = gratorFromTrackDb(assembly, geneTdb->table, geneTdb, chrom, NO_MAXROWS, primary->asObj, geneOverlapRule); setGpVarFuncFilter(gpVarGrator); // Some grators may be used as both filters and output values. To avoid making // multiple grators for the same source, hash them by trackName: struct hash *gratorsByName = hashNew(8); struct annoGrator *snpGrator = NULL; char *snpDesc = NULL; if (cartUsualBoolean(cart, "hgva_rsId", FALSE)) snpGrator = gratorForSnpBed4(gratorsByName, "", assembly, chrom, agoNoConstraint, &snpDesc); // Now construct gratorList in the order in which annoFormatVep wants to see them, // i.e. first the gpVar, then the snpNNN, then whatever else: struct annoGrator *gratorList = NULL; slAddHead(&gratorList, gpVarGrator); if (snpGrator != NULL) slAddHead(&gratorList, snpGrator); // Text or HTML output? char *outFormat = cartUsualString(cart, "hgva_outFormat", "vepTab"); boolean doHtml = sameString(outFormat, "vepHtml"); // Initialize VEP formatter: struct annoFormatter *vepOut = annoFormatVepNew("stdout", doHtml, primary, primaryLongLabel, (struct annoStreamer *)gpVarGrator, geneTdb->longLabel, (struct annoStreamer *)snpGrator, snpDesc, assembly); addOutputTracks(&gratorList, gratorsByName, vepOut, assembly, chrom, doHtml); addFilterTracks(&gratorList, gratorsByName, assembly, chrom); slReverse(&gratorList); if (doHtml) { webStart(cart, database, "Annotated Variants in VEP/HTML format"); } else { // Undo the htmlPushEarlyHandlers() because after this point they make ugly text: popWarnHandler(); popAbortHandler(); textOpen(); webStartText(); } if (isNotEmpty(dyInfo->string)) puts(dyInfo->string); struct annoGratorQuery *query = annoGratorQueryNew(assembly, primary, gratorList, vepOut); struct slName *comment; for (comment = commentList; comment != NULL; comment = comment->next) vepOut->comment(vepOut, comment->name); if (chrom != NULL) annoGratorQuerySetRegion(query, chrom, start, end); annoGratorQueryExecute(query); annoGratorQueryFree(&query); if (doHtml) webEnd(); else textOutClose(&compressPipeline); } int main(int argc, char *argv[]) /* Process command line. */ { long enteredMainTime = clock1000(); if (hIsPrivateHost()) pushCarefulMemHandler(LIMIT_2or6GB); htmlPushEarlyHandlers(); /* Make errors legible during initialization. */ cgiSpoof(&argc, argv); oldVars = hashNew(10); setUdcCacheDir(); boolean startQuery = (cgiUsualString("hgva_startQuery", NULL) != NULL); if (startQuery) cart = cartAndCookieNoContent(hUserCookie(), excludeVars, oldVars); else cart = cartAndCookie(hUserCookie(), excludeVars, oldVars); /* Set up global variables. */ getDbAndGenome(cart, &database, &genome, oldVars); regionType = cartUsualString(cart, hgvaRegionType, hgvaRegionTypeGenome); if (isEmpty(cartOptionalString(cart, hgvaRange))) cartSetString(cart, hgvaRange, hDefaultPos(database)); int timeout = cartUsualInt(cart, "udcTimeout", 300); if (udcCacheTimeout() < timeout) udcSetCacheTimeout(timeout); knetUdcInstall(); initGroupsTracksTables(cart, &fullTrackList, &fullGroupList); if (lookupPosition(cart, hgvaRange)) { if (startQuery) doQuery(); else doUi(); } else { // Revert to lastPosition if we have multiple matches or warnings, // especially in case user manually edits browser location as in #13009: char *position = cartUsualString(cart, "lastPosition", hDefaultPos(database)); cartSetString(cart, hgvaRange, position); if (webGotWarnings()) { // We land here when lookupPosition pops up a warning box. // Reset the problematic position and show the main page. doMainPage(); } // If lookupPosition returned FALSE and didn't report warnings, // then it wrote HTML showing multiple position matches & links. } cartCheckout(&cart); cgiExitTime("hgVai", enteredMainTime); return 0; }