41723d134f8b0c52c78705c2e5da97f8875e3cf6 angie Wed Feb 15 11:44:39 2017 -0800 Added HGVS terms as variant input option in hgVai. refs #11460 diff --git src/hg/hgVai/hgVai.c src/hg/hgVai/hgVai.c index 8297ffa..868a394 100644 --- src/hg/hgVai/hgVai.c +++ src/hg/hgVai/hgVai.c @@ -2,30 +2,31 @@ /* Copyright (C) 2014 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "jksql.h" #include "htmshell.h" #include "web.h" #include "cheapcgi.h" #include "cart.h" #include "cartTrackDb.h" #include "genbank.h" #include "hgConfig.h" +#include "hgHgvs.h" #include "hui.h" #include "grp.h" #include "hCommon.h" #include "hgFind.h" #include "hPrint.h" #include "jsHelper.h" #include "memalloc.h" #include "textOut.h" #include "trackHub.h" #include "hubConnect.h" #include "twoBit.h" #include "gpFx.h" #include "bigGenePred.h" #include "udc.h" #include "knetUdc.h" @@ -53,40 +54,49 @@ char *regionType = NULL; /* genome, ENCODE pilot regions, or specific position range. */ struct grp *fullGroupList = NULL; /* List of all groups. */ struct trackDb *fullTrackList = NULL; /* List of all tracks in database. */ static struct pipeline *compressPipeline = (struct pipeline *)NULL; // Null terminated list of CGI Variables we don't want to save permanently: char *excludeVars[] = {"Submit", "submit", "hgva_startQuery", NULL,}; #define hgvaRange "position" #define hgvaRegionType "hgva_regionType" #define hgvaRegionTypeEncode "encode" #define hgvaRegionTypeGenome "genome" #define hgvaRegionTypeRange "range" #define hgvaPositionContainer "positionContainer" +#define hgvsTrashSubDir "hgv" +// auto-generated variants as input #define hgvaSampleVariants "hgva_internally_generated_sample_variants" #define hgvaSampleVariantsLabel "Artificial Example Variants" +// dbSNP rsIDs as variant inputs: #define hgvaUseVariantIds "hgva_useVariantIds" #define hgvaVariantIdsLabel "Variant Identifiers" #define hgvaVariantIds "hgva_variantIds" #define hgvaUseVariantFileOrUrl "hgva_useVariantFileOrUrl" -#define hgvaVariantFileOrUrlLabel "from file" +#define hgvaVariantPasteContainer "variantPasteContainer" +// A local file or URL as variant input: +#define hgvaVariantFileOrUrlLabel "from file or URL" #define hgvaVariantFileOrUrl "hgva_variantFileOrUrl" #define hgvaVariantFileOrUrlType "hgva_variantFileOrUrlType" -#define hgvaVariantPasteContainer "variantPasteContainer" +// HGVS terms as variant inputs: +#define hgvaUseHgvs "hgva_useHgvs" +#define hgvaHgvsLabel "HGVS terms" +#define hgvaHgvs "hgva_hgvs" +#define hgvaHgvsPasteContainer "hgvsPasteContainer" void addSomeCss() /*#*** This should go in a .css file of course. */ { printf("<style>\n" "div.sectionLite { border-width: 1px; border-color: #"HG_COL_BORDER"; border-style: solid;" " background-color: #"HG_COL_INSIDE"; padding-left: 10px; padding-right: 10px; " " padding-top: 8px; padding-bottom: 5px; margin-top: 5px; margin-bottom: 5px }\n" ".sectionLiteHeader { font-weight: bold; font-size:larger; color:#000000;" " text-align:left; vertical-align:bottom; white-space:nowrap; }\n" "div.sectionLiteHeader.noReorderRemove { padding-bottom:5px; }\n" "div.sourceFilter { padding-top: 5px; padding-bottom: 5px }\n" "</style>\n"); } @@ -471,35 +481,44 @@ { printf("If you have more than one custom track or hub track in " PGSNP_OR_VCF " format, please select the one you wish to annotate:<BR>\n"); } printf("<B>variants: </B>"); printf("<SELECT ID='hgva_variantTrack' NAME='hgva_variantTrack'>\n"); jsOnEventById("change", "hgva_variantTrack", "hgva.changeVariantSource();"); char *selected = cartUsualString(cart, "hgva_variantTrack", ""); struct slRef *ref; for (ref = varTrackList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; printOption(tdb->track, selected, tdb->longLabel); } printOption(hgvaSampleVariants, selected, hgvaSampleVariantsLabel); +printOption(hgvaUseHgvs, selected, hgvaHgvsLabel); boolean hasSnps = (hFindLatestSnpTable(database, NULL) != NULL); if (hasSnps) printOption(hgvaUseVariantIds, selected, hgvaVariantIdsLabel); printf("</SELECT><BR>\n"); +printf("<div id='"hgvaHgvsPasteContainer"'%s>\n", + differentString(selected, hgvaUseHgvs) ? " style='display: none;'" : ""); +printf("Enter HGVS terms: one term per line; blank lines and comment lines beginning with '#' " + "are ignored.<BR>\n"); +char *oldPasted = cartUsualString(cart, hgvaHgvs, ""); +cgiMakeTextArea(hgvaHgvs, oldPasted, 10, 70); +puts("</div>"); + if (hasSnps) { printf("<div id='"hgvaVariantPasteContainer"'%s>\n", differentString(selected, hgvaUseVariantIds) ? " style='display: none;'" : ""); printf("Enter dbSNP rs# identifiers separated by whitespace or commas:<BR>\n"); char *oldPasted = cartUsualString(cart, hgvaVariantIds, ""); cgiMakeTextArea(hgvaVariantIds, oldPasted, 10, 70); puts("</div>"); } printf("<B>maximum number of variants to be processed:</B>\n"); char *curLimit = cartUsualString(cart, "hgva_variantLimit", "10000"); char *limitLabels[] = { "10", "100", "1,000", "10,000", "100,000" }; char *limitValues[] = { "10", "100", "1000", "10000", "100000" }; cgiMakeDropListWithVals("hgva_variantLimit", limitLabels, limitValues, ArraySize(limitLabels), @@ -1747,44 +1766,56 @@ static void getCartPosOrDie(char **retChrom, uint *retStart, uint *retEnd) /* Get chrom:start-end from cart, errAbort if any problems. */ { char *position = cartString(cart, hgvaRange); if (! parsePosition(position, retChrom, retStart, retEnd)) errAbort("Expected position to be chrom:start-end but got '%s'", position); } static char *sampleVariantsPath(struct annoAssembly *assembly, char *geneTrack) /* Construct a path for a trash file of contrived example variants for this assembly * and gene track. */ { char *chrom = NULL; uint start = 0, end = 0; getCartPosOrDie(&chrom, &start, &end); -char *subDir = "hgv"; -mkdirTrashDirectory(subDir); +mkdirTrashDirectory(hgvsTrashSubDir); struct dyString *dy = dyStringCreate("%s/%s/%s_%s_%s_%u-%u.vcf", - trashDir(), subDir, assembly->name, geneTrack, + trashDir(), hgvsTrashSubDir, assembly->name, geneTrack, chrom, start, end); return dyStringCannibalize(&dy); } -static void writeMinimalVcfHeader(FILE *f, char *db) -/* Write header for VCF with no meaningful qual, filter, info or genotype data. */ +static char *makeMinimalVcfHeader(char *db, char *headerDefs) +/* Return a string containing a simple no-genotypes VCF header. db must be non-NULL. + * headerDefs can be NULL or empty or line(s) starting with '##' and ending w/'\n' */ { -fputs("##fileformat=VCFv4.1\n", f); -fprintf(f, "##reference=%s\n", db); -fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", f); +struct dyString *dy = dyStringCreate("##fileformat=VCFv4.2\n" + "##reference=%s\n" + "%s" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", + db, (headerDefs ? headerDefs : "")); +return dyStringCannibalize(&dy); +} + +static void writeMinimalVcfHeader(FILE *f, char *db, char *headerDefs) +/* Write header for VCF with no genotype data. db must be non-NULL. + * headerDefs can be NULL or empty or line(s) starting with '##' and ending w/'\n' */ +{ +char *headerString = makeMinimalVcfHeader(db, headerDefs); +fputs(headerString, f); +freeMem(headerString); } static void mutateBase(char *pBase) /* Change *pBase into a different base. */ { static char *bases = "ACGT"; char c; // In real life the probabilities aren't even, but this is easier. while ((c = bases[(uint)(floor(drand48() * 4.0))]) == *pBase) ; *pBase = c; } static void writeMinimalVcfRow(FILE *f, struct vcfRecord *rec) /* Write a minimalist VCF row (coords, name, alleles and placeholders). */ @@ -1976,31 +2007,31 @@ static struct annoStreamer *makeSampleVariantsStreamer(struct annoAssembly *assembly, struct trackDb *geneTdb, int maxOutRows) /* Construct a VCF file of example variants for db (unless it already exists) * and return an annoStreamer for that file. If possible, make the variants hit a gene. */ { char *sampleFile = sampleVariantsPath(assembly, geneTdb->track); boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildSampleVariants", FALSE); if (! fileExists(sampleFile) || forceRebuild) { struct annoStreamer *geneStream = hAnnoStreamerFromTrackDb(assembly, geneTdb->table, geneTdb, NULL, ANNO_NO_LIMIT, NULL); boolean isBig = sameString(geneTdb->type, "bigGenePred"); boolean gotCoding = FALSE, gotNonCoding = FALSE; struct genePred *gpList = genesFromPosition(geneStream, isBig, &gotCoding, &gotNonCoding); FILE *f = mustOpen(sampleFile, "w"); - writeMinimalVcfHeader(f, assembly->name); + writeMinimalVcfHeader(f, assembly->name, NULL); if (gpList == NULL) { warn("Unable to find any gene transcripts in '%s' (%s)", geneTdb->shortLabel, geneTdb->track); writeArbitraryVariants(f, assembly); } else { struct bed4 *bedList = NULL; uint chromSize = annoAssemblySeqSize(assembly, gpList->chrom); uint minSeqStart = chromSize, maxSeqEnd = 0; struct genePred *gp; for (gp = gpList; gp != NULL; gp = gp->next) { int txSeqStart = gp->txStart - (UPSTREAMFUDGE + IGFUDGE); @@ -2022,38 +2053,37 @@ minSeqStart, maxSeqEnd); struct bed4 *bed; for (bed = bedList; bed != NULL; bed = bed->next) { writeMinimalVcfRowFromBed(f, bed, txSeq, minSeqStart); } dnaSeqFree(&txSeq); bed4FreeList(&bedList); } geneStream->close(&geneStream); carefulClose(&f); } return annoStreamVcfNew(sampleFile, NULL, FALSE, assembly, maxOutRows); } -static char *variantIdPath(struct annoAssembly *assembly, char *variantIds) -/* Use the md5sum of the user's pasted/uploaded variants to make a hopefully - * unique trash filename. */ +static char *md5TrashPath(struct annoAssembly *assembly, char *string) +/* Use the md5sum of a string to make a hopefully unique trash filename. */ { -char *md5sum = md5HexForString(variantIds); -char *subDir = "hgv"; -mkdirTrashDirectory(subDir); -struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), subDir, assembly->name, md5sum); +char *md5sum = md5HexForString(string); +mkdirTrashDirectory(hgvsTrashSubDir); +struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), hgvsTrashSubDir, + assembly->name, md5sum); return dyStringCannibalize(&dy); } static struct slName *hashListNames(struct hash *hash) /* Return a list of all element names in the hash (if any). */ { struct slName *list = NULL; struct hashCookie cookie = hashFirst(hash); struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) slAddHead(&list, slNameNew(hel->name)); return list; } static char *encloseInAngleBracketsDbSnp(char *stringIn) @@ -2337,47 +2367,126 @@ if (*pEnd < last->chromStart) *pEnd = last->chromStart; } } } static struct annoStreamer *makeVariantIdStreamer(struct annoAssembly *assembly, int maxOutRows, char **pChrom, uint *pStart, uint *pEnd, struct slName **pCommentList) /* Translate user's pasted/uploaded variant IDs into minimal VCF if possible. * Return a VCF streamer for those, and if the current search position is too narrow * to include all of the variants, widen it as necessary. */ { // Hash variant text to get trash filename. Use if exists, otherwise build it. char *variantIds = cartString(cart, hgvaVariantIds); -char *varFile = variantIdPath(assembly, variantIds); +char *varFile = md5TrashPath(assembly, variantIds); boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildVariantIds", FALSE); if (! fileExists(varFile) || forceRebuild) { struct vcfRecord *varList = parseVariantIds(assembly, variantIds, pCommentList); //#*** If no variants were recognized, we should probably show main page with a warning. adjustRangeForVariants(varList, pChrom, pStart, pEnd); FILE *f = mustOpen(varFile, "w"); - writeMinimalVcfHeader(f, assembly->name); + writeMinimalVcfHeader(f, assembly->name, NULL); struct vcfRecord *var; for (var = varList; var != NULL; var = var->next) writeMinimalVcfRow(f, var); carefulClose(&f); } return annoStreamVcfNew(varFile, NULL, FALSE, assembly, maxOutRows); } +static struct vcfRecord *parseHgvs(struct vcfFile *vcff, struct annoAssembly *assembly, + char *hgvsTerms, struct slName **pCommentList) +/* Return a sorted list of vcfRecords . */ +{ +struct vcfRecord *recList = NULL; +struct slName *failedTerms = NULL; +struct dyString *dyError = dyStringNew(0); +struct lineFile *lf = lineFileOnString("user-provided HGVS terms", TRUE, cloneString(hgvsTerms)); +char *term = NULL; +while (lineFileNextReal(lf, &term)) + { + eraseTrailingSpaces(term); + dyStringClear(dyError); + struct vcfRow *vcfRow = hgvsToVcfRow(assembly->name, term, FALSE, dyError); + if (vcfRow) + { + char *row[8]; + row[0] = vcfRow->chrom; + char posStr[64]; + safef(posStr, sizeof(posStr), "%d", vcfRow->pos); + row[1] = posStr; + row[2] = vcfRow->id; + row[3] = vcfRow->ref; + row[4] = vcfRow->alt; + row[5] = "."; + row[6] = vcfRow->filter; + row[7] = vcfRow->info; + slAddHead(&recList, vcfRecordFromRow(vcff, row)); + } + else + slNameAddHead(&failedTerms, dyStringContents(dyError)); + } +if (failedTerms != NULL) + { + slReverse(&failedTerms); + char *firstUnknownIds = firstNCommaSep(failedTerms, 5); + struct dyString *dy = dyStringCreate("%d HGVS terms could not be parsed and/or mapped to %s, " + "e.g. %s", + slCount(failedTerms), assembly->name, firstUnknownIds); + slAddTail(pCommentList, slNameNew(dy->string)); + freeMem(firstUnknownIds); + dyStringFree(&dy); + } +slSort(&recList, vcfRecordCmp); +slNameFreeList(&failedTerms); +dyStringFree(&dyError); +lineFileClose(&lf); +return recList; +} + +static struct annoStreamer *makeHgvsStreamer(struct annoAssembly *assembly, int maxOutRows, + char **pChrom, uint *pStart, uint *pEnd, + struct slName **pCommentList) +/* Translate user's pasted/uploaded variant IDs into minimal VCF if possible. + * Return a VCF streamer for those., and if the current search position is too narrow + * to include all of the variants, widen it as necessary. */ +{ +// Hash HGVS input to get trash filename. Use if exists, otherwise build it. +char *hgvsTerms = cartString(cart, hgvaHgvs); +char *varFile = md5TrashPath(assembly, hgvsTerms); +boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildHgvs", FALSE); +if (! fileExists(varFile) || forceRebuild) + { + char *headerString = makeMinimalVcfHeader(assembly->name, HGVS_VCF_HEADER_DEFS); + struct vcfFile *vcff = vcfFileFromHeader("hgVaiHgvs", headerString, VCF_IGNORE_ERRS); + struct vcfRecord *varList = parseHgvs(vcff, assembly, hgvsTerms, pCommentList); +//#*** If no HGVS terms could be mapped, we should probably show main page with a warning. + adjustRangeForVariants(varList, pChrom, pStart, pEnd); + FILE *f = mustOpen(varFile, "w"); + writeMinimalVcfHeader(f, assembly->name, HGVS_VCF_HEADER_DEFS); + struct vcfRecord *var; + for (var = varList; var != NULL; var = var->next) + writeMinimalVcfRow(f, var); + carefulClose(&f); + vcfFileFree(&vcff); + } +return annoStreamVcfNew(varFile, NULL, FALSE, assembly, maxOutRows); +} + static struct trackDb *getVariantTrackDb(char *variantTrack) /* Get a tdb for the user's selected variant track, or warn and return NULL * (main page will be displayed) */ { struct trackDb *varTdb = tdbForTrack(database, variantTrack, &fullTrackList); if (varTdb == NULL) { if (isHubTrack(variantTrack)) warn("Can't find hub track '%s'", variantTrack); else warn("Can't find tdb for variant track '%s'", variantTrack); } else checkVariantTrack(varTdb); return varTdb; @@ -2600,30 +2709,35 @@ { primary = makeSampleVariantsStreamer(assembly, geneTdb, maxVarRows); primaryLongLabel = hgvaSampleVariantsLabel; // Sample variants can't always be made within the currently selected position range, // so just for these, force search to be genome-wide. chrom = NULL; start = 0; end = 0; } else if (sameString(variantTrack, hgvaUseVariantIds)) { // Override search position if cart position doesn't include all variants: primary = makeVariantIdStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList); primaryLongLabel = hgvaVariantIdsLabel; } +else if (sameString(variantTrack, hgvaUseHgvs)) + { + primary = makeHgvsStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList); + primaryLongLabel = hgvaHgvsLabel; + } else if (sameString(variantTrack, hgvaUseVariantFileOrUrl)) { char *fileOrUrl = cartString(cart, hgvaVariantFileOrUrl); char *type = cartOptionalString(cart, hgvaVariantFileOrUrlType); primary = hAnnoStreamerFromBigFileUrl(fileOrUrl, NULL, assembly, maxVarRows, type); primaryLongLabel = hgvaVariantFileOrUrlLabel; } else { struct trackDb *varTdb = getVariantTrackDb(variantTrack); if (varTdb == NULL) { if (! isCommandLine) doUi(); return;