41723d134f8b0c52c78705c2e5da97f8875e3cf6
angie
Wed Feb 15 11:44:39 2017 -0800
Added HGVS terms as variant input option in hgVai. refs #11460
diff --git src/hg/hgVai/hgVai.c src/hg/hgVai/hgVai.c
index 8297ffa..868a394 100644
--- src/hg/hgVai/hgVai.c
+++ src/hg/hgVai/hgVai.c
@@ -2,30 +2,31 @@
/* Copyright (C) 2014 The Regents of the University of California
* See README in this or parent directory for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "htmshell.h"
#include "web.h"
#include "cheapcgi.h"
#include "cart.h"
#include "cartTrackDb.h"
#include "genbank.h"
#include "hgConfig.h"
+#include "hgHgvs.h"
#include "hui.h"
#include "grp.h"
#include "hCommon.h"
#include "hgFind.h"
#include "hPrint.h"
#include "jsHelper.h"
#include "memalloc.h"
#include "textOut.h"
#include "trackHub.h"
#include "hubConnect.h"
#include "twoBit.h"
#include "gpFx.h"
#include "bigGenePred.h"
#include "udc.h"
#include "knetUdc.h"
@@ -53,40 +54,49 @@
char *regionType = NULL; /* genome, ENCODE pilot regions, or specific position range. */
struct grp *fullGroupList = NULL; /* List of all groups. */
struct trackDb *fullTrackList = NULL; /* List of all tracks in database. */
static struct pipeline *compressPipeline = (struct pipeline *)NULL;
// Null terminated list of CGI Variables we don't want to save permanently:
char *excludeVars[] = {"Submit", "submit", "hgva_startQuery", NULL,};
#define hgvaRange "position"
#define hgvaRegionType "hgva_regionType"
#define hgvaRegionTypeEncode "encode"
#define hgvaRegionTypeGenome "genome"
#define hgvaRegionTypeRange "range"
#define hgvaPositionContainer "positionContainer"
+#define hgvsTrashSubDir "hgv"
+// auto-generated variants as input
#define hgvaSampleVariants "hgva_internally_generated_sample_variants"
#define hgvaSampleVariantsLabel "Artificial Example Variants"
+// dbSNP rsIDs as variant inputs:
#define hgvaUseVariantIds "hgva_useVariantIds"
#define hgvaVariantIdsLabel "Variant Identifiers"
#define hgvaVariantIds "hgva_variantIds"
#define hgvaUseVariantFileOrUrl "hgva_useVariantFileOrUrl"
-#define hgvaVariantFileOrUrlLabel "from file"
+#define hgvaVariantPasteContainer "variantPasteContainer"
+// A local file or URL as variant input:
+#define hgvaVariantFileOrUrlLabel "from file or URL"
#define hgvaVariantFileOrUrl "hgva_variantFileOrUrl"
#define hgvaVariantFileOrUrlType "hgva_variantFileOrUrlType"
-#define hgvaVariantPasteContainer "variantPasteContainer"
+// HGVS terms as variant inputs:
+#define hgvaUseHgvs "hgva_useHgvs"
+#define hgvaHgvsLabel "HGVS terms"
+#define hgvaHgvs "hgva_hgvs"
+#define hgvaHgvsPasteContainer "hgvsPasteContainer"
void addSomeCss()
/*#*** This should go in a .css file of course. */
{
printf("\n");
}
@@ -471,35 +481,44 @@
{
printf("If you have more than one custom track or hub track in "
PGSNP_OR_VCF " format, please select the one you wish to annotate:
\n");
}
printf("variants: ");
printf("
\n");
+printf("
\n",
+ differentString(selected, hgvaUseHgvs) ? " style='display: none;'" : "");
+printf("Enter HGVS terms: one term per line; blank lines and comment lines beginning with '#' "
+ "are ignored.
\n");
+char *oldPasted = cartUsualString(cart, hgvaHgvs, "");
+cgiMakeTextArea(hgvaHgvs, oldPasted, 10, 70);
+puts("
");
+
if (hasSnps)
{
printf("\n",
differentString(selected, hgvaUseVariantIds) ? " style='display: none;'" : "");
printf("Enter dbSNP rs# identifiers separated by whitespace or commas:
\n");
char *oldPasted = cartUsualString(cart, hgvaVariantIds, "");
cgiMakeTextArea(hgvaVariantIds, oldPasted, 10, 70);
puts("
");
}
printf("maximum number of variants to be processed:\n");
char *curLimit = cartUsualString(cart, "hgva_variantLimit", "10000");
char *limitLabels[] = { "10", "100", "1,000", "10,000", "100,000" };
char *limitValues[] = { "10", "100", "1000", "10000", "100000" };
cgiMakeDropListWithVals("hgva_variantLimit", limitLabels, limitValues, ArraySize(limitLabels),
@@ -1747,44 +1766,56 @@
static void getCartPosOrDie(char **retChrom, uint *retStart, uint *retEnd)
/* Get chrom:start-end from cart, errAbort if any problems. */
{
char *position = cartString(cart, hgvaRange);
if (! parsePosition(position, retChrom, retStart, retEnd))
errAbort("Expected position to be chrom:start-end but got '%s'", position);
}
static char *sampleVariantsPath(struct annoAssembly *assembly, char *geneTrack)
/* Construct a path for a trash file of contrived example variants for this assembly
* and gene track. */
{
char *chrom = NULL;
uint start = 0, end = 0;
getCartPosOrDie(&chrom, &start, &end);
-char *subDir = "hgv";
-mkdirTrashDirectory(subDir);
+mkdirTrashDirectory(hgvsTrashSubDir);
struct dyString *dy = dyStringCreate("%s/%s/%s_%s_%s_%u-%u.vcf",
- trashDir(), subDir, assembly->name, geneTrack,
+ trashDir(), hgvsTrashSubDir, assembly->name, geneTrack,
chrom, start, end);
return dyStringCannibalize(&dy);
}
-static void writeMinimalVcfHeader(FILE *f, char *db)
-/* Write header for VCF with no meaningful qual, filter, info or genotype data. */
+static char *makeMinimalVcfHeader(char *db, char *headerDefs)
+/* Return a string containing a simple no-genotypes VCF header. db must be non-NULL.
+ * headerDefs can be NULL or empty or line(s) starting with '##' and ending w/'\n' */
{
-fputs("##fileformat=VCFv4.1\n", f);
-fprintf(f, "##reference=%s\n", db);
-fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", f);
+struct dyString *dy = dyStringCreate("##fileformat=VCFv4.2\n"
+ "##reference=%s\n"
+ "%s"
+ "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
+ db, (headerDefs ? headerDefs : ""));
+return dyStringCannibalize(&dy);
+}
+
+static void writeMinimalVcfHeader(FILE *f, char *db, char *headerDefs)
+/* Write header for VCF with no genotype data. db must be non-NULL.
+ * headerDefs can be NULL or empty or line(s) starting with '##' and ending w/'\n' */
+{
+char *headerString = makeMinimalVcfHeader(db, headerDefs);
+fputs(headerString, f);
+freeMem(headerString);
}
static void mutateBase(char *pBase)
/* Change *pBase into a different base. */
{
static char *bases = "ACGT";
char c;
// In real life the probabilities aren't even, but this is easier.
while ((c = bases[(uint)(floor(drand48() * 4.0))]) == *pBase)
;
*pBase = c;
}
static void writeMinimalVcfRow(FILE *f, struct vcfRecord *rec)
/* Write a minimalist VCF row (coords, name, alleles and placeholders). */
@@ -1976,31 +2007,31 @@
static struct annoStreamer *makeSampleVariantsStreamer(struct annoAssembly *assembly,
struct trackDb *geneTdb, int maxOutRows)
/* Construct a VCF file of example variants for db (unless it already exists)
* and return an annoStreamer for that file. If possible, make the variants hit a gene. */
{
char *sampleFile = sampleVariantsPath(assembly, geneTdb->track);
boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildSampleVariants", FALSE);
if (! fileExists(sampleFile) || forceRebuild)
{
struct annoStreamer *geneStream = hAnnoStreamerFromTrackDb(assembly, geneTdb->table, geneTdb,
NULL, ANNO_NO_LIMIT, NULL);
boolean isBig = sameString(geneTdb->type, "bigGenePred");
boolean gotCoding = FALSE, gotNonCoding = FALSE;
struct genePred *gpList = genesFromPosition(geneStream, isBig, &gotCoding, &gotNonCoding);
FILE *f = mustOpen(sampleFile, "w");
- writeMinimalVcfHeader(f, assembly->name);
+ writeMinimalVcfHeader(f, assembly->name, NULL);
if (gpList == NULL)
{
warn("Unable to find any gene transcripts in '%s' (%s)",
geneTdb->shortLabel, geneTdb->track);
writeArbitraryVariants(f, assembly);
}
else
{
struct bed4 *bedList = NULL;
uint chromSize = annoAssemblySeqSize(assembly, gpList->chrom);
uint minSeqStart = chromSize, maxSeqEnd = 0;
struct genePred *gp;
for (gp = gpList; gp != NULL; gp = gp->next)
{
int txSeqStart = gp->txStart - (UPSTREAMFUDGE + IGFUDGE);
@@ -2022,38 +2053,37 @@
minSeqStart, maxSeqEnd);
struct bed4 *bed;
for (bed = bedList; bed != NULL; bed = bed->next)
{
writeMinimalVcfRowFromBed(f, bed, txSeq, minSeqStart);
}
dnaSeqFree(&txSeq);
bed4FreeList(&bedList);
}
geneStream->close(&geneStream);
carefulClose(&f);
}
return annoStreamVcfNew(sampleFile, NULL, FALSE, assembly, maxOutRows);
}
-static char *variantIdPath(struct annoAssembly *assembly, char *variantIds)
-/* Use the md5sum of the user's pasted/uploaded variants to make a hopefully
- * unique trash filename. */
+static char *md5TrashPath(struct annoAssembly *assembly, char *string)
+/* Use the md5sum of a string to make a hopefully unique trash filename. */
{
-char *md5sum = md5HexForString(variantIds);
-char *subDir = "hgv";
-mkdirTrashDirectory(subDir);
-struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), subDir, assembly->name, md5sum);
+char *md5sum = md5HexForString(string);
+mkdirTrashDirectory(hgvsTrashSubDir);
+struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), hgvsTrashSubDir,
+ assembly->name, md5sum);
return dyStringCannibalize(&dy);
}
static struct slName *hashListNames(struct hash *hash)
/* Return a list of all element names in the hash (if any). */
{
struct slName *list = NULL;
struct hashCookie cookie = hashFirst(hash);
struct hashEl *hel;
while ((hel = hashNext(&cookie)) != NULL)
slAddHead(&list, slNameNew(hel->name));
return list;
}
static char *encloseInAngleBracketsDbSnp(char *stringIn)
@@ -2337,47 +2367,126 @@
if (*pEnd < last->chromStart)
*pEnd = last->chromStart;
}
}
}
static struct annoStreamer *makeVariantIdStreamer(struct annoAssembly *assembly, int maxOutRows,
char **pChrom, uint *pStart, uint *pEnd,
struct slName **pCommentList)
/* Translate user's pasted/uploaded variant IDs into minimal VCF if possible.
* Return a VCF streamer for those, and if the current search position is too narrow
* to include all of the variants, widen it as necessary. */
{
// Hash variant text to get trash filename. Use if exists, otherwise build it.
char *variantIds = cartString(cart, hgvaVariantIds);
-char *varFile = variantIdPath(assembly, variantIds);
+char *varFile = md5TrashPath(assembly, variantIds);
boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildVariantIds", FALSE);
if (! fileExists(varFile) || forceRebuild)
{
struct vcfRecord *varList = parseVariantIds(assembly, variantIds, pCommentList);
//#*** If no variants were recognized, we should probably show main page with a warning.
adjustRangeForVariants(varList, pChrom, pStart, pEnd);
FILE *f = mustOpen(varFile, "w");
- writeMinimalVcfHeader(f, assembly->name);
+ writeMinimalVcfHeader(f, assembly->name, NULL);
struct vcfRecord *var;
for (var = varList; var != NULL; var = var->next)
writeMinimalVcfRow(f, var);
carefulClose(&f);
}
return annoStreamVcfNew(varFile, NULL, FALSE, assembly, maxOutRows);
}
+static struct vcfRecord *parseHgvs(struct vcfFile *vcff, struct annoAssembly *assembly,
+ char *hgvsTerms, struct slName **pCommentList)
+/* Return a sorted list of vcfRecords . */
+{
+struct vcfRecord *recList = NULL;
+struct slName *failedTerms = NULL;
+struct dyString *dyError = dyStringNew(0);
+struct lineFile *lf = lineFileOnString("user-provided HGVS terms", TRUE, cloneString(hgvsTerms));
+char *term = NULL;
+while (lineFileNextReal(lf, &term))
+ {
+ eraseTrailingSpaces(term);
+ dyStringClear(dyError);
+ struct vcfRow *vcfRow = hgvsToVcfRow(assembly->name, term, FALSE, dyError);
+ if (vcfRow)
+ {
+ char *row[8];
+ row[0] = vcfRow->chrom;
+ char posStr[64];
+ safef(posStr, sizeof(posStr), "%d", vcfRow->pos);
+ row[1] = posStr;
+ row[2] = vcfRow->id;
+ row[3] = vcfRow->ref;
+ row[4] = vcfRow->alt;
+ row[5] = ".";
+ row[6] = vcfRow->filter;
+ row[7] = vcfRow->info;
+ slAddHead(&recList, vcfRecordFromRow(vcff, row));
+ }
+ else
+ slNameAddHead(&failedTerms, dyStringContents(dyError));
+ }
+if (failedTerms != NULL)
+ {
+ slReverse(&failedTerms);
+ char *firstUnknownIds = firstNCommaSep(failedTerms, 5);
+ struct dyString *dy = dyStringCreate("%d HGVS terms could not be parsed and/or mapped to %s, "
+ "e.g. %s",
+ slCount(failedTerms), assembly->name, firstUnknownIds);
+ slAddTail(pCommentList, slNameNew(dy->string));
+ freeMem(firstUnknownIds);
+ dyStringFree(&dy);
+ }
+slSort(&recList, vcfRecordCmp);
+slNameFreeList(&failedTerms);
+dyStringFree(&dyError);
+lineFileClose(&lf);
+return recList;
+}
+
+static struct annoStreamer *makeHgvsStreamer(struct annoAssembly *assembly, int maxOutRows,
+ char **pChrom, uint *pStart, uint *pEnd,
+ struct slName **pCommentList)
+/* Translate user's pasted/uploaded variant IDs into minimal VCF if possible.
+ * Return a VCF streamer for those., and if the current search position is too narrow
+ * to include all of the variants, widen it as necessary. */
+{
+// Hash HGVS input to get trash filename. Use if exists, otherwise build it.
+char *hgvsTerms = cartString(cart, hgvaHgvs);
+char *varFile = md5TrashPath(assembly, hgvsTerms);
+boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildHgvs", FALSE);
+if (! fileExists(varFile) || forceRebuild)
+ {
+ char *headerString = makeMinimalVcfHeader(assembly->name, HGVS_VCF_HEADER_DEFS);
+ struct vcfFile *vcff = vcfFileFromHeader("hgVaiHgvs", headerString, VCF_IGNORE_ERRS);
+ struct vcfRecord *varList = parseHgvs(vcff, assembly, hgvsTerms, pCommentList);
+//#*** If no HGVS terms could be mapped, we should probably show main page with a warning.
+ adjustRangeForVariants(varList, pChrom, pStart, pEnd);
+ FILE *f = mustOpen(varFile, "w");
+ writeMinimalVcfHeader(f, assembly->name, HGVS_VCF_HEADER_DEFS);
+ struct vcfRecord *var;
+ for (var = varList; var != NULL; var = var->next)
+ writeMinimalVcfRow(f, var);
+ carefulClose(&f);
+ vcfFileFree(&vcff);
+ }
+return annoStreamVcfNew(varFile, NULL, FALSE, assembly, maxOutRows);
+}
+
static struct trackDb *getVariantTrackDb(char *variantTrack)
/* Get a tdb for the user's selected variant track, or warn and return NULL
* (main page will be displayed) */
{
struct trackDb *varTdb = tdbForTrack(database, variantTrack, &fullTrackList);
if (varTdb == NULL)
{
if (isHubTrack(variantTrack))
warn("Can't find hub track '%s'", variantTrack);
else
warn("Can't find tdb for variant track '%s'", variantTrack);
}
else
checkVariantTrack(varTdb);
return varTdb;
@@ -2600,30 +2709,35 @@
{
primary = makeSampleVariantsStreamer(assembly, geneTdb, maxVarRows);
primaryLongLabel = hgvaSampleVariantsLabel;
// Sample variants can't always be made within the currently selected position range,
// so just for these, force search to be genome-wide.
chrom = NULL;
start = 0;
end = 0;
}
else if (sameString(variantTrack, hgvaUseVariantIds))
{
// Override search position if cart position doesn't include all variants:
primary = makeVariantIdStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList);
primaryLongLabel = hgvaVariantIdsLabel;
}
+else if (sameString(variantTrack, hgvaUseHgvs))
+ {
+ primary = makeHgvsStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList);
+ primaryLongLabel = hgvaHgvsLabel;
+ }
else if (sameString(variantTrack, hgvaUseVariantFileOrUrl))
{
char *fileOrUrl = cartString(cart, hgvaVariantFileOrUrl);
char *type = cartOptionalString(cart, hgvaVariantFileOrUrlType);
primary = hAnnoStreamerFromBigFileUrl(fileOrUrl, NULL, assembly, maxVarRows, type);
primaryLongLabel = hgvaVariantFileOrUrlLabel;
}
else
{
struct trackDb *varTdb = getVariantTrackDb(variantTrack);
if (varTdb == NULL)
{
if (! isCommandLine)
doUi();
return;