f78d6e76501646c8b8cc966103bafca896322509 angie Mon Oct 7 13:54:42 2013 -0700 Work in progress for #11460 (paste/upload variant input options...):adding an option for user to paste/upload variant identifiers which will be translated into a sorted list of vcfRecords. Currently we recognize only rs# IDs. I was considering adding dbVar IDs, but those could come from multiple sources (DGV, ClinVar, ISCA) so I'm not sure. Treating all symbolic/named alleles as deletions... non-ideal, but fortunately those are a small minority in dbSNP. Next: recognize HGVS IDs. The grander vision of #11460 includes accepting VEP input format and VCF, but I think those should be new SELECT options so we don't get into quagmire of guessing format. diff --git src/hg/hgVai/hgVai.c src/hg/hgVai/hgVai.c index 6643e56..f4a03ce 100644 --- src/hg/hgVai/hgVai.c +++ src/hg/hgVai/hgVai.c @@ -10,59 +10,66 @@ #include "cart.h" #include "hui.h" #include "grp.h" #include "hCommon.h" #include "hgFind.h" #include "hPrint.h" #include "jsHelper.h" #include "memalloc.h" #include "textOut.h" #include "trackHub.h" #include "hubConnect.h" #include "twoBit.h" #include "gpFx.h" #include "udc.h" #include "knetUdc.h" +#include "md5.h" +#include "regexHelper.h" #include "annoGratorQuery.h" #include "annoGratorGpVar.h" #include "annoFormatVep.h" #include "annoStreamBigBed.h" #include "libifyMe.h" /* Global Variables */ struct cart *cart; /* CGI and other variables */ struct hash *oldVars = NULL; /* The cart before new cgi stuff added. */ char *genome = NULL; /* Name of genome - mouse, human, etc. */ char *database = NULL; /* Current genome database - hg17, mm5, etc. */ char *regionType = NULL; /* genome, ENCODE pilot regions, or specific position range. */ struct grp *fullGroupList = NULL; /* List of all groups. */ struct trackDb *fullTrackList = NULL; /* List of all tracks in database. */ static struct pipeline *compressPipeline = (struct pipeline *)NULL; // Null terminated list of CGI Variables we don't want to save permanently: char *excludeVars[] = {"Submit", "submit", "hgva_startQuery", NULL,}; #define hgvaRange "position" #define hgvaRegionType "hgva_regionType" #define hgvaRegionTypeEncode "encode" #define hgvaRegionTypeGenome "genome" #define hgvaRegionTypeRange "range" #define hgvaPositionContainer "positionContainer" #define hgvaSampleVariants "hgva_internally_generated_sample_variants" #define hgvaSampleVariantsLabel "Artificial Example Variants" +#define hgvaUseVariantIds "hgva_useVariantIds" +#define hgvaVariantIdsLabel "Variant Identifiers" +#define hgvaVariantIds "hgva_variantIds" +#define hgvaVariantPasteContainer "variantPasteContainer" + void addSomeCss() /*#*** This should go in a .css file of course. */ { printf("<style>\n" "div.sectionLite { border-width: 1px; border-color: #"HG_COL_BORDER"; border-style: solid;" " background-color: #"HG_COL_INSIDE"; padding-left: 10px; padding-right: 10px; " " padding-top: 8px; padding-bottom: 5px; margin-top: 5px; margin-bottom: 5px }\n" ".sectionLiteHeader { font-weight: bold; font-size:larger; color:#000000;" " text-align:left; vertical-align:bottom; white-space:nowrap; }\n" "div.sectionLiteHeader.noReorderRemove { padding-bottom:5px; }\n" "div.sourceFilter { padding-top: 5px; padding-bottom: 5px }\n" "</style>\n"); } @@ -293,48 +300,57 @@ } hashFree(&hash); slReverse(&groupRefList); *retGroupRefList = groupRefList; } } boolean isVariantCustomTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get custom or hub tracks with type pgSnp or vcfTabix. */ { return ((sameString(tdb->grp, "user") || isHubTrack(tdb->track)) && (sameString(tdb->type, "pgSnp") || sameString(tdb->type, "vcfTabix"))); } void selectVariants(struct slRef *varGroupList, struct slRef *varTrackList) -/* Offer selection of user's variant custom tracks. */ +/* Offer selection of user's variant custom tracks, example variants, pasted input etc. */ { printf("<div class='sectionLiteHeader'>Select Variants</div>\n"); printf("If you have more than one custom track or hub track in " "<A HREF='../FAQ/FAQformat.html#format10' TARGET=_BLANK>pgSnp</A> or " "<A HREF='../goldenPath/help/vcf.html' TARGET=_BLANK>VCF</A> format, " "please select the one you wish to annotate:<BR>\n"); printf("<B>variants: </B>"); -printf("<SELECT ID='hgva_variantTrack' NAME='hgva_variantTrack'>\n"); +printf("<SELECT ID='hgva_variantTrack' NAME='hgva_variantTrack' " + "onchange=\"hgva.changeVariantSource();\">\n"); char *selected = cartUsualString(cart, "hgva_variantTrack", ""); struct slRef *ref; for (ref = varTrackList; ref != NULL; ref = ref->next) { struct trackDb *tdb = ref->val; printOption(tdb->track, selected, tdb->longLabel); } printOption(hgvaSampleVariants, selected, hgvaSampleVariantsLabel); +printOption(hgvaUseVariantIds, selected, hgvaVariantIdsLabel); printf("</SELECT><BR>\n"); + +printf("<div id='"hgvaVariantPasteContainer"'%s>\n", + differentString(selected, hgvaUseVariantIds) ? " style='display: none;'" : ""); +char *oldPasted = cartUsualString(cart, hgvaVariantIds, ""); +cgiMakeTextArea(hgvaVariantIds, oldPasted, 10, 70); +puts("</div>"); + printf("<B>maximum number of variants to be processed:</B>\n"); char *curLimit = cartUsualString(cart, "hgva_variantLimit", "10000"); char *limitLabels[] = { "10", "100", "1,000", "10,000", "100,000" }; char *limitValues[] = { "10", "100", "1000", "10000", "100000" }; cgiMakeDropListWithVals("hgva_variantLimit", limitLabels, limitValues, ArraySize(limitLabels), curLimit); printCtAndHubButtons(); puts("<BR>"); } boolean isGeneTrack(struct trackDb *tdb, void *filterData) /* This is a TdbFilterFunction to get genePred tracks. */ { return (startsWith("genePred", tdb->type)); } @@ -492,74 +508,84 @@ struct slName *table; for (table = dbNsfpTables; table != NULL; table = table->next) { if (sameString(table->name, "dbNsfpPolyPhen2")) { printDbNsfpSource(table->name, HDIV); printDbNsfpSource(table->name, HVAR); } else printDbNsfpSource(table->name, 0); } puts("<BR>"); endCollapsibleSection(); } -boolean findSnpBed4(char *suffix, char **retFileName, struct trackDb **retTdb) -/* If we can find the latest snpNNNsuffix track using 'show tables rlike "regex"', - * or better yet a bigBed file for it (faster), - * set the appropriate ret* and return TRUE, otherwise return FALSE. */ +char *findLatestSnpTable(char *suffix) +/* Return the name of the 'snp1__<suffix>' table with the highest build number, if any. */ { if (startsWith(hubTrackPrefix, database)) - return FALSE; + return NULL; if (suffix == NULL) suffix = ""; char query[64]; sqlSafef(query, sizeof(query), "show tables like 'snp1__%s'", suffix); struct sqlConnection *conn = hAllocConn(database); struct slName *snpNNNTables = sqlQuickList(conn, query); hFreeConn(&conn); -if (snpNNNTables == NULL) - return FALSE; -boolean foundIt = FALSE; // Skip to last in list -- highest number (show tables can't use rlike or 'order by'): struct slName *table = snpNNNTables; while (table->next != NULL && isdigit(table->next->name[4]) && isdigit(table->next->name[5])) table = table->next; +char *tableName = NULL; +if (table != NULL) + tableName = cloneString(table->name); +slNameFreeList(&snpNNNTables); +return tableName; +} + +boolean findSnpBed4(char *suffix, char **retFileName, struct trackDb **retTdb) +/* If we can find the latest snpNNNsuffix table, or better yet a bigBed file for it (faster), + * set the appropriate ret* and return TRUE, otherwise return FALSE. */ +{ +char *table = findLatestSnpTable(suffix); +if (table == NULL) + return FALSE; +boolean foundIt = FALSE; // Do we happen to have a bigBed version? Better yet, bed4 only for current uses: char fileName[HDB_MAX_PATH_STRING]; -safef(fileName, sizeof(fileName), "/gbdb/%s/vai/%s.bed4.bb", database, table->name); +safef(fileName, sizeof(fileName), "/gbdb/%s/vai/%s.bed4.bb", database, table); if (fileExists(fileName)) { if (retFileName != NULL) *retFileName = cloneString(fileName); foundIt = TRUE; } else { // Not bed4; try just .bb: - safef(fileName, sizeof(fileName), "/gbdb/%s/vai/%s.bb", database, table->name); + safef(fileName, sizeof(fileName), "/gbdb/%s/vai/%s.bb", database, table); if (fileExists(fileName)) { if (retFileName != NULL) *retFileName = cloneString(fileName); foundIt = TRUE; } } if (foundIt && retTdb == NULL) return TRUE; -struct trackDb *tdb = tdbForTrack(database, table->name, &fullTrackList); +struct trackDb *tdb = tdbForTrack(database, table, &fullTrackList); if (tdb != NULL) { if (retTdb != NULL) *retTdb = tdb; return TRUE; } return foundIt; } void selectDbSnp(boolean gotSnp) /* Offer to include rsID (and other fields, or leave that for advanced output??) if available */ { if (!gotSnp) return; startCollapsibleSection("dbSnp", "Known variation", TRUE); @@ -1166,30 +1192,44 @@ fprintf(f, "##reference=%s\n", db); fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", f); } static void mutateBase(char *pBase) /* Change *pBase into a different base. */ { static char *bases = "ACGT"; char c; // In real life the probabilities aren't even, but this is easier. while ((c = bases[(uint)(floor(drand48() * 4.0))]) == *pBase) ; *pBase = c; } +static void writeMinimalVcfRow(FILE *f, struct vcfRecord *rec) +/* Write a minimalist VCF row (coords, name, alleles and placeholders). */ +{ +// VCF columns: #CHROM POS ID REF ALT QUAL FILTER INFO +fprintf(f, "%s\t%u\t%s\t%s\t", + rec->chrom, rec->chromStart+1, rec->name, rec->alleles[0]); +// Comma-separated alt alleles: +int i; +for (i = 1; i < rec->alleleCount; i++) + fprintf(f, "%s%s", (i > 1 ? "," : ""), rec->alleles[i]); +// Placeholder qual, filter, info. +fprintf(f, "\t.\t.\t.\n"); +} + static void writeMinimalVcfRowFromBed(FILE *f, struct bed4 *bed, struct dnaSeq *seq, uint seqOffset) /* Write a minimalist VCF row with a mutation of the reference sequence at the given * position. */ { uint indelBase = 0, start = bed->chromStart, end = bed->chromEnd; if (end != start+1) { // VCF treats indels in a special way: pos is the base to the left of the indel, // and both ref and alt alleles include the base to the left of the indel. indelBase = 1; } // Get reference allele sequence: uint refAlLen = end - start + indelBase; char ref[refAlLen+1]; uint seqStart = start - indelBase - seqOffset; @@ -1348,36 +1388,36 @@ return bedList; } // margin for intergenic variants around transcript: #define IGFUDGE 5 static struct annoStreamer *makeSampleVariantsStreamer(struct annoAssembly *assembly, struct trackDb *geneTdb, int maxOutRows) /* Construct a VCF file of example variants for db (unless it already exists) * and return an annoStreamer for that file. If possible, make the variants hit a gene. */ { char *sampleFile = sampleVariantsPath(assembly); boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildSampleVariants", FALSE); if (! fileExists(sampleFile) || forceRebuild) { - FILE *f = mustOpen(sampleFile, "w"); - writeMinimalVcfHeader(f, assembly->name); struct annoStreamer *geneStream = streamerFromTrack(assembly, geneTdb->table, geneTdb, NULL, NO_MAXROWS); boolean gotCoding = FALSE, gotNonCoding = FALSE; struct genePred *gpList = genesFromPosition(geneStream, &gotCoding, &gotNonCoding); + FILE *f = mustOpen(sampleFile, "w"); + writeMinimalVcfHeader(f, assembly->name); if (gpList == NULL) { warn("Unable to find any gene transcripts in '%s' (%s)", geneTdb->shortLabel, geneTdb->track); writeArbitraryVariants(f, assembly); } else { struct bed4 *bedList = NULL; uint chromSize = annoAssemblySeqSize(assembly, gpList->chrom); uint minSeqStart = chromSize, maxSeqEnd = 0; struct genePred *gp; for (gp = gpList; gp != NULL; gp = gp->next) { int txSeqStart = gp->txStart - (UPSTREAMFUDGE + IGFUDGE); @@ -1399,30 +1439,352 @@ minSeqStart, maxSeqEnd); struct bed4 *bed; for (bed = bedList; bed != NULL; bed = bed->next) { writeMinimalVcfRowFromBed(f, bed, txSeq, minSeqStart); } dnaSeqFree(&txSeq); bed4FreeList(&bedList); } geneStream->close(&geneStream); carefulClose(&f); } return annoStreamVcfNew(sampleFile, FALSE, assembly, maxOutRows); } +static char *variantIdPath(struct annoAssembly *assembly, char *variantIds) +/* Use the md5sum of the user's pasted/uploaded variants to make a hopefully + * unique trash filename. */ +{ +char *md5sum = md5HexForString(variantIds); +char *subDir = "hgv"; +mkdirTrashDirectory(subDir); +struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), subDir, assembly->name, md5sum); +return dyStringCannibalize(&dy); +} + +static struct slName *hashListNames(struct hash *hash) +/* Return a list of all element names in the hash (if any). */ +{ +struct slName *list = NULL; +struct hashCookie cookie = hashFirst(hash); +struct hashEl *hel; +while ((hel = hashNext(&cookie)) != NULL) + slAddHead(&list, slNameNew(hel->name)); +return list; +} + +static char *encloseInAngleBrackets(char *stringIn) +/* If stringIn begins and ends with ()'s, replace them with <> and return stringIn. + * Otherwise, alloc a new string and surround stringIn with <>. */ +{ +char *stringOut = stringIn; +int stringInLen = strlen(stringIn); +if (stringIn[0] == '(' && stringIn[stringInLen-1] == ')') + { + stringIn[0] = '<'; + stringIn[stringInLen-1] = '>'; + } +else + { + int stringOutLen = stringInLen + 2 + 1; + stringOut = needMem(stringOutLen); + safef(stringOut, stringOutLen, "<%s>", stringIn); + } +return stringOut; +} + + +static char **parseDbSnpAltAlleles(char *refAl, char *obsAls, boolean minusStrand, + int *retAltAlCount, boolean *retNeedLeftBase) +/* Given a non-symbolic reference allele and slash-sep observed alleles from dbSNP, + * return an array of +-strand alleles that are not the same as the reference. + * If any allele is "-" (deleted, zero-length), then set retNeedLeftBase to TRUE + * because in this case VCF requires that the reference base to the left of the indel + * must be added to all alleles, and the start coord also moves one base to the left. + * Also, if any alt allele is symbolic, padding is required. + * Note: this trashes obsAls. Resulting array can be freed but not its contents. */ +{ +int obsCount = countChars(obsAls, '/') + 1; +char *obsWords[obsCount]; +chopByChar(obsAls, '/', obsWords, obsCount); +char **altAls; +AllocArray(altAls, obsCount); +int altCount = 0, i; +boolean needLeftBase = isEmpty(refAl) || sameString(refAl, "-"); +for (i = 0; i < obsCount; i++) + { + char *altAl = obsWords[i]; + if (differentString(altAl, refAl)) + { + if (sameString(altAl, "-")) + { + altAls[altCount] = ""; + needLeftBase = TRUE; + } + else + { + // It would be nice to expand the "(CA)11/12/14/15/16/17/18/19/20" syntax of + // some dbSNP observed's. What are these?: "(D1S243)", "(D1S2870)" + // Unfortunately for observed="lengthTooLong" we just can't get the correct allele + // sequence. (76,130 of those in snp138) + // Hmmm, I guess we could at least stick in the right number of N's if we can + // parse "(245 BP INSERTION)". (2403 rows rlike "[0-9]+ BP ?INSERTION" in snp138) + if (!isAllNt(altAl, strlen(altAl))) + { + // Symbolic allele: left base required, and enclose it in <>'s. + needLeftBase = TRUE; + altAl = encloseInAngleBrackets(altAl); + } + else if (minusStrand) + reverseComplement(altAl, strlen(altAl)); + altAls[altCount] = altAl; + } + altCount++; + } + } +*retAltAlCount = altCount; +*retNeedLeftBase = needLeftBase; +return altAls; +} + +char *firstNCommaSep(struct slName *nameList, int n) +/* Return a comma-separated string with the first n names in nameList. */ +{ +struct dyString *dy = dyStringNew(0); +int i; +struct slName *el; +for (i=0, el=nameList; i < 5 && el != NULL; i++, el = el->next) + { + if (i > 0) + dyStringAppend(dy, ", "); + dyStringPrintf(dy, "'%s'", el->name); + } +return dyStringCannibalize(&dy); +} + +//#*** Variant ID-matching should be metadata-driven too. termRegex -> data source. +static const char *rsIdRegex = "^rs[0-9]+$"; + +static void rsIdsToVcfRecords(struct annoAssembly *assembly, struct slName *rsIds, + struct vcfFile *vcff, struct vcfRecord **pRecList, + struct slName **pCommentList) +/* If possible, look up coordinates and alleles of dbSNP rs IDs. */ +{ +if (rsIds == NULL) + return; +char *table = findLatestSnpTable(NULL); +struct sqlConnection *conn = hAllocConn(assembly->name); +// Build a 'name in (...)' query, and build a hash of IDs so we can test whether all were found +struct dyString *dq = sqlDyStringCreate("select chrom, chromStart, chromEnd, name, strand, " + "refUCSC, observed from %s where name in (", + table); +struct hash *idHash = hashNew(0); +struct slName *id; +for (id = rsIds; id != NULL; id = id->next) + { + tolowers(id->name); + dyStringPrintf(dq, "%s'%s'", (id != rsIds ? "," : ""), id->name); + hashStoreName(idHash, id->name); + } +dyStringAppend(dq, ");"); +struct sqlResult *sr = sqlGetResult(conn, dq->string); +// Construct a minimal VCF row to make a vcfRecord for each variant. +char *vcfRow[9]; +vcfRow[5] = vcfRow[6] = vcfRow[7] = "."; // placeholder for qual, filter, info +// It would be cool to someday add snpNNN's exceptions column to the filter or info. +struct dyString *dyAltAlStr = dyStringNew(0); +int i; +char **row; +while ((row = sqlNextRow(sr)) != NULL) + { + char *chrom = row[0]; + uint chromStart = atoll(row[1]); + uint chromEnd = atoll(row[2]); + char *name = row[3]; + char *strand = row[4]; + char refAl[max(strlen(row[5]), chromEnd-chromStart) + 2]; + safecpy(refAl, sizeof(refAl), row[5]); + if (sameString(refAl, "-")) + refAl[0] = '\0'; + else if (! isAllNt(refAl, strlen(refAl))) + { + // refUCSC is symbolic like "( 2548bp insertion )" -- just use the actual reference + // sequence for now, to keep things simple downstream. + //#*** Need something more efficient, like a sequence cache inside assembly! + struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, chromStart, chromEnd); + safecpy(refAl, sizeof(refAl), seq->dna); + toUpperN(refAl, seq->size); + dnaSeqFree(&seq); + } + char *obsAls = row[6]; + int altAlCount = 0; + boolean needLeftBase = FALSE; + char **altAls = parseDbSnpAltAlleles(refAl, obsAls, sameString(strand, "-"), + &altAlCount, &needLeftBase); + needLeftBase |= (chromStart == chromEnd); // should be redundant, but just in case. + hashRemove(idHash, name); + uint vcfStart = chromStart + 1; + dyStringClear(dyAltAlStr); + // If this is a non-symbolic indel, we'll have to move start one base to the left + // and add the reference base to the left of the actual alleles + if (needLeftBase) + { + vcfStart--; + //#*** Need something more efficient, like a sequence cache inside assembly! + struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, + chromStart-1, chromStart); + toUpperN(seq->dna, 1); + char leftBase = seq->dna[0]; + dnaSeqFree(&seq); + char refAlPlus[strlen(refAl)+2]; + safef(refAlPlus, sizeof(refAlPlus), "%c%s", leftBase, refAl); + safecpy(refAl, sizeof(refAl), refAlPlus); + for (i = 0; i < altAlCount; i++) + { + if (i > 0) + dyStringAppendC(dyAltAlStr, ','); + if (isAllNt(altAls[i], strlen(altAls[i]))) + dyStringPrintf(dyAltAlStr, "%c%s", leftBase, altAls[i]); + else + dyStringAppend(dyAltAlStr, altAls[i]); + } + } + else + { + // Not an indel, just make comma-sep string of alt alleles. + for (i = 0; i < altAlCount; i++) + { + if (i > 0) + dyStringAppendC(dyAltAlStr, ','); + dyStringAppend(dyAltAlStr, altAls[i]); + } + } + char vcfStartStr[64]; + safef(vcfStartStr, sizeof(vcfStartStr), "%d", vcfStart); + vcfRow[0] = chrom; + vcfRow[1] = vcfStartStr; + vcfRow[2] = name; + vcfRow[3] = refAl; + vcfRow[4] = dyAltAlStr->string; + struct vcfRecord *rec = vcfRecordFromRow(vcff, vcfRow); + slAddHead(pRecList, rec); + } +dyStringFree(&dyAltAlStr); +// Check for IDs not found +struct slName *notFoundIds = hashListNames(idHash); +if (notFoundIds != NULL) + { + char *namesNotFound = firstNCommaSep(notFoundIds, 5); + struct dyString *dy = dyStringCreate("%d rs# IDs not found, e.g. %s", + slCount(notFoundIds), namesNotFound); + slAddTail(pCommentList, slNameNew(dy->string)); + freeMem(namesNotFound); + dyStringFree(&dy); + } +slNameFreeList(¬FoundIds); +} + +static struct vcfRecord *parseVariantIds(struct annoAssembly *assembly, char *variantIdText, + struct slName **pCommentList) +/* Return a sorted list of minimal vcfRecords (coords, name, ref and alt alleles) + * corresponding to each pasted variant. */ +{ +struct vcfRecord *recList = NULL; +char *p = cloneString(variantIdText), *id; +struct slName *rsIds = NULL, *unknownIds = NULL; +subChar(p, ',', ' '); +while ((id = nextWord(&p)) != NULL) + { + if (regexMatchNoCase(id, rsIdRegex)) + slNameAddHead(&rsIds, id); + else + slNameAddHead(&unknownIds, id); + } +if (unknownIds != NULL) + { + slReverse(&unknownIds); + char *firstUnknownIds = firstNCommaSep(unknownIds, 5); + struct dyString *dy = dyStringCreate("%d variant identifiers are unrecognized, e.g. %s", + slCount(unknownIds), firstUnknownIds); + slAddTail(pCommentList, slNameNew(dy->string)); + freeMem(firstUnknownIds); + dyStringFree(&dy); + } +struct vcfFile *vcff = vcfFileNew(); +rsIdsToVcfRecords(assembly, rsIds, vcff, &recList, pCommentList); +slSort(&recList, vcfRecordCmp); +slNameFreeList(&rsIds); +slNameFreeList(&unknownIds); +return recList; +} + +static void adjustRangeForVariants(struct vcfRecord *recList, char **pChrom, + uint *pStart, uint *pEnd) +/* Given a *sorted* list of VCF records, look at the first and last records and if + * they fall outside of the range, expand the range to include them. */ +{ +if (pChrom != NULL && *pChrom != NULL && recList != NULL) + { + // We have a non-empty sorted list of records, and search is not genome-wide; + // look at first and last variants to see if they're not already included in search range. + struct vcfRecord *first = recList, *last = recList; + while (last->next) + last = last->next; + if (differentString(*pChrom, first->chrom) || differentString(*pChrom, last->chrom)) + { + // Variants span multiple chroms; force genome-wide search. + *pChrom = NULL; + *pStart = *pEnd = 0; + } + else + { + // Variant(s) are on the same chrom as search range; check starts and ends. + if (*pStart > first->chromEnd) + *pStart = first->chromEnd; + if (*pEnd < last->chromStart) + *pEnd = last->chromStart; + } + } +} + +static struct annoStreamer *makeVariantIdStreamer(struct annoAssembly *assembly, int maxOutRows, + char **pChrom, uint *pStart, uint *pEnd, + struct slName **pCommentList) +/* Translate user's pasted/uploaded variant IDs into minimal VCF if possible. + * Return a VCF streamer for those, and if the current search position is too narrow + * to include all of the variants, widen it as necessary. */ +{ +// Hash variant text to get trash filename. Use if exists, otherwise build it. +char *variantIds = cartString(cart, hgvaVariantIds); +char *varFile = variantIdPath(assembly, variantIds); +boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildVariantIds", FALSE); +if (! fileExists(varFile) || forceRebuild) + { + struct vcfRecord *varList = parseVariantIds(assembly, variantIds, pCommentList); +//#*** If no variants were recognized, we should probably show main page with a warning. + adjustRangeForVariants(varList, pChrom, pStart, pEnd); + FILE *f = mustOpen(varFile, "w"); + writeMinimalVcfHeader(f, assembly->name); + struct vcfRecord *var; + for (var = varList; var != NULL; var = var->next) + writeMinimalVcfRow(f, var); + carefulClose(&f); + } +return annoStreamVcfNew(varFile, FALSE, assembly, maxOutRows); +} + static struct trackDb *getVariantTrackDb(char *variantTrack) /* Get a tdb for the user's selected variant track, or warn and return NULL * (main page will be displayed) */ { struct trackDb *varTdb = tdbForTrack(database, variantTrack, &fullTrackList); if (varTdb == NULL) { if (isHubTrack(variantTrack)) warn("Can't find hub track '%s'; try the \"check hub\" button in " "<A href=\"%s?%s&%s=%s\">Track Data Hubs</A>.", variantTrack, hgHubConnectName(), cartSidUrlString(cart), hgHubConnectCgiDestUrl, cgiScriptName()); else warn("Can't find tdb for variant track '%s'", variantTrack); } @@ -1441,40 +1803,47 @@ struct annoAssembly *assembly = getAnnoAssembly(database); char *geneTrack = cartString(cart, "hgva_geneTrack"); struct trackDb *geneTdb = tdbForTrack(database, geneTrack, &fullTrackList); if (geneTdb == NULL) { warn("Can't find tdb for gene track %s", geneTrack); doUi(); return; } int maxVarRows = cartUsualInt(cart, "hgva_variantLimit", 10); struct annoStreamer *primary = NULL; char *primaryLongLabel = NULL; char *variantTrack = cartString(cart, "hgva_variantTrack"); +struct slName *commentList = NULL; if (sameString(variantTrack, hgvaSampleVariants)) { primary = makeSampleVariantsStreamer(assembly, geneTdb, maxVarRows); primaryLongLabel = hgvaSampleVariantsLabel; // Sample variants can't always be made within the currently selected position range, // so just for these, force search to be genome-wide. chrom = NULL; start = 0; end = 0; } +else if (sameString(variantTrack, hgvaUseVariantIds)) + { + // Override search position if cart position doesn't include all variants: + primary = makeVariantIdStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList); + primaryLongLabel = hgvaVariantIdsLabel; + } else { struct trackDb *varTdb = getVariantTrackDb(variantTrack); if (varTdb == NULL) { doUi(); return; } primary = streamerFromTrack(assembly, varTdb->table, varTdb, chrom, maxVarRows); primaryLongLabel = varTdb->longLabel; } enum annoGratorOverlap geneOverlapRule = agoMustOverlap; struct annoGrator *gpVarGrator = gratorFromTrackDb(assembly, geneTdb->table, geneTdb, chrom, NO_MAXROWS, primary->asObj, geneOverlapRule); @@ -1513,30 +1882,33 @@ slReverse(&gratorList); if (doHtml) { webStart(cart, database, "Annotated Variants in VEP/HTML format"); } else { // Undo the htmlPushEarlyHandlers() because after this point they make ugly text: popWarnHandler(); popAbortHandler(); textOpen(); webStartText(); } struct annoGratorQuery *query = annoGratorQueryNew(assembly, primary, gratorList, vepOut); +struct slName *comment; +for (comment = commentList; comment != NULL; comment = comment->next) + vepOut->comment(vepOut, comment->name); if (chrom != NULL) annoGratorQuerySetRegion(query, chrom, start, end); annoGratorQueryExecute(query); annoGratorQueryFree(&query); if (doHtml) webEnd(); else textOutClose(&compressPipeline); } int main(int argc, char *argv[]) /* Process command line. */ { long enteredMainTime = clock1000();