8a0946dd6870f10cde056ba243f1fb4ec1fd16b4 angie Thu Feb 27 11:58:33 2014 -0800 Adding support for plain VCF custom tracks (as opposed to VCF+tabix),since users seem to want to upload VCF, and as long as the file is not too big it will work OK. This means adding a new track type vcf (as opposed to vcfTabix) and supporting it in hgTracks, hgTrackUi, hgc, hgTables and hgVai. (and others I've forgotten?) refs #12416 diff --git src/lib/vcf.c src/lib/vcf.c index b34146f..0d7bb0b 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -755,92 +755,147 @@ int i; for (i = 0; i < rec->alleleCount; i++) { if (rec->alleles[i][1] == '\0') rec->alleles[i] = vcfFilePooledStr(vcff, "-"); else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i]))) rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1); else // don't trim first character of symbolic allele rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]); } } } return chromStartOrig; } -static struct vcfRecord *vcfParseData(struct vcfFile *vcff, int maxRecords) +static boolean chromsMatch(char *chromA, char *chromB) +// Return TRUE if chromA and chromB are non-NULL and identical, possibly ignoring +// "chr" at the beginning of one but not the other. +{ +if (chromA == NULL || chromB == NULL) + return FALSE; +char *chromAPlus = startsWith("chr", chromA) ? chromA+3 : chromA; +char *chromBPlus = startsWith("chr", chromB) ? chromB+3 : chromB; +return sameString(chromAPlus, chromBPlus); +} + +static struct vcfRecord *vcfParseData(struct vcfFile *vcff, char *chrom, int start, int end, + int maxRecords) // Given a vcfFile into which the header has been parsed, and whose // lineFile is positioned at the beginning of a data row, parse and -// return all data rows from lineFile. +// return all data rows (in region, if chrom is non-NULL) from lineFile, +// up to maxRecords. { if (vcff == NULL) return NULL; int recCount = 0; struct vcfRecord *records = NULL; struct vcfRecord *record; while ((record = vcfNextRecord(vcff)) != NULL) { if (maxRecords >= 0 && recCount >= maxRecords) break; + if (chrom == NULL) + { + slAddHead(&records, record); + recCount++; + } + else if (chromsMatch(chrom, record->chrom)) + { + if (end > 0 && record->chromStart >= end) + break; + else if (record->chromEnd > start) + { slAddHead(&records, record); recCount++; } + } + } slReverse(&records); - return records; } -struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, int maxRecords, boolean parseAll) +struct vcfFile *vcfFileMayOpen(char *fileOrUrl, char *chrom, int start, int end, + int maxErr, int maxRecords, boolean parseAll) /* Open fileOrUrl and parse VCF header; return NULL if unable. - * If parseAll, then read in all lines, parse and store in + * If chrom is non-NULL, scan past any variants that precede {chrom, chromStart}. + * Note: this is very inefficient -- it's better to use vcfTabix if possible! + * If parseAll, then read in all lines in region, parse and store in * vcff->records; if maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */ { struct lineFile *lf = NULL; if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) || startsWith("https://", fileOrUrl)) lf = netLineFileOpen(fileOrUrl); else lf = lineFileMayOpen(fileOrUrl, TRUE); struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr); -if (parseAll) +if (vcff && chrom != NULL) + { + char *line = NULL; + int size = 0; + while (lineFileNext(vcff->lf, &line, &size)) + { + char lineCopy[size+1]; + safecpy(lineCopy, sizeof(lineCopy), line); + char *words[VCF_MAX_COLUMNS]; + int wordCount = chopTabs(lineCopy, words); + int expected = 8; + if (vcff->genotypeCount > 0) + expected = 9 + vcff->genotypeCount; + lineFileExpectWords(vcff->lf, expected, wordCount); + struct vcfRecord *record = vcfRecordFromRow(vcff, words); + if (chromsMatch(chrom, record->chrom)) + { + if (record->chromEnd < start) + continue; + else + { + lineFileReuse(vcff->lf); + break; + } + } + } + } +if (vcff && parseAll) { - vcff->records = vcfParseData(vcff, maxRecords); + vcff->records = vcfParseData(vcff, chrom, start, end, maxRecords); lineFileClose(&(vcff->lf)); // Not sure why it is closed. Angie? } return vcff; } struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end, int maxErr, int maxRecords) /* Open a VCF file that has been compressed and indexed by tabix and * parse VCF header, or return NULL if unable. If chrom is non-NULL, * seek to the position range and parse all lines in range into * vcff->records. If maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */ { struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE); struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr); if (vcff == NULL) return NULL; if (isNotEmpty(chrom) && start != end) { if (lineFileSetTabixRegion(lf, chrom, start, end)) { - vcff->records = vcfParseData(vcff, maxRecords); + vcff->records = vcfParseData(vcff, NULL, 0, 0, maxRecords); lineFileClose(&(vcff->lf)); // Not sure why it is closed. Angie? } } return vcff; } int vcfRecordCmp(const void *va, const void *vb) /* Compare to sort based on position. */ { const struct vcfRecord *a = *((struct vcfRecord **)va); const struct vcfRecord *b = *((struct vcfRecord **)vb); int dif; dif = strcmp(a->chrom, b->chrom); if (dif == 0) dif = a->chromStart - b->chromStart; @@ -854,31 +909,31 @@ int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end, int maxErr, int maxRecords) // Reads a batch of records from an opened and indexed VCF file, adding them to // vcff->records and returning the count of new records added in this batch. // Note: vcff->records will continue to be sorted, even if batches are loaded // out of order. Additionally, resulting vcff->records will contain no duplicates // so returned count refects only the new records added, as opposed to all records // in range. If maxErr >= zero, then continue to parse until there are maxErr+1 // errors. A maxErr less than zero does not stop and reports all errors. Set // maxErr to VCF_IGNORE_ERRS for silence. { int oldCount = slCount(vcff->records); if (lineFileSetTabixRegion(vcff->lf, chrom, start, end)) { - struct vcfRecord *records = vcfParseData(vcff, maxRecords); + struct vcfRecord *records = vcfParseData(vcff, NULL, 0, 0, maxRecords); if (records) { struct vcfRecord *lastRec = vcff->records; if (lastRec == NULL) vcff->records = records; else { // Considered just asserting the batches were in order, but a problem may // result when non-overlapping location windows pick up the same long variant. slSortMergeUniq(&(vcff->records), records, vcfRecordCmp, NULL); } } } return slCount(vcff->records) - oldCount;