aa03a541ef25d83f1542202f784c53500c434c2d angie Sat Jun 20 14:35:04 2020 -0700 New util faToVcf for extracting single-nucleotide variants from multi-sequence FASTA alignment. diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c new file mode 100644 index 0000000..3272bfa --- /dev/null +++ src/hg/utils/faToVcf/faToVcf.c @@ -0,0 +1,316 @@ +/* faToVcf - Convert a FASTA alignment file to VCF. */ +#include "common.h" +#include "fa.h" +#include "linefile.h" +#include "hash.h" +#include "iupac.h" +#include "options.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n" + "usage:\n" + " faToVcf in.fa out.vcf\n" + "options:\n" + " -excludeFile=file Exclude sequences named in file which has one sequence name per line\n" + " -includeRef Include the reference in the genotype columns\n" + " (default: omitted as redundant)\n" + " -minAc=N Ignore alternate alleles observed fewer than N times\n" + " -ref=seqName Use seqName as the reference sequence; must be present in faFile\n" + " (default: first sequence in faFile)\n" + " -resolveAmbiguous For IUPAC ambiguous characters like R (A or G), if the character\n" + " represents two bases and one is the reference base, convert it to the\n" + " non-reference base. Otherwise convert it to N.\n" + " -startOffset=N Add N bases to each position (for trimmed alignments)\n" + " -vcfChrom=seqName Use seqName for the CHROM column in VCF (default: ref sequence)\n" + "in.fa must contain a series of sequences with different names and the same length.\n" + "Both N and - are treated as missing information.\n" + ); +} + +#define MAX_ALTS 256 + +/* Command line validation table. */ +static struct optionSpec options[] = { + { "excludeFile", OPTION_STRING }, + { "includeRef", OPTION_BOOLEAN }, + { "minAc", OPTION_INT }, + { "ref", OPTION_STRING }, + { "resolveAmbiguous", OPTION_BOOLEAN }, + { "startOffset", OPTION_INT}, + { "vcfChrom", OPTION_STRING }, + { NULL, 0 }, +}; + +static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences) +/* If -excludeFile was passed in, remove any sequences whose names are found in the file and + * return the modified list. */ +{ +char *excludeFile = optionVal("excludeFile", NULL); +if (excludeFile) + { + struct lineFile *lf = lineFileOpen(excludeFile, TRUE); + struct hash *excludedSeqs = hashNew(0); + char *line; + while (lineFileNextReal(lf, &line)) + { + line = trimSpaces(line); + hashAdd(excludedSeqs, line, NULL); + } + lineFileClose(&lf); + // Make sure there's not a conflict between the reference sequence and the excluded seqs + char *explicitRef = optionVal("ref", NULL); + if (explicitRef && hashLookup(excludedSeqs, explicitRef)) + errAbort("-ref sequence '%s' is included in -excludeFile %s." + "Please resolve this contradiction.", + explicitRef, excludeFile); + int excludeCount = 0; + struct dnaSeq *newList = NULL, *seq, *nextSeq = NULL; + for (seq = sequences; seq != NULL; seq = nextSeq) + { + nextSeq = seq->next; + if (hashLookup(excludedSeqs, seq->name)) + excludeCount++; + else + slAddHead(&newList, seq); + } + hashFree(&excludedSeqs); + slReverse(&newList); + sequences = newList; + verbose(2, "Excluded %d sequences named in %s (%d sequences remaining including reference)\n", + excludeCount, excludeFile, slCount(sequences)); + } +return sequences; +} + +static struct dnaSeq *readSequences(char *faFile) +/* Read all sequences in faFile. Make sure there are at least 2 sequences and that + * all have the same length. If a reference sequence is specified and is not the first + * sequence in the file, then move the reference sequence to the head of the list. */ +{ +verbose(2, "Reading sequences from %s\n", faFile); +struct dnaSeq *sequences = faReadAllMixed(faFile); +int seqCount = slCount(sequences); +verbose(2, "Read %d sequences.\n", seqCount); + +if (seqCount < 2) + errAbort("faToVcf: expecting multiple sequences in %s but found only %d.", faFile, seqCount); + +int seqSize = sequences->size; +struct dnaSeq *seq; +for (seq = sequences->next; seq != NULL; seq = seq->next) + if (seq->size != seqSize) + errAbort("faToVcf: first sequence in %s (%s) has size %d, but sequence %s has size %d. " + "All sequences must have the same size.", + faFile, sequences->name, seqSize, seq->name, seq->size); + +char *refName = optionVal("ref", sequences->name); +if (differentString(sequences->name, refName)) + { + verbose(2, "Using %s as reference.\n", refName); + struct dnaSeq *seq; + for (seq = sequences; seq->next != NULL; seq = seq->next) + { + if (sameString(seq->next->name, refName)) + { + struct dnaSeq *ref = seq->next; + seq->next = ref->next; + ref->next = sequences; + sequences = ref; + break; + } + } + if (differentString(sequences->name, refName)) + errAbort("Could not find -ref sequence '%s' in %s.", refName, faFile); + } + +sequences = removeExcludedSequences(sequences); +return sequences; +} + +static void writeVcfHeader(FILE *outF, char *faFile, char *vcfFile, + struct dnaSeq *sequences, boolean includeRef) +/* Write a simple VCF header with sequence names as genotype columns. */ +{ +fprintf(outF, + "##fileformat=VCFv4.2\n" + "##reference=%s:%s\n" + "##source=faToVcf %s %s\n" //#*** Should document options used. + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", + faFile, sequences->name, faFile, vcfFile); + +struct dnaSeq *seqsForGt = (includeRef ? sequences : sequences->next), *seq; +for (seq = seqsForGt; seq != NULL; seq = seq->next) + fprintf(outF, "\t%s", seq->name); +fprintf(outF, "\n"); +} + +void faToVcf(char *faFile, char *vcfFile) +/* faToVcf - Convert a FASTA alignment file to VCF. */ +{ +struct dnaSeq *sequences = readSequences(faFile); +int seqCount = slCount(sequences); +int seqSize = sequences->size; +boolean includeRef = optionExists("includeRef"); + +verbose(2, "Writing VCF to %s\n", vcfFile); +FILE *outF = mustOpen(vcfFile, "w"); +writeVcfHeader(outF, faFile, vcfFile, sequences, includeRef); + +char *refName = sequences->name; +char *vcfChrom = optionVal("vcfChrom", refName); +int startOffset = optionInt("startOffset", 0); + +struct dnaSeq *seqsForGt = (includeRef ? sequences : sequences->next); +int gtCount = includeRef ? seqCount : seqCount - 1; +int *genotypes = needMem(sizeof(int) * gtCount); +boolean *missing = needMem(sizeof(boolean) * gtCount); + +int chromStart; +for (chromStart = 0; chromStart < seqSize; chromStart++) + { + char ref = toupper(sequences->dna[chromStart]); + if (ref == 'N') + continue; + if (ref == 'U') + ref = 'T'; + char altAlleles[MAX_ALTS]; + int altAlleleCounts[MAX_ALTS]; + int altCount = 0; + memset(altAlleles, 0, sizeof(altAlleles)); + memset(genotypes, 0, sizeof(int) * gtCount); + memset(missing, 0, sizeof(boolean) * gtCount); + struct dnaSeq *seq; + int gtIx; + for (seq = seqsForGt, gtIx = 0; seq != NULL; seq = seq->next, gtIx++) + { + char al = toupper(seq->dna[chromStart]); + if (al == 'U') + al = 'T'; + if (isIupacAmbiguous(al) && optionExists("resolveAmbiguous")) + { + char *bases = iupacAmbiguousToString(al); + if (strlen(bases) > 2 || + (toupper(bases[0]) != ref && toupper(bases[1]) != ref)) + al = 'N'; + else if (toupper(bases[0]) == ref) + al = toupper(bases[1]); + else + al = toupper(bases[0]); + } + if (al == 'N' || al == '-') + missing[gtIx] = TRUE; + else if (al != ref) + { + char *altFound = strchr(altAlleles, al); + int altIx = 0; + if (! altFound) + { + altIx = altCount++; + if (altCount == MAX_ALTS) + errAbort("faToVcf: base %d has at least %d alternate alleles; increase " + "MAX_ALTS in faToVcf.c!", chromStart, MAX_ALTS); + altAlleleCounts[altIx] = 1; + altAlleles[altIx] = al; + } + else + { + altIx = altFound - altAlleles; + altAlleleCounts[altIx]++; + } + genotypes[gtIx] = altIx+1; + } + } + int minAc = optionInt("minAc", 0); + if (minAc > 0) + { + int oldToNewIx[altCount]; + char newAltAlleles[altCount]; + int newAltAlleleCounts[altCount]; + int newAltCount = 0; + int altIx; + for (altIx = 0; altIx < altCount; altIx++) + { + if (altAlleleCounts[altIx] >= minAc) + { + // This alt passes the filter. + int newAltIx = newAltCount++; + newAltAlleleCounts[newAltIx] = altAlleleCounts[altIx]; + newAltAlleles[newAltIx] = altAlleles[altIx]; + oldToNewIx[altIx] = newAltIx; + } + else + { + // We're removing this alt; count its genotypes as missing since they're not + // really reference. + for (gtIx = 0; gtIx < gtCount; gtIx++) + if (genotypes[gtIx] == altIx+1) + missing[gtIx] = TRUE; + oldToNewIx[altIx] = -1; + } + } + if (newAltCount != altCount) + { + for (gtIx = 0; gtIx < gtCount; gtIx++) + { + int oldGt = genotypes[gtIx]; + if (oldGt == 0) + genotypes[gtIx] = 0; + else + genotypes[gtIx] = oldToNewIx[oldGt-1] + 1; + } + altCount = newAltCount; + for (altIx = 0; altIx < newAltCount; altIx++) + { + altAlleles[altIx] = newAltAlleles[altIx]; + altAlleleCounts[altIx] = newAltAlleleCounts[altIx]; + } + } + } + if (altCount) + { + int pos = chromStart + startOffset + 1; + fprintf(outF, "%s\t%d\t", vcfChrom, pos); + int altIx; + for (altIx = 0; altIx < altCount; altIx++) + fprintf(outF, "%s%c%d%c", (altIx > 0) ? "," : "", + ref, pos, altAlleles[altIx]); + fprintf(outF, "\t%c\t", ref); + for (altIx = 0; altIx < altCount; altIx++) + { + if (altIx > 0) + fprintf(outF, ","); + fprintf(outF, "%c", altAlleles[altIx]); + } + fprintf(outF, "\t.\t.\tAC="); + for (altIx = 0; altIx < altCount; altIx++) + { + if (altIx > 0) + fprintf(outF, ","); + fprintf(outF, "%d", altAlleleCounts[altIx]); + } + fprintf(outF, ";AN=%d\tGT", seqCount); + for (gtIx = 0; gtIx < gtCount; gtIx++) + { + if (missing[gtIx]) + fprintf(outF, "\t."); + else + fprintf(outF, "\t%d", genotypes[gtIx]); + } + fprintf(outF, "\n"); + } + } +carefulClose(&outF); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +faToVcf(argv[1], argv[2]); +return 0; +}