0410ce3f2bff7c04762f60551db166f4cae460f7 angie Tue Jun 22 10:13:57 2021 -0700 faToVcf: added -maxDiff to exclude sequences with too many mismatches w/reference (causes usher mem usage increase, no good placement). diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c index 8c9cbf3..454cb1e 100644 --- src/hg/utils/faToVcf/faToVcf.c +++ src/hg/utils/faToVcf/faToVcf.c @@ -9,55 +9,57 @@ void usage(int exitCode) /* Explain usage and exit. */ { fputs( "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n" "usage:\n" " faToVcf in.fa out.vcf\n" "options:\n" " -ambiguousToN Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n" " -excludeFile=file Exclude sequences named in file which has one sequence name per line\n" " -includeRef Include the reference in the genotype columns\n" " (default: omitted as redundant)\n" " -maskSites=file Exclude variants in positions recommended for masking in file\n" " (typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf)\n" + " -maxDiff=N Exclude sequences with more than N mismatches with the reference\n" " -minAc=N Ignore alternate alleles observed fewer than N times\n" " -minAf=F Ignore alternate alleles observed in less than F of non-N bases\n" " -noGenotypes Output 8-column VCF, without the sample genotype columns\n" " -ref=seqName Use seqName as the reference sequence; must be present in faFile\n" " (default: first sequence in faFile)\n" " -resolveAmbiguous For IUPAC ambiguous characters like R (A or G), if the character\n" " represents two bases and one is the reference base, convert it to the\n" " non-reference base. Otherwise convert it to N.\n" " -startOffset=N Add N bases to each position (for trimmed alignments)\n" " -vcfChrom=seqName Use seqName for the CHROM column in VCF (default: ref sequence)\n" "in.fa must contain a series of sequences with different names and the same length.\n" "Both N and - are treated as missing information.\n" , stderr ); exit(exitCode); } #define MAX_ALTS 256 /* Command line validation table. */ static struct optionSpec options[] = { { "ambiguousToN", OPTION_BOOLEAN }, { "excludeFile", OPTION_STRING }, { "includeRef", OPTION_BOOLEAN }, { "maskSites", OPTION_STRING }, + { "maxDiff", OPTION_INT }, { "minAc", OPTION_INT }, { "minAf", OPTION_DOUBLE }, { "noGenotypes", OPTION_BOOLEAN }, { "ref", OPTION_STRING }, { "resolveAmbiguous", OPTION_BOOLEAN }, { "startOffset", OPTION_INT}, { "vcfChrom", OPTION_STRING }, { "h", OPTION_BOOLEAN }, { "-help", OPTION_BOOLEAN }, { NULL, 0 }, }; static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences) /* If -excludeFile was passed in, remove any sequences whose names are found in the file and * return the modified list. */ @@ -87,30 +89,75 @@ nextSeq = seq->next; if (hashLookup(excludedSeqs, seq->name)) excludeCount++; else slAddHead(&newList, seq); } hashFree(&excludedSeqs); slReverse(&newList); sequences = newList; verbose(2, "Excluded %d sequences named in %s (%d sequences remaining including reference)\n", excludeCount, excludeFile, slCount(sequences)); } return sequences; } +static int countDiffs(struct dnaSeq *ref, struct dnaSeq *seq) +/* Return the number of bases that differ between ref and seq ignoring 'N'. */ +{ +if (ref->size != seq->size) + errAbort("countDiffs: expecting equally sized sequences but %s size %d != %s size %d", + ref->name, ref->size, seq->name, seq->size); +int diffs = 0; +int i; +for (i = 0; i < ref->size; i++) + { + char refBase = toupper(ref->dna[i]); + char seqBase = toupper(seq->dna[i]); + if (refBase != 'N' && seqBase != 'N' && seqBase != refBase) + diffs++; + } +return diffs; +} + +static struct dnaSeq *filterMaxDiff(struct dnaSeq *sequences) +/* If -maxDiff was passed in, remove any sequences with more than that number of differences + * from the reference (ignoring Ns but not IUPAC ambiguous bases). */ +{ +int maxDiff = optionInt("maxDiff", 0); +if (maxDiff > 0) + { + int excludeCount = 0; + struct dnaSeq *ref = sequences; + struct dnaSeq *newList = NULL, *seq, *nextSeq = NULL; + for (seq = sequences; seq != NULL; seq = nextSeq) + { + nextSeq = seq->next; + int diff = countDiffs(ref, seq); + if (diff > maxDiff) + excludeCount++; + else + slAddHead(&newList, seq); + } + slReverse(&newList); + sequences = newList; + verbose(2, "Excluded %d sequences with >%d differences with the reference (%d sequences remaining including reference)\n", + excludeCount, maxDiff, slCount(sequences)); + } +return sequences; +} + static struct dnaSeq *readSequences(char *faFile) /* Read all sequences in faFile. Make sure there are at least 2 sequences and that * all have the same length. If a reference sequence is specified and is not the first * sequence in the file, then move the reference sequence to the head of the list. */ { verbose(2, "Reading sequences from %s\n", faFile); struct dnaSeq *sequences = faReadAllMixed(faFile); int seqCount = slCount(sequences); verbose(2, "Read %d sequences.\n", seqCount); if (seqCount < 2) errAbort("faToVcf: expecting multiple sequences in %s but found only %d.", faFile, seqCount); int seqSize = sequences->size; struct dnaSeq *seq; @@ -131,30 +178,31 @@ { if (sameString(seq->next->name, refName)) { struct dnaSeq *ref = seq->next; seq->next = ref->next; ref->next = sequences; sequences = ref; break; } } if (differentString(sequences->name, refName)) errAbort("Could not find -ref sequence '%s' in %s.", refName, faFile); } sequences = removeExcludedSequences(sequences); +sequences = filterMaxDiff(sequences); return sequences; } static void writeVcfHeader(FILE *outF, char *faFile, char *vcfFile, struct dnaSeq *sequences, boolean includeRef, boolean includeGenotypes) /* Write a simple VCF header with sequence names as genotype columns. */ { fprintf(outF, "##fileformat=VCFv4.2\n" "##reference=%s:%s\n" "##source=faToVcf %s %s\n" //#*** Should document options used. "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", faFile, sequences->name, faFile, vcfFile); if (includeGenotypes) {