13db5365510728c1e96d96110a6fe86065949b8a chmalee Thu Sep 24 16:17:54 2020 -0700 Make vcfToBed way more basic and don't do any special parsing of VCF specific stuff, just chop out requested fields, refs #25010 diff --git src/inc/vcf.h src/inc/vcf.h index bad2d5d..60af930 100644 --- src/inc/vcf.h +++ src/inc/vcf.h @@ -250,30 +250,36 @@ // using vcfFileFlushRecords to abandon previously read records and free // the associated memory. Very useful when reading an entire file in batches. #define vcfFileLm(vcff) ((vcff)->reusePool ? (vcff)->reusePool : (vcff)->pool->lm) void vcfFileFlushRecords(struct vcfFile *vcff); // Abandons all previously read vcff->records and flushes the reuse pool (if it exists). // USE WITH CAUTION. All previously allocated record pointers are now invalid. struct vcfRecord *vcfNextRecord(struct vcfFile *vcff); /* Parse the words in the next line from vcff into a vcfRecord. Return NULL at end of file. * Note: this does not store record in vcff->records! */ struct vcfRecord *vcfRecordFromRow(struct vcfFile *vcff, char **words); /* Parse words from a VCF data line into a VCF record structure. */ +boolean allelesHavePaddingBase(char **alleles, int alleleCount); +/* Examine alleles to see if they either a) all start with the same base or + * b) include a symbolic or 0-length allele. In either of those cases, there + * must be an initial padding base that we'll need to trim from non-symbolic + * alleles. */ + unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec); /* For indels, VCF includes the left neighboring base; for example, if the alleles are * AA/- following a G base, then the VCF record will start one base to the left and have * "GAA" and "G" as the alleles. That is not nice for display for two reasons: * 1. Indels appear one base wider than their dbSNP entries. * 2. In pgSnp display mode, the two alleles are always the same color. 
* However, for hgTracks' mapBox we need the correct chromStart for identifying the * record in hgc -- so return the original chromStart. */ unsigned int vcfRecordTrimAllelesRight(struct vcfRecord *rec); /* Some tools output indels with extra bases to the right, for example ref=ACC, alt=ACCC * which should be ref=A, alt=AC. When the extra bases make the variant extend from an * intron (or gap) into an exon, it can cause a false appearance of a frameshift. * To avoid this, when all alleles have identical base(s) at the end, trim all of them, * and update rec->chromEnd.