f78d6e76501646c8b8cc966103bafca896322509 angie Mon Oct 7 13:54:42 2013 -0700 Work in progress for #11460 (paste/upload variant input options...):adding an option for user to paste/upload variant identifiers which will be translated into a sorted list of vcfRecords. Currently we recognize only rs# IDs. I was considering adding dbVar IDs, but those could come from multiple sources (DGV, ClinVar, ISCA) so I'm not sure. Treating all symbolic/named alleles as deletions... non-ideal, but fortunately those are a small minority in dbSNP. Next: recognize HGVS IDs. The grander vision of #11460 includes accepting VEP input format and VCF, but I think those should be new SELECT options so we don't get into quagmire of guessing format. diff --git src/lib/vcf.c src/lib/vcf.c index d017d1e..59c941b 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -398,31 +398,31 @@ expectColumnName(vcff, "FILTER", words, 6); expectColumnName(vcff, "INFO", words, 7); if (wordCount > 8) { expectColumnName(vcff, "FORMAT", words, 8); if (wordCount < 10) vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?"); vcff->genotypeCount = (wordCount - 9); vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; for (i = 9; i < wordCount; i++) vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]); } } -static struct vcfFile *vcfFileNew() +struct vcfFile *vcfFileNew() /* Return a new, empty vcfFile object. */ { struct vcfFile *vcff = NULL; AllocVar(vcff); vcff->pool = hashNew(0); vcff->reusePool = NULL; // Must explicitly request a separate record pool return vcff; } void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize) // Creates a separate memory pool for records. Establishing this pool allows // using vcfFileFlushRecords to abandon previously read records and free // the associated memory. Very useful when reading an entire file in batches. { assert(vcff->reusePool == NULL); // don't duplicate this @@ -688,58 +688,75 @@ { char *words[VCF_MAX_COLUMNS]; int wordCount; if ((wordCount = lineFileChop(vcff->lf, words)) <= 0) return NULL; int expected = 8; if (vcff->genotypeCount > 0) expected = 9 + vcff->genotypeCount; lineFileExpectWords(vcff->lf, expected, wordCount); return vcfRecordFromRow(vcff, words); } unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec) /* For indels, VCF includes the left neighboring base; for example, if the alleles are * AA/- following a G base, then the VCF record will start one base to the left and have - * "GAA" and "G" as the alleles. That is not nice for display for two reasons: + * "GAA" and "G" as the alleles. Also, if any alt allele is symbolic (e.g. ) then + * the ref allele must have a padding base. + * That is not nice for display for two reasons: * 1. Indels appear one base wider than their dbSNP entries. * 2. In pgSnp display mode, the two alleles are always the same color. * However, for hgTracks' mapBox we need the correct chromStart for identifying the * record in hgc -- so return the original chromStart. */ { unsigned int chromStartOrig = rec->chromStart; struct vcfFile *vcff = rec->file; if (rec->alleleCount > 1) { - boolean allSameFirstBase = TRUE; - char firstBase = rec->alleles[0][0]; + boolean needPaddingBase = TRUE; + char firstBase = '\0'; + if (isAllNt(rec->alleles[0], strlen(rec->alleles[0]))) + firstBase = rec->alleles[0][0]; int i; for (i = 1; i < rec->alleleCount; i++) + { + if (isAllNt(rec->alleles[i], strlen(rec->alleles[i]))) + { + if (firstBase == '\0') + firstBase = rec->alleles[i][0]; if (rec->alleles[i][0] != firstBase) + // Different first base implies unpadded alleles. + needPaddingBase = FALSE; + } + else { - allSameFirstBase = FALSE; + // Symbolic ALT allele: REF must have the padding base. + needPaddingBase = TRUE; break; } - if (allSameFirstBase) + } + if (needPaddingBase) { rec->chromStart++; for (i = 0; i < rec->alleleCount; i++) { if (rec->alleles[i][1] == '\0') rec->alleles[i] = vcfFilePooledStr(vcff, "-"); - else + else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i]))) rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1); + else // don't trim first character of symbolic allele + rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]); } } } return chromStartOrig; } static struct vcfRecord *vcfParseData(struct vcfFile *vcff, int maxRecords) // Given a vcfFile into which the header has been parsed, and whose // lineFile is positioned at the beginning of a data row, parse and // return all data rows from lineFile. { if (vcff == NULL) return NULL; int recCount = 0; struct vcfRecord *records = NULL;