f78d6e76501646c8b8cc966103bafca896322509 angie Mon Oct 7 13:54:42 2013 -0700 Work in progress for #11460 (paste/upload variant input options...): adding an option for the user to paste/upload variant identifiers which will be translated into a sorted list of vcfRecords. Currently we recognize only rs# IDs. I was considering adding dbVar IDs, but those could come from multiple sources (DGV, ClinVar, ISCA) so I'm not sure. Treating all symbolic/named alleles as deletions... non-ideal, but fortunately those are a small minority in dbSNP. Next: recognize HGVS IDs. The grander vision of #11460 includes accepting VEP input format and VCF, but I think those should be new SELECT options so we don't get into a quagmire of guessing the format. diff --git src/inc/vcf.h src/inc/vcf.h index ca3d942..20f616a 100644 --- src/inc/vcf.h +++ src/inc/vcf.h @@ -174,30 +174,33 @@ break; case vcfInfoCharacter: fprintf(f, "%c", datum.datChar); break; case vcfInfoString: fprintf(f, "%s", datum.datString); break; default: errAbort("vcfPrintDatum: Unrecognized type %d", type); break; } } #define VCF_IGNORE_ERRS (INT_MAX - 1) +struct vcfFile *vcfFileNew(); +/* Return a new, empty vcfFile object. */ + struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, int maxRecords, boolean parseAll); /* Open fileOrUrl and parse VCF header; return NULL if unable. * If parseAll, then read in all lines, parse and store in * vcff->records; if maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. */ struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end, int maxErr, int maxRecords); /* Open a VCF file that has been compressed and indexed by tabix and * parse VCF header, or return NULL if unable. If chrom is non-NULL, * seek to the position range and parse all lines in range into * vcff->records. 
If maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. */ @@ -227,30 +230,33 @@ /* Parse the words in the next line from vcff into a vcfRecord. Return NULL at end of file. * Note: this does not store record in vcff->records! */ struct vcfRecord *vcfRecordFromRow(struct vcfFile *vcff, char **words); /* Parse words from a VCF data line into a VCF record structure. */ unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec); /* For indels, VCF includes the left neighboring base; for example, if the alleles are * AA/- following a G base, then the VCF record will start one base to the left and have * "GAA" and "G" as the alleles. That is not nice for display for two reasons: * 1. Indels appear one base wider than their dbSNP entries. * 2. In pgSnp display mode, the two alleles are always the same color. * However, for hgTracks' mapBox we need the correct chromStart for identifying the * record in hgc -- so return the original chromStart. */ +int vcfRecordCmp(const void *va, const void *vb); +/* Compare to sort based on position. */ + void vcfFileFree(struct vcfFile **vcffPtr); /* Free a vcfFile object. */ const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId); /* Return all records with name=variantId, or NULL if not found. */ const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key); /* Find an INFO element, or NULL. */ struct vcfInfoDef *vcfInfoDefForKey(struct vcfFile *vcff, const char *key); /* Return infoDef for key, or NULL if it wasn't specified in the header or VCF spec. */ void vcfParseGenotypes(struct vcfRecord *record); /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[]. * This destroys genotypesUnparsedStrings. */