4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/hg/cgilib/tests/annoGratorTester.c src/hg/cgilib/tests/annoGratorTester.c index 74e4038..a5e98d8 100644 --- src/hg/cgilib/tests/annoGratorTester.c +++ src/hg/cgilib/tests/annoGratorTester.c @@ -1,516 +1,516 @@ /* annoGratorTester -- exercise anno* lib modules (in kent/src as well as kent/src/hg) */ /* Copyright (C) 2014 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "annoGratorQuery.h" #include "annoGratorGpVar.h" #include "annoStreamBigBed.h" #include "annoStreamDb.h" #include "annoStreamTab.h" #include "annoStreamVcf.h" #include "annoStreamWig.h" #include "annoGrateWigDb.h" #include "annoFormatTab.h" #include "annoFormatVep.h" #include "bigBed.h" #include "dystring.h" #include "genePred.h" #include "hdb.h" #include "knetUdc.h" #include "memalloc.h" #include "pgSnp.h" #include "udc.h" #include "vcf.h" //#*** duplicated from hgVarAnnoGrator... libify me! struct annoAssembly *getAnnoAssembly(char *db) /* Make annoAssembly for db. */ { static struct annoAssembly *aa = NULL; if (aa == NULL) { char *nibOrTwoBitDir = hDbDbNibPath(db); if (nibOrTwoBitDir == NULL) errAbort("Can't find .2bit for db '%s'", db); char twoBitPath[HDB_MAX_PATH_STRING]; safef(twoBitPath, sizeof(twoBitPath), "%s/%s.2bit", nibOrTwoBitDir, db); char *path = hReplaceGbdb(twoBitPath); aa = annoAssemblyNew(db, path); freeMem(path); } return aa; } struct streamerInfo /* Enough info to create a streamer or grator that gets data from sql, file or URL. */ { struct streamerInfo *next; struct annoAssembly *assembly; // Reference assembly name and sequence. char *sqlDb; // If non-NULL, then we are using this SQL database char *tableFileUrl; // If db is non-NULL, table name; else file or URL enum annoRowType type; // Data type (wig or words?) struct asObject *asObj; // not used if we're using a sqlDb table }; struct annoStreamer *streamerFromInfo(struct streamerInfo *info) /* Figure out which constructor to call, call it and return the results. */ { struct annoStreamer *streamer = NULL; if (info->type == arWigVec) streamer = annoStreamWigDbNew(info->sqlDb, info->tableFileUrl, info->assembly, BIGNUM); else if (info->sqlDb != NULL) streamer = annoStreamDbNew(info->sqlDb, info->tableFileUrl, info->assembly, BIGNUM, NULL); else if (info->asObj && asObjectsMatch(info->asObj, vcfAsObj())) { //#*** this is kludgey, should test for .tbi file: boolean looksLikeTabix = endsWith(info->tableFileUrl, ".gz"); streamer = annoStreamVcfNew(info->tableFileUrl, NULL, looksLikeTabix, info->assembly, BIGNUM); } else if (endsWith(info->tableFileUrl, ".bb")) { streamer = annoStreamBigBedNew(info->tableFileUrl, info->assembly, BIGNUM); } else { streamer = annoStreamTabNew(info->tableFileUrl, info->assembly, info->asObj, BIGNUM); } return streamer; } void sourcesFromInfoList(struct streamerInfo *infoList, bool doGpFx, struct annoStreamer **retPrimary, struct annoGrator **retGrators) /* Translate streamerInfo parameters into primary source and list of secondary sources. */ { assert(infoList && retPrimary && retGrators); struct streamerInfo *primaryInfo = infoList; struct streamerInfo *gratorInfoList = infoList->next; struct annoStreamer *primary = streamerFromInfo(primaryInfo); struct annoGrator *gratorList = NULL; struct streamerInfo *grInfo; for (grInfo = gratorInfoList; grInfo != NULL; grInfo = grInfo->next) { struct annoGrator *grator = NULL; if (grInfo->type == arWigVec || grInfo->type == arWigSingle) { if (grInfo->sqlDb == NULL) grator = annoGrateBigWigNew(grInfo->tableFileUrl, grInfo->assembly, agwmAverage); else grator = annoGrateWigDbNew(grInfo->sqlDb, grInfo->tableFileUrl, grInfo->assembly, agwmAverage, BIGNUM); } else { struct annoStreamer *src = streamerFromInfo(grInfo); if (doGpFx && grInfo->asObj && asColumnNamesMatchFirstN(grInfo->asObj, genePredAsObj(), 10)) grator = annoGratorGpVarNew(src); else grator = annoGratorNew(src); } slAddHead(&gratorList, grator); } slReverse(&gratorList); *retPrimary = primary; *retGrators = gratorList; } struct asObject *bigBedAsFromFileName(char *fileName) /* Look up bigBed filename in table and get its internally stored autoSql definition. */ { struct bbiFile *bbi = bigBedFileOpen(fileName); struct asObject *asObj = bigBedAs(bbi); bigBedFileClose(&bbi); return asObj; } void dbToTabOut(struct streamerInfo *infoList, char *outFile, char *chrom, uint start, uint end, bool doGpFx) /* Get data from one or more database tables and print all fields to tab-sep output. */ { struct annoStreamer *primary = NULL; struct annoGrator *gratorList = NULL; sourcesFromInfoList(infoList, doGpFx, &primary, &gratorList); struct annoFormatter *tabOut = annoFormatTabNew(outFile); struct annoGratorQuery *query = annoGratorQueryNew(primary->assembly, primary, gratorList, tabOut); annoGratorQuerySetRegion(query, chrom, start, end); annoGratorQueryExecute(query); annoGratorQueryFree(&query); } void pgSnpDbToTabOut(struct annoAssembly *assembly) // First test: some rows of a pgSnp table { char *sqlDb = assembly->name; struct streamerInfo pgSnpInfo = { NULL, assembly, sqlDb, "pgNA12878", arWords, pgSnpAsObj() }; pgSnpInfo.next = NULL; dbToTabOut(&pgSnpInfo, "stdout", "chr1", 705881, 752721, FALSE); } void pgSnpKgDbToTabOutShort(struct annoAssembly *assembly) // Second test: some rows of a pgSnp table integrated with knownGene { char *sqlDb = assembly->name; struct streamerInfo pgSnpInfo = { NULL, assembly, sqlDb, "pgNA12878", arWords, pgSnpAsObj() }; struct streamerInfo kgInfo = { NULL, assembly, sqlDb, "knownGene", arWords, asParseFile("../../lib/knownGene.as") }; pgSnpInfo.next = &kgInfo; dbToTabOut(&pgSnpInfo, "stdout", "chr1", 705881, 752721, FALSE); } void pgSnpKgDbToTabOutLong(struct annoAssembly *assembly) // Third test: all rows of a pgSnp table integrated with knownGene { char *sqlDb = assembly->name; struct streamerInfo pgSnpInfo = { NULL, assembly, sqlDb, "pgNA12878", arWords, pgSnpAsObj() }; dbToTabOut(&pgSnpInfo, "stdout", NULL, 0, 0, FALSE); } void snpConsDbToTabOutShort(struct annoAssembly *assembly) // Fourth test: some rows of snp135 integrated with phyloP scores { char *sqlDb = assembly->name; struct streamerInfo snp135Info = { NULL, assembly, sqlDb, "snp135", arWords, asParseFile("../../lib/snp132Ext.as") }; struct streamerInfo phyloPInfo = { NULL, assembly, sqlDb, "phyloP46wayPlacental", arWigSingle, NULL }; snp135Info.next = &phyloPInfo; dbToTabOut(&snp135Info, "stdout", "chr1", 737224, 738475, FALSE); } void snpConsDbToTabOutLong(struct annoAssembly *assembly) // Long-running!: All rows of snp135 integrated with phyloP scores { char *sqlDb = assembly->name; struct streamerInfo snp135Info = { NULL, assembly, sqlDb, "snp135", arWords, asParseFile("../../lib/snp132Ext.as") }; struct streamerInfo phyloPInfo = { NULL, assembly, sqlDb, "phyloP46wayPlacental", arWigSingle, NULL }; snp135Info.next = &phyloPInfo; dbToTabOut(&snp135Info, "stdout", NULL, 0, 0, FALSE); } void vcfEx1(struct annoAssembly *assembly) // Fifth test: VCF with genotypes { struct streamerInfo vcfEx1 = { NULL, assembly, NULL, "http://genome.ucsc.edu/goldenPath/help/examples/vcfExample.vcf.gz", arWords, vcfAsObj() }; dbToTabOut(&vcfEx1, "stdout", NULL, 0, 0, FALSE); } void vcfEx2(struct annoAssembly *assembly) // VCF with no genotypes { struct streamerInfo vcfEx2 = { NULL, assembly, NULL, "http://genome.ucsc.edu/goldenPath/help/examples/vcfExampleTwo.vcf", arWords, vcfAsObj() }; dbToTabOut(&vcfEx2, "stdout", NULL, 0, 0, FALSE); } void pgSnpKgDbToGpFx(struct annoAssembly *assembly) // pgSnp + knownGene + gpFx = annotated variants { char *sqlDb = assembly->name; struct streamerInfo pg2SnpInfo = { NULL, assembly, NULL, "input/annoGrator/pgForTestingGpFx.pgSnp.tab", arWords, pgSnpAsObj() }; struct streamerInfo kgInfo = { NULL, assembly, sqlDb, "knownGene", arWords, asParseFile("../../lib/knownGene.as") }; pg2SnpInfo.next = &kgInfo; dbToTabOut(&pg2SnpInfo, "stdout", NULL, 0, 0, TRUE); /* FIXME // 3base insertion CDS - chr3:124,646,699-124,646,718 dbToTabOut(&pg2SnpInfo, "stdout", "chr3",124646699,124646718, TRUE); */ } void bigBedToTabOut(struct annoAssembly *assembly) // like bigBedToBed { struct streamerInfo bigBedInfo = { NULL, assembly, NULL, "http://genome.ucsc.edu/goldenPath/help/examples/bigBedExample.bb", arWords, NULL }; dbToTabOut(&bigBedInfo, "stdout", "chr21", 34716800, 34733700, FALSE); } void snpBigWigToTabOut(struct annoAssembly *assembly) // text + scores { char *sqlDb = assembly->name; struct streamerInfo snp135Info = { NULL, assembly, sqlDb, "snp135", arWords, asParseFile("../../lib/snp132Ext.as") }; struct streamerInfo bigWigInfo = { NULL, assembly, NULL, "http://genome.ucsc.edu/goldenPath/help/examples/bigWigExample.bw", arWigSingle, NULL }; snp135Info.next = &bigWigInfo; dbToTabOut(&snp135Info, "stdout", "chr21", 34716800, 34733700, FALSE); } void vepOut(struct annoAssembly *assembly) // variants + genes + gpFx + snps + annoFormatVep = annotated variants in VEP format { char *sqlDb = assembly->name; struct streamerInfo vepSamplePgSnp = { NULL, assembly, NULL, "input/annoGrator/vepSample.pgSnp.tab", arWords, asParseFile("../../lib/pgSnp.as") }; struct streamerInfo ensGInfo = { NULL, assembly, sqlDb, "ensGene", arWords, asParseFile("../../lib/genePredExt.as") }; struct streamerInfo snpInfo = { NULL, assembly, sqlDb, "snp135", arWords, asParseFile("../../lib/snp132Ext.as") }; vepSamplePgSnp.next = &ensGInfo; ensGInfo.next = &snpInfo; // Instead of dbToTabOut, we need to make a VEP config data structure and // use it to create an annoFormatVep. struct streamerInfo *primaryInfo = &vepSamplePgSnp; struct annoStreamer *primary = NULL; struct annoGrator *gratorList = NULL; sourcesFromInfoList(primaryInfo, TRUE, &primary, &gratorList); struct annoStreamer *gpVarSource = (struct annoStreamer *)gratorList; struct annoStreamer *snpSource = gpVarSource->next; struct annoFormatter *vepOut = annoFormatVepNew("stdout", FALSE, primary, "vepSamplePgSnp", gpVarSource, "UCSC Genes ...", snpSource, "just dbSNP 135", assembly); struct annoGratorQuery *query = annoGratorQueryNew(assembly, primary, gratorList, vepOut); annoGratorQuerySetRegion(query, "chr1", 876900, 886920); annoGratorQueryExecute(query); annoGratorQuerySetRegion(query, "chr5", 135530, 145535); annoGratorQueryExecute(query); annoGratorQueryFree(&query); } void vepOutIndelTrim(struct annoAssembly *assembly) // variants with VCF's awful indel coordinates + ... = VEP { char *sqlDb = assembly->name; struct streamerInfo indelTrimVcf = { NULL, assembly, NULL, "input/annoGrator/indelTrim.vcf", arWords, vcfAsObj() }; struct streamerInfo gencodeInfo = { NULL, assembly, sqlDb, "wgEncodeGencodeBasicV19", arWords, asParseFile("../../lib/genePredExt.as") }; indelTrimVcf.next = &gencodeInfo; // Instead of dbToTabOut, we need to make a VEP config data structure and // use it to create an annoFormatVep. struct streamerInfo *primaryInfo = &indelTrimVcf; struct annoStreamer *primary = NULL; struct annoGrator *gratorList = NULL; sourcesFromInfoList(primaryInfo, TRUE, &primary, &gratorList); struct annoStreamer *gpVarSource = (struct annoStreamer *)gratorList; struct annoFormatter *vepOut = annoFormatVepNew("stdout", FALSE, primary, "indelTrimVcf", gpVarSource, "EnsemblGenes ...", NULL, NULL, assembly); struct annoGratorQuery *query = annoGratorQueryNew(assembly, primary, gratorList, vepOut); annoGratorQuerySetRegion(query, "chr11", 0, 0); annoGratorQueryExecute(query); annoGratorQueryFree(&query); } void gpFx(struct annoAssembly *assembly) // Add in dbNsfp data for missense variants { char *sqlDb = assembly->name; struct streamerInfo variants = { NULL, assembly, NULL, "input/annoGrator/moreVariants.pgSnp.tab", arWords, asParseFile("../../lib/pgSnp.as") }; struct streamerInfo kgInfo = { NULL, assembly, sqlDb, "knownGene", arWords, asParseFile("../../lib/knownGene.as") }; struct streamerInfo snpInfo = { NULL, assembly, sqlDb, "snp137", arWords, asParseFile("../../lib/snp132Ext.as") }; struct asObject *dbNsfpSeqChangeAs = bigBedAsFromFileName("/gbdb/hg19/dbNsfp/dbNsfpSeqChange.bb"); struct streamerInfo dbNsfpSeqChange = { NULL, assembly, NULL, "/gbdb/hg19/dbNsfp/dbNsfpSeqChange.bb", arWords, dbNsfpSeqChangeAs }; struct asObject *dbNsfpSiftAs = bigBedAsFromFileName("/gbdb/hg19/dbNsfp/dbNsfpSift.bb"); struct streamerInfo dbNsfpSift = { NULL, assembly, NULL, "/gbdb/hg19/dbNsfp/dbNsfpSift.bb", arWords, dbNsfpSiftAs }; variants.next = &kgInfo; kgInfo.next = &snpInfo; snpInfo.next = &dbNsfpSeqChange; dbNsfpSeqChange.next = &dbNsfpSift; // Instead of dbToTabOut, we need to make a VEP config data structure and // use it to create an annoFormatVep. struct streamerInfo *primaryInfo = &variants; struct annoStreamer *primary = NULL; struct annoGrator *gratorList = NULL; sourcesFromInfoList(primaryInfo, TRUE, &primary, &gratorList); struct annoStreamer *gpVarSource = (struct annoStreamer *)gratorList; struct annoStreamer *snpSource = gpVarSource->next; struct annoStreamer *dbNsfpSource = snpSource->next->next; struct annoFormatter *vepOut = annoFormatVepNew("stdout", FALSE, primary, "some more variants", gpVarSource, "UCSC Genes of course", snpSource, "now snp137.", assembly); annoFormatVepAddExtraItem(vepOut, dbNsfpSource, "SIFT", "SIFT score from dbNSFP", "", FALSE); struct annoGratorQuery *query = annoGratorQueryNew(assembly, primary, gratorList, vepOut); annoGratorQuerySetRegion(query, "chr19", 45405960, 45419476); annoGratorQueryExecute(query); annoGratorQueryFree(&query); } void doInsertionsRegions(struct streamerInfo *infoList) /* Perform a series of region queries on infoList for the 'insertions' test. */ { // Entire range of features in both primary and secondary: puts("# region: chr1 0 500"); dbToTabOut(infoList, "stdout", "chr1", 0, 500, FALSE); // Region to the left of insLeft: puts("# region: chr1 100 200"); dbToTabOut(infoList, "stdout", "chr1", 100, 200, FALSE); // Region to the right of insLeft and left of insRight: puts("# region: chr1 200 300"); dbToTabOut(infoList, "stdout", "chr1", 200, 300, FALSE); // Region to the right of insRight and left of pi (and insPi): puts("# region: chr1 300 400"); dbToTabOut(infoList, "stdout", "chr1", 300, 400, FALSE); // Region to the right of pi (and insPi): puts("# region: chr1 400 500"); dbToTabOut(infoList, "stdout", "chr1", 400, 500, FALSE); } void insertions(struct annoAssembly *assembly) // Test corner cases of intersection of zero-length insertions with regular items (length > 0) // and with different search regions to make sure that insertions at edges are included. { struct asObject *bed4AS = asParseFile("../../lib/bed.as"); struct streamerInfo primary = { NULL, assembly, NULL, "input/annoGrator/insertionsPrimary.bed", arWords, bed4AS }; struct streamerInfo secondary = { NULL, assembly, NULL, "input/annoGrator/insertionsSecondary.bed", arWords, bed4AS }; primary.next = &secondary; // Plain BED files puts("# BED files"); doInsertionsRegions(&primary); // BigBed versions of same files puts("# BigBed files"); primary.tableFileUrl = "input/annoGrator/insertionsPrimary.bb"; secondary.tableFileUrl = "input/annoGrator/insertionsSecondary.bb"; doInsertionsRegions(&primary); // Mysql tables from BED files puts("# BED tables"); primary.sqlDb = secondary.sqlDb = "test"; primary.tableFileUrl = "insertionsPrimary"; secondary.tableFileUrl = "insertionsSecondary"; doInsertionsRegions(&primary); // Uncompressed VCF puts("# VCF files (uncompressed)"); primary.sqlDb = secondary.sqlDb = NULL; primary.tableFileUrl = "input/annoGrator/insertionsPrimary.vcf"; secondary.tableFileUrl = "input/annoGrator/insertionsSecondary.vcf"; primary.asObj = secondary.asObj = vcfAsObj(); doInsertionsRegions(&primary); // VCF+tabix puts("# VCF files (tabix)"); primary.tableFileUrl = "input/annoGrator/insertionsPrimary.vcf.gz"; secondary.tableFileUrl = "input/annoGrator/insertionsSecondary.vcf.gz"; doInsertionsRegions(&primary); } struct testSpec { char *name; void (*TestFunc)(struct annoAssembly *assembly); }; static const struct testSpec testSpecList[] = { { "pgSnpDbToTabOut", pgSnpDbToTabOut }, { "pgSnpKgDbToTabOutShort", pgSnpKgDbToTabOutShort }, { "pgSnpKgDbToTabOutLong", pgSnpKgDbToTabOutLong }, { "pgSnpKgDbToGpFx", pgSnpKgDbToGpFx }, { "snpConsDbToTabOutShort", snpConsDbToTabOutShort }, { "snpConsDbToTabOutLong", snpConsDbToTabOutLong }, { "vcfEx1", vcfEx1 }, { "vcfEx2", vcfEx2 }, { "bigBedToTabOut", bigBedToTabOut }, { "snpBigWigToTabOut", snpBigWigToTabOut }, { "vepOut", vepOut }, { "vepOutIndelTrim", vepOutIndelTrim }, { "gpFx", gpFx }, { "insertions", insertions }, { NULL, NULL } }; struct slName *makeTestNameList() // Extract just the names of the tests into a list. { struct slName *testNameList = NULL; int i; for (i = 0; testSpecList[i].name != NULL; i++) { slAddHead(&testNameList, slNameNew(testSpecList[i].name)); } slReverse(&testNameList); return testNameList; } char *makeTestNameUsage(struct slName *testNameList) // Make a user-friendly listing of valid test names { struct dyString *dy = dyStringCreate("testName can be one of the following:\n"); struct slName *testName; for (testName = testNameList; testName != NULL; testName = testName->next) { dyStringPrintf(dy, " %s\n", testName->name); } return dyStringCannibalize(&dy); } void usage(struct slName *testNameList) /* explain usage and exit */ { errAbort( "annoGratorTester - test program for anno* lib modules\n\n" "usage:\n" " annoGratorTester db testName\n" // "options:\n" "%s", makeTestNameUsage(testNameList) ); } static struct optionSpec optionSpecs[] = { {NULL, 0} }; int main(int argc, char *argv[]) { // Check args optionInit(&argc, argv, optionSpecs); struct slName *testNameList = makeTestNameList(); if (argc != 3) usage(testNameList); char *db = argv[1]; char *testName = argv[2]; if (! slNameFind(testNameList, testName)) { errAbort("Unrecognized test name '%s'\n" "%s", argv[2], makeTestNameUsage(testNameList)); } // Set up environment pushCarefulMemHandler(LIMIT_2or6GB); if (udcCacheTimeout() < 300) udcSetCacheTimeout(300); udcSetDefaultDir("./udcCache"); knetUdcInstall(); // Run the specified test struct annoAssembly *assembly = getAnnoAssembly(db); int i; for (i = 0; testSpecList[i].name != NULL; i++) { struct testSpec testSpec = testSpecList[i]; if (sameString(testName, testSpec.name)) testSpec.TestFunc(assembly); } return 0; }