4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/hg/encode3/encodeDataWarehouse/edwMakeContaminationQa/edwMakeContaminationQa.c src/hg/encode3/encodeDataWarehouse/edwMakeContaminationQa/edwMakeContaminationQa.c index 8c59969..bff2812 100644 --- src/hg/encode3/encodeDataWarehouse/edwMakeContaminationQa/edwMakeContaminationQa.c +++ src/hg/encode3/encodeDataWarehouse/edwMakeContaminationQa/edwMakeContaminationQa.c @@ -1,198 +1,198 @@ /* edwMakeContaminationQa - Screen for contaminants by aligning against contaminant genomes.. */ /* Copyright (C) 2013 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "options.h" #include "sqlNum.h" #include "jksql.h" #include "basicBed.h" #include "genomeRangeTree.h" #include "correlate.h" #include "hmmstats.h" #include "portable.h" #include "encodeDataWarehouse.h" #include "edwLib.h" boolean keepTemp = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "edwMakeContaminationQa - Screen for contaminants by aligning against contaminant genomes.\n" "usage:\n" " edwMakeContaminationQa startId endId\n" "where startId and endId are id's in the edwFile table\n" "options:\n" " -keepTemp\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"keepTemp", OPTION_BOOLEAN}, {NULL, 0}, }; struct edwFile *edwFileLoadIdRange(struct sqlConnection *conn, long long startId, long long endId) /* Return list of all files in given id range */ { char query[256]; sqlSafef(query, sizeof(query), "select * from edwFile where id>=%lld and id<=%lld and endUploadTime != 0 " "and updateTime != 0 and deprecated = ''", startId, endId); return edwFileLoadByQuery(conn, query); } void alignFastqMakeBed(struct edwFile *ef, struct edwAssembly *assembly, char *fastqPath, struct edwValidFile *vf, FILE *bedF) /* Take a sample fastq and run bwa on it, and then convert that file to a bed. * Update vf->mapRatio and related fields. */ { edwAlignFastqMakeBed(ef, assembly, fastqPath, vf, bedF, &vf->mapRatio, &vf->depth, &vf->sampleCoverage, &vf->uniqueMapRatio); } #define FASTQ_SAMPLE_SIZE 100000 int edwQaContamMade(struct sqlConnection *conn, long long fileId, int targetId) /* Return number of times have fileId paired with targetId in edwQaContam table. */ { char query[256]; sqlSafef(query, sizeof(query), "select count(*) from edwQaContam where fileId=%lld and qaContamTargetId=%d", fileId, targetId); return sqlQuickNum(conn, query); } struct edwQaContamTarget *getContamTargets(struct sqlConnection *conn, struct edwFile *ef, struct edwValidFile *vf) /* Get list of contamination targets for file - basically all targets that aren't in same * taxon as self. */ { assert(vf->ucscDb != NULL); struct edwAssembly *origAsm = edwAssemblyForUcscDb(conn, vf->ucscDb); assert(origAsm != NULL); char query[256]; sqlSafef(query, sizeof(query), "select edwQaContamTarget.* from edwQaContamTarget,edwAssembly " "where edwQaContamTarget.assemblyId = edwAssembly.id " " and edwAssembly.taxon != %d", origAsm->taxon); struct edwQaContamTarget *targetList = edwQaContamTargetLoadByQuery(conn, query); edwAssemblyFree(&origAsm); return targetList; } void screenFastqForContaminants(struct sqlConnection *conn, struct edwFile *ef, struct edwValidFile *vf) /* The ef/vf point to same file, which is fastq format. Set alignments up for a sample against all * contamination targets. */ { /* Get target list and see if we have any work to do. */ struct edwQaContamTarget *target, *targetList; targetList = getContamTargets(conn, ef, vf); boolean needScreen = FALSE; for (target = targetList; target != NULL; target = target->next) { if (edwQaContamMade(conn, ef->id, target->id) <= 0) { needScreen = TRUE; break; } } if (needScreen) { verbose(1, "screenFastqForContaminants(%u(%s))\n", ef->id, ef->submitFileName); /* Get fastq record. */ struct edwFastqFile *fqf = edwFastqFileFromFileId(conn, ef->id); if (fqf == NULL) errAbort("No edwFastqFile record for file id %lld", (long long)ef->id); /* Create downsampled fastq in temp directory - downsampled more than default even. */ char sampleFastqName[PATH_LEN]; edwMakeTempFastqSample(fqf->sampleFileName, FASTQ_SAMPLE_SIZE, sampleFastqName); verbose(1, "downsampled %s into %s\n", vf->licensePlate, sampleFastqName); for (target = targetList; target != NULL; target = target->next) { /* Get assembly associated with target */ int assemblyId = target->assemblyId; char query[512]; sqlSafef(query, sizeof(query), "select * from edwAssembly where id=%d", assemblyId); struct edwAssembly *newAsm = edwAssemblyLoadByQuery(conn, query); if (newAsm == NULL) errAbort("warehouse edwQaContamTarget %d not found", assemblyId); /* If we don't already have a match, do work to create contam record. */ int matchCount = edwQaContamMade(conn, ef->id, target->id); if (matchCount <= 0) { /* We run the bed-file maker, just for side effect calcs. */ double mapRatio = 0, depth = 0, sampleCoverage = 0, uniqueMapRatio; edwAlignFastqMakeBed(ef, newAsm, sampleFastqName, vf, NULL, &mapRatio, &depth, &sampleCoverage, &uniqueMapRatio); verbose(1, "%s mapRatio %g, depth %g, sampleCoverage %g\n", newAsm->name, mapRatio, depth, sampleCoverage); struct edwQaContam contam = {.fileId=ef->id, .qaContamTargetId=target->id, .mapRatio = mapRatio}; edwQaContamSaveToDb(conn, &contam, "edwQaContam", 256); } edwAssemblyFree(&newAsm); } edwQaContamTargetFreeList(&targetList); if (keepTemp) verbose(1, "%s\n", sampleFastqName); else remove(sampleFastqName); edwFastqFileFree(&fqf); } } void doContaminationQa(struct sqlConnection *conn, struct edwFile *ef) /* Try and do contamination level QA - mostly mapping fastq files to other * genomes. */ { /* Get validated file info. If not validated we don't bother. */ struct edwValidFile *vf = edwValidFileFromFileId(conn, ef->id); if (vf == NULL) return; /* We only work on fastq. */ if (!sameString(vf->format, "fastq")) return; screenFastqForContaminants(conn, ef, vf); } void edwMakeContaminationQa(int startId, int endId) /* edwMakeContaminationQa - Screen for contaminants by aligning against contaminant genomes.. */ { /* Make list with all files in ID range */ struct sqlConnection *conn = edwConnectReadWrite(); struct edwFile *ef, *efList = edwFileLoadIdRange(conn, startId, endId); for (ef = efList; ef != NULL; ef = ef->next) { doContaminationQa(conn, ef); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); keepTemp = optionExists("keepTemp"); edwMakeContaminationQa(sqlUnsigned(argv[1]), sqlUnsigned(argv[2])); return 0; }