4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/hg/encode3/encodeDataWarehouse/edwMakePairedEndQa/edwMakePairedEndQa.c src/hg/encode3/encodeDataWarehouse/edwMakePairedEndQa/edwMakePairedEndQa.c index ede2b9e..736351c 100644 --- src/hg/encode3/encodeDataWarehouse/edwMakePairedEndQa/edwMakePairedEndQa.c +++ src/hg/encode3/encodeDataWarehouse/edwMakePairedEndQa/edwMakePairedEndQa.c @@ -1,306 +1,306 @@ /* edwMakePairedEndQa - Do alignments of paired-end fastq files and calculate distrubution of * insert size. */ /* Copyright (C) 2014 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "jksql.h" #include "portable.h" #include "obscure.h" #include "encodeDataWarehouse.h" #include "edwLib.h" #include "raToStruct.h" #include "ra.h" int maxInsert = 1000; void usage() /* Explain usage and exit. */ { errAbort( "edwMakePairedEndQa - Do alignments of paired-end fastq files and calculate distrubution of \n" "insert size.\n" "usage:\n" " edwMakePairedEndQa startId endId\n" "options:\n" " -maxInsert=N - maximum allowed insert size, default %d\n" , maxInsert ); } /* Command line validation table. */ static struct optionSpec options[] = { {"maxInsert", OPTION_INT}, {NULL, 0}, }; /**** - Start raToStructGen generated code - ****/ struct raToStructReader *edwQaPairedEndFastqRaReader() /* Make a raToStructReader for edwQaPairedEndFastq */ { static char *fields[] = { "fileId1", "concordance", "distanceMean", "distanceStd", "distanceMin", "distanceMax", }; static char *requiredFields[] = { "concordance", "distanceMean", "distanceStd", "distanceMin", "distanceMax", }; return raToStructReaderNew("edwQaPairedEndFastq", ArraySize(fields), fields, ArraySize(requiredFields), requiredFields); } struct edwQaPairedEndFastq *edwQaPairedEndFastqFromNextRa(struct lineFile *lf, struct raToStructReader *reader) /* Return next stanza put into an edwQaPairedEndFastq. */ { enum fields { fileId1Field, concordanceField, distanceMeanField, distanceStdField, distanceMinField, distanceMaxField, }; if (!raSkipLeadingEmptyLines(lf, NULL)) return NULL; struct edwQaPairedEndFastq *el; AllocVar(el); bool *fieldsObserved = reader->fieldsObserved; bzero(fieldsObserved, reader->fieldCount); char *tag, *val; while (raNextTagVal(lf, &tag, &val, NULL)) { struct hashEl *hel = hashLookup(reader->fieldIds, tag); if (hel != NULL) { int id = ptToInt(hel->val); if (fieldsObserved[id]) errAbort("Duplicate tag %s line %d of %s\n", tag, lf->lineIx, lf->fileName); fieldsObserved[id] = TRUE; switch (id) { case fileId1Field: { el->fileId1 = sqlUnsigned(val); break; } case concordanceField: { el->concordance = sqlDouble(val); break; } case distanceMeanField: { el->distanceMean = sqlDouble(val); break; } case distanceStdField: { el->distanceStd = sqlDouble(val); break; } case distanceMinField: { el->distanceMin = sqlDouble(val); break; } case distanceMaxField: { el->distanceMax = sqlDouble(val); break; } default: internalErr(); break; } } } raToStructReaderCheckRequiredFields(reader, lf); return el; } struct edwQaPairedEndFastq *edwQaPairedEndFastqLoadRa(char *fileName) /* Return list of all edwQaPairedEndFastq in ra file. */ { struct raToStructReader *reader = edwQaPairedEndFastqRaReader(); struct lineFile *lf = lineFileOpen(fileName, TRUE); struct edwQaPairedEndFastq *el, *list = NULL; while ((el = edwQaPairedEndFastqFromNextRa(lf, reader)) != NULL) slAddHead(&list, el); slReverse(&list); lineFileClose(&lf); raToStructReaderFree(&reader); return list; } struct edwQaPairedEndFastq *edwQaPairedEndFastqOneFromRa(char *fileName) /* Return edwQaPairedEndFastq in file and insist there be exactly one record. */ { struct edwQaPairedEndFastq *one = edwQaPairedEndFastqLoadRa(fileName); if (one == NULL) errAbort("No data in %s", fileName); if (one->next != NULL) errAbort("Multiple records in %s", fileName); return one; } /**** - End raToStructGen generated code - ****/ #define FASTQ_SAMPLE_SIZE 10000 void makeTmpSai(struct sqlConnection *conn, struct edwValidFile *vf, char *genoFile, char **retSampleFile, char **retSaiFile) /* Given a fastq file, make a subsample of it 100k reads long and align it with * bwa producing a sai file of given name. */ { /* Get fastq record */ long long fileId = vf->fileId; struct edwFastqFile *fqf = edwFastqFileFromFileId(conn, fileId); if (fqf == NULL) errAbort("No edwFastqFile record for file id %lld", fileId); /* Create downsampled fastq in temp directory - downsampled more than default even. */ char sampleFastqName[PATH_LEN]; edwMakeTempFastqSample(fqf->sampleFileName, FASTQ_SAMPLE_SIZE, sampleFastqName); verbose(1, "downsampled %s into %s\n", vf->licensePlate, sampleFastqName); /* Do alignment */ char cmd[3*PATH_LEN]; char *saiName = cloneString(rTempName(edwTempDir(), "edwPairSample", ".sai")); safef(cmd, sizeof(cmd), "bwa aln -t 3 %s %s > %s", genoFile, sampleFastqName, saiName); mustSystem(cmd); /* Save return variables, clean up, and go home. */ *retSampleFile = cloneString(sampleFastqName); *retSaiFile = saiName; edwFastqFileFree(&fqf); } void pairedEndQa(struct sqlConnection *conn, struct edwFile *ef, struct edwValidFile *vf) /* Look for other end, do a pairwise alignment, and save results in database. */ { verbose(2, "pairedEndQa on %u %s %s\n", ef->id, ef->edwFileName, ef->submitFileName); /* Get other end, return if not found. */ struct edwValidFile *otherVf = edwOppositePairedEnd(conn, vf); if (otherVf == NULL) return; if (otherVf->fileId > vf->fileId) return; struct edwValidFile *vf1, *vf2; struct edwQaPairedEndFastq *pair = edwQaPairedEndFastqFromVfs(conn, vf, otherVf, &vf1, &vf2); if (pair != NULL) { edwValidFileFree(&otherVf); return; } /* Get target assembly and figure out path for BWA index. */ struct edwAssembly *assembly = edwAssemblyForUcscDb(conn, vf->ucscDb); assert(assembly != NULL); char genoFile[PATH_LEN]; safef(genoFile, sizeof(genoFile), "%s%s/bwaData/%s.fa", edwValDataDir, assembly->ucscDb, assembly->ucscDb); verbose(1, "aligning subsamples on %u vs. %u paired reads\n", vf1->fileId, vf2->fileId); /* Make alignments of subsamples. */ char *sample1 = NULL, *sample2 = NULL, *sai1 = NULL, *sai2 = NULL; makeTmpSai(conn, vf1, genoFile, &sample1, &sai1); makeTmpSai(conn, vf2, genoFile, &sample2, &sai2); /* Make paired end alignment */ char *tmpSam = cloneString(rTempName(edwTempDir(), "edwPairSample", ".sam")); char command[6*PATH_LEN]; safef(command, sizeof(command), "bwa sampe -n 1 -N 1 -f %s %s %s %s %s %s" , tmpSam, genoFile, sai1, sai2, sample1, sample2); mustSystem(command); /* Make ra file with pairing statistics */ char *tmpRa = cloneString(rTempName(edwTempDir(), "edwPairSample", ".ra")); safef(command, sizeof(command), "edwSamPairedEndStats -maxInsert=%d %s %s", maxInsert, tmpSam, tmpRa); mustSystem(command); /* Read RA file into variables. */ struct edwQaPairedEndFastq *pe = edwQaPairedEndFastqOneFromRa(tmpRa); /* Update database with record. */ struct sqlConnection *freshConn = edwConnectReadWrite(); char query[256]; sqlSafef(query, sizeof(query), "insert into edwQaPairedEndFastq " "(fileId1,fileId2,concordance,distanceMean,distanceStd,distanceMin,distanceMax,recordComplete) " " values (%u,%u,%g,%g,%g,%g,%g,1)" , vf1->fileId, vf2->fileId, pe->concordance, pe->distanceMean , pe->distanceStd, pe->distanceMin, pe->distanceMax); sqlUpdate(conn, query); sqlDisconnect(&freshConn); /* Clean up and go home. */ edwValidFileFree(&otherVf); remove(sample1); remove(sample2); remove(sai1); remove(sai2); remove(tmpSam); remove(tmpRa); #ifdef SOON #endif /* SOON */ freez(&sample1); freez(&sample2); freez(&sai1); freez(&sai2); freez(&tmpSam); freez(&tmpRa); edwQaPairedEndFastqFree(&pe); edwValidFileFree(&otherVf); } void edwMakePairedEndQa(unsigned startId, unsigned endId) /* edwMakePairedEndQa - Do alignments of paired-end fastq files and calculate distrubution of * insert size. */ { struct sqlConnection *conn = edwConnectReadWrite(); struct edwFile *ef, *efList = edwFileAllIntactBetween(conn, startId, endId); for (ef = efList; ef != NULL; ef = ef->next) { struct edwValidFile *vf = edwValidFileFromFileId(conn, ef->id); if (vf != NULL) { if (sameString(vf->format, "fastq") && !isEmpty(vf->pairedEnd)) pairedEndQa(conn, ef, vf); } } sqlDisconnect(&conn); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); maxInsert = optionInt("maxInsert", maxInsert); edwMakePairedEndQa(sqlUnsigned(argv[1]), sqlUnsigned(argv[2])); return 0; }