4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/cirm/cdw/cdwMakePairedEndQa/cdwMakePairedEndQa.c src/hg/cirm/cdw/cdwMakePairedEndQa/cdwMakePairedEndQa.c
index 9c60e93..36e0696 100644
--- src/hg/cirm/cdw/cdwMakePairedEndQa/cdwMakePairedEndQa.c
+++ src/hg/cirm/cdw/cdwMakePairedEndQa/cdwMakePairedEndQa.c
@@ -1,322 +1,322 @@
 /* cdwMakePairedEndQa - Do alignments of paired-end fastq files and calculate distrubution of 
  * insert size. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "portable.h"
 #include "obscure.h"
 #include "cdw.h"
 #include "cdwLib.h"
 #include "raToStruct.h"
 #include "ra.h"
 #include "cdwLib.h"
 
 int maxInsert = 1000;
 bool keepTemp = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "cdwMakePairedEndQa - Do alignments of paired-end fastq files and calculate distribution of \n"
   "insert size.\n"
   "usage:\n"
   "   cdwMakePairedEndQa startId endId\n"
   "options:\n"
   "   -maxInsert=N - maximum allowed insert size, default %d\n"
   "   -keep - do not delete temp files\n"
   , maxInsert
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"maxInsert", OPTION_INT},
    {NULL, 0},
 };
 
 /**** - Start raToStructGen generated code - ****/
 
 struct raToStructReader *cdwQaPairedEndFastqRaReader()
 /* Make a raToStructReader for cdwQaPairedEndFastq */
 {
 static char *fields[] = {
     "fileId1",
     "concordance",
     "distanceMean",
     "distanceStd",
     "distanceMin",
     "distanceMax",
     };
 static char *requiredFields[] = {
     "concordance",
     "distanceMean",
     "distanceStd",
     "distanceMin",
     "distanceMax",
     };
 return raToStructReaderNew("cdwQaPairedEndFastq", ArraySize(fields), fields, ArraySize(requiredFields), requiredFields);
 }
 
 
 struct cdwQaPairedEndFastq *cdwQaPairedEndFastqFromNextRa(struct lineFile *lf, struct raToStructReader *reader)
 /* Return next stanza put into an cdwQaPairedEndFastq. */
 {
 enum fields
     {
     fileId1Field,
     concordanceField,
     distanceMeanField,
     distanceStdField,
     distanceMinField,
     distanceMaxField,
     };
 if (!raSkipLeadingEmptyLines(lf, NULL))
     return NULL;
 
 struct cdwQaPairedEndFastq *el;
 AllocVar(el);
 
 bool *fieldsObserved = reader->fieldsObserved;
 bzero(fieldsObserved, reader->fieldCount);
 
 char *tag, *val;
 while (raNextTagVal(lf, &tag, &val, NULL))
     {
     struct hashEl *hel = hashLookup(reader->fieldIds, tag);
     if (hel != NULL)
         {
 	int id = ptToInt(hel->val);
 	if (fieldsObserved[id])
 	     errAbort("Duplicate tag %s line %d of %s\n", tag, lf->lineIx, lf->fileName);
 	fieldsObserved[id] = TRUE;
 	switch (id)
 	    {
 	    case fileId1Field:
 	        {
 	        el->fileId1 = sqlUnsigned(val);
 		break;
 	        }
 	    case concordanceField:
 	        {
 	        el->concordance = sqlDouble(val);
 		break;
 	        }
 	    case distanceMeanField:
 	        {
 	        el->distanceMean = sqlDouble(val);
 		break;
 	        }
 	    case distanceStdField:
 	        {
 	        el->distanceStd = sqlDouble(val);
 		break;
 	        }
 	    case distanceMinField:
 	        {
 	        el->distanceMin = sqlDouble(val);
 		break;
 	        }
 	    case distanceMaxField:
 	        {
 	        el->distanceMax = sqlDouble(val);
 		break;
 	        }
 	    default:
 	        internalErr();
 		break;
 	    }
 	}
     }
 
 raToStructReaderCheckRequiredFields(reader, lf);
 return el;
 }
 
 
 struct cdwQaPairedEndFastq *cdwQaPairedEndFastqLoadRa(char *fileName)
 /* Return list of all cdwQaPairedEndFastq in ra file. */
 {
 struct raToStructReader *reader = cdwQaPairedEndFastqRaReader();
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 struct cdwQaPairedEndFastq *el, *list = NULL;
 while ((el = cdwQaPairedEndFastqFromNextRa(lf, reader)) != NULL)
     slAddHead(&list, el);
 slReverse(&list);
 lineFileClose(&lf);
 raToStructReaderFree(&reader);
 return list;
 }
 
 struct cdwQaPairedEndFastq *cdwQaPairedEndFastqOneFromRa(char *fileName)
 /* Return cdwQaPairedEndFastq in file and insist there be exactly one record. */
 {
 struct cdwQaPairedEndFastq *one = cdwQaPairedEndFastqLoadRa(fileName);
 if (one == NULL)
     errAbort("No data in %s", fileName);
 if (one->next != NULL)
     errAbort("Multiple records in %s", fileName);
 return one;
 }
 
 /**** - End raToStructGen generated code - ****/
 
 #define FASTQ_SAMPLE_SIZE 10000
 
 void sampleTrimAndAlign(struct sqlConnection *conn, struct cdwValidFile *vf1, struct cdwValidFile *vf2, 
         char *genoFile, char *assay, char *tmpSam)
 /* Given a fastq file, make a subsample of it 10k reads long and align it with
  * bwa mem producing a sam file of given name. */
 {
 /* Get fastq record 1 */
 long long fileId = vf1->fileId;
 struct cdwFastqFile *fqf1 = cdwFastqFileFromFileId(conn, fileId);
 if (fqf1 == NULL)
     errAbort("No cdwFastqFile record for file id %lld", fileId);
 
 /* Get fastq record 2 */
 fileId = vf2->fileId;
 struct cdwFastqFile *fqf2 = cdwFastqFileFromFileId(conn, fileId);
 if (fqf2 == NULL)
     errAbort("No cdwFastqFile record for file id %lld", fileId);
 
 char sampleFastqName1[PATH_LEN], sampleFastqName2[PATH_LEN];
 char trimmedPath1[PATH_LEN], trimmedPath2[PATH_LEN];
 
 /* Create downsampled fastq1 in temp directory - downsampled more than default even. */
 cdwMakeTempFastqSample(fqf1->sampleFileName, FASTQ_SAMPLE_SIZE, sampleFastqName1);
 verbose(1, "downsampled %s into %s\n", fqf1->sampleFileName, sampleFastqName1);
 
 /* Create downsampled fastq2 in temp directory - downsampled more than default even. */
 cdwMakeTempFastqSample(fqf2->sampleFileName, FASTQ_SAMPLE_SIZE, sampleFastqName2);
 verbose(1, "downsampled %s into %s\n", fqf2->sampleFileName, sampleFastqName2);
 
 /* Trim long-RNA-seq files to ensure no poly-A tails remain*/
 cdwTrimReadsForAssay(sampleFastqName1, trimmedPath1, assay); 
 cdwTrimReadsForAssay(sampleFastqName2, trimmedPath2, assay); 
 
 /* Do alignment */
 char cmd[3*PATH_LEN];
 //also see comments in cdwLib on bowtie on background about the bwa -> bowtie change
 //safef(cmd, sizeof(cmd), "bwa mem -t 3 %s %s %s > %s", genoFile, trimmedPath1, trimmedPath2, tmpSam);
 safef(cmd, sizeof(cmd), "bowtie -l 40 -n 1 %s --threads 3 --mm -1 %s -2 %s -S > %s", genoFile, trimmedPath1, trimmedPath2, tmpSam);
 verbose(1, "running %s\n", cmd);
 mustSystem(cmd);
 
 /* Save return variables, clean up,  and go home. */
 cdwCleanupTrimReads(sampleFastqName1, trimmedPath1); 
 cdwCleanupTrimReads(sampleFastqName2, trimmedPath2); 
 if (!keepTemp) {
     remove(sampleFastqName1);
     remove(sampleFastqName2);
 }
 cdwFastqFileFree(&fqf1);
 cdwFastqFileFree(&fqf2);
 }
 
 void pairedEndQa(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf)
 /* Look for other end,  do a pairwise alignment, and save results in database. */
 {
 verbose(2, "pairedEndQa on %u %s %s\n", ef->id, ef->cdwFileName, ef->submitFileName);
 /* Get other end, return if not found. */
 struct cdwValidFile *otherVf = cdwOppositePairedEnd(conn, ef, vf);
 if (otherVf == NULL) {
     verbose(1, "no opposite paired end file found\n");
     return;
     }
 
 if (otherVf->fileId > vf->fileId) {
     verbose(1, "this is not the first file of the pair\n");
     return;
     }
 
 struct cdwValidFile *vf1, *vf2;
 struct cdwQaPairedEndFastq *pair = cdwQaPairedEndFastqFromVfs(conn, vf, otherVf, &vf1, &vf2);
 if (pair != NULL)
     {
     verbose(1, "pair has already been processed\n");
     cdwValidFileFree(&otherVf);
     return;
     }
 /* Get target assembly and figure out path for BWA index. */
 struct cdwAssembly *assembly = cdwAssemblyForUcscDb(conn, vf->ucscDb);
 assert(assembly != NULL);
 char genoFile[PATH_LEN];
 cdwIndexPath(assembly, genoFile);
 
 verbose(1, "aligning subsamples on %u vs. %u paired reads\n", vf1->fileId, vf2->fileId);
 
 /* Make alignments of subsamples. */
 struct cgiParsedVars *tags = cdwMetaVarsList(conn, ef);
 char *assay = cdwLookupTag(tags, "assay");
 
 char *tmpSam = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".sam"));
 sampleTrimAndAlign(conn, vf1, vf2, genoFile, assay, tmpSam);
 
 /* Make ra file with pairing statistics */
 char command[6*PATH_LEN];
 char *tmpRa = cloneString(rTempName(cdwTempDir(), "cdwPairSample", ".ra"));
 safef(command, sizeof(command), 
     "edwSamPairedEndStats -maxInsert=%d %s %s", maxInsert, tmpSam, tmpRa);
 mustSystem(command);
 
 /* Read RA file into variables. */
 struct cdwQaPairedEndFastq *pe = cdwQaPairedEndFastqOneFromRa(tmpRa);
 
 /* Update database with record. */
 struct sqlConnection *freshConn = cdwConnectReadWrite();
 char query[256];
 sqlSafef(query, sizeof(query),
     "insert into cdwQaPairedEndFastq "
     "(fileId1,fileId2,concordance,distanceMean,distanceStd,distanceMin,distanceMax,recordComplete) "
     " values (%u,%u,%g,%g,%g,%g,%g,1)"
     , vf1->fileId, vf2->fileId, pe->concordance, pe->distanceMean
     , pe->distanceStd, pe->distanceMin, pe->distanceMax);
 sqlUpdate(conn, query);
 sqlDisconnect(&freshConn);
 
 /* Clean up and go home. */
 cdwValidFileFree(&otherVf);
 remove(tmpSam);
 remove(tmpRa);
 
 freez(&tmpSam);
 freez(&tmpRa);
 cdwQaPairedEndFastqFree(&pe);
 cdwValidFileFree(&otherVf);
 }
 
 void cdwMakePairedEndQa(unsigned startId, unsigned endId)
 /* cdwMakePairedEndQa - Do alignments of paired-end fastq files and calculate distrubution of 
  * insert size. */
 {
 struct sqlConnection *conn = cdwConnectReadWrite();
 struct cdwFile *ef, *efList = cdwFileAllIntactBetween(conn, startId, endId);
 for (ef = efList; ef != NULL; ef = ef->next)
     {
     struct cdwValidFile *vf = cdwValidFileFromFileId(conn, ef->id);
     if (vf != NULL)
 	{
 	if (sameString(vf->format, "fastq") && !isEmpty(vf->pairedEnd))
 	    pairedEndQa(conn, ef, vf);
 	}
     }
 sqlDisconnect(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 maxInsert = optionInt("maxInsert", maxInsert);
 keepTemp = optionExists("keep");
 cdwMakePairedEndQa(sqlUnsigned(argv[1]), sqlUnsigned(argv[2]));
 return 0;
 }