4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/ratStuff/mafAddQRows/mafAddQRows.c src/hg/ratStuff/mafAddQRows/mafAddQRows.c
index d10317a..c8632de 100644
--- src/hg/ratStuff/mafAddQRows/mafAddQRows.c
+++ src/hg/ratStuff/mafAddQRows/mafAddQRows.c
@@ -1,384 +1,384 @@
 /* Copyright (C) 2014 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 /* mafAddQRows - Add quality data to a maf.                                  */
 /*                                                                           */
 /* For each species with quality data we want to add to the maf, the quality */
 /* data is stored in an external qac file which contains quality data for    */
 /* each chromosome.  We additionally build an index (qdx file) which allows  */
 /* us to seek directly to quality data for a particular chromosome in the    */
 /* qac file.                                                                 */
 /*                                                                           */
 /* We build a hash containing data for each species with quality data.       */
 /* The hash contains the name of the species, an open file descriptor to     */
 /* the appropriate qac file, a hash mapping chromosome to file offset in     */
 /* the qac file, and a qaSeq structure for holding quality data.  The qaSeq  */
 /* structure acts as a one element cache, holding quality data for the most  */
 /* recently requested chromosome.                                            */
 /*                                                                           */
 /* As we are generally interested in bases with low quality, the quality     */
 /* data in the maf is a compressed version of the actual quality data.       */
 /* The quality data in the maf is:                                           */
 /*                                                                           */
 /*     min( floor(actualy quality value/5), 9)                               */
 /*                                                                           */
 /* This allows us to show more of the low-quality values.  The relationship  */
 /* between quality characters in the maf and the actualy quality value are   */
 /* summarized in the following table:                                        */
 /*                                                                           */
 /*     .: In Gap Q == FAKE_GAP_QUAL                                          */
 /*     0: 0 <= Q < 5 || Q == 98                                              */
 /*     1: 5 <= Q < 10                                                        */
 /*     2: 10 <= Q < 15                                                       */
 /*     3: 15 <= Q < 20                                                       */
 /*     4: 20 <= Q < 25                                                       */
 /*     5: 25 <= Q < 30                                                       */
 /*     6: 30 <= Q < 35                                                       */
 /*     7: 35 <= Q < 40                                                       */
 /*     8: 40 <= Q < 45                                                       */
 /*     9: 45 <= Q < 98                                                       */
 /*     F: Q == 99                                                            */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "obscure.h"
 #include "qaSeq.h"
 #include "maf.h"
 #include "math.h"
 
 /* fake quality value use to indicate gaps */
 #define FAKE_GAP_QUAL 100
 
 static double divisor;
 
 struct speciesList 
 /* list of species for which we are adding quality data */
     {
     struct speciesList *next;
     char *name;
     char *dir;
     };
 
 struct indexEntry
 /* map chromosome name to file offset in a qac file */
     {
     char *name;         /* name of the chrom */
     off_t offset;       /* offset in the qac file */
     };
 
 struct qData
 /* metadata used to locate quality data for a given species,chrom */
     {
     char *name;         /* species name */
     FILE *fd;           /* file descriptor of qac file */
     boolean isSwapped;  /* is the qac file swapped? */
     struct hash *index; /* hash mapping chrom to indexEntry and hence offset in qac file */
     struct qaSeq *qa;   /* cache to avoid reloading quality data */
     };
 
 static struct optionSpec options[] = {
 	{"divisor", OPTION_DOUBLE},
 	{NULL, 0}
 };
 
 
 static void usage()
 /* Explain usage and exit. */
 {
 errAbort(
     "mafAddQRows - Add quality data to a maf\n"
     "usage:\n"
     "   mafAddQRows species.lst in.maf out.maf\n"
     "where each species.lst line contains two fields\n"
     "   1) species name\n"
     "   2) directory where the .qac and .qdx files are located\n"
     "options:\n"
     "  -divisor=n is value to divide Q value by.  Default is 5.\n"
 );
 }
 
 
 static struct slPair *buildSpeciesList(char *speciesFile)
 /* read species and directory info from the species.lst file */
 {
 struct slPair *speciesList = NULL;
 struct lineFile *lf = lineFileOpen(speciesFile, TRUE);
 char *row[2];
 
 while (lineFileNextRow(lf, row, 2))
     slPairAdd(&speciesList, row[0], (void *) cloneString(row[1]));
 
 lineFileClose(&lf);
 
 return speciesList;
 }
 
 
 static struct hash *loadQacIndex(struct slPair *species)
 /* Buid a hash mapping chrom to file offset in a qac file for the */
 /* argument species.                                              */
 {
 char buffer[1024];
 struct lineFile *lf;
 char *row[2];
 int seqCount;
 struct hash *qacIndex;
 struct hashEl *hel;
 struct indexEntry *ie;
 
 safef(buffer, sizeof(buffer), "%s/%s.qdx", (char *) species->val, species->name);
 lf = lineFileOpen(buffer, TRUE);
 
 /* get the count of sequences in the qac file */
 if (lineFileNextRow(lf, row, 1) == FALSE)
     errAbort("Index file %s is empty.", lf->fileName);
 verbose(2, "loadQacIndex: reading file '%s' finds '%s' sequences\n", buffer, row[0]);
 seqCount = lineFileNeedFullNum(lf, row, 0);
 
 /* build and populate a hash mapping chrom to file offset */
 qacIndex = newHash(digitsBaseTwo((unsigned long) seqCount));
 while (lineFileNextRow(lf, row, 2))
     {
     if ((hel = hashLookup(qacIndex, row[0])) == NULL)
         {
         AllocVar(ie);
         ie->name = cloneString(row[0]);
         errno = 0;
         ie->offset = (off_t) strtol(row[1], NULL, 0);
         if (errno != 0)
             errnoAbort("Unable to convert %s to a long on line %d of %s.",
                 row[1], lf->lineIx, lf->fileName);
         hashAdd(qacIndex, ie->name, ie);
         seqCount--;
         }
     else
         {
         errAbort("Duplicate sequence name %s on line %d of %s.",
             row[0], lf->lineIx, lf->fileName);
         }
     }
 
 /* sanity check */
 if (seqCount != 0)
     errAbort("%s quality index is inconsistent.", species->name);
 
 lineFileClose(&lf);
 
 return qacIndex;
 }
 
 
 static struct qData *qDataLoad(struct slPair *species)
 /* Populate a qData struct for the named species. */
 {
 char buffer[1024];
 struct qData *qd;
 
 AllocVar(qd);
 qd->name = cloneString(species->name);
 
 safef(buffer, sizeof(buffer), "%s/%s.qac", (char *) species->val, species->name);
 qd->fd = qacOpenVerify(buffer, &(qd->isSwapped));
 
 qd->index = loadQacIndex(species);
 qd->qa = NULL;
 
 return qd;
 }
 
 
 static struct hash *buildSpeciesHash(struct slPair *speciesList)
 /* Build a hash keyed on species name.  Each species we are adding "q" */
 /* lines for will have an entry in this hash.  The hash contains:      */
 /*   1) species name [hash key]                                        */
 /*   2) file descriptor for qac file (will be opened)                  */
 /*   3) flag telling us if the qac file above needs to be swapped      */
 /*   4) hash mapping chrom name to an indexEntry                       */
 /*   5) a 1-element cache for holding quality data for a chromosome    */
 {
 struct hash *speciesHash;
 struct slPair *pair;
 struct hashEl *hel;
 struct qData *qd;
 
 speciesHash = newHash(digitsBaseTwo((unsigned long) slCount(speciesList)));
 
 for (pair = speciesList; pair != NULL; pair = pair->next)
     {
     if ((hel = hashLookup(speciesHash, pair->name)) != NULL)
         errAbort("Duplicate species %s in species list file.", pair->name);
     qd = qDataLoad(pair);
     hashAdd(speciesHash, qd->name, qd);
     }
 
 return speciesHash;
 }
 
 
 static struct qData *loadQualityData (struct mafComp *mc, struct hash *speciesHash)
 /* return qData if found or NULL if not */
 {
 char buffer[1024];
 char *species, *chrom;
 struct hashEl *hel;
 struct qData *qd;
 struct indexEntry *ie;
 
 /* separate species and chrom */
 strncpy(buffer, mc->src, sizeof(buffer));
 species = chrom = buffer;
 if ((chrom = strchr(buffer, '.')) == NULL)
     errAbort("can't find chrom for %s\n", buffer);
 *chrom++ = '\0';
 
 /* return if we don't have quality data for this species */
 if ((hel = hashLookup(speciesHash, species)) == NULL)
     return(NULL);
 
 /* load the quality data if we need to */
 qd = (struct qData *) hel->val;
 if (qd->qa == NULL || ! sameString(qd->qa->name, chrom))
     {
     if ((hel = hashLookup(qd->index, chrom)) == NULL)
         return(NULL);
     ie = (struct indexEntry *) hel->val;
     if (fseeko(qd->fd, ie->offset, SEEK_SET) == (off_t) -1)
         errnoAbort("fseeko() failed\n");
     if (qd->qa != NULL)
         qaSeqFree(&(qd->qa));
     qd->qa = qacReadNext(qd->fd, qd->isSwapped);
     }
 
 return qd;
 }
 
 
 static char convertToQChar(UBYTE *q)
 /* convert a q value into the appropriate character */
 {
 int val;
 
 switch (*q)
     {
     case FAKE_GAP_QUAL:
         return('.');
         break;
     case 99:
         return('F');
         break;
     case 98:
         return('0');
         break;
     default:
         val = (int) floor((double) *q / divisor);
         val = (val < 9) ? val : 9;
     }
 
 return '0' + val;
 }
 
 
 void mafAddQRows(char *inMaf, char *outMaf, struct hash *speciesHash)
 /* mafAddQRows - Add quality data to a maf. */
 {
 struct mafFile *mf;
 struct mafAli *ma;
 struct mafComp *mc;
 struct qData *qd;
 char *quality, *qp;
 FILE *outf = mustOpen(outMaf, "w");
 UBYTE *q;
 int inc = 1;
 
 mf = mafOpen(inMaf);
 mafWriteStart(outf, mf->scoring);
 
 while ((ma = mafNext(mf)) != NULL)
     {
     for (mc = ma->components; mc != NULL; mc = mc->next)
         {
         /* if this is not an "e" row, and we have quality data for this species */
         if (mc->size != 0 && (qd = loadQualityData(mc, speciesHash)) != NULL)
             {
             quality = cloneString(mc->text);
             qp = mc->quality = quality;
 
             /* set a pointer to the beginning of the quality data in memory */
             if (mc->strand == '-')
                 {
 				/* check qac, maf for sanity */
 				if ((mc->srcSize - mc->start - mc->size < 0)
 						|| (mc->srcSize - mc->start > qd->qa->size))
 					{
 					fprintf(stderr, "Range mismatch: %s qual 0-%d\n",
 								mc->src, qd->qa->size);
 					mafWrite(stderr, ma);
 					exit(EXIT_FAILURE);
 					}
                 q = qd->qa->qa + qd->qa->size - 1 - mc->start;
                 inc = -1;
                 }
             else
                 {
 				/* check qac, maf for sanity */
 				if ((mc->start < 0) || (mc->start + mc->size > qd->qa->size))
 					{
 					fprintf(stderr, "Range mismatch: %s qual 0-%d\n",
 								mc->src, qd->qa->size);
 					mafWrite(stderr, ma);
 					exit(EXIT_FAILURE);
 					}
                 q = qd->qa->qa + mc->start;
                 inc = +1;
                 }
 
             /* replace non-gap characters with the corresponding quality value */
             while (*qp != '\0')
                 {
                 if (*qp == '-')
                     {
                     qp++;
                     continue;
                     }
                 *qp++ = convertToQChar(q);
                 q += inc;
                 }
             }
         }
 
     mafWrite(outf, ma);
     mafAliFree(&ma);
     }
 
 carefulClose(&outf);
 }
 
 
 int main(int argc, char *argv[])
 {
 struct slPair *speciesList;
 struct hash *speciesHash;
 
 optionInit(&argc, argv, options);
 
 if (argc != 4)
     usage();
 
 divisor = optionDouble("divisor", 5.0);
 
 if (divisor <= 0)
     {
     errAbort("Invalid divisor: %f\n", divisor);
     }
 
 /* load metadata about the quality data we're adding */
 speciesList = buildSpeciesList(argv[1]);
 speciesHash = buildSpeciesHash(speciesList);
 slPairFreeValsAndList(&speciesList);
 
 mafAddQRows(argv[2], argv[3], speciesHash);
 
 return EXIT_SUCCESS;
 }