4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/lib/lrg.c src/hg/lib/lrg.c
index cb523c6..4a13c77 100644
--- src/hg/lib/lrg.c
+++ src/hg/lib/lrg.c
@@ -1,541 +1,541 @@
 /* lrg.c was originally generated by the autoSql program, which also 
  * generated lrg.h and lrg.sql.  This module links the database and
  * the RAM representation of objects. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "linefile.h"
 #include "dystring.h"
 #include "jksql.h"
 #include "lrg.h"
 
 
 
 char *lrgCommaSepFieldNames = "chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,reserved,blockCount,blockSizes,chromStarts,mismatches,indels,lrgSize,hgncId,hgncSymbol,ncbiAcc,lrgSource,lrgSourceUrl,creationDate";
 
 struct lrg *lrgLoad(char **row)
 /* Load a lrg from row fetched with select * from lrg
  * from database.  Dispose of this with lrgFree(). */
 {
 struct lrg *ret;
 
 AllocVar(ret);
 ret->blockCount = sqlSigned(row[9]);
 ret->chrom = cloneString(row[0]);
 ret->chromStart = sqlUnsigned(row[1]);
 ret->chromEnd = sqlUnsigned(row[2]);
 ret->name = cloneString(row[3]);
 ret->score = sqlUnsigned(row[4]);
 safecpy(ret->strand, sizeof(ret->strand), row[5]);
 ret->thickStart = sqlUnsigned(row[6]);
 ret->thickEnd = sqlUnsigned(row[7]);
 ret->reserved = sqlUnsigned(row[8]);
 {
 int sizeOne;
 sqlSignedDynamicArray(row[10], &ret->blockSizes, &sizeOne);
 assert(sizeOne == ret->blockCount);
 }
 {
 int sizeOne;
 sqlSignedDynamicArray(row[11], &ret->chromStarts, &sizeOne);
 assert(sizeOne == ret->blockCount);
 }
 ret->mismatches = cloneString(row[12]);
 ret->indels = cloneString(row[13]);
 ret->lrgSize = sqlUnsigned(row[14]);
 ret->hgncId = sqlSigned(row[15]);
 ret->hgncSymbol = cloneString(row[16]);
 ret->ncbiAcc = cloneString(row[17]);
 ret->lrgSource = cloneString(row[18]);
 ret->lrgSourceUrl = cloneString(row[19]);
 ret->creationDate = cloneString(row[20]);
 return ret;
 }
 
 struct lrg *lrgLoadAll(char *fileName) 
 /* Load all lrg from a whitespace-separated file.
  * Dispose of this with lrgFreeList(). */
 {
 struct lrg *list = NULL, *el;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[21];
 
 while (lineFileRow(lf, row))
     {
     el = lrgLoad(row);
     slAddHead(&list, el);
     }
 lineFileClose(&lf);
 slReverse(&list);
 return list;
 }
 
 struct lrg *lrgLoadAllByChar(char *fileName, char chopper) 
 /* Load all lrg from a chopper separated file.
  * Dispose of this with lrgFreeList(). */
 {
 struct lrg *list = NULL, *el;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[21];
 
 while (lineFileNextCharRow(lf, chopper, row, ArraySize(row)))
     {
     el = lrgLoad(row);
     slAddHead(&list, el);
     }
 lineFileClose(&lf);
 slReverse(&list);
 return list;
 }
 
 struct lrg *lrgCommaIn(char **pS, struct lrg *ret)
 /* Create a lrg out of a comma separated string. 
  * This will fill in ret if non-null, otherwise will
  * return a new lrg */
 {
 char *s = *pS;
 
 if (ret == NULL)
     AllocVar(ret);
 ret->chrom = sqlStringComma(&s);
 ret->chromStart = sqlUnsignedComma(&s);
 ret->chromEnd = sqlUnsignedComma(&s);
 ret->name = sqlStringComma(&s);
 ret->score = sqlUnsignedComma(&s);
 sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
 ret->thickStart = sqlUnsignedComma(&s);
 ret->thickEnd = sqlUnsignedComma(&s);
 ret->reserved = sqlUnsignedComma(&s);
 ret->blockCount = sqlSignedComma(&s);
 {
 int i;
 s = sqlEatChar(s, '{');
 AllocArray(ret->blockSizes, ret->blockCount);
 for (i=0; i<ret->blockCount; ++i)
     {
     ret->blockSizes[i] = sqlSignedComma(&s);
     }
 s = sqlEatChar(s, '}');
 s = sqlEatChar(s, ',');
 }
 {
 int i;
 s = sqlEatChar(s, '{');
 AllocArray(ret->chromStarts, ret->blockCount);
 for (i=0; i<ret->blockCount; ++i)
     {
     ret->chromStarts[i] = sqlSignedComma(&s);
     }
 s = sqlEatChar(s, '}');
 s = sqlEatChar(s, ',');
 }
 ret->mismatches = sqlStringComma(&s);
 ret->indels = sqlStringComma(&s);
 ret->lrgSize = sqlUnsignedComma(&s);
 ret->hgncId = sqlSignedComma(&s);
 ret->hgncSymbol = sqlStringComma(&s);
 ret->ncbiAcc = sqlStringComma(&s);
 ret->lrgSource = sqlStringComma(&s);
 ret->lrgSourceUrl = sqlStringComma(&s);
 ret->creationDate = sqlStringComma(&s);
 *pS = s;
 return ret;
 }
 
 void lrgFree(struct lrg **pEl)
 /* Free a single dynamically allocated lrg such as created
  * with lrgLoad(). */
 {
 struct lrg *el;
 
 if ((el = *pEl) == NULL) return;
 freeMem(el->chrom);
 freeMem(el->name);
 freeMem(el->blockSizes);
 freeMem(el->chromStarts);
 freeMem(el->mismatches);
 freeMem(el->indels);
 freeMem(el->hgncSymbol);
 freeMem(el->ncbiAcc);
 freeMem(el->lrgSource);
 freeMem(el->lrgSourceUrl);
 freeMem(el->creationDate);
 freez(pEl);
 }
 
 void lrgFreeList(struct lrg **pList)
 /* Free a list of dynamically allocated lrg's */
 {
 struct lrg *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     lrgFree(&el);
     }
 *pList = NULL;
 }
 
 void lrgOutput(struct lrg *el, FILE *f, char sep, char lastSep) 
 /* Print out lrg.  Separate fields with sep. Follow last field with lastSep. */
 {
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->chrom);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 fprintf(f, "%u", el->chromStart);
 fputc(sep,f);
 fprintf(f, "%u", el->chromEnd);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->name);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 fprintf(f, "%u", el->score);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->strand);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 fprintf(f, "%u", el->thickStart);
 fputc(sep,f);
 fprintf(f, "%u", el->thickEnd);
 fputc(sep,f);
 fprintf(f, "%u", el->reserved);
 fputc(sep,f);
 fprintf(f, "%d", el->blockCount);
 fputc(sep,f);
 {
 int i;
 if (sep == ',') fputc('{',f);
 for (i=0; i<el->blockCount; ++i)
     {
     fprintf(f, "%d", el->blockSizes[i]);
     fputc(',', f);
     }
 if (sep == ',') fputc('}',f);
 }
 fputc(sep,f);
 {
 int i;
 if (sep == ',') fputc('{',f);
 for (i=0; i<el->blockCount; ++i)
     {
     fprintf(f, "%d", el->chromStarts[i]);
     fputc(',', f);
     }
 if (sep == ',') fputc('}',f);
 }
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->mismatches);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->indels);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 fprintf(f, "%u", el->lrgSize);
 fputc(sep,f);
 fprintf(f, "%d", el->hgncId);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->hgncSymbol);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->ncbiAcc);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->lrgSource);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->lrgSourceUrl);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->creationDate);
 if (sep == ',') fputc('"',f);
 fputc(lastSep,f);
 }
 
 /* -------------------------------- End autoSql Generated Code -------------------------------- */
 
 #include "psl.h"
 #include "hdb.h"
 
 static struct lrgDiff *lrgDiffNew(uint chromStart, uint chromEnd, char *chromSeq,
 				  uint lrgStart, uint lrgEnd, char *lrgSeq)
 /* Alloc, populate and return a new lrgDiff. */
 {
 struct lrgDiff *diff;
 AllocVar(diff);
 diff->chromStart = chromStart;
 diff->chromEnd = chromEnd;
 diff->chromSeq = chromSeq;
 diff->lrgStart = lrgStart;
 diff->lrgEnd = lrgEnd;
 diff->lrgSeq = lrgSeq;
 return diff;
 }
 
 static struct lrgDiff *lrgParseDiffs(struct lrg *lrg, char *diffStr)
 /* Parse diffStr, i.e. a comma-separated sequence of colon-separated elements.
  * Each colon-separated element has chromStart, lrgStart, chromSeq and lrgSeq.
  * Return a list of lrgDiffs. */
 {
 struct lrgDiff *diffList = NULL;
 struct slName *diffEl, *diffStrList = slNameListFromString(diffStr, ',');
 for (diffEl = diffStrList;  diffEl != NULL;  diffEl = diffEl->next)
     {
     int expectedCount = 4;
     char *words[expectedCount+1];
     int wordCount = chopByChar(cloneString(diffEl->name), ':', words, ArraySize(words));
     if (wordCount != expectedCount)
 	errAbort("lrgParseMismatches: expected %d :-sep words, got %d", expectedCount, wordCount);
     uint chromStart = atoll(words[0]) + lrg->chromStart;
     uint lrgStart = atoll(words[1]);
     char *chromSeq = words[2];
     char *lrgSeq = words[3];
     uint chromEnd = sameString(chromSeq, "-") ? chromStart : chromStart + strlen(chromSeq);
     uint lrgEnd = sameString(lrgSeq, "-") ? lrgStart : lrgStart + strlen(lrgSeq);
     struct lrgDiff *diff = lrgDiffNew(chromStart, chromEnd, chromSeq, lrgStart, lrgEnd, lrgSeq);
     slAddHead(&diffList, diff);
     }
 slNameFreeList(&diffStrList);
 slReverse(&diffList);
 return diffList;
 }
 
 struct lrgDiff *lrgParseMismatches(struct lrg *lrg)
 /* Parse lrg->mismatches and return a list of lrgDiffs. */
 {
 return lrgParseDiffs(lrg, lrg->mismatches);
 }
 
 struct lrgDiff *lrgParseIndels(struct lrg *lrg)
 /* Parse lrg->indels and return a list of lrgDiffs. */
 {
 return lrgParseDiffs(lrg, lrg->indels);
 }
 
 void lrgDiffFreeList(struct lrgDiff **pDiff)
 /* Free up a list of parsed diffs. */
 {
 if (pDiff == NULL || *pDiff == NULL)
     return;
 struct lrgDiff *diff;
 for (diff = *pDiff;  diff != NULL;  diff = diff->next)
     {
     freeMem(diff->chromSeq);
     freeMem(diff->lrgSeq);
     }
 slFreeList(pDiff);
 }
 
 static boolean lrgNameParseRange(char *lrgName, int lrgSize, char *buf, size_t bufSize,
                                  int *retStart, int *retEnd)
 /* If lrgName is of the format name:start-end, write the name into buf, set retStart and retEnd
  * and return TRUE.  Otherwise leave inputs alone and return FALSE. */
 {
 if (strchr(lrgName, ':'))
     {
     // Parse lrgStart and lrgEnd out of name field (LRG_*:start-end)
     safecpy(buf, bufSize, lrgName);
     char *p = strchr(buf, ':');
     // Chop at : (after name)
     *p++ = '\0';
     char *num = p;
     p = strchr(num, '-');
     if (p == NULL)
         errAbort("Error parsing LRG name:start-end '%s' -- no '-' following ':'", lrgName);
     *p++ = '\0';
     *retStart = atoi(num) - 1;
     num = p;
     *retEnd = atoi(num);
     if (*retStart < 0 || *retStart >= *retEnd || *retEnd < 0 || *retEnd > lrgSize)
         errAbort("lrgNameParseRange: got bad coords (%d, %d) from '%s' size %d",
                  *retStart, *retEnd, lrgName, lrgSize);
     return TRUE;
     }
 return FALSE;
 }
 
 static int calcBlockSize(uint nextTStart, uint nextQStart, struct psl *psl, int blockIx)
 {
 int tLen = nextTStart - psl->tStarts[blockIx];
 int qLen = nextQStart - psl->qStarts[blockIx];
 return min(tLen, qLen);
 }
 
 struct psl *lrgToPsl(struct lrg *lrg, uint chromSize)
 /* Use lrg's mismatches and indels to make a PSL. */
 {
 struct lrgDiff *mismatches = lrgParseMismatches(lrg), *indels = lrgParseIndels(lrg);
 int lrgStart = 0, lrgEnd = lrg->lrgSize;
 char *lrgName = lrg->name;
 char lrgNameBuf[strlen(lrgName)+1];
 if (lrgNameParseRange(lrgName, lrg->lrgSize, lrgNameBuf, sizeof(lrgNameBuf), &lrgStart, &lrgEnd))
     lrgName = lrgNameBuf;
 int blockCount = slCount(indels) + 1;
 unsigned opts = 0;
 struct psl *psl = pslNew(lrgName, lrg->lrgSize, lrgStart, lrgEnd,
                          lrg->chrom, chromSize, lrg->chromStart, lrg->chromEnd,
                          lrg->strand, blockCount, opts);
 psl->blockCount = blockCount;
 psl->misMatch = slCount(mismatches);
 boolean isRc = (lrg->strand[0] == '-');
 // Translate gap coords from indels into block coords:
 psl->qStarts[0] = isRc ? (lrg->lrgSize - lrgEnd) : lrgStart;
 psl->tStarts[0] = lrg->chromStart;
 int alignedBaseCount = 0;
 int blockIx = 1;
 struct lrgDiff *diff;
 for (diff = indels;  diff != NULL;  diff = diff->next)
     {
     // now we know the size of the previous block:
     int lrgStartPosStrand = isRc ? (lrg->lrgSize - diff->lrgEnd) : diff->lrgStart;
     // Insertion on t or q, or both?
     int lrgLen = diff->lrgEnd - diff->lrgStart;
     int refLen = diff->chromEnd - diff->chromStart;
     if (refLen == 0 || lrgLen == 0) // insertion only on t or q
         {
         if (refLen == 0)
             {
             psl->qNumInsert++;
             psl->qBaseInsert += (lrgLen - refLen);
             }
         else if (lrgLen == 0)
             {
             psl->tNumInsert++;
             psl->tBaseInsert += (refLen - lrgLen);
             }
         psl->qStarts[blockIx] = lrgStartPosStrand + lrgLen;
         psl->tStarts[blockIx] = diff->chromEnd;
         }
     else // double sided insertion
         {
         psl->tNumInsert++;
         psl->tBaseInsert += refLen;
         psl->qNumInsert++;
         psl->qBaseInsert += lrgLen;
         // the qStart needs to account for the lrg insertion AND the prev block
         psl->qStarts[blockIx] = diff->lrgStart + lrgLen;
         }
     psl->tStarts[blockIx] = diff->chromEnd;
     psl->blockSizes[blockIx-1] = calcBlockSize(diff->chromStart, lrgStartPosStrand, psl, blockIx-1);
     alignedBaseCount += psl->blockSizes[blockIx-1];
     blockIx++;
     }
 // size of last block:
 psl->blockSizes[blockIx-1] = calcBlockSize(lrg->chromEnd, lrg->lrgSize, psl, blockIx-1);
 alignedBaseCount += psl->blockSizes[blockIx-1];
 if (blockIx != psl->blockCount)
     errAbort("lrgToPsl: expected %d blocks but somehow ended up with %d", psl->blockCount,
 	     blockIx);
 psl->match = alignedBaseCount - psl->misMatch;
 return psl;
 }
 
 static int sumInsertions(struct lrgDiff *indels)
 /* Return the sum of insertions of lrg into reference sequence, ignoring deletions.
  * That is the max headroom that we will need while shifting bases around. */
 {
 int sumIns = 0;
 struct lrgDiff *diff;
 for (diff = indels;  diff != NULL;  diff = diff->next)
     sumIns += (diff->lrgEnd - diff->lrgStart);
 return sumIns;
 }
 
 struct dnaSeq *lrgReconstructSequence(struct lrg *lrg, char *db)
 /* Use genomic sequence, lrg->mismatches and lrg->indels to reconstruct LRG sequence */
 {
 struct lrgDiff *diff, *indels = lrgParseIndels(lrg);
 int refSize = lrg->chromEnd - lrg->chromStart;
 // Fetch genomic sequence:
 struct dnaSeq *lrgSeq = hDnaFromSeq(db, lrg->chrom, lrg->chromStart, lrg->chromEnd, dnaUpper);
 boolean isRc = (lrg->strand[0] == '-');
 if (isRc)
     reverseComplement(lrgSeq->dna, refSize);
 // If this was only a partial mapping of the LRG, pad N's at the beginning and/or end to get
 // the correct size.
 int lrgStart = 0, lrgEnd = lrg->lrgSize;
 char lrgNameBuf[strlen(lrg->name)+1];
 lrgNameParseRange(lrg->name, lrg->lrgSize, lrgNameBuf, sizeof(lrgNameBuf), &lrgStart, &lrgEnd);
 // Replace lrgSeq->dna with a larger version that has room for genomic sequence plus inserted bases
 // as well as possible padding at start and end for partial LRG mappings
 int addedBases = sumInsertions(indels);
 int paddedSize = refSize + lrgStart + (lrg->lrgSize - lrgEnd) + addedBases + 1;
 char *lrgSeqDna = needMem(paddedSize);
 if (lrgStart > 0)
     memset(lrgSeqDna, 'N', lrgStart);
 // Copy in the genomic data
 safencpy(lrgSeqDna+lrgStart, paddedSize-lrgStart, lrgSeq->dna, refSize);
 refSize += lrgStart;
 // If the LRG has inserted bases, zero out some extra bytes after genomic seq
 if (addedBases)
     zeroBytes(lrgSeqDna+refSize, addedBases);
 freeMem(lrgSeq->dna);
 lrgSeq->dna = lrgSeqDna;
 // Go through indels backwards w.r.t. LRG so we move sequence to the right
 // while not changing coords to the left:
 if (!isRc)
     slReverse(&indels);
 for (diff = indels;  diff != NULL;  diff = diff->next)
     {
     int lrgLen = diff->lrgEnd - diff->lrgStart;
     int refLen = diff->chromEnd - diff->chromStart;
     int refStart = diff->chromStart - lrg->chromStart + lrgStart;
     if (isRc)
 	refStart = refSize - (refStart + refLen);
     if (lrgLen > refLen)
 	{
 	// LRG inserts sequence: shift the rest of the sequence to the right
 	int insSize = lrgLen - refLen;
 	int moveSize = refSize + addedBases - refStart - insSize;
 	memmove(lrgSeqDna+refStart+insSize, lrgSeqDna+refStart, moveSize);
 	}
     else
 	{
 	// LRG deletes sequence: shift the rest of the sequence to the left
 	int delSize = refLen - lrgLen;
 	int moveSize = refSize + addedBases - refStart - delSize + 1; // '\0' at end too
 	memmove(lrgSeqDna+refStart, lrgSeqDna+refStart+delSize, moveSize);
 	}
     // If there is LRG sequence, copy it in.
     if (lrgLen > 0)
 	memcpy(lrgSeqDna+refStart, diff->lrgSeq, lrgLen);
     }
 int len = strlen(lrgSeqDna);
 if (len != lrgEnd)
     errAbort("lrgReconstructSequence: expected sequence length to be lrgEnd %d, got %d",
              lrgEnd, len);
 // Now that indels have been resolved, use LRG coords to substitute mismatches:
 struct lrgDiff *mismatches = lrgParseMismatches(lrg);
 for (diff = mismatches;  diff != NULL;  diff = diff->next)
     {
     if (lrgSeqDna[diff->lrgStart] != diff->chromSeq[0])
 	errAbort("Mismatch with unexpected assembly sequence: expected '%c' on %c strand, "
 		 "got '%c'", diff->chromSeq[0], lrg->strand[0], lrgSeqDna[diff->lrgStart]);
     int size = diff->lrgEnd - diff->lrgStart;
     memcpy(lrgSeqDna+diff->lrgStart, diff->lrgSeq, size);
     }
 // Add padding N's at end if applicable
 if (lrgEnd < lrg->lrgSize)
     {
     memset(lrgSeqDna+lrgEnd, 'N', lrg->lrgSize - lrgEnd);
     lrgSeqDna[lrg->lrgSize] = '\0';
     }
 len = strlen(lrgSeqDna);
 if (len != lrg->lrgSize)
     errAbort("lrgReconstructSequence: Error applying LRG indels for '%s': expected size %d, got %d",
              lrg->name, lrg->lrgSize, len);
 lrgSeq->size = lrg->lrgSize;
 return lrgSeq;
 }