4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/hg/lib/lrg.c src/hg/lib/lrg.c index cb523c6..4a13c77 100644 --- src/hg/lib/lrg.c +++ src/hg/lib/lrg.c @@ -1,541 +1,541 @@ /* lrg.c was originally generated by the autoSql program, which also * generated lrg.h and lrg.sql. This module links the database and * the RAM representation of objects. */ /* Copyright (C) 2014 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "dystring.h" #include "jksql.h" #include "lrg.h" char *lrgCommaSepFieldNames = "chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,reserved,blockCount,blockSizes,chromStarts,mismatches,indels,lrgSize,hgncId,hgncSymbol,ncbiAcc,lrgSource,lrgSourceUrl,creationDate"; struct lrg *lrgLoad(char **row) /* Load a lrg from row fetched with select * from lrg * from database. Dispose of this with lrgFree(). */ { struct lrg *ret; AllocVar(ret); ret->blockCount = sqlSigned(row[9]); ret->chrom = cloneString(row[0]); ret->chromStart = sqlUnsigned(row[1]); ret->chromEnd = sqlUnsigned(row[2]); ret->name = cloneString(row[3]); ret->score = sqlUnsigned(row[4]); safecpy(ret->strand, sizeof(ret->strand), row[5]); ret->thickStart = sqlUnsigned(row[6]); ret->thickEnd = sqlUnsigned(row[7]); ret->reserved = sqlUnsigned(row[8]); { int sizeOne; sqlSignedDynamicArray(row[10], &ret->blockSizes, &sizeOne); assert(sizeOne == ret->blockCount); } { int sizeOne; sqlSignedDynamicArray(row[11], &ret->chromStarts, &sizeOne); assert(sizeOne == ret->blockCount); } ret->mismatches = cloneString(row[12]); ret->indels = cloneString(row[13]); ret->lrgSize = sqlUnsigned(row[14]); ret->hgncId = sqlSigned(row[15]); ret->hgncSymbol = cloneString(row[16]); ret->ncbiAcc = cloneString(row[17]); ret->lrgSource = cloneString(row[18]); ret->lrgSourceUrl = cloneString(row[19]); ret->creationDate = cloneString(row[20]); return ret; } struct lrg *lrgLoadAll(char *fileName) /* Load all lrg from a whitespace-separated file. * Dispose of this with lrgFreeList(). */ { struct lrg *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[21]; while (lineFileRow(lf, row)) { el = lrgLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct lrg *lrgLoadAllByChar(char *fileName, char chopper) /* Load all lrg from a chopper separated file. * Dispose of this with lrgFreeList(). */ { struct lrg *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[21]; while (lineFileNextCharRow(lf, chopper, row, ArraySize(row))) { el = lrgLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct lrg *lrgCommaIn(char **pS, struct lrg *ret) /* Create a lrg out of a comma separated string. * This will fill in ret if non-null, otherwise will * return a new lrg */ { char *s = *pS; if (ret == NULL) AllocVar(ret); ret->chrom = sqlStringComma(&s); ret->chromStart = sqlUnsignedComma(&s); ret->chromEnd = sqlUnsignedComma(&s); ret->name = sqlStringComma(&s); ret->score = sqlUnsignedComma(&s); sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand)); ret->thickStart = sqlUnsignedComma(&s); ret->thickEnd = sqlUnsignedComma(&s); ret->reserved = sqlUnsignedComma(&s); ret->blockCount = sqlSignedComma(&s); { int i; s = sqlEatChar(s, '{'); AllocArray(ret->blockSizes, ret->blockCount); for (i=0; i<ret->blockCount; ++i) { ret->blockSizes[i] = sqlSignedComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); } { int i; s = sqlEatChar(s, '{'); AllocArray(ret->chromStarts, ret->blockCount); for (i=0; i<ret->blockCount; ++i) { ret->chromStarts[i] = sqlSignedComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); } ret->mismatches = sqlStringComma(&s); ret->indels = sqlStringComma(&s); ret->lrgSize = sqlUnsignedComma(&s); ret->hgncId = sqlSignedComma(&s); ret->hgncSymbol = sqlStringComma(&s); ret->ncbiAcc = sqlStringComma(&s); ret->lrgSource = sqlStringComma(&s); ret->lrgSourceUrl = sqlStringComma(&s); ret->creationDate = sqlStringComma(&s); *pS = s; return ret; } void lrgFree(struct lrg **pEl) /* Free a single dynamically allocated lrg such as created * with lrgLoad(). */ { struct lrg *el; if ((el = *pEl) == NULL) return; freeMem(el->chrom); freeMem(el->name); freeMem(el->blockSizes); freeMem(el->chromStarts); freeMem(el->mismatches); freeMem(el->indels); freeMem(el->hgncSymbol); freeMem(el->ncbiAcc); freeMem(el->lrgSource); freeMem(el->lrgSourceUrl); freeMem(el->creationDate); freez(pEl); } void lrgFreeList(struct lrg **pList) /* Free a list of dynamically allocated lrg's */ { struct lrg *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; lrgFree(&el); } *pList = NULL; } void lrgOutput(struct lrg *el, FILE *f, char sep, char lastSep) /* Print out lrg. Separate fields with sep. Follow last field with lastSep. */ { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->chrom); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->chromStart); fputc(sep,f); fprintf(f, "%u", el->chromEnd); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->name); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->score); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->strand); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->thickStart); fputc(sep,f); fprintf(f, "%u", el->thickEnd); fputc(sep,f); fprintf(f, "%u", el->reserved); fputc(sep,f); fprintf(f, "%d", el->blockCount); fputc(sep,f); { int i; if (sep == ',') fputc('{',f); for (i=0; i<el->blockCount; ++i) { fprintf(f, "%d", el->blockSizes[i]); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(sep,f); { int i; if (sep == ',') fputc('{',f); for (i=0; i<el->blockCount; ++i) { fprintf(f, "%d", el->chromStarts[i]); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->mismatches); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->indels); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->lrgSize); fputc(sep,f); fprintf(f, "%d", el->hgncId); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->hgncSymbol); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->ncbiAcc); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->lrgSource); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->lrgSourceUrl); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->creationDate); if (sep == ',') fputc('"',f); fputc(lastSep,f); } /* -------------------------------- End autoSql Generated Code -------------------------------- */ #include "psl.h" #include "hdb.h" static struct lrgDiff *lrgDiffNew(uint chromStart, uint chromEnd, char *chromSeq, uint lrgStart, uint lrgEnd, char *lrgSeq) /* Alloc, populate and return a new lrgDiff. */ { struct lrgDiff *diff; AllocVar(diff); diff->chromStart = chromStart; diff->chromEnd = chromEnd; diff->chromSeq = chromSeq; diff->lrgStart = lrgStart; diff->lrgEnd = lrgEnd; diff->lrgSeq = lrgSeq; return diff; } static struct lrgDiff *lrgParseDiffs(struct lrg *lrg, char *diffStr) /* Parse diffStr, i.e. a comma-separated sequence of colon-separated elements. * Each colon-separated element has chromStart, lrgStart, chromSeq and lrgSeq. * Return a list of lrgDiffs. */ { struct lrgDiff *diffList = NULL; struct slName *diffEl, *diffStrList = slNameListFromString(diffStr, ','); for (diffEl = diffStrList; diffEl != NULL; diffEl = diffEl->next) { int expectedCount = 4; char *words[expectedCount+1]; int wordCount = chopByChar(cloneString(diffEl->name), ':', words, ArraySize(words)); if (wordCount != expectedCount) errAbort("lrgParseMismatches: expected %d :-sep words, got %d", expectedCount, wordCount); uint chromStart = atoll(words[0]) + lrg->chromStart; uint lrgStart = atoll(words[1]); char *chromSeq = words[2]; char *lrgSeq = words[3]; uint chromEnd = sameString(chromSeq, "-") ? chromStart : chromStart + strlen(chromSeq); uint lrgEnd = sameString(lrgSeq, "-") ? lrgStart : lrgStart + strlen(lrgSeq); struct lrgDiff *diff = lrgDiffNew(chromStart, chromEnd, chromSeq, lrgStart, lrgEnd, lrgSeq); slAddHead(&diffList, diff); } slNameFreeList(&diffStrList); slReverse(&diffList); return diffList; } struct lrgDiff *lrgParseMismatches(struct lrg *lrg) /* Parse lrg->mismatches and return a list of lrgDiffs. */ { return lrgParseDiffs(lrg, lrg->mismatches); } struct lrgDiff *lrgParseIndels(struct lrg *lrg) /* Parse lrg->indels and return a list of lrgDiffs. */ { return lrgParseDiffs(lrg, lrg->indels); } void lrgDiffFreeList(struct lrgDiff **pDiff) /* Free up a list of parsed diffs. */ { if (pDiff == NULL || *pDiff == NULL) return; struct lrgDiff *diff; for (diff = *pDiff; diff != NULL; diff = diff->next) { freeMem(diff->chromSeq); freeMem(diff->lrgSeq); } slFreeList(pDiff); } static boolean lrgNameParseRange(char *lrgName, int lrgSize, char *buf, size_t bufSize, int *retStart, int *retEnd) /* If lrgName is of the format name:start-end, write the name into buf, set retStart and retEnd * and return TRUE. Otherwise leave inputs alone and return FALSE. */ { if (strchr(lrgName, ':')) { // Parse lrgStart and lrgEnd out of name field (LRG_*:start-end) safecpy(buf, bufSize, lrgName); char *p = strchr(buf, ':'); // Chop at : (after name) *p++ = '\0'; char *num = p; p = strchr(num, '-'); if (p == NULL) errAbort("Error parsing LRG name:start-end '%s' -- no '-' following ':'", lrgName); *p++ = '\0'; *retStart = atoi(num) - 1; num = p; *retEnd = atoi(num); if (*retStart < 0 || *retStart >= *retEnd || *retEnd < 0 || *retEnd > lrgSize) errAbort("lrgNameParseRange: got bad coords (%d, %d) from '%s' size %d", *retStart, *retEnd, lrgName, lrgSize); return TRUE; } return FALSE; } static int calcBlockSize(uint nextTStart, uint nextQStart, struct psl *psl, int blockIx) { int tLen = nextTStart - psl->tStarts[blockIx]; int qLen = nextQStart - psl->qStarts[blockIx]; return min(tLen, qLen); } struct psl *lrgToPsl(struct lrg *lrg, uint chromSize) /* Use lrg's mismatches and indels to make a PSL. */ { struct lrgDiff *mismatches = lrgParseMismatches(lrg), *indels = lrgParseIndels(lrg); int lrgStart = 0, lrgEnd = lrg->lrgSize; char *lrgName = lrg->name; char lrgNameBuf[strlen(lrgName)+1]; if (lrgNameParseRange(lrgName, lrg->lrgSize, lrgNameBuf, sizeof(lrgNameBuf), &lrgStart, &lrgEnd)) lrgName = lrgNameBuf; int blockCount = slCount(indels) + 1; unsigned opts = 0; struct psl *psl = pslNew(lrgName, lrg->lrgSize, lrgStart, lrgEnd, lrg->chrom, chromSize, lrg->chromStart, lrg->chromEnd, lrg->strand, blockCount, opts); psl->blockCount = blockCount; psl->misMatch = slCount(mismatches); boolean isRc = (lrg->strand[0] == '-'); // Translate gap coords from indels into block coords: psl->qStarts[0] = isRc ? (lrg->lrgSize - lrgEnd) : lrgStart; psl->tStarts[0] = lrg->chromStart; int alignedBaseCount = 0; int blockIx = 1; struct lrgDiff *diff; for (diff = indels; diff != NULL; diff = diff->next) { // now we know the size of the previous block: int lrgStartPosStrand = isRc ? (lrg->lrgSize - diff->lrgEnd) : diff->lrgStart; // Insertion on t or q, or both? int lrgLen = diff->lrgEnd - diff->lrgStart; int refLen = diff->chromEnd - diff->chromStart; if (refLen == 0 || lrgLen == 0) // insertion only on t or q { if (refLen == 0) { psl->qNumInsert++; psl->qBaseInsert += (lrgLen - refLen); } else if (lrgLen == 0) { psl->tNumInsert++; psl->tBaseInsert += (refLen - lrgLen); } psl->qStarts[blockIx] = lrgStartPosStrand + lrgLen; psl->tStarts[blockIx] = diff->chromEnd; } else // double sided insertion { psl->tNumInsert++; psl->tBaseInsert += refLen; psl->qNumInsert++; psl->qBaseInsert += lrgLen; // the qStart needs to account for the lrg insertion AND the prev block psl->qStarts[blockIx] = diff->lrgStart + lrgLen; } psl->tStarts[blockIx] = diff->chromEnd; psl->blockSizes[blockIx-1] = calcBlockSize(diff->chromStart, lrgStartPosStrand, psl, blockIx-1); alignedBaseCount += psl->blockSizes[blockIx-1]; blockIx++; } // size of last block: psl->blockSizes[blockIx-1] = calcBlockSize(lrg->chromEnd, lrg->lrgSize, psl, blockIx-1); alignedBaseCount += psl->blockSizes[blockIx-1]; if (blockIx != psl->blockCount) errAbort("lrgToPsl: expected %d blocks but somehow ended up with %d", psl->blockCount, blockIx); psl->match = alignedBaseCount - psl->misMatch; return psl; } static int sumInsertions(struct lrgDiff *indels) /* Return the sum of insertions of lrg into reference sequence, ignoring deletions. * That is the max headroom that we will need while shifting bases around. */ { int sumIns = 0; struct lrgDiff *diff; for (diff = indels; diff != NULL; diff = diff->next) sumIns += (diff->lrgEnd - diff->lrgStart); return sumIns; } struct dnaSeq *lrgReconstructSequence(struct lrg *lrg, char *db) /* Use genomic sequence, lrg->mismatches and lrg->indels to reconstruct LRG sequence */ { struct lrgDiff *diff, *indels = lrgParseIndels(lrg); int refSize = lrg->chromEnd - lrg->chromStart; // Fetch genomic sequence: struct dnaSeq *lrgSeq = hDnaFromSeq(db, lrg->chrom, lrg->chromStart, lrg->chromEnd, dnaUpper); boolean isRc = (lrg->strand[0] == '-'); if (isRc) reverseComplement(lrgSeq->dna, refSize); // If this was only a partial mapping of the LRG, pad N's at the beginning and/or end to get // the correct size. int lrgStart = 0, lrgEnd = lrg->lrgSize; char lrgNameBuf[strlen(lrg->name)+1]; lrgNameParseRange(lrg->name, lrg->lrgSize, lrgNameBuf, sizeof(lrgNameBuf), &lrgStart, &lrgEnd); // Replace lrgSeq->dna with a larger version that has room for genomic sequence plus inserted bases // as well as possible padding at start and end for partial LRG mappings int addedBases = sumInsertions(indels); int paddedSize = refSize + lrgStart + (lrg->lrgSize - lrgEnd) + addedBases + 1; char *lrgSeqDna = needMem(paddedSize); if (lrgStart > 0) memset(lrgSeqDna, 'N', lrgStart); // Copy in the genomic data safencpy(lrgSeqDna+lrgStart, paddedSize-lrgStart, lrgSeq->dna, refSize); refSize += lrgStart; // If the LRG has inserted bases, zero out some extra bytes after genomic seq if (addedBases) zeroBytes(lrgSeqDna+refSize, addedBases); freeMem(lrgSeq->dna); lrgSeq->dna = lrgSeqDna; // Go through indels backwards w.r.t. LRG so we move sequence to the right // while not changing coords to the left: if (!isRc) slReverse(&indels); for (diff = indels; diff != NULL; diff = diff->next) { int lrgLen = diff->lrgEnd - diff->lrgStart; int refLen = diff->chromEnd - diff->chromStart; int refStart = diff->chromStart - lrg->chromStart + lrgStart; if (isRc) refStart = refSize - (refStart + refLen); if (lrgLen > refLen) { // LRG inserts sequence: shift the rest of the sequence to the right int insSize = lrgLen - refLen; int moveSize = refSize + addedBases - refStart - insSize; memmove(lrgSeqDna+refStart+insSize, lrgSeqDna+refStart, moveSize); } else { // LRG deletes sequence: shift the rest of the sequence to the left int delSize = refLen - lrgLen; int moveSize = refSize + addedBases - refStart - delSize + 1; // '\0' at end too memmove(lrgSeqDna+refStart, lrgSeqDna+refStart+delSize, moveSize); } // If there is LRG sequence, copy it in. if (lrgLen > 0) memcpy(lrgSeqDna+refStart, diff->lrgSeq, lrgLen); } int len = strlen(lrgSeqDna); if (len != lrgEnd) errAbort("lrgReconstructSequence: expected sequence length to be lrgEnd %d, got %d", lrgEnd, len); // Now that indels have been resolved, use LRG coords to substitute mismatches: struct lrgDiff *mismatches = lrgParseMismatches(lrg); for (diff = mismatches; diff != NULL; diff = diff->next) { if (lrgSeqDna[diff->lrgStart] != diff->chromSeq[0]) errAbort("Mismatch with unexpected assembly sequence: expected '%c' on %c strand, " "got '%c'", diff->chromSeq[0], lrg->strand[0], lrgSeqDna[diff->lrgStart]); int size = diff->lrgEnd - diff->lrgStart; memcpy(lrgSeqDna+diff->lrgStart, diff->lrgSeq, size); } // Add padding N's at end if applicable if (lrgEnd < lrg->lrgSize) { memset(lrgSeqDna+lrgEnd, 'N', lrg->lrgSize - lrgEnd); lrgSeqDna[lrg->lrgSize] = '\0'; } len = strlen(lrgSeqDna); if (len != lrg->lrgSize) errAbort("lrgReconstructSequence: Error applying LRG indels for '%s': expected size %d, got %d", lrg->name, lrg->lrgSize, len); lrgSeq->size = lrg->lrgSize; return lrgSeq; }