cff41d740a8e5ee4bbc5a3f4385b7e939f42841f chmalee Tue Apr 18 15:54:01 2023 -0700 Put the forward and reverse hgPcr primers into the qName of the psl so when we have multiple results, we are sure to show the primer pair for the right psl, refs #30925 diff --git src/jkOwnLib/gfPcrLib.c src/jkOwnLib/gfPcrLib.c index 1b0773a..3d20057 100644 --- src/jkOwnLib/gfPcrLib.c +++ src/jkOwnLib/gfPcrLib.c @@ -1,575 +1,580 @@ /* gfPcrLib - Routines to help do in silico PCR. * Copyright 2004-2005 Jim Kent. All rights reserved. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dystring.h" #include "fa.h" #include "net.h" #include "genoFind.h" #include "sqlNum.h" #include "gfInternal.h" #include "gfPcrLib.h" /**** Input and Output Handlers *****/ void gfPcrOutputFree(struct gfPcrOutput **pOut) /* Free up a gfPcrOutput structure. */ { struct gfPcrOutput *out = *pOut; if (pOut != NULL) { freeMem(out->name); freeMem(out->fPrimer); freeMem(out->rPrimer); freeMem(out->seqName); freeMem(out->dna); freez(pOut); } } void gfPcrOutputFreeList(struct gfPcrOutput **pList) /* Free up a list of gfPcrOutput structures. */ { struct gfPcrOutput *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; gfPcrOutputFree(&el); } *pList = NULL; } void gfPcrInputStaticLoad(char **row, struct gfPcrInput *ret) /* Load a row from gfPcrInput table into ret. The contents of ret will * be replaced at the next call to this function. */ { ret->name = row[0]; ret->fPrimer = row[1]; ret->rPrimer = row[2]; } struct gfPcrInput *gfPcrInputLoad(char **row) /* Load a gfPcrInput from row fetched with select * from gfPcrInput * from database. Dispose of this with gfPcrInputFree(). */ { struct gfPcrInput *ret; AllocVar(ret); ret->name = cloneString(row[0]); ret->fPrimer = cloneString(row[1]); ret->rPrimer = cloneString(row[2]); return ret; } struct gfPcrInput *gfPcrInputLoadAll(char *fileName) /* Load all gfPcrInput from a whitespace-separated file. * Dispose of this with gfPcrInputFreeList(). */ { struct gfPcrInput *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[3]; while (lineFileRow(lf, row)) { el = gfPcrInputLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } void gfPcrInputFree(struct gfPcrInput **pEl) /* Free a single dynamically allocated gfPcrInput such as created * with gfPcrInputLoad(). */ { struct gfPcrInput *el; if ((el = *pEl) == NULL) return; freeMem(el->name); freeMem(el->fPrimer); freeMem(el->rPrimer); freez(pEl); } void gfPcrInputFreeList(struct gfPcrInput **pList) /* Free a list of dynamically allocated gfPcrInput's */ { struct gfPcrInput *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; gfPcrInputFree(&el); } *pList = NULL; } /**** Handle Refinement (after Index has given us approximate position) *****/ static boolean goodMatch(char *a, char *b, int size) /* Return TRUE if there are 2 matches per mismatch. */ { int score = 0, i; for (i=0; i<size; ++i) { if (a[i] == b[i]) score += 1; else score -= 2; } return score >= 0; } static void upperMatch(char *dna, char *primer, int size) /* Uppercase DNA where it matches primer. */ { int i; for (i=0; i<size; ++i) { if (dna[i] == primer[i]) dna[i] = toupper(dna[i]); } } static void outputFa(struct gfPcrOutput *out, FILE *f, char *url) /* Output match in fasta format. */ { int fPrimerSize = strlen(out->fPrimer); int rPrimerSize = strlen(out->rPrimer); int productSize = out->rPos - out->fPos; char *dna = cloneStringZ(out->dna, productSize); char *rrPrimer = cloneString(out->rPrimer); char *ffPrimer = cloneString(out->fPrimer); struct dyString *faLabel = dyStringNew(0); char *name = out->name; /* Create fasta header with position, possibly empty name, and upper cased primers with position optionally hyperlinked. */ touppers(rrPrimer); touppers(ffPrimer); if (url != NULL) { dyStringAppend(faLabel, "<A HREF=\""); dyStringPrintf(faLabel, url, out->seqName, out->fPos+1, out->rPos); dyStringAppend(faLabel, "\">"); } dyStringPrintf(faLabel, "%s:%d%c%d", out->seqName, out->fPos+1, out->strand, out->rPos); if (url != NULL) dyStringAppend(faLabel, "</A>"); if (name != NULL) dyStringPrintf(faLabel, " %s", name); dyStringPrintf(faLabel, " %dbp %s %s", out->rPos - out->fPos, ffPrimer, rrPrimer); /* Flip reverse primer to be in same direction and case as sequence. */ reverseComplement(rrPrimer, rPrimerSize); tolowers(rrPrimer); /* Capitalize where sequence and primer match, and write out sequence. */ upperMatch(dna, out->fPrimer, fPrimerSize); upperMatch(dna + productSize - rPrimerSize, rrPrimer, rPrimerSize); faWriteNext(f, faLabel->string, dna, productSize); /* Clean up. */ freez(&dna); freez(&rrPrimer); freez(&ffPrimer); dyStringFree(&faLabel); } static int countMatch(char *a, char *b, int size) /* Count number of letters in a, b that match */ { int count = 0, i; for (i=0; i<size; ++i) if (a[i] == b[i]) ++count; return count; } static int getBedScore(struct gfPcrOutput *out) /* Return a score in BED range (0-1000). */ { int size = out->rPos - out->fPos; int fPrimerSize = strlen(out->fPrimer); int rPrimerSize = strlen(out->rPrimer); int match = countMatch(out->dna, out->fPrimer, fPrimerSize); assert(size > 0); reverseComplement(out->rPrimer, rPrimerSize); match += countMatch(out->dna + size - rPrimerSize, out->rPrimer, rPrimerSize); reverseComplement(out->rPrimer, rPrimerSize); return round(1000.0 * match / (double)(fPrimerSize + rPrimerSize)); } static void outputBed(struct gfPcrOutput *out, FILE *f, char *url) /* Output match in BED 6 format. */ { char *name = out->name; if (name == NULL) name = "n/a"; fprintf(f, "%s\t%d\t%d\t", out->seqName, out->fPos, out->rPos); fprintf(f, "%s\t", name); fprintf(f, "%d\t", getBedScore(out)); fprintf(f, "%c\n", out->strand); } static void outputBed12(struct gfPcrOutput *out, FILE *f, char *url) /* Output match in BED 12 format (a block for each primer). */ { int fPrimerSize = strlen(out->fPrimer); int rPrimerSize = strlen(out->rPrimer); char *name = out->name; if (name == NULL) name = "n/a"; fprintf(f, "%s\t%d\t%d\t%s\t", out->seqName, out->fPos, out->rPos, name); fprintf(f, "%d\t%c\t", getBedScore(out), out->strand); fprintf(f, "%d\t%d\t", out->fPos, out->rPos); fprintf(f, "0\t2\t%d,%d,\t0,%d\n", fPrimerSize, rPrimerSize, out->rPos - out->fPos - rPrimerSize); } static void outputPsl(struct gfPcrOutput *out, FILE *f, char *url) /* Output match in PSL format. */ { int match; int size = out->rPos - out->fPos; int fPrimerSize = strlen(out->fPrimer); int rPrimerSize = strlen(out->rPrimer); int bothSize = fPrimerSize + rPrimerSize; int gapSize = size - bothSize; char *name = out->name; -if (name == NULL) name = "n/a"; +if (name == NULL) + { + struct dyString *dy = dyStringNew(0); + dyStringPrintf(dy, "%s_%s", out->fPrimer, out->rPrimer); + name = dyStringCannibalize(&dy);; + } match = countMatch(out->dna, out->fPrimer, fPrimerSize); reverseComplement(out->rPrimer, rPrimerSize); assert(size > 0); match += countMatch(out->dna + size - rPrimerSize, out->rPrimer, rPrimerSize); reverseComplement(out->rPrimer, rPrimerSize); fprintf(f, "%d\t", match); fprintf(f, "%d\t", bothSize - match); fprintf(f, "0\t0\t"); /* repMatch, nCount. */ fprintf(f, "1\t%d\t", gapSize); /* qNumInsert, qBaseInsert */ fprintf(f, "1\t%d\t", gapSize); /* tNumInsert, tBaseInsert */ fprintf(f, "%c\t", out->strand); fprintf(f, "%s\t", name); fprintf(f, "%d\t", size); fprintf(f, "0\t%d\t", size); /* qStart, qEnd */ fprintf(f, "%s\t%d\t", out->seqName, out->seqSize); fprintf(f, "%d\t%d\t", out->fPos, out->rPos); fprintf(f, "2\t"); if (out->strand == '+') { fprintf(f, "%d,%d,\t", fPrimerSize, rPrimerSize); fprintf(f, "%d,%d,\t", 0,size - rPrimerSize); fprintf(f, "%d,%d,\n", out->fPos, out->rPos - rPrimerSize); } else { fprintf(f, "%d,%d,\t", rPrimerSize, fPrimerSize); fprintf(f, "%d,%d,\t", 0,size - fPrimerSize); fprintf(f, "%d,%d,\n", out->fPos, out->rPos - fPrimerSize); } } typedef void (*outFunction)(struct gfPcrOutput *out, FILE *f, char *url) ; static outFunction gfPcrOutputFunction(char *outType) /* Return a pointer to output function for the given output type. */ { outFunction output = NULL; if (sameWord(outType, "fa")) output = outputFa; else if (sameWord(outType, "bed")) output = outputBed; else if (sameWord(outType, "bed12")) output = outputBed12; else if (sameWord(outType, "psl")) output = outputPsl; else errAbort("Unrecognized pcr output type %s", outType); return output; } void gfPcrOutputWriteList(struct gfPcrOutput *outList, char *outType, char *url, FILE *f) /* Write list of outputs in specified format (either "fa" or "bed") * to file. If url is non-null it should be a printf formatted * string that takes %s, %d, %d for chromosome, start, end. */ { outFunction output = gfPcrOutputFunction(outType); struct gfPcrOutput *out; for (out = outList; out != NULL; out = out->next) { output(out, f, url); } } void gfPcrOutputWriteOne(struct gfPcrOutput *out, char *outType, char *url, FILE *f) /* Write a single output in specified format (either "fa" or "bed") * to file. If url is non-null it should be a printf formatted * string that takes %s, %d, %d for chromosome, start, end. */ { outFunction output = gfPcrOutputFunction(outType); output(out, f, url); } void gfPcrOutputWriteAll(struct gfPcrOutput *outList, char *outType, char *url, char *fileName) /* Create file of outputs in specified format (either "fa" or "bed") * to file. If url is non-null it should be a printf formatted * string that takes %s, %d, %d for chromosome, start, end. */ { FILE *f = mustOpen(fileName, "a"); gfPcrOutputWriteList(outList, outType, url, f); carefulClose(&f); } static void pcrLocalStrand(char *pcrName, struct dnaSeq *seq, int seqOffset, char *seqName, int seqSize, int maxSize, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize, int minPerfect, int minGood, char strand, struct gfPcrOutput **pOutList) /* Do detailed PCR scan on one strand and report results. */ { char *fDna = seq->dna, *rDna; char *endDna = fDna + seq->size; char *fpPerfect = fPrimer + fPrimerSize - minPerfect; char *rpPerfect = rPrimer; char *fpMatch, *rpMatch; int goodSize = minGood - minPerfect; struct gfPcrOutput *out; int matchSize; reverseComplement(rPrimer, rPrimerSize); for (;;) { fpMatch = memMatch(fpPerfect, minPerfect, fDna, endDna - fDna); if (fpMatch == NULL) break; rDna = fpMatch; for (;;) { int fPos, rPos, fGoodPos, rGoodPos, fTrim, rTrim; rpMatch = memMatch(rpPerfect, minPerfect, rDna, endDna - rDna); if (rpMatch == NULL) break; fPos = fpMatch - (fPrimerSize - minPerfect) - seq->dna; rPos = rpMatch + rPrimerSize - seq->dna; /* deal with primers dangling off either end of the target sequence */ if (fPos < 0) { fTrim = 0 - fPos; fPos = 0; } else fTrim = 0; if (rPos > seq->size) { rTrim = rPos - seq->size; rPos = seq->size; } else rTrim = 0; fGoodPos = fpMatch - goodSize - seq->dna; rGoodPos = rpMatch + minPerfect - seq->dna; fGoodPos = max(fGoodPos, 0); rGoodPos = min(rGoodPos, seq->size); matchSize = rPos - fPos; int fGoodSize = fpMatch - (seq->dna + fGoodPos); int rGoodSize = rPos - rGoodPos; if (rGoodSize >= goodSize && fGoodSize >= goodSize) { if (rPos >= 0 && fPos >= 0 && fPos < seq->size && matchSize <= maxSize) { /* If matches well enough create output record. */ if (goodMatch(seq->dna + fGoodPos, fpPerfect - fGoodSize, fGoodSize) && goodMatch(seq->dna + rGoodPos, rpPerfect + minPerfect, rGoodSize)) { /* Truncate the copy of the primers going into the out-> record using rTrim and fTrim if needed. */ AllocVar(out); out->name = cloneString(pcrName); out->fPrimer = cloneString(fPrimer + fTrim); out->rPrimer = cloneStringZ(rPrimer, rPrimerSize - rTrim); reverseComplement(out->rPrimer, rPrimerSize - rTrim); out->seqName = cloneString(seqName); out->seqSize = seqSize; out->fPos = fPos + seqOffset; out->rPos = rPos + seqOffset; out->strand = strand; out->dna = cloneStringZ(seq->dna + fPos, matchSize); /* Dealing with the strand of darkness.... Here we just have to swap * forward and reverse primers to flip strands, and reverse complement * the amplified area.. */ if (strand == '-') { char *temp = out->rPrimer; out->rPrimer = out->fPrimer; out->fPrimer = temp; reverseComplement(out->dna, matchSize); } slAddHead(pOutList, out); } } } rDna = rpMatch+1; } fDna = fpMatch + 1; } reverseComplement(rPrimer, rPrimerSize); } void gfPcrLocal(char *pcrName, struct dnaSeq *seq, int seqOffset, char *seqName, int seqSize, int maxSize, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize, int minPerfect, int minGood, char strand, struct gfPcrOutput **pOutList) /* Do detailed PCR scan on DNA already loaded into memory and put results * (in reverse order) on *pOutList. */ { /* For PCR primers reversing search strand just means switching * order of primers. */ if (strand == '-') pcrLocalStrand(pcrName, seq, seqOffset, seqName, seqSize, maxSize, rPrimer, rPrimerSize, fPrimer, fPrimerSize, minPerfect, minGood, strand, pOutList); else pcrLocalStrand(pcrName, seq, seqOffset, seqName, seqSize, maxSize, fPrimer, fPrimerSize, rPrimer, rPrimerSize, minPerfect, minGood, strand, pOutList); } struct gfRange *gfPcrGetRanges(struct gfConnection *conn, char *fPrimer, char *rPrimer, int maxSize) /* Query gfServer with primers and convert response to a list of gfRanges. */ { char buf[4096]; struct gfRange *rangeList = NULL, *range; /* Query server and put results into rangeList. */ gfBeginRequest(conn); if (conn->isDynamic) safef(buf, sizeof(buf), "%spcr %s %s %s %s %d", gfSignature(), conn->genome, conn->genomeDataDir, fPrimer, rPrimer, maxSize); else safef(buf, sizeof(buf), "%spcr %s %s %d", gfSignature(), fPrimer, rPrimer, maxSize); mustWriteFd(conn->fd, buf, strlen(buf)); for (;;) { if (netGetString(conn->fd, buf) == NULL) break; if (sameString(buf, "end")) break; else if (startsWith("Error:", buf)) errAbort("%s", buf); else { char *s = buf; char *name, *start, *end, *strand; name = nextWord(&s); start = nextWord(&s); end = nextWord(&s); strand = nextWord(&s); if (strand == NULL) errAbort("Truncated gfServer response"); AllocVar(range); range->tName = cloneString(name); range->tStart = atoi(start); range->tEnd = atoi(end); range->tStrand = strand[0]; slAddHead(&rangeList, range); } } gfEndRequest(conn); slReverse(&rangeList); return rangeList; } static void gfPcrOneViaNet( struct gfConnection *conn, char *seqDir, char *pcrName, char *fPrimer, char *rPrimer, int maxSize, int minPerfect, int minGood, struct hash *tFileCache, struct gfPcrOutput **pOutList) /* Query gfServer for likely primer hits, load DNA to do detailed * examination, and output hits to head of *pOutList.. */ { struct gfRange *rangeList = NULL, *range; int fPrimerSize = strlen(fPrimer); int rPrimerSize = strlen(rPrimer); int maxPrimerSize = max(fPrimerSize, rPrimerSize); int minPrimerSize = min(fPrimerSize, rPrimerSize); tolowers(fPrimer); tolowers(rPrimer); /* Check for obvious user mistake. */ if (minPrimerSize < minGood || minPrimerSize < minPerfect) errAbort("Need longer primer in pair %s (%dbp) %s (%dbp). Minimum size is %d\n", fPrimer, fPrimerSize, rPrimer, rPrimerSize, minGood); /* Load ranges and do more detailed snooping. */ rangeList = gfPcrGetRanges(conn, fPrimer, rPrimer, maxSize); for (range = rangeList; range != NULL; range = range->next) { int tSeqSize; char seqName[PATH_LEN]; struct dnaSeq *seq = gfiExpandAndLoadCached(range, tFileCache, seqDir, 0, &tSeqSize, FALSE, FALSE, maxPrimerSize); gfiGetSeqName(range->tName, seqName, NULL); gfPcrLocal(pcrName, seq, range->tStart, seqName, tSeqSize, maxSize, fPrimer, fPrimerSize, rPrimer, rPrimerSize, minPerfect, minGood, range->tStrand, pOutList); dnaSeqFree(&seq); } gfRangeFreeList(&rangeList); } struct gfPcrOutput *gfPcrViaNet(struct gfConnection *conn, char *seqDir, struct gfPcrInput *inList, int maxSize, int minPerfect, int minGood) /* Do PCRs using gfServer index, returning list of results. */ { struct hash *tFileCache = gfFileCacheNew(); struct gfPcrInput *in; struct gfPcrOutput *outList = NULL; for (in = inList; in != NULL; in = in->next) { gfPcrOneViaNet(conn, seqDir, in->name, in->fPrimer, in->rPrimer, maxSize, minPerfect, minGood, tFileCache, &outList); } gfFileCacheFree(&tFileCache); slReverse(&outList); return outList; } char *gfPcrMakePrimer(char *s) /* Make primer (lowercased DNA) out of text. Complain if * it is too short or too long. */ { int size = dnaFilteredSize(s); int realSize; char *primer = needMem(size+1); dnaFilter(s, primer); realSize = size - countChars(primer, 'n'); if (realSize < 10 || realSize < size/2) errAbort("%s does not seem to be a good primer", s); return primer; }