533112afe2a2005e80cdb1f82904ea65032d4302 braney Sat Oct 2 11:37:34 2021 -0700 split hg/lib into two separate libaries, one only used by the cgis diff --git src/hg/cgilib/cutter.c src/hg/cgilib/cutter.c new file mode 100644 index 0000000..8c5425e --- /dev/null +++ src/hg/cgilib/cutter.c @@ -0,0 +1,791 @@ +/* cutter.c was originally generated by the autoSql program, which also + * generated cutter.h and cutter.sql. This module links the database and + * the RAM representation of objects. */ + +/* Copyright (C) 2014 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#include "common.h" +#include "linefile.h" +#include "dystring.h" +#include "jksql.h" +#include "dnaseq.h" +#include "dnautil.h" +#include "bed.h" +#include "cutter.h" + + +struct cutter *cutterLoad(char **row) +/* Load a cutter from row fetched with select * from cutter + * from database. Dispose of this with cutterFree(). */ +{ +struct cutter *ret; + +AllocVar(ret); +ret->numSciz = sqlUnsigned(row[8]); +ret->numCompanies = sqlUnsigned(row[10]); +ret->numRefs = sqlUnsigned(row[12]); +ret->name = cloneString(row[0]); +ret->size = sqlUnsigned(row[1]); +ret->matchSize = sqlUnsigned(row[2]); +ret->seq = cloneString(row[3]); +ret->cut = sqlUnsigned(row[4]); +ret->overhang = sqlSigned(row[5]); +ret->palindromic = sqlUnsigned(row[6]); +ret->semicolon = sqlUnsigned(row[7]); +{ +int sizeOne; +sqlStringDynamicArray(row[9], &ret->scizs, &sizeOne); +assert(sizeOne == ret->numSciz); +} +{ +int sizeOne; +sqlCharDynamicArray(row[11], &ret->companies, &sizeOne); +assert(sizeOne == ret->numCompanies); +} +{ +int sizeOne; +sqlUnsignedDynamicArray(row[13], &ret->refs, &sizeOne); +assert(sizeOne == ret->numRefs); +} +return ret; +} + +struct cutter *cutterLoadAll(char *fileName) +/* Load all cutter from a whitespace-separated file. + * Dispose of this with cutterFreeList(). */ +{ +struct cutter *list = NULL, *el; +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *row[14]; + +while (lineFileRow(lf, row)) + { + el = cutterLoad(row); + slAddHead(&list, el); + } +lineFileClose(&lf); +slReverse(&list); +return list; +} + +struct cutter *cutterLoadAllByChar(char *fileName, char chopper) +/* Load all cutter from a chopper separated file. + * Dispose of this with cutterFreeList(). */ +{ +struct cutter *list = NULL, *el; +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *row[14]; + +while (lineFileNextCharRow(lf, chopper, row, ArraySize(row))) + { + el = cutterLoad(row); + slAddHead(&list, el); + } +lineFileClose(&lf); +slReverse(&list); +return list; +} + +struct cutter *cutterLoadByQuery(struct sqlConnection *conn, char *query) +/* Load all cutter from table that satisfy the query given. + * Where query is of the form 'select * from example where something=something' + * or 'select example.* from example, anotherTable where example.something = + * anotherTable.something'. + * Dispose of this with cutterFreeList(). */ +{ +struct cutter *list = NULL, *el; +struct sqlResult *sr; +char **row; + +sr = sqlGetResult(conn, query); +while ((row = sqlNextRow(sr)) != NULL) + { + el = cutterLoad(row); + slAddHead(&list, el); + } +slReverse(&list); +sqlFreeResult(&sr); +return list; +} + +void cutterSaveToDb(struct sqlConnection *conn, struct cutter *el, char *tableName, int updateSize) +/* Save cutter as a row to the table specified by tableName. + * As blob fields may be arbitrary size updateSize specifies the approx size + * of a string that would contain the entire query. Arrays of native types are + * converted to comma separated strings and loaded as such, User defined types are + * inserted as NULL. Strings are automatically escaped to allow insertion into the database. */ +{ +struct dyString *update = newDyString(updateSize); +char *scizsArray, *companiesArray, *refsArray; +scizsArray = sqlStringArrayToString(el->scizs, el->numSciz); +companiesArray = sqlCharArrayToString(el->companies, el->numCompanies); +refsArray = sqlUnsignedArrayToString(el->refs, el->numRefs); +sqlDyStringPrintf(update, "insert into %s values ( '%s',%u,%u,'%s',%u,%d,%u,%u,%u,'%s',%u,'%s',%u,'%s')", + tableName, el->name, el->size, el->matchSize, el->seq, el->cut, el->overhang, el->palindromic, el->semicolon, el->numSciz, scizsArray , el->numCompanies, companiesArray , el->numRefs, refsArray ); +sqlUpdate(conn, update->string); +freeDyString(&update); +freez(&scizsArray); +freez(&companiesArray); +freez(&refsArray); +} + + +struct cutter *cutterCommaIn(char **pS, struct cutter *ret) +/* Create a cutter out of a comma separated string. + * This will fill in ret if non-null, otherwise will + * return a new cutter */ +{ +char *s = *pS; + +if (ret == NULL) + AllocVar(ret); +ret->name = sqlStringComma(&s); +ret->size = sqlUnsignedComma(&s); +ret->matchSize = sqlUnsignedComma(&s); +ret->seq = sqlStringComma(&s); +ret->cut = sqlUnsignedComma(&s); +ret->overhang = sqlSignedComma(&s); +ret->palindromic = sqlUnsignedComma(&s); +ret->semicolon = sqlUnsignedComma(&s); +ret->numSciz = sqlUnsignedComma(&s); +{ +int i; +s = sqlEatChar(s, '{'); +AllocArray(ret->scizs, ret->numSciz); +for (i=0; i<ret->numSciz; ++i) + { + ret->scizs[i] = sqlStringComma(&s); + } +s = sqlEatChar(s, '}'); +s = sqlEatChar(s, ','); +} +ret->numCompanies = sqlUnsignedComma(&s); +{ +int i; +s = sqlEatChar(s, '{'); +AllocArray(ret->companies, ret->numCompanies); +for (i=0; i<ret->numCompanies; ++i) + { + ret->companies[i] = sqlCharComma(&s); + } +s = sqlEatChar(s, '}'); +s = sqlEatChar(s, ','); +} +ret->numRefs = sqlUnsignedComma(&s); +{ +int i; +s = sqlEatChar(s, '{'); +AllocArray(ret->refs, ret->numRefs); +for (i=0; i<ret->numRefs; ++i) + { + ret->refs[i] = sqlUnsignedComma(&s); + } +s = sqlEatChar(s, '}'); +s = sqlEatChar(s, ','); +} +*pS = s; +return ret; +} + +void cutterFree(struct cutter **pEl) +/* Free a single dynamically allocated cutter such as created + * with cutterLoad(). */ +{ +struct cutter *el; + +if ((el = *pEl) == NULL) return; +freeMem(el->name); +freeMem(el->seq); +/* All strings in scizs are allocated at once, so only need to free first. */ +if (el->scizs != NULL) + freeMem(el->scizs[0]); +freeMem(el->scizs); +freeMem(el->companies); +freeMem(el->refs); +freez(pEl); +} + +void cutterFreeList(struct cutter **pList) +/* Free a list of dynamically allocated cutter's */ +{ +struct cutter *el, *next; + +for (el = *pList; el != NULL; el = next) + { + next = el->next; + cutterFree(&el); + } +*pList = NULL; +} + +void cutterOutput(struct cutter *el, FILE *f, char sep, char lastSep) +/* Print out cutter. Separate fields with sep. Follow last field with lastSep. */ +{ +if (sep == ',') fputc('"',f); +fprintf(f, "%s", el->name); +if (sep == ',') fputc('"',f); +fputc(sep,f); +fprintf(f, "%u", el->size); +fputc(sep,f); +fprintf(f, "%u", el->matchSize); +fputc(sep,f); +if (sep == ',') fputc('"',f); +fprintf(f, "%s", el->seq); +if (sep == ',') fputc('"',f); +fputc(sep,f); +fprintf(f, "%u", el->cut); +fputc(sep,f); +fprintf(f, "%d", el->overhang); +fputc(sep,f); +fprintf(f, "%u", el->palindromic); +fputc(sep,f); +fprintf(f, "%u", el->semicolon); +fputc(sep,f); +fprintf(f, "%u", el->numSciz); +fputc(sep,f); +{ +int i; +if (sep == ',') fputc('{',f); +for (i=0; i<el->numSciz; ++i) + { + if (sep == ',') fputc('"',f); + fprintf(f, "%s", el->scizs[i]); + if (sep == ',') fputc('"',f); + fputc(',', f); + } +if (sep == ',') fputc('}',f); +} +fputc(sep,f); +fprintf(f, "%u", el->numCompanies); +fputc(sep,f); +{ +int i; +if (sep == ',') fputc('{',f); +for (i=0; i<el->numCompanies; ++i) + { + if (sep == ',') fputc('"',f); + fprintf(f, "%c", el->companies[i]); + if (sep == ',') fputc('"',f); + fputc(',', f); + } +if (sep == ',') fputc('}',f); +} +fputc(sep,f); +fprintf(f, "%u", el->numRefs); +fputc(sep,f); +{ +int i; +if (sep == ',') fputc('{',f); +for (i=0; i<el->numRefs; ++i) + { + fprintf(f, "%u", el->refs[i]); + fputc(',', f); + } +if (sep == ',') fputc('}',f); +} +fputc(lastSep,f); +} + +/* -------------------------------- End autoSql Generated Code -------------------------------- */ + +boolean isIsosciz(struct cutter *mainEnz, struct cutter *possibleScizEnz) +/* Is possibleScizEnz an isoscizomer of mainEnz? */ +{ +int i; +for (i = 0; i < mainEnz->numSciz; i++) + if (sameString(possibleScizEnz->name, mainEnz->scizs[i])) + return TRUE; +return FALSE; +} + +struct dnaSeq *stickyEnd(struct cutter *enz) +/* Return the sticky end sequence of an enzyme. Strand is unspecified. Free this. */ +{ +struct dnaSeq *ret = NULL; +if (!enz) + return NULL; +if (enz->overhang > 0) + { + char *seq = cloneStringZ(enz->seq + enz->cut, enz->overhang); + ret = newDnaSeq(seq, enz->overhang, "sticky"); + } +else if (enz->overhang < 0) + { + int size = intAbs(enz->overhang); + char *seq = cloneStringZ(enz->seq + enz->cut - size, size); + complement(seq, strlen(seq)); + ret = newDnaSeq(seq, strlen(seq), "sticky"); + } +return ret; +} + +int acgtCount(char *seq) +/* Return the count of canonical bases in a sequence. */ +{ +return countChars(seq,'A') + countChars(seq,'C') + + countChars(seq,'G') + countChars(seq,'T'); +} + +boolean sameStickyEnd(struct cutter *enz1, struct cutter *enz2) +/* Check to see if two enzymes make the same sticky ends. If either of the + enzymes have sticky ends that isn't all ACGT, then this returns false. */ +{ +boolean ret = FALSE; +struct dnaSeq *sticky1 = stickyEnd(enz1); +struct dnaSeq *sticky2 = stickyEnd(enz2); + +if (sticky1 && sticky2) +if (sticky1 && sticky2 && (sticky1->size == sticky2->size) && + (acgtCount(sticky1->dna) == sticky1->size) && (acgtCount(sticky2->dna) == sticky2->size)) + { + if (sameString(sticky1->dna, sticky2->dna)) + ret = TRUE; + else + { + reverseComplement(sticky2->dna, sticky2->size); + if (sameString(sticky1->dna, sticky2->dna)) + ret = TRUE; + } + } +freeDnaSeq(&sticky1); +freeDnaSeq(&sticky2); +return ret; +} + +struct slName *findIsoligamers(struct cutter *enz, struct cutter *enzList) +/* Find isoligamers to an enzyme in a list of enzymes. */ +{ +struct slName *list = NULL; +struct cutter *cur; +if (!enz || !enzList) + return NULL; +for (cur = enzList; cur != NULL; cur = cur->next) + { + if (!sameString(cur->name, enz->name) && (sameStickyEnd(enz, cur))) + { + struct slName *newname = newSlName(cur->name); + slAddHead(&list, newname); + } + } +slReverse(&list); +return list; +} + +boolean isPalindrome(char *seq) +/* Determine if a sequence is palindromic. */ +{ +int i, j, size = strlen(seq); +for (i = 0; i < size; i++) + switch (seq[i]) + { + case 'H': + case 'B': + case 'V': + case 'D': return FALSE; + } +/* Loop from both ends of the sequence: i from the left, j from the right. */ +for (i = 0, j = size-1; (i < size/2) && (j >= ((size%2==1) ? size/2+1 : size/2)); i++, j--) + { + switch (seq[i]) + { + case 'A': + { + if (seq[j] != 'T') + return FALSE; + break; + } + case 'C': + { + if (seq[j] != 'G') + return FALSE; + break; + } + case 'G': + { + if (seq[j] != 'C') + return FALSE; + break; + } + case 'T': + { + if (seq[j] != 'A') + return FALSE; + break; + } + case 'W': + { + if (seq[j] != 'S') + return FALSE; + break; + } + case 'S': + { + if (seq[j] != 'S') + return FALSE; + break; + } + case 'Y': + { + if (seq[j] != 'R') + return FALSE; + break; + } + case 'R': + { + if (seq[j] != 'Y') + return FALSE; + break; + } + case 'M': + { + if (seq[j] != 'K') + return FALSE; + break; + } + case 'K': + { + if (seq[j] != 'M') + return FALSE; + break; + } + case 'N': + { + if (seq[j] != 'N') + return FALSE; + break; + } + } + } +return TRUE; +} + +struct cutter *readGcg(char *gcgFile) +/* Parse a GCG file and load it into cutter format. */ +{ +struct lineFile *lf = lineFileOpen(gcgFile,TRUE); +struct cutter *enzList = NULL; +char *line = "whatever", *words[10], numWords; + +/* Skip to the right line. */ +while (lineFileNext(lf,&line,NULL) && !startsWith("..",line)); +/* */ +while ((numWords=lineFileChop(lf,words))) + { + struct cutter *newone = NULL; + int comIx = (numWords==7) ? 5 : 6; + int refIx = (numWords==7) ? 6 : 7; + int i; + char *items[100]; + + /* Skip ones */ + if (words[4][0] == '?') + continue; + AllocVar(newone); + newone->semicolon = (words[0][0] == ';') ? TRUE : FALSE; + /* Deal with the first few columns */ + if (!isdigit(words[1][0])) + errAbort("Error: expecting a number in cut site column on line %d\n", lf->lineIx+1); + if (!isdigit(words[3][0]) && words[3][0]!='-') + errAbort("Error: expecting a number in the overhang column on line %d\n", lf->lineIx+1); + if (words[comIx][0] != '>') + errAbort("Error: expecting a \'>\' in the commercial sources column of line %d\n", lf->lineIx+1); + newone->name = (words[0][0] == ';') ? cloneString(words[0]+1) : cloneString(words[0]); + newone->cut = atoi(words[1]); + newone->seq = cloneString(words[2]); + touppers(newone->seq); + stripChar(newone->seq,'\''); + stripChar(newone->seq,'_'); + newone->size = strlen(newone->seq); + newone->matchSize = newone->size - countChars(newone->seq, 'N'); + newone->palindromic = isPalindrome(newone->seq); + newone->overhang = atoi(words[3]); + newone->numCompanies = strlen(words[comIx]+1); + if (newone->numCompanies > 0) + newone->companies = cloneMem(words[comIx]+1, newone->numCompanies*sizeof(char)); + newone->numRefs = chopString(words[refIx], ",", items, ArraySize(items)); + AllocArray(newone->refs, newone->numRefs); + for (i = 0; i < newone->numRefs; i++) + { + if (i == 100) + errAbort("Error: Andy didn't make the array for holding references big enough\n"); + if (!isdigit(items[i][0])) + errAbort("Error: expecting number in references column in line %d\n", lf->lineIx+1); + newone->refs[i] = atoi(items[i]); + } + /* Deal with isoscizomers. */ + if (numWords == 8) + { + newone->numSciz = chopString(words[5], ",", items, ArraySize(items)); + AllocArray(newone->scizs, newone->numSciz*sizeof(int)); + for (i = 0; i < newone->numSciz; i++) + { + if (i == 100) + errAbort("Error: Andy didn't make the array for having isoscizomers big enough\n"); + newone->scizs[i] = cloneString(items[i]); + } + } + else + newone->numSciz = 0; + slAddHead(&enzList, newone); + } +slReverse(&enzList); +lineFileClose(&lf); +return enzList; +} + +boolean matchingBase(char enzBase, char seqBase) +/* An enzyme matching */ +{ +switch (seqBase) + { + /* Ignore hard-masked and gaps for now. */ + case 'A': + { + switch (enzBase) + { + case 'A': + case 'R': + case 'M': + case 'W': + case 'H': + case 'V': + case 'D': + case 'N': return TRUE; + default: return FALSE; + } + } + case 'C': + { + switch (enzBase) + { + case 'C': + case 'Y': + case 'M': + case 'S': + case 'H': + case 'B': + case 'V': + case 'N': return TRUE; + default: return FALSE; + } + } + case 'G': + { + switch (enzBase) + { + case 'G': + case 'R': + case 'K': + case 'S': + case 'B': + case 'V': + case 'D': + case 'N': return TRUE; + default: return FALSE; + } + } + case 'T': + { + switch (enzBase) + { + case 'T': + case 'Y': + case 'K': + case 'W': + case 'H': + case 'B': + case 'D': + case 'N': return TRUE; + default: return FALSE; + } + } + } +return FALSE; +} + +struct bed *allocBedEnz(struct cutter *enz, char *seqName, int seqPos, char strand) +{ +struct bed *newbed = NULL; +AllocVar(newbed); +newbed->chrom = cloneString(seqName); +newbed->chromStart = (strand == '-') ? seqPos - enz->size : seqPos; +newbed->chromEnd = (strand == '-') ? seqPos : seqPos + enz->size; +newbed->name = cloneString(enz->name); +newbed->score = 1000; +newbed->strand[0] = strand; +return newbed; +} + +struct bed *searchStrand(struct hash *sixers, struct cutter *ACGTo[], struct dnaSeq *seq, int startOffset, char strand) +/* Cheesey function that checks a strand for the enzymes after they're put in the hash/array structures. + This used to be a part of the matchEnzymes function but I do it twice now. */ +{ +struct cutter *enz; +struct bed *bedList = NULL; +int seqPos; + +if (ACGTo[0] || ACGTo[1] || ACGTo[2] || ACGTo[3] || (sixers->elCount > 0)) + { + for (seqPos = 0; seqPos < seq->size; seqPos++) + { + struct cutter *enzList = NULL; + char sixer[7]; + int bedPos = (strand == '-') ? (seq->size - seqPos) : seqPos; + if (seq->size - seqPos >= 6) + { + struct hashEl *el = NULL; + sixer[6] = '\0'; + memcpy(sixer, seq->dna+seqPos, 6); + el = hashLookup(sixers, sixer); + if (el) + { + struct bed *add; + enz = el->val; + add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand); + slAddHead(&bedList, add); + /* Just in case there's another one with the same sequence. */ + while ((el = hashLookupNext(el))) + { + enz = el->val; + add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand); + slAddHead(&bedList, add); + } + } + } + /* Use a certain list depending on which letter we're on in the sequence. */ + if (seq->dna[seqPos] == 'A') + enzList = ACGTo[0]; + else if (seq->dna[seqPos] == 'C') + enzList = ACGTo[1]; + else if (seq->dna[seqPos] == 'G') + enzList = ACGTo[2]; + else if (seq->dna[seqPos] == 'T') + enzList = ACGTo[3]; + for (enz = enzList; enz != NULL; enz = enz->next) + { + int enzPos = 0; + int seqCurPos = seqPos; + while (enzPos < enz->size && seqCurPos < seq->size && matchingBase(enz->seq[enzPos],seq->dna[seqCurPos])) + { + enzPos++; + seqCurPos++; + } + if (enzPos == enz->size) + { + struct bed *add = allocBedEnz(enz, seq->name, bedPos + startOffset, strand); + slAddHead(&bedList, add); + } + } + } + } +return bedList; +} + +struct bed *matchEnzymes(struct cutter *cutters, struct dnaSeq *seq, int startOffset) +/* Match the enzymes to sequence and return a bed list in all cases. */ +{ +struct hash *sixers = newHash(8), *palinSixers = newHash(8); +struct cutter *enz; +struct cutter *ACGTo[5], *palinACGTo[5]; +struct bed *bedList = NULL, *tmp; +int i; +if (!cutters) + return NULL; +for (i = 0; i < 5; i++) + ACGTo[i] = palinACGTo[i] = NULL; +/* Put each of the enzymes in either a hash table of six-cutters or */ +enz = cutters; +while (enz != NULL) + { + int acgtCount = 0; + struct cutter *next = enz->next; + acgtCount = countChars(enz->seq,'A') + countChars(enz->seq,'C') + + countChars(enz->seq,'G') + countChars(enz->seq,'T'); + /* Super dumb coding here but it's quick. */ + if (enz->palindromic) + { + if (enz->size==6 && acgtCount==6) + hashAdd(palinSixers, enz->seq, enz); + else + { + if (enz->seq[0] == 'A') + slAddHead(&palinACGTo[0], enz); + else if (enz->seq[0] == 'C') + slAddHead(&palinACGTo[1], enz); + else if (enz->seq[0] == 'G') + slAddHead(&palinACGTo[2], enz); + else if (enz->seq[0] == 'T') + slAddHead(&palinACGTo[3], enz); + else + { + slAddHead(&palinACGTo[4], enz); + } + } + } + else + { + if (enz->size==6 && acgtCount==6) + hashAdd(sixers, enz->seq, enz); + else + { + if (enz->seq[0] == 'A') + slAddHead(&ACGTo[0], enz); + else if (enz->seq[0] == 'C') + slAddHead(&ACGTo[1], enz); + else if (enz->seq[0] == 'G') + slAddHead(&ACGTo[2], enz); + else if (enz->seq[0] == 'T') + slAddHead(&ACGTo[3], enz); + else + slAddHead(&ACGTo[4], enz); + } + } + enz = next; + } +/* At this point we got a hash for the palindromes and non-palindromic six-cutters, + plus an array for each too. The array is set up so the enzymes starting with 'A' go into [0], 'C' + into [1], 'G' into [2], 'T' into [3], and other bases into [4]. */ +if (ACGTo[4]) + { + ACGTo[0] = slCat(ACGTo[0], ACGTo[4]); + ACGTo[1] = slCat(ACGTo[1], ACGTo[4]); + ACGTo[2] = slCat(ACGTo[2], ACGTo[4]); + ACGTo[3] = slCat(ACGTo[3], ACGTo[4]); + } +if (palinACGTo[4]) + { + palinACGTo[0] = slCat(palinACGTo[0], palinACGTo[4]); + palinACGTo[1] = slCat(palinACGTo[1], palinACGTo[4]); + palinACGTo[2] = slCat(palinACGTo[2], palinACGTo[4]); + palinACGTo[3] = slCat(palinACGTo[3], palinACGTo[4]); + } +/* Search the DNA in three ways: on the plus strand for both palindromes and nonpalindromes, and then + just nonpalindromes on the minus strand. */ +bedList = searchStrand(palinSixers, palinACGTo, seq, startOffset, '+'); +tmp = searchStrand(sixers, ACGTo, seq, startOffset, '+'); +bedList = slCat(bedList, tmp); +reverseComplement(seq->dna, seq->size); +tmp = searchStrand(sixers, ACGTo, seq, startOffset, '-'); +bedList = slCat(bedList, tmp); +if (bedList) + slReverse(&bedList); +return bedList; +} + +void cullCutters(struct cutter **enzList, boolean semicolon, struct slName *justThese, int matchSize) +/* Reduce the list of enzymes based on different options. */ +{ +struct cutter *enz = *enzList, *next; +while (enz != NULL) + { + next = enz->next; + if ((justThese && !slNameInList(justThese, enz->name)) || + (enz->matchSize < matchSize) || (!semicolon && enz->semicolon)) + { + slRemoveEl(enzList, enz); + cutterFree(&enz); + } + enz = next; + } +}