44ccfacbe3a3d4b300f80d48651c77837a4b571e galt Tue Apr 26 11:12:02 2022 -0700 SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for SQL correct use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix. diff --git src/hg/visiGene/vgProbeTrack/vgProbeTrack.c src/hg/visiGene/vgProbeTrack/vgProbeTrack.c index bba913a..883e2e8 100644 --- src/hg/visiGene/vgProbeTrack/vgProbeTrack.c +++ src/hg/visiGene/vgProbeTrack/vgProbeTrack.c @@ -1,1677 +1,1687 @@ /* Copyright (C) 2013 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ /* vgProbeTrack - build vgPrb% permanent id tables in visiGene database, * also updates $db.vg{All}Probes tables * * This finds sequence for all probes, using the best method available, * which can be given in probe.seq, using the primers to * fetch the probe using isPcr, or using accession or gene to fetch sequence. * * When all the sequences are availabe and ready, then blat is run to * create psl alignments for use in probe tracks which may also be * mapped over to human orthologous position using either chains with pslMap (mouse) * or blatz (frog). * */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "obscure.h" #include "ra.h" #include "jksql.h" #include "dystring.h" #include "portable.h" #include "fa.h" #include "hdb.h" #include "vgPrb.h" /* Variables you can override from command line. */ char *database = "visiGene"; char *sqlPath = "."; void usage() /* Explain usage and exit. */ { errAbort( "vgProbeTrack - build permanent-Id vgPrb* tables in visiGene database\n" " and update assembly-specific tables vgProbes and vgAllProbes\n" "usage:\n" " vgProbeTrack <COMMAND> {workingDir db} {optional params}\n" "\n" "Commands:\n" " INIT - WARNING! drops and creates all tables (except vgPrb is not dropped). \n" " POP - populate vgPrb to catch probes for newly added submission sets. \n" " SEQ workingDir db - find sequence using assembly db. \n" " ALI workingDir db - find or blat any needed alignments for vgProbes track. \n" " EXT workingDir db - add any needed seq and extfile records for vgProbes track. \n" " PSLMAP workingDir db fromDb - pslMap using chains fromDb to db for vgAllProbes track. \n" " REMAP workingDir db fromDb track fa - import db.track psls of fromDb using fa fasta file for vgAllProbes track. \n" " SELFMAP workingDir db - migrate self vgProbes to vgAllProbes track. \n" " EXTALL workingDir db - add any needed seq and extfile records for vgAllProbes track. \n" "\n" "workingDir is a directory with space for intermediate and final results.\n" "db is the target assembly to use.\n" "Options:\n" " -database=%s - Specifically set database (default visiGene)\n" " -sqlPath=%s - specify location of vgPrb.sql, relative to workingDir \n" , database, sqlPath ); } static struct optionSpec options[] = { {"database", OPTION_STRING,}, {"sqlPath", OPTION_STRING,}, {NULL, 0}, }; /* --- defining just the fields needed from bac ---- */ struct bac /* Information from bac */ { struct bac *next; /* Next in singly linked list. */ char *chrom; /* chrom */ int chromStart; /* start */ int chromEnd; /* end */ char *strand; /* strand */ int probe; /* probe */ }; struct bac *bacLoad(char **row) /* Load a bac from row fetched with select * from bac * from database. Dispose of this with bacFree(). */ { struct bac *ret; AllocVar(ret); ret->chrom = cloneString(row[0]); ret->chromStart = atoi(row[1]); ret->chromEnd = atoi(row[2]); ret->strand = cloneString(row[3]); ret->probe = atoi(row[4]); return ret; } void bacFree(struct bac **pEl) /* Free a single dynamically allocated bac such as created * with bacLoad(). */ { struct bac *el; if ((el = *pEl) == NULL) return; freeMem(el->chrom); freeMem(el->strand); freez(pEl); } void bacFreeList(struct bac **pList) /* Free a list of dynamically allocated bac's */ { struct bac *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; bacFree(&el); } *pList = NULL; } struct bac *bacRead(struct sqlConnection *conn, int taxon, char *db) /* Slurp in the bac rows sorted by chrom */ { struct bac *list=NULL, *el; char query[512]; struct sqlResult *sr; char **row; sqlSafef(query, sizeof(query), "select e.chrom, e.chromStart, e.chromEnd, e.strand, pe.id" " from probe p, vgPrbMap m, vgPrb pe, bac b, %s.bacEndPairs e" " where p.bac = b.id and p.id = m.probe and pe.id = m.vgPrb" " and pe.taxon=%d and b.name = e.name" " and pe.type='bac' and pe.state='new'" " order by e.chrom, e.chromStart", db,taxon); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { el = bacLoad(row); slAddHead(&list,el); } sqlFreeResult(&sr); slReverse(&list); return list; } char *checkAndFetchBacDna(struct dnaSeq *chromSeq, struct bac *bac) /* fetch bac dna return string to be freed later. Reports if segment exceeds ends of chromosome. */ { if (bac->chromStart < 0) { errAbort("bac error chromStart < 0 : %s %u %u %s %d \n", bac->chrom, bac->chromStart, bac->chromEnd, bac->strand, bac->probe ); return NULL; } if (bac->chromEnd > chromSeq->size) { errAbort("bac error chromEnd > chromSeq->size (%lu) : %s %u %u %s %d \n", (unsigned long) chromSeq->size, bac->chrom, bac->chromStart, bac->chromEnd, bac->strand, bac->probe ); return NULL; } return cloneStringZ(chromSeq->dna + bac->chromStart, bac->chromEnd - bac->chromStart); } static int findVgPrbBySeq(struct sqlConnection *conn, char *seq, int taxon) /* find vgPrb.id by seq given or return 0 if not found */ { char *fmt = "select id from vgPrb where seq = '%s' and taxon=%d"; int size = strlen(fmt)+strlen(seq)+4; char *sql = needMem(size); sqlSafef(sql,size,fmt,seq,taxon); return sqlQuickNum(conn,sql); freez(&sql); } static void populateMissingVgPrb(struct sqlConnection *conn) /* populate vgPrb where missing, usually after new records added to visiGene */ { struct sqlResult *sr; char **row; struct dyString *dy = dyStringNew(0); struct sqlConnection *conn2 = sqlConnect(database); struct sqlConnection *conn3 = sqlConnect(database); int probeCount=0, vgPrbCount=0; sqlDyStringPrintf(dy, "select p.id,p.gene,antibody,probeType,fPrimer,rPrimer,p.seq,bac,g.taxon" " from probe p join gene g" " left join vgPrbMap m on m.probe = p.id" " where g.id = p.gene" " and m.probe is NULL"); sr = sqlGetResult(conn, dy->string); while ((row = sqlNextRow(sr)) != NULL) { int id = sqlUnsigned(row[0]); /* int gene = sqlUnsigned(row[1]); */ /* int antibody = sqlUnsigned(row[2]); */ /* int probeType = sqlUnsigned(row[3]); */ char *fPrimer = row[4]; char *rPrimer = row[5]; char *seq = row[6]; int bac = sqlUnsigned(row[7]); int taxon = sqlUnsigned(row[8]); char *peType = "none"; int peProbe = id; char *peSeq = seq; char *tName = ""; int tStart = 0; int tEnd = 0; char *tStrand = " "; /* char *peGene = ""; int bacInfo = 0; int seqid = 0; int pslid = 0; */ char *state = "new"; char *db = ""; int vgPrb = 0; if (isNotEmpty(seq)) { peType = "probe"; state = "seq"; } else if (isNotEmpty(fPrimer) && isNotEmpty(rPrimer)) { peType = "primersMrna"; } else if (isNotEmpty(fPrimer) && isEmpty(rPrimer)) { /* only have fPrimer, it's probably a comment, not dna seq */ peType = "refSeq"; /* use accession or gene */ } else if (bac > 0) { peType = "bac"; /* use bacEndPairs */ } else { peType = "refSeq"; /* use accession or gene */ } if (!sameString(peSeq,"")) { vgPrb = findVgPrbBySeq(conn3,peSeq,taxon); } if (vgPrb == 0) { dyStringClear(dy); sqlDyStringPrintf(dy, "insert into vgPrb set"); sqlDyStringPrintf(dy, " id=default,\n"); sqlDyStringPrintf(dy, " type='%s',\n", peType); sqlDyStringPrintf(dy, " seq='%s'\n", peSeq); sqlDyStringPrintf(dy, " tName='%s',\n", tName); sqlDyStringPrintf(dy, " tStart=%d,\n", tStart); sqlDyStringPrintf(dy, " tEnd=%d,\n", tEnd); sqlDyStringPrintf(dy, " tStrand='%s',\n", tStrand); sqlDyStringPrintf(dy, " db='%s',\n", db); sqlDyStringPrintf(dy, " taxon='%d',\n", taxon); sqlDyStringPrintf(dy, " state='%s'\n", state); verbose(2, "%s\n", dy->string); sqlUpdate(conn2, dy->string); vgPrb = sqlLastAutoId(conn2); vgPrbCount++; } dyStringClear(dy); sqlDyStringPrintf(dy, "insert into vgPrbMap set"); sqlDyStringPrintf(dy, " probe=%d,\n", peProbe); sqlDyStringPrintf(dy, " vgPrb=%d \n", vgPrb); verbose(2, "%s\n", dy->string); sqlUpdate(conn2, dy->string); probeCount++; } verbose(1, "# new probe records found = %d, # new vgPrb records added = %d\n", probeCount, vgPrbCount); dyStringFree(&dy); sqlFreeResult(&sr); sqlDisconnect(&conn3); sqlDisconnect(&conn2); } static void processIsPcr(struct sqlConnection *conn, int taxon, char *db) /* process isPcr results */ { /* >NM_010919:371+1088 2 718bp CGCGGATCCAAGGACATCTTGGACCTTCCG CCCAAGCTTGCATGTGCTGCAGCGACTGCG */ struct dyString *dy = dyStringNew(0); struct lineFile *lf = lineFileOpen("isPcr.fa", TRUE); int lineSize; char *line; char *name; char *dna; char *word, *end; char *tName; int tStart; int tEnd; char *tStrand; int probeid=0; /* really a vgPrb id */ boolean more = lineFileNext(lf, &line, &lineSize); while(more) { if (line[0] != '>') errAbort("unexpected error out of phase\n"); name = cloneString(line); verbose(1,"name=%s\n",name); dyStringClear(dy); while((more=lineFileNext(lf, &line, &lineSize))) { if (line[0] == '>') { break; } dyStringAppend(dy,line); } dna = cloneString(dy->string); word = name+1; end = strchr(word,':'); tName = cloneStringZ(word,end-word); word = end+1; end = strchr(word,'+'); tStrand = "+"; if (!end) { end = strchr(word,'-'); tStrand = "-"; } tStart = atoi(word); word = end+1; end = strchr(word,' '); tEnd = atoi(word); word = end+1; end = strchr(word,' '); probeid = atoi(word); dyStringClear(dy); sqlDyStringPrintf(dy, "select count(*) from vgPrb where id=%d and state='new'",probeid); if (sqlQuickNum(conn,dy->string)>0) { /* record exists and hasn't already been updated */ int vgPrb = findVgPrbBySeq(conn,dna,taxon); if (vgPrb == 0) { dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrb set"); sqlDyStringPrintf(dy, " seq='%s'\n", dna); sqlDyStringPrintf(dy, " tName='%s',\n", tName); sqlDyStringPrintf(dy, " tStart=%d,\n", tStart); sqlDyStringPrintf(dy, " tEnd=%d,\n", tEnd); sqlDyStringPrintf(dy, " tStrand='%s',\n", tStrand); sqlDyStringPrintf(dy, " db='%s',\n", db); sqlDyStringPrintf(dy, " state='%s'\n", "seq"); sqlDyStringPrintf(dy, " where id=%d\n", probeid); sqlDyStringPrintf(dy, " and state='%s'\n", "new"); verbose(2, "%s\n", dy->string); sqlUpdate(conn, dy->string); } else /* probe seq already exists */ { /* just re-map the probe table recs to it */ dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrbMap set vgPrb=%d where vgPrb=%d",vgPrb,probeid); sqlUpdate(conn, dy->string); /* and delete it from vgPrb */ dyStringClear(dy); sqlDyStringPrintf(dy, "delete from vgPrb where id=%d",probeid); sqlUpdate(conn, dy->string); } } freez(&tName); freez(&name); freez(&dna); } lineFileClose(&lf); dyStringFree(&dy); } static int doBacs(struct sqlConnection *conn, int taxon, char *db) /* fetch available sequence for bacEndPairs */ { struct dyString *dy = dyStringNew(0); struct dnaSeq *chromSeq = NULL; struct bac *bacs = bacRead(conn, taxon, db); struct bac *bac = NULL; char *chrom = cloneString(""); int count = 0; verbose(1,"bac list read done.\n"); for(bac=bacs;bac;bac=bac->next) { if (differentWord(chrom,bac->chrom)) { verbose(1,"switching to chrom %s\n",bac->chrom); dnaSeqFree(&chromSeq); chromSeq = hLoadChrom(bac->chrom,db); freez(&chrom); chrom = cloneString(bac->chrom); } char *dna = checkAndFetchBacDna(chromSeq, bac); if (sameString(bac->strand,"-")) { reverseComplement(dna,strlen(dna)); } dyStringClear(dy); sqlDyStringPrintf(dy, "select count(*) from vgPrb where id=%d and state='new'",bac->probe); if (sqlQuickNum(conn,dy->string)>0) { /* record exists and hasn't already been updated */ int vgPrb = findVgPrbBySeq(conn,dna,taxon); if (vgPrb == 0) { dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrb set"); sqlDyStringPrintf(dy, " seq='%s'\n", dna); sqlDyStringPrintf(dy, " tName='%s',\n", bac->chrom); sqlDyStringPrintf(dy, " tStart=%d,\n", bac->chromStart); sqlDyStringPrintf(dy, " tEnd=%d,\n", bac->chromEnd); sqlDyStringPrintf(dy, " tStrand='%s',\n", bac->strand); sqlDyStringPrintf(dy, " db='%s',\n", db); sqlDyStringPrintf(dy, " state='%s'\n", "seq"); sqlDyStringPrintf(dy, " where id=%d\n", bac->probe); sqlDyStringPrintf(dy, " and state='%s'\n", "new"); //verbose(2, "%s\n", dy->string); // the sql string could be quite large sqlUpdate(conn, dy->string); } else /* probe seq already exists */ { /* just re-map the probe table recs to it */ dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrbMap set vgPrb=%d where vgPrb=%d",vgPrb,bac->probe); sqlUpdate(conn, dy->string); /* and delete it from vgPrb */ dyStringClear(dy); sqlDyStringPrintf(dy, "delete from vgPrb where id=%d",bac->probe); sqlUpdate(conn, dy->string); } ++count; verbose(2,"%d finished bac for probe id %d size %d\n", count, bac->probe, bac->chromEnd - bac->chromStart); } freez(&dna); } freez(&chrom); dnaSeqFree(&chromSeq); bacFreeList(&bacs); dyStringFree(&dy); return count; } static void doPrimers(struct sqlConnection *conn, int taxon, char *db) /* get probe seq from primers */ { int rc = 0; struct dyString *dy = dyStringNew(0); char cmdLine[256]; char path1[256]; char path2[256]; dyStringClear(dy); sqlDyStringPrintf(dy, "select e.id, p.fPrimer, p.rPrimer from probe p, vgPrbMap m, vgPrb e, gene g"); sqlDyStringPrintf(dy, " where p.id = m.probe and m.vgPrb = e.id and g.id = p.gene and g.taxon = %d",taxon); sqlDyStringPrintf(dy, " and e.state = 'new' and e.type='primersMrna'"); rc = sqlSaveQuery(conn, dy->string, "primers.query", FALSE); verbose(1,"rc = %d = count of primers for mrna search for taxon %d\n",rc,taxon); if (rc > 0) /* something to do */ { dyStringClear(dy); sqlDyStringPrintf(dy, "select qName from %s.all_mrna",db); rc = 0; rc = sqlSaveQuery(conn, dy->string, "accFile.txt", FALSE); safef(cmdLine,sizeof(cmdLine),"getRna %s accFile.txt mrna.fa",db); system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date"); verbose(1,"rc = %d = count of mrna for %s\n",rc,db); system("date"); system("isPcr mrna.fa primers.query isPcr.fa -out=fa"); system("date"); system("ls -l"); processIsPcr(conn,taxon,db); unlink("mrna.fa"); unlink("accFile.txt"); unlink("isPcr.fa"); } unlink("primers.query"); /* find any remaining type primersMrna that couldn't be resolved and demote * them to type primersGenome */ dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrb set type='primersGenome'"); sqlDyStringPrintf(dy, " where taxon = %d",taxon); sqlDyStringPrintf(dy, " and state = 'new' and type='primersMrna'"); sqlUpdate(conn, dy->string); /* get primers for those probes that did not find mrna isPcr matches * and then do them against the genome instead */ dyStringClear(dy); sqlDyStringPrintf(dy, "select e.id, p.fPrimer, p.rPrimer from probe p, vgPrbMap m, vgPrb e, gene g"); sqlDyStringPrintf(dy, " where p.id = m.probe and m.vgPrb = e.id and g.id = p.gene and g.taxon = %d",taxon); sqlDyStringPrintf(dy, " and e.state = 'new' and e.type='primersGenome'"); rc = 0; rc = sqlSaveQuery(conn, dy->string, "primers.query", FALSE); verbose(1,"rc = %d = count of primers for genome search for taxon %d\n",rc,taxon); if (rc > 0) /* something to do */ { safef(path1,sizeof(path1),"/gbdb/%s/%s.2bit",db,db); safef(path2,sizeof(path2),"%s/%s.2bit",getCurrentDir(),db); verbose(1,"copy: [%s] to [%s]\n",path1,path2); copyFile(path1,path2); safef(cmdLine,sizeof(cmdLine), "ssh kolossus 'cd %s; isPcr %s.2bit primers.query isPcr.fa -out=fa'", getCurrentDir(),db); system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date"); safef(path2,sizeof(path2),"%s/%s.2bit",getCurrentDir(),db); verbose(1,"rm %s\n",path2); unlink(path2); system("ls -l"); processIsPcr(conn,taxon,db); unlink("mrna.fa"); unlink("accFile.txt"); unlink("isPcr.fa"); } unlink("primers.query"); /* find any remaining type primersGenome that couldn't be resolved and demote * them to type refSeq */ dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrb set type='refSeq'"); sqlDyStringPrintf(dy, " where taxon = %d",taxon); sqlDyStringPrintf(dy, " and state = 'new' and type='primersGenome'"); sqlUpdate(conn, dy->string); dyStringFree(&dy); } static void doBacEndPairs(struct sqlConnection *conn, int taxon, char *db) /* get probe seq bacEndPairs */ { struct dyString *dy = dyStringNew(0); int rc = 0; /* fetch available sequence for bacEndPairs */ rc = doBacs(conn, taxon, db); verbose(1,"found seq for %d bacEndPairs\n",rc); /* find any remaining type bac that couldn't be resolved and demote * them to type refSeq */ dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrb set type='refSeq'"); sqlDyStringPrintf(dy, " where taxon = %d",taxon); sqlDyStringPrintf(dy, " and state = 'new' and type='bac'"); sqlUpdate(conn, dy->string); dyStringFree(&dy); } static void setTName(struct sqlConnection *conn, int taxon, char *db, char *type, char *fld) /* set vgPrb.tName to the desired value to try, * e.g. gene.refSeq, gene.genbank, mm6.refFlat.name(genoName=gene.name). */ { struct dyString *dy = dyStringNew(0); dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrb e, vgPrbMap m, probe p, gene g" " set e.seq = '', e.tName = g.%s" " where e.id = m.vgPrb and m.probe = p.id and p.gene = g.id" " and e.type = '%s'" " and e.state = 'new'" " and e.taxon = %d" , fld, type, taxon); sqlUpdate(conn, dy->string); dyStringFree(&dy); } static void setTNameMapped(struct sqlConnection *conn, int taxon, char *db, char *type, char *fld, char *mapTable, char *mapField, char *mapTo) /* set vgPrb.tName to the desired value to try, * e.g. gene.refSeq, gene.genbank, mm6.refFlat.name(genoName=gene.name). */ { struct dyString *dy = dyStringNew(0); dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrb e, vgPrbMap m, probe p, gene g, %s.%s f" " set e.seq = '', e.tName = f.%s" " where e.id = m.vgPrb and m.probe = p.id and p.gene = g.id" " and g.%s = f.%s" " and e.type = '%s'" " and e.state = 'new'" " and g.taxon = %d" ,db, mapTable, mapTo, fld, mapField, type, taxon); sqlUpdate(conn, dy->string); dyStringFree(&dy); } static void processMrnaFa(struct sqlConnection *conn, int taxon, char *type, char *db) /* process isPcr results */ { struct dyString *dy = dyStringNew(0); struct lineFile *lf = lineFileOpen("mrna.fa", TRUE); int lineSize; char *line; char *name; char *dna; boolean more = lineFileNext(lf, &line, &lineSize); while(more) { if (line[0] != '>') errAbort("unexpected error out of phase\n"); name = cloneString(line+1); verbose(2,"name=%s\n",name); dyStringClear(dy); while((more=lineFileNext(lf, &line, &lineSize))) { if (line[0] == '>') { break; } dyStringAppend(dy,line); } dna = cloneString(dy->string); while(1) { int oldProbe = 0; dyStringClear(dy); sqlDyStringPrintf(dy, "select id from vgPrb " "where taxon=%d and type='%s' and tName='%s' and state='new'",taxon,type,name); oldProbe = sqlQuickNum(conn,dy->string); if (oldProbe==0) break; /* no more records match */ /* record exists and hasn't already been updated */ int vgPrb = findVgPrbBySeq(conn,dna,taxon); if (vgPrb == 0) { dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrb set"); sqlDyStringPrintf(dy, " seq = '%s'\n", dna); sqlDyStringPrintf(dy, " db = '%s',\n", db); sqlDyStringPrintf(dy, " state = 'seq'\n"); sqlDyStringPrintf(dy, " where id=%d\n", oldProbe); sqlDyStringPrintf(dy, " and state='%s'\n", "new"); verbose(2, "%s\n", dy->string); sqlUpdate(conn, dy->string); } else /* probe seq already exists */ { /* just re-map the probe table recs to it */ dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrbMap set vgPrb=%d where vgPrb=%d",vgPrb,oldProbe); sqlUpdate(conn, dy->string); /* and delete it from vgPrb */ dyStringClear(dy); sqlDyStringPrintf(dy, "delete from vgPrb where id=%d",oldProbe); sqlUpdate(conn, dy->string); } } freez(&name); freez(&dna); } lineFileClose(&lf); dyStringFree(&dy); } static int getAccMrnas(struct sqlConnection *conn, int taxon, char *db, char *type, char *table) /* get mRNAs for one vgPrb acc type */ { int rc = 0; struct dyString *dy = dyStringNew(0); char cmdLine[256]; dyStringClear(dy); sqlDyStringPrintf(dy, "select distinct e.tName from vgPrb e, %s.%s m" " where e.tName = m.qName" " and e.taxon = %d and e.type = '%s' and e.tName <> '' and e.state = 'new'" ,db,table,taxon,type); rc = 0; rc = sqlSaveQuery(conn, dy->string, "accFile.txt", FALSE); safef(cmdLine,sizeof(cmdLine),"getRna %s accFile.txt mrna.fa",db); //system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); //system("date"); processMrnaFa(conn, taxon, type, db); unlink("mrna.fa"); unlink("accFile.txt"); dyStringFree(&dy); return rc; } static void advanceType(struct sqlConnection *conn, int taxon, char *oldType, char *newType) /* find any remaining type refSeq that couldn't be resolved and demote * them to type genbank */ { struct dyString *dy = dyStringNew(0); dyStringClear(dy); sqlDyStringPrintf(dy, "update vgPrb set type='%s', tName=''" " where taxon = %d and state = 'new' and type='%s'" ,newType,taxon,oldType); sqlUpdate(conn, dy->string); dyStringFree(&dy); } static void doAccessionsSeq(struct sqlConnection *conn, int taxon, char *db) /* get probe seq from Accessions */ { int rc = 0; struct dyString *dy = dyStringNew(0); /* get refSeq accessions and rna */ setTName(conn, taxon, db, "refSeq", "refSeq"); rc = getAccMrnas(conn, taxon, db, "refSeq", "refSeqAli"); verbose(1,"rc = %d = count of refSeq mrna for %s\n",rc,db); advanceType(conn,taxon,"refSeq","genRef"); /* get refSeq-in-gene.genbank accessions and rna */ setTName(conn, taxon, db, "genRef", "genbank"); rc = getAccMrnas(conn, taxon, db, "genRef", "refSeqAli"); verbose(1,"rc = %d = count of genRef mrna for %s\n",rc,db); advanceType(conn,taxon,"genRef","genbank"); /* get genbank accessions and rna */ setTName(conn, taxon, db, "genbank", "genbank"); rc = getAccMrnas(conn, taxon, db, "genbank", "all_mrna"); verbose(1,"rc = %d = count of genbank mrna for %s\n",rc,db); advanceType(conn,taxon,"genbank","flatRef"); /* get gene.name -> refFlat to refSeq accessions and rna */ setTNameMapped(conn, taxon, db, "flatRef", "name", "refFlat", "geneName", "name"); rc = getAccMrnas(conn, taxon, db, "flatRef", "refSeqAli"); verbose(1,"rc = %d = count of flatRef mrna for %s\n",rc,db); advanceType(conn,taxon,"flatRef","flatAll"); /* get gene.name -> refFlat to all_mrna accessions */ setTNameMapped(conn, taxon, db, "flatAll", "name", "refFlat", "geneName", "name"); rc = getAccMrnas(conn, taxon, db, "flatAll", "all_mrna"); verbose(1,"rc = %d = count of flatAll mrna for %s\n",rc,db); advanceType(conn,taxon,"flatAll","linkRef"); /* get gene.name -> refLink to refSeq accessions and rna */ setTNameMapped(conn, taxon, db, "linkRef", "name", "refLink", "name", "mrnaAcc"); rc = getAccMrnas(conn, taxon, db, "linkRef", "refSeqAli"); verbose(1,"rc = %d = count of linkRef mrna for %s\n",rc,db); advanceType(conn,taxon,"linkRef","linkAll"); /* get gene.name -> refLink to all_mrna accessions */ setTNameMapped(conn, taxon, db, "linkAll", "name", "refLink", "name", "mrnaAcc"); rc = getAccMrnas(conn, taxon, db, "linkAll", "all_mrna"); verbose(1,"rc = %d = count of linkAll mrna for %s\n",rc,db); advanceType(conn,taxon,"linkAll","kgAlRef"); /* get gene.name -> kgAlias to refSeq accessions and rna */ setTNameMapped(conn, taxon, db, "kgAlRef", "name", "kgAlias", "alias", "kgId"); rc = getAccMrnas(conn, taxon, db, "kgAlRef", "refSeqAli"); verbose(1,"rc = %d = count of kgAlRef mrna for %s\n",rc,db); advanceType(conn,taxon,"kgAlRef","kgAlAll"); /* get gene.name -> kgAlias to all_mrna accessions */ setTNameMapped(conn, taxon, db, "kgAlAll", "name", "kgAlias", "alias", "kgId"); rc = getAccMrnas(conn, taxon, db, "kgAlAll", "all_mrna"); verbose(1,"rc = %d = count of kgAlAll mrna for %s\n",rc,db); advanceType(conn,taxon,"kgAlAll","gene"); dyStringFree(&dy); } static void initTable(struct sqlConnection *conn, char *table, boolean nuke) /* build tables */ { char *sql = NULL; char path[256]; if (nuke) sqlDropTable(conn, table); if (!sqlTableExists(conn, table)) { safef(path,sizeof(path),"%s/%s.sql",sqlPath,table); readInGulp(path, &sql, NULL); sqlUpdate(conn, sql); } } static void populateMissingVgPrbAli(struct sqlConnection *conn, int taxon, char *db, char *table) /* populate vgPrbAli for db */ { struct dyString *dy = dyStringNew(0); dyStringClear(dy); sqlDyStringPrintf(dy, "insert into %s" " select distinct '%s', e.id, 'new' from vgPrb e" " left join %s a on e.id = a.vgPrb and a.db = '%s'" " where a.vgPrb is NULL " " and e.taxon = %d" ,table,db,table,db,taxon); sqlUpdate(conn, dy->string); dyStringFree(&dy); } static void updateVgPrbAli(struct sqlConnection *conn, char *db, char *table, char *track) /* update vgPrbAli from vgProbes track for db */ { struct dyString *dy = dyStringNew(0); char dbTrk[256]; safef(dbTrk,sizeof(dbTrk),"%s.%s",db,track); if (!sqlTableExists(conn, dbTrk)) { struct sqlConnection *conn2 = sqlConnect(db); verbose(1,"FYI: Table %s does not exist\n",dbTrk); initTable(conn2, track, FALSE); sqlDisconnect(&conn2); } dyStringClear(dy); sqlDyStringPrintf(dy, "update %s a, %s.%s v" " set a.status = 'ali'" " where v.qName = concat('vgPrb_',a.vgPrb)" " and a.db = '%s'" ,table,db,track,db); sqlUpdate(conn, dy->string); dyStringFree(&dy); } static void markNoneVgPrbAli(struct sqlConnection *conn, int fromTaxon, char *db, char *table) /* mark records in vgPrbAli that did not find any alignment for db */ { struct dyString *dy = dyStringNew(0); dyStringClear(dy); sqlDyStringPrintf(dy, "update %s a, vgPrb e" " set a.status = 'none'" " where a.status = 'new'" " and a.db = '%s' and e.taxon = %d" " and a.vgPrb = e.id" ,table,db,fromTaxon); sqlUpdate(conn, dy->string); dyStringFree(&dy); } static int doAccPsls(struct sqlConnection *conn, int taxon, char *db, char *type, char *table) /* get psls for one vgPrb acc type */ { int rc = 0; struct dyString *dy = dyStringNew(0); char outName[256]; dyStringClear(dy); sqlDyStringPrintf(dy, "select m.matches,m.misMatches,m.repMatches,m.nCount,m.qNumInsert,m.qBaseInsert," "m.tNumInsert,m.tBaseInsert,m.strand," "concat(\"vgPrb_\",e.id)," "m.qSize,m.qStart,m.qEnd,m.tName,m.tSize,m.tStart,m.tEnd,m.blockCount," "m.blockSizes,m.qStarts,m.tStarts" " from vgPrb e, vgPrbAli a, %s.%s m" " where e.id = a.vgPrb and a.db = '%s' and a.status='new' and e.tName = m.qName" " and e.taxon = %d and e.type = '%s' and e.tName <> '' and e.state='seq' and e.seq <> ''" " order by m.tName,m.tStart" ,db,table,db,taxon,type); rc = 0; safef(outName,sizeof(outName),"%s.psl",type); rc = sqlSaveQuery(conn, dy->string, outName, FALSE); dyStringFree(&dy); return rc; } static int dumpPslTable(struct sqlConnection *conn, char *db, char *table) /* dump psls for db.table to table.psl in current (working) dir */ { int rc = 0; struct dyString *dy = dyStringNew(0); char outName[256]; dyStringClear(dy); sqlDyStringPrintf(dy, "select matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert," "tNumInsert,tBaseInsert,strand," "qName,qSize,qStart,qEnd,tName,tSize,tStart,tEnd," "blockCount,blockSizes,qStarts,tStarts" " from %s.%s" " order by tName,tStart" ,db,table); rc = 0; safef(outName,sizeof(outName),"%s.psl",table); rc = sqlSaveQuery(conn, dy->string, outName, FALSE); dyStringFree(&dy); return rc; } static void doAccessionsAli(struct sqlConnection *conn, int taxon, char *db) /* get probe alignments from Accessions */ { int rc = 0; /* get refSeq psls */ rc = doAccPsls(conn, taxon, db, "refSeq", "refSeqAli"); verbose(1,"rc = %d = count of refSeq psls for %s\n",rc,db); /* get genRef psls */ rc = doAccPsls(conn, taxon, db, "genRef", "refSeqAli"); verbose(1,"rc = %d = count of genRef psls for %s\n",rc,db); /* get genbank psls */ rc = doAccPsls(conn, taxon, db, "genbank", "all_mrna"); verbose(1,"rc = %d = count of genbank psls for %s\n",rc,db); /* get flatRef psls */ rc = doAccPsls(conn, taxon, db, "flatRef", "refSeqAli"); verbose(1,"rc = %d = count of flatRef psls for %s\n",rc,db); /* get flatAll psls */ rc = doAccPsls(conn, taxon, db, "flatAll", "all_mrna"); verbose(1,"rc = %d = count of flatAll psls for %s\n",rc,db); /* get linkRef psls */ rc = doAccPsls(conn, taxon, db, "linkRef", "refSeqAli"); verbose(1,"rc = %d = count of linkRef psls for %s\n",rc,db); /* get linkAll psls */ rc = doAccPsls(conn, taxon, db, "linkAll", "all_mrna"); verbose(1,"rc = %d = count of linkAll psls for %s\n",rc,db); /* get kgAlRef psls */ rc = doAccPsls(conn, taxon, db, "kgAlRef", "refSeqAli"); verbose(1,"rc = %d = count of kgAlRef psls for %s\n",rc,db); /* get kgAlAll psls */ rc = doAccPsls(conn, taxon, db, "kgAlAll", "all_mrna"); verbose(1,"rc = %d = count of kgAlRef psls for %s\n",rc,db); } static void doFakeBacAli(struct sqlConnection *conn, int taxon, char *db) /* place probe seq from primers, etc with blat */ { int rc = 0; struct dyString *dy = dyStringNew(0); /* create fake psls as blatBAC.psl */ dyStringClear(dy); sqlDyStringPrintf(dy, "select length(e.seq), 0, 0, 0, 0, 0, 0, 0, e.tStrand, concat('vgPrb_',e.id), length(e.seq)," " 0, length(e.seq), e.tName, ci.size, e.tStart, e.tEnd, 1," " concat(length(e.seq),','), concat(0,','), concat(e.tStart,',')" " from vgPrb e, %s.chromInfo ci, vgPrbAli a" " where ci.chrom = e.tName and e.id = a.vgPrb" " and e.type = 'bac'" " and e.taxon = %d" " and a.db = '%s' and a.status='new'" , db, taxon, db); //restore: rc = sqlSaveQuery(conn, dy->string, "fakeBAC.psl", FALSE); verbose(1,"rc = %d = count of sequences for fakeBAC.psl, for taxon %d\n",rc,taxon); dyStringFree(&dy); } static void doBlat(struct sqlConnection *conn, int taxon, char *db) /* place probe seq from non-BAC with blat that have no alignments yet */ { int rc = 0; char *blatSpec=NULL; char cmdLine[256]; char path1[256]; char path2[256]; struct dyString *dy = dyStringNew(0); /* (non-BACs needing alignment) */ dyStringClear(dy); sqlDyStringPrintf(dy, "select concat(\"vgPrb_\",e.id), e.seq" " from vgPrb e, vgPrbAli a" " where e.id = a.vgPrb" " and a.db = '%s'" " and a.status = 'new'" " and e.taxon = %d" " and e.type <> 'bac'" " and e.seq <> ''" " order by e.id" , db, taxon); //restore: rc = sqlSaveQuery(conn, dy->string, "blat.fa", TRUE); verbose(1,"rc = %d = count of sequences for blat, to get psls for taxon %d\n",rc,taxon); if (rc == 0) { unlink("blat.fa"); system("rm -f blatNearBest.psl; touch blatNearBest.psl"); /* make empty file */ return; } /* make .ooc and blat on kolossus */ safef(path1,sizeof(path1),"/gbdb/%s/%s.2bit",db,db); safef(path2,sizeof(path2),"%s/%s.2bit",getCurrentDir(),db); //restore: verbose(1,"copy: [%s] to [%s]\n",path1,path2); copyFile(path1,path2); safef(cmdLine,sizeof(cmdLine), "ssh kolossus 'cd %s; blat -makeOoc=11.ooc -tileSize=11" " -repMatch=1024 %s.2bit /dev/null /dev/null'", getCurrentDir(),db); //restore: system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date"); safef(cmdLine,sizeof(cmdLine), "ssh kolossus 'cd %s; blat %s.2bit blat.fa -ooc=11.ooc -noHead blat.psl'", getCurrentDir(),db); //restore: system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date"); /* using blat even with -fastMap was way too slow - took over a day, * so instead I will make a procedure to write a fake psl for the BACs * which you will see called below */ safef(path2,sizeof(path2),"%s.2bit",db); verbose(1,"rm %s\n",path2); unlink(path2); safef(path2,sizeof(path2),"11.ooc"); verbose(1,"rm %s\n",path2); unlink(path2); /* skip psl header and sort on query name */ safef(cmdLine,sizeof(cmdLine), "sort -k 10,10 blat.psl > blatS.psl"); verbose(1,"cmdLine=[%s]\n",cmdLine); system(cmdLine); /* keep near best within 5% of the best */ safef(cmdLine,sizeof(cmdLine), "pslCDnaFilter -globalNearBest=0.005 -minId=0.96 -minNonRepSize=20 -minCover=0.50" " blatS.psl blatNearBest.psl"); verbose(1,"cmdLine=[%s]\n",cmdLine); system(cmdLine); unlink("blat.fa"); unlink("blat.psl"); unlink("blatS.psl"); freez(&blatSpec); dyStringFree(&dy); } void static assembleAllPsl(struct sqlConnection *conn, int taxon, char *db) /* assemble NonBlat.psl from variouls psl alignments */ { // " blatNearBest.psl" not included /* make final psl */ system( "cat" " fakeBAC.psl" " flatAll.psl" " flatRef.psl" " genRef.psl" " genbank.psl" " kgAlAll.psl" " kgAlRef.psl" " linkAll.psl" " linkRef.psl" " refSeq.psl" " > vgPrbNonBlat.psl" ); verbose(1,"vgPrbNonBlat.psl assembled for taxon %d\n",taxon); //unlink("blatNearBest.psl"); //unlink("fakeBAC.psl"); //system("ls -ltr"); } static void rollupPsl(char *pslName, char *table, struct sqlConnection *conn, char *db) { char cmd[256]; char dbTbl[256]; if (fileSize(pslName)==0) return; safef(dbTbl,sizeof(dbTbl),"%s.%s",db,table); if (!sqlTableExists(conn, dbTbl)) { verbose(1,"FYI: Table %s does not exist\n",dbTbl); safef(cmd,sizeof(cmd),"rm -f %s.psl; touch %s.psl",table,table); /* make empty file */ verbose(1,"%s\n",cmd); system(cmd); } else { dumpPslTable(conn, db, table); } safef(cmd,sizeof(cmd),"cat %s %s.psl | sort -u | sort -k 10,10 > %sNew.psl", pslName, table, table); verbose(1,"%s\n",cmd); system(cmd); safef(cmd,sizeof(cmd),"hgLoadPsl %s %sNew.psl -table=%s",db,table,table); verbose(1,"%s\n",cmd); system(cmd); safef(cmd,sizeof(cmd),"rm %s %s.psl %sNew.psl",pslName,table,table); verbose(1,"%s\n",cmd); //system(cmd); } static int findTaxon(struct sqlConnection *conn, char *db) /* return the taxon for the given db, or abort */ { char sql[256]; int taxon = 0; char *fmt = "select ncbi_taxa_id from go.species " "where concat(genus,' ',species) = '%s'"; sqlSafef(sql,sizeof(sql),fmt,hScientificName(db)); taxon = sqlQuickNum(conn, sql); if (taxon == 0) { if (sameString(db,"nibb")) /* we don't really have this frog as an assembly */ taxon = 8355; /* Xenopus laevis - African clawed frog */ /* can put more here in future */ } if (taxon == 0) errAbort("unknown taxon for db = %s, unable to continue until its taxon is defined.",db); return taxon; } static void makeFakeProbeSeq(struct sqlConnection *conn, char *db) /* get probe seq from primers, bacEndPairs, refSeq, genbank, gene-name */ { int taxon = findTaxon(conn,db); //restore: doPrimers(conn, taxon, db); //restore: doBacEndPairs(conn, taxon, db); //restore: doAccessionsSeq(conn, taxon, db); } /* keep around, but dangerous in the wrong hands ;) static void init(struct sqlConnection *conn) / * build tables - for the first time * / { +char query[1024]; if (!sqlTableExists(conn, "vgPrb")) { initTable(conn, "vgPrb", FALSE); / * this most important table should never be nuked automatically * / - sqlUpdate(conn, NOSQLINJ "create index tName on vgPrb(tName(20));"); - sqlUpdate(conn, NOSQLINJ "create index seq on vgPrb(seq(40));"); + sqlSafef(query, sizeof query, "create index tName on vgPrb(tName(20));"); + sqlUpdate(conn, query); + sqlSafef(query, sizeof query, "create index seq on vgPrb(seq(40));"); + sqlUpdate(conn, query); } initTable(conn, "vgPrbMap", TRUE); -sqlUpdate(conn, NOSQLINJ "create index probe on vgPrbMap(probe);"); -sqlUpdate(conn, NOSQLINJ "create index vgPrb on vgPrbMap(vgPrb);"); +sqlSafef(query, sizeof query, "create index probe on vgPrbMap(probe);"); +sqlUpdate(conn, query); +sqlSafef(query, sizeof query, "create index vgPrb on vgPrbMap(vgPrb);"); +sqlUpdate(conn, query); initTable(conn, "vgPrbAli", TRUE); initTable(conn, "vgPrbAliAll", TRUE); } */ static void doAlignments(struct sqlConnection *conn, char *db) { int taxon = findTaxon(conn,db); populateMissingVgPrbAli(conn, taxon, db, "vgPrbAli"); updateVgPrbAli(conn, db, "vgPrbAli","vgProbes"); doAccessionsAli(conn, taxon, db); doFakeBacAli(conn, taxon, db); assembleAllPsl(conn, taxon, db); rollupPsl("vgPrbNonBlat.psl", "vgProbes", conn, db); updateVgPrbAli(conn, db, "vgPrbAli","vgProbes"); doBlat(conn, taxon, db); rollupPsl("blatNearBest.psl", "vgProbes", conn, db); updateVgPrbAli(conn, db, "vgPrbAli","vgProbes"); markNoneVgPrbAli(conn, taxon, db, "vgPrbAli"); } static void getPslMapAli(struct sqlConnection *conn, char *db, int fromTaxon, char *fromDb, boolean isBac) { int rc = 0; struct dyString *dy = dyStringNew(0); char outName[256]; /* get {non-}bac $db.vgProbes not yet aligned */ dyStringClear(dy); sqlDyStringPrintf(dy, "select m.matches,m.misMatches,m.repMatches,m.nCount,m.qNumInsert,m.qBaseInsert," "m.tNumInsert,m.tBaseInsert,m.strand," "m.qName,m.qSize,m.qStart,m.qEnd,m.tName,m.tSize,m.tStart,m.tEnd,m.blockCount," "m.blockSizes,m.qStarts,m.tStarts" " from vgPrb e, vgPrbAliAll a, %s.vgProbes m" " where e.id = a.vgPrb and a.db = '%s' and a.status='new'" " and m.qName = concat(\"vgPrb_\",e.id)" " and e.taxon = %d and e.type %s 'bac' and e.state='seq' and e.seq <> ''" " order by m.tName,m.tStart" ,fromDb,db,fromTaxon, isBac ? "=" : "<>"); rc = 0; safef(outName,sizeof(outName), isBac ? "bac.psl" : "nonBac.psl"); rc = sqlSaveQuery(conn, dy->string, outName, FALSE); verbose(1,"Count of %s Psls found for pslMap: %d\n", isBac ? "bac" : "nonBac", rc); } static void getPslMapFa(struct sqlConnection *conn, char *db, int fromTaxon) { int rc = 0; struct dyString *dy = dyStringNew(0); /* get .fa for pslRecalcMatch use */ dyStringClear(dy); sqlDyStringPrintf(dy, "select concat(\"vgPrb_\",e.id), e.seq" " from vgPrb e, vgPrbAliAll a" " where e.id = a.vgPrb" " and a.db = '%s'" " and a.status = 'new'" " and e.taxon = %d" " and e.seq <> ''" " order by e.id" , db, fromTaxon); //restore: rc = sqlSaveQuery(conn, dy->string, "pslMap.fa", TRUE); verbose(1,"rc = %d = count of sequences for pslMap for taxon %d\n",rc,fromTaxon); } static void doPslMapAli(struct sqlConnection *conn, int taxon, char *db, int fromTaxon, char *fromDb) { char cmd[256]; struct dyString *dy = dyStringNew(0); char path[256]; char dnaPath[256]; char toDb[12]; safef(toDb,sizeof(toDb),"%s", db); toDb[0]=toupper(toDb[0]); safef(dnaPath,sizeof(dnaPath),"/cluster/data/%s/nib", db); if (!fileExists(dnaPath)) { safef(dnaPath,sizeof(dnaPath),"/cluster/data/%s/%s.2bit", db, db); if (!fileExists(dnaPath)) errAbort("unable to locate nib dir or .2bit for %s: %s", db, dnaPath); } safef(path,sizeof(path),"/gbdb/%s/liftOver/%sTo%s.over.chain.gz", fromDb, fromDb, toDb); if (!fileExists(path)) errAbort("unable to locate chain file %s",path); /* get non-bac $db.vgProbes not yet aligned */ getPslMapAli(conn, db, fromTaxon, fromDb, FALSE); /* get bac $db.vgProbes not yet aligned */ getPslMapAli(conn, db, fromTaxon, fromDb, TRUE); /* get .fa for pslRecalcMatch use */ getPslMapFa(conn, db, fromTaxon); /* non-bac */ safef(cmd,sizeof(cmd), "zcat %s | pslMap -chainMapFile -swapMap nonBac.psl stdin stdout " "| sort -k 14,14 -k 16,16n > unscoredNB.psl" ,path); verbose(1,"%s\n",cmd); system(cmd); safef(cmd,sizeof(cmd), "pslRecalcMatch unscoredNB.psl %s" " pslMap.fa nonBac.psl" ,dnaPath); verbose(1,"%s\n",cmd); system(cmd); /* bac */ safef(cmd,sizeof(cmd), "zcat %s | pslMap -chainMapFile -swapMap bac.psl stdin stdout " "| sort -k 14,14 -k 16,16n > unscoredB.psl" ,path); verbose(1,"%s\n",cmd); system(cmd); safef(cmd,sizeof(cmd), "pslRecalcMatch unscoredB.psl %s" " pslMap.fa bacTemp.psl" ,dnaPath); verbose(1,"%s\n",cmd); system(cmd); safef(cmd,sizeof(cmd), "pslCDnaFilter -globalNearBest=0.00001 -minCover=0.05" " bacTemp.psl bac.psl"); verbose(1,"%s\n",cmd); system(cmd); safef(cmd,sizeof(cmd),"cat bac.psl nonBac.psl > vgPrbPslMap.psl"); verbose(1,"%s\n",cmd); system(cmd); dyStringFree(&dy); } static void doAlignmentsPslMap(struct sqlConnection *conn, char *db, char *fromDb) { int taxon = findTaxon(conn,db); int fromTaxon = findTaxon(conn,fromDb); populateMissingVgPrbAli(conn, fromTaxon, db, "vgPrbAliAll"); updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes"); doPslMapAli(conn, taxon, db, fromTaxon, fromDb); rollupPsl("vgPrbPslMap.psl", "vgAllProbes", conn, db); updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes"); markNoneVgPrbAli(conn, fromTaxon, db, "vgPrbAliAll"); } static void doReMapAli(struct sqlConnection *conn, int taxon, char *db, int fromTaxon, char *fromDb, char *track, char *fasta ) /* re-map anything in track specified that is not aligned, nor even attempted yet, using specified fasta file. */ { char cmd[256]; +char query[1024]; int rc = 0; struct dyString *dy = dyStringNew(0); char dbTrk[256]; safef(dbTrk,sizeof(dbTrk),"%s.%s",db,track); if (!sqlTableExists(conn, dbTrk)) errAbort("Track %s does not exist\n",dbTrk); if (!fileExists(fasta)) errAbort("Unable to locate fasta file %s",fasta); if (sqlTableExists(conn, "vgRemapTemp")) { - sqlUpdate(conn, NOSQLINJ "drop table vgRemapTemp;"); + sqlSafef(query, sizeof query, "drop table vgRemapTemp;"); + sqlUpdate(conn, query); } safef(cmd,sizeof(cmd), "hgPepPred %s generic vgRemapTemp %s " ,database,fasta); verbose(1,"%s\n",cmd); system(cmd); /* required for mysql 5 longtext for case-insensitive comparisons of blobs */ -sqlUpdate(conn, NOSQLINJ "ALTER table vgRemapTemp modify seq longtext;"); -sqlUpdate(conn, NOSQLINJ "create index seq on vgRemapTemp(seq(40));"); +sqlSafef(query, sizeof query, "ALTER table vgRemapTemp modify seq longtext;"); +sqlUpdate(conn, query); +sqlSafef(query, sizeof query, "create index seq on vgRemapTemp(seq(40));"); +sqlUpdate(conn, query); /* get remapped psl probes not yet aligned */ dyStringClear(dy); sqlDyStringPrintf(dy, "select m.matches,m.misMatches,m.repMatches,m.nCount,m.qNumInsert,m.qBaseInsert," "m.tNumInsert,m.tBaseInsert,m.strand," "concat('vgPrb_',e.id),m.qSize,m.qStart,m.qEnd,m.tName,m.tSize,m.tStart,m.tEnd,m.blockCount," "m.blockSizes,m.qStarts,m.tStarts" " from vgPrb e, vgPrbAliAll a, %s.%s m, vgRemapTemp n" " where e.id = a.vgPrb and a.db = '%s' and a.status='new'" " and m.qName = n.name and n.seq = e.seq" " and e.taxon = %d and e.state='seq' and e.seq <> ''" " order by m.tName,m.tStart" ,db,track,db,fromTaxon); rc = 0; rc = sqlSaveQuery(conn, dy->string, "vgPrbReMap.psl", FALSE); verbose(1,"Count of Psls found for reMap: %d\n", rc); -sqlUpdate(conn, NOSQLINJ "drop table vgRemapTemp;"); +sqlSafef(query, sizeof query, "drop table vgRemapTemp;"); +sqlUpdate(conn, query); dyStringFree(&dy); } static void doAlignmentsReMap( struct sqlConnection *conn, char *db, char *fromDb, char *track, char *fasta) /* re-map anything in track specified that is not aligned, nor even attempted yet, using specified fasta file. */ { int taxon = findTaxon(conn,db); int fromTaxon = findTaxon(conn,fromDb); populateMissingVgPrbAli(conn, fromTaxon, db, "vgPrbAliAll"); updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes"); doReMapAli(conn, taxon, db, fromTaxon, fromDb, track, fasta); rollupPsl("vgPrbReMap.psl", "vgAllProbes", conn, db); updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes"); markNoneVgPrbAli(conn, fromTaxon, db, "vgPrbAliAll"); } static void doSelfMapAli(struct sqlConnection *conn, int taxon, char *db) { char cmd[256]; /* get non-bac $db.vgProbes not yet aligned */ getPslMapAli(conn, db, taxon, db, FALSE); /* get bac $db.vgProbes not yet aligned */ getPslMapAli(conn, db, taxon, db, TRUE); safef(cmd,sizeof(cmd),"cat bac.psl nonBac.psl > vgPrbSelfMap.psl"); verbose(1,"%s\n",cmd); system(cmd); } static void doAlignmentsSelfMap( struct sqlConnection *conn, char *db) /* copy anything in vgProbes but not in vgAllProbes to vgAllProbes */ { int taxon = findTaxon(conn,db); populateMissingVgPrbAli(conn, taxon, db, "vgPrbAliAll"); updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes"); doSelfMapAli(conn, taxon, db); rollupPsl("vgPrbSelfMap.psl", "vgAllProbes", conn, db); updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes"); markNoneVgPrbAli(conn, taxon, db, "vgPrbAliAll"); } static void doSeqAndExtFile(struct sqlConnection *conn, char *db, char *table) { int rc = 0; char cmd[256]; char path[256]; char bedPath[256]; char gbdbPath[256]; char *fname=NULL; struct dyString *dy = dyStringNew(0); dyStringClear(dy); sqlDyStringPrintf(dy, "select distinct concat('vgPrb_',e.id), e.seq" " from vgPrb e join %s.%s v" " left join %s.seq s on s.acc = v.qName" " where concat('vgPrb_',e.id) = v.qName" " and s.acc is NULL" " order by e.id" , db, table, db); rc = sqlSaveQuery(conn, dy->string, "vgPrbExt.fa", TRUE); verbose(1,"rc = %d = count of sequences for vgPrbExt.fa, to use with %s track %s\n",rc,db,table); if (rc > 0) /* can set any desired minimum */ { safef(bedPath,sizeof(bedPath),"/cluster/data/%s/bed/visiGene/",db); if (!fileExists(bedPath)) { safef(cmd,sizeof(cmd),"mkdir %s",bedPath); verbose(1,"%s\n",cmd); system(cmd); } safef(gbdbPath,sizeof(gbdbPath),"/gbdb/%s/visiGene/",db); if (!fileExists(gbdbPath)) { safef(cmd,sizeof(cmd),"mkdir %s",gbdbPath); verbose(1,"%s\n",cmd); system(cmd); } while(1) { int i=0; safef(path,sizeof(path),"%svgPrbExt_AAAAAA.fa",bedPath); char *c = rStringIn("AAAAAA",path); srand( (unsigned)time( NULL ) ); for(i=0;i<6;++i) { *c++ += (int) 26 * (rand() / (RAND_MAX + 1.0)); } if (!fileExists(path)) break; } safef(cmd,sizeof(cmd),"cp vgPrbExt.fa %s",path); verbose(1,"%s\n",cmd); system(cmd); fname = rStringIn("/", path); ++fname; safef(cmd,sizeof(cmd),"ln -s %s %s%s",path,gbdbPath,fname); verbose(1,"%s\n",cmd); system(cmd); safef(cmd,sizeof(cmd),"hgLoadSeq %s %s%s", db, gbdbPath,fname); verbose(1,"%s\n",cmd); system(cmd); } dyStringFree(&dy); } int main(int argc, char *argv[]) /* Process command line. */ { struct sqlConnection *conn = NULL; char *command = NULL; optionInit(&argc, argv, options); database = optionVal("database", database); sqlPath = optionVal("sqlPath", sqlPath); if (argc < 2) usage(); command = argv[1]; if (argc >= 3) setCurrentDir(argv[2]); conn = sqlConnect(database); if (sameWord(command,"INIT")) { if (argc != 2) usage(); errAbort("INIT is probably too dangerous. DO NOT USE."); /* init(conn); */ } else if (sameWord(command,"POP")) { if (argc != 2) usage(); /* populate vgPrb where missing */ populateMissingVgPrb(conn); } else if (sameWord(command,"SEQ")) { if (argc != 4) usage(); /* make fake probe sequences */ makeFakeProbeSeq(conn,argv[3]); } else if (sameWord(command,"ALI")) { if (argc != 4) usage(); /* blat anything left that is not aligned, nor even attempted */ doAlignments(conn,argv[3]); } else if (sameWord(command,"EXT")) { if (argc != 4) usage(); /* update seq and extfile as necessary */ doSeqAndExtFile(conn,argv[3],"vgProbes"); } else if (sameWord(command,"PSLMAP")) { if (argc != 5) usage(); /* pslMap anything left that is not aligned, nor even attempted */ doAlignmentsPslMap(conn,argv[3],argv[4]); } else if (sameWord(command,"REMAP")) { if (argc != 7) usage(); /* re-map anything in track specified that is not aligned, nor even attempted yet, using specified fasta file. */ doAlignmentsReMap(conn,argv[3],argv[4],argv[5],argv[6]); } else if (sameWord(command,"SELFMAP")) { if (argc != 4) usage(); /* re-map anything in track specified that is not aligned, nor even attempted yet, using specified fasta file. */ doAlignmentsSelfMap(conn,argv[3]); } else if (sameWord(command,"EXTALL")) { if (argc != 4) usage(); /* update seq and extfile as necessary */ doSeqAndExtFile(conn,argv[3],"vgAllProbes"); } else usage(); sqlDisconnect(&conn); return 0; }