src/hg/visiGene/vgProbeTrack/vgProbeTrack.c 1.17

1.17 2009/10/22 16:38:51 galt
slightly faster
Index: src/hg/visiGene/vgProbeTrack/vgProbeTrack.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/visiGene/vgProbeTrack/vgProbeTrack.c,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 1000000 -r1.16 -r1.17
--- src/hg/visiGene/vgProbeTrack/vgProbeTrack.c	14 Oct 2009 19:03:02 -0000	1.16
+++ src/hg/visiGene/vgProbeTrack/vgProbeTrack.c	22 Oct 2009 16:38:51 -0000	1.17
@@ -1,1682 +1,1682 @@
 /* vgProbeTrack - build vgPrb% permanent id tables in visiGene database, 
  *   also updates $db.vg{All}Probes tables
  *
  *   This finds sequence for all probes, using the best method available,
  *   which can be given in probe.seq, using the primers to
  *   fetch the probe using isPcr, or using accession or gene to fetch sequence.
  * 
  *   When all the sequences are availabe and ready, then blat is run to 
  *   create psl alignments for use in probe tracks which may also be 
  *   mapped over to human orthologous position using either chains with pslMap (mouse)
  *   or blatz (frog).
  *
  */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "obscure.h"
 #include "ra.h"
 #include "jksql.h"
 #include "dystring.h"
 #include "portable.h"
 #include "fa.h"
 #include "hdb.h"
 
 #include "vgPrb.h"
 
 /* Variables you can override from command line. */
 char *database = "visiGene";
 char *sqlPath = ".";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "vgProbeTrack - build permanent-Id vgPrb* tables in visiGene database\n"
   " and update assembly-specific tables vgProbes and vgAllProbes\n"
   "usage:\n"
   "   vgProbeTrack <COMMAND> {workingDir db} {optional params}\n"
   "\n"
   "Commands:\n"
   "   INIT - WARNING! drops and creates all tables (except vgPrb is not dropped). \n"
   "   POP  - populate vgPrb to catch probes for newly added submission sets. \n"
   "   SEQ workingDir db - find sequence using assembly db. \n"
   "   ALI workingDir db - find or blat any needed alignments for vgProbes track. \n"
   "   EXT workingDir db - add any needed seq and extfile records for vgProbes track. \n"
   "   PSLMAP  workingDir db fromDb - pslMap using chains fromDb to db for vgAllProbes track. \n"
   "   REMAP   workingDir db fromDb track fa - import db.track psls of fromDb using fa fasta file for vgAllProbes track. \n"
   "   SELFMAP workingDir db - migrate self vgProbes to vgAllProbes track. \n"
   "   EXTALL workingDir db - add any needed seq and extfile records for vgAllProbes track. \n"
   "\n"
   "workingDir is a directory with space for intermediate and final results.\n"
   "db is the target assembly to use.\n"
   "Options:\n"
   "   -database=%s - Specifically set database (default visiGene)\n"
   "   -sqlPath=%s - specify location of vgPrb.sql, relative to workingDir \n"
   , database, sqlPath
   );
 }
 
 static struct optionSpec options[] = {
    {"database", OPTION_STRING,},
    {"sqlPath", OPTION_STRING,},
    {NULL, 0},
 };
 
 
 /* --- defining just the fields needed from bac  ---- */
 
 struct bac
 /* Information from bac */
     {
     struct bac *next;           /* Next in singly linked list. */
     char *chrom;                /* chrom */
     int chromStart;             /* start */
     int chromEnd;               /* end   */
     char *strand;               /* strand */
     int probe;                  /* probe  */
     };
 
 struct bac *bacLoad(char **row)
 /* Load a bac from row fetched with select * from bac
  * from database.  Dispose of this with bacFree(). */
 {
 struct bac *ret;
 
 AllocVar(ret);
 ret->chrom      = cloneString(row[0]);
 ret->chromStart =        atoi(row[1]);
 ret->chromEnd   =        atoi(row[2]);
 ret->strand     = cloneString(row[3]);
 ret->probe      =        atoi(row[4]);
 return ret;
 }
 
 void bacFree(struct bac **pEl)
 /* Free a single dynamically allocated bac such as created
  * with bacLoad(). */
 {
 struct bac *el;
 
 if ((el = *pEl) == NULL) return;
 freeMem(el->chrom);
 freeMem(el->strand);
 freez(pEl);
 }
 
 void bacFreeList(struct bac **pList)
 /* Free a list of dynamically allocated bac's */
 {
 struct bac *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     bacFree(&el);
     }
 *pList = NULL;
 }
 
 
 struct bac *bacRead(struct sqlConnection *conn, int taxon, char *db)
 /* Slurp in the bac rows sorted by chrom */
 {
 struct bac *list=NULL, *el;
 char query[512];
 struct sqlResult *sr;
 char **row;
 safef(query, sizeof(query), 
 "select e.chrom, e.chromStart, e.chromEnd, e.strand, pe.id"
 " from probe p, vgPrbMap m, vgPrb pe, bac b, %s.bacEndPairs e"
 " where p.bac = b.id and p.id = m.probe and pe.id = m.vgPrb"
 " and pe.taxon=%d and b.name = e.name"
 " and pe.type='bac' and pe.state='new'"
 " order by e.chrom, e.chromStart",
 db,taxon);
 sr = sqlGetResult(conn, query);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     el = bacLoad(row);
     slAddHead(&list,el);
     }
 sqlFreeResult(&sr);
 slReverse(&list);  
 return list;
 }
 
 char *checkAndFetchBacDna(struct dnaSeq *chromSeq, struct bac *bac)
 /* fetch bac dna return string to be freed later. Reports if segment exceeds ends of chromosome. */
 {
 if (bac->chromStart < 0)
     {
     errAbort("bac error chromStart < 0 : %s %u %u %s %d \n",
         bac->chrom,
         bac->chromStart,
         bac->chromEnd,
         bac->strand,
         bac->probe
         );
     return NULL;
     }
 if (bac->chromEnd > chromSeq->size)
     {
     errAbort("bac error chromEnd > chromSeq->size (%lu) : %s %u %u %s %d \n",
         (unsigned long) chromSeq->size,
         bac->chrom,
         bac->chromStart,
         bac->chromEnd,
         bac->strand,
         bac->probe
         );
     return NULL;
     }
 
 return cloneStringZ(chromSeq->dna + bac->chromStart, bac->chromEnd - bac->chromStart);
 }
 
 static int findVgPrbBySeq(struct sqlConnection *conn, char *seq, int taxon)
 /* find vgPrb.id by seq given or return 0 if not found */
 {
 char *fmt = "select id from vgPrb where seq = '%s' and taxon=%d";
 int size = strlen(fmt)+strlen(seq)+4;
 char *sql = needMem(size);
 safef(sql,size,fmt,seq,taxon);
 return sqlQuickNum(conn,sql);
 freez(&sql);
 }
 
 static void populateMissingVgPrb(struct sqlConnection *conn)
 /* populate vgPrb where missing, usually after new records added to visiGene */
 {
 struct sqlResult *sr;
 char **row;
 struct dyString *dy = dyStringNew(0);
 struct sqlConnection *conn2 = sqlConnect(database);
 struct sqlConnection *conn3 = sqlConnect(database);
 int probeCount=0, vgPrbCount=0;
 
 dyStringAppend(dy, 
 "select p.id,p.gene,antibody,probeType,fPrimer,rPrimer,p.seq,bac,g.taxon"
 " from probe p join gene g"
 " left join vgPrbMap m on m.probe = p.id"
 " where g.id = p.gene"
 "   and m.probe is NULL");
 sr = sqlGetResult(conn, dy->string);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     int id = sqlUnsigned(row[0]); 
     /* int gene = sqlUnsigned(row[1]); */
     /* int antibody = sqlUnsigned(row[2]); */
     /* int probeType = sqlUnsigned(row[3]); */
     char *fPrimer = row[4]; 
     char *rPrimer = row[5]; 
     char *seq = row[6]; 
     int bac = sqlUnsigned(row[7]); 
     int taxon = sqlUnsigned(row[8]); 
 
     char *peType = "none";
     int peProbe = id;
     char *peSeq = seq;
     char *tName = "";
     int tStart = 0;
     int tEnd = 0;
     char *tStrand = " ";
     /*
     char *peGene = "";
     int bacInfo = 0;
     int seqid = 0;
     int pslid = 0;
     */
     char *state = "new";
     char *db = "";
     int vgPrb = 0;
 
     if (isNotEmpty(seq))
     	{
 	peType = "probe";
 	state = "seq";
 	}
     else if (isNotEmpty(fPrimer) && isNotEmpty(rPrimer))
     	{
 	peType = "primersMrna";
 	}
     else if (isNotEmpty(fPrimer) && isEmpty(rPrimer))
     	{ /* only have fPrimer, it's probably a comment, not dna seq */
 	peType = "refSeq";   /* use accession or gene */
 	}
     else if (bac > 0)
     	{
 	peType = "bac";   /* use bacEndPairs */
 	}
     else	    
     	{
 	peType = "refSeq";   /* use accession or gene */
 	}
 
     if (!sameString(peSeq,""))
 	{
 	vgPrb = findVgPrbBySeq(conn3,peSeq,taxon);
 	}
 
     if (vgPrb == 0)
 	{
 	dyStringClear(dy);
 	dyStringAppend(dy, "insert into vgPrb set");
 	dyStringPrintf(dy, " id=default,\n");
 	dyStringPrintf(dy, " type='%s',\n", peType);
 	dyStringAppend(dy, " seq='");
 	dyStringAppend(dy, peSeq);
 	dyStringAppend(dy, "',\n");
 	dyStringPrintf(dy, " tName='%s',\n", tName);
 	dyStringPrintf(dy, " tStart=%d,\n", tStart);
 	dyStringPrintf(dy, " tEnd=%d,\n", tEnd);
 	dyStringPrintf(dy, " tStrand='%s',\n", tStrand);
 	dyStringPrintf(dy, " db='%s',\n", db);
 	dyStringPrintf(dy, " taxon='%d',\n", taxon);
 	dyStringPrintf(dy, " state='%s'\n", state);
 	verbose(2, "%s\n", dy->string);
 	sqlUpdate(conn2, dy->string);
 	vgPrb = sqlLastAutoId(conn2);
 	vgPrbCount++;
 	}
 	
     dyStringClear(dy);
     dyStringAppend(dy, "insert into vgPrbMap set");
     dyStringPrintf(dy, " probe=%d,\n", peProbe);
     dyStringPrintf(dy, " vgPrb=%d \n", vgPrb);
     verbose(2, "%s\n", dy->string);
     sqlUpdate(conn2, dy->string);
 
     probeCount++;
 	
     }
 
 verbose(1, "# new probe records found = %d, # new vgPrb records added = %d\n", probeCount, vgPrbCount);
 
 dyStringFree(&dy);
     
 sqlFreeResult(&sr);
 
 sqlDisconnect(&conn3);
 sqlDisconnect(&conn2);
 
 }
 
 static void processIsPcr(struct sqlConnection *conn, int taxon, char *db)
 /* process isPcr results  */
 {
 
 /* >NM_010919:371+1088 2 718bp CGCGGATCCAAGGACATCTTGGACCTTCCG CCCAAGCTTGCATGTGCTGCAGCGACTGCG */
 
 struct dyString *dy = dyStringNew(0);
 struct lineFile *lf = lineFileOpen("isPcr.fa", TRUE);
 int lineSize;
 char *line;
 char *name;
 char *dna;
 char *word, *end;
 char *tName;
 int tStart;
 int tEnd;
 char *tStrand;
 int probeid=0;  /* really a vgPrb id */
 boolean more = lineFileNext(lf, &line, &lineSize);
 while(more)
     {
     if (line[0] != '>')
 	errAbort("unexpected error out of phase\n");
     name = cloneString(line);
     verbose(1,"name=%s\n",name);
     dyStringClear(dy);
     while((more=lineFileNext(lf, &line, &lineSize)))
 	{
 	if (line[0] == '>')
 	    {
 	    break;
 	    }
 	dyStringAppend(dy,line);	    
 	}
     dna = cloneString(dy->string);
     word = name+1;
     end = strchr(word,':');
     tName = cloneStringZ(word,end-word); 
     word = end+1;
     end = strchr(word,'+');
     tStrand = "+";
     if (!end)
 	{
 	end = strchr(word,'-');
 	tStrand = "-";
 	}
     tStart = atoi(word); 
     word = end+1;
     end = strchr(word,' ');
     tEnd = atoi(word); 
     word = end+1;
     end = strchr(word,' ');
     probeid = atoi(word); 
 
     dyStringClear(dy);
     dyStringPrintf(dy, "select count(*) from vgPrb where id=%d and state='new'",probeid);
     if (sqlQuickNum(conn,dy->string)>0)
 	{
 	/* record exists and hasn't already been updated */
 
 	int vgPrb = findVgPrbBySeq(conn,dna,taxon);
 	
 	if (vgPrb == 0)
 	    {
 	    dyStringClear(dy);
 	    dyStringAppend(dy, "update vgPrb set");
 	    dyStringAppend(dy, " seq='");
 	    dyStringAppend(dy, dna);
 	    dyStringAppend(dy, "',\n");
 	    dyStringPrintf(dy, " tName='%s',\n", tName);
 	    dyStringPrintf(dy, " tStart=%d,\n", tStart);
 	    dyStringPrintf(dy, " tEnd=%d,\n", tEnd);
 	    dyStringPrintf(dy, " tStrand='%s',\n", tStrand);
 	    dyStringPrintf(dy, " db='%s',\n", db);
 	    dyStringPrintf(dy, " state='%s'\n", "seq");
 	    dyStringPrintf(dy, " where id=%d\n", probeid);
 	    dyStringPrintf(dy, " and state='%s'\n", "new");
 	    verbose(2, "%s\n", dy->string);
 	    sqlUpdate(conn, dy->string);
 	    }
 	else  /* probe seq already exists */ 
 	    { 
 	    /* just re-map the probe table recs to it */
 	    dyStringClear(dy);
 	    dyStringPrintf(dy, "update vgPrbMap set vgPrb=%d where vgPrb=%d",vgPrb,probeid);
 	    sqlUpdate(conn, dy->string);
 	    /* and delete it from vgPrb */
 	    dyStringClear(dy);
 	    dyStringPrintf(dy, "delete from vgPrb where id=%d",probeid);
 	    sqlUpdate(conn, dy->string);
 	    }
 	}
     
     freez(&tName);
     freez(&name);
     freez(&dna);
     }
 lineFileClose(&lf);
 
 dyStringFree(&dy);
 }
 
 static int doBacs(struct sqlConnection *conn, int taxon, char *db)
 /* fetch available sequence for bacEndPairs */
 {
 struct dyString *dy = dyStringNew(0);
 struct dnaSeq *chromSeq = NULL;
 struct bac *bacs = bacRead(conn, taxon, db);
 struct bac *bac = NULL;
 char *chrom = cloneString("");
 int count = 0;
 
 verbose(1,"bac list read done.\n");
 
 for(bac=bacs;bac;bac=bac->next)
     {
     
     if (differentWord(chrom,bac->chrom))
 	{
 	verbose(1,"switching to chrom %s\n",bac->chrom);
 	dnaSeqFree(&chromSeq); 
 	chromSeq = hLoadChrom(bac->chrom,db);
 	freez(&chrom);
 	chrom = cloneString(bac->chrom);
 	}
 
     char *dna = checkAndFetchBacDna(chromSeq, bac);
     if (sameString(bac->strand,"-"))
 	{
 	reverseComplement(dna,strlen(dna));
 	}
 
 
     dyStringClear(dy);
     dyStringPrintf(dy, "select count(*) from vgPrb where id=%d and state='new'",bac->probe);
     if (sqlQuickNum(conn,dy->string)>0)
 	{
 	/* record exists and hasn't already been updated */
 
 	int vgPrb = findVgPrbBySeq(conn,dna,taxon);
 	
 	if (vgPrb == 0)
 	    {
 	    dyStringClear(dy);
 	    dyStringAppend(dy, "update vgPrb set");
 	    dyStringAppend(dy, " seq='");
 	    dyStringAppend(dy, dna);
 	    dyStringAppend(dy, "',\n");
 	    dyStringPrintf(dy, " tName='%s',\n", bac->chrom);
 	    dyStringPrintf(dy, " tStart=%d,\n", bac->chromStart);
 	    dyStringPrintf(dy, " tEnd=%d,\n", bac->chromEnd);
 	    dyStringPrintf(dy, " tStrand='%s',\n", bac->strand);
 	    dyStringPrintf(dy, " db='%s',\n", db);
 	    dyStringPrintf(dy, " state='%s'\n", "seq");
 	    dyStringPrintf(dy, " where id=%d\n", bac->probe);
 	    dyStringPrintf(dy, " and state='%s'\n", "new");
 	    //verbose(2, "%s\n", dy->string); // the sql string could be quite large
 	    sqlUpdate(conn, dy->string);
 	    }
 	else  /* probe seq already exists */ 
 	    { 
 	    /* just re-map the probe table recs to it */
 	    dyStringClear(dy);
 	    dyStringPrintf(dy, "update vgPrbMap set vgPrb=%d where vgPrb=%d",vgPrb,bac->probe);
 	    sqlUpdate(conn, dy->string);
 	    /* and delete it from vgPrb */
 	    dyStringClear(dy);
 	    dyStringPrintf(dy, "delete from vgPrb where id=%d",bac->probe);
 	    sqlUpdate(conn, dy->string);
 	    }
 	    
 	++count; 
 	
     
 	verbose(2,"%d finished bac for probe id %d size %d\n", 
 	    count, bac->probe, bac->chromEnd - bac->chromStart);
 	}
 
     freez(&dna);
     }
 
 freez(&chrom);
 dnaSeqFree(&chromSeq);
 
 bacFreeList(&bacs);
 
 dyStringFree(&dy);
 
 return count;  
 }
 
 static void doPrimers(struct sqlConnection *conn, int taxon, char *db)
 /* get probe seq from primers */
 {
 int rc = 0;
 struct dyString *dy = dyStringNew(0);
 char cmdLine[256];
 char path1[256];
 char path2[256];
 
 dyStringClear(dy);
 dyStringAppend(dy, "select e.id, p.fPrimer, p.rPrimer from probe p, vgPrbMap m, vgPrb e, gene g");
 dyStringPrintf(dy, " where p.id = m.probe and m.vgPrb = e.id and g.id = p.gene and g.taxon = %d",taxon);
 dyStringAppend(dy, " and e.state = 'new' and e.type='primersMrna'");
 rc = sqlSaveQuery(conn, dy->string, "primers.query", FALSE);
 verbose(1,"rc = %d = count of primers for mrna search for taxon %d\n",rc,taxon);
 
 if (rc > 0) /* something to do */
     {
 
     dyStringClear(dy);
     dyStringPrintf(dy, "select qName from %s.all_mrna",db);
     rc = 0;
     rc = sqlSaveQuery(conn, dy->string, "accFile.txt", FALSE);
     safef(cmdLine,sizeof(cmdLine),"getRna %s accFile.txt mrna.fa",db);
     system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date");
     
     verbose(1,"rc = %d = count of mrna for %s\n",rc,db);
 
     system("date"); system("isPcr mrna.fa primers.query isPcr.fa -out=fa"); system("date");
     system("ls -l");
 
     processIsPcr(conn,taxon,db);
 
     unlink("mrna.fa"); unlink("accFile.txt"); unlink("isPcr.fa");
 
     }
 unlink("primers.query");    
 
 /* find any remaining type primersMrna that couldn't be resolved and demote 
  * them to type primersGenome
  */
 dyStringClear(dy);
 dyStringAppend(dy, "update vgPrb set type='primersGenome'"); 
 dyStringPrintf(dy, " where taxon = %d",taxon);
 dyStringAppend(dy, " and state = 'new' and type='primersMrna'");
 sqlUpdate(conn, dy->string);
 
 
 
 /* get primers for those probes that did not find mrna isPcr matches 
  * and then do them against the genome instead */
 dyStringClear(dy);
 dyStringAppend(dy, "select e.id, p.fPrimer, p.rPrimer from probe p, vgPrbMap m, vgPrb e, gene g");
 dyStringPrintf(dy, " where p.id = m.probe and m.vgPrb = e.id and g.id = p.gene and g.taxon = %d",taxon);
 dyStringAppend(dy, " and e.state = 'new' and e.type='primersGenome'");
 rc = 0;
 rc = sqlSaveQuery(conn, dy->string, "primers.query", FALSE);
 verbose(1,"rc = %d = count of primers for genome search for taxon %d\n",rc,taxon);
 
 if (rc > 0) /* something to do */
     {
     safef(path1,sizeof(path1),"/gbdb/%s/%s.2bit",db,db);
     safef(path2,sizeof(path2),"%s/%s.2bit",getCurrentDir(),db);
     verbose(1,"copy: [%s] to [%s]\n",path1,path2);  copyFile(path1,path2);
 
     safef(cmdLine,sizeof(cmdLine),
 	    "ssh kolossus 'cd %s; isPcr %s.2bit primers.query isPcr.fa -out=fa'",
 	    getCurrentDir(),db);
     system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date");
     safef(path2,sizeof(path2),"%s/%s.2bit",getCurrentDir(),db);
     verbose(1,"rm %s\n",path2); unlink(path2); system("ls -l");
 
     processIsPcr(conn,taxon,db);
     
     unlink("mrna.fa"); unlink("accFile.txt"); unlink("isPcr.fa");
 
     }
 unlink("primers.query");    
 
 /* find any remaining type primersGenome that couldn't be resolved and demote 
  * them to type refSeq
  */
 dyStringClear(dy);
 dyStringAppend(dy, "update vgPrb set type='refSeq'"); 
 dyStringPrintf(dy, " where taxon = %d",taxon);
 dyStringAppend(dy, " and state = 'new' and type='primersGenome'");
 sqlUpdate(conn, dy->string);
 
 dyStringFree(&dy);
 }
 
 static void doBacEndPairs(struct sqlConnection *conn, int taxon, char *db)
 /* get probe seq bacEndPairs */
 {
 struct dyString *dy = dyStringNew(0);
 int rc = 0;
 /* fetch available sequence for bacEndPairs */
 rc = doBacs(conn, taxon, db); verbose(1,"found seq for %d bacEndPairs\n",rc);
 
 /* find any remaining type bac that couldn't be resolved and demote 
  * them to type refSeq
  */
 dyStringClear(dy);
 
 dyStringAppend(dy, "update vgPrb set type='refSeq'"); 
 dyStringPrintf(dy, " where taxon = %d",taxon);
 dyStringAppend(dy, " and state = 'new' and type='bac'");
 
 sqlUpdate(conn, dy->string);
 
 dyStringFree(&dy);
 }
 
 
 
 static void setTName(struct sqlConnection *conn, int taxon, char *db, char *type, char *fld)
 /* set vgPrb.tName to the desired value to try, 
  *  e.g. gene.refSeq, gene.genbank, mm6.refFlat.name(genoName=gene.name). */
 {
 struct dyString *dy = dyStringNew(0);
 dyStringClear(dy);
 dyStringPrintf(dy, 
     "update vgPrb e, vgPrbMap m, probe p, gene g"
     " set e.seq = '', e.tName = g.%s"
     " where e.id = m.vgPrb and m.probe = p.id and p.gene = g.id"
     " and e.type = '%s'"
     " and e.state = 'new'"
     " and e.taxon = %d"
     , fld, type, taxon);
 sqlUpdate(conn, dy->string);
 dyStringFree(&dy);
 }
 
 static void setTNameMapped(struct sqlConnection *conn, int taxon, char *db, char *type, char *fld, 
             char *mapTable, char *mapField, char *mapTo)
 /* set vgPrb.tName to the desired value to try, 
  *  e.g. gene.refSeq, gene.genbank, mm6.refFlat.name(genoName=gene.name). */
 {
 struct dyString *dy = dyStringNew(0);
 dyStringClear(dy);
 dyStringPrintf(dy, 
 "update vgPrb e, vgPrbMap m, probe p, gene g, %s.%s f"
 " set e.seq = '', e.tName = f.%s"
 " where e.id = m.vgPrb and m.probe = p.id and p.gene = g.id"
 " and g.%s = f.%s"
     " and e.type = '%s'"
     " and e.state = 'new'"
     " and g.taxon = %d"
     ,db, mapTable, mapTo, fld, mapField, type, taxon);
 sqlUpdate(conn, dy->string);
 dyStringFree(&dy);
 }
 
 static void processMrnaFa(struct sqlConnection *conn, int taxon, char *type, char *db)
 /* process isPcr results  */
 {
 
 struct dyString *dy = dyStringNew(0);
 struct lineFile *lf = lineFileOpen("mrna.fa", TRUE);
 int lineSize;
 char *line;
 char *name;
 char *dna;
 boolean more = lineFileNext(lf, &line, &lineSize);
 while(more)
     {
     if (line[0] != '>')
 	errAbort("unexpected error out of phase\n");
     name = cloneString(line+1);
     verbose(2,"name=%s\n",name);
     dyStringClear(dy);
     while((more=lineFileNext(lf, &line, &lineSize)))
 	{
 	if (line[0] == '>')
 	    {
 	    break;
 	    }
 	dyStringAppend(dy,line);	    
 	}
     dna = cloneString(dy->string);
 
     while(1)
 	{
 	int oldProbe = 0;
 	dyStringClear(dy);
 	dyStringPrintf(dy, "select id from vgPrb "
 	   "where taxon=%d and type='%s' and tName='%s' and state='new'",taxon,type,name);
 	oldProbe = sqlQuickNum(conn,dy->string);
 	if (oldProbe==0)
 	    break;       /* no more records match */
 	    
 	/* record exists and hasn't already been updated */
 	
 	int vgPrb = findVgPrbBySeq(conn,dna,taxon);
 	
 	if (vgPrb == 0)
 	    {
 	    dyStringClear(dy);
 	    dyStringAppend(dy, "update vgPrb set");
 	    dyStringAppend(dy, " seq = '");
 	    dyStringAppend(dy, dna);
 	    dyStringAppend(dy, "',\n");
 	    dyStringPrintf(dy, " db = '%s',\n", db);
 	    dyStringAppend(dy, " state = 'seq'\n");
 	    dyStringPrintf(dy, " where id=%d\n", oldProbe);
 	    dyStringPrintf(dy, " and state='%s'\n", "new");
 	    verbose(2, "%s\n", dy->string);
 	    sqlUpdate(conn, dy->string);
 	    }
 	else  /* probe seq already exists */ 
 	    { 
 	    /* just re-map the probe table recs to it */
 	    dyStringClear(dy);
 	    dyStringPrintf(dy, "update vgPrbMap set vgPrb=%d where vgPrb=%d",vgPrb,oldProbe);
 	    sqlUpdate(conn, dy->string);
 	    /* and delete it from vgPrb */
 	    dyStringClear(dy);
 	    dyStringPrintf(dy, "delete from vgPrb where id=%d",oldProbe);
 	    sqlUpdate(conn, dy->string);
 	    }
 	    
 	}    
 
     freez(&name);
     freez(&dna);
     }
 lineFileClose(&lf);
 
 dyStringFree(&dy);
 }
 
 
 
 static int getAccMrnas(struct sqlConnection *conn, int taxon, char *db, char *type, char *table)
 /* get mRNAs for one vgPrb acc type */
 {
 int rc = 0;
 struct dyString *dy = dyStringNew(0);
 char cmdLine[256];
 dyStringClear(dy);
 dyStringPrintf(dy, 
     "select distinct e.tName from vgPrb e, %s.%s m"
     " where e.tName = m.qName"
     " and e.taxon = %d and e.type = '%s' and e.tName <> '' and e.state = 'new'"
     ,db,table,taxon,type);
 rc = 0;
 rc = sqlSaveQuery(conn, dy->string, "accFile.txt", FALSE);
 safef(cmdLine,sizeof(cmdLine),"getRna %s accFile.txt mrna.fa",db);
 //system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); 
 system(cmdLine); 
 //system("date");
 
 processMrnaFa(conn, taxon, type, db);
 
 unlink("mrna.fa"); 
 unlink("accFile.txt");
 
 dyStringFree(&dy);
 return rc;
 }
 
 static void advanceType(struct sqlConnection *conn, int taxon, char *oldType, char *newType)
 /* find any remaining type refSeq that couldn't be resolved and demote 
  * them to type genbank
  */
 { 
 struct dyString *dy = dyStringNew(0);
 dyStringClear(dy);
 dyStringPrintf(dy, 
  "update vgPrb set type='%s', tName=''"
  " where taxon = %d and state = 'new' and type='%s'"
  ,newType,taxon,oldType);
 sqlUpdate(conn, dy->string);
 dyStringFree(&dy);
 }
 
 static void doAccessionsSeq(struct sqlConnection *conn, int taxon, char *db)
 /* get probe seq from Accessions */
 {
 int rc = 0;
 struct dyString *dy = dyStringNew(0);
 
 /* get refSeq accessions and rna */
 setTName(conn, taxon, db, "refSeq", "refSeq");
 rc = getAccMrnas(conn, taxon, db, "refSeq", "refSeqAli");
 verbose(1,"rc = %d = count of refSeq mrna for %s\n",rc,db);
 advanceType(conn,taxon,"refSeq","genRef");
 
 /* get refSeq-in-gene.genbank accessions and rna */
 setTName(conn, taxon, db, "genRef", "genbank");
 rc = getAccMrnas(conn, taxon, db, "genRef", "refSeqAli");
 verbose(1,"rc = %d = count of genRef mrna for %s\n",rc,db);
 advanceType(conn,taxon,"genRef","genbank");
 
 /* get genbank accessions and rna */
 setTName(conn, taxon, db, "genbank", "genbank");
 rc = getAccMrnas(conn, taxon, db, "genbank", "all_mrna");
 verbose(1,"rc = %d = count of genbank mrna for %s\n",rc,db);
 advanceType(conn,taxon,"genbank","flatRef");
 
 /* get gene.name -> refFlat to refSeq accessions and rna */
 setTNameMapped(conn, taxon, db, "flatRef", "name", "refFlat", "geneName", "name");
 rc = getAccMrnas(conn, taxon, db, "flatRef", "refSeqAli");
 verbose(1,"rc = %d = count of flatRef mrna for %s\n",rc,db);
 advanceType(conn,taxon,"flatRef","flatAll");
 
 /* get gene.name -> refFlat to all_mrna accessions */
 setTNameMapped(conn, taxon, db, "flatAll", "name", "refFlat", "geneName", "name");
 rc = getAccMrnas(conn, taxon, db, "flatAll", "all_mrna");
 verbose(1,"rc = %d = count of flatAll mrna for %s\n",rc,db);
 advanceType(conn,taxon,"flatAll","linkRef");
 
 
 
 /* get gene.name -> refLink to refSeq accessions and rna */
 setTNameMapped(conn, taxon, db, "linkRef", "name", "refLink", "name", "mrnaAcc");
 rc = getAccMrnas(conn, taxon, db, "linkRef", "refSeqAli");
 verbose(1,"rc = %d = count of linkRef mrna for %s\n",rc,db);
 advanceType(conn,taxon,"linkRef","linkAll");
 
 /* get gene.name -> refLink to all_mrna accessions */
 setTNameMapped(conn, taxon, db, "linkAll", "name", "refLink", "name", "mrnaAcc");
 rc = getAccMrnas(conn, taxon, db, "linkAll", "all_mrna");
 verbose(1,"rc = %d = count of linkAll mrna for %s\n",rc,db);
 advanceType(conn,taxon,"linkAll","kgAlRef");
 
 
 
 /* get gene.name -> kgAlias to refSeq accessions and rna */
 setTNameMapped(conn, taxon, db, "kgAlRef", "name", "kgAlias", "alias", "kgId");
 rc = getAccMrnas(conn, taxon, db, "kgAlRef", "refSeqAli");
 verbose(1,"rc = %d = count of kgAlRef mrna for %s\n",rc,db);
 advanceType(conn,taxon,"kgAlRef","kgAlAll");
 
 /* get gene.name -> kgAlias to all_mrna accessions */
 setTNameMapped(conn, taxon, db, "kgAlAll", "name", "kgAlias", "alias", "kgId");
 rc = getAccMrnas(conn, taxon, db, "kgAlAll", "all_mrna");
 verbose(1,"rc = %d = count of kgAlAll mrna for %s\n",rc,db);
 advanceType(conn,taxon,"kgAlAll","gene");
 
 dyStringFree(&dy);
 }
 
 static void initTable(struct sqlConnection *conn, char *table, boolean nuke)
 /* build tables */
 {
 char *sql = NULL;
 char path[256];
 if (nuke)
     sqlDropTable(conn, table);
 if (!sqlTableExists(conn, table))
     {
     safef(path,sizeof(path),"%s/%s.sql",sqlPath,table);
     readInGulp(path, &sql, NULL);
     sqlUpdate(conn, sql);
     }
 }
 
 
 static void populateMissingVgPrbAli(struct sqlConnection *conn, int taxon, char *db, char *table)
 /* populate vgPrbAli for db */
 {
 struct dyString *dy = dyStringNew(0);
 dyStringClear(dy);
 dyStringPrintf(dy,
 "insert into %s"
 " select distinct '%s', e.id, 'new' from vgPrb e"
 " left join %s a on e.id = a.vgPrb and a.db = '%s'"
 " where a.vgPrb is NULL "
 " and e.taxon = %d"
     ,table,db,table,db,taxon);
 sqlUpdate(conn, dy->string);
 dyStringFree(&dy);
 }
 
 
 static void updateVgPrbAli(struct sqlConnection *conn, char *db, char *table, char *track)
 /* update vgPrbAli from vgProbes track for db */
 {
 struct dyString *dy = dyStringNew(0);
 char dbTrk[256];
 safef(dbTrk,sizeof(dbTrk),"%s.%s",db,track);
 if (!sqlTableExists(conn, dbTrk))
     {
     struct sqlConnection *conn2 = sqlConnect(db);
     verbose(1,"FYI: Table %s does not exist\n",dbTrk);
     initTable(conn2, track, FALSE);
     sqlDisconnect(&conn2);
     }
 dyStringClear(dy);
 dyStringPrintf(dy,
 "update %s a, %s.%s v"
 " set a.status = 'ali'"
 " where v.qName = concat('vgPrb_',a.vgPrb)"
 " and a.db = '%s'"
     ,table,db,track,db);
 sqlUpdate(conn, dy->string);
 dyStringFree(&dy);
 }
 
 
 static void markNoneVgPrbAli(struct sqlConnection *conn, int fromTaxon, char *db, char *table)
 /* mark records in vgPrbAli that did not find any alignment for db */
 {
 struct dyString *dy = dyStringNew(0);
 dyStringClear(dy);
 dyStringPrintf(dy,
 "update %s a, vgPrb e"
 " set a.status = 'none'"
 " where a.status = 'new'"
 " and a.db = '%s' and e.taxon = %d"
 " and a.vgPrb = e.id"
     ,table,db,fromTaxon);
 sqlUpdate(conn, dy->string);
 dyStringFree(&dy);
 }
 
 
 
 static int doAccPsls(struct sqlConnection *conn, int taxon, char *db, char *type, char *table)
 /* get psls for one vgPrb acc type */
 {
 int rc = 0;
 struct dyString *dy = dyStringNew(0);
 char outName[256];
 dyStringClear(dy);
 dyStringPrintf(dy, 
     "select m.matches,m.misMatches,m.repMatches,m.nCount,m.qNumInsert,m.qBaseInsert,"
     "m.tNumInsert,m.tBaseInsert,m.strand,"
     "concat(\"vgPrb_\",e.id),"
     "m.qSize,m.qStart,m.qEnd,m.tName,m.tSize,m.tStart,m.tEnd,m.blockCount,"
     "m.blockSizes,m.qStarts,m.tStarts"    
     " from vgPrb e, vgPrbAli a, %s.%s m"
     " where e.id = a.vgPrb and a.db = '%s' and a.status='new' and e.tName = m.qName"
     " and e.taxon = %d and e.type = '%s' and e.tName <> '' and e.state='seq' and e.seq <> ''"
     " order by m.tName,m.tStart"
     ,db,table,db,taxon,type);
 rc = 0;
 safef(outName,sizeof(outName),"%s.psl",type);
 rc = sqlSaveQuery(conn, dy->string, outName, FALSE);
 dyStringFree(&dy);
 return rc;
 }
 
 static int dumpPslTable(struct sqlConnection *conn, char *db, char *table)
 /* dump psls for db.table to table.psl in current (working) dir */
 {
 int rc = 0;
 struct dyString *dy = dyStringNew(0);
 char outName[256];
 dyStringClear(dy);
 dyStringPrintf(dy, 
     "select matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert,"
     "tNumInsert,tBaseInsert,strand,"
     "qName,qSize,qStart,qEnd,tName,tSize,tStart,tEnd,"
     "blockCount,blockSizes,qStarts,tStarts"    
     " from %s.%s"
     " order by tName,tStart"
     ,db,table);
 rc = 0;
 safef(outName,sizeof(outName),"%s.psl",table);
 rc = sqlSaveQuery(conn, dy->string, outName, FALSE);
 dyStringFree(&dy);
 return rc;
 }
 
 static void doAccessionsAli(struct sqlConnection *conn, int taxon, char *db)
 /* get probe alignments from Accessions */
 {
 int rc = 0;
 
 /* get refSeq psls */
 rc = doAccPsls(conn, taxon, db, "refSeq", "refSeqAli");
 verbose(1,"rc = %d = count of refSeq psls for %s\n",rc,db);
 
 /* get genRef psls */
 rc = doAccPsls(conn, taxon, db, "genRef", "refSeqAli");
 verbose(1,"rc = %d = count of genRef psls for %s\n",rc,db);
 
 /* get genbank psls */
 rc = doAccPsls(conn, taxon, db, "genbank", "all_mrna");
 verbose(1,"rc = %d = count of genbank psls for %s\n",rc,db);
 
 /* get flatRef psls */
 rc = doAccPsls(conn, taxon, db, "flatRef", "refSeqAli");
 verbose(1,"rc = %d = count of flatRef psls for %s\n",rc,db);
 
 /* get flatAll psls */
 rc = doAccPsls(conn, taxon, db, "flatAll", "all_mrna");
 verbose(1,"rc = %d = count of flatAll psls for %s\n",rc,db);
 
 /* get linkRef psls */
 rc = doAccPsls(conn, taxon, db, "linkRef", "refSeqAli");
 verbose(1,"rc = %d = count of linkRef psls for %s\n",rc,db);
 
 /* get linkAll psls */
 rc = doAccPsls(conn, taxon, db, "linkAll", "all_mrna");
 verbose(1,"rc = %d = count of linkAll psls for %s\n",rc,db);
 
 /* get kgAlRef psls */
 rc = doAccPsls(conn, taxon, db, "kgAlRef", "refSeqAli");
 verbose(1,"rc = %d = count of kgAlRef psls for %s\n",rc,db);
 
 /* get kgAlAll psls */
 rc = doAccPsls(conn, taxon, db, "kgAlAll", "all_mrna");
 verbose(1,"rc = %d = count of kgAlRef psls for %s\n",rc,db);
 
 }
 
 static void doFakeBacAli(struct sqlConnection *conn, int taxon, char *db)
 /* place probe seq from primers, etc with blat */
 {
 int rc = 0;
 struct dyString *dy = dyStringNew(0);
 /* create fake psls as blatBAC.psl */
 dyStringClear(dy);
 dyStringPrintf(dy, 
     "select length(e.seq), 0, 0, 0, 0, 0, 0, 0, e.tStrand, concat('vgPrb_',e.id), length(e.seq),"
     " 0, length(e.seq), e.tName, ci.size, e.tStart, e.tEnd, 1,"
     " concat(length(e.seq),','), concat(0,','), concat(e.tStart,',')"
     " from vgPrb e, %s.chromInfo ci, vgPrbAli a"
     " where ci.chrom = e.tName and e.id = a.vgPrb"
     " and e.type = 'bac'"
     " and e.taxon = %d"
     " and a.db = '%s' and a.status='new'"
     , db, taxon, db);
 //restore: 
 rc = sqlSaveQuery(conn, dy->string, "fakeBAC.psl", FALSE);
 verbose(1,"rc = %d = count of sequences for fakeBAC.psl, for taxon %d\n",rc,taxon);
 dyStringFree(&dy);
 }
 
 static void doBlat(struct sqlConnection *conn, int taxon, char *db)
 /* place probe seq from non-BAC with blat that have no alignments yet */
 {
 int rc = 0;
 char *blatSpec=NULL;
 char cmdLine[256];
 char path1[256];
 char path2[256];
 struct dyString *dy = dyStringNew(0);
 
 /* (non-BACs needing alignment) */
 dyStringClear(dy);
 dyStringPrintf(dy, 
     "select concat(\"vgPrb_\",e.id), e.seq"
     " from vgPrb e, vgPrbAli a"
     " where e.id = a.vgPrb"
     " and a.db = '%s'"
     " and a.status = 'new'"
     " and e.taxon = %d"
     " and e.type <> 'bac'"
     " and e.seq <> ''"
     " order by e.id"
     , db, taxon);
 //restore: 
 rc = sqlSaveQuery(conn, dy->string, "blat.fa", TRUE);
 verbose(1,"rc = %d = count of sequences for blat, to get psls for taxon %d\n",rc,taxon);
 
 if (rc == 0) 
     {
     unlink("blat.fa");
     system("rm -f blatNearBest.psl; touch blatNearBest.psl");  /* make empty file */
     return;
     }
 
 /* make .ooc and blat on kolossus */
 
 safef(path1,sizeof(path1),"/gbdb/%s/%s.2bit",db,db);
 safef(path2,sizeof(path2),"%s/%s.2bit",getCurrentDir(),db);
 //restore: 
 verbose(1,"copy: [%s] to [%s]\n",path1,path2);  copyFile(path1,path2);
 
 safef(cmdLine,sizeof(cmdLine),
 "ssh kolossus 'cd %s; blat -makeOoc=11.ooc -tileSize=11"
 " -repMatch=1024 %s.2bit /dev/null /dev/null'",
     getCurrentDir(),db);
 //restore: 
 system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date");
 
 safef(cmdLine,sizeof(cmdLine),
 	"ssh kolossus 'cd %s; blat %s.2bit blat.fa -ooc=11.ooc -noHead blat.psl'",
 	getCurrentDir(),db);
 //restore: 
 system("date"); verbose(1,"cmdLine: [%s]\n",cmdLine); system(cmdLine); system("date");
 
 /* using blat even with -fastMap was way too slow - took over a day,
  * so instead I will make a procedure to write a fake psl for the BACs
  * which you will see called below */
 
 safef(path2,sizeof(path2),"%s.2bit",db);
 verbose(1,"rm %s\n",path2); unlink(path2); 
 
 safef(path2,sizeof(path2),"11.ooc");
 verbose(1,"rm %s\n",path2); unlink(path2); 
 
 
 /* skip psl header and sort on query name */
 safef(cmdLine,sizeof(cmdLine), "sort -k 10,10 blat.psl > blatS.psl");
 verbose(1,"cmdLine=[%s]\n",cmdLine);
 system(cmdLine); 
 
 /* keep near best within 5% of the best */
 safef(cmdLine,sizeof(cmdLine), 
     "pslCDnaFilter -globalNearBest=0.005 -minId=0.96 -minNonRepSize=20 -minCover=0.50"
     " blatS.psl blatNearBest.psl");
 verbose(1,"cmdLine=[%s]\n",cmdLine);
 system(cmdLine); 
 
 unlink("blat.fa");
 unlink("blat.psl");
 unlink("blatS.psl");
 
 freez(&blatSpec);
 dyStringFree(&dy);
 }
 
 
 
 void static assembleAllPsl(struct sqlConnection *conn, int taxon, char *db)
 /* assemble NonBlat.psl from variouls psl alignments */
 {
 // " blatNearBest.psl" not included
 /* make final psl */
 system(
 "cat"
 " fakeBAC.psl"
 " flatAll.psl"
 " flatRef.psl"
 " genRef.psl"
 " genbank.psl"
 " kgAlAll.psl"
 " kgAlRef.psl"
 " linkAll.psl"
 " linkRef.psl"
 " refSeq.psl"
 " > vgPrbNonBlat.psl"
 );
 
 verbose(1,"vgPrbNonBlat.psl assembled for taxon %d\n",taxon);
 
 
 //unlink("blatNearBest.psl");  
 //unlink("fakeBAC.psl");  
 
 //system("ls -ltr");
 
 }
 
 
 static void rollupPsl(char *pslName, char *table, struct sqlConnection *conn, char *db)
 {
 char cmd[256];
 char dbTbl[256];
 
 if (fileSize(pslName)==0)
     return;
 
 safef(dbTbl,sizeof(dbTbl),"%s.%s",db,table);
 if (!sqlTableExists(conn, dbTbl))
     {
     verbose(1,"FYI: Table %s does not exist\n",dbTbl);
     safef(cmd,sizeof(cmd),"rm -f %s.psl; touch %s.psl",table,table); /* make empty file */
     verbose(1,"%s\n",cmd); system(cmd);
     }
 else
     {
     dumpPslTable(conn, db, table);
     }
 
 safef(cmd,sizeof(cmd),"cat %s %s.psl | sort -u | sort -k 10,10 > %sNew.psl", pslName, table, table);
 verbose(1,"%s\n",cmd); system(cmd);
 
 safef(cmd,sizeof(cmd),"hgLoadPsl %s %sNew.psl -table=%s",db,table,table);
 verbose(1,"%s\n",cmd); system(cmd);
 
 safef(cmd,sizeof(cmd),"rm %s %s.psl %sNew.psl",pslName,table,table);
 verbose(1,"%s\n",cmd); 
 //system(cmd);
 
 }
 
 
 static int findTaxon(struct sqlConnection *conn, char *db)
 /* return the taxon for the given db, or abort */
 {
 char sql[256];
 int taxon = 0;
 char *fmt = "select ncbi_taxa_id from go.species "
 "where concat(genus,' ',species) = '%s'";
 safef(sql,sizeof(sql),fmt,hScientificName(db));
 taxon = sqlQuickNum(conn, sql);
 if (taxon == 0) 
     {
     if (sameString(db,"nibb"))   /* we don't really have this frog as an assembly */
 	taxon = 8355;    /* Xenopus laevis - African clawed frog */
     /* can put more here in future */    
     }
 if (taxon == 0)
    errAbort("unknown taxon for db = %s, unable to continue until its taxon is defined.",db);
 return taxon;
 }
 
 
 static void makeFakeProbeSeq(struct sqlConnection *conn, char *db)
 /* get probe seq from primers, bacEndPairs, refSeq, genbank, gene-name */
 {
 
 int taxon = findTaxon(conn,db);
 
 //restore: 
 doPrimers(conn, taxon, db);
 
 //restore: 
 doBacEndPairs(conn, taxon, db);
 
 //restore: 
 doAccessionsSeq(conn, taxon, db);
 
 }
 
 /*  keep around, but dangerous in the wrong hands ;)
 static void init(struct sqlConnection *conn)
 / * build tables - for the first time * /
 {
 if (!sqlTableExists(conn, "vgPrb"))
     {
     initTable(conn, "vgPrb", FALSE);  / * this most important table should never be nuked automatically * /
     sqlUpdate(conn, "create index tName on vgPrb(tName(20));");
     sqlUpdate(conn, "create index seq on vgPrb(seq(40));");
     }
 
 initTable(conn, "vgPrbMap", TRUE);
 sqlUpdate(conn, "create index probe on vgPrbMap(probe);");
 sqlUpdate(conn, "create index vgPrb on vgPrbMap(vgPrb);");
 
 initTable(conn, "vgPrbAli", TRUE);
 initTable(conn, "vgPrbAliAll", TRUE);
 
 }
 */
 
 
 static void doAlignments(struct sqlConnection *conn, char *db)
 {
 int taxon = findTaxon(conn,db);
 
 populateMissingVgPrbAli(conn, taxon, db, "vgPrbAli");
 
 updateVgPrbAli(conn, db, "vgPrbAli","vgProbes");
 
 doAccessionsAli(conn, taxon, db);
 
 doFakeBacAli(conn, taxon, db);
 
 assembleAllPsl(conn, taxon, db);
 
 rollupPsl("vgPrbNonBlat.psl", "vgProbes", conn, db);
 
 updateVgPrbAli(conn, db, "vgPrbAli","vgProbes");
 
 doBlat(conn, taxon, db);   
 
 rollupPsl("blatNearBest.psl", "vgProbes", conn, db);
 
 updateVgPrbAli(conn, db, "vgPrbAli","vgProbes");
 
 markNoneVgPrbAli(conn, taxon, db, "vgPrbAli");
 
 }
 
 
 static void getPslMapAli(struct sqlConnection *conn, 
     char *db, int fromTaxon, char *fromDb, boolean isBac)
 {
 int rc = 0;
 struct dyString *dy = dyStringNew(0);
 char outName[256];
 /* get {non-}bac $db.vgProbes not yet aligned */
 dyStringClear(dy);
 dyStringPrintf(dy, 
     "select m.matches,m.misMatches,m.repMatches,m.nCount,m.qNumInsert,m.qBaseInsert,"
     "m.tNumInsert,m.tBaseInsert,m.strand,"
     "m.qName,m.qSize,m.qStart,m.qEnd,m.tName,m.tSize,m.tStart,m.tEnd,m.blockCount,"
     "m.blockSizes,m.qStarts,m.tStarts"    
     " from vgPrb e, vgPrbAliAll a, %s.vgProbes m"
     " where e.id = a.vgPrb and a.db = '%s' and a.status='new'"
     " and m.qName = concat(\"vgPrb_\",e.id)"
     " and e.taxon = %d and e.type %s 'bac' and e.state='seq' and e.seq <> ''"
     " order by m.tName,m.tStart"
     ,fromDb,db,fromTaxon, isBac ? "=" : "<>");
 rc = 0;
 safef(outName,sizeof(outName), isBac ? "bac.psl" : "nonBac.psl");
 rc = sqlSaveQuery(conn, dy->string, outName, FALSE);
 verbose(1,"Count of %s Psls found for pslMap: %d\n", isBac ? "bac" : "nonBac", rc);
 }
 
 
 static void getPslMapFa(struct sqlConnection *conn, 
     char *db, int fromTaxon)
 {
 int rc = 0;
 struct dyString *dy = dyStringNew(0);
 /* get .fa for pslRecalcMatch use */
 dyStringClear(dy);
 dyStringPrintf(dy, 
     "select concat(\"vgPrb_\",e.id), e.seq"
     " from vgPrb e, vgPrbAliAll a"
     " where e.id = a.vgPrb"
     " and a.db = '%s'"
     " and a.status = 'new'"
     " and e.taxon = %d"
     " and e.seq <> ''"
     " order by e.id"
     , db, fromTaxon);
 //restore: 
 rc = sqlSaveQuery(conn, dy->string, "pslMap.fa", TRUE);
 verbose(1,"rc = %d = count of sequences for pslMap for taxon %d\n",rc,fromTaxon);
 }
 
 static void doPslMapAli(struct sqlConnection *conn, 
     int taxon, char *db, 
     int fromTaxon, char *fromDb)
 {
 char cmd[256];
 
 struct dyString *dy = dyStringNew(0);
 char path[256];
 char dnaPath[256];
 char toDb[12];
 
 safef(toDb,sizeof(toDb),"%s", db);
 toDb[0]=toupper(toDb[0]);
 
 safef(dnaPath,sizeof(dnaPath),"/cluster/data/%s/nib", db);
 if (!fileExists(dnaPath))
     {
     safef(dnaPath,sizeof(dnaPath),"/cluster/data/%s/%s.2bit", db, db);
     if (!fileExists(dnaPath))
 	errAbort("unable to locate nib dir or .2bit for %s: %s", db, dnaPath);
     }
     
 safef(path,sizeof(path),"/gbdb/%s/liftOver/%sTo%s.over.chain.gz", fromDb, fromDb, toDb);
 if (!fileExists(path))
     errAbort("unable to locate chain file %s",path);
 
 /* get non-bac $db.vgProbes not yet aligned */
 getPslMapAli(conn, db, fromTaxon, fromDb, FALSE);
 /* get bac $db.vgProbes not yet aligned */
 getPslMapAli(conn, db, fromTaxon, fromDb, TRUE);
 /* get .fa for pslRecalcMatch use */
 getPslMapFa(conn, db, fromTaxon);
 
 /* non-bac */
 safef(cmd,sizeof(cmd),
 "zcat %s | pslMap -chainMapFile -swapMap  nonBac.psl stdin stdout "
 "|  sort -k 14,14 -k 16,16n > unscoredNB.psl"
 ,path);
 verbose(1,"%s\n",cmd); system(cmd);
 
 safef(cmd,sizeof(cmd),
 "pslRecalcMatch unscoredNB.psl %s" 
 " pslMap.fa nonBac.psl"
 ,dnaPath);
 verbose(1,"%s\n",cmd); system(cmd);
 
 /* bac */
 safef(cmd,sizeof(cmd),
 "zcat %s | pslMap -chainMapFile -swapMap  bac.psl stdin stdout "
 "|  sort -k 14,14 -k 16,16n > unscoredB.psl"
 ,path);
 verbose(1,"%s\n",cmd); system(cmd);
 
 safef(cmd,sizeof(cmd),
 "pslRecalcMatch unscoredB.psl %s" 
 " pslMap.fa bacTemp.psl"
 ,dnaPath);
 verbose(1,"%s\n",cmd); system(cmd);
 
 safef(cmd,sizeof(cmd),
 "pslCDnaFilter -globalNearBest=0.00001 -minCover=0.05"
 " bacTemp.psl bac.psl");
 verbose(1,"%s\n",cmd); system(cmd);
 
 safef(cmd,sizeof(cmd),"cat bac.psl nonBac.psl > vgPrbPslMap.psl");
 verbose(1,"%s\n",cmd); system(cmd);
 
 dyStringFree(&dy);
 
 }
 
 static void doAlignmentsPslMap(struct sqlConnection *conn, char *db, char *fromDb)
 {
 int taxon = findTaxon(conn,db);
 int fromTaxon = findTaxon(conn,fromDb);
 
 populateMissingVgPrbAli(conn, fromTaxon, db, "vgPrbAliAll");
 
 updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes");
 
 doPslMapAli(conn, taxon, db, fromTaxon, fromDb);
 
 rollupPsl("vgPrbPslMap.psl", "vgAllProbes", conn, db);
 
 updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes");
 
 markNoneVgPrbAli(conn, fromTaxon, db, "vgPrbAliAll");
 
 }
 
 static void doReMapAli(struct sqlConnection *conn, 
     int taxon, char *db, 
     int fromTaxon, char *fromDb,
     char *track, char *fasta
     )
 /* re-map anything in track specified that is not aligned, 
       nor even attempted yet, using specified fasta file. */
 {
 char cmd[256];
 
 int rc = 0;
 struct dyString *dy = dyStringNew(0);
 char dbTrk[256];
 
 safef(dbTrk,sizeof(dbTrk),"%s.%s",db,track);
 if (!sqlTableExists(conn, dbTrk))
     errAbort("Track %s does not exist\n",dbTrk);
 
 if (!fileExists(fasta))
     errAbort("Unable to locate fasta file %s",fasta);
 
 if (sqlTableExists(conn, "vgRemapTemp"))
     {
     sqlUpdate(conn, "drop table vgRemapTemp;");
     }
 
 safef(cmd,sizeof(cmd),
 "hgPepPred %s generic vgRemapTemp %s "
 ,database,fasta);
 verbose(1,"%s\n",cmd); system(cmd);
 
-sqlUpdate(conn, "create index seq on vgRemapTemp(seq(40));");
 /* required for mysql 5 longtext for case-insensitive comparisons of blobs */
 sqlUpdate(conn, "ALTER table vgRemapTemp modify seq longtext;");
+sqlUpdate(conn, "create index seq on vgRemapTemp(seq(40));");
 
 /* get remapped psl probes not yet aligned */
 dyStringClear(dy);
 dyStringPrintf(dy, 
     "select m.matches,m.misMatches,m.repMatches,m.nCount,m.qNumInsert,m.qBaseInsert,"
     "m.tNumInsert,m.tBaseInsert,m.strand,"
     "concat('vgPrb_',e.id),m.qSize,m.qStart,m.qEnd,m.tName,m.tSize,m.tStart,m.tEnd,m.blockCount,"
     "m.blockSizes,m.qStarts,m.tStarts"    
     " from vgPrb e, vgPrbAliAll a, %s.%s m, vgRemapTemp n"
     " where e.id = a.vgPrb and a.db = '%s' and a.status='new'"
     " and m.qName = n.name and n.seq = e.seq"
     " and e.taxon = %d and e.state='seq' and e.seq <> ''"
     " order by m.tName,m.tStart"
     ,db,track,db,fromTaxon);
 rc = 0;
 
 rc = sqlSaveQuery(conn, dy->string, "vgPrbReMap.psl", FALSE);
 verbose(1,"Count of Psls found for reMap: %d\n", rc);
 
 sqlUpdate(conn, "drop table vgRemapTemp;");
 
 dyStringFree(&dy);
 
 }
 
 
 static void doAlignmentsReMap(
     struct sqlConnection *conn, 
     char *db, char *fromDb, char *track, char *fasta)
 /* re-map anything in track specified that is not aligned, 
       nor even attempted yet, using specified fasta file. */
 {
 int taxon = findTaxon(conn,db);
 int fromTaxon = findTaxon(conn,fromDb);
 
 populateMissingVgPrbAli(conn, fromTaxon, db, "vgPrbAliAll");
 
 updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes");
 
 doReMapAli(conn, taxon, db, fromTaxon, fromDb, track, fasta);
 
 rollupPsl("vgPrbReMap.psl", "vgAllProbes", conn, db);
 
 updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes");
 
 markNoneVgPrbAli(conn, fromTaxon, db, "vgPrbAliAll");
 
 }
 
 static void doSelfMapAli(struct sqlConnection *conn, 
     int taxon, char *db)
 {
 char cmd[256];
 
 /* get non-bac $db.vgProbes not yet aligned */
 getPslMapAli(conn, db, taxon, db, FALSE);
 /* get bac $db.vgProbes not yet aligned */
 getPslMapAli(conn, db, taxon, db, TRUE);
 
 safef(cmd,sizeof(cmd),"cat bac.psl nonBac.psl > vgPrbSelfMap.psl");
 verbose(1,"%s\n",cmd); system(cmd);
 
 }
 
 static void doAlignmentsSelfMap(
     struct sqlConnection *conn, char *db)
 /* copy anything in vgProbes but not in vgAllProbes to vgAllProbes */
 {
 int taxon = findTaxon(conn,db);
 
 populateMissingVgPrbAli(conn, taxon, db, "vgPrbAliAll");
 
 updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes");
 
 doSelfMapAli(conn, taxon, db);
 
 rollupPsl("vgPrbSelfMap.psl", "vgAllProbes", conn, db);
 
 updateVgPrbAli(conn, db, "vgPrbAliAll","vgAllProbes");
 
 markNoneVgPrbAli(conn, taxon, db, "vgPrbAliAll");
 
 }
 
 
 static void doSeqAndExtFile(struct sqlConnection *conn, char *db, char *table)
 {
 int rc = 0;
 char cmd[256];
 char path[256];
 char bedPath[256];
 char gbdbPath[256];
 char *fname=NULL;
 struct dyString *dy = dyStringNew(0);
 dyStringClear(dy);
 dyStringPrintf(dy, 
 "select distinct concat('vgPrb_',e.id), e.seq"
 " from vgPrb e join %s.%s v"
 " left join %s.seq s on s.acc = v.qName"
 " where concat('vgPrb_',e.id) = v.qName"
 " and s.acc is NULL"
 " order by e.id"
     , db, table, db);
 rc = sqlSaveQuery(conn, dy->string, "vgPrbExt.fa", TRUE);
 verbose(1,"rc = %d = count of sequences for vgPrbExt.fa, to use with %s track %s\n",rc,db,table);
 if (rc > 0)  /* can set any desired minimum */
     {
     safef(bedPath,sizeof(bedPath),"/cluster/data/%s/bed/visiGene/",db);
     if (!fileExists(bedPath))
 	{
 	safef(cmd,sizeof(cmd),"mkdir %s",bedPath);
 	verbose(1,"%s\n",cmd); system(cmd);
 	}
     
     safef(gbdbPath,sizeof(gbdbPath),"/gbdb/%s/visiGene/",db);
     if (!fileExists(gbdbPath))
 	{
 	safef(cmd,sizeof(cmd),"mkdir %s",gbdbPath);
     	verbose(1,"%s\n",cmd); system(cmd);
 	}
    
     while(1)
 	{
 	int i=0;
 	safef(path,sizeof(path),"%svgPrbExt_AAAAAA.fa",bedPath);
         char *c = rStringIn("AAAAAA",path);
         srand( (unsigned)time( NULL ) );
         for(i=0;i<6;++i)
             {
             *c++ += (int) 26 * (rand() / (RAND_MAX + 1.0));
             }
 	if (!fileExists(path))
 	    break;
 	}
 
     
     safef(cmd,sizeof(cmd),"cp vgPrbExt.fa %s",path);
     verbose(1,"%s\n",cmd); system(cmd);
     
     fname = rStringIn("/", path);
     ++fname;
     
     safef(cmd,sizeof(cmd),"ln -s %s %s%s",path,gbdbPath,fname);
     verbose(1,"%s\n",cmd); system(cmd);
     
     safef(cmd,sizeof(cmd),"hgLoadSeq %s %s%s", db, gbdbPath,fname);
     verbose(1,"%s\n",cmd); system(cmd);
     }
 
 dyStringFree(&dy);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 struct sqlConnection *conn = NULL;
 char *command = NULL;
 optionInit(&argc, argv, options);
 database = optionVal("database", database);
 sqlPath = optionVal("sqlPath", sqlPath);
 if (argc < 2)
     usage();
 command = argv[1];
 if (argc >= 3)
     setCurrentDir(argv[2]);
 conn = sqlConnect(database);
 if (sameWord(command,"INIT"))
     {
     if (argc != 2)
 	usage();
     errAbort("INIT is probably too dangerous. DO NOT USE.");
     /*	    
     init(conn);	    
     */
     }
 else if (sameWord(command,"POP"))
     {
     if (argc != 2)
 	usage();
     /* populate vgPrb where missing */
     populateMissingVgPrb(conn);
     }
 else if (sameWord(command,"SEQ"))
     {
     if (argc != 4)
 	usage();
     /* make fake probe sequences */
     makeFakeProbeSeq(conn,argv[3]);
     }
 else if (sameWord(command,"ALI"))
     {
     if (argc != 4)
 	usage();
     /* blat anything left that is not aligned, 
       nor even attempted */
     doAlignments(conn,argv[3]);
     }
 else if (sameWord(command,"EXT"))
     {
     if (argc != 4)
 	usage();
     /* update seq and extfile as necessary */
     doSeqAndExtFile(conn,argv[3],"vgProbes");
     }
 else if (sameWord(command,"PSLMAP"))
     {
     if (argc != 5)
 	usage();
     /* pslMap anything left that is not aligned, 
       nor even attempted */
     doAlignmentsPslMap(conn,argv[3],argv[4]);
     }
 else if (sameWord(command,"REMAP"))
     {
     if (argc != 7)
 	usage();
     /* re-map anything in track specified that is not aligned, 
       nor even attempted yet, using specified fasta file. */
     doAlignmentsReMap(conn,argv[3],argv[4],argv[5],argv[6]);
     }
 else if (sameWord(command,"SELFMAP"))
     {
     if (argc != 4)
 	usage();
     /* re-map anything in track specified that is not aligned, 
       nor even attempted yet, using specified fasta file. */
     doAlignmentsSelfMap(conn,argv[3]);
     }
 else if (sameWord(command,"EXTALL"))
     {
     if (argc != 4)
 	usage();
     /* update seq and extfile as necessary */
     doSeqAndExtFile(conn,argv[3],"vgAllProbes");
     }
 else
     usage();
 sqlDisconnect(&conn);
 return 0;
 }