7ae4a70c26c3414a34b262329b2f7febe831923c
hiram
  Thu Jul 18 10:26:34 2013 -0700
checking in Robert's code refs #9741
diff --git src/hg/lib/rmskOut2.c src/hg/lib/rmskOut2.c
new file mode 100644
index 0000000..fea3fc9
--- /dev/null
+++ src/hg/lib/rmskOut2.c
@@ -0,0 +1,442 @@
+/* rmskOut2.c was originally generated by the autoSql program, which also 
+ * generated rmskOut2.h and rmskOut2.sql.  This module links the database and
+ * the RAM representation of objects. */
+
+#include "common.h"
+#include "linefile.h"
+#include "dystring.h"
+#include "jksql.h"
+#include "rmskOut.h"
+#include "rmskOut2.h"
+#include "obscure.h"
+#include "binRange.h"
+
+
+
+void rmskOut2StaticLoad(char **row, struct rmskOut2 *ret)
+/* Fill ret from a row of the rmskOut2 table without allocating.  The string
+ * fields of ret point into row, so ret is only good while row is. */
+{
+char **col = row;    /* steps through the 16 columns in table order */
+ret->swScore = sqlUnsigned(*col++);
+ret->milliDiv = sqlUnsigned(*col++);
+ret->milliDel = sqlUnsigned(*col++);
+ret->milliIns = sqlUnsigned(*col++);
+ret->genoName = *col++;
+ret->genoStart = sqlUnsigned(*col++);
+ret->genoEnd = sqlUnsigned(*col++);
+ret->genoLeft = sqlSigned(*col++);
+safecpy(ret->strand, sizeof(ret->strand), *col++);
+ret->repName = *col++;
+ret->repClass = *col++;
+ret->repFamily = *col++;
+ret->repStart = sqlSigned(*col++);
+ret->repEnd = sqlUnsigned(*col++);
+ret->repLeft = sqlSigned(*col++);
+ret->id = sqlUnsigned(*col++);
+}
+
+struct rmskOut2 *rmskOut2Load(char **row)
+/* Build a newly allocated rmskOut2 from a row fetched with
+ * select * from rmskOut2.  Release it with rmskOut2Free(). */
+{
+struct rmskOut2 *el;
+char **col = row;    /* steps through the 16 columns in table order */
+AllocVar(el);
+el->swScore = sqlUnsigned(*col++);
+el->milliDiv = sqlUnsigned(*col++);
+el->milliDel = sqlUnsigned(*col++);
+el->milliIns = sqlUnsigned(*col++);
+el->genoName = cloneString(*col++);
+el->genoStart = sqlUnsigned(*col++);
+el->genoEnd = sqlUnsigned(*col++);
+el->genoLeft = sqlSigned(*col++);
+safecpy(el->strand, sizeof(el->strand), *col++);
+el->repName = cloneString(*col++);
+el->repClass = cloneString(*col++);
+el->repFamily = cloneString(*col++);
+el->repStart = sqlSigned(*col++);
+el->repEnd = sqlUnsigned(*col++);
+el->repLeft = sqlSigned(*col++);
+el->id = sqlUnsigned(*col++);
+return el;
+}
+
+struct rmskOut2 *rmskOut2LoadAll(char *fileName)
+/* Read every record of a whitespace-separated file into a list kept in
+ * file order.  Release the list with rmskOut2FreeList(). */
+{
+struct lineFile *in = lineFileOpen(fileName, TRUE);
+struct rmskOut2 *head = NULL;
+char *fields[16];
+
+while (lineFileRow(in, fields))
+    {
+    struct rmskOut2 *rec = rmskOut2Load(fields);
+    slAddHead(&head, rec);
+    }
+slReverse(&head);    /* records were pushed on the front, so flip them */
+lineFileClose(&in);
+return head;
+}
+
+struct rmskOut2 *rmskOut2LoadAllByChar(char *fileName, char chopper)
+/* Read every record of a file whose fields are separated by chopper into
+ * a list kept in file order.  Release the list with rmskOut2FreeList(). */
+{
+struct lineFile *in = lineFileOpen(fileName, TRUE);
+struct rmskOut2 *head = NULL;
+char *fields[16];
+
+while (lineFileNextCharRow(in, chopper, fields, ArraySize(fields)))
+    {
+    struct rmskOut2 *rec = rmskOut2Load(fields);
+    slAddHead(&head, rec);
+    }
+slReverse(&head);    /* records were pushed on the front, so flip them */
+lineFileClose(&in);
+return head;
+}
+
+struct rmskOut2 *rmskOut2CommaIn(char **pS, struct rmskOut2 *ret)
+/* Parse one rmskOut2 from the comma separated text at *pS and leave *pS
+ * where parsing stopped.  Fills in ret when it is non-null; otherwise
+ * allocates and returns a new rmskOut2. */
+{
+char *cursor = *pS;
+
+if (ret == NULL)
+    AllocVar(ret);
+ret->swScore = sqlUnsignedComma(&cursor);
+ret->milliDiv = sqlUnsignedComma(&cursor);
+ret->milliDel = sqlUnsignedComma(&cursor);
+ret->milliIns = sqlUnsignedComma(&cursor);
+ret->genoName = sqlStringComma(&cursor);
+ret->genoStart = sqlUnsignedComma(&cursor);
+ret->genoEnd = sqlUnsignedComma(&cursor);
+ret->genoLeft = sqlSignedComma(&cursor);
+sqlFixedStringComma(&cursor, ret->strand, sizeof(ret->strand));
+ret->repName = sqlStringComma(&cursor);
+ret->repClass = sqlStringComma(&cursor);
+ret->repFamily = sqlStringComma(&cursor);
+ret->repStart = sqlSignedComma(&cursor);
+ret->repEnd = sqlUnsignedComma(&cursor);
+ret->repLeft = sqlSignedComma(&cursor);
+ret->id = sqlUnsignedComma(&cursor);
+*pS = cursor;
+return ret;
+}
+
+void rmskOut2Free(struct rmskOut2 **pEl)
+/* Release one dynamically allocated rmskOut2, such as one made by
+ * rmskOut2Load(), along with its four string fields.  NULL is ignored. */
+{
+struct rmskOut2 *rec = *pEl;
+if (rec == NULL)
+    return;
+freeMem(rec->genoName);
+freeMem(rec->repName);
+freeMem(rec->repClass);
+freeMem(rec->repFamily);
+freez(pEl);
+}
+
+void rmskOut2FreeList(struct rmskOut2 **pList)
+/* Release every element of a list of dynamically allocated rmskOut2's. */
+{
+struct rmskOut2 *cur = *pList;
+while (cur != NULL)
+    {
+    struct rmskOut2 *rest = cur->next;    /* saved before cur is freed */
+    rmskOut2Free(&cur);
+    cur = rest;
+    }
+*pList = NULL;
+}
+
+void rmskOut2Output(struct rmskOut2 *el, FILE *f, char sep, char lastSep)
+/* Write the fields of el to f in table order, with sep after every field
+ * except the last, which is followed by lastSep.  String fields are wrapped
+ * in double quotes when sep is a comma. */
+{
+boolean quoted = (sep == ',');
+
+fprintf(f, "%u%c", el->swScore, sep);
+fprintf(f, "%u%c", el->milliDiv, sep);
+fprintf(f, "%u%c", el->milliDel, sep);
+fprintf(f, "%u%c", el->milliIns, sep);
+
+if (quoted) fputc('"', f);
+fprintf(f, "%s", el->genoName);
+if (quoted) fputc('"', f);
+fputc(sep, f);
+
+fprintf(f, "%u%c", el->genoStart, sep);
+fprintf(f, "%u%c", el->genoEnd, sep);
+fprintf(f, "%d%c", el->genoLeft, sep);
+
+if (quoted) fputc('"', f);
+fprintf(f, "%s", el->strand);
+if (quoted) fputc('"', f);
+fputc(sep, f);
+
+if (quoted) fputc('"', f);
+fprintf(f, "%s", el->repName);
+if (quoted) fputc('"', f);
+fputc(sep, f);
+if (quoted) fputc('"', f);
+fprintf(f, "%s", el->repClass);
+if (quoted) fputc('"', f);
+fputc(sep, f);
+if (quoted) fputc('"', f);
+fprintf(f, "%s", el->repFamily);
+if (quoted) fputc('"', f);
+fputc(sep, f);
+
+fprintf(f, "%d%c", el->repStart, sep);
+fprintf(f, "%u%c", el->repEnd, sep);
+fprintf(f, "%d%c", el->repLeft, sep);
+
+fprintf(f, "%u", el->id);
+fputc(lastSep, f);
+}
+
+/* -------------------------------- End autoSql Generated Code -------------------------------- */
+
+
+void rmskOut2OpenVerify(char *fileName, struct lineFile **retFile, boolean *retEmpty)
+/* Open RepeatMasker .out file into *retFile and skip its three header lines.
+ * Sets *retEmpty to whether the file opens with the "There were no" marker. */
+{
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *line;
+int lineSize;
+
+lineFileNeedNext(lf, &line, &lineSize);
+*retFile = lf;
+*retEmpty = startsWith("There were no", line);
+if (*retEmpty)
+    return;    /* the marker stands in for the header: nothing to verify or skip */
+line = skipLeadingSpaces(line);
+if (! ( startsWith("SW", line) || startsWith("bit", line) ) )
+  errAbort("%s doesn't seem to be a RepeatMasker .out file", fileName);
+lineFileSkip(lf, 2);
+}
+
+static int negParenNum2(struct lineFile *lf, char *s)
+/* Parse the integer at s; a leading '(' marks the value as negated, so
+ * "(12)" gives -12.  Aborts, citing lf's position, if no number follows. */
+{
+boolean hasParen = (*s == '(');
+int result;
+
+if (hasParen)
+   ++s;
+/* cast: handing isdigit() a negative char value is undefined behaviour */
+if (!isdigit((unsigned char)s[0]) && s[0] != '-')
+   errAbort("Expecting digit line %d of %s got %s\n",
+        lf->lineIx, lf->fileName, s);
+result = atoi(s);
+if (hasParen)
+    result = -result;
+return result;
+}
+
+static void parseClassAndFamily(char *s, char **retClass, char **retFamily)
+/* Split a repeatMasker class/family string in place at its first '/'.
+ * Without a '/', class and family both point at the whole string. */
+{
+char *slash = strchr(s, '/');
+*retClass = s;
+*retFamily = s;
+if (slash != NULL)
+    {
+    *slash = 0;    /* terminate the class part where the '/' was */
+    *retFamily = slash + 1;
+    }
+}
+
+struct rmskOut2 *rmskOut2ReadNext(struct lineFile *lf)
+/* Read next record from repeat masker file.  Return NULL at EOF.
+ * Aborts on a line that does not have exactly 15 words or has a bad strand.
+ * Starts are stored zero-based and the parenthesized "left" columns as
+ * non-negative counts, which is what rmskOut2WriteOneOut() writes back. */
+{
+char *words[32];
+int wordCount;
+struct rmskOut2 *ret;
+char *class, *family;
+char *beginWord, *leftWord;
+boolean isComplement;
+
+if ((wordCount = lineFileChop(lf, words)) == 0)
+    return NULL;
+if (wordCount != 15 )
+    errAbort("Expecting 15 words - line %d of %s", lf->lineIx, lf->fileName);
+
+AllocVar(ret);
+ret->swScore = lineFileNeedNum(lf, words, 0);
+ret->milliDiv = round(10.0*atof(words[1]));
+ret->milliDel = round(10.0*atof(words[2]));
+ret->milliIns = round(10.0*atof(words[3]));
+ret->genoName = cloneString(words[4]);
+ret->genoStart = lineFileNeedNum(lf, words, 5)-1;
+ret->genoEnd = lineFileNeedNum(lf, words, 6);
+ret->genoLeft = -negParenNum2(lf, words[7]);
+isComplement = sameString(words[8], "C");
+if (isComplement)
+    ret->strand[0] = '-';
+else if (sameString(words[8], "+"))
+    ret->strand[0] = '+';
+else
+    errAbort("Unexpected strand char line %d of %s", lf->lineIx, lf->fileName);
+ret->repName = cloneString(words[9]);
+parseClassAndFamily(words[10], &class, &family);
+ret->repClass = cloneString(class);
+ret->repFamily = cloneString(family);
+/* Repeat columns are "begin end (left)" on + but "(left) end begin" on C. */
+beginWord = words[isComplement ? 13 : 11];
+leftWord = words[isComplement ? 11 : 13];
+ret->repStart = negParenNum2(lf, beginWord)-1;
+ret->repEnd = sqlSigned(words[12]);
+ret->repLeft = -negParenNum2(lf, leftWord);
+ret->id = lineFileNeedNum(lf, words, 14);    /* last column is the ID */
+return ret;
+}
+
+struct rmskOut2 *rmskOut2Read(char *fileName)
+/* Read an entire .out file; return its records as a list in file order. */
+{
+struct rmskOut2 *head = NULL, *rec;
+struct lineFile *in;
+boolean empty;
+
+rmskOut2OpenVerify(fileName, &in, &empty);
+if (empty)
+    {
+    lineFileClose(&in);
+    return NULL;
+    }
+for (rec = rmskOut2ReadNext(in); rec != NULL; rec = rmskOut2ReadNext(in))
+    slAddHead(&head, rec);
+slReverse(&head);
+lineFileClose(&in);
+return head;
+}
+
+void rmskOut2WriteHead(FILE *f)
+/* Emit the two column-title lines and the blank line that open a .out file. */
+{
+fputs(
+"   SW  perc perc perc  query      position in query           matching       repeat              position in  repeat\n"
+"score  div. del. ins.  sequence    begin     end    (left)    repeat         class/family         begin  end (left)   ID\n"
+"\n", f);
+}
+
+static void parenNeg2(int num, char *s, size_t sSize)
+/* Format num into s (capacity sSize); zero and negatives come out as "(n)". */
+{
+if (num > 0)
+    safef(s, sSize, "%d", num);
+else
+    safef(s, sSize, "(%d)", -num);
+}
+
+void rmskOut2WriteOneOut(struct rmskOut2 *rmsk, FILE *f)
+/* Write one rmsk in .out format to file.  On the + strand the repeat columns
+ * are begin, end, (left); on any other strand they are (left), end, begin
+ * and the strand is shown as C. */
+{
+char genoLeft[24], repStart[24], repLeft[24];
+char classFam[128];
+boolean isPlus = (rmsk->strand[0] == '+');
+
+parenNeg2(-rmsk->genoLeft, genoLeft, sizeof(genoLeft));
+parenNeg2(rmsk->repStart+1, repStart, sizeof(repStart));
+parenNeg2(-rmsk->repLeft, repLeft, sizeof(repLeft));
+if (sameString(rmsk->repClass, rmsk->repFamily))
+    safef(classFam, sizeof(classFam), "%s", rmsk->repClass);
+else
+    safef(classFam, sizeof(classFam), "%s/%s",
+          rmsk->repClass, rmsk->repFamily);
+
+/* %u for the fields that rmskOut2Output() also prints with %u; a signed
+ * conversion would misprint values above INT_MAX. */
+fprintf(f,
+    "%5u %5.1f %4.1f %4.1f  %-9s %7u %7u %9s %1s  %-14s %-19s %6s %4u %6s %6u\n",
+    rmsk->swScore, 0.1*rmsk->milliDiv, 0.1*rmsk->milliDel, 0.1*rmsk->milliIns,
+    rmsk->genoName, rmsk->genoStart+1, rmsk->genoEnd, genoLeft,
+    isPlus ? "+" : "C", rmsk->repName, classFam,
+    isPlus ? repStart : repLeft, rmsk->repEnd,
+    isPlus ? repLeft : repStart, rmsk->id);
+}
+
+void rmskOut2WriteAllOut(char *fileName, struct rmskOut2 *rmskList)
+/* Write .out format file containing all in rmskList. */
+{
+FILE *f = mustOpen(fileName, "w");
+struct rmskOut2 *rmsk;
+
+rmskOut2WriteHead(f);
+for (rmsk = rmskList; rmsk != NULL; rmsk = rmsk->next)
+    rmskOut2WriteOneOut(rmsk, f);
+carefulClose(&f);    /* library close that checks fclose(), so a failed flush is not lost */
+}
+
+struct binKeeper *readRepeats2(char *chrom, char *rmskFileName, struct hash *tSizeHash)
+/* Read all repeats from rmskFileName into a binKeeper covering 0 to the size
+ * that tSizeHash gives for chrom, for fast range queries. */
+{
+    boolean rmskRet;
+    struct lineFile *rmskF = NULL;
+    struct rmskOut2 *rmsk;
+    struct binKeeper *bk;
+    int size = hashIntVal(tSizeHash, chrom);
+
+    assert(size > 1);    /* checked before the size is used to build the binKeeper */
+    bk = binKeeperNew(0, size);
+    rmskOut2OpenVerify(rmskFileName ,&rmskF , &rmskRet);
+    while ((rmsk = rmskOut2ReadNext(rmskF)) != NULL)
+        {
+        binKeeperAdd(bk, rmsk->genoStart, rmsk->genoEnd, rmsk);
+        }
+    lineFileClose(&rmskF);
+    return bk;
+}
+
+struct hash *readRepeatsAll2(char *sizeFileName, char *rmskDir)
+/* Read repeats for every chromosome listed (name, size) in sizeFileName from
+ * rmskDir/<name>.fa.out.  Returns a hash of binKeepers keyed by name. */
+{
+boolean rmskRet;
+struct binKeeper *bk;
+struct lineFile *rmskF = NULL;
+struct rmskOut2 *rmsk;
+struct lineFile *lf = lineFileOpen(sizeFileName, TRUE);
+struct hash *hash = newHash(0);
+char *row[2];
+char rmskFileName[256];
+
+while (lineFileRow(lf, row))
+    {
+    char *name = row[0];
+    int size = lineFileNeedNum(lf, row, 1);
+    if (hashLookup(hash, name) != NULL)
+        warn("Duplicate %s, ignoring all but first\n", name);
+    else
+        {
+        assert(size > 1);    /* checked before the size is used to build the binKeeper */
+        bk = binKeeperNew(0, size);
+        safef(rmskFileName, sizeof(rmskFileName), "%s/%s.fa.out",rmskDir,name);
+        rmskOut2OpenVerify(rmskFileName ,&rmskF , &rmskRet);
+        while ((rmsk = rmskOut2ReadNext(rmskF)) != NULL)
+            {
+            binKeeperAdd(bk, rmsk->genoStart, rmsk->genoEnd, rmsk);
+            }
+        lineFileClose(&rmskF);
+        hashAdd(hash, name, bk);
+        }
+    }
+lineFileClose(&lf);
+return hash;
+}