d8feabb353b3c2650facea4afd08c86bb56e5549
kent
  Fri Apr 6 16:51:54 2012 -0700
Moving autoSql and autoDtd and autoXml back to just under hg.  A little autoSql -django fix.
diff --git src/hg/autoDtd/autoDtd.c src/hg/autoDtd/autoDtd.c
new file mode 100644
index 0000000..42e1084
--- /dev/null
+++ src/hg/autoDtd/autoDtd.c
@@ -0,0 +1,463 @@
+/* autoDtd - Give this an XML document to look at and it will come up with a 
+ * DTD to describe it, and possibly some more readable and informative outputs
+ * as well. */
+/* This file is copyright 2005 Jim Kent, but license is hereby
+ * granted for all use - public, private or commercial. */
+
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "xap.h"
+
+
+void usage(void)
+/* Explain usage and exit.  The whole message is handed to errAbort. */
+{
+errAbort(
+  "autoDtd - Give this a XML document to look at and it will come up with a DTD\n"
+  "to describe it.\n"
+  "usage:\n"
+  "   autoDtd in.xml out.dtd out.stats\n"
+  "options:\n"
+  "   -tree=out.tree - Output tag tree.\n"
+  "   -atree=out.atree - Output attributed tag tree.\n"
+  );
+}
+
+static struct optionSpec options[] = {	/* Command line options, handed to optionInit in main. */
+   {"tree", OPTION_STRING},	/* -tree=out.tree: file name for the plain tag tree. */
+   {"atree", OPTION_STRING},	/* -atree=out.atree: file name for the attributed tag tree. */
+   {NULL, 0},	/* Terminating NULL entry. */
+};
+
+struct type
+/* Information on a type.  There is one per distinct tag name in typeHash. */
+    {
+    struct type *next;
+    char *name;		/* Name of type/field. */
+    int count;	        /* Number of occurrences of this tag, bumped at each end tag. */
+    struct hash *attHash;	/* Hash of all attributes keyed by name */
+    struct attribute *attributes;	/* List of attributes, in order first seen. */
+    struct hash *elHash;	/* Hash of all child elements keyed by type->name */
+    struct element *elements;	/* List of child elements, in order first seen. */
+    struct attribute *textAttribute;	/* Information on text.  NULL until non-blank text is seen. */
+    };
+
+struct attribute
+/* Information on an attribute.  Also used to describe the text inside a tag. */
+    {
+    struct attribute *next;
+    char *name;		/* Attribute name, or "<text>" for a tag's text. */
+    int count;		/* Number of times we've seen this attribute. */
+    boolean isOptional;	/* True if it's not always there. */
+    boolean nonInt;	/* True if some value seen is not an int. */
+    boolean nonFloat;	/* True if some value seen is not a number. */
+    boolean seenThisRound;  /* True if seen in the tag currently being started. */
+    struct hash *values;	/* Hash of unique values. */
+    int maxLen;		/* Maximum length of any value seen. */
+    };
+
+struct element
+/* Information on an element, that is a type as it occurs under one parent type. */
+    {
+    struct element *next;
+    struct type *type;	/* Element type */
+    boolean isOptional;	/* True if some instance of the parent lacks it. */
+    boolean isList;	/* True if it occurs more than once in one instance of the parent. */
+    boolean seenThisRound;  /* True if seen since the parent's start tag. */
+    };
+
+struct hash *typeHash;	/* Values are struct type, keyed by tag name */
+struct type *topType;	/* Highest level type; endHandler sets it to the type of each tag it closes */
+
+boolean hasLeftPaddedZero(char *s)
+/* Return TRUE if s is at least two characters long and starts with '0'. */
+{
+if (s[0] != '0')
+    return FALSE;   /* Covers the empty string too, where s[0] is the terminator. */
+return s[1] != 0;
+}
+
+boolean isAllUInt(char *s)
+/* Return TRUE if every character of s is a decimal digit.  TRUE for the empty string. */
+{
+char c;
+while ((c = *s++) != 0)
+    if (!isdigit((unsigned char)c))   /* Cast: isdigit is undefined for negative char values. */
+        return FALSE;
+return TRUE;
+}
+
+boolean isAllInt(char *s)
+/* Return TRUE if s is an optional '-' followed by digits with no zero padding. */
+{
+char *digits = (*s == '-') ? s+1 : s;
+if (hasLeftPaddedZero(digits))
+    return FALSE;
+return isAllUInt(digits);
+}
+
+boolean isAllFloat(char *s)
+/* Return TRUE if s passes isAllInt, or is isAllInt text, a '.', then only digits. */
+{
+char *dot = strchr(s, '.'), *whole;
+if (dot == NULL)
+    return isAllInt(s);
+if (!isAllUInt(dot+1))
+    return FALSE;
+whole = cloneStringZ(s, dot-s);   /* Private copy of the text ahead of the point. */
+boolean wholeOk = isAllInt(whole);
+freeMem(whole);
+return wholeOk;
+}
+
+
+void *startHandler(struct xap *xap, char *name, char **atts)
+/* Called at the start of a tag after attributes are parsed.  Updates and returns the struct type for name. */
+{
+int i;
+struct type *type = hashFindVal(typeHash, name);
+struct attribute *att;
+struct element *el;
+
+if (type == NULL)	/* First time this tag name is seen: create its type. */
+    {
+    AllocVar(type);
+    hashAddSaveName(typeHash, name, type, &type->name);
+    type->elHash = hashNew(6);
+    type->attHash = hashNew(6);
+    }
+
+/* Zero out seenThisRound flags so they describe only this instance of the tag. */
+for (el = type->elements; el != NULL; el = el->next)
+    el->seenThisRound = FALSE;
+for (att = type->attributes; att != NULL; att = att->next)
+    att->seenThisRound = FALSE;
+/* Fold each attribute into the statistics.  atts holds name, value pairs and ends with NULL. */
+for (i=0; atts[i] != NULL; i += 2)
+    {
+    char *name = atts[i], *val = atts[i+1];	/* This name shadows the tag-name parameter inside the loop. */
+    int valLen = strlen(val);
+    att = hashFindVal(type->attHash, name);
+    if (att == NULL)
+        {
+	AllocVar(att);
+	hashAddSaveName(type->attHash, name, att, &att->name);
+	att->values = hashNew(16);
+	slAddTail(&type->attributes, att);
+	if (type->count != 0)	/* Earlier instances of this tag were closed without this attribute. */
+	    att->isOptional = TRUE;
+	}
+    att->count += 1;
+    hashStore(att->values, val);
+    if (valLen > att->maxLen)
+        att->maxLen = valLen;
+    if (!att->nonInt)	/* Once a non-int value has been seen there is no need to test again. */
+	if (!isAllInt(val) || hasLeftPaddedZero(val))
+	    att->nonInt = TRUE;
+    if (!att->nonFloat)
+	if (!isAllFloat(val))
+	    att->nonFloat = TRUE;
+    att->seenThisRound = TRUE;
+    }
+for (att = type->attributes; att != NULL; att = att->next)
+    {
+    if (!att->seenThisRound)	/* Known attribute that is missing from this instance. */
+        att->isOptional = TRUE;
+    }
+/* Record this type as a child element of the enclosing tag's type, if there is an enclosing tag. */
+if (xap->stackDepth > 1)
+    {
+    struct xapStack *st = xap->stack+1;	/* Stack entry just past the current one, used as the parent. */
+    struct type *parent = st->object;
+    el = hashFindVal(parent->elHash, name);
+    if (el == NULL)
+        {
+	AllocVar(el);
+	hashAdd(parent->elHash, name, el);
+	el->type = type;
+	slAddTail(&parent->elements, el);
+	if (parent->count != 0)	/* Earlier instances of the parent were closed without this child. */
+	    el->isOptional = TRUE;
+	}
+    if (el->seenThisRound)	/* Second occurrence since the flags were last cleared. */
+        el->isList = TRUE;
+    el->seenThisRound = TRUE;
+    }
+return type;	/* endHandler reads the type back from xap->stack->object. */
+}
+
+void endHandler(struct xap *xap, char *name)
+/* Called at end of a tag.  Flags children this instance lacked, records its text. */
+{
+struct type *type = xap->stack->object;
+char *text = skipLeadingSpaces(xap->stack->text->string);
+struct attribute *textAtt = type->textAttribute;
+struct element *child;
+
+/* Any known child that did not show up in this instance is optional. */
+for (child = type->elements; child != NULL; child = child->next)
+    if (!child->seenThisRound)
+        child->isOptional = TRUE;
+
+if (text[0] != 0)
+    {
+    int textLen = strlen(text);
+    if (textAtt == NULL)
+        {
+        /* First text seen for this type, so set up its text attribute. */
+        AllocVar(textAtt);
+        type->textAttribute = textAtt;
+        textAtt->name = "<text>";
+        textAtt->values = hashNew(16);
+        if (type->count != 0)
+            textAtt->isOptional = TRUE;
+        }
+    if (textLen > textAtt->maxLen)
+        textAtt->maxLen = textLen;
+    hashStore(textAtt->values, text);
+    textAtt->count += 1;
+    if (!textAtt->nonInt && (!isAllInt(text) || hasLeftPaddedZero(text)))
+        textAtt->nonInt = TRUE;
+    if (!textAtt->nonFloat && !isAllFloat(text))
+        textAtt->nonFloat = TRUE;
+    }
+else if (textAtt != NULL)
+    textAtt->isOptional = TRUE;
+
+/* Count this instance and remember the type of the tag closed most recently. */
+type->count += 1;
+topType = type;
+}
+
+char *attDataType(struct attribute *att)
+/* Return name of the narrowest data type that fits every value seen for att:
+ * "int", "float" or "string". */
+{
+char *typeName = "string";
+if (!att->nonInt)
+    typeName = "int";
+else if (!att->nonFloat)
+    typeName = "float";
+return typeName;
+}
+
+void rWriteDtd(FILE *dtdFile, FILE *statsFile, struct type *type, 
+	struct hash *uniqHash)
+/* Recursively write out DTD and statistics for type and the types under it.  uniqHash holds names already written. */
+{
+struct element *el;
+struct attribute *att;
+int elCount = slCount(type->elements);
+boolean multiline;
+
+if (type->textAttribute != NULL)
+    elCount += 1;	/* Text counts as one more item in the content list. */
+multiline = (elCount > 3);	/* One item per line once there are more than three. */
+hashAdd(uniqHash, type->name, type);	/* Mark as written before recursing into children. */
+fprintf(dtdFile, "<!ELEMENT %s (", type->name);
+if (multiline)
+    fprintf(dtdFile, "\n");
+for (el = type->elements; el != NULL; el = el->next)
+    {
+    if (multiline)
+       fprintf(dtdFile, "\t");
+    fprintf(dtdFile, "%s", el->type->name);
+    if (el->isList)	/* Suffix: * or + for a list, ? for an optional single, none if required. */
+        {
+	if (el->isOptional)
+	    fprintf(dtdFile, "*");
+	else
+	    fprintf(dtdFile, "+");
+	}
+    else
+        {
+	if (el->isOptional)
+	    fprintf(dtdFile, "?");
+	}
+    if (el->next != NULL || type->textAttribute != NULL)	/* Separator unless this is the last item. */
+        fprintf(dtdFile, ", ");
+    if (multiline)
+	fprintf(dtdFile, "\n");
+    }
+if (type->textAttribute != NULL)	/* The text, if any, always goes last in the content list. */
+    {
+    if (multiline)
+	fprintf(dtdFile, "\t");
+    if (!type->textAttribute->nonInt)
+        fprintf(dtdFile, "%%INTEGER;");	/* Written as %INTEGER;, an entity writeDtd declares. */
+    else if (!type->textAttribute->nonFloat)
+        fprintf(dtdFile, "%%REAL;");	/* Written as %REAL;, an entity writeDtd declares. */
+    else
+        fprintf(dtdFile, "#PCDATA");
+    if (multiline)
+	fprintf(dtdFile, "\n");
+    }
+fprintf(dtdFile, ")>\n");
+fprintf(statsFile, "%s %d\n", type->name, type->count);
+if ((att = type->textAttribute) != NULL)
+    {
+    fprintf(statsFile, "\t%s\t%d\t%s\t%d\t%d\n", att->name, att->maxLen,
+    	attDataType(att), att->count, att->values->elCount);
+    }
+else
+    {
+    fprintf(statsFile, "\t<text>\t0\tnone\t0\t0\n");	/* Fixed row so a tag without text still gets a <text> line. */
+    }
+/* One ATTLIST declaration and one statistics row per attribute. */
+for (att = type->attributes; att != NULL; att = att->next)
+    {
+    fprintf(dtdFile, "<!ATTLIST %s %s ", type->name, att->name);
+    if (!att->nonInt)
+        fprintf(dtdFile, "%%int;");	/* Written as %int;, an entity writeDtd declares. */
+    else if (!att->nonFloat)
+        fprintf(dtdFile, "%%float;");	/* Written as %float;, an entity writeDtd declares. */
+    else
+        fprintf(dtdFile, "CDATA");
+    if (att->isOptional)
+        fprintf(dtdFile, " #IMPLIED");
+    else
+	fprintf(dtdFile, " #REQUIRED");
+    fprintf(dtdFile, ">\n");
+    fprintf(statsFile, "\t%s\t%d\t%s\t%d\t%d\n", att->name, att->maxLen,
+    	attDataType(att), att->count, att->values->elCount);
+    }
+fprintf(dtdFile, "\n");
+fprintf(statsFile, "\n");
+
+/* Now recurse if we haven't written children yet. */
+for (el = type->elements; el != NULL; el = el->next)
+    {
+    if (!hashLookup(uniqHash, el->type->name))
+        {
+	rWriteDtd(dtdFile, statsFile, el->type, uniqHash);
+	}
+    }
+}
+
+void writeDtd(char *dtdFileName, char *statsFileName, char *xmlFileName, struct type *type)
+/* Write DTD to dtdFileName and tag statistics to statsFileName; xmlFileName only labels them. */
+{
+struct hash *uniqHash = newHash(0);  /* Prevent writing dup defs for shared types. */
+FILE *dtdFile = mustOpen(dtdFileName, "w");
+FILE *statsFile = mustOpen(statsFileName, "w");
+fprintf(dtdFile, "<!-- This file was created by autoDtd based on %s -->\n\n", xmlFileName);
+fprintf(dtdFile, "<!-- First some entities to mark numeric types in between tags.  Same as NCBI. -->\n");
+fprintf(dtdFile, "<!ENTITY %% INTEGER \"#PCDATA\">\n");
+fprintf(dtdFile, "<!ENTITY %% REAL \"#PCDATA\">\n\n");
+fprintf(dtdFile, "<!-- Now some entities for numeric attributes. NCBI doesn't define these but we do. -->\n");
+fprintf(dtdFile, "<!ENTITY %% int \"CDATA\">\n");
+fprintf(dtdFile, "<!ENTITY %% float \"CDATA\">\n\n");
+fprintf(dtdFile, "<!-- Now the data structure in %s. -->\n", xmlFileName);
+fprintf(statsFile, "#Statistics on %s\n", xmlFileName);
+fprintf(statsFile, "#Format is:\n");
+fprintf(statsFile, "#<tag name>  <tag count>\n");
+fprintf(statsFile, "#      <<text>> <max length> <type> <count> <unique count>\n");
+fprintf(statsFile, "#      <attribute name> <max length> <type> <count> <unique count>\n");
+fprintf(statsFile, "\n");
+rWriteDtd(dtdFile, statsFile, type, uniqHash);   /* Writes type and every type reachable from it. */
+carefulClose(&dtdFile);
+carefulClose(&statsFile);
+hashFree(&uniqHash);   /* Release the hash itself; the struct type values belong to typeHash. */
+}
+
+void writeAttValType(FILE *f, struct attribute *att)
+/* Write a one-character type marker for att: # for int, % for float, $ for string. */
+{
+if (att->nonInt && att->nonFloat)
+    fprintf(f, "$");
+else if (att->nonInt)
+    fprintf(f, "%%");
+else
+    fprintf(f, "#");
+}
+
+
+void writeAttribute(FILE *f, struct attribute *att)
+/* Write a space, then the type marker and name of att, then a '?' if the
+ * attribute is optional. */
+{
+fprintf(f, " ");
+writeAttValType(f, att);
+fprintf(f, "%s", att->name);
+fprintf(f, "%s", att->isOptional ? "?" : "");
+}
+
+void rWriteTree(FILE *f, struct type *type, boolean isOptional,
+        boolean isList, struct hash *uniqHash,
+        boolean withAttributes, int level)
+/* Write one indented line for type, then recurse into its children.  The line
+ * has the name, a *, + or ? occurrence suffix, and if withAttributes is set the
+ * type markers and attributes as well.  uniqHash is only passed along. */
+{
+static struct type *parentStack[256];   /* Type written at each level on the way down. */
+struct attribute *att;
+struct element *child;
+int depth;
+
+spaceOut(f, level*2);
+if (withAttributes && type->textAttribute != NULL)
+    writeAttValType(f, type->textAttribute);
+fprintf(f, "%s", type->name);
+if (isList)
+    {
+    if (isOptional)
+        fprintf(f, "*");
+    else
+        fprintf(f, "+");
+    }
+else if (isOptional)
+    fprintf(f, "?");
+if (withAttributes)
+    for (att = type->attributes; att != NULL; att = att->next)
+        writeAttribute(f, att);
+fprintf(f, "\n");
+
+if (level >= ArraySize(parentStack))
+    errAbort("Recursion too deep in rWriteTree");
+parentStack[level] = type;
+for (depth = 0; depth < level; ++depth)
+    if (parentStack[depth] == type)
+        return;   /* This type is its own ancestor, so stop rather than cycle. */
+
+for (child = type->elements; child != NULL; child = child->next)
+    rWriteTree(f, child->type, child->isOptional, child->isList,
+        uniqHash, withAttributes, level+1);
+}
+
+void writeTree(char *fileName, struct type *root, boolean withAttributes)
+/* Write out type tree rooted at root to fileName, with attributes if withAttributes is set. */
+{
+struct hash *uniqHash = newHash(0);  /* Handed to rWriteTree, which only passes it along. */
+FILE *f = mustOpen(fileName, "w");
+rWriteTree(f, root, FALSE, FALSE, uniqHash, withAttributes, 0);
+carefulClose(&f);
+hashFree(&uniqHash);   /* Release the hash; this function can run twice per invocation. */
+}
+
+void autoDtd(char *inXml, char *outDtd, char *outStats, char *treeFileName,
+        char *atreeFileName)
+/* Parse inXml gathering per-tag information, then write the DTD and statistics,
+ * plus each tag tree whose file name is not NULL. */
+{
+typeHash = newHash(0);   /* Must exist before parsing: startHandler looks types up in it. */
+struct xap *xap = xapNew(startHandler, endHandler, inXml);
+xapParseFile(xap, inXml);
+writeDtd(outDtd, outStats, inXml, topType);
+if (treeFileName != NULL)
+    writeTree(treeFileName, topType, FALSE);
+if (atreeFileName != NULL)
+    writeTree(atreeFileName, topType, TRUE);
+}
+
+int main(int argc, char *argv[])
+/* Process command line: exactly three file arguments plus optional -tree and -atree. */
+{
+char *treeOut, *atreeOut;
+optionInit(&argc, argv, options);
+if (argc != 4)
+    usage();
+treeOut = optionVal("tree", NULL);
+atreeOut = optionVal("atree", NULL);
+autoDtd(argv[1], argv[2], argv[3], treeOut, atreeOut);
+return 0;
+}