src/lib/htmlPage.c f01299b4b6f140d82fb6ad08576b50823b47ff40

f01299b4b6f140d82fb6ad08576b50823b47ff40
galt
  Thu May 16 22:08:12 2019 -0700
Fixes a bug that occurs when whitespace appears between an attribute name and the = sign. Newlines are now stripped from quoted attribute values: the standard allows them in the input, but they are not actually part of the attribute value (found in a very long url in cite.html). The validate function now also checks for more illegal conditions: a space directly before the tag name (after < or </), and attributes in a closing tag.

diff --git src/lib/htmlPage.c src/lib/htmlPage.c
index 437d1d6..33fb36e 100644
--- src/lib/htmlPage.c
+++ src/lib/htmlPage.c
@@ -579,32 +579,30 @@
 			tag->end += 1;
 		    break;
 		    }
 
 		/* Get name - everything up to equals. */
 		e = s;
 		for (;;)
 		    {
 		    c = *e;
 		    if (c == '=')
 		        break;
 		    else if (c == '>')
 		        break;
 		    else if (c == 0)
 		        break;
-		    else if (isspace(c))
-		        break;
 		    e += 1;
 		    }
 		if (c == 0)
 		    {
 		    warn("End of file in tag");
 		    break;
 		    }
 		name = s;
 		*e++ = 0;
 		eraseTrailingSpaces(name);
 		if (c == '>')
 		    {
 		    val = "";
 		    gotEnd = TRUE;
 		    tag->end = html + (e - dupe);
@@ -632,33 +630,39 @@
 				*e++ = 0;
 				tag->end = html + (e - dupe);
 				break;
 				}
 			    else if (isspace(c))
 				{
 				*e++ = 0;
 				break;
 				}
 			    else if (c == 0)
 				break;
 			    ++e;
 			    }
 			}
 		    }
+		
 		AllocVar(att);
 		att->name = cloneString(name);
 		att->val = cloneString(val);
+		// The html standard allows quoted attribute values to be broken across multiple lines
+		// with newlines, but those newlines are not part of the attribute value itself,
+		// so strip \n and \r chars from the value (att->val).
+		stripChar(att->val, '\n');
+		stripChar(att->val, '\r');
 		attributeDecode(att->val);
 		slAddTail(&tag->attributes, att);
 		s = e;
 		if (gotEnd)
 		    break;
 		}
 	    }
 	}
     }
 slReverse(&tagList);
 return tagList;
 }
 
 static struct htmlFormVar *findOrMakeVar(struct htmlPage *page, char *name, 
 	struct hash *hash, struct htmlTag *tag, struct htmlFormVar **pVarList)
@@ -1697,31 +1701,34 @@
 "BR",
 "COL",
 "COMMAND",
 "EMBED",
 "FRAME",  // not in html5
 "HR",
 "IMG",
 "INPUT",
 "LINK",
 "META",
 "PARAM",
 "SOURCE"
 };
 
 static char *selfClosers[] =
-/* Tags which can be optionally self-closing in html5 or SVG. */
+/* Tags which can be optionally self-closing in html5 or SVG.
+ * Note that a space is required BEFORE the /> to disambiguate it: without the space
+ * we cannot tell whether the trailing slash is part of the SRC URL, e.g. <img src=http://domain.com/image.jpg/>
+ */
 {
 "CIRCLE",   // SVG
 "ELLIPSE",  // SVG
 "LINE",     // SVG
 "PATH",     // SVG
 "POLYGON",  // SVG
 "POLYLINE", // SVG
 "RECT"      // SVG
 };
 
 static struct htmlTag *validateBody(struct htmlPage *page, struct htmlTag *startTag)
 /* Go through tags from current position (just past <BODY>)
  * up to and including </BODY> and check some things. */
 {
 struct htmlTag *tag, *endTag = NULL;
@@ -1867,31 +1874,43 @@
 void htmlPageValidateOrAbort(struct htmlPage *page)
 /* Do some basic validations.  Aborts if there is a problem. */
 {
 struct htmlTag *tag;
 boolean gotTitle = FALSE;
 char *contentType = NULL;
 
 if (page == NULL)
     errAbort("Can't validate NULL page");
 if (page->header != NULL)
     contentType = hashFindVal(page->header, "Content-Type:");
 if (contentType == NULL || startsWith("text/html", contentType))
     {
     /* To simplify things upper case all tag names. */
     for (tag = page->tags; tag != NULL; tag = tag->next)
+	{
 	touppers(tag->name);
+	if (isEmpty(tag->name)) // causes a blank tag
+	    tagAbort(page, tag, "Space not allowed between opening bracket < and tag name");
+	    if (startsWith("/", tag->name))
+		{
+		if (sameString(tag->name,"/")) // causes a blank close tag
+		    tagAbort(page, tag, "Space not allowed between opening bracket </ and closing tag name");
+		if (tag->attributes)
+		    tagAbort(page, tag, "Attributes are not allowed in closing tag: [%s]", tag->name);
+		}
+	}
+
 
     checkExactlyOne(page->tags, "BODY");
 
     /* Validate header, and make a suggestion or two */
     if ((tag = page->tags) == NULL)
 	errAbort("No tags");
     if (!sameWord(tag->name, "HTML"))
 	errAbort("Doesn't start with <HTML> tag");
     struct htmlTag *headTag = nextTagOfTypeInList(tag->next, "HEAD");
     if (headTag == NULL)
         warn("No <HEAD> tag after <HTML> tag");
     else
 	{
 	tag = headTag;
 	for (;;)
@@ -1930,32 +1949,38 @@
 struct hash *singleTonHash = hashNew(8);
 int i;
 int count=ArraySize(singleTons);
 for (i=0; i<count; ++i)
     hashAdd(singleTonHash, singleTons[i], NULL);
 
 /* Add selfCloser tags to hash. */
 struct hash *selfCloserHash = hashNew(8);
 count=ArraySize(selfClosers);
 for (i=0; i<count; ++i)
     hashAdd(selfCloserHash, selfClosers[i], NULL);
 
 struct slName *tagStack = NULL;
 for (tag = page->tags; tag != NULL; tag = tag->next)
     {
+    if (isEmpty(tag->name)) // causes a blank tag
+	tagAbort(page, tag, "Space not allowed between opening bracket < and tag name");
     if (startsWith("/", tag->name))
 	{
+	if (sameString(tag->name,"/")) // causes a blank close tag
+	    tagAbort(page, tag, "Space not allowed between opening bracket </ and closing tag name");
+        if (tag->attributes)
+	    tagAbort(page, tag, "Attributes are not allowed in closing tag: [%s]", tag->name);
 	if (hashLookup(singleTonHash, tag->name+1))
 	    tagAbort(page, tag, "Tag %s closing tag not allowed for singleton tags.", tag->name);
 	if (!sameString("P", tag->name+1))
 	    {
 	    if (!tagStack)
 		tagAbort(page, tag, "No tags still left on stack. Closing tag %s has no corresponding open tag.", tag->name);
 	    struct slName *top = slPopHead(&tagStack);
 	    // flush LI tags still on stack when /UL or /OL encountered
 	    // since the missing /LI tags are usually tolerated. 
 	    while ((sameString(tag->name, "/UL") || sameString(tag->name, "/OL")) && sameString(top->name,"LI"))
 		{
 		tagWarn(page, tag, "Closing tag %s found. LI tag on stack. Missing /LI tag. Please fix. Continuing.", tag->name);
 		top = slPopHead(&tagStack);
 		}
 	    if (!sameString(top->name,tag->name+1))