9367c4abbf200b833b19c5fe77a198c41a48c809 galt Fri Feb 24 18:29:08 2017 -0800 Adding new function strictTagNestCheck to htmlCheck. CSP testing turned up CGI webpage output that was not even being parsed correctly or the same by various browsers because of reversed end-tags. htmlCheck should be able to find these problems too. Also added the pipe and comma chars to the chars allowed in urls by htmlPage.c diff --git src/utils/htmlCheck/htmlCheck.c src/utils/htmlCheck/htmlCheck.c index f098479..7c08e9d 100644 --- src/utils/htmlCheck/htmlCheck.c +++ src/utils/htmlCheck/htmlCheck.c @@ -1,390 +1,396 @@ /* htmlCheck - Do a little reading and verification of html file. */ /* Copyright (C) 2013 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "errAbort.h" #include "memalloc.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dystring.h" #include "obscure.h" #include "filePath.h" #include "net.h" #include "htmlPage.h" void usage() /* Explain usage and exit. The usage text lists every value accepted for the how argument plus the cookies option, and is handed to errAbort as a single message. */ { errAbort( "htmlCheck - Do a little reading and verification of html file\n" "usage:\n" " htmlCheck how url\n" "where how is:\n" " ok - just check for 200 return. 
Print error message and exit -1 if no 200\n" " getAll - read the url (header and html) and print to stdout\n" " getHeader - read the header and print to stdout\n" " getCookies - print list of cookies\n" " getHtml - print the html, but not the header to stdout\n" " getForms - print the form structure to stdout\n" " getVars - print the form variables to stdout\n" " getLinks - print links\n" " getTags - print out just the tags\n" " checkLinks - check links in page\n" " checkLinks2 - check links in page and all subpages in same host\n" " (Just one level of recursion)\n" " checkLocalLinks - check local links in page\n" " checkLocalLinks2 - check local links in page and connected local pages\n" " (Just one level of recursion)\n" " submit - submit first form in page if any using 'GET' method\n" " validate - do some basic validations including TABLE/TR/TD nesting\n" + " strictTagNestCheck - check tags are correctly nested\n" "options:\n" " cookies=cookie.txt - Cookies is a two column file\n" " containing <cookieName><space><value><newLine>\n" "note: url will need to be in quotes if it contains an ampersand or question mark." ); } static struct optionSpec options[] = { {"cookies", OPTION_STRING}, {NULL, 0}, }; void checkOk(char *fullText) /* Parse out first line and check it's ok. The status is obtained with htmlStatusParse; noWarnAbort is called when no status comes back and errAbort reports the code when it is anything other than 200. */ { struct htmlStatus *status = htmlStatusParse(&fullText); if (status == NULL) noWarnAbort(); if (status->status != 200) errAbort("Status code %d", status->status); } void getHeader(char *html) /* Parse out and print header. Lines are read with htmlNextCrLfLine and written to stdout followed by CR LF until an empty line or the end of the text is reached. The line == NULL test inside the loop repeats the loop condition and can never be true there. */ { char *line; while ((line = htmlNextCrLfLine(&html)) != NULL) { if (line == NULL || line[0] == 0) break; printf("%s\r\n", line); } } void getLinks(struct htmlPage *page) /* Print out all links: each name in the list returned by htmlPageLinks goes to stdout on its own line. NOTE(review): the list is not freed here, whereas checkRecursiveLinks frees the same kind of list with slFreeList - confirm whether that is intentional. */ { struct slName *link, *linkList = htmlPageLinks(page); for (link = linkList; link != NULL; link = link->next) { printf("%s\n", link->name); } } void htmlPrintForms(struct htmlPage *page, FILE *f) /* Print out all forms: every entry of page->forms is written to f with htmlFormPrint. 
*/ { struct htmlForm *form; for (form = page->forms; form != NULL; form = form->next) htmlFormPrint(form, f); } void getVars(struct htmlPage *page) /* Print out all form variables: for every form in page->forms, each entry of its vars list is written to stdout with htmlFormVarPrint using an empty prefix string. */ { struct htmlForm *form; struct htmlFormVar *var; for (form = page->forms; form != NULL; form = form->next) { for (var = form->vars; var != NULL; var = var->next) htmlFormVarPrint(var, stdout, ""); } } void getTags(struct htmlPage *page) /* Print out all tags, one line per tag on stdout: the tag name followed by a space and name=value for each attribute. A value that hasWhiteSpace reports as containing white space is wrapped in double quotes; other values are printed bare. */ { struct htmlTag *tag; struct htmlAttribute *att; for (tag = page->tags; tag != NULL; tag = tag->next) { printf("%s", tag->name); for (att = tag->attributes; att != NULL; att = att->next) { printf(" %s=", att->name); if (hasWhiteSpace(att->val)) printf("\"%s\"", att->val); else printf("%s", att->val); } printf("\n"); } } void getCookies(struct htmlPage *page) /* Print out all cookies, one tab-separated line per cookie on stdout: name, value, domain, path, expires, then SECURE or UNSECURE according to the secure flag. Domain, path and expires are passed through naForNull before printing. */ { struct htmlCookie *cookie; for (cookie = page->cookies; cookie != NULL; cookie = cookie->next) { printf("%s\t%s\t%s\t%s\t%s\t%s\n", cookie->name, cookie->value, naForNull(cookie->domain), naForNull(cookie->path), naForNull(cookie->expires), (cookie->secure ? "SECURE" : "UNSECURE")); } } void quickSubmit(struct htmlPage *page) /* Just press submit on first form. Aborts through errAbort when the page has no forms; otherwise the page returned by htmlPageFromForm is checked with htmlPageValidateOrAbort. NOTE(review): newPage is not freed in this function - confirm whether that matters for a one-shot command. */ { struct htmlPage *newPage; if (page->forms == NULL) errAbort("No forms on %s", page->url); newPage = htmlPageFromForm(page, page->forms, "submit", "Submit"); htmlPageValidateOrAbort(newPage); } char *skipOverProtocol(char *s) /* Skip over http:// or ftp:// or https:// - more precisely, return a pointer just past the first :// found anywhere in s, or s itself when stringIn finds none. */ { char *p; if ((p = stringIn("://", s)) != NULL) return p+3; else return s; } int hostNameSize(char *s) /* Return size of host name (not including last slash): the number of characters before the first slash in s, or the whole length of s when it contains no slash. sameHost calls this only after the protocol prefix has been skipped. */ { char *e = strchr(s, '/'); if (e == NULL) return strlen(s); else return e - s; } boolean sameHost(char *a, char *b) /* Given URLs a and b, return TRUE if they refer to same host. Both have their protocol prefix skipped, then the text before the first slash must have equal length and compare equal under memcmp, so the comparison is byte for byte. 
*/ { int aSize, bSize; a = skipOverProtocol(a); b = skipOverProtocol(b); aSize = hostNameSize(a); bSize = hostNameSize(b); if (aSize != bSize) return FALSE; return (memcmp(a, b, aSize) == 0); } static struct htmlTag *findNamedAnchor(struct htmlPage *page, char *name) /* Find anchor of given name: return the first tag in page->tags that sameWord matches against A and whose name attribute is present and matches name under sameWord. Returns NULL when no such tag exists. */ { struct htmlTag *tag; for (tag = page->tags; tag != NULL; tag = tag->next) { if (sameWord(tag->name, "A")) { char *anchorName = htmlTagAttributeVal(page, tag, "name", NULL); if (anchorName != NULL && sameWord(anchorName, name)) return tag; } } return NULL; } static jmp_buf recoverJumpBuf; static void recoverAbort() /* semiAbort - abort handler that slurpUrl installs; it jumps back to the setjmp point stored in recoverJumpBuf, delivering the value -1. */ { longjmp(recoverJumpBuf, -1); } char *slurpUrl(char *url) /* Grab url. If there's a problem return NULL. recoverAbort is pushed as the abort handler around the netSlurpUrl call and popped again before returning; when the handler fires, setjmp returns non-zero, the fetch block is skipped and retVal keeps its initial NULL. On success the fetched text is returned as a string taken from the dyString by dyStringCannibalize. */ { int status; char *retVal = NULL; pushAbortHandler(recoverAbort); status = setjmp(recoverJumpBuf); if (status == 0) /* Always true except after long jump. */ { struct dyString *dy = netSlurpUrl(url); retVal = dyStringCannibalize(&dy); } popAbortHandler(); return retVal; } void checkRecursiveLinks(struct hash *uniqHash, struct htmlPage *page, int depth, boolean justLocal) /* Check links recursively up to depth. A link whose first character is the hash sign must match a named anchor on this page, otherwise a warning is issued. Any other link is expanded against page->url; if it is on the same host or justLocal is FALSE, and it is not yet in uniqHash, it is requested with netUrlHeadExt and recorded in uniqHash. A status other than 200, 301 or 302 produces a warning. When depth is greater than 1 and the link is on the same host and its Content-Type header starts with text/html, the page is fetched with slurpUrl, parsed, and checked recursively with depth-1. fullText is set to NULL before recursing so that the following freez leaves it alone - presumably because the parsed page then owns the text; confirm against htmlPageParse. NOTE(review): when parsing succeeds but the status is not 200, newPage is not freed while fullText is. 
*/ { struct slName *linkList = htmlPageLinks(page), *link; for (link = linkList; link != NULL; link = link->next) { if (link->name[0] == '#') { if (findNamedAnchor(page, link->name+1) == NULL) { warn("%s%s doesn't exist", page->url, link->name); } } else { char *url = htmlExpandUrl(page->url, link->name); if (url != NULL) { boolean isLocal = sameHost(page->url, url); if (isLocal || !justLocal) { if (!hashLookup(uniqHash, url)) { struct hash *headerHash = newHash(8); int status = netUrlHeadExt(url, "GET", headerHash); hashAdd(uniqHash, url, NULL); if (status != 200 && status != 302 && status != 301) warn("%d from %s", status, url); else { if (depth > 1 && isLocal) { char *contentType = hashFindValUpperCase(headerHash, "Content-Type:"); if (contentType != NULL && startsWith("text/html", contentType)) { char *fullText = slurpUrl(url); if (fullText != NULL) { struct htmlPage *newPage = htmlPageParse(url, fullText); if (newPage != NULL && newPage->status->status==200) { fullText = NULL; printf("Recursing into %s\n", url); checkRecursiveLinks(uniqHash, newPage, depth-1, justLocal); htmlPageFree(&newPage); } freez(&fullText); } } } } hashFree(&headerHash); } } freez(&url); } } } slFreeList(&linkList); } void checkLinks(struct htmlPage *page, int depth, boolean justLocal) /* Check links on page down to the given depth (callers in this file pass 1 or 2). Creates a hash of already visited URLs seeded with page->url, runs checkRecursiveLinks with it, then frees the hash. */ { struct hash *uniqHash = hashNew(0); hashAdd(uniqHash, page->url, NULL); checkRecursiveLinks(uniqHash, page, depth, justLocal); hashFree(&uniqHash); } struct htmlCookie *readCookies(char *fileName) /* Read cookies from file. For each line returned by lineFileNextReal the first word becomes the cookie name and the remainder, after leading spaces are skipped, becomes the value; both are cloned. Aborts with the line number and file name when skipLeadingSpaces returns NULL. Cookies are pushed on the head of the list and the list is reversed at the end, so the result is in file order. 
*/ { struct lineFile *lf = lineFileOpen(fileName, TRUE); struct htmlCookie *list = NULL, *cookie; char *line, *word; while (lineFileNextReal(lf, &line)) { word = nextWord(&line); line = skipLeadingSpaces(line); if (line == NULL) errAbort("Missing cookie value line %d of %s", lf->lineIx, lf->fileName); AllocVar(cookie); cookie->name = cloneString(word); cookie->value = cloneString(line); slAddHead(&list, cookie); } lineFileClose(&lf); slReverse(&list); return list; } void htmlCheck(char *command, char *url, char *cookieFile) /* Read url. Switch on command and dispatch to appropriate routine. A url that does not contain :// is treated as a local file and read with readInGulp; anything else is fetched with htmlSlurpWithCookies, using the cookies read from cookieFile when one is given. The commands getAll, ok and getHeader operate on the raw text; every other command works on a parsed page, and an unknown command aborts through errAbort. NOTE(review): no explicit free of fullText or cookies appears in this function. */ { char *fullText; struct htmlCookie *cookies = NULL; boolean isLocal = (stringIn("://", url) == NULL); if (cookieFile != NULL) cookies = readCookies(cookieFile); if (isLocal) readInGulp(url, &fullText, NULL); else fullText = htmlSlurpWithCookies(url, cookies); if (sameString(command, "getAll")) mustWrite(stdout, fullText, strlen(fullText)); else if (sameString(command, "ok")) checkOk(fullText); else if (sameString(command, "getHeader")) getHeader(fullText); else /* Do everything that requires full parsing. Local files go through htmlPageParseNoHead, fetched pages through htmlPageParseOk, and the page is released with htmlPageFree once the command has run. 
*/ { struct htmlPage *page = NULL; if (isLocal) page = htmlPageParseNoHead(url, fullText); else page = htmlPageParseOk(url, fullText); if (sameString(command, "getHtml")) fputs(page->htmlText, stdout); else if (sameString(command, "getLinks")) getLinks(page); else if (sameString(command, "getForms")) htmlPrintForms(page, stdout); else if (sameString(command, "getVars")) getVars(page); else if (sameString(command, "getTags")) getTags(page); else if (sameString(command, "getCookies")) getCookies(page); else if (sameString(command, "submit")) quickSubmit(page); else if (sameString(command, "validate")) { htmlPageValidateOrAbort(page); verbose(1, "ok\n"); } + else if (sameString(command, "strictTagNestCheck")) + { + htmlPageStrictTagNestCheck(page); + verbose(1, "ok\n"); + } else if (sameString(command, "checkLinks")) checkLinks(page, 1, FALSE); else if (sameString(command, "checkLinks2")) checkLinks(page, 2, FALSE); else if (sameString(command, "checkLocalLinks")) checkLinks(page, 1, TRUE); else if (sameString(command, "checkLocalLinks2")) checkLinks(page, 2, TRUE); else errAbort("Unrecognized command %s", command); htmlPageFree(&page); } } int main(int argc, char *argv[]) /* Process command line. Options are parsed with optionInit against the options table; usage is called unless exactly two arguments (how and url) remain. The cookies option, or NULL when absent, is passed to htmlCheck as the cookie file, and carefulCheckHeap runs before returning 0. */ { pushCarefulMemHandler(200000000); optionInit(&argc, argv, options); if (argc != 3) usage(); htmlCheck(argv[1], argv[2], optionVal("cookies",NULL)); carefulCheckHeap(); return 0; }