from sgmllib import SGMLParser
from DocumentTemplate.DT_Util import html_quote
from string import join, upper

# Per-tag policies stored in HTMLScanner.policy.
#   ALLOW - pass the tag through and track it for balancing
#   SKIP  - silently drop the tag
#   QUOTE - emit the tag html-quoted, so it shows up as text
#   FAIL  - abort the scan with HTMLScanFailed
ALLOW, SKIP, QUOTE, FAIL = 1, 2, 3, 4


class HTMLScanFailed(Exception):
    '''The document contains a tag or attribute that is not permitted.

    The single argument is the offending tag name, or 'SCRIPT' when a
    scripting attribute was found.'''


class HTMLScanUnbalanced(Exception):
    '''An allowed, nonempty tag was not properly balanced.

    The single argument is the name of the unbalanced tag.'''


class HTMLScanner(SGMLParser):
    '''Scan an HTML document to enforce limits.

    Only tags in the "allow" list are passed through, while "skip" tags
    are silently removed, "quote" tags are html-quoted, and any other tags
    cause an HTMLScanFailed exception.  All allowed tags must balance,
    unless their entry in the "allow" list ends in '/'.  A true argument
    to "close()" will cause it to append closing tags for all unclosed
    nonempty tags.

    Event-handler attributes (names starting with "on") and attribute
    values starting with "javascript:" follow the policy of the SCRIPT
    tag, which defaults to FAIL when SCRIPT is not listed anywhere.

    Typical use: feed() the document, then call close() to obtain the
    filtered text.'''

    def __init__(self, allow=(), skip=(), quote=()):
        '''Build the policy table from the three tag-name sequences.

        allow -- tags passed through; a trailing '/' marks a tag that
                 does not need a closing tag
        skip  -- tags removed from the output
        quote -- tags emitted html-quoted
        Tag names are matched case-insensitively.'''
        # policy maps upper-cased tag name -> ALLOW / SKIP / QUOTE.
        self.policy = policy = {}
        empties = {}
        # allowEmpty(TAG) is true for tags that may stay unclosed.
        self.allowEmpty = empties.has_key
        for tag in allow:
            tag = upper(tag)
            if tag[-1:] == '/':
                tag = tag[:-1]
                empties[tag] = None
            policy[tag] = ALLOW
        for tag in skip:
            policy[upper(tag)] = SKIP
        for tag in quote:
            policy[upper(tag)] = QUOTE
        # The SCRIPT policy also governs scripting attributes, so it must
        # always be present.
        policy['SCRIPT'] = policy.get('SCRIPT', FAIL)
        # Upper-cased names of the allowed tags currently open.
        self.tagStack = []
        # Output fragments, joined by close().
        self.txt = []
        SGMLParser.__init__(self)

    def handle_data(self, data):
        '''Copy character data to the output unchanged.'''
        self.txt.append(data)

    def handle_comment(self, data):
        '''Copy a comment to the output, keeping it a comment.'''
        # The parser hands over only the text between the delimiters.
        # Emitting that text bare would turn commented-out markup into
        # live markup, so the delimiters are restored.
        self.txt.append('<!--' + data + '-->')

    def unknown_starttag(self, tag, attrs):
        '''Apply the tag policy to a start tag and its attributes.

        Raises HTMLScanFailed for a disallowed tag, or for a scripting
        attribute when the SCRIPT policy is FAIL.'''
        policy = self.policy.get(upper(tag), FAIL)
        if policy == SKIP:
            return
        scriptPolicy = self.policy['SCRIPT']
        tagtxt = ['<' + tag]
        for name, value in attrs:
            # Whitespace is removed before the scheme test so that values
            # such as ' javascript:' or 'java\tscript:' do not slip by.
            # NOTE(review): other control characters are not stripped.
            compact = upper(join(value.split(), ''))
            if scriptPolicy != ALLOW and (
                    upper(name)[:2] == 'ON' or
                    compact[:11] == 'JAVASCRIPT:'):
                if scriptPolicy == FAIL:
                    raise HTMLScanFailed('SCRIPT')
                # SKIP or QUOTE: drop just this attribute.
                continue
            # The value is re-emitted inside double quotes, so an embedded
            # double quote must be escaped or it would end the attribute
            # early and let the rest of the value inject new attributes.
            tagtxt.append(name + '=' + '"' +
                          value.replace('"', '&quot;') + '"')
        tagtxt = join(tagtxt) + '>'
        if policy == QUOTE:
            self.txt.append(html_quote(tagtxt))
        elif policy == ALLOW:
            self.txt.append(tagtxt)
            self.tagStack.append(upper(tag))
        else:
            raise HTMLScanFailed(tag)

    def unknown_endtag(self, tag):
        '''Apply the tag policy to an end tag and check tag balance.

        Raises HTMLScanFailed for a disallowed tag, and (through
        report_unbalanced) HTMLScanUnbalanced when the end tag does not
        match the open tags.'''
        policy = self.policy.get(upper(tag), FAIL)
        if policy == SKIP:
            return
        tagtxt = '</' + tag + '>'
        if policy == QUOTE:
            self.txt.append(html_quote(tagtxt))
        elif policy == ALLOW:
            self.txt.append(tagtxt)
            tagStack = self.tagStack
            # Unwind to the matching start tag.  Tags that may stay empty
            # are discarded on the way; any other tag is unbalanced.
            while tagStack:
                lastTag = tagStack.pop()
                if lastTag == upper(tag):
                    return
                if not self.allowEmpty(lastTag):
                    self.report_unbalanced(lastTag)
            # Nothing on the stack matched this end tag.
            self.report_unbalanced(tag)
        else:
            raise HTMLScanFailed(tag)

    def report_unbalanced(self, tag):
        '''Signal an unbalanced tag by raising HTMLScanUnbalanced.'''
        raise HTMLScanUnbalanced(tag)

    def close(self, autoclose=None):
        '''Finish parsing and return the filtered document as a string.

        With a true "autoclose", closing tags are appended for every
        unclosed nonempty tag, innermost first; otherwise the first such
        tag raises HTMLScanUnbalanced.'''
        SGMLParser.close(self)
        tagStack = self.tagStack
        while tagStack:
            lastTag = tagStack.pop()
            if not self.allowEmpty(lastTag):
                if autoclose:
                    # The stack holds upper-cased names; emit lower case.
                    self.txt.append('</' + lastTag.lower() + '>')
                else:
                    self.report_unbalanced(lastTag)
        return join(self.txt, '')


# Tag groups used to build the stock scanners below.  A trailing '/'
# marks a tag that does not need a closing tag.
docs = ('html', 'head', 'title', 'body', 'style', 'link/', 'base/', 'meta/')
frames = ('frameset', 'frame', 'noframes')
tables = ('table', 'tr', 'th', 'td')
forms = ('form', 'select', 'textarea', 'input/', 'option/')
headers = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
markup = ('a', 'i', 'b', 'em', 'strong', 'code', 'font', 'basefont/')
basic = ('pre', 'img/', 'p/', 'br/', 'hr/')
active = ('script', 'embed', 'object')
structure = ('div', 'span', 'layer', 'ilayer')


def AllHTML():
    '''Return a scanner allowing every tag group, including "active".'''
    return HTMLScanner(allow=frames + docs + tables + forms + headers +
                       markup + basic + active + structure)


def InertHTML():
    '''Return a scanner for whole documents without "active" tags.'''
    return HTMLScanner(allow=frames + docs + tables + forms + headers +
                       markup + basic)


def FragmentHTML():
    '''Return a scanner for fragments: tables, forms, headers, markup
    and basic tags.'''
    return HTMLScanner(allow=tables + forms + headers + markup + basic)


def SimpleHTML():
    '''Return a scanner allowing only the inline "markup" tags.'''
    return HTMLScanner(allow=markup)


if __name__ == '__main__':
    # Demo: filter the Zope home page and print the result.
    scanner = AllHTML()
    from urllib import urlopen
    zopemain = urlopen('http://www.zope.org').read()
    scanner.feed(zopemain)
    print(scanner.close(autoclose=1))