from sgmllib import SGMLParser
from DocumentTemplate.DT_Util import html_quote
from string import join, upper

# Per-tag policy codes stored in HTMLScanner.policy:
ALLOW = 1   # pass the tag through and track it for balancing
SKIP = 2    # silently drop the tag
QUOTE = 3   # emit the tag html-quoted
FAIL = 4    # reject the document (default for unlisted tags)

# Exception objects raised by the scanner (raised as "raise NAME, tag").
HTMLScanFailed = "HTML Scan Failed"
HTMLScanUnbalanced = "HTML Scan found Unbalanced tag"

class HTMLScanner(SGMLParser):
    '''Scan an HTML document to enforce limits.

    Only tags in the "allow" list are passed through, while
    "skip" tags are silently removed, "quote" tags are html-quoted,
    and any other tags cause an HTMLScanFailed exception.

    All allowed tags must balance, unless they end in '/'.  A true
    argument to "close()" will cause it to append closing tags for all
    unclosed nonempty tags.'''
    def __init__(self, allow=(), skip=(), quote=()):
        self.policy = policy = {}
        empties = {}
        self.allowEmpty = empties.has_key
        for tag in allow:
            tag = upper(tag)
            if tag[-1:] == '/':
                tag = tag[:-1]
                empties[tag] = None
            policy[tag] = ALLOW
        for tag in skip:
            policy[upper(tag)] = SKIP
        for tag in quote:
            policy[upper(tag)] = QUOTE
        policy['SCRIPT'] = policy.get('SCRIPT', FAIL)

        self.tagStack = []
        self.txt = []
        SGMLParser.__init__(self)

    def handle_data(self, data):
        self.txt.append(data)

    def handle_comment(self, data):
        self.txt.append(data)

    def unknown_starttag(self, tag, attrs):
        policy = self.policy.get(upper(tag), FAIL)
        if policy == SKIP:
            return
        scriptPolicy = self.policy['SCRIPT']
        tagtxt = ['<' + tag]
        for name, value in attrs:
            if scriptPolicy <> ALLOW and (
                upper(name)[:2] == 'ON' or upper(value)[:11] == 'JAVASCRIPT:'):
                if scriptPolicy == FAIL:
                    raise HTMLScanFailed, 'SCRIPT'
                continue
            tagtxt.append(name + '=' + '"' + value + '"')
        tagtxt = join(tagtxt) + '>'
        if policy == QUOTE:
            self.txt.append(html_quote(tagtxt))
        elif policy == ALLOW:
            self.txt.append(tagtxt)
            self.tagStack.append(upper(tag))
        else:
            raise HTMLScanFailed, tag
    
    def unknown_endtag(self, tag):
        policy = self.policy.get(upper(tag), FAIL)
        if policy == SKIP:
            return
        tagtxt = '</' + tag + '>'
        if policy == QUOTE:
            self.txt.append(html_quote(tagtxt))
        elif policy == ALLOW:
            self.txt.append(tagtxt)
            tagStack = self.tagStack
            while tagStack:
                lastTag = self.tagStack.pop()
                if lastTag == upper(tag):
                    return
                if not self.allowEmpty(lastTag):
                    self.report_unbalanced(lastTag)
            self.report_unbalanced(tag)
        else:
            raise HTMLScanFailed, tag

    def report_unbalanced(self, tag):
        raise HTMLScanUnbalanced, tag

    def close(self, autoclose=None):
        SGMLParser.close(self)
        tagStack = self.tagStack
        while tagStack:
            lastTag = self.tagStack.pop()
            if not self.allowEmpty(lastTag):
                if autoclose:
                    self.txt.append('</' + lastTag + '>')
                else:
                    self.report_unbalanced(lastTag)        
        return join(self.txt, '')

# Tag groups used to assemble the "allow" lists of the factory
# functions below.  A trailing '/' marks a tag that need not be closed.

# Document-level tags.
docs = (
    'html', 'head', 'title', 'body', 'style',
    'link/', 'base/', 'meta/',
)
# Frame tags.
frames = ('frameset', 'frame', 'noframes')
# Table tags.
tables = ('table', 'tr', 'th', 'td')
# Form tags.
forms = (
    'form', 'select', 'textarea',
    'input/', 'option/',
)
# Heading tags.
headers = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
# Inline markup tags.
markup = (
    'a', 'i', 'b', 'em', 'strong', 'code', 'font',
    'basefont/',
)
# Basic block-level and image tags.
basic = ('pre', 'img/', 'p/', 'br/', 'hr/')
# Tags that can run or embed active content.
active = ('script', 'embed', 'object')
# Generic containers and layers.
structure = ('div', 'span', 'layer', 'ilayer')

def AllHTML():
    '''Return a scanner that allows every tag group of this module,
    including the active and structure groups.'''
    permitted = frames + docs + tables + forms
    permitted = permitted + headers + markup + basic
    permitted = permitted + active + structure
    return HTMLScanner(allow=permitted)

def InertHTML():
    '''Return a scanner that allows whole documents (frames, docs,
    tables, forms, headers, markup, basic) but not the active or
    structure tag groups.'''
    permitted = frames + docs + tables + forms
    permitted = permitted + headers + markup + basic
    return HTMLScanner(allow=permitted)

def FragmentHTML():
    '''Return a scanner for document fragments: tables, forms,
    headers, markup and basic tags only.'''
    permitted = tables + forms + headers + markup + basic
    return HTMLScanner(allow=permitted)

def SimpleHTML():
    '''Return a scanner that allows only the inline markup tags.'''
    permitted = markup
    return HTMLScanner(allow=permitted)


if __name__ == '__main__':
    scanner = AllHTML()
    from urllib import urlopen
    zopemain = urlopen('http://www.zope.org').read()
    scanner.feed(zopemain)
    print scanner.close(autoclose=1)