# Copyright (C) 2001 Stefane Fermigier <sf@fermigier.com> and
# Nuxeo SARL <contact@nuxeo.com>.
# See licence info at the end of this file.

import htmllib, formatter, string

class HTMLParser(htmllib.HTMLParser):
  def __init__(self):
    self._title = ''
    self._raw_text = ''
    self.meta_d = {}
    htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

  def start_title(self, attrs):
    self.save_bgn()
    
  def end_title(self):
    self._title = self.save_end()
    
  def start_body(self, attrs):
    self.save_bgn()

  def end_body(self):
    self._raw_text = self.save_end()

  def do_meta(self, attrs):
    for k, v in attrs:
      k = string.lower(k)
      if k == 'name':
        name = v
      elif k == 'content':
        content = v
    try:
      self.meta_d[string.lower(name)] = content
    except:
      pass

  #
  # Override default formatting.
  #
  def anchor_end(self):
    self.anchor = None

  def do_img(self, atrs):
    pass

  #
  # Retrieve
  #
  def _getMeta(self, name):
    return string.strip(self.meta_d.get(name, ''))

  def getAuthor(self):
    return self._getMeta('author') or self._getMeta('dc.author')

  def getPublisher(self):
    return self._getMeta('publisher') or self._getMeta('dc.publisher')

  def getSource(self):
    return self._getMeta('source') or self._getMeta('dc.source')

  def getDescription(self):
    return self._getMeta('description') or self._getMeta('dc.description')

  def getKeywords(self):
    return self._getMeta('keywords')

  def getTitle(self):
    return self._title

  def getRaw(self):
    return self._raw_text

#
# Simple tests
#

def test(fn):
  """Parse the HTML file named fn and print every extracted field.

  Prints one "<Field>: <value>" line per accessor, in the order Title,
  Author, Source, Publisher, Description, Keywords, Raw.
  """
  p = HTMLParser()
  # Close the file explicitly instead of leaving it to garbage collection.
  f = open(fn)
  try:
    p.feed(f.read())
  finally:
    f.close()
  p.close()
  for k in ('Title', 'Author', 'Source', 'Publisher', 'Description',
            'Keywords', 'Raw'):
    # A single parenthesised expression prints the same with the Python 2
    # print statement and with the print() function.
    print('%s: %s' % (k, getattr(p, 'get' + k)()))

if __name__ == '__main__':
  import sys, glob

  # With an argument, parse that one file; with none, run over every
  # sample page matching tests/*.html.
  if len(sys.argv) != 1:
    test(sys.argv[1])
  else:
    for sample in glob.glob('tests/*.html'):
      test(sample)

# vim:et:ts=2:ai
