#
# Copyright 2001-2002 Stefane Fermigier and Nuxeo SARL
# See LICENSE.TXT for licensing information
#
from plugin import Plugin, MatcherBase, ExternalConverterBase, ConversionError, checkCmd
import os, re, misc

# Use zLOG for logging when it can be imported; otherwise install a no-op
# LOG function and a placeholder DEBUG level so that debug() below still works.
try:
  from zLOG import LOG, DEBUG
except ImportError:
  # When called by tests.
  def LOG(*args, **kw):
    # Accept any arguments and discard them.
    pass
  DEBUG = None

# Runs at import time.
# NOTE(review): presumably checks that the 'wvHtml' executable is available;
# confirm against plugin.checkCmd.
checkCmd('wvHtml')

# wv config file (wvZope.xml) is in same directory as this module.
# The current working directory is joined in front so that the result is
# absolute even when __file__ is a relative path; it is computed once, at
# import time.
WV_CONFIG = os.path.join(
  os.getcwd(), os.path.dirname(__file__), 'wvZope.xml')

def debug(s):
  """Pass message `s` to LOG, tagged 'MSOffice', at the DEBUG level."""
  subsystem = 'MSOffice'
  LOG(subsystem, DEBUG, s)


class MSOfficeMatcher(MatcherBase):
  """Matcher for MSOffice that deals with buggy file command.

  When no usable MIME type was supplied by the caller and the detected type
  only hints at "some Microsoft document" (or is generic), the decision is
  taken from the file name extension.  The extensions are read from
  ``self.acceptable_suffixes``, which this class does not define itself
  (the subclasses in this module set it).
  """
  def matches(self, **kw):
    """Examine file extension if MSOffice and no user mime_type was provided.

    Keyword arguments read here: ``mime_type``, ``data_mime_type``,
    ``data_file_type`` and ``filename``.

    Returns 1 when the extension check succeeds; in every other case returns
    the result of ``MatcherBase.matches`` called with the same arguments.
    """
    mime_type = kw.get('mime_type')
    if not mime_type or mime_type == 'application/octet-stream':
      # The key may be present with a None value; kw.get(key, '') would then
      # hand back None and the substring test below would raise.
      file_type = kw.get('data_file_type') or ''
      looks_like_office = (
        kw.get('data_mime_type') in ('application/msword', 'data')
        or file_type.find('Microsoft') >= 0)
      filename = kw.get('filename')
      if looks_like_office and filename is not None and '.' in filename:
        # Text after the last dot, compared case-insensitively.
        ext = filename.split('.')[-1].lower()
        if ext in self.acceptable_suffixes:
          return 1
    return MatcherBase.matches(self, **kw)


class MSWordMatcher(MSOfficeMatcher):
  """Matcher for MS Word documents."""
  # MIME types taken as MS Word.
  acceptable_mime_types = ['application/msword', 'application/vnd.ms-word',
                           'text/msword', 'text/vnd.ms-word']
  # File type descriptions taken as MS Word.
  # NOTE(review): the first entry looks like a regular expression; confirm
  # how MatcherBase compares these.
  acceptable_file_types = ['Microsoft Word (6.0)? Document',
                           'Microsoft Word document data']
  # File name extensions (lower case, no dot) used by MSOfficeMatcher.matches.
  acceptable_suffixes = ['doc']

class MSWordConverter(ExternalConverterBase):
  """Converter for MS Word documents, based on the external wvHtml command."""
  suffix = 'doc'
  mime_type = 'application/msword'

  def _convertToHtml(self):
    """Build the wvHtml command line and hand it to self._runCommand.

    The command changes into self.tmpdir, converts '<name>.doc' to
    '<name>.html' and redirects all output to '<name>.log-wvWare'.
    """
    # was "cd '%s' && wvWare -c iso-8859-1 -x %s '%s.%s' > '%s.html'..."
    # Find correct executable call
    pipe = os.popen('wvHtml --help 2>&1')
    try:
      usage = pipe.read()
    finally:
      # Do not leave the pipe open until garbage collection.
      pipe.close()
    if usage.find('--charset=') >= 0:
      execmd = 'wvHtml --charset=iso-8859-1'
    else:
      # Old version.
      execmd = 'wvHtml'

    # wvHtml prefers to be called with relative paths
    # NOTE(review): the other converters in this file build their paths from
    # self.basefilename while this one uses self.name; confirm both are
    # provided by ExternalConverterBase.
    cmd = ("cd '%s' && %s '%s.doc' '%s.html' >'%s.log-wvWare' 2>&1"
           % (self.tmpdir, execmd,
              self.name, self.name, self.name))
    self._runCommand(cmd, 'wvWare')

  def getHtml(self):
    """Return the converted HTML, recoded to iso-8859-1 when possible.

    If the page declares charset=UTF-8, the data is recoded to iso-8859-1
    and the declaration rewritten to match.  When recoding fails, both the
    data and the declaration are left as they were.  The placeholder image
    tag for missing graphics is always removed.
    """
    data = ExternalConverterBase.getHtml(self)
    if re.search('CONTENT="text/html; charset=UTF-8"', data):
      try:
        data = unicode(data, 'utf-8').encode('iso-8859-1')
      except UnicodeError:
        # XXX problem: may fail because of other buggy characters
        # Only decoding/encoding failures are tolerated here; anything else
        # is a real error and must propagate.
        pass
      else:
        data = re.sub('CONTENT="text/html; charset=UTF-8"',
          'CONTENT="text/html; charset=iso-8859-1"', data)
    return re.sub(
      '<img alt="0x08 graphic" src="StrangeNoGraphicData">', '', data)

  def getImageNames(self):
    """Return the names of the image files found in self.tmpdir.

    A file counts as an image when the text after its last dot is one of
    png, jpg, gif, wmz or wmf (compared case-sensitively).
    """
    image_exts = ('png', 'jpg', 'gif', 'wmz', 'wmf')
    image_names = []
    for fn in os.listdir(self.tmpdir):
      # Greedy '.+' before the dot makes 'ext' the part after the LAST dot.
      m = re.match(r"^.+\.(?P<ext>.+)$", fn)
      if m and m.group('ext') in image_exts:
        image_names.append(fn)
    return image_names

  def getImage(self, image_name):
    """Return self._getFile() for `image_name` located in self.tmpdir."""
    return self._getFile("%s/%s" % (self.tmpdir, image_name))



class MSPowerPointMatcher(MSOfficeMatcher):
  """Matcher for MS PowerPoint presentations."""
  # MIME types taken as PowerPoint.
  acceptable_mime_types = ['application/vnd.ms-powerpoint',
                           'application/powerpoint']
  # File name extensions (lower case, no dot) used by MSOfficeMatcher.matches.
  acceptable_suffixes = [
    'ppt',
    ]

class MSPowerPointConverter(ExternalConverterBase):
  """Converter for MS PowerPoint files, based on the external ppthtml command."""
  suffix = 'ppt'
  mime_type = 'application/vnd.ms-powerpoint'

  def getHtml(self):
    """Return the converted HTML after charset and stray-byte clean-up.

    A charset=UTF-8 declaration is rewritten to iso-8859-1 and the data is
    recoded to latin-1 if that succeeds.  Afterwards
    misc.convert_stray_utf8 is applied and a few remaining byte sequences
    are replaced by plain ASCII.
    """
    utf8_meta = 'CONTENT="text/html; charset=UTF-8"'
    latin1_meta = 'CONTENT="text/html; charset=iso-8859-1"'

    html = ExternalConverterBase.getHtml(self)
    if re.search(utf8_meta, html) is not None:
      # The declaration is rewritten first, whether or not recoding works.
      html = re.sub(utf8_meta, latin1_meta, html)
      try:
        html = unicode(html, 'utf-8').encode('latin-1')
      except UnicodeError:
        pass
    html = misc.convert_stray_utf8(html)

    # Other stray characters (ppthtml 0.5.1)
    # Order matters: the longer sequences must be replaced before '\x80'.
    leftovers = [
      (' \x80\x99', "'"),
      ('\x80\x99', "'"),
      ('\x80', "..."),
    ]
    for stray, replacement in leftovers:
      html = html.replace(stray, replacement)
    return html

  def _convertToHtml(self):
    """Run ppthtml on '<basefilename>.ppt' and check its error log.

    Raises ConversionError when the log contains 'OLE2 object not found'.
    """
    base = self.basefilename
    template = "cd '%s' && ppthtml '%s.ppt' >'%s.html' 2>'%s.log-ppthtml'"
    cmd = template % (self.tmpdir, base, base, base)
    debug('MSPPTConverter cmd: %s' % cmd)
    # NOTE(review): 30 is presumably a timeout in seconds; confirm in misc.
    misc.systemWithTimeout(cmd, 30)
    # XXX unlink log
    log = self._getFile("%s.log-ppthtml" % base)
    if re.search('OLE2 object not found', log):
      raise ConversionError



class MSExcelMatcher(MSOfficeMatcher):
  """Matcher for MS Excel spreadsheets."""
  # MIME types taken as Excel.
  acceptable_mime_types = ['application/vnd.ms-excel', 'application/excel']
  # File name extensions (lower case, no dot) used by MSOfficeMatcher.matches.
  acceptable_suffixes = [
    'xls',
    ]

class MSExcelConverter(ExternalConverterBase):
  """Converter for MS Excel files, based on the external xlhtml command."""
  suffix = 'xls'
  mime_type = 'application/vnd.ms-excel'

  def getHtml(self):
    """Return the converted HTML passed through misc.convert_stray_utf8."""
    return misc.convert_stray_utf8(ExternalConverterBase.getHtml(self))

  def _convertToHtml(self):
    """Run xlhtml on '<basefilename>.xls' and check its error log.

    Raises ConversionError when the log contains 'OLE2 object not found'.
    """
    base = self.basefilename
    template = "cd '%s' && xlhtml -nh -a '%s.xls' >'%s.html' 2>'%s.log-xlhtml'"
    cmd = template % (self.tmpdir, base, base, base)
    debug('MSXLSConverter cmd: %s' % cmd)
    # NOTE(review): 30 is presumably a timeout in seconds; confirm in misc.
    misc.systemWithTimeout(cmd, 30)
    log = self._getFile("%s.log-xlhtml" % base)
    if re.search('OLE2 object not found', log):
      raise ConversionError



def getPlugins():
  """Return the list of Plugin objects defined by this module.

  Order is PowerPoint, Excel, then Word, so that Word is matched last.
  """
  powerpoint = Plugin('MSPowerPoint', MSPowerPointMatcher, MSPowerPointConverter)
  excel = Plugin('MSExcel', MSExcelMatcher, MSExcelConverter)
  word = Plugin('MSWord', MSWordMatcher, MSWordConverter)  # match word last
  return [powerpoint, excel, word]
