#
# update_staticpage script
# Peter Bengtsson
# http://www.zope.org/Members/peterbe/update_staticpage
#
# Date today: 31/08/01
#
# Version: alpha
# Python vers
#
__version__='0.0.1'

__doc__="""A humble script that opens your webpage and sucks in
parts of it and saves the HTML into a DTML Document.
The id of the DTML Document will be the page name of the URL
+'_static'. So, if you open /web/doc/index.html the id of the
DTML Document will be 'index.html_static'.
Be careful not to request URLs like: www.server.com/.
Use instead www.server.com/index_html to be explicit.

http://www.zope.org/Members/peterbe/update_staticpage
"""

from urllib import urlopen
from random import choice
import string


def update_staticpage(self, url, container=None):
    """Fetch the static part of `url` and store it in a DTML Document.

    Parameters:
        self      -- the object this is called on; used as the container
                     when `container` is None.
        url       -- the page to request; passed on to getHTML().
        container -- the object whose manage_delObjects() and
                     manage_addDTMLDocument() methods are used to replace
                     the document. Defaults to `self`.

    Returns the id of the DTML Document that was (re)created, or 0 when
    getHTML() found nothing to store.
    """
    if container is None:
        # less perfect
        container = self

    got = getHTML(url)
    if not got:
        # failure
        return 0

    # getHTML() returns a one-entry dictionary: {html: guessed id}.
    html, lastbit = list(got.items())[0]

    # A URL ending in '/' has no page name; fall back to a fixed one.
    if lastbit == '':
        lastbit = "page"
    lastbit = lastbit + '_static'

    # Replace a previous copy, but only if it really is a DTML Document.
    if hasattr(container, lastbit) and \
       getattr(container, lastbit).meta_type == 'DTML Document':
        container.manage_delObjects([lastbit])

    # maybe this should be a File object??
    container.manage_addDTMLDocument(lastbit, file=html)

    # return the name of the DTML Document
    return lastbit


def getHTML(url):
    """Request `url` and cut out the HTML between the two markers.

    The page is requested with 'suppress_static=1' plus one random
    single-letter parameter appended to the query string.

    Returns a one-entry dictionary {html: guessed_id}, where `html` is
    the text between the start and end marker and `guessed_id` is the
    part of `url` after its last '/', exactly as given. Returns an empty
    dictionary unless each marker occurs exactly once and the start
    marker comes before the end marker.
    """
    # NOTE(review): both markers are empty strings in this source. With
    # empty markers the check below can never succeed (count('') is
    # len(html)+1 and find('') is 0 for both), so this function always
    # returns {}. Presumably the real marker strings were lost -- confirm
    # and restore them.
    starter = ''
    ender = ''

    # append a random str to prevent cache confusion
    # (ascii_lowercase rather than the locale-dependent string.lowercase,
    # so only plain a-z can end up in the URL)
    randomstr = '&%s=%s' % (choice(string.ascii_lowercase),
                            choice(string.ascii_lowercase))

    # Extend an existing query string instead of adding a second '?'.
    if '?' in url:
        separator = '&'
    else:
        separator = '?'
    whole_url = '%s%ssuppress_static=1%s' % (url, separator, randomstr)

    # Always release the connection, even if reading fails.
    response = urlopen(whole_url)
    try:
        html = response.read()
    finally:
        response.close()

    # This needs work. It means that one can only have One static part per page
    if html.count(starter) == 1 and html.count(ender) == 1 \
       and html.find(starter) < html.find(ender):
        # ok. proceed
        html = html.split(starter)[1]
        html = html.split(ender)[0]
        lastbit = url.split('/')[-1]
        return {html: lastbit}

    return {}


if __name__=='__main__':
    print(getHTML('http://www.peterbe.com/web/doc/index_html'))