# Licence: ZPL # USE AT YOUR OWN RISK # This is a quick local page cache tool, which can be used as a web agent, or # opaque proxy. It is still very basic, but works for basic tasks. # Essentially this is how it works: # Give it an URL: "http://www.zope.org" # It grabs the html, assigns it an id based on datetime (thanks squishdot) # then it parses the html, looking for anchors and images. # Images get pulled down, with the id's rewritten, and saved into the local # file system. # Anchors get rewritten to point back to the localpagecache. # The link at the bottom of the page "http://www.zope.com" turns into: # "http://whateversite.com:8080/pagecache/get_url?href_url=http://www.zope.com/" # Issues: # Most relative URLs will confuse it still. # There should be some sort of session management, at least seperate folders # for each user, to prevent one user from deleting files in use by another. # Installation: # Ya I know, this should be a product... Maybe next release... +) # # Put this file in /Extensions # make a folder wherever called "pagecache" # put a little form in the index_html like this: # #
# # # # add external methods: # clear_cache (clear the cache) "clear_cache" method # get_img (get an image) "get_img" method # get_url (get an url) "update" method # # add an instance of localFS called "localpagecache". # Finally go through and make sure that all the file locations match up # with your configuration. I indicated them with "<---------------" # # Big thanks to Thomas B. Passin for help and code for string.split # import urllib from httplib import HTTP # transport for fetching docs from time import time, localtime, strftime, gmtime # time handling import re from sys import argv,stdin,stdout,stderr import os import sgmllib import formatter import string url=re.compile('((http:|www\.|ftp:)[^\s"<]*.[^\s"<]*)',re.I) httpstring=re.compile('http:') rootfolderstring=re.compile('//') relativestring=re.compile('/') relativeemptystring=re.compile('[a-zA-Z0-9]') imgtag=re.compile("") notfound=re.compile(".*file not found.*",re.I) def createId(): """ creates an id based on date/time """ id=int(time()) while 1: try: o=self.data[id] except: break id=id+1 return id def clear_cache(self): """ Deletes all the files out of the webcache folder SHould probably be used with some kind of session control so as to not delete cached files that are in use. """ os.chdir('/usr/local/zope/localpagecache') # <--------------- l = os.listdir('/usr/local/zope/localpagecache') # <--------------- for file in l: os.remove(file) return str(len(l)) + " files deleted" def get_img(self, REQUEST=None, href_url=None): """ Get an image """ if REQUEST.has_key('href_url'): href_url = REQUEST['href_url'] else: href_url = 'http://www.google.com' split=string.split(href_url,'/') imgtitle = split[-1] identifier=str(createId()) + imgtitle os.chdir('/usr/local/zope/localpagecache') resourcefile=urllib.urlretrieve(href_url, identifier) buildquickhtml= """IMAGE""" # <--------------- return buildquickhtml def update(self, REQUEST=None, href_url=None): """ Fetch a web page, assign it an id, and write the file out to that id Then parse the file for the images. Delete duplicates. Fetch images, assign them new Ids, and save them off. """ if REQUEST.has_key('href_url'): href_url = REQUEST['href_url'] else: href_url = 'http://www.google.com' split=string.split(href_url,'/') result= string.join(split[:-1],'/') foldercheck = (result+'/'==href_url) if foldercheck == 1: # It is already a folder just pass it on relative_folder_url = href_url else: #pull the filename off the end, and pass that. split=string.split(href_url,'/') relative_folder_url = string.join(split[:-1],'/') print href_url print "checking..." parsed_string = "" href_url = str(href_url) identifier=str(createId()) # Assign a timestamp/id localbaseURL="http://yoursite.com:8080/pagecache/localpagecache/" # <--------------- os.chdir('/usr/local/zope/localpagecache') # <--------------- resourcefile=urllib.urlretrieve(href_url, identifier) # retreive HTML file pieces = parse_images(identifier, relative_folder_url) # Parse the file urllib.urlcleanup() for piece in pieces: parsed_string = parsed_string + piece return parsed_string def parse_images(file_handle, href_url): """Parses out image tags found in the HTML source. Returns the information as a list of TableRowRecord objects. """ abs_formatter = formatter.NullFormatter() # Not sure what this does parser = ImageParser() # Assign to the class parser.href_url = href_url os.chdir('/usr/local/zope/localpagecache/') # <--------------- file = open(file_handle) # Open the file html = file.read() # Read it into memory parser.feed(html) # show it to the parser class file.close() # Close it return parser.pieces # generic return success def checkUrl(url, href_url): newurl = 1 absoluteurl = "" a = httpstring.match(url) # Is this an absolute URL? rf = rootfolderstring.match(url) # or root folder relative? r = relativestring.match(url) # or just regular relative? relempty = relativeemptystring.match (url) # or empty relative? if a: print "[absolute URL found]" absoluteurl = url elif rf: print "[root folder URL found]" absoluteurl = "http:" + str(url) elif r: print "[relative URL found]" absoluteurl = href_url + url elif relempty: print "[relative empty URL found]" absoluteurl = href_url + "/" + url else: print "[No match for URL]" absoluteurl = href_url + url return absoluteurl class ImageParser(sgmllib.SGMLParser): def reset(self): sgmllib.SGMLParser.reset(self) self.pieces = [] self.image_urls = [] self.urls = [] self.href_url = "" def unknown_starttag(self, tag, attrs): strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs]) self.pieces.append("<%(tag)s%(strattrs)s>" % locals()) def unknown_endtag(self, tag): self.pieces.append("" % locals()) def start_a(self, attrs): redirectHTMLURL="http://yoursite.com:8080/pagecache/get_url" # <--------------- redirectIMGURL="http://yoursite.com:8080/pagecache/get_img" # <--------------- href = [v for k, v in attrs if k=='href'] if href: ab_url = checkUrl(href[0], self.href_url) self.urls.append(ab_url) else: ab_url = "" typesplit=string.split(ab_url, ".") type = typesplit[-1] if type == "html": self.pieces.append("") elif type == "gif": self.pieces.append("") elif type == "jpg": self.pieces.append("") elif type == "jpeg": self.pieces.append("") else: self.pieces.append("") def end_a(self): self.pieces.append("") def start_form(self, attrs): redirectURL="http://yoursite.com:8080/pagecache/get_url" # <--------------- action = [v for k, v in attrs if k=='action'] if action: ab_url = checkUrl(action[0], self.href_url) name = [v for k, v in attrs if k=='name'] if name: link_name = name[0] else: link_name = "" self.pieces.append("") def do_img(self, attrs): """This is called everytime we get an Image through the parse """ storageURL="http://yoursite.com:8080/pagecache/localpagecache/" # <--------------- print "[got image]" # Yay src = [v for k, v in attrs if k=='src'] image_ab_url = "" if src: print src[0] image_ab_url = checkUrl(src[0], self.href_url) print "[linking to absolute url]" print image_ab_url self.image_urls.append(image_ab_url) alt = [v for k, v in attrs if k=='alt'] if alt: alt_tag = "alt=" + str(alt[0]) else: alt_tag = "alt=none" split=string.split(image_ab_url,'/') imgtitle = split[-1] newname=str(createId()) + imgtitle try: resourcefile=urllib.urlretrieve(image_ab_url, newname) # retreive Image file urllib.urlcleanup() except: pass self.pieces.append("""") def start_body(self, attrs): strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs]) self.pieces.append("" % locals()) def end_body(self): self.pieces.append("""When finished, you can clear out the server cache""") def handle_data(self, text): self.pieces.append(text)