#! /usr/bin/env python

"""A variant on webchecker that creates a mirror copy of a remote site."""

__version__ = "$Revision$"

import os
import posixpath
import sys
import string  # no longer used here; kept so the module namespace is unchanged
import urllib
import getopt

import webchecker

# Extract real version number if necessary.
# An expanded keyword ("$Revision: 1.23 $") splits into exactly three words;
# the unexpanded "$Revision$" does not and is left untouched.
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]


def main():
    """Command-line entry point: mirror every root URL named in sys.argv.

    Options:
        -q  set verbosity to 0
        -v  increase verbosity by one (may be repeated)

    Returns 2 after printing a usage message when the options cannot be
    parsed; otherwise returns None once the run has finished.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % sys.argv[0])
        return 2
    for opt, _value in opts:
        if opt == "-q":
            verbose = 0
        if opt == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % arg)
        c.addroot(arg)
    print("Run...")
    c.run()


class Sucker(webchecker.Checker):
    """A webchecker.Checker that keeps a local copy of every page it reads.

    Pages are stored under ``<host>/<url path>`` relative to the current
    directory (see savefilename()); a page that already exists there is
    read from disk instead of being fetched again.
    """

    # Settings consumed by webchecker.Checker.
    # NOTE(review): presumably these disable external-link checking and
    # name-anchor checking respectively -- confirm against webchecker.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return ``(text, url)`` for the page named by *url_pair*.

        The local copy is used when it can be opened; otherwise the page
        is fetched with openpage() and written to disk with savefile().
        *url* is the first element of *url_pair*, replaced by the URL the
        opener reports (geturl()) when that differs.  *text* is None when
        the page could not be opened or checkforhtml() rejects it.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No readable local copy yet: fetch the page and save it.
            f = self.openpage(url_pair)
            if f:
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        # The opener ended up at a different URL; file
                        # the page under that name instead.
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    # Release the connection even if reading fails.
                    f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Local copy found; there are no headers, so checkforhtml()
            # gets an empty info mapping.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write *text* to *path*, creating parent directories as needed.

        Saving is best-effort: any IOError or os.error raised while
        creating the directories or writing the file is reported through
        self.message() rather than propagated, so a single unwritable
        page does not abort the whole run.
        """
        dirname = os.path.split(path)[0]
        try:
            makedirs(dirname)
            f = open(path, "wb")
            try:
                f.write(text)
            finally:
                f.close()
        except (IOError, os.error) as msg:
            self.message("didn't save %s: %s", path, str(msg))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Map *url* to the relative local path its copy is stored at.

        The result is ``<lowercased host>/<url path>`` using os.sep as
        separator; user and port parts of the host are dropped.  An empty
        path, or one ending in "/", gets "index.html" appended.

        "." and ".." segments are resolved against the site root, so a
        URL such as ``http://host/../../x`` cannot name a file outside
        the ``host`` directory.
        """
        rest = urllib.splittype(url)[1]
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        host = urllib.splituser(host)[1]
        host = urllib.splitnport(host)[0]
        host = host.lower()
        # Remember this before normpath() drops the trailing slash.
        wants_index = not path or path.endswith("/")
        # Anchoring at "/" makes normpath() discard ".." segments that
        # would climb above the root instead of keeping them.
        path = posixpath.normpath("/" + path).lstrip("/")
        if wants_index or not path:
            path = posixpath.join(path, "index.html")
        # NOTE(review): backslashes or drive letters inside a URL path are
        # not neutralised here; that matters on platforms where they are
        # path syntax -- confirm whether it needs handling.
        if os.sep != "/":
            path = path.replace("/", os.sep)
        return os.path.join(host, path)


def makedirs(dir):
    """Create directory *dir* and any missing parents.

    Does nothing for an empty name or an existing directory.  When *dir*
    exists as a plain file, the file is moved to ``<dir>/index.html`` so
    the name can become a directory; a failure of that conversion is
    printed and otherwise ignored (the caller's subsequent write will
    fail and be reported there).

    Raises os.error if a missing directory cannot be created.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except os.error as msg:
                print("Couldn't turn %s into a directory: %s" % (dir, msg))
        return
    head, tail = os.path.split(dir)
    if not tail:
        # Nothing left to split off (e.g. a bare root): give up.
        print("Huh? Don't know how to make dir %s" % dir)
        return
    makedirs(head)
    os.mkdir(dir, 0o777)


if __name__ == '__main__':
    sys.exit(main() or 0)