diff -u urlgrabber-3.1.0/urlgrabber/grabber.py urlgrabber-3.9.1/urlgrabber/grabber.py --- urlgrabber-3.1.0/urlgrabber/grabber.py 2006-09-21 20:58:05.000000000 -0400 +++ urlgrabber-3.9.1/urlgrabber/grabber.py 2010-06-26 13:12:59.000000000 -0400 @@ -16,6 +16,7 @@ # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko +# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal """A high-level cross-protocol url-grabber. @@ -55,8 +56,9 @@ text = None - specifies an alternativ text item in the beginning of the progress - bar line. If not given, the basename of the file is used. + specifies alternative text to be passed to the progress meter + object. If not given, the default progress meter will use the + basename of the file. throttle = 1.0 @@ -157,16 +159,11 @@ partial file or directory name. opener = None - - Overrides the default urllib2.OpenerDirector provided to urllib2 - when making requests. This option exists so that the urllib2 - handler chain may be customized. Note that the range, reget, - proxy, and keepalive features require that custom handlers be - provided to urllib2 in order to function properly. If an opener - option is provided, no attempt is made by urlgrabber to ensure - chain integrity. You are responsible for ensuring that any - extension handlers are present if said features are required. - + No-op when using the curl backend (default) + + cache_openers = True + No-op when using the curl backend (default) + data = None Only relevant for the HTTP family (and ignored for other @@ -179,6 +176,78 @@ badly and if you do not use the proper case (shown here), your values will be overridden with the defaults. + urlparser = URLParser() + + The URLParser class handles pre-processing of URLs, including + auth-handling for user/pass encoded in http urls, file handing + (that is, filenames not sent as a URL), and URL quoting. If you + want to override any of this behavior, you can pass in a + replacement instance. See also the 'quote' option. + + quote = None + + Whether or not to quote the path portion of a url. + quote = 1 -> quote the URLs (they're not quoted yet) + quote = 0 -> do not quote them (they're already quoted) + quote = None -> guess what to do + + This option only affects proper urls like 'file:///etc/passwd'; it + does not affect 'raw' filenames like '/etc/passwd'. The latter + will always be quoted as they are converted to URLs. Also, only + the path part of a url is quoted. If you need more fine-grained + control, you should probably subclass URLParser and pass it in via + the 'urlparser' option. + + ssl_ca_cert = None + + this option can be used if M2Crypto is available and will be + ignored otherwise. If provided, it will be used to create an SSL + context. If both ssl_ca_cert and ssl_context are provided, then + ssl_context will be ignored and a new context will be created from + ssl_ca_cert. 
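For illustration only, a minimal sketch of how options like the ones documented here are supplied as keyword arguments (the URLGrabber class and its urlgrab() method appear later in this patch; the URL and CA-bundle path below are placeholders):

    from urlgrabber.grabber import URLGrabber

    g = URLGrabber()
    # per-call keyword options are merged into the grabber's defaults
    # (see URLGrabberOptions.derive(**kwargs) further down in this patch)
    path = g.urlgrab('https://example.org/some/file.txt', 'file.txt',
                     ssl_ca_cert='/etc/pki/tls/certs/ca-bundle.crt',  # handed to curl as CAINFO/CAPATH
                     quote=None)  # let the URLParser guess whether the path is already quoted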
+ + ssl_context = None + + No-op when using the curl backend (default) + + + self.ssl_verify_peer = True + + Check the server's certificate to make sure it is valid with what our CA validates + + self.ssl_verify_host = True + + Check the server's hostname to make sure it matches the certificate DN + + self.ssl_key = None + + Path to the key the client should use to connect/authenticate with + + self.ssl_key_type = 'PEM' + + PEM or DER - format of key + + self.ssl_cert = None + + Path to the ssl certificate the client should use to to authenticate with + + self.ssl_cert_type = 'PEM' + + PEM or DER - format of certificate + + self.ssl_key_pass = None + + password to access the ssl_key + + self.size = None + + size (in bytes) or Maximum size of the thing being downloaded. + This is mostly to keep us from exploding with an endless datastream + + self.max_header_size = 2097152 + + Maximum size (in bytes) of the headers. + RETRY RELATED ARGUMENTS @@ -283,28 +352,6 @@ passed the same arguments, so you could use the same function for both. - urlparser = URLParser() - - The URLParser class handles pre-processing of URLs, including - auth-handling for user/pass encoded in http urls, file handing - (that is, filenames not sent as a URL), and URL quoting. If you - want to override any of this behavior, you can pass in a - replacement instance. See also the 'quote' option. - - quote = None - - Whether or not to quote the path portion of a url. - quote = 1 -> quote the URLs (they're not quoted yet) - quote = 0 -> do not quote them (they're already quoted) - quote = None -> guess what to do - - This option only affects proper urls like 'file:///etc/passwd'; it - does not affect 'raw' filenames like '/etc/passwd'. The latter - will always be quoted as they are converted to URLs. Also, only - the path part of a url is quoted. If you need more fine-grained - control, you should probably subclass URLParser and pass it in via - the 'urlparser' option. - BANDWIDTH THROTTLING urlgrabber supports throttling via two values: throttle and @@ -364,18 +411,26 @@ """ -# $Id: grabber.py,v 1.48 2006/09/22 00:58:05 mstenner Exp $ + import os import os.path import sys import urlparse -import rfc822 import time import string import urllib import urllib2 -from stat import * # S_* and ST_* +import mimetools +import thread +import types +import stat +import pycurl +from ftplib import parse150 +from StringIO import StringIO +from httplib import HTTPException +import socket +from byterange import range_tuple_normalize, range_tuple_to_header, RangeError ######################################################################## # MODULE INITIALIZATION @@ -385,55 +440,6 @@ except: __version__ = '???' -import sslfactory - -auth_handler = urllib2.HTTPBasicAuthHandler( \ - urllib2.HTTPPasswordMgrWithDefaultRealm()) - -try: - from i18n import _ -except ImportError, msg: - def _(st): return st - -try: - from httplib import HTTPException -except ImportError, msg: - HTTPException = None - -try: - # This is a convenient way to make keepalive optional. - # Just rename the module so it can't be imported. 
- import keepalive - from keepalive import HTTPHandler, HTTPSHandler - have_keepalive = True -except ImportError, msg: - have_keepalive = False - -try: - # add in range support conditionally too - import byterange - from byterange import HTTPRangeHandler, HTTPSRangeHandler, \ - FileRangeHandler, FTPRangeHandler, range_tuple_normalize, \ - range_tuple_to_header, RangeError -except ImportError, msg: - range_handlers = () - RangeError = None - have_range = 0 -else: - range_handlers = (HTTPRangeHandler(), HTTPSRangeHandler(), - FileRangeHandler(), FTPRangeHandler()) - have_range = 1 - - -# check whether socket timeout support is available (Python >= 2.3) -import socket -try: - TimeoutError = socket.timeout - have_socket_timeout = True -except AttributeError: - TimeoutError = None - have_socket_timeout = False - ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. @@ -456,14 +462,8 @@ global DEBUG DEBUG = DBOBJ - if have_keepalive and keepalive.DEBUG is None: - keepalive.DEBUG = DBOBJ - if have_range and byterange.DEBUG is None: - byterange.DEBUG = DBOBJ - if sslfactory.DEBUG is None: - sslfactory.DEBUG = DBOBJ -def _init_default_logger(): +def _init_default_logger(logspec=None): '''Examines the environment variable URLGRABBER_DEBUG and creates a logging object (logging.logger) based on the contents. It takes the form @@ -489,9 +489,12 @@ collect the code into a nice block.''' try: - dbinfo = os.environ['URLGRABBER_DEBUG'].split(',') + if logspec is None: + logspec = os.environ['URLGRABBER_DEBUG'] + dbinfo = logspec.split(',') import logging - level = logging._levelNames.get(dbinfo[0], int(dbinfo[0])) + level = logging._levelNames.get(dbinfo[0], None) + if level is None: level = int(dbinfo[0]) if level < 1: raise ValueError() formatter = logging.Formatter('%(asctime)s %(message)s') @@ -508,7 +511,19 @@ DBOBJ = None set_logger(DBOBJ) +def _log_package_state(): + if not DEBUG: return + DEBUG.info('urlgrabber version = %s' % __version__) + DEBUG.info('trans function "_" = %s' % _) + _init_default_logger() +_log_package_state() + + +# normally this would be from i18n or something like it ... +def _(st): + return st + ######################################################################## # END MODULE INITIALIZATION ######################################################################## @@ -536,6 +551,7 @@ 13 - malformed proxy url 14 - HTTPError (includes .code and .exception attributes) 15 - user abort + 16 - error writing to local file MirrorGroup error codes (256 -- 511) 256 - No more mirrors left to try @@ -567,7 +583,9 @@ # or simply print e #### print '[Errno %i] %s' % (e.errno, e.strerror) """ - pass + def __init__(self, *args): + IOError.__init__(self, *args) + self.url = "No url specified" class CallbackObject: """Container for returned callback data. 
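The error codes listed above and the new err.url attribute are the main consumer-visible changes in this part of the patch; below is a small illustrative sketch of using them. URLGrabError subclasses IOError, so errno and strerror behave as in the docstring example. The URL, filename, and debug spec are placeholders, and the '<level>[,<output>]' form is inferred from how _init_default_logger() splits the URLGRABBER_DEBUG value:

    import os
    # must be set before urlgrabber.grabber is imported, since the default
    # logger is created at import time from this environment variable
    os.environ['URLGRABBER_DEBUG'] = 'INFO'

    from urlgrabber.grabber import URLGrabber, URLGrabError

    try:
        URLGrabber().urlgrab('http://example.org/missing.txt', '/tmp/missing.txt')
    except URLGrabError, e:
        print '[Errno %i] %s (url: %s)' % (e.errno, e.strerror, e.url)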
@@ -661,7 +679,7 @@ quote = 0 # pathname2url quotes, so we won't do it again if scheme in ['http', 'https']: - parts = self.process_http(parts) + parts = self.process_http(parts, url) if quote is None: quote = self.guess_should_quote(parts) @@ -678,19 +696,9 @@ url = prefix + '/' + url return url - def process_http(self, parts): + def process_http(self, parts, url): (scheme, host, path, parm, query, frag) = parts - - if '@' in host and auth_handler: - try: - user_pass, host = host.split('@', 1) - if ':' in user_pass: - user, password = user_pass.split(':', 1) - except ValueError, e: - raise URLGrabError(1, _('Bad URL: %s') % url) - if DEBUG: DEBUG.info('adding HTTP auth: %s, %s', user, password) - auth_handler.add_password(None, host, user, password) - + # TODO: auth-parsing here, maybe? pycurl doesn't really need it return (scheme, host, path, parm, query, frag) def quote(self, parts): @@ -771,7 +779,7 @@ def _set_attributes(self, **kwargs): """Update object attributes with those provided in kwargs.""" self.__dict__.update(kwargs) - if have_range and kwargs.has_key('range'): + if kwargs.has_key('range'): # normalize the supplied range value self.range = range_tuple_normalize(self.range) if not self.reget in [None, 'simple', 'check_timestamp']: @@ -808,8 +816,36 @@ self.data = None self.urlparser = URLParser() self.quote = None - self.ssl_ca_cert = None - self.ssl_context = None + self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb + self.ssl_context = None # no-op in pycurl + self.ssl_verify_peer = True # check peer's cert for authenticityb + self.ssl_verify_host = True # make sure who they are and who the cert is for matches + self.ssl_key = None # client key + self.ssl_key_type = 'PEM' #(or DER) + self.ssl_cert = None # client cert + self.ssl_cert_type = 'PEM' # (or DER) + self.ssl_key_pass = None # password to access the key + self.size = None # if we know how big the thing we're getting is going + # to be. this is ultimately a MAXIMUM size for the file + self.max_header_size = 2097152 #2mb seems reasonable for maximum header size + + def __repr__(self): + return self.format() + + def format(self, indent=' '): + keys = self.__dict__.keys() + if self.delegate is not None: + keys.remove('delegate') + keys.sort() + s = '{\n' + for k in keys: + s = s + indent + '%-15s: %s,\n' % \ + (repr(k), repr(self.__dict__[k])) + if self.delegate: + df = self.delegate.format(indent + ' ') + s = s + indent + '%-15s: %s\n' % ("'delegate'", df) + s = s + indent + '}' + return s class URLGrabber: """Provides easy opening of URLs with a variety of options. @@ -878,9 +914,10 @@ like any other file object. """ opts = self.opts.derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) (url,parts) = opts.urlparser.parse(url, opts) def retryfunc(opts, url): - return URLGrabberFileObject(url, filename=None, opts=opts) + return PyCurlFileObject(url, filename=None, opts=opts) return self._retry(opts, retryfunc, url) def urlgrab(self, url, filename=None, **kwargs): @@ -890,6 +927,7 @@ different from the passed-in filename if copy_local == 0. 
""" opts = self.opts.derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) (url,parts) = opts.urlparser.parse(url, opts) (scheme, host, path, parm, query, frag) = parts if filename is None: @@ -901,16 +939,28 @@ if host: path = os.path.normpath('//' + host + path) if not os.path.exists(path): - raise URLGrabError(2, + err = URLGrabError(2, _('Local file does not exist: %s') % (path, )) + err.url = url + raise err elif not os.path.isfile(path): - raise URLGrabError(3, - _('Not a normal file: %s') % (path, )) + err = URLGrabError(3, + _('Not a normal file: %s') % (path, )) + err.url = url + raise err + elif not opts.range: + if not opts.checkfunc is None: + cb_func, cb_args, cb_kwargs = \ + self._make_callback(opts.checkfunc) + obj = CallbackObject() + obj.filename = path + obj.url = url + apply(cb_func, (obj, )+cb_args, cb_kwargs) return path def retryfunc(opts, url, filename): - fo = URLGrabberFileObject(url, filename, opts) + fo = PyCurlFileObject(url, filename, opts) try: fo._do_grab() if not opts.checkfunc is None: @@ -934,12 +984,13 @@ into memory, but don't use too much' """ opts = self.opts.derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) (url,parts) = opts.urlparser.parse(url, opts) if limit is not None: limit = limit + 1 def retryfunc(opts, url, limit): - fo = URLGrabberFileObject(url, filename=None, opts=opts) + fo = PyCurlFileObject(url, filename=None, opts=opts) s = '' try: # this is an unfortunate thing. Some file-like objects @@ -962,8 +1013,11 @@ s = self._retry(opts, retryfunc, url, limit) if limit and len(s) > limit: - raise URLGrabError(8, - _('Exceeded limit (%i): %s') % (limit, url)) + err = URLGrabError(8, + _('Exceeded limit (%i): %s') % (limit, url)) + err.url = url + raise err + return s def _make_callback(self, callback_obj): @@ -976,192 +1030,328 @@ # NOTE: actual defaults are set in URLGrabberOptions default_grabber = URLGrabber() -class URLGrabberFileObject: - """This is a file-object wrapper that supports progress objects - and throttling. - - This exists to solve the following problem: lets say you want to - drop-in replace a normal open with urlopen. You want to use a - progress meter and/or throttling, but how do you do that without - rewriting your code? Answer: urlopen will return a wrapped file - object that does the progress meter and-or throttling internally. - """ +class PyCurlFileObject(): def __init__(self, url, filename, opts): + self.fo = None + self._hdr_dump = '' + self._parsed_hdr = None self.url = url + self.scheme = urlparse.urlsplit(self.url)[0] self.filename = filename + self.append = False + self.reget_time = None self.opts = opts - self.fo = None + if self.opts.reget == 'check_timestamp': + raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this." + self._complete = False self._rbuf = '' self._rbufsize = 1024*8 self._ttime = time.time() self._tsize = 0 self._amount_read = 0 - self._opener = None + self._reget_length = 0 + self._prog_running = False + self._error = (None, None) + self.size = None self._do_open() + def __getattr__(self, name): """This effectively allows us to wrap at the instance level. Any attribute not found in _this_ object will be searched for in self.fo. 
This includes methods.""" + if hasattr(self.fo, name): return getattr(self.fo, name) raise AttributeError, name - - def _get_opener(self): - """Build a urllib2 OpenerDirector based on request options.""" - if self.opts.opener: - return self.opts.opener - elif self._opener is None: - handlers = [] - need_keepalive_handler = (have_keepalive and self.opts.keepalive) - need_range_handler = (range_handlers and \ - (self.opts.range or self.opts.reget)) - # if you specify a ProxyHandler when creating the opener - # it _must_ come before all other handlers in the list or urllib2 - # chokes. - if self.opts.proxies: - handlers.append( CachedProxyHandler(self.opts.proxies) ) - - # ------------------------------------------------------- - # OK, these next few lines are a serious kludge to get - # around what I think is a bug in python 2.2's - # urllib2. The basic idea is that default handlers - # get applied first. If you override one (like a - # proxy handler), then the default gets pulled, but - # the replacement goes on the end. In the case of - # proxies, this means the normal handler picks it up - # first and the proxy isn't used. Now, this probably - # only happened with ftp or non-keepalive http, so not - # many folks saw it. The simple approach to fixing it - # is just to make sure you override the other - # conflicting defaults as well. I would LOVE to see - # these go way or be dealt with more elegantly. The - # problem isn't there after 2.2. -MDS 2005/02/24 - if not need_keepalive_handler: - handlers.append( urllib2.HTTPHandler() ) - if not need_range_handler: - handlers.append( urllib2.FTPHandler() ) - # ------------------------------------------------------- - - ssl_factory = sslfactory.get_factory(self.opts.ssl_ca_cert, - self.opts.ssl_context) - - if need_keepalive_handler: - handlers.append(HTTPHandler()) - handlers.append(HTTPSHandler(ssl_factory)) - if need_range_handler: - handlers.extend( range_handlers ) - handlers.append( auth_handler ) - if self.opts.cache_openers: - self._opener = CachedOpenerDirector(ssl_factory, *handlers) - else: - self._opener = ssl_factory.create_opener(*handlers) - # OK, I don't like to do this, but otherwise, we end up with - # TWO user-agent headers. 
- self._opener.addheaders = [] - return self._opener + + def _retrieve(self, buf): + try: + if not self._prog_running: + if self.opts.progress_obj: + size = self.size + self._reget_length + self.opts.progress_obj.start(self._prog_reportname, + urllib.unquote(self.url), + self._prog_basename, + size=size, + text=self.opts.text) + self._prog_running = True + self.opts.progress_obj.update(self._amount_read) + + self._amount_read += len(buf) + self.fo.write(buf) + return len(buf) + except KeyboardInterrupt: + return -1 + + def _hdr_retrieve(self, buf): + if self._over_max_size(cur=len(self._hdr_dump), + max_size=self.opts.max_header_size): + return -1 + try: + self._hdr_dump += buf + # we have to get the size before we do the progress obj start + # but we can't do that w/o making it do 2 connects, which sucks + # so we cheat and stuff it in here in the hdr_retrieve + if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1: + length = buf.split(':')[1] + self.size = int(length) + elif self.scheme in ['ftp']: + s = None + if buf.startswith('213 '): + s = buf[3:].strip() + elif buf.startswith('150 '): + s = parse150(buf) + if s: + self.size = int(s) + + return len(buf) + except KeyboardInterrupt: + return pycurl.READFUNC_ABORT + + def _return_hdr_obj(self): + if self._parsed_hdr: + return self._parsed_hdr + statusend = self._hdr_dump.find('\n') + hdrfp = StringIO() + hdrfp.write(self._hdr_dump[statusend:]) + self._parsed_hdr = mimetools.Message(hdrfp) + return self._parsed_hdr + + hdr = property(_return_hdr_obj) + http_code = property(fget= + lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE)) + + def _set_opts(self, opts={}): + # XXX + if not opts: + opts = self.opts + + + # defaults we're always going to set + self.curl_obj.setopt(pycurl.NOPROGRESS, False) + self.curl_obj.setopt(pycurl.NOSIGNAL, True) + self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve) + self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve) + self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) + self.curl_obj.setopt(pycurl.FAILONERROR, True) + self.curl_obj.setopt(pycurl.OPT_FILETIME, True) - def _do_open(self): - opener = self._get_opener() + if DEBUG: + self.curl_obj.setopt(pycurl.VERBOSE, True) + if opts.user_agent: + self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent) + + # maybe to be options later + self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) + self.curl_obj.setopt(pycurl.MAXREDIRS, 5) + + # timeouts + timeout = 300 + if opts.timeout: + timeout = int(opts.timeout) + self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout) + + # ssl options + if self.scheme == 'https': + if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs + self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert) + self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert) + self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer) + self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host) + if opts.ssl_key: + self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key) + if opts.ssl_key_type: + self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type) + if opts.ssl_cert: + self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert) + if opts.ssl_cert_type: + self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) + if opts.ssl_key_pass: + self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass) + + #headers: + if opts.http_headers and self.scheme in ('http', 'https'): + headers = [] + for (tag, content) in opts.http_headers: + headers.append('%s:%s' % (tag, 
content)) + self.curl_obj.setopt(pycurl.HTTPHEADER, headers) + + # ranges: + if opts.range or opts.reget: + range_str = self._build_range() + if range_str: + self.curl_obj.setopt(pycurl.RANGE, range_str) + + # throttle/bandwidth + if hasattr(opts, 'raw_throttle') and opts.raw_throttle(): + self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) + + # proxy settings + if opts.proxies: + for (scheme, proxy) in opts.proxies.items(): + if self.scheme in ('ftp'): # only set the ftp proxy for ftp items + if scheme not in ('ftp'): + continue + else: + if proxy == '_none_': proxy = "" + self.curl_obj.setopt(pycurl.PROXY, proxy) + elif self.scheme in ('http', 'https'): + if scheme not in ('http', 'https'): + continue + else: + if proxy == '_none_': proxy = "" + self.curl_obj.setopt(pycurl.PROXY, proxy) + + # FIXME username/password/auth settings - req = urllib2.Request(self.url, self.opts.data) # build request object - self._add_headers(req) # add misc headers that we need - self._build_range(req) # take care of reget and byterange stuff - - fo, hdr = self._make_request(req, opener) - if self.reget_time and self.opts.reget == 'check_timestamp': - # do this if we have a local file with known timestamp AND - # we're in check_timestamp reget mode. - fetch_again = 0 - try: - modified_tuple = hdr.getdate_tz('last-modified') - modified_stamp = rfc822.mktime_tz(modified_tuple) - if modified_stamp > self.reget_time: fetch_again = 1 - except (TypeError,): - fetch_again = 1 + #posts - simple - expects the fields as they are + if opts.data: + self.curl_obj.setopt(pycurl.POST, True) + self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data)) - if fetch_again: - # the server version is newer than the (incomplete) local - # version, so we should abandon the version we're getting - # and fetch the whole thing again. - fo.close() - self.opts.reget = None - del req.headers['Range'] - self._build_range(req) - fo, hdr = self._make_request(req, opener) - - (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url) - path = urllib.unquote(path) - if not (self.opts.progress_obj or self.opts.raw_throttle() \ - or self.opts.timeout): - # if we're not using the progress_obj, throttling, or timeout - # we can get a performance boost by going directly to - # the underlying fileobject for reads. 
- self.read = fo.read - if hasattr(fo, 'readline'): - self.readline = fo.readline - elif self.opts.progress_obj: - try: - length = int(hdr['Content-Length']) - length = length + self._amount_read # Account for regets - except (KeyError, ValueError, TypeError): - length = None - - self.opts.progress_obj.start(str(self.filename), - urllib.unquote(self.url), - os.path.basename(path), - length, text=self.opts.text) - self.opts.progress_obj.update(0) - (self.fo, self.hdr) = (fo, hdr) - - def _add_headers(self, req): - if self.opts.user_agent: - req.add_header('User-agent', self.opts.user_agent) - try: req_type = req.get_type() - except ValueError: req_type = None - if self.opts.http_headers and req_type in ('http', 'https'): - for h, v in self.opts.http_headers: - req.add_header(h, v) - if self.opts.ftp_headers and req_type == 'ftp': - for h, v in self.opts.ftp_headers: - req.add_header(h, v) + # our url + self.curl_obj.setopt(pycurl.URL, self.url) + + + def _do_perform(self): + if self._complete: + return + + try: + self.curl_obj.perform() + except pycurl.error, e: + # XXX - break some of these out a bit more clearly + # to other URLGrabErrors from + # http://curl.haxx.se/libcurl/c/libcurl-errors.html + # this covers e.args[0] == 22 pretty well - which will be common + + code = self.http_code + errcode = e.args[0] + if self._error[0]: + errcode = self._error[0] + + if errcode == 23 and code >= 200 and code < 299: + err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) + err.url = self.url + + # this is probably wrong but ultimately this is what happens + # we have a legit http code and a pycurl 'writer failed' code + # which almost always means something aborted it from outside + # since we cannot know what it is -I'm banking on it being + # a ctrl-c. XXXX - if there's a way of going back two raises to + # figure out what aborted the pycurl process FIXME + raise KeyboardInterrupt + + elif errcode == 28: + err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) + err.url = self.url + raise err + elif errcode == 35: + msg = _("problem making ssl connection") + err = URLGrabError(14, msg) + err.url = self.url + raise err + elif errcode == 37: + msg = _("Could not open/read %s") % (self.url) + err = URLGrabError(14, msg) + err.url = self.url + raise err + + elif errcode == 42: + err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) + err.url = self.url + # this is probably wrong but ultimately this is what happens + # we have a legit http code and a pycurl 'writer failed' code + # which almost always means something aborted it from outside + # since we cannot know what it is -I'm banking on it being + # a ctrl-c. 
XXXX - if there's a way of going back two raises to + # figure out what aborted the pycurl process FIXME + raise KeyboardInterrupt + + elif errcode == 58: + msg = _("problem with the local client certificate") + err = URLGrabError(14, msg) + err.url = self.url + raise err + + elif errcode == 60: + msg = _("client cert cannot be verified or client cert incorrect") + err = URLGrabError(14, msg) + err.url = self.url + raise err + + elif errcode == 63: + if self._error[1]: + msg = self._error[1] + else: + msg = _("Max download size exceeded on %s") % (self.url) + err = URLGrabError(14, msg) + err.url = self.url + raise err + + elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it + msg = 'HTTP Error %s : %s ' % (self.http_code, self.url) + else: + msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1])) + code = errcode + err = URLGrabError(14, msg) + err.code = code + err.exception = e + raise err - def _build_range(self, req): - self.reget_time = None - self.append = 0 + def _do_open(self): + self.curl_obj = _curl_cache + self.curl_obj.reset() # reset all old settings away, just in case + # setup any ranges + self._set_opts() + self._do_grab() + return self.fo + + def _add_headers(self): + pass + + def _build_range(self): reget_length = 0 rt = None - if have_range and self.opts.reget and type(self.filename) == type(''): + if self.opts.reget and type(self.filename) in types.StringTypes: # we have reget turned on and we're dumping to a file try: s = os.stat(self.filename) except OSError: pass else: - self.reget_time = s[ST_MTIME] - reget_length = s[ST_SIZE] + self.reget_time = s[stat.ST_MTIME] + reget_length = s[stat.ST_SIZE] # Set initial length when regetting self._amount_read = reget_length + self._reget_length = reget_length # set where we started from, too rt = reget_length, '' self.append = 1 if self.opts.range: - if not have_range: - raise URLGrabError(10, _('Byte range requested but range '\ - 'support unavailable')) rt = self.opts.range if rt[0]: rt = (rt[0] + reget_length, rt[1]) if rt: header = range_tuple_to_header(rt) - if header: req.add_header('Range', header) + if header: + return header.split('=')[1] + + def _make_request(self, req, opener): + #XXXX + # This doesn't do anything really, but we could use this + # instead of do_open() to catch a lot of crap errors as + # mstenner did before here + return (self.fo, self.hdr) + try: - if have_socket_timeout and self.opts.timeout: + if self.opts.timeout: old_to = socket.getdefaulttimeout() socket.setdefaulttimeout(self.opts.timeout) try: @@ -1172,50 +1362,99 @@ fo = opener.open(req) hdr = fo.info() except ValueError, e: - raise URLGrabError(1, _('Bad URL: %s') % (e, )) + err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, )) + err.url = self.url + raise err + except RangeError, e: - raise URLGrabError(9, str(e)) + err = URLGrabError(9, _('%s on %s') % (e, self.url)) + err.url = self.url + raise err except urllib2.HTTPError, e: - new_e = URLGrabError(14, str(e)) + new_e = URLGrabError(14, _('%s on %s') % (e, self.url)) new_e.code = e.code new_e.exception = e + new_e.url = self.url raise new_e except IOError, e: - if hasattr(e, 'reason') and have_socket_timeout and \ - isinstance(e.reason, TimeoutError): - raise URLGrabError(12, _('Timeout: %s') % (e, )) + if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout): + err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) + err.url = self.url + raise err else: - raise URLGrabError(4, _('IOError: %s') % (e, )) + err = URLGrabError(4, 
_('IOError on %s: %s') % (self.url, e)) + err.url = self.url + raise err + except OSError, e: - raise URLGrabError(5, _('OSError: %s') % (e, )) + err = URLGrabError(5, _('%s on %s') % (e, self.url)) + err.url = self.url + raise err + except HTTPException, e: - raise URLGrabError(7, _('HTTP Exception (%s): %s') % \ - (e.__class__.__name__, e)) + err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \ + (e.__class__.__name__, self.url, e)) + err.url = self.url + raise err + else: return (fo, hdr) def _do_grab(self): - """dump the file to self.filename.""" - if self.append: new_fo = open(self.filename, 'ab') - else: new_fo = open(self.filename, 'wb') - bs = 1024*8 - size = 0 - - block = self.read(bs) - size = size + len(block) - while block: - new_fo.write(block) - block = self.read(bs) - size = size + len(block) + """dump the file to a filename or StringIO buffer""" + + if self._complete: + return + _was_filename = False + if type(self.filename) in types.StringTypes and self.filename: + _was_filename = True + self._prog_reportname = str(self.filename) + self._prog_basename = os.path.basename(self.filename) + + if self.append: mode = 'ab' + else: mode = 'wb' + + if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \ + (self.filename, mode)) + try: + self.fo = open(self.filename, mode) + except IOError, e: + err = URLGrabError(16, _(\ + 'error opening local file from %s, IOError: %s') % (self.url, e)) + err.url = self.url + raise err + + else: + self._prog_reportname = 'MEMORY' + self._prog_basename = 'MEMORY' + + + self.fo = StringIO() + # if this is to be a tempfile instead.... + # it just makes crap in the tempdir + #fh, self._temp_name = mkstemp() + #self.fo = open(self._temp_name, 'wb') + + + self._do_perform() + - new_fo.close() - try: - modified_tuple = self.hdr.getdate_tz('last-modified') - modified_stamp = rfc822.mktime_tz(modified_tuple) - os.utime(self.filename, (modified_stamp, modified_stamp)) - except (TypeError,), e: pass - return size + if _was_filename: + # close it up + self.fo.flush() + self.fo.close() + # set the time + mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME) + if mod_time != -1: + os.utime(self.filename, (mod_time, mod_time)) + # re open it + self.fo = open(self.filename, 'r') + else: + #self.fo = open(self._temp_name, 'r') + self.fo.seek(0) + + self._complete = True def _fill_buffer(self, amt=None): """fill the buffer to contain at least 'amt' bytes by reading @@ -1233,7 +1472,9 @@ # if we've made it here, then we don't have enough in the buffer # and we need to read more. 
- + + if not self._complete: self._do_grab() #XXX cheater - change on ranges + buf = [self._rbuf] bufsize = len(self._rbuf) while amt is None or amt: @@ -1250,11 +1491,20 @@ try: new = self.fo.read(readamount) except socket.error, e: - raise URLGrabError(4, _('Socket Error: %s') % (e, )) - except TimeoutError, e: - raise URLGrabError(12, _('Timeout: %s') % (e, )) + err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e)) + err.url = self.url + raise err + + except socket.timeout, e: + raise URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) + err.url = self.url + raise err + except IOError, e: - raise URLGrabError(4, _('IOError: %s') %(e,)) + raise URLGrabError(4, _('IOError on %s: %s') %(self.url, e)) + err.url = self.url + raise err + newsize = len(new) if not newsize: break # no more to read @@ -1263,12 +1513,45 @@ bufsize = bufsize + newsize self._tsize = newsize self._amount_read = self._amount_read + newsize - if self.opts.progress_obj: - self.opts.progress_obj.update(self._amount_read) + #if self.opts.progress_obj: + # self.opts.progress_obj.update(self._amount_read) self._rbuf = string.join(buf, '') return + def _progress_update(self, download_total, downloaded, upload_total, uploaded): + if self._over_max_size(cur=self._amount_read-self._reget_length): + return -1 + + try: + if self._prog_running: + downloaded += self._reget_length + self.opts.progress_obj.update(downloaded) + except KeyboardInterrupt: + return -1 + + def _over_max_size(self, cur, max_size=None): + + if not max_size: + max_size = self.size + if self.opts.size: # if we set an opts size use that, no matter what + max_size = self.opts.size + if not max_size: return False # if we have None for all of the Max then this is dumb + if cur > max_size + max_size*.10: + + msg = _("Downloaded more than max size for %s: %s > %s") \ + % (self.url, cur, max_size) + self._error = (pycurl.E_FILESIZE_EXCEEDED, msg) + return True + return False + + def _to_utf8(self, obj, errors='replace'): + '''convert 'unicode' to an encoded utf-8 byte string ''' + # stolen from yum.i18n + if isinstance(obj, unicode): + obj = obj.encode('utf-8', errors) + return obj + def read(self, amt=None): self._fill_buffer(amt) if amt is None: @@ -1278,6 +1561,9 @@ return s def readline(self, limit=-1): + if not self._complete: self._do_grab() + return self.fo.readline() + i = string.find(self._rbuf, '\n') while i < 0 and not (0 < limit <= len(self._rbuf)): L = len(self._rbuf) @@ -1293,43 +1579,13 @@ return s def close(self): - if self.opts.progress_obj: + if self._prog_running: self.opts.progress_obj.end(self._amount_read) self.fo.close() - if self.opts.close_connection: - try: self.fo.close_connection() - except: pass - -_handler_cache = [] -def CachedOpenerDirector(ssl_factory = None, *handlers): - for (cached_handlers, opener) in _handler_cache: - if cached_handlers == handlers: - for handler in opener.handlers: - handler.add_parent(opener) - return opener - if not ssl_factory: - ssl_factory = sslfactory.get_factory() - opener = ssl_factory.create_opener(*handlers) - _handler_cache.append( (handlers, opener) ) - return opener - -_proxy_cache = [] -def CachedProxyHandler(proxies): - for (pdict, handler) in _proxy_cache: - if pdict == proxies: - if DEBUG: DEBUG.debug('re-using proxy settings: %s', proxies) - break - else: - for k, v in proxies.items(): - utype, url = urllib.splittype(v) - host, other = urllib.splithost(url) - if (utype is None) or (host is None): - raise URLGrabError(13, _('Bad proxy URL: %s') % v) - - if DEBUG: 
DEBUG.info('creating new proxy handler: %s', proxies) - handler = urllib2.ProxyHandler(proxies) - _proxy_cache.append( (proxies, handler) ) - return handler + + +_curl_cache = pycurl.Curl() # make one and reuse it over and over and over + ##################################################################### # DEPRECATED FUNCTIONS @@ -1368,7 +1624,6 @@ ##################################################################### # TESTING def _main_test(): - import sys try: url, filename = sys.argv[1:3] except ValueError: print 'usage:', sys.argv[0], \ @@ -1395,7 +1650,6 @@ def _retry_test(): - import sys try: url, filename = sys.argv[1:3] except ValueError: print 'usage:', sys.argv[0], \ @@ -1430,7 +1684,7 @@ else: print 'LOCAL FILE:', name def _file_object_test(filename=None): - import random, cStringIO, sys + import cStringIO if filename is None: filename = __file__ print 'using file "%s" for comparisons' % filename @@ -1444,7 +1698,7 @@ _test_file_object_readlines]: fo_input = cStringIO.StringIO(s_input) fo_output = cStringIO.StringIO() - wrapper = URLGrabberFileObject(fo_input, None, 0) + wrapper = PyCurlFileObject(fo_input, None, 0) print 'testing %-30s ' % testfunc.__name__, testfunc(wrapper, fo_output) s_output = fo_output.getvalue()
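The core of this patch is replacing the urllib2 opener chain with a single, reused pycurl.Curl handle driven by callbacks (PyCurlFileObject._retrieve, _hdr_retrieve and _progress_update above). As a rough, self-contained sketch of that pattern, assuming only stock pycurl and a placeholder URL, and not reproducing the module's actual code:

    import pycurl
    from StringIO import StringIO

    body = StringIO()
    hdrs = StringIO()

    def progress(download_total, downloaded, upload_total, uploaded):
        # returning a non-zero value aborts the transfer; PyCurlFileObject
        # uses this to enforce its maximum-size option (_over_max_size)
        return 0

    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://example.org/file.txt')
    c.setopt(pycurl.WRITEFUNCTION, body.write)    # body chunks, cf. _retrieve()
    c.setopt(pycurl.HEADERFUNCTION, hdrs.write)   # header lines, cf. _hdr_retrieve()
    c.setopt(pycurl.NOPROGRESS, False)
    c.setopt(pycurl.PROGRESSFUNCTION, progress)   # cf. _progress_update()
    c.setopt(pycurl.FOLLOWLOCATION, True)
    c.setopt(pycurl.FAILONERROR, True)            # HTTP errors surface as pycurl.error
    try:
        c.perform()
    except pycurl.error, e:
        errcode, errmsg = e.args[0], e.args[1]    # mapped onto URLGrabError codes in _do_perform()
        print 'pycurl error %s: %s' % (errcode, errmsg)
    else:
        print c.getinfo(pycurl.RESPONSE_CODE), len(body.getvalue())

Reusing one Curl object (as _curl_cache does) avoids repeated handle setup, at the cost of calling reset() before each request, which is what _do_open() does.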