Gentoo's Bugzilla – Attachment 236657 Details for Bug 314435: dev-python/urlgrabber-3.9.1 failed
Attachment: grabber.py.diff

Description: grabber.py.diff
Filename:    grabber.py.diff
MIME Type:   text/plain
Creator:     David Abbott
Created:     2010-06-26 18:15:30 UTC
Size:        46.33 KB
Flags:       patch, obsolete
>diff -u urlgrabber-3.1.0/urlgrabber/grabber.py urlgrabber-3.9.1/urlgrabber/grabber.py >--- urlgrabber-3.1.0/urlgrabber/grabber.py 2006-09-21 20:58:05.000000000 -0400 >+++ urlgrabber-3.9.1/urlgrabber/grabber.py 2010-06-26 13:12:59.000000000 -0400 >@@ -16,6 +16,7 @@ > > # This file is part of urlgrabber, a high-level cross-protocol url-grabber > # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko >+# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal > > """A high-level cross-protocol url-grabber. > >@@ -55,8 +56,9 @@ > > text = None > >- specifies an alternativ text item in the beginning of the progress >- bar line. If not given, the basename of the file is used. >+ specifies alternative text to be passed to the progress meter >+ object. If not given, the default progress meter will use the >+ basename of the file. > > throttle = 1.0 > >@@ -157,16 +159,11 @@ > partial file or directory name. > > opener = None >- >- Overrides the default urllib2.OpenerDirector provided to urllib2 >- when making requests. This option exists so that the urllib2 >- handler chain may be customized. Note that the range, reget, >- proxy, and keepalive features require that custom handlers be >- provided to urllib2 in order to function properly. If an opener >- option is provided, no attempt is made by urlgrabber to ensure >- chain integrity. You are responsible for ensuring that any >- extension handlers are present if said features are required. >- >+ No-op when using the curl backend (default) >+ >+ cache_openers = True >+ No-op when using the curl backend (default) >+ > data = None > > Only relevant for the HTTP family (and ignored for other >@@ -179,6 +176,78 @@ > badly and if you do not use the proper case (shown here), your > values will be overridden with the defaults. > >+ urlparser = URLParser() >+ >+ The URLParser class handles pre-processing of URLs, including >+ auth-handling for user/pass encoded in http urls, file handing >+ (that is, filenames not sent as a URL), and URL quoting. If you >+ want to override any of this behavior, you can pass in a >+ replacement instance. See also the 'quote' option. >+ >+ quote = None >+ >+ Whether or not to quote the path portion of a url. >+ quote = 1 -> quote the URLs (they're not quoted yet) >+ quote = 0 -> do not quote them (they're already quoted) >+ quote = None -> guess what to do >+ >+ This option only affects proper urls like 'file:///etc/passwd'; it >+ does not affect 'raw' filenames like '/etc/passwd'. The latter >+ will always be quoted as they are converted to URLs. Also, only >+ the path part of a url is quoted. If you need more fine-grained >+ control, you should probably subclass URLParser and pass it in via >+ the 'urlparser' option. >+ >+ ssl_ca_cert = None >+ >+ this option can be used if M2Crypto is available and will be >+ ignored otherwise. If provided, it will be used to create an SSL >+ context. If both ssl_ca_cert and ssl_context are provided, then >+ ssl_context will be ignored and a new context will be created from >+ ssl_ca_cert. 
>+ >+ ssl_context = None >+ >+ No-op when using the curl backend (default) >+ >+ >+ self.ssl_verify_peer = True >+ >+ Check the server's certificate to make sure it is valid with what our CA validates >+ >+ self.ssl_verify_host = True >+ >+ Check the server's hostname to make sure it matches the certificate DN >+ >+ self.ssl_key = None >+ >+ Path to the key the client should use to connect/authenticate with >+ >+ self.ssl_key_type = 'PEM' >+ >+ PEM or DER - format of key >+ >+ self.ssl_cert = None >+ >+ Path to the ssl certificate the client should use to to authenticate with >+ >+ self.ssl_cert_type = 'PEM' >+ >+ PEM or DER - format of certificate >+ >+ self.ssl_key_pass = None >+ >+ password to access the ssl_key >+ >+ self.size = None >+ >+ size (in bytes) or Maximum size of the thing being downloaded. >+ This is mostly to keep us from exploding with an endless datastream >+ >+ self.max_header_size = 2097152 >+ >+ Maximum size (in bytes) of the headers. >+ > > RETRY RELATED ARGUMENTS > >@@ -283,28 +352,6 @@ > passed the same arguments, so you could use the same function for > both. > >- urlparser = URLParser() >- >- The URLParser class handles pre-processing of URLs, including >- auth-handling for user/pass encoded in http urls, file handing >- (that is, filenames not sent as a URL), and URL quoting. If you >- want to override any of this behavior, you can pass in a >- replacement instance. See also the 'quote' option. >- >- quote = None >- >- Whether or not to quote the path portion of a url. >- quote = 1 -> quote the URLs (they're not quoted yet) >- quote = 0 -> do not quote them (they're already quoted) >- quote = None -> guess what to do >- >- This option only affects proper urls like 'file:///etc/passwd'; it >- does not affect 'raw' filenames like '/etc/passwd'. The latter >- will always be quoted as they are converted to URLs. Also, only >- the path part of a url is quoted. If you need more fine-grained >- control, you should probably subclass URLParser and pass it in via >- the 'urlparser' option. >- > BANDWIDTH THROTTLING > > urlgrabber supports throttling via two values: throttle and >@@ -364,18 +411,26 @@ > > """ > >-# $Id: grabber.py,v 1.48 2006/09/22 00:58:05 mstenner Exp $ >+ > > import os > import os.path > import sys > import urlparse >-import rfc822 > import time > import string > import urllib > import urllib2 >-from stat import * # S_* and ST_* >+import mimetools >+import thread >+import types >+import stat >+import pycurl >+from ftplib import parse150 >+from StringIO import StringIO >+from httplib import HTTPException >+import socket >+from byterange import range_tuple_normalize, range_tuple_to_header, RangeError > > ######################################################################## > # MODULE INITIALIZATION >@@ -385,55 +440,6 @@ > except: > __version__ = '???' > >-import sslfactory >- >-auth_handler = urllib2.HTTPBasicAuthHandler( \ >- urllib2.HTTPPasswordMgrWithDefaultRealm()) >- >-try: >- from i18n import _ >-except ImportError, msg: >- def _(st): return st >- >-try: >- from httplib import HTTPException >-except ImportError, msg: >- HTTPException = None >- >-try: >- # This is a convenient way to make keepalive optional. >- # Just rename the module so it can't be imported. 
>- import keepalive >- from keepalive import HTTPHandler, HTTPSHandler >- have_keepalive = True >-except ImportError, msg: >- have_keepalive = False >- >-try: >- # add in range support conditionally too >- import byterange >- from byterange import HTTPRangeHandler, HTTPSRangeHandler, \ >- FileRangeHandler, FTPRangeHandler, range_tuple_normalize, \ >- range_tuple_to_header, RangeError >-except ImportError, msg: >- range_handlers = () >- RangeError = None >- have_range = 0 >-else: >- range_handlers = (HTTPRangeHandler(), HTTPSRangeHandler(), >- FileRangeHandler(), FTPRangeHandler()) >- have_range = 1 >- >- >-# check whether socket timeout support is available (Python >= 2.3) >-import socket >-try: >- TimeoutError = socket.timeout >- have_socket_timeout = True >-except AttributeError: >- TimeoutError = None >- have_socket_timeout = False >- > ######################################################################## > # functions for debugging output. These functions are here because they > # are also part of the module initialization. >@@ -456,14 +462,8 @@ > > global DEBUG > DEBUG = DBOBJ >- if have_keepalive and keepalive.DEBUG is None: >- keepalive.DEBUG = DBOBJ >- if have_range and byterange.DEBUG is None: >- byterange.DEBUG = DBOBJ >- if sslfactory.DEBUG is None: >- sslfactory.DEBUG = DBOBJ > >-def _init_default_logger(): >+def _init_default_logger(logspec=None): > '''Examines the environment variable URLGRABBER_DEBUG and creates > a logging object (logging.logger) based on the contents. It takes > the form >@@ -489,9 +489,12 @@ > collect the code into a nice block.''' > > try: >- dbinfo = os.environ['URLGRABBER_DEBUG'].split(',') >+ if logspec is None: >+ logspec = os.environ['URLGRABBER_DEBUG'] >+ dbinfo = logspec.split(',') > import logging >- level = logging._levelNames.get(dbinfo[0], int(dbinfo[0])) >+ level = logging._levelNames.get(dbinfo[0], None) >+ if level is None: level = int(dbinfo[0]) > if level < 1: raise ValueError() > > formatter = logging.Formatter('%(asctime)s %(message)s') >@@ -508,7 +511,19 @@ > DBOBJ = None > set_logger(DBOBJ) > >+def _log_package_state(): >+ if not DEBUG: return >+ DEBUG.info('urlgrabber version = %s' % __version__) >+ DEBUG.info('trans function "_" = %s' % _) >+ > _init_default_logger() >+_log_package_state() >+ >+ >+# normally this would be from i18n or something like it ... >+def _(st): >+ return st >+ > ######################################################################## > # END MODULE INITIALIZATION > ######################################################################## >@@ -536,6 +551,7 @@ > 13 - malformed proxy url > 14 - HTTPError (includes .code and .exception attributes) > 15 - user abort >+ 16 - error writing to local file > > MirrorGroup error codes (256 -- 511) > 256 - No more mirrors left to try >@@ -567,7 +583,9 @@ > # or simply > print e #### print '[Errno %i] %s' % (e.errno, e.strerror) > """ >- pass >+ def __init__(self, *args): >+ IOError.__init__(self, *args) >+ self.url = "No url specified" > > class CallbackObject: > """Container for returned callback data. 
>@@ -661,7 +679,7 @@ > quote = 0 # pathname2url quotes, so we won't do it again > > if scheme in ['http', 'https']: >- parts = self.process_http(parts) >+ parts = self.process_http(parts, url) > > if quote is None: > quote = self.guess_should_quote(parts) >@@ -678,19 +696,9 @@ > url = prefix + '/' + url > return url > >- def process_http(self, parts): >+ def process_http(self, parts, url): > (scheme, host, path, parm, query, frag) = parts >- >- if '@' in host and auth_handler: >- try: >- user_pass, host = host.split('@', 1) >- if ':' in user_pass: >- user, password = user_pass.split(':', 1) >- except ValueError, e: >- raise URLGrabError(1, _('Bad URL: %s') % url) >- if DEBUG: DEBUG.info('adding HTTP auth: %s, %s', user, password) >- auth_handler.add_password(None, host, user, password) >- >+ # TODO: auth-parsing here, maybe? pycurl doesn't really need it > return (scheme, host, path, parm, query, frag) > > def quote(self, parts): >@@ -771,7 +779,7 @@ > def _set_attributes(self, **kwargs): > """Update object attributes with those provided in kwargs.""" > self.__dict__.update(kwargs) >- if have_range and kwargs.has_key('range'): >+ if kwargs.has_key('range'): > # normalize the supplied range value > self.range = range_tuple_normalize(self.range) > if not self.reget in [None, 'simple', 'check_timestamp']: >@@ -808,8 +816,36 @@ > self.data = None > self.urlparser = URLParser() > self.quote = None >- self.ssl_ca_cert = None >- self.ssl_context = None >+ self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb >+ self.ssl_context = None # no-op in pycurl >+ self.ssl_verify_peer = True # check peer's cert for authenticityb >+ self.ssl_verify_host = True # make sure who they are and who the cert is for matches >+ self.ssl_key = None # client key >+ self.ssl_key_type = 'PEM' #(or DER) >+ self.ssl_cert = None # client cert >+ self.ssl_cert_type = 'PEM' # (or DER) >+ self.ssl_key_pass = None # password to access the key >+ self.size = None # if we know how big the thing we're getting is going >+ # to be. this is ultimately a MAXIMUM size for the file >+ self.max_header_size = 2097152 #2mb seems reasonable for maximum header size >+ >+ def __repr__(self): >+ return self.format() >+ >+ def format(self, indent=' '): >+ keys = self.__dict__.keys() >+ if self.delegate is not None: >+ keys.remove('delegate') >+ keys.sort() >+ s = '{\n' >+ for k in keys: >+ s = s + indent + '%-15s: %s,\n' % \ >+ (repr(k), repr(self.__dict__[k])) >+ if self.delegate: >+ df = self.delegate.format(indent + ' ') >+ s = s + indent + '%-15s: %s\n' % ("'delegate'", df) >+ s = s + indent + '}' >+ return s > > class URLGrabber: > """Provides easy opening of URLs with a variety of options. >@@ -878,9 +914,10 @@ > like any other file object. > """ > opts = self.opts.derive(**kwargs) >+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) > (url,parts) = opts.urlparser.parse(url, opts) > def retryfunc(opts, url): >- return URLGrabberFileObject(url, filename=None, opts=opts) >+ return PyCurlFileObject(url, filename=None, opts=opts) > return self._retry(opts, retryfunc, url) > > def urlgrab(self, url, filename=None, **kwargs): >@@ -890,6 +927,7 @@ > different from the passed-in filename if copy_local == 0. 
> """ > opts = self.opts.derive(**kwargs) >+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) > (url,parts) = opts.urlparser.parse(url, opts) > (scheme, host, path, parm, query, frag) = parts > if filename is None: >@@ -901,16 +939,28 @@ > if host: > path = os.path.normpath('//' + host + path) > if not os.path.exists(path): >- raise URLGrabError(2, >+ err = URLGrabError(2, > _('Local file does not exist: %s') % (path, )) >+ err.url = url >+ raise err > elif not os.path.isfile(path): >- raise URLGrabError(3, >- _('Not a normal file: %s') % (path, )) >+ err = URLGrabError(3, >+ _('Not a normal file: %s') % (path, )) >+ err.url = url >+ raise err >+ > elif not opts.range: >+ if not opts.checkfunc is None: >+ cb_func, cb_args, cb_kwargs = \ >+ self._make_callback(opts.checkfunc) >+ obj = CallbackObject() >+ obj.filename = path >+ obj.url = url >+ apply(cb_func, (obj, )+cb_args, cb_kwargs) > return path > > def retryfunc(opts, url, filename): >- fo = URLGrabberFileObject(url, filename, opts) >+ fo = PyCurlFileObject(url, filename, opts) > try: > fo._do_grab() > if not opts.checkfunc is None: >@@ -934,12 +984,13 @@ > into memory, but don't use too much' > """ > opts = self.opts.derive(**kwargs) >+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) > (url,parts) = opts.urlparser.parse(url, opts) > if limit is not None: > limit = limit + 1 > > def retryfunc(opts, url, limit): >- fo = URLGrabberFileObject(url, filename=None, opts=opts) >+ fo = PyCurlFileObject(url, filename=None, opts=opts) > s = '' > try: > # this is an unfortunate thing. Some file-like objects >@@ -962,8 +1013,11 @@ > > s = self._retry(opts, retryfunc, url, limit) > if limit and len(s) > limit: >- raise URLGrabError(8, >- _('Exceeded limit (%i): %s') % (limit, url)) >+ err = URLGrabError(8, >+ _('Exceeded limit (%i): %s') % (limit, url)) >+ err.url = url >+ raise err >+ > return s > > def _make_callback(self, callback_obj): >@@ -976,192 +1030,328 @@ > # NOTE: actual defaults are set in URLGrabberOptions > default_grabber = URLGrabber() > >-class URLGrabberFileObject: >- """This is a file-object wrapper that supports progress objects >- and throttling. >- >- This exists to solve the following problem: lets say you want to >- drop-in replace a normal open with urlopen. You want to use a >- progress meter and/or throttling, but how do you do that without >- rewriting your code? Answer: urlopen will return a wrapped file >- object that does the progress meter and-or throttling internally. >- """ > >+class PyCurlFileObject(): > def __init__(self, url, filename, opts): >+ self.fo = None >+ self._hdr_dump = '' >+ self._parsed_hdr = None > self.url = url >+ self.scheme = urlparse.urlsplit(self.url)[0] > self.filename = filename >+ self.append = False >+ self.reget_time = None > self.opts = opts >- self.fo = None >+ if self.opts.reget == 'check_timestamp': >+ raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this." >+ self._complete = False > self._rbuf = '' > self._rbufsize = 1024*8 > self._ttime = time.time() > self._tsize = 0 > self._amount_read = 0 >- self._opener = None >+ self._reget_length = 0 >+ self._prog_running = False >+ self._error = (None, None) >+ self.size = None > self._do_open() > >+ > def __getattr__(self, name): > """This effectively allows us to wrap at the instance level. > Any attribute not found in _this_ object will be searched for > in self.fo. 
This includes methods.""" >+ > if hasattr(self.fo, name): > return getattr(self.fo, name) > raise AttributeError, name >- >- def _get_opener(self): >- """Build a urllib2 OpenerDirector based on request options.""" >- if self.opts.opener: >- return self.opts.opener >- elif self._opener is None: >- handlers = [] >- need_keepalive_handler = (have_keepalive and self.opts.keepalive) >- need_range_handler = (range_handlers and \ >- (self.opts.range or self.opts.reget)) >- # if you specify a ProxyHandler when creating the opener >- # it _must_ come before all other handlers in the list or urllib2 >- # chokes. >- if self.opts.proxies: >- handlers.append( CachedProxyHandler(self.opts.proxies) ) >- >- # ------------------------------------------------------- >- # OK, these next few lines are a serious kludge to get >- # around what I think is a bug in python 2.2's >- # urllib2. The basic idea is that default handlers >- # get applied first. If you override one (like a >- # proxy handler), then the default gets pulled, but >- # the replacement goes on the end. In the case of >- # proxies, this means the normal handler picks it up >- # first and the proxy isn't used. Now, this probably >- # only happened with ftp or non-keepalive http, so not >- # many folks saw it. The simple approach to fixing it >- # is just to make sure you override the other >- # conflicting defaults as well. I would LOVE to see >- # these go way or be dealt with more elegantly. The >- # problem isn't there after 2.2. -MDS 2005/02/24 >- if not need_keepalive_handler: >- handlers.append( urllib2.HTTPHandler() ) >- if not need_range_handler: >- handlers.append( urllib2.FTPHandler() ) >- # ------------------------------------------------------- >- >- ssl_factory = sslfactory.get_factory(self.opts.ssl_ca_cert, >- self.opts.ssl_context) >- >- if need_keepalive_handler: >- handlers.append(HTTPHandler()) >- handlers.append(HTTPSHandler(ssl_factory)) >- if need_range_handler: >- handlers.extend( range_handlers ) >- handlers.append( auth_handler ) >- if self.opts.cache_openers: >- self._opener = CachedOpenerDirector(ssl_factory, *handlers) >- else: >- self._opener = ssl_factory.create_opener(*handlers) >- # OK, I don't like to do this, but otherwise, we end up with >- # TWO user-agent headers. 
>- self._opener.addheaders = [] >- return self._opener >+ >+ def _retrieve(self, buf): >+ try: >+ if not self._prog_running: >+ if self.opts.progress_obj: >+ size = self.size + self._reget_length >+ self.opts.progress_obj.start(self._prog_reportname, >+ urllib.unquote(self.url), >+ self._prog_basename, >+ size=size, >+ text=self.opts.text) >+ self._prog_running = True >+ self.opts.progress_obj.update(self._amount_read) >+ >+ self._amount_read += len(buf) >+ self.fo.write(buf) >+ return len(buf) >+ except KeyboardInterrupt: >+ return -1 >+ >+ def _hdr_retrieve(self, buf): >+ if self._over_max_size(cur=len(self._hdr_dump), >+ max_size=self.opts.max_header_size): >+ return -1 >+ try: >+ self._hdr_dump += buf >+ # we have to get the size before we do the progress obj start >+ # but we can't do that w/o making it do 2 connects, which sucks >+ # so we cheat and stuff it in here in the hdr_retrieve >+ if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1: >+ length = buf.split(':')[1] >+ self.size = int(length) >+ elif self.scheme in ['ftp']: >+ s = None >+ if buf.startswith('213 '): >+ s = buf[3:].strip() >+ elif buf.startswith('150 '): >+ s = parse150(buf) >+ if s: >+ self.size = int(s) >+ >+ return len(buf) >+ except KeyboardInterrupt: >+ return pycurl.READFUNC_ABORT >+ >+ def _return_hdr_obj(self): >+ if self._parsed_hdr: >+ return self._parsed_hdr >+ statusend = self._hdr_dump.find('\n') >+ hdrfp = StringIO() >+ hdrfp.write(self._hdr_dump[statusend:]) >+ self._parsed_hdr = mimetools.Message(hdrfp) >+ return self._parsed_hdr >+ >+ hdr = property(_return_hdr_obj) >+ http_code = property(fget= >+ lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE)) >+ >+ def _set_opts(self, opts={}): >+ # XXX >+ if not opts: >+ opts = self.opts >+ >+ >+ # defaults we're always going to set >+ self.curl_obj.setopt(pycurl.NOPROGRESS, False) >+ self.curl_obj.setopt(pycurl.NOSIGNAL, True) >+ self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve) >+ self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve) >+ self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) >+ self.curl_obj.setopt(pycurl.FAILONERROR, True) >+ self.curl_obj.setopt(pycurl.OPT_FILETIME, True) > >- def _do_open(self): >- opener = self._get_opener() >+ if DEBUG: >+ self.curl_obj.setopt(pycurl.VERBOSE, True) >+ if opts.user_agent: >+ self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent) >+ >+ # maybe to be options later >+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) >+ self.curl_obj.setopt(pycurl.MAXREDIRS, 5) >+ >+ # timeouts >+ timeout = 300 >+ if opts.timeout: >+ timeout = int(opts.timeout) >+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout) >+ >+ # ssl options >+ if self.scheme == 'https': >+ if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs >+ self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert) >+ self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert) >+ self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer) >+ self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host) >+ if opts.ssl_key: >+ self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key) >+ if opts.ssl_key_type: >+ self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type) >+ if opts.ssl_cert: >+ self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert) >+ if opts.ssl_cert_type: >+ self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) >+ if opts.ssl_key_pass: >+ self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass) >+ >+ #headers: >+ if opts.http_headers and self.scheme in 
('http', 'https'): >+ headers = [] >+ for (tag, content) in opts.http_headers: >+ headers.append('%s:%s' % (tag, content)) >+ self.curl_obj.setopt(pycurl.HTTPHEADER, headers) >+ >+ # ranges: >+ if opts.range or opts.reget: >+ range_str = self._build_range() >+ if range_str: >+ self.curl_obj.setopt(pycurl.RANGE, range_str) >+ >+ # throttle/bandwidth >+ if hasattr(opts, 'raw_throttle') and opts.raw_throttle(): >+ self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) >+ >+ # proxy settings >+ if opts.proxies: >+ for (scheme, proxy) in opts.proxies.items(): >+ if self.scheme in ('ftp'): # only set the ftp proxy for ftp items >+ if scheme not in ('ftp'): >+ continue >+ else: >+ if proxy == '_none_': proxy = "" >+ self.curl_obj.setopt(pycurl.PROXY, proxy) >+ elif self.scheme in ('http', 'https'): >+ if scheme not in ('http', 'https'): >+ continue >+ else: >+ if proxy == '_none_': proxy = "" >+ self.curl_obj.setopt(pycurl.PROXY, proxy) >+ >+ # FIXME username/password/auth settings > >- req = urllib2.Request(self.url, self.opts.data) # build request object >- self._add_headers(req) # add misc headers that we need >- self._build_range(req) # take care of reget and byterange stuff >- >- fo, hdr = self._make_request(req, opener) >- if self.reget_time and self.opts.reget == 'check_timestamp': >- # do this if we have a local file with known timestamp AND >- # we're in check_timestamp reget mode. >- fetch_again = 0 >- try: >- modified_tuple = hdr.getdate_tz('last-modified') >- modified_stamp = rfc822.mktime_tz(modified_tuple) >- if modified_stamp > self.reget_time: fetch_again = 1 >- except (TypeError,): >- fetch_again = 1 >+ #posts - simple - expects the fields as they are >+ if opts.data: >+ self.curl_obj.setopt(pycurl.POST, True) >+ self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data)) > >- if fetch_again: >- # the server version is newer than the (incomplete) local >- # version, so we should abandon the version we're getting >- # and fetch the whole thing again. >- fo.close() >- self.opts.reget = None >- del req.headers['Range'] >- self._build_range(req) >- fo, hdr = self._make_request(req, opener) >- >- (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url) >- path = urllib.unquote(path) >- if not (self.opts.progress_obj or self.opts.raw_throttle() \ >- or self.opts.timeout): >- # if we're not using the progress_obj, throttling, or timeout >- # we can get a performance boost by going directly to >- # the underlying fileobject for reads. 
>- self.read = fo.read >- if hasattr(fo, 'readline'): >- self.readline = fo.readline >- elif self.opts.progress_obj: >- try: >- length = int(hdr['Content-Length']) >- length = length + self._amount_read # Account for regets >- except (KeyError, ValueError, TypeError): >- length = None >- >- self.opts.progress_obj.start(str(self.filename), >- urllib.unquote(self.url), >- os.path.basename(path), >- length, text=self.opts.text) >- self.opts.progress_obj.update(0) >- (self.fo, self.hdr) = (fo, hdr) >- >- def _add_headers(self, req): >- if self.opts.user_agent: >- req.add_header('User-agent', self.opts.user_agent) >- try: req_type = req.get_type() >- except ValueError: req_type = None >- if self.opts.http_headers and req_type in ('http', 'https'): >- for h, v in self.opts.http_headers: >- req.add_header(h, v) >- if self.opts.ftp_headers and req_type == 'ftp': >- for h, v in self.opts.ftp_headers: >- req.add_header(h, v) >+ # our url >+ self.curl_obj.setopt(pycurl.URL, self.url) >+ >+ >+ def _do_perform(self): >+ if self._complete: >+ return >+ >+ try: >+ self.curl_obj.perform() >+ except pycurl.error, e: >+ # XXX - break some of these out a bit more clearly >+ # to other URLGrabErrors from >+ # http://curl.haxx.se/libcurl/c/libcurl-errors.html >+ # this covers e.args[0] == 22 pretty well - which will be common >+ >+ code = self.http_code >+ errcode = e.args[0] >+ if self._error[0]: >+ errcode = self._error[0] >+ >+ if errcode == 23 and code >= 200 and code < 299: >+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) >+ err.url = self.url >+ >+ # this is probably wrong but ultimately this is what happens >+ # we have a legit http code and a pycurl 'writer failed' code >+ # which almost always means something aborted it from outside >+ # since we cannot know what it is -I'm banking on it being >+ # a ctrl-c. XXXX - if there's a way of going back two raises to >+ # figure out what aborted the pycurl process FIXME >+ raise KeyboardInterrupt >+ >+ elif errcode == 28: >+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) >+ err.url = self.url >+ raise err >+ elif errcode == 35: >+ msg = _("problem making ssl connection") >+ err = URLGrabError(14, msg) >+ err.url = self.url >+ raise err >+ elif errcode == 37: >+ msg = _("Could not open/read %s") % (self.url) >+ err = URLGrabError(14, msg) >+ err.url = self.url >+ raise err >+ >+ elif errcode == 42: >+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) >+ err.url = self.url >+ # this is probably wrong but ultimately this is what happens >+ # we have a legit http code and a pycurl 'writer failed' code >+ # which almost always means something aborted it from outside >+ # since we cannot know what it is -I'm banking on it being >+ # a ctrl-c. 
XXXX - if there's a way of going back two raises to >+ # figure out what aborted the pycurl process FIXME >+ raise KeyboardInterrupt >+ >+ elif errcode == 58: >+ msg = _("problem with the local client certificate") >+ err = URLGrabError(14, msg) >+ err.url = self.url >+ raise err >+ >+ elif errcode == 60: >+ msg = _("client cert cannot be verified or client cert incorrect") >+ err = URLGrabError(14, msg) >+ err.url = self.url >+ raise err >+ >+ elif errcode == 63: >+ if self._error[1]: >+ msg = self._error[1] >+ else: >+ msg = _("Max download size exceeded on %s") % (self.url) >+ err = URLGrabError(14, msg) >+ err.url = self.url >+ raise err >+ >+ elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it >+ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url) >+ else: >+ msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1])) >+ code = errcode >+ err = URLGrabError(14, msg) >+ err.code = code >+ err.exception = e >+ raise err > >- def _build_range(self, req): >- self.reget_time = None >- self.append = 0 >+ def _do_open(self): >+ self.curl_obj = _curl_cache >+ self.curl_obj.reset() # reset all old settings away, just in case >+ # setup any ranges >+ self._set_opts() >+ self._do_grab() >+ return self.fo >+ >+ def _add_headers(self): >+ pass >+ >+ def _build_range(self): > reget_length = 0 > rt = None >- if have_range and self.opts.reget and type(self.filename) == type(''): >+ if self.opts.reget and type(self.filename) in types.StringTypes: > # we have reget turned on and we're dumping to a file > try: > s = os.stat(self.filename) > except OSError: > pass > else: >- self.reget_time = s[ST_MTIME] >- reget_length = s[ST_SIZE] >+ self.reget_time = s[stat.ST_MTIME] >+ reget_length = s[stat.ST_SIZE] > > # Set initial length when regetting > self._amount_read = reget_length >+ self._reget_length = reget_length # set where we started from, too > > rt = reget_length, '' > self.append = 1 > > if self.opts.range: >- if not have_range: >- raise URLGrabError(10, _('Byte range requested but range '\ >- 'support unavailable')) > rt = self.opts.range > if rt[0]: rt = (rt[0] + reget_length, rt[1]) > > if rt: > header = range_tuple_to_header(rt) >- if header: req.add_header('Range', header) >+ if header: >+ return header.split('=')[1] >+ >+ > > def _make_request(self, req, opener): >+ #XXXX >+ # This doesn't do anything really, but we could use this >+ # instead of do_open() to catch a lot of crap errors as >+ # mstenner did before here >+ return (self.fo, self.hdr) >+ > try: >- if have_socket_timeout and self.opts.timeout: >+ if self.opts.timeout: > old_to = socket.getdefaulttimeout() > socket.setdefaulttimeout(self.opts.timeout) > try: >@@ -1172,50 +1362,99 @@ > fo = opener.open(req) > hdr = fo.info() > except ValueError, e: >- raise URLGrabError(1, _('Bad URL: %s') % (e, )) >+ err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, )) >+ err.url = self.url >+ raise err >+ > except RangeError, e: >- raise URLGrabError(9, str(e)) >+ err = URLGrabError(9, _('%s on %s') % (e, self.url)) >+ err.url = self.url >+ raise err > except urllib2.HTTPError, e: >- new_e = URLGrabError(14, str(e)) >+ new_e = URLGrabError(14, _('%s on %s') % (e, self.url)) > new_e.code = e.code > new_e.exception = e >+ new_e.url = self.url > raise new_e > except IOError, e: >- if hasattr(e, 'reason') and have_socket_timeout and \ >- isinstance(e.reason, TimeoutError): >- raise URLGrabError(12, _('Timeout: %s') % (e, )) >+ if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout): >+ err = 
URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) >+ err.url = self.url >+ raise err > else: >- raise URLGrabError(4, _('IOError: %s') % (e, )) >+ err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e)) >+ err.url = self.url >+ raise err >+ > except OSError, e: >- raise URLGrabError(5, _('OSError: %s') % (e, )) >+ err = URLGrabError(5, _('%s on %s') % (e, self.url)) >+ err.url = self.url >+ raise err >+ > except HTTPException, e: >- raise URLGrabError(7, _('HTTP Exception (%s): %s') % \ >- (e.__class__.__name__, e)) >+ err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \ >+ (e.__class__.__name__, self.url, e)) >+ err.url = self.url >+ raise err >+ > else: > return (fo, hdr) > > def _do_grab(self): >- """dump the file to self.filename.""" >- if self.append: new_fo = open(self.filename, 'ab') >- else: new_fo = open(self.filename, 'wb') >- bs = 1024*8 >- size = 0 >- >- block = self.read(bs) >- size = size + len(block) >- while block: >- new_fo.write(block) >- block = self.read(bs) >- size = size + len(block) >+ """dump the file to a filename or StringIO buffer""" >+ >+ if self._complete: >+ return >+ _was_filename = False >+ if type(self.filename) in types.StringTypes and self.filename: >+ _was_filename = True >+ self._prog_reportname = str(self.filename) >+ self._prog_basename = os.path.basename(self.filename) >+ >+ if self.append: mode = 'ab' >+ else: mode = 'wb' >+ >+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \ >+ (self.filename, mode)) >+ try: >+ self.fo = open(self.filename, mode) >+ except IOError, e: >+ err = URLGrabError(16, _(\ >+ 'error opening local file from %s, IOError: %s') % (self.url, e)) >+ err.url = self.url >+ raise err >+ >+ else: >+ self._prog_reportname = 'MEMORY' >+ self._prog_basename = 'MEMORY' >+ >+ >+ self.fo = StringIO() >+ # if this is to be a tempfile instead.... >+ # it just makes crap in the tempdir >+ #fh, self._temp_name = mkstemp() >+ #self.fo = open(self._temp_name, 'wb') >+ >+ >+ self._do_perform() >+ > >- new_fo.close() >- try: >- modified_tuple = self.hdr.getdate_tz('last-modified') >- modified_stamp = rfc822.mktime_tz(modified_tuple) >- os.utime(self.filename, (modified_stamp, modified_stamp)) >- except (TypeError,), e: pass > >- return size >+ if _was_filename: >+ # close it up >+ self.fo.flush() >+ self.fo.close() >+ # set the time >+ mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME) >+ if mod_time != -1: >+ os.utime(self.filename, (mod_time, mod_time)) >+ # re open it >+ self.fo = open(self.filename, 'r') >+ else: >+ #self.fo = open(self._temp_name, 'r') >+ self.fo.seek(0) >+ >+ self._complete = True > > def _fill_buffer(self, amt=None): > """fill the buffer to contain at least 'amt' bytes by reading >@@ -1233,7 +1472,9 @@ > > # if we've made it here, then we don't have enough in the buffer > # and we need to read more. 
>- >+ >+ if not self._complete: self._do_grab() #XXX cheater - change on ranges >+ > buf = [self._rbuf] > bufsize = len(self._rbuf) > while amt is None or amt: >@@ -1250,11 +1491,20 @@ > try: > new = self.fo.read(readamount) > except socket.error, e: >- raise URLGrabError(4, _('Socket Error: %s') % (e, )) >- except TimeoutError, e: >- raise URLGrabError(12, _('Timeout: %s') % (e, )) >+ err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e)) >+ err.url = self.url >+ raise err >+ >+ except socket.timeout, e: >+ raise URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) >+ err.url = self.url >+ raise err >+ > except IOError, e: >- raise URLGrabError(4, _('IOError: %s') %(e,)) >+ raise URLGrabError(4, _('IOError on %s: %s') %(self.url, e)) >+ err.url = self.url >+ raise err >+ > newsize = len(new) > if not newsize: break # no more to read > >@@ -1263,12 +1513,45 @@ > bufsize = bufsize + newsize > self._tsize = newsize > self._amount_read = self._amount_read + newsize >- if self.opts.progress_obj: >- self.opts.progress_obj.update(self._amount_read) >+ #if self.opts.progress_obj: >+ # self.opts.progress_obj.update(self._amount_read) > > self._rbuf = string.join(buf, '') > return > >+ def _progress_update(self, download_total, downloaded, upload_total, uploaded): >+ if self._over_max_size(cur=self._amount_read-self._reget_length): >+ return -1 >+ >+ try: >+ if self._prog_running: >+ downloaded += self._reget_length >+ self.opts.progress_obj.update(downloaded) >+ except KeyboardInterrupt: >+ return -1 >+ >+ def _over_max_size(self, cur, max_size=None): >+ >+ if not max_size: >+ max_size = self.size >+ if self.opts.size: # if we set an opts size use that, no matter what >+ max_size = self.opts.size >+ if not max_size: return False # if we have None for all of the Max then this is dumb >+ if cur > max_size + max_size*.10: >+ >+ msg = _("Downloaded more than max size for %s: %s > %s") \ >+ % (self.url, cur, max_size) >+ self._error = (pycurl.E_FILESIZE_EXCEEDED, msg) >+ return True >+ return False >+ >+ def _to_utf8(self, obj, errors='replace'): >+ '''convert 'unicode' to an encoded utf-8 byte string ''' >+ # stolen from yum.i18n >+ if isinstance(obj, unicode): >+ obj = obj.encode('utf-8', errors) >+ return obj >+ > def read(self, amt=None): > self._fill_buffer(amt) > if amt is None: >@@ -1278,6 +1561,9 @@ > return s > > def readline(self, limit=-1): >+ if not self._complete: self._do_grab() >+ return self.fo.readline() >+ > i = string.find(self._rbuf, '\n') > while i < 0 and not (0 < limit <= len(self._rbuf)): > L = len(self._rbuf) >@@ -1293,43 +1579,13 @@ > return s > > def close(self): >- if self.opts.progress_obj: >+ if self._prog_running: > self.opts.progress_obj.end(self._amount_read) > self.fo.close() >- if self.opts.close_connection: >- try: self.fo.close_connection() >- except: pass >- >-_handler_cache = [] >-def CachedOpenerDirector(ssl_factory = None, *handlers): >- for (cached_handlers, opener) in _handler_cache: >- if cached_handlers == handlers: >- for handler in opener.handlers: >- handler.add_parent(opener) >- return opener >- if not ssl_factory: >- ssl_factory = sslfactory.get_factory() >- opener = ssl_factory.create_opener(*handlers) >- _handler_cache.append( (handlers, opener) ) >- return opener >- >-_proxy_cache = [] >-def CachedProxyHandler(proxies): >- for (pdict, handler) in _proxy_cache: >- if pdict == proxies: >- if DEBUG: DEBUG.debug('re-using proxy settings: %s', proxies) >- break >- else: >- for k, v in proxies.items(): >- utype, url = urllib.splittype(v) >- 
host, other = urllib.splithost(url) >- if (utype is None) or (host is None): >- raise URLGrabError(13, _('Bad proxy URL: %s') % v) >- >- if DEBUG: DEBUG.info('creating new proxy handler: %s', proxies) >- handler = urllib2.ProxyHandler(proxies) >- _proxy_cache.append( (proxies, handler) ) >- return handler >+ >+ >+_curl_cache = pycurl.Curl() # make one and reuse it over and over and over >+ > > ##################################################################### > # DEPRECATED FUNCTIONS >@@ -1368,7 +1624,6 @@ > ##################################################################### > # TESTING > def _main_test(): >- import sys > try: url, filename = sys.argv[1:3] > except ValueError: > print 'usage:', sys.argv[0], \ >@@ -1395,7 +1650,6 @@ > > > def _retry_test(): >- import sys > try: url, filename = sys.argv[1:3] > except ValueError: > print 'usage:', sys.argv[0], \ >@@ -1430,7 +1684,7 @@ > else: print 'LOCAL FILE:', name > > def _file_object_test(filename=None): >- import random, cStringIO, sys >+ import cStringIO > if filename is None: > filename = __file__ > print 'using file "%s" for comparisons' % filename >@@ -1444,7 +1698,7 @@ > _test_file_object_readlines]: > fo_input = cStringIO.StringIO(s_input) > fo_output = cStringIO.StringIO() >- wrapper = URLGrabberFileObject(fo_input, None, 0) >+ wrapper = PyCurlFileObject(fo_input, None, 0) > print 'testing %-30s ' % testfunc.__name__, > testfunc(wrapper, fo_output) > s_output = fo_output.getvalue() >
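
For reference, a minimal usage sketch (not part of the patch) exercising the new pycurl-backend options the patch documents above. The URL, certificate path, and size cap are placeholder values; the option names and the URLGrabError attributes come from the patched docstrings and code.

# Minimal sketch: new SSL and size options in the pycurl backend.
# The URL, ca-cert path, and size cap below are placeholders.
import urlgrabber
from urlgrabber.grabber import URLGrabError

try:
    # ssl_verify_peer / ssl_verify_host map onto pycurl's
    # SSL_VERIFYPEER / SSL_VERIFYHOST; 'size' caps the download and
    # raises error 14 once the stream exceeds it by more than 10%
    # (see _over_max_size above).
    path = urlgrabber.urlgrab('https://example.com/some.rpm',
                              filename='/tmp/some.rpm',
                              ssl_ca_cert='/etc/ssl/certs/ca-certificates.crt',
                              ssl_verify_peer=True,
                              ssl_verify_host=True,
                              size=50 * 1024 * 1024)
    print 'saved to', path
except URLGrabError, e:
    # the patch attaches the failing url to every URLGrabError it raises
    print '[Errno %i] %s (url: %s)' % (e.errno, e.strerror, e.url)

Verbose output from the backend (including pycurl.VERBOSE) can be enabled by setting the URLGRABBER_DEBUG environment variable to a log level before import, which _init_default_logger() above parses.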