diff -up urlgrabber-3.10/test/test_mirror.py.orig urlgrabber-3.10/test/test_mirror.py
--- urlgrabber-3.10/test/test_mirror.py.orig 2013-08-26 09:09:07.000000000 +0200
+++ urlgrabber-3.10/test/test_mirror.py 2016-06-29 18:26:06.790393129 +0200
@@ -268,33 +268,55 @@ class ActionTests(TestCase):
self.assertEquals(self.g.calls, expected_calls)
self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
+import thread, socket
+LOCALPORT = 'localhost', 2000
class HttpReplyCode(TestCase):
def setUp(self):
+ # start the server
+ self.exit = False
def server():
- import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
- s.bind(('localhost', 2000)); s.listen(1)
+ s.bind(LOCALPORT); s.listen(1)
while 1:
c, a = s.accept()
+ if self.exit: c.close(); break
while not c.recv(4096).endswith('\r\n\r\n'): pass
c.sendall('HTTP/1.1 %d %s\r\n' % self.reply)
+ if self.content is not None:
+ c.sendall('Content-Length: %d\r\n\r\n' % len(self.content))
+ c.sendall(self.content)
c.close()
- import thread
- self.reply = 503, "Busy"
+ s.close()
+ self.exit = False
thread.start_new_thread(server, ())
+ # create grabber and mirror group objects
def failure(obj):
self.code = getattr(obj.exception, 'code', None)
return {}
self.g = URLGrabber()
- self.mg = MirrorGroup(self.g, ['http://localhost:2000/'], failure_callback = failure)
+ self.mg = MirrorGroup(self.g, ['http://%s:%d' % LOCALPORT],
+ failure_callback = failure)
+
+ def tearDown(self):
+ # shut down the server
+ self.exit = True
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ s.connect(LOCALPORT); s.close() # wake it up
+ while self.exit: pass # poor man's join
def test_grab(self):
+ 'tests the propagation of HTTP reply code'
+ self.reply = 503, "Busy"
+ self.content = None
+
+ # single
self.assertRaises(URLGrabError, self.mg.urlgrab, 'foo')
self.assertEquals(self.code, 503); del self.code
+ # multi
err = []
self.mg.urlgrab('foo', async = True, failfunc = err.append)
urlgrabber.grabber.parallel_wait()
diff -up urlgrabber-3.10/test/test_mirror.py.orig urlgrabber-3.10/test/test_mirror.py
--- urlgrabber-3.10/test/test_mirror.py.orig 2016-06-29 18:26:06.790393129 +0200
+++ urlgrabber-3.10/test/test_mirror.py 2016-06-29 18:26:58.886148544 +0200
@@ -268,13 +268,14 @@ class ActionTests(TestCase):
self.assertEquals(self.g.calls, expected_calls)
self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
-import thread, socket
+import threading, socket
LOCALPORT = 'localhost', 2000
class HttpReplyCode(TestCase):
def setUp(self):
# start the server
self.exit = False
+ self.process = lambda data: None
def server():
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
@@ -282,7 +283,10 @@ class HttpReplyCode(TestCase):
while 1:
c, a = s.accept()
if self.exit: c.close(); break
- while not c.recv(4096).endswith('\r\n\r\n'): pass
+ data = ''
+ while not data.endswith('\r\n\r\n'):
+ data = c.recv(4096)
+ self.process(data)
c.sendall('HTTP/1.1 %d %s\r\n' % self.reply)
if self.content is not None:
c.sendall('Content-Length: %d\r\n\r\n' % len(self.content))
@@ -290,7 +294,8 @@ class HttpReplyCode(TestCase):
c.close()
s.close()
self.exit = False
- thread.start_new_thread(server, ())
+ self.thread = threading.Thread(target=server)
+ self.thread.start()
# create grabber and mirror group objects
def failure(obj):
@@ -305,7 +310,7 @@ class HttpReplyCode(TestCase):
self.exit = True
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(LOCALPORT); s.close() # wake it up
- while self.exit: pass # poor man's join
+ self.thread.join()
def test_grab(self):
'tests the propagation of HTTP reply code'
@@ -323,6 +328,45 @@ class HttpReplyCode(TestCase):
self.assertEquals([e.exception.errno for e in err], [256])
self.assertEquals(self.code, 503); del self.code
+ def test_retry_no_cache(self):
+ 'test bypassing proxy cache on failure'
+ def process(data):
+ if 'Pragma:no-cache' in data:
+ self.content = 'version2'
+ else:
+ self.content = 'version1'
+
+ def checkfunc_read(obj):
+ if obj.data == 'version1':
+ raise URLGrabError(-1, 'Outdated version of foo')
+
+ def checkfunc_grab(obj):
+ with open('foo') as f:
+ if f.read() == 'version1':
+ raise URLGrabError(-1, 'Outdated version of foo')
+
+ self.process = process
+ self.reply = 200, "OK"
+
+ opts = self.g.opts
+ opts.retry = 3
+ opts.retry_no_cache = True
+
+ # single
+ opts.checkfunc = checkfunc_read
+ try:
+ self.mg.urlread('foo')
+ except URLGrabError as e:
+ self.fail(str(e))
+
+ # multi
+ opts.checkfunc = checkfunc_grab
+ self.mg.urlgrab('foo', async=True)
+ try:
+ urlgrabber.grabber.parallel_wait()
+ except URLGrabError as e:
+ self.fail(str(e))
+
def suite():
tl = TestLoader()
return tl.loadTestsFromModule(sys.modules[__name__])
diff -up urlgrabber-3.10/urlgrabber/grabber.py.orig urlgrabber-3.10/urlgrabber/grabber.py
--- urlgrabber-3.10/urlgrabber/grabber.py.orig 2016-06-29 18:25:53.964453346 +0200
+++ urlgrabber-3.10/urlgrabber/grabber.py 2016-06-29 18:26:58.886148544 +0200
@@ -171,6 +171,12 @@ GENERAL ARGUMENTS (kwargs)
The libproxy code is only used if the proxies dictionary
does not provide any proxies.
+ no_cache = False
+
+ When True, server-side cache will be disabled for http and https
+ requests. This is equivalent to setting
+ http_headers = (('Pragma', 'no-cache'),)
+
prefix = None
a url prefix that will be prepended to all requested urls. For
@@ -383,10 +389,11 @@ RETRY RELATED ARGUMENTS
identical to checkfunc, except for the attributes defined in the
CallbackObject instance. The attributes for failure_callback are:
- exception = the raised exception
- url = the url we're trying to fetch
- tries = the number of tries so far (including this one)
- retry = the value of the retry option
+ exception = the raised exception
+ url = the url we're trying to fetch
+ tries = the number of tries so far (including this one)
+ retry = the value of the retry option
+ retry_no_cache = the value of the retry_no_cache option
The callback is present primarily to inform the calling program of
the failure, but if it raises an exception (including the one it's
@@ -431,6 +438,19 @@ RETRY RELATED ARGUMENTS
passed the same arguments, so you could use the same function for
both.
+ retry_no_cache = False
+
+ When True, automatically enable no_cache for future retries if
+ checkfunc performs an unsuccessful check.
+
+ This option is useful if your application expects a set of files
+ from the same server to form an atomic unit and you write your
+ checkfunc to ensure each file being downloaded belongs to such a
+ unit. If transparent proxy caching is in effect, the files can
+ become out-of-sync, disrupting the atomicity. Enabling this option
+ will prevent that, while ensuring that you still enjoy the benefits
+ of caching when possible.
+
BANDWIDTH THROTTLING
urlgrabber supports throttling via two values: throttle and
@@ -1001,6 +1021,8 @@ class URLGrabberOptions:
self.half_life = 30*24*60*60 # 30 days
self.default_speed = 1e6 # 1 MBit
self.ftp_disable_epsv = False
+ self.no_cache = False
+ self.retry_no_cache = False
def __repr__(self):
return self.format()
@@ -1077,7 +1099,8 @@ class URLGrabber(object):
if callback:
if DEBUG: DEBUG.info('calling callback: %s', callback)
obj = CallbackObject(exception=exception, url=args[0],
- tries=tries, retry=opts.retry)
+ tries=tries, retry=opts.retry,
+ retry_no_cache=opts.retry_no_cache)
_run_callback(callback, obj)
if (opts.retry is None) or (tries == opts.retry):
@@ -1089,6 +1112,8 @@ class URLGrabber(object):
if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
retrycode, opts.retrycodes)
raise
+ if retrycode is not None and retrycode < 0 and opts.retry_no_cache:
+ opts.no_cache = True
def urlopen(self, url, opts=None, **kwargs):
"""open the url and return a file object
@@ -1429,11 +1454,15 @@ class PyCurlFileObject(object):
self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
#headers:
- if opts.http_headers and self.scheme in ('http', 'https'):
+ if self.scheme in ('http', 'https'):
headers = []
- for (tag, content) in opts.http_headers:
- headers.append('%s:%s' % (tag, content))
- self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
+ if opts.http_headers is not None:
+ for (tag, content) in opts.http_headers:
+ headers.append('%s:%s' % (tag, content))
+ if opts.no_cache:
+ headers.append('Pragma:no-cache')
+ if headers:
+ self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
# ranges:
if opts.range or opts.reget:
@@ -2055,7 +2084,8 @@ class _ExternalDownloader:
'ssl_key_pass',
'ssl_verify_peer', 'ssl_verify_host',
'size', 'max_header_size', 'ip_resolve',
- 'ftp_disable_epsv'
+ 'ftp_disable_epsv',
+ 'no_cache',
)
def start(self, opts):
@@ -2236,6 +2266,8 @@ def parallel_wait(meter=None):
except URLGrabError, ug_err:
retry = 0 # no retries
if opts.tries < retry and ug_err.errno in opts.retrycodes:
+ if ug_err.errno < 0 and opts.retry_no_cache:
+ opts.no_cache = True
start(opts, opts.tries + 1) # simple retry
continue
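
For context only (this is not part of the patch): a minimal sketch of how an application might use the no_cache and retry_no_cache options introduced above, assuming the patched urlgrabber is installed. The mirror URL, file name, and the 'expected-marker' check are placeholders, not real repository data.

    # Hypothetical usage sketch: fetch a file through a MirrorGroup and let a
    # failed checkfunc trigger cache-bypassing retries (Pragma: no-cache).
    from urlgrabber.grabber import URLGrabber, URLGrabError
    from urlgrabber.mirror import MirrorGroup

    def checkfunc(obj):
        # Reject a stale copy; with retry_no_cache=True the next attempt sends
        # the 'Pragma: no-cache' header so a transparent proxy refetches it.
        if 'expected-marker' not in obj.data:
            raise URLGrabError(-1, 'stale copy of repomd.xml')

    g = URLGrabber(retry=3, retry_no_cache=True, checkfunc=checkfunc)
    mg = MirrorGroup(g, ['http://mirror.example.com/repo/'])  # placeholder mirror
    data = mg.urlread('repomd.xml')

    # Alternatively, bypass server-side caches on every request, not only on retries:
    g2 = URLGrabber(no_cache=True)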