diff -up urlgrabber-3.10/test/test_mirror.py.orig urlgrabber-3.10/test/test_mirror.py
--- urlgrabber-3.10/test/test_mirror.py.orig	2013-08-26 09:09:07.000000000 +0200
+++ urlgrabber-3.10/test/test_mirror.py	2016-06-29 18:26:06.790393129 +0200
@@ -268,33 +268,55 @@ class ActionTests(TestCase):
         self.assertEquals(self.g.calls, expected_calls)
         self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
 
+import thread, socket
+LOCALPORT = 'localhost', 2000
+
 class HttpReplyCode(TestCase):
     def setUp(self):
+        # start the server
+        self.exit = False
         def server():
-            import socket
             s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-            s.bind(('localhost', 2000)); s.listen(1)
+            s.bind(LOCALPORT); s.listen(1)
             while 1:
                 c, a = s.accept()
+                if self.exit: c.close(); break
                 while not c.recv(4096).endswith('\r\n\r\n'): pass
                 c.sendall('HTTP/1.1 %d %s\r\n' % self.reply)
+                if self.content is not None:
+                    c.sendall('Content-Length: %d\r\n\r\n' % len(self.content))
+                    c.sendall(self.content)
                 c.close()
-
-        import thread
-        self.reply = 503, "Busy"
+            s.close()
+            self.exit = False
         thread.start_new_thread(server, ())
 
+        # create grabber and mirror group objects
         def failure(obj):
             self.code = getattr(obj.exception, 'code', None)
             return {}
         self.g = URLGrabber()
-        self.mg = MirrorGroup(self.g, ['http://localhost:2000/'], failure_callback = failure)
+        self.mg = MirrorGroup(self.g, ['http://%s:%d' % LOCALPORT],
+                              failure_callback = failure)
+
+    def tearDown(self):
+        # shut down the server
+        self.exit = True
+        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        s.connect(LOCALPORT); s.close() # wake it up
+        while self.exit: pass # poor man's join
 
     def test_grab(self):
+        'tests the propagation of HTTP reply code'
+        self.reply = 503, "Busy"
+        self.content = None
+
+        # single
         self.assertRaises(URLGrabError, self.mg.urlgrab, 'foo')
         self.assertEquals(self.code, 503); del self.code
 
+        # multi
         err = []
         self.mg.urlgrab('foo', async = True, failfunc = err.append)
         urlgrabber.grabber.parallel_wait()
diff -up urlgrabber-3.10/test/test_mirror.py.orig urlgrabber-3.10/test/test_mirror.py
--- urlgrabber-3.10/test/test_mirror.py.orig	2016-06-29 18:26:06.790393129 +0200
+++ urlgrabber-3.10/test/test_mirror.py	2016-06-29 18:26:58.886148544 +0200
@@ -268,13 +268,14 @@ class ActionTests(TestCase):
         self.assertEquals(self.g.calls, expected_calls)
         self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
 
-import thread, socket
+import threading, socket
 LOCALPORT = 'localhost', 2000
 
 class HttpReplyCode(TestCase):
     def setUp(self):
         # start the server
         self.exit = False
+        self.process = lambda data: None
         def server():
             s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
@@ -282,7 +283,10 @@ class HttpReplyCode(TestCase):
             while 1:
                 c, a = s.accept()
                 if self.exit: c.close(); break
-                while not c.recv(4096).endswith('\r\n\r\n'): pass
+                data = ''
+                while not data.endswith('\r\n\r\n'):
+                    data = c.recv(4096)
+                self.process(data)
                 c.sendall('HTTP/1.1 %d %s\r\n' % self.reply)
                 if self.content is not None:
                     c.sendall('Content-Length: %d\r\n\r\n' % len(self.content))
@@ -290,7 +294,8 @@ class HttpReplyCode(TestCase):
                 c.close()
             s.close()
             self.exit = False
-        thread.start_new_thread(server, ())
+        self.thread = threading.Thread(target=server)
+        self.thread.start()
 
         # create grabber and mirror group objects
         def failure(obj):
@@ -305,7 +310,7 @@ class HttpReplyCode(TestCase):
         self.exit = True
         s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         s.connect(LOCALPORT); s.close() # wake it up
-        while self.exit: pass # poor man's join
+        self.thread.join()
 
     def test_grab(self):
         'tests the propagation of HTTP reply code'
@@ -323,6 +328,45 @@ class HttpReplyCode(TestCase):
         self.assertEquals([e.exception.errno for e in err], [256])
         self.assertEquals(self.code, 503); del self.code
 
+    def test_retry_no_cache(self):
+        'test bypassing proxy cache on failure'
+        def process(data):
+            if 'Pragma:no-cache' in data:
+                self.content = 'version2'
+            else:
+                self.content = 'version1'
+
+        def checkfunc_read(obj):
+            if obj.data == 'version1':
+                raise URLGrabError(-1, 'Outdated version of foo')
+
+        def checkfunc_grab(obj):
+            with open('foo') as f:
+                if f.read() == 'version1':
+                    raise URLGrabError(-1, 'Outdated version of foo')
+
+        self.process = process
+        self.reply = 200, "OK"
+
+        opts = self.g.opts
+        opts.retry = 3
+        opts.retry_no_cache = True
+
+        # single
+        opts.checkfunc = checkfunc_read
+        try:
+            self.mg.urlread('foo')
+        except URLGrabError as e:
+            self.fail(str(e))
+
+        # multi
+        opts.checkfunc = checkfunc_grab
+        self.mg.urlgrab('foo', async=True)
+        try:
+            urlgrabber.grabber.parallel_wait()
+        except URLGrabError as e:
+            self.fail(str(e))
+
 def suite():
     tl = TestLoader()
     return tl.loadTestsFromModule(sys.modules[__name__])
diff -up urlgrabber-3.10/urlgrabber/grabber.py.orig urlgrabber-3.10/urlgrabber/grabber.py
--- urlgrabber-3.10/urlgrabber/grabber.py.orig	2016-06-29 18:25:53.964453346 +0200
+++ urlgrabber-3.10/urlgrabber/grabber.py	2016-06-29 18:26:58.886148544 +0200
@@ -171,6 +171,12 @@ GENERAL ARGUMENTS (kwargs)
     The libproxy code is only used if the proxies dictionary
     does not provide any proxies.
 
+  no_cache = False
+
+    When True, server-side cache will be disabled for http and https
+    requests.  This is equivalent to setting
+    http_headers = (('Pragma', 'no-cache'),)
+
   prefix = None
 
     a url prefix that will be prepended to all requested urls.  For
@@ -383,10 +389,11 @@ RETRY RELATED ARGUMENTS
     identical to checkfunc, except for the attributes defined in the
     CallbackObject instance.  The attributes for failure_callback are:
 
-      exception = the raised exception
-      url       = the url we're trying to fetch
-      tries     = the number of tries so far (including this one)
-      retry     = the value of the retry option
+      exception      = the raised exception
+      url            = the url we're trying to fetch
+      tries          = the number of tries so far (including this one)
+      retry          = the value of the retry option
+      retry_no_cache = the value of the retry_no_cache option
 
     The callback is present primarily to inform the calling program of
     the failure, but if it raises an exception (including the one it's
@@ -431,6 +438,19 @@ RETRY RELATED ARGUMENTS
     passed the same arguments, so you could use the same function for
     both.
 
+  retry_no_cache = False
+
+    When True, automatically enable no_cache for future retries if
+    checkfunc performs an unsuccessful check.
+
+    This option is useful if your application expects a set of files
+    from the same server to form an atomic unit and you write your
+    checkfunc to ensure each file being downloaded belongs to such a
+    unit.  If transparent proxy caching is in effect, the files can
+    become out-of-sync, disrupting the atomicity.  Enabling this option
+    will prevent that, while ensuring that you still enjoy the benefits
+    of caching when possible.
+
 BANDWIDTH THROTTLING
 
   urlgrabber supports throttling via two values: throttle and
@@ -1001,6 +1021,8 @@ class URLGrabberOptions:
         self.half_life = 30*24*60*60 # 30 days
         self.default_speed = 1e6 # 1 MBit
         self.ftp_disable_epsv = False
+        self.no_cache = False
+        self.retry_no_cache = False
 
     def __repr__(self):
         return self.format()
@@ -1077,7 +1099,8 @@ class URLGrabber(object):
                 if callback:
                     if DEBUG: DEBUG.info('calling callback: %s', callback)
                     obj = CallbackObject(exception=exception, url=args[0],
-                                         tries=tries, retry=opts.retry)
+                                         tries=tries, retry=opts.retry,
+                                         retry_no_cache=opts.retry_no_cache)
                     _run_callback(callback, obj)
 
                 if (opts.retry is None) or (tries == opts.retry):
@@ -1089,6 +1112,8 @@ class URLGrabber(object):
                 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
                                      retrycode, opts.retrycodes)
                 raise
+            if retrycode is not None and retrycode < 0 and opts.retry_no_cache:
+                opts.no_cache = True
 
     def urlopen(self, url, opts=None, **kwargs):
         """open the url and return a file object
@@ -1429,11 +1454,15 @@ class PyCurlFileObject(object):
             self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
 
         #headers:
-        if opts.http_headers and self.scheme in ('http', 'https'):
+        if self.scheme in ('http', 'https'):
             headers = []
-            for (tag, content) in opts.http_headers:
-                headers.append('%s:%s' % (tag, content))
-            self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
+            if opts.http_headers is not None:
+                for (tag, content) in opts.http_headers:
+                    headers.append('%s:%s' % (tag, content))
+            if opts.no_cache:
+                headers.append('Pragma:no-cache')
+            if headers:
+                self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
 
         # ranges:
         if opts.range or opts.reget:
@@ -2055,7 +2084,8 @@ class _ExternalDownloader:
         'ssl_key_pass',
         'ssl_verify_peer', 'ssl_verify_host',
         'size', 'max_header_size', 'ip_resolve',
-        'ftp_disable_epsv'
+        'ftp_disable_epsv',
+        'no_cache',
     )
 
     def start(self, opts):
@@ -2236,6 +2266,8 @@ def parallel_wait(meter=None):
                 except URLGrabError, ug_err:
                     retry = 0 # no retries
             if opts.tries < retry and ug_err.errno in opts.retrycodes:
+                if ug_err.errno < 0 and opts.retry_no_cache:
+                    opts.no_cache = True
                 start(opts, opts.tries + 1) # simple retry
                 continue
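
Usage sketch (not part of the patch): assuming the patch above is applied, the two new
options work together as follows. checkfunc raises URLGrabError with a negative errno
when a downloaded file fails validation; with retry_no_cache=True the next attempt sends
Pragma: no-cache so a stale copy in a transparent proxy is bypassed, while no_cache=True
disables proxy caching unconditionally. The URL and the 'expected-marker' test are
made-up placeholders for illustration only.

from urlgrabber.grabber import URLGrabber, URLGrabError

def check(obj):
    # For urlread() the downloaded body is available as obj.data.
    # A negative errno marks the check as failed and permits a retry;
    # with retry_no_cache=True the retry adds the Pragma:no-cache header.
    if 'expected-marker' not in obj.data:      # hypothetical validation
        raise URLGrabError(-1, 'stale copy, retrying without proxy cache')

g = URLGrabber(retry=3, retry_no_cache=True, checkfunc=check)
data = g.urlread('http://mirror.example.com/repodata/repomd.xml')  # hypothetical URL

# no_cache=True skips server-side/proxy caches on every request; it is
# equivalent to passing http_headers = (('Pragma', 'no-cache'),).
fresh = URLGrabber(no_cache=True).urlread('http://mirror.example.com/repodata/repomd.xml')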