diff -up urlgrabber-3.10/test/test_mirror.py.orig urlgrabber-3.10/test/test_mirror.py
--- urlgrabber-3.10/test/test_mirror.py.orig	2013-08-26 09:09:07.000000000 +0200
+++ urlgrabber-3.10/test/test_mirror.py	2016-06-29 18:26:06.790393129 +0200
@@ -268,33 +268,55 @@ class ActionTests(TestCase):
         self.assertEquals(self.g.calls, expected_calls)
         self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
 
+import thread, socket
+LOCALPORT = 'localhost', 2000
+
 class HttpReplyCode(TestCase):
     def setUp(self):
+        # start the server
+        self.exit = False
         def server():
-            import socket
             s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-            s.bind(('localhost', 2000)); s.listen(1)
+            s.bind(LOCALPORT); s.listen(1)
             while 1:
                 c, a = s.accept()
+                if self.exit: c.close(); break
                 while not c.recv(4096).endswith('\r\n\r\n'): pass
                 c.sendall('HTTP/1.1 %d %s\r\n' % self.reply)
+                if self.content is not None:
+                    c.sendall('Content-Length: %d\r\n\r\n' % len(self.content))
+                    c.sendall(self.content)
                 c.close()
-        import thread
-        self.reply = 503, "Busy"
+            s.close()
+            self.exit = False
         thread.start_new_thread(server, ())
 
+        # create grabber and mirror group objects
         def failure(obj):
             self.code = getattr(obj.exception, 'code', None)
             return {}
         self.g = URLGrabber()
-        self.mg = MirrorGroup(self.g, ['http://localhost:2000/'], failure_callback = failure)
+        self.mg = MirrorGroup(self.g, ['http://%s:%d' % LOCALPORT],
+                              failure_callback = failure)
+
+    def tearDown(self):
+        # shut down the server
+        self.exit = True
+        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        s.connect(LOCALPORT); s.close() # wake it up
+        while self.exit: pass # poor man's join
 
     def test_grab(self):
+        'tests the propagation of HTTP reply code'
+        self.reply = 503, "Busy"
+        self.content = None
+
+        # single
         self.assertRaises(URLGrabError, self.mg.urlgrab, 'foo')
         self.assertEquals(self.code, 503); del self.code
 
+        # multi
         err = []
         self.mg.urlgrab('foo', async = True, failfunc = err.append)
         urlgrabber.grabber.parallel_wait()
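
The test above drives MirrorGroup against a throwaway HTTP stub. For reference,
a minimal client-side sketch of the same failure_callback contract the test
exercises (names are illustrative; the stub from the test is assumed to be
listening on localhost:2000):

    from urlgrabber.grabber import URLGrabber, URLGrabError
    from urlgrabber.mirror import MirrorGroup

    codes = []
    def failure(obj):
        # obj.exception is the URLGrabError raised for this mirror;
        # its 'code' attribute carries the HTTP status (e.g. 503)
        codes.append(getattr(obj.exception, 'code', None))
        return {}  # empty action dict: let MirrorGroup take its default action

    g = URLGrabber()
    mg = MirrorGroup(g, ['http://localhost:2000/'], failure_callback=failure)
    try:
        mg.urlgrab('foo')
    except URLGrabError:
        pass  # every mirror failed; codes now holds the observed statuses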
diff -up urlgrabber-3.10/test/test_mirror.py.orig urlgrabber-3.10/test/test_mirror.py
--- urlgrabber-3.10/test/test_mirror.py.orig	2016-06-29 18:26:06.790393129 +0200
+++ urlgrabber-3.10/test/test_mirror.py	2016-06-29 18:26:58.886148544 +0200
@@ -268,13 +268,14 @@ class ActionTests(TestCase):
         self.assertEquals(self.g.calls, expected_calls)
         self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
 
-import thread, socket
+import threading, socket
 LOCALPORT = 'localhost', 2000
 
 class HttpReplyCode(TestCase):
     def setUp(self):
         # start the server
         self.exit = False
+        self.process = lambda data: None
         def server():
             s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
@@ -282,7 +283,10 @@ class HttpReplyCode(TestCase):
             while 1:
                 c, a = s.accept()
                 if self.exit: c.close(); break
-                while not c.recv(4096).endswith('\r\n\r\n'): pass
+                data = ''
+                while not data.endswith('\r\n\r\n'):
+                    data = c.recv(4096)
+                self.process(data)
                 c.sendall('HTTP/1.1 %d %s\r\n' % self.reply)
                 if self.content is not None:
                     c.sendall('Content-Length: %d\r\n\r\n' % len(self.content))
@@ -290,7 +294,8 @@ class HttpReplyCode(TestCase):
                 c.close()
             s.close()
             self.exit = False
-        thread.start_new_thread(server, ())
+        self.thread = threading.Thread(target=server)
+        self.thread.start()
 
         # create grabber and mirror group objects
         def failure(obj):
@@ -305,7 +310,7 @@ class HttpReplyCode(TestCase):
         self.exit = True
         s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         s.connect(LOCALPORT); s.close() # wake it up
-        while self.exit: pass # poor man's join
+        self.thread.join()
 
     def test_grab(self):
         'tests the propagation of HTTP reply code'
@@ -323,6 +328,45 @@ class HttpReplyCode(TestCase):
         self.assertEquals([e.exception.errno for e in err], [256])
         self.assertEquals(self.code, 503); del self.code
 
+    def test_retry_no_cache(self):
+        'test bypassing proxy cache on failure'
+        def process(data):
+            if 'Pragma:no-cache' in data:
+                self.content = 'version2'
+            else:
+                self.content = 'version1'
+
+        def checkfunc_read(obj):
+            if obj.data == 'version1':
+                raise URLGrabError(-1, 'Outdated version of foo')
+
+        def checkfunc_grab(obj):
+            with open('foo') as f:
+                if f.read() == 'version1':
+                    raise URLGrabError(-1, 'Outdated version of foo')
+
+        self.process = process
+        self.reply = 200, "OK"
+
+        opts = self.g.opts
+        opts.retry = 3
+        opts.retry_no_cache = True
+
+        # single
+        opts.checkfunc = checkfunc_read
+        try:
+            self.mg.urlread('foo')
+        except URLGrabError as e:
+            self.fail(str(e))
+
+        # multi
+        opts.checkfunc = checkfunc_grab
+        self.mg.urlgrab('foo', async=True)
+        try:
+            urlgrabber.grabber.parallel_wait()
+        except URLGrabError as e:
+            self.fail(str(e))
+
 def suite():
     tl = TestLoader()
     return tl.loadTestsFromModule(sys.modules[__name__])
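
test_retry_no_cache relies on the checkfunc contract: raising URLGrabError
with a negative errno marks the attempt as failed, and with retry_no_cache
enabled the grabber adds Pragma:no-cache on the remaining retries, so a
transparent proxy cannot keep serving the stale copy. A minimal sketch of
such a validator (the 'version2' freshness marker is hypothetical, modeled
on checkfunc_read above):

    from urlgrabber.grabber import URLGrabError

    def checkfunc(obj):
        # obj.data is set for urlread-style calls (cf. checkfunc_read above)
        if obj.data != 'version2':
            raise URLGrabError(-1, 'stale copy, retry bypassing the cache')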
diff -up urlgrabber-3.10/urlgrabber/grabber.py.orig urlgrabber-3.10/urlgrabber/grabber.py
--- urlgrabber-3.10/urlgrabber/grabber.py.orig	2016-06-29 18:25:53.964453346 +0200
+++ urlgrabber-3.10/urlgrabber/grabber.py	2016-06-29 18:26:58.886148544 +0200
@@ -171,6 +171,12 @@ GENERAL ARGUMENTS (kwargs)
     The libproxy code is only used if the proxies dictionary
     does not provide any proxies.
 
+  no_cache = False
+
+    When True, server-side cache will be disabled for http and https
+    requests.  This is equivalent to setting
+    http_headers = (('Pragma', 'no-cache'),)
+
   prefix = None
 
     a url prefix that will be prepended to all requested urls.  For
@@ -383,10 +389,11 @@ RETRY RELATED ARGUMENTS
     identical to checkfunc, except for the attributes defined in the
     CallbackObject instance.  The attributes for failure_callback are:
 
-      exception = the raised exception
-      url       = the url we're trying to fetch
-      tries     = the number of tries so far (including this one)
-      retry     = the value of the retry option
+      exception      = the raised exception
+      url            = the url we're trying to fetch
+      tries          = the number of tries so far (including this one)
+      retry          = the value of the retry option
+      retry_no_cache = the value of the retry_no_cache option
 
     The callback is present primarily to inform the calling program of
     the failure, but if it raises an exception (including the one it's
@@ -431,6 +438,19 @@ RETRY RELATED ARGUMENTS
     passed the same arguments, so you could use the same function for
     both.
 
+  retry_no_cache = False
+
+    When True, automatically enable no_cache for future retries if
+    checkfunc performs an unsuccessful check.
+
+    This option is useful if your application expects a set of files
+    from the same server to form an atomic unit and you write your
+    checkfunc to ensure each file being downloaded belongs to such a
+    unit.  If transparent proxy caching is in effect, the files can
+    become out-of-sync, disrupting the atomicity.  Enabling this option
+    will prevent that, while ensuring that you still enjoy the benefits
+    of caching when possible.
+
 BANDWIDTH THROTTLING
 
   urlgrabber supports throttling via two values: throttle and
@@ -1001,6 +1021,8 @@ class URLGrabberOptions:
         self.half_life = 30*24*60*60 # 30 days
         self.default_speed = 1e6 # 1 MBit
         self.ftp_disable_epsv = False
+        self.no_cache = False
+        self.retry_no_cache = False
 
     def __repr__(self):
         return self.format()
@@ -1077,7 +1099,8 @@ class URLGrabber(object):
             if callback:
                 if DEBUG: DEBUG.info('calling callback: %s', callback)
                 obj = CallbackObject(exception=exception, url=args[0],
-                                     tries=tries, retry=opts.retry)
+                                     tries=tries, retry=opts.retry,
+                                     retry_no_cache=opts.retry_no_cache)
                 _run_callback(callback, obj)
 
             if (opts.retry is None) or (tries == opts.retry):
@@ -1089,6 +1112,8 @@ class URLGrabber(object):
                 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
                                      retrycode, opts.retrycodes)
                 raise
+            if retrycode is not None and retrycode < 0 and opts.retry_no_cache:
+                opts.no_cache = True
 
     def urlopen(self, url, opts=None, **kwargs):
         """open the url and return a file object
@@ -1429,11 +1454,15 @@ class PyCurlFileObject(object):
             self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
 
         #headers:
-        if opts.http_headers and self.scheme in ('http', 'https'):
+        if self.scheme in ('http', 'https'):
             headers = []
-            for (tag, content) in opts.http_headers:
-                headers.append('%s:%s' % (tag, content))
-            self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
+            if opts.http_headers is not None:
+                for (tag, content) in opts.http_headers:
+                    headers.append('%s:%s' % (tag, content))
+            if opts.no_cache:
+                headers.append('Pragma:no-cache')
+            if headers:
+                self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
 
         # ranges:
         if opts.range or opts.reget:
@@ -2055,7 +2084,8 @@ class _ExternalDownloader:
         'ssl_key_pass',
         'ssl_verify_peer', 'ssl_verify_host',
         'size', 'max_header_size', 'ip_resolve',
-        'ftp_disable_epsv'
+        'ftp_disable_epsv',
+        'no_cache',
     )
 
     def start(self, opts):
@@ -2236,6 +2266,8 @@ def parallel_wait(meter=None):
                 except URLGrabError, ug_err:
                     retry = 0 # no retries
             if opts.tries < retry and ug_err.errno in opts.retrycodes:
+                if ug_err.errno < 0 and opts.retry_no_cache:
+                    opts.no_cache = True
                 start(opts, opts.tries + 1) # simple retry
                 continue
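
Once the patch is applied, the new options behave like any other grabber
kwargs. A usage sketch (the URL and the staleness test are illustrative,
not part of the patch):

    import urlgrabber
    from urlgrabber.grabber import URLGrabError

    def checkfunc(obj):
        # hypothetical freshness check on the downloaded bytes
        if 'expected-marker' not in obj.data:
            raise URLGrabError(-1, 'stale content')

    # send Pragma:no-cache on every request:
    data = urlgrabber.urlread('http://example.com/repo/repomd.xml',
                              no_cache=True)

    # or start from the cache and bypass it only after a failed check:
    data = urlgrabber.urlread('http://example.com/repo/repomd.xml',
                              retry=3, retry_no_cache=True,
                              checkfunc=checkfunc)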