diff options
author | Peter Wu <peter@lekensteyn.nl> | 2017-03-04 16:35:43 +0100 |
---|---|---|
committer | Peter Wu <peter@lekensteyn.nl> | 2017-03-04 16:35:43 +0100 |
commit | da8a76f3c7a537fb1b155f1ba7d19591845b28bf (patch) | |
tree | 72fcdce501ef5ea57ddced08abc7e0e72a4d365d | |
parent | e476230f0ee46e496da0e23f8352a464530fd6ad (diff) | |
download | scripts-da8a76f3c7a537fb1b155f1ba7d19591845b28bf.tar.gz |
arch-proxy.py: support resumed downloads
Quick hack to support one specific range request type: resume
downloading at some offset until the end. The full file is still
downloaded from upstream as needed.
-rwxr-xr-x | arch-proxy.py | 84 |
1 file changed, 72 insertions(+), 12 deletions(-)
class BadRequest(Exception):
    """Client request cannot be served; handlers answer with HTTP 400."""
    pass


class RequestHandler(http.server.BaseHTTPRequestHandler):
    # NOTE(review): reconstructed from a diff view of arch-proxy.py; only the
    # members fully visible in the hunks are included here.

    def send_ok(self, size, headers=None, cached=False, range_offset=None):
        """Send a success response header: 200, or 206 for a range request.

        size -- total size of the entity in bytes.
        headers -- extra response headers to emit; the caller's dict is
            NOT mutated (fixed: the original used a shared mutable default
            ``headers={}`` and wrote Content-Range into it, polluting the
            default for every later call).
        cached -- only affects the HIT/MISS tag in the access log.
        range_offset -- if not None, the body resumes at this byte offset
            and the reply is 206 with a Content-Range header.

        Raises BadRequest if range_offset lies at or beyond the entity end.
        """
        # Defensive copy: never mutate a caller-supplied dict (or a default).
        headers = dict(headers) if headers else {}
        if range_offset is None:
            code = 200
        else:
            if range_offset >= size:
                # TODO need different status code (416 Requested Range Not
                # Satisfiable rather than the generic 400 this maps to).
                raise BadRequest("416 Requested Range Not Satisfiable")
            code = 206
            headers["Content-Range"] = "bytes %d-%d/%d" % (
                range_offset, size - 1, size)
            size -= range_offset
        self.log_message('"%s" %d %s %s', self.requestline, code, size,
                         "HIT" if cached else "MISS")
        self.send_response_only(code)
        self.send_header('Content-Length', size)
        for k, v in headers.items():
            self.send_header(k, v)
        self.end_headers()
class RequestHandler(http.server.BaseHTTPRequestHandler):
    # NOTE(review): reconstructed from a diff view of arch-proxy.py; only the
    # range-handling helpers fully visible in the hunks are included here.

    def parse_range(self):
        """Return the start offset of a ``bytes=N-`` Range header, or None.

        Only the "resume from offset N to end" form is understood; any
        other Range syntax raises BadRequest. Absent header -> None.
        """
        header = self.headers.get('Range')
        if header is None:
            return None
        # Only support "continue" range requests, resuming a previous
        # download. Anything more complex is not needed at the moment.
        match = re.match(r'bytes=(?P<from>\d+)-$', header)
        if not match:
            raise BadRequest("Unsupported range request: %s" % header)
        return int(match.group("from"))

    @staticmethod
    def skip_range_chunk(chunk, skip_bytes):
        """Drop up to skip_bytes leading bytes from one chunk.

        Returns (remaining_chunk, remaining_skip); remaining_skip is None
        once there is nothing left to skip.
        """
        if not skip_bytes:
            return chunk, None
        if len(chunk) > skip_bytes:
            return chunk[skip_bytes:], None
        # Whole chunk consumed by the skip; carry the remainder forward.
        return b'', (skip_bytes - len(chunk)) or None

    @classmethod
    def skip_range(cls, it, skip_bytes):
        """Yield chunks from *it* with the first skip_bytes bytes removed."""
        remaining = skip_bytes
        for chunk in it:
            chunk, remaining = cls.skip_range_chunk(chunk, remaining)
            if chunk:
                yield chunk
StopIteration if not head_only and remote_data: @@ -111,15 +162,18 @@ class RequestHandler(http.server.BaseHTTPRequestHandler): with self.open_write_cache(path) as cache_file: cache_ok = cache_file is not None if cache_ok: + skip = range_offset for chunk in remote_data: cache_file.write(chunk) - yield chunk + chunk, skip = self.skip_range_chunk(chunk, skip) + if chunk: + yield chunk if cache_ok: # Write was successful, now fix mtime and rename self.finish_cache(mtime_pointer[0]) else: # Cache file unavailable, just pass all data - yield from remote_data + yield from self.skip_range(remote_data, range_offset) def do_GET(self): try: @@ -127,6 +181,9 @@ class RequestHandler(http.server.BaseHTTPRequestHandler): if data: for chunk in data: self.wfile.write(chunk) + except BadRequest as e: + self.log_error("GET %s - Bad Request: %s", self.path, e) + self.send_response(400) except Exception as e: self.log_error("GET %s failed: %s", self.path, e) import traceback; traceback.print_exc() @@ -135,6 +192,9 @@ class RequestHandler(http.server.BaseHTTPRequestHandler): def do_HEAD(self): try: list(self.request_data_with_cache(True)) + except BadRequest as e: + self.log_error("GET %s - Bad Request: %s", self.path, e) + self.send_response(400) except Exception as e: self.log_error("HEAD %s failed: %s", self.path, e) import traceback; traceback.print_exc() |