From dedb9339ef45810daff2b068ff8d434927547c27 Mon Sep 17 00:00:00 2001
From: Peter Wu
Date: Tue, 8 May 2018 22:45:04 +0200
Subject: arch-proxy.py: support mirrors and Arch Linux Archive (ALA)

This allows testing older packages while reusing a local package cache.

Fix miscellaneous issues, such as stack traces being printed during
"pacman -Syu" while the cache is up to date (the peer would close the
connection).
---
 arch-proxy.py | 117 ++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 98 insertions(+), 19 deletions(-)

diff --git a/arch-proxy.py b/arch-proxy.py
index 56f231f..9da960c 100755
--- a/arch-proxy.py
+++ b/arch-proxy.py
@@ -27,7 +27,7 @@ class BadRequest(Exception):
     pass
 
 class RequestHandler(http.server.BaseHTTPRequestHandler):
-    def send_ok(self, size, headers={}, cached=False, range_offset=None):
+    def send_ok(self, size, headers={}, upstream=None, range_offset=None):
         if range_offset is None:
             code = 200
         else:
@@ -39,7 +39,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
             headers["Content-Range"] = content_range
             size -= range_offset
         self.log_message('"%s" %d %s %s', self.requestline, code, size,
-                "HIT" if cached else "MISS")
+                "HIT" if upstream is None else "MISS:%s" % (upstream,))
         self.send_response_only(code)
         self.send_header('Content-Length', size)
         for k, v in headers.items():
@@ -47,14 +47,33 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
         self.end_headers()
 
     def request_data(self, head_only=False, mtime_out=None, range_offset=None):
+        """
+        Retrieves the full response body. The given "range_offset" serves only
+        as a hint for the response to the client; it is not used for the
+        upstream request.
+        """
         method = "HEAD" if head_only else "GET"
-        url = self.get_upstream_url()
-        with closing(requests.request(method, url, stream=not head_only)) as r:
-            if r.status_code != 200:
-                self.log_request(r.status_code)
-                self.send_response_only(r.status_code)
-                self.end_headers()
-                return
+        streamable = not head_only
+        status_code = None
+        urls = list(self.get_upstream_urls())
+        # Try each upstream. If one fails, log it and try another. On success,
+        # return the response data. If all upstreams fail, fail the request.
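+        # Note: this method is a generator ("yield from" below), so the
+        # upstream request is only made once the caller starts iterating.
+        # If every upstream fails, the code after the loop reports the
+        # last seen status code to the client.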
+        for i, url in enumerate(urls):
+            with closing(requests.request(method, url, stream=streamable)) as r:
+                status_code = r.status_code
+                if status_code == 200:
+                    yield from self.process_upstream_response(r, head_only,
+                            mtime_out, i, range_offset)
+                    return
+                self.log_message('"%s" %d - SKIP:%d', self.requestline,
+                        status_code, i)
+        self.log_request(status_code)
+        self.send_response_only(status_code)
+        self.end_headers()
+
+    def process_upstream_response(self, r, head_only, mtime_out, upstream,
+            range_offset):
+        if r:
             response_headers = {}
             if 'Last-Modified' in r.headers:
                 try:
@@ -65,7 +84,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
                 except ValueError:
                     self.log_error("Unable to parse Last-Modified header")
             self.send_ok(int(r.headers['Content-Length']), response_headers,
-                    range_offset=range_offset)
+                    upstream=upstream, range_offset=range_offset)
             if not head_only:
                 yield from r.iter_content(4096)
 
@@ -142,7 +161,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
         stat_info = os.stat(path)
         response_headers = {'Last-Modified':
                 epoch_to_text(stat_info.st_mtime)}
-        self.send_ok(stat_info.st_size, response_headers, cached=True,
+        self.send_ok(stat_info.st_size, response_headers,
                 range_offset=range_offset)
         if not head_only:
             with open(path, 'rb') as f:
@@ -162,6 +181,9 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
             with self.open_write_cache(path) as cache_file:
                 cache_ok = cache_file is not None
                 if cache_ok:
+                    # Overwrite the temporary cache file from beginning to
+                    # end, but do not include the first "range_offset" bytes
+                    # in the response.
                     skip = range_offset
                     for chunk in remote_data:
                         cache_file.write(chunk)
@@ -181,6 +203,8 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
             if data:
                 for chunk in data:
                     self.wfile.write(chunk)
+        except (BrokenPipeError, ConnectionResetError):
+            self.log_error("GET %s - (connection aborted)", self.path)
         except BadRequest as e:
             self.log_error("GET %s - Bad Request: %s", self.path, e)
             self.send_response(400)
@@ -192,17 +216,24 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
     def do_HEAD(self):
         try:
             list(self.request_data_with_cache(True))
+        except (BrokenPipeError, ConnectionResetError):
+            self.log_error("HEAD %s - (connection aborted)", self.path)
         except BadRequest as e:
-            self.log_error("GET %s - Bad Request: %s", self.path, e)
+            self.log_error("HEAD %s - Bad Request: %s", self.path, e)
             self.send_response(400)
         except Exception as e:
             self.log_error("HEAD %s failed: %s", self.path, e)
             import traceback; traceback.print_exc()
             self.send_response(502)
 
-    def get_upstream_url(self):
-        prefix = "http://mirror.nl.leaseweb.net/archlinux/"
-        return prefix + self.path
+    def get_upstream_urls(self):
+        # If an old version is requested, retrieve the databases from the
+        # archive mirror and do not fall back.
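+        # For example, with --date 2018/05/08 a request for the illustrative
+        # path /core/os/x86_64/core.db would map to:
+        # https://archive.archlinux.org/repos/2018/05/08/core/os/x86_64/core.db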
+        if self.server.archive_url and self.is_date_sensitive_request():
+            yield self.server.archive_url + self.path
+            return
+        for prefix in self.server.mirrors:
+            yield prefix + self.path
 
     def get_local_path(self):
         filename = os.path.basename(self.path)
@@ -212,6 +243,14 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
         """Whether the requested file should be cached."""
         return self.path.endswith(".pkg.tar.xz")
 
+    def is_date_sensitive_request(self):
+        """Whether the resource is ephemeral."""
+        path = self.path
+        if path.endswith(".sig"):
+            path = path[:-4]
+        suffixes = [".db", ".files", ".abs.tar.gz"]
+        return any(path.endswith(suffix) for suffix in suffixes)
+
 class SomeServer(http.server.HTTPServer):
     def __init__(self, addr, handler, args):
         self.allow_reuse_address = True
@@ -220,14 +259,54 @@ class SomeServer(http.server.HTTPServer):
         super().__init__(addr, handler)
         self.cachedir = args.cachedir
         self.is_readonly = args.readonly
+        self.mirrors = args.mirrors
+        if not args.date:
+            self.archive_url = None
+        else:
+            archive_mirror = "https://archive.archlinux.org/repos/"
+            self.archive_url = archive_mirror + args.date + "/"
+            self.mirrors.append(self.archive_url)
+
+    def dump_config(self):
+        yesno = lambda x: "yes" if x else "no"
+        print("Listen address: %s:%s" % self.socket.getsockname()[:2])
+        print("Cache directory: %s" % self.cachedir)
+        print("Read-only cache: %s" % yesno(self.is_readonly))
+        print("Using archive: %s" % yesno(self.archive_url))
+        print("Mirrors:")
+        for mirror in self.mirrors:
+            print(" %s" % mirror)
+
+def mirror_url(string):
+    scheme = string.split(":", 1)[0]
+    if scheme not in ("http", "https"):
+        raise argparse.ArgumentTypeError("%s is not a valid URL" % string)
+    return string.rstrip("/") + "/"
+
+def parse_date(string):
+    m = re.match(r'^(\d{4})([/-]?)(\d{2})\2(\d{2})$', string)
+    if not m:
+        raise argparse.ArgumentTypeError("%s is not a valid date" % string)
+    y, _, m, d = m.groups()
+    return "%s/%s/%s" % (y, m, d)
 
-parser = argparse.ArgumentParser()
-parser.add_argument("--readonly", action="store_true")
-parser.add_argument("--cachedir", default=os.getcwd())
-parser.add_argument("--port", type=int, default=8001)
+parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument("--readonly", action="store_true",
+        help="Do not write downloaded results to the cache directory")
+parser.add_argument("--cachedir", default=os.getcwd(),
+        help="Cache directory")
+parser.add_argument("--port", type=int, default=8001,
+        help="Listen port")
+parser.add_argument("--date", type=parse_date,
+        help="Provide a repository snapshot from 'yyyy/mm/dd'")
+parser.add_argument("--mirror", dest="mirrors", metavar='URL', nargs="+",
+        type=mirror_url, default=["https://mirror.nl.leaseweb.net/archlinux/"],
+        help="Mirror list")
 
 if __name__ == '__main__':
     args = parser.parse_args()
     addr = ('', args.port)
     server = SomeServer(addr, RequestHandler, args)
+    server.dump_config()
     server.serve_forever()
--
cgit v1.2.1
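
A usage sketch for the patched proxy (hypothetical invocation; the mirror
URL, cache path and date are illustrative, not part of the patch):

    ./arch-proxy.py --port 8001 --cachedir /tmp/pkg-cache \
        --date 2018/05/08 \
        --mirror https://mirror.nl.leaseweb.net/archlinux/

pacman can then be pointed at the proxy via /etc/pacman.d/mirrorlist:

    Server = http://localhost:8001/$repo/os/$arch

With --date set, date-sensitive files (.db, .files, .abs.tar.gz and their
.sig counterparts) are fetched only from the Arch Linux Archive snapshot,
while packages may still be served from the local cache or fall back
across the configured mirrors.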