From dedb9339ef45810daff2b068ff8d434927547c27 Mon Sep 17 00:00:00 2001
From: Peter Wu
Date: Tue, 8 May 2018 22:45:04 +0200
Subject: arch-proxy.py: support mirrors and Arch Linux Archive (ALA)

This allows testing older packages while reusing a local package cache.

Fix miscellaneous issues, such as stack traces being printed during
"pacman -Syu" while the cache is up to date (the peer would close the
connection).
---
 arch-proxy.py | 117 ++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 98 insertions(+), 19 deletions(-)

diff --git a/arch-proxy.py b/arch-proxy.py
index 56f231f..9da960c 100755
--- a/arch-proxy.py
+++ b/arch-proxy.py
@@ -27,7 +27,7 @@ class BadRequest(Exception):
     pass
 
 class RequestHandler(http.server.BaseHTTPRequestHandler):
-    def send_ok(self, size, headers={}, cached=False, range_offset=None):
+    def send_ok(self, size, headers={}, upstream=None, range_offset=None):
         if range_offset is None:
             code = 200
         else:
@@ -39,7 +39,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
             headers["Content-Range"] = content_range
             size -= range_offset
         self.log_message('"%s" %d %s %s', self.requestline, code, size,
-                "HIT" if cached else "MISS")
+                "HIT" if upstream is None else "MISS:%s" % (upstream,))
         self.send_response_only(code)
         self.send_header('Content-Length', size)
         for k, v in headers.items():
@@ -47,14 +47,33 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
         self.end_headers()
 
     def request_data(self, head_only=False, mtime_out=None, range_offset=None):
+        """
+        Retrieves the full response body. The given "range_offset" serves only
+        as a hint for the response to the client; it is not used for the
+        upstream request.
+        """
         method = "HEAD" if head_only else "GET"
-        url = self.get_upstream_url()
-        with closing(requests.request(method, url, stream=not head_only)) as r:
-            if r.status_code != 200:
-                self.log_request(r.status_code)
-                self.send_response_only(r.status_code)
-                self.end_headers()
-                return
+        streamable = not head_only
+        status_code = None
+        urls = list(self.get_upstream_urls())
+        # Try each upstream. If one fails, log it and try another. On success,
+        # return the response data. If all upstreams fail, fail the request.
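+        # Note: this method is a generator ("yield from" below), so the
+        # upstream request is only made once the caller starts iterating.
+        # If every upstream fails, the code after the loop reports the
+        # last seen status code to the client.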
+        for i, url in enumerate(urls):
+            with closing(requests.request(method, url, stream=streamable)) as r:
+                status_code = r.status_code
+                if status_code == 200:
+                    yield from self.process_upstream_response(r, head_only,
+                            mtime_out, i, range_offset)
+                    return
+                self.log_message('"%s" %d - SKIP:%d', self.requestline,
+                        status_code, i)
+        self.log_request(status_code)
+        self.send_response_only(status_code)
+        self.end_headers()
+
+    def process_upstream_response(self, r, head_only, mtime_out, upstream,
+            range_offset):
+        if r:
             response_headers = {}
             if 'Last-Modified' in r.headers:
                 try:
@@ -65,7 +84,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
                 except ValueError:
                     self.log_error("Unable to parse Last-Modified header")
             self.send_ok(int(r.headers['Content-Length']), response_headers,
-                    range_offset=range_offset)
+                    upstream=upstream, range_offset=range_offset)
             if not head_only:
                 yield from r.iter_content(4096)
 
@@ -142,7 +161,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
         stat_info = os.stat(path)
         response_headers = {'Last-Modified':
                 epoch_to_text(stat_info.st_mtime)}
-        self.send_ok(stat_info.st_size, response_headers, cached=True,
+        self.send_ok(stat_info.st_size, response_headers,
                 range_offset=range_offset)
         if not head_only:
             with open(path, 'rb') as f:
@@ -162,6 +181,9 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
             with self.open_write_cache(path) as cache_file:
                 cache_ok = cache_file is not None
                 if cache_ok:
+                    # Overwrite the temporary cache file from beginning to
+                    # end, but do not include the first "range_offset" bytes
+                    # in the response.
                     skip = range_offset
                     for chunk in remote_data:
                         cache_file.write(chunk)
@@ -181,6 +203,8 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
             if data:
                 for chunk in data:
                     self.wfile.write(chunk)
+        except (BrokenPipeError, ConnectionResetError):
+            self.log_error("GET %s - (connection aborted)", self.path)
         except BadRequest as e:
             self.log_error("GET %s - Bad Request: %s", self.path, e)
             self.send_response(400)
@@ -192,17 +216,24 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
     def do_HEAD(self):
         try:
             list(self.request_data_with_cache(True))
+        except (BrokenPipeError, ConnectionResetError):
+            self.log_error("HEAD %s - (connection aborted)", self.path)
         except BadRequest as e:
-            self.log_error("GET %s - Bad Request: %s", self.path, e)
+            self.log_error("HEAD %s - Bad Request: %s", self.path, e)
             self.send_response(400)
         except Exception as e:
             self.log_error("HEAD %s failed: %s", self.path, e)
             import traceback; traceback.print_exc()
             self.send_response(502)
 
-    def get_upstream_url(self):
-        prefix = "http://mirror.nl.leaseweb.net/archlinux/"
-        return prefix + self.path
+    def get_upstream_urls(self):
+        # If an old version is requested, retrieve the databases from the
+        # archive mirror and do not fall back.
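+        # For example, with --date 2018/05/08 a request for the illustrative
+        # path /core/os/x86_64/core.db would map to:
+        # https://archive.archlinux.org/repos/2018/05/08/core/os/x86_64/core.db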
+        if self.server.archive_url and self.is_date_sensitive_request():
+            yield self.server.archive_url + self.path
+            return
+        for prefix in self.server.mirrors:
+            yield prefix + self.path
 
     def get_local_path(self):
         filename = os.path.basename(self.path)
@@ -212,6 +243,14 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
         """Whether the requested file should be cached."""
         return self.path.endswith(".pkg.tar.xz")
 
+    def is_date_sensitive_request(self):
+        """Whether the resource is ephemeral."""
+        path = self.path
+        if path.endswith(".sig"):
+            path = path[:-4]
+        suffixes = [".db", ".files", ".abs.tar.gz"]
+        return any(path.endswith(suffix) for suffix in suffixes)
+
 class SomeServer(http.server.HTTPServer):
     def __init__(self, addr, handler, args):
         self.allow_reuse_address = True
@@ -220,14 +259,54 @@ class SomeServer(http.server.HTTPServer):
         super().__init__(addr, handler)
         self.cachedir = args.cachedir
         self.is_readonly = args.readonly
+        self.mirrors = args.mirrors
+        if not args.date:
+            self.archive_url = None
+        else:
+            archive_mirror = "https://archive.archlinux.org/repos/"
+            self.archive_url = archive_mirror + args.date + "/"
+            self.mirrors.append(self.archive_url)
+
+    def dump_config(self):
+        yesno = lambda x: "yes" if x else "no"
+        print("Listen address: %s:%s" % self.socket.getsockname()[:2])
+        print("Cache directory: %s" % self.cachedir)
+        print("Read-only cache: %s" % yesno(self.is_readonly))
+        print("Using archive: %s" % yesno(self.archive_url))
+        print("Mirrors:")
+        for mirror in self.mirrors:
+            print(" %s" % mirror)
+
+def mirror_url(string):
+    scheme = string.split(":", 1)[0]
+    if scheme not in ("http", "https"):
+        raise argparse.ArgumentTypeError("%s is not a valid URL" % string)
+    return string.rstrip("/") + "/"
+
+def parse_date(string):
+    m = re.match(r'^(\d{4})([/-]?)(\d{2})\2(\d{2})$', string)
+    if not m:
+        raise argparse.ArgumentTypeError("%s is not a valid date" % string)
+    y, _, m, d = m.groups()
+    return "%s/%s/%s" % (y, m, d)
 
-parser = argparse.ArgumentParser()
-parser.add_argument("--readonly", action="store_true")
-parser.add_argument("--cachedir", default=os.getcwd())
-parser.add_argument("--port", type=int, default=8001)
+parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument("--readonly", action="store_true",
+        help="Do not write downloaded results to the cache directory")
+parser.add_argument("--cachedir", default=os.getcwd(),
+        help="Cache directory")
+parser.add_argument("--port", type=int, default=8001,
+        help="Listen port")
+parser.add_argument("--date", type=parse_date,
+        help="Provide a repository snapshot from 'yyyy/mm/dd'")
+parser.add_argument("--mirror", dest="mirrors", metavar='URL', nargs="+",
+        type=mirror_url, default=["https://mirror.nl.leaseweb.net/archlinux/"],
+        help="Mirror list")
 
 if __name__ == '__main__':
     args = parser.parse_args()
     addr = ('', args.port)
     server = SomeServer(addr, RequestHandler, args)
+    server.dump_config()
     server.serve_forever()
--
cgit v1.2.1
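
A usage sketch for the patched proxy (hypothetical invocation; the mirror
URL, cache path and date are illustrative, not part of the patch):

    ./arch-proxy.py --port 8001 --cachedir /tmp/pkg-cache \
        --date 2018/05/08 \
        --mirror https://mirror.nl.leaseweb.net/archlinux/

pacman can then be pointed at the proxy via /etc/pacman.d/mirrorlist:

    Server = http://localhost:8001/$repo/os/$arch

With --date set, date-sensitive files (.db, .files, .abs.tar.gz and their
.sig counterparts) are fetched only from the Arch Linux Archive snapshot,
while packages may still be served from the local cache or fall back
across the configured mirrors.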