From 89d6a7380cb86df2a1caa19f2c8e6c69f212d185 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Fri, 24 Feb 2017 18:07:01 +0100 Subject: arch-proxy.py: quick proxy cache for Arch Linux packages With a freshly bootstrapped VM (using https://github.com/Lekensteyn/archdir), it became tiresome (and wasteful) to download the same packages when my local nginx cache was not available. This script serves as a simple proxy that tries to cache downloaded packages in the current working directory. --- arch-proxy.py | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100755 arch-proxy.py (limited to 'arch-proxy.py') diff --git a/arch-proxy.py b/arch-proxy.py new file mode 100755 index 0000000..e412364 --- /dev/null +++ b/arch-proxy.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +# Arch Linux packages proxy +# +# Proxies requests, caching files with ".pkg.tar.xz" suffix. +# If the cached file exists, serve that file. +# Otherwise, try to download file (as normally) and optionally cache it with +# ".download" extension. If the file exists and the file is not being +# downloaded, resume the download (Range requests). 
import fcntl
import http.server
import os
import socket
from contextlib import closing, contextmanager
from datetime import datetime, timezone

# Listen address (host, port); an empty host binds all IPv4 interfaces.
addr = ('', 8001)
# Packages are cached flat in the directory the proxy was started from.
cache_dir = os.getcwd()

# HTTP date format used by Last-Modified headers (RFC 7231 fixed GMT form).
DATE_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'


def text_to_epoch(text):
    """Convert an HTTP date header value to a Unix timestamp (float)."""
    # The header is always GMT, so attach UTC explicitly instead of letting
    # the naive datetime pick up the local timezone on .timestamp().
    return datetime.strptime(text, DATE_FORMAT) \
                   .replace(tzinfo=timezone.utc).timestamp()


def epoch_to_text(epoch):
    """Convert a Unix timestamp to an HTTP (GMT) date header value."""
    return datetime.fromtimestamp(epoch, timezone.utc).strftime(DATE_FORMAT)


class RequestHandler(http.server.BaseHTTPRequestHandler):
    def send_ok(self, size, headers=None, cached=False):
        """Send a 200 response line, Content-Length and any extra headers."""
        self.log_message('"%s" %d %s %s', self.requestline, 200, size,
                         "HIT" if cached else "MISS")
        self.send_response_only(200)
        self.send_header('Content-Length', size)
        for k, v in (headers or {}).items():
            self.send_header(k, v)
        self.end_headers()

    def request_data(self, head_only=False, mtime_out=None, status_out=None):
        """Forward the request upstream; generator yielding body chunks.

        Sends the response headers to the client as a side effect of
        iteration.  If given, mtime_out[0] receives the parsed upstream
        Last-Modified time and status_out[0] the upstream HTTP status code.
        """
        # Imported lazily: only the network path needs the third-party
        # requests library; the pure helpers work without it installed.
        import requests
        method = "HEAD" if head_only else "GET"
        url = self.get_upstream_url()
        with closing(requests.request(method, url, stream=not head_only)) as r:
            if status_out is not None:
                status_out[0] = r.status_code
            if r.status_code != 200:
                self.log_request(r.status_code)
                self.send_response_only(r.status_code)
                self.end_headers()
                return
            response_headers = {}
            if 'Last-Modified' in r.headers:
                try:
                    mtime = text_to_epoch(r.headers['Last-Modified'])
                    response_headers['Last-Modified'] = epoch_to_text(mtime)
                    if mtime_out:
                        mtime_out[0] = mtime
                except ValueError:
                    self.log_error("Unable to parse Last-Modified header")
            self.send_ok(int(r.headers['Content-Length']), response_headers)
            if not head_only:
                yield from r.iter_content(4096)

    @contextmanager
    def open_write_cache(self, path):
        """Context manager yielding a locked cache temp file, or None.

        None is yielded when the temp file cannot be created or another
        process holds the download lock; callers must check for it.
        """
        temp_path = path + ".download"
        f = None
        try:
            f = open(temp_path, 'wb')
            # Prevent concurrent writers from clobbering each other.
            fcntl.lockf(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError as e:
            if f is not None:
                f.close()
            self.log_error("Failed to create cache file %s: %s", temp_path, e)
            yield None
            return
        # Yield outside the except block: if the caller raises while
        # writing, the exception is thrown in here, and a second yield
        # (as the old code did) would break @contextmanager.
        try:
            yield f
        finally:
            f.close()

    def finish_cache(self, mtime):
        """Publish the finished .download temp file under its final name."""
        path = self.get_local_path()
        temp_path = path + ".download"
        if mtime:
            # Store the upstream Last-Modified time in the file mtime so it
            # can be echoed back on later cache hits.
            os.utime(temp_path, times=(mtime, mtime))
        try:
            os.rename(temp_path, path)
        except OSError as e:
            self.log_error("Failed to rename %s: %s", temp_path, e)
            try:
                os.unlink(temp_path)
            except OSError as e:
                self.log_error("Failed to remove %s: %s", temp_path, e)

    def discard_cache(self):
        """Best-effort removal of a stale .download temp file."""
        temp_path = self.get_local_path() + ".download"
        try:
            os.unlink(temp_path)
        except OSError as e:
            self.log_error("Failed to remove %s: %s", temp_path, e)

    def request_data_with_cache(self, head_only=False):
        """Generator yielding response data, serving and filling the cache."""
        if not self.is_cacheable():
            # Not cacheable: bypass the cache entirely.  Forward head_only
            # so a HEAD does not trigger a full upstream GET.
            yield from self.request_data(head_only=head_only)
            return

        path = self.get_local_path()
        try:
            # Cache hit: the file mtime holds the upstream Last-Modified.
            self.send_ok(os.path.getsize(path),
                         {'Last-Modified': epoch_to_text(os.stat(path).st_mtime)},
                         cached=True)
            if not head_only:
                with open(path, 'rb') as f:
                    yield from f
        except FileNotFoundError:
            # Cache miss: pipe from upstream, writing through to the cache.
            mtime_pointer = [None]
            status_pointer = [None]
            remote_data = self.request_data(head_only=head_only,
                                            mtime_out=mtime_pointer,
                                            status_out=status_pointer)
            if head_only:
                list(remote_data)  # run the generator for its side effects
                return
            with self.open_write_cache(path) as cache_file:
                if cache_file is None:
                    # Cache file unavailable, just pass all data through.
                    yield from remote_data
                    return
                for chunk in remote_data:
                    cache_file.write(chunk)
                    yield chunk
            if status_pointer[0] == 200:
                # Download complete: fix up mtime and move into place.
                self.finish_cache(mtime_pointer[0])
            else:
                # Upstream failed; never publish an empty/partial file
                # (the old code renamed the empty temp file into place).
                self.discard_cache()

    def do_GET(self):
        """Serve a GET request, from the local cache when possible."""
        try:
            for chunk in self.request_data_with_cache():
                self.wfile.write(chunk)
        except Exception as e:
            self.log_error("GET %s failed: %s", self.path, e)
            import traceback; traceback.print_exc()
            self.send_response(502)
            self.end_headers()

    def do_HEAD(self):
        """Serve a HEAD request without transferring body data."""
        try:
            list(self.request_data_with_cache(True))
        except Exception as e:
            self.log_error("HEAD %s failed: %s", self.path, e)
            import traceback; traceback.print_exc()
            self.send_response(502)
            self.end_headers()

    def get_upstream_url(self):
        """Map the request path to the upstream mirror URL."""
        prefix = "http://mirror.nl.leaseweb.net/archlinux/"
        # self.path starts with '/'; strip it to avoid a double slash.
        return prefix + self.path.lstrip('/')

    def get_local_path(self):
        """Local cache file for this request (flat layout in cache_dir)."""
        # basename() also neutralizes any '../' traversal in the request.
        filename = os.path.basename(self.path)
        return os.path.join(cache_dir, filename)

    def is_cacheable(self):
        """Whether the requested file should be cached.

        Package files are immutable, so they are safe to cache; repository
        databases change and must always be fetched fresh.  Accepts both
        xz- and zstd-compressed package suffixes.
        """
        return self.path.endswith((".pkg.tar.xz", ".pkg.tar.zst"))


class SomeServer(http.server.HTTPServer):
    """HTTPServer that reuses its address and supports IPv6 bind hosts."""
    def __init__(self, addr, handler):
        self.allow_reuse_address = True
        if ':' in addr[0]:
            self.address_family = socket.AF_INET6
        super().__init__(addr, handler)


if __name__ == '__main__':
    server = SomeServer(addr, RequestHandler)
    server.serve_forever()