git » repo-mgmt.git » main » tree

[main] / cleanup-old-pkg.py

#!/usr/bin/env python3

import os
import sys
import subprocess
from subprocess import DEVNULL, PIPE, STDOUT

class Return:
    """Wrapper around a subprocess result.

    Truthy exactly when the exit code was 0; int() yields the exit
    code; str() yields the captured stdout (or the code when there is
    no non-empty output).
    """

    def __init__(self, rv, out=None):
        self.rv = rv
        self.out = out

    def __bool__(self):
        return not self.rv

    def __int__(self):
        return self.rv

    def __str__(self):
        # Fall back to the numeric exit code when stdout is absent/empty.
        return self.out if self.out else str(self.rv)

def sub(*args, **kwargs):
    """Run a subprocess; wrap its exit code and stdout in a Return."""
    completed = subprocess.run(*args, **kwargs)
    return Return(completed.returncode, completed.stdout)

def subc(*args, **kwargs):
    """Like sub(), but abort the whole script when the command fails."""
    completed = subprocess.run(*args, **kwargs)
    if completed.returncode:
        print("subprocess failed: ", args)
        print("code:", completed.returncode)
        sys.exit(1)
    return Return(completed.returncode, completed.stdout)

def builddir(pkg):
    """Return the 'builddir = ...' value from pkg's .BUILDINFO, or None.

    The archive member is streamed to stdout by tar; only the first
    occurrence of .BUILDINFO is extracted.
    """
    info = subc(["tar", "-xO", "--force-local", "--occurrence=1",
                 "-f", pkg, ".BUILDINFO"],
                capture_output=True, text=True)
    marker = "builddir = "
    values = (ln[len(marker):].strip()
              for ln in str(info).splitlines() if ln.startswith(marker))
    return next(values, None)


def filewalk(top):
    """Simplified os.walk: yield (dirpath, DirEntry) for every non-dir.

    Walks *top* depth-first, yielding one ``(dirpath, entry)`` pair per
    non-directory entry; directories are only descended into, never
    yielded. (Original used tab indentation — normalized to the file's
    4-space style.)
    """
    stack = [top]
    while stack:
        top = stack.pop()
        dirs = []
        with os.scandir(top) as entries:
            for entry in entries:
                if entry.is_dir():
                    dirs.append(entry.name)
                else:
                    yield top, entry
        # Push in reverse so popping visits subdirectories in scandir order.
        for dirname in reversed(dirs):
            new_path = os.path.join(top, dirname)
            stack.append(new_path)

def srclist(bdir):
    """Run ./srclist.sh for a build dir; return the listed file names.

    Each useful output line has the form "<prefix> <filename>"; the
    part after the first space is collected.
    """
    info = subc(["./srclist.sh", bdir], capture_output=True, text=True)
    files = []
    for raw in str(info).splitlines():
        line = raw.strip()
        if line and " " in line:
            files.append(line.split(" ", maxsplit=1)[1])
    return files
    

# Root of the released-package tree that is scanned for old versions.
releasepkg="/mnt/nfs/pkg"


class Pkg:
    """All on-disk files of one (repo, package) pair, grouped by version.

    Per-version size/mtime aggregates are computed lazily by _analyze()
    and invalidated whenever new files are added. Also tracks the build
    directory(ies) the package was built from.
    """

    def __init__(self, e, ver):
        # version string -> list of os.DirEntry objects for that version
        self.files = {ver: [e]}
        self._builddir = None
        self._builddirs = []
        self.analyzed = False

    def add(self, e, ver):
        """Register another file entry under the given version."""
        self.analyzed = False  # cached aggregates are now stale
        self.files.setdefault(ver, []).append(e)

    def __len__(self): # How many versions?
        return len(self.files)

    def _analyze(self):
        """Compute things that need stat() of the files: size and age."""
        self.sizes = {}
        self.dates = {}
        newest = 0
        oldest = None
        total_size = 0
        for ver, entries in self.files.items():
            vsiz = 0
            vnew = 0
            for e in entries:
                st = e.stat()  # stat once; size and mtime come together
                vsiz += st.st_size
                if st.st_mtime > vnew:
                    vnew = st.st_mtime
            self.sizes[ver] = vsiz
            self.dates[ver] = vnew
            total_size += vsiz
            # "is None", so a legitimate mtime of 0 is not mistaken
            # for "nothing seen yet".
            if oldest is None or vnew < oldest:
                oldest = vnew
            if vnew > newest:
                newest = vnew
        self.total_size = total_size
        self.newest = newest
        self.oldest = oldest
        self.analyzed = True

    def size(self):
        """Total byte size of all files across all versions."""
        if not self.analyzed:
            self._analyze()
        return self.total_size

    def age(self):
        """Seconds between the newest and the oldest version's mtime."""
        if not self.analyzed:
            self._analyze()
        return self.newest - self.oldest

    def newest_ver(self):
        """Version string whose files carry the most recent mtime."""
        if not self.analyzed:
            self._analyze()
        for ver, date in self.dates.items():
            if date == self.newest:
                return ver
        return None

    def builddir(self):
        """Build dir from the newest version's .BUILDINFO (cached).

        Extracted lazily via the module-level builddir() helper, which
        runs tar on the package file.
        """
        if self._builddir:
            return self._builddir
        self._builddir = builddir(self.files[self.newest_ver()][0].path)
        self._builddirs = [self._builddir]
        return self._builddir

    def builddirs(self):
        """All known build directories (primary plus any merged in)."""
        if not self._builddir:
            return [self.builddir()]
        if len(self._builddirs) <= 1:
            return [self._builddir]
        return self._builddirs

    def add_builddir(self, dir):
        """Record an extra build directory (from a merged package)."""
        if len(self._builddirs) < 1:
            self.builddir()  # ensure the primary dir is resolved first
        if dir not in self._builddirs:
            self._builddirs.append(dir)

    def merge(self, other):
        """Absorb all of other's file entries into this package."""
        for ver, entries in other.files.items():
            for e in entries:
                self.add(e, ver)


def fuzzy_builddir_match(pkg1, pkg2):
    """True when pkg2's build dir matches pkg1's, possibly fuzzily.

    A fuzzy match means pkg2's dir is pkg1's dir plus a "-suffix"
    (e.g. firefox vs firefox-i18n). The fuzzy rule is skipped when
    pkg1's dir contains "linux", to keep kernel variants apart.
    """
    dir2 = pkg2.builddir()
    if dir2 in pkg1.builddirs():
        return True
    dir1 = pkg1.builddir()
    if dir1 == dir2:
        return True
    if "-" not in dir2 or "linux" in dir1:
        return False
    base, _suffix = dir2.rsplit("-", maxsplit=1)
    return dir1 == base

pkgs = {}

# Index every released package file by (repo, package-name).
for root, entry in filewalk(releasepkg):
    suffix = ".pkg.tar.xz"
    dbgsuf = "-dbginfo"
    if not entry.name.endswith(suffix):
        continue
    if not entry.is_file():
        continue
    _, arch, repo = root.rsplit("/", maxsplit=2)
    pkg, ver, rel, pkgarch = entry.name[:-len(suffix)].rsplit("-", maxsplit=3)
    pkgver = f"{ver}-{rel}" # Simpler to reassemble than skip splitting there
    # Debug-info packages are folded into their parent package's entry.
    dbg = pkg.endswith(dbgsuf)
    if dbg:
        pkg = pkg[:-len(dbgsuf)]

    key = (repo, pkg)
    if key in pkgs:
        pkgs[key].add(entry, pkgver)
    else:
        pkgs[key] = Pkg(entry, pkgver)

# Rules:
keep_versions = 3
keep_age = 6 * 30 * 24 * 60 * 60  # six months, in seconds
keep_size = 100 * 1024 * 1024     # 100 MiB
# These mean:
# - we always keep atleast 3 versions.
# - we always keep packages for atleast 6 months
# - we always keep atleast 100 MB of versions for a package
#  (NOTE: the size does count the dbginfo and all arches)

# Only packages with more versions than we must keep can lose anything.
candidates = {key: p for key, p in pkgs.items() if len(p) > keep_versions}

# This process brings split packages back together
# and with fuzzy_builddir_match we also bring together version-synced packages
# like firefox and firefox-i18n
#
# NOTE: the candidates dict is mutated (newid inserted, id/id2 deleted)
# while it is being iterated; that is only safe because both loops break
# out immediately after the mutation, and the whole scan is restarted
# from scratch via the outer while until a full pass makes no merge.
merged = True
while merged:
    merged = False
    for id, pkg in candidates.items():
        for id2, other in candidates.items():
            if id == id2: # do not merge myself into itself...
                continue
            if id[0] != id2[0]: # repo must match
                continue
            # Same newest version plus (fuzzily) matching build dir
            # => treat as one package for retention purposes.
            if pkg.newest_ver() == other.newest_ver() and fuzzy_builddir_match(pkg, other):
                # Merged key keeps the repo, concatenates all package names.
                newid = tuple([id[0]] + list(id[1:]) + list(id2[1:]))
                pkg.merge(other)
                if other.builddir() not in pkg.builddirs():
                    pkg.add_builddir(other.builddir())
                candidates[newid] = pkg
                del candidates[id]
                del candidates[id2]
                merged = True
                break
        if merged:
            break

# A package is trimmed only when it exceeds every remaining keep rule:
# old enough AND large enough (the version count was filtered earlier).
finalists = {}
for key, p in candidates.items():
    if p.age() > keep_age and p.size() > keep_size:
        finalists[key] = p


# Pick the concrete versions to delete: keep dropping the oldest version
# until removing any more would violate one of the keep rules.
removals = {}
for id, pkg in finalists.items():
    # Sorted newest-first, so .pop() always removes the oldest version.
    versions = sorted(pkg.dates.items(), key=lambda e: e[1], reverse=True)
    compliant = False
    rmlist = []
    while not compliant:
        ver,date = versions.pop()
        rmlist += pkg.files[ver]
        del pkg.files[ver]
        # Recompute the aggregates so the rules see the trimmed file set.
        pkg._analyze()
        if len(pkg) <= keep_versions:
            compliant = True
        if pkg.age() <= keep_age:
            compliant = True
        if pkg.size() <= keep_size:
            compliant = True
    # (build dirs, last removed version, its mtime, entries to delete)
    rminfo = (pkg.builddirs(), ver, date, rmlist)
    removals[id] = rminfo
    #print(len(pkg), pkg.age() / 86400, pkg.size() / (1024*1024))
    #print(removals[id])


# For each trimmed package, also collect stale source archives under
# /sources/archives/<repo>/<builddir>: files that srclist.sh no longer
# lists AND that are not newer than the last removed version's mtime.
for id, info in removals.items():
    # Do not cleanup core sources, they're arranged differently.
    repo = id[0]
    if repo == "core":
        continue
    rmsrc = []
    for dir in info[0]:
        keepfiles = srclist(dir)
        basedir = os.path.basename(dir)
        srcarchives = f"/sources/archives/{repo}/{basedir}"
        with os.scandir(srcarchives) as entries:
            for entry in entries:
                if entry.is_dir():
                    continue
                if entry.name in keepfiles:
                    continue
                # info[2] is the mtime of the last removed version;
                # sources newer than that are kept.
                if entry.stat().st_mtime > info[2]:
                    continue
                rmsrc.append(entry)
                # info[3] is the shared rmlist; actual deletion is below.
                info[3].append(entry)
    #print(id, rmsrc)

if not removals:
    print("Nothing to clean up.")
else:
    # Show everything that will be deleted and wait for confirmation.
    print(f"Removing old packages/sources for {len(removals)} packages:")
    for key, info in removals.items():
        idstr = key[0] + ":" + ",".join(key[1:])
        print(f"{len(info[3])} files for {idstr} - version {info[1]} and older")
        for entry in info[3]:
            print(f"    {entry.path}")
    print("Press enter to continue, Ctrl-C to abort.")
    _ = input()

    # Delete, taking any detached .sig file along with its package.
    for info in removals.values():
        for entry in info[3]:
            if entry.path.endswith(".pkg.tar.xz"):
                sigfile = entry.path + ".sig"
                if os.path.exists(sigfile):
                    os.unlink(sigfile)
            os.unlink(entry.path)
    print("Done.")