git » repo-mgmt.git » commit 86f7c56

Finally: the old packages cleanup crew is here

author Urja (ARMLFS builder)
2025-11-09 15:25:30 UTC
committer Urja (ARMLFS builder)
2025-11-09 15:25:30 UTC
parent f4165e61d024c062f248cd83b2d3286fa057537c

Finally: the old packages cleanup crew is here

and we have rules for what we keep (y)
(at least: 3 packages, 6 months, 100 MB)

cleanup-old-pkg.py +319 -0
srclist.sh +71 -0

diff --git a/cleanup-old-pkg.py b/cleanup-old-pkg.py
new file mode 100755
index 0000000..c65d468
--- /dev/null
+++ b/cleanup-old-pkg.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import subprocess
+from subprocess import DEVNULL, PIPE, STDOUT
+
class Return:
    """Wrap a subprocess result: truthy on exit status 0, str() gives output.

    str() returns the captured stdout when there is any, otherwise the
    numeric exit status.
    """

    def __init__(self, rv, out=None):
        self.rv = rv    # child exit status
        self.out = out  # captured stdout (or None when not captured)

    def __bool__(self):
        # Success if and only if the child exited with status 0.
        return not self.rv

    def __int__(self):
        return self.rv

    def __str__(self):
        # Prefer non-empty captured output; fall back to the exit status.
        return self.out if self.out else str(self.rv)
+
def sub(*args, **kwargs):
    """Run a subprocess and wrap its exit status and stdout in a Return."""
    proc = subprocess.run(*args, **kwargs)
    return Return(proc.returncode, proc.stdout)
+
def subc(*args, **kwargs):
    """Run a subprocess like sub(), but abort the whole script on failure."""
    proc = subprocess.run(*args, **kwargs)
    if proc.returncode:
        print("subprocess failed: ", args)
        print("code:", proc.returncode)
        sys.exit(1)
    return Return(proc.returncode, proc.stdout)
+
def builddir(pkg):
    """Return the 'builddir = ...' value from a package's .BUILDINFO.

    Extracts only the first .BUILDINFO member from the archive; returns
    None when no builddir line is present.
    """
    info = subc(
        ["tar", "-xO", "--force-local", "--occurrence=1", "-f", pkg, ".BUILDINFO"],
        capture_output=True, text=True)
    prefix = "builddir = "
    for line in str(info).splitlines():
        if line.startswith(prefix):
            return line[len(prefix):].strip()
    return None
+
+
# this is os.walk, simplified, but yielding DirEntries, and only for non-dirs,
# one by one.
def filewalk(top):
    """Yield (dirpath, os.DirEntry) for every non-directory under top.

    Depth-first traversal driven by an explicit stack. Subdirectories are
    pushed in reverse so that pop() visits them in scandir() order.

    (Fix: reindented with 4 spaces — the original used tabs, inconsistent
    with the rest of this file.)
    """
    stack = [top]
    while stack:
        top = stack.pop()
        dirs = []
        with os.scandir(top) as entries:
            for entry in entries:
                if entry.is_dir():
                    dirs.append(entry.name)
                else:
                    yield top, entry
        for dirname in reversed(dirs):
            new_path = os.path.join(top, dirname)
            stack.append(new_path)
+
def srclist(bdir):
    """Return the source filenames ./srclist.sh reports for build dir bdir.

    srclist.sh prints lines of the form "<pkgbase> <filename>"; only the
    filename part is kept.
    """
    info = subc(["./srclist.sh", bdir], capture_output=True, text=True)
    files = []
    for line in str(info).splitlines():
        line = line.strip()
        if line and " " in line:
            files.append(line.split(" ", maxsplit=1)[1])
    return files
+    
+
# Root of the released package tree; files live at <releasepkg>/<arch>/<repo>/.
releasepkg="/mnt/nfs/pkg"
+
+
class Pkg:
    """All on-disk files of one package, grouped by version string.

    Size/date information is computed lazily by _analyze() (it needs a
    stat() per file) and cached until another file is added.
    """

    def __init__(self, e, ver):
        # files maps version string -> list of directory entries.
        self.files = {ver: [e]}
        self._builddir = None
        self._builddirs = []
        self.analyzed = False

    def add(self, e, ver):
        """Record one more file for version ver; invalidates cached stats."""
        self.analyzed = False
        self.files.setdefault(ver, []).append(e)

    def __len__(self):
        """Number of distinct versions present."""
        return len(self.files)

    def _analyze(self):
        """Compute things that need stat() of the files: size and age."""
        self.sizes = {}
        self.dates = {}
        newest = 0
        oldest = None
        total_size = 0
        for ver, entries in self.files.items():
            vsize = sum(e.stat().st_size for e in entries)
            # A version's date is the mtime of its newest file.
            vnewest = max((e.stat().st_mtime for e in entries), default=0)
            self.sizes[ver] = vsize
            self.dates[ver] = vnewest
            total_size += vsize
            if not oldest or vnewest < oldest:
                oldest = vnewest
            if vnewest > newest:
                newest = vnewest
        self.total_size = total_size
        self.newest = newest
        self.oldest = oldest
        self.analyzed = True

    def size(self):
        """Total bytes across all versions (cached)."""
        if not self.analyzed:
            self._analyze()
        return self.total_size

    def age(self):
        """Seconds between the newest and the oldest version's timestamps."""
        if not self.analyzed:
            self._analyze()
        return self.newest - self.oldest

    def newest_ver(self):
        """Version string carrying the most recent file mtime."""
        if not self.analyzed:
            self._analyze()
        for ver, date in self.dates.items():
            if date == self.newest:
                return ver
        return None

    def builddir(self):
        """Lazily read (and cache) the builddir of the newest package file."""
        if not self._builddir:
            self._builddir = builddir(self.files[self.newest_ver()][0].path)
            self._builddirs = [self._builddir]
        return self._builddir

    def builddirs(self):
        """All known build directories (merged packages may have several)."""
        if not self._builddir:
            return [self.builddir()]
        if len(self._builddirs) <= 1:
            return [self._builddir]
        return self._builddirs

    def add_builddir(self, dir):
        """Record an additional build directory, resolving our own first."""
        if len(self._builddirs) < 1:
            self.builddir()
        if dir not in self._builddirs:
            self._builddirs.append(dir)

    def merge(self, other):
        """Fold every file of another Pkg into this one."""
        for ver, entries in other.files.items():
            for e in entries:
                self.add(e, ver)
+
+
def fuzzy_builddir_match(pkg1, pkg2):
    """Heuristically decide whether two packages share a source tree.

    Matches when pkg2's builddir is already known to pkg1, when both
    builddirs are equal, or when pkg2's builddir is "<pkg1-dir>-<suffix>"
    (e.g. firefox vs firefox-i18n) — except when pkg1's builddir contains
    "linux", where that suffix rule would be too aggressive.
    """
    dir2 = pkg2.builddir()
    if dir2 in pkg1.builddirs():
        return True
    dir1 = pkg1.builddir()
    if dir1 == dir2:
        return True
    if "-" in dir2 and "linux" not in dir1:
        base, _suffix = dir2.rsplit("-", maxsplit=1)
        if dir1 == base:
            return True
    return False
+
pkgs = {}

# Walk the whole release tree and group every package file by (repo, pkg).
for root, entry in filewalk(releasepkg):
    suffix = ".pkg.tar.xz"
    dbgsuf = "-dbginfo"
    if entry.name.endswith(suffix):
        if not entry.is_file():
            continue
        # Directory layout is .../<arch>/<repo>/<pkgfile>.
        _,arch,repo = root.rsplit("/",maxsplit=2)
        # Filenames are <pkg>-<ver>-<rel>-<pkgarch>.pkg.tar.xz.
        pkg,ver,rel,pkgarch = entry.name[:-len(suffix)].rsplit("-",maxsplit=3)
        pkgver = f"{ver}-{rel}" # Simpler to reassemble than skip splitting there
        dbg = False

        # Debug-info packages are filed under their parent package's name.
        if pkg.endswith(dbgsuf):
            dbg = True
            pkg = pkg[:-len(dbgsuf)]

        #print(arch, repo, pkg, pkgver, pkgarch, dbg)

        id = (repo, pkg)
        if id not in pkgs.keys():
            pkgs[id] = Pkg(entry, pkgver)
        else:
            pkgs[id].add(entry, pkgver)
+
# Rules:
keep_versions = 3
keep_age = 6 * 30 * 24 * 60 * 60  # seconds
keep_size = 100 * 1024 * 1024     # bytes
# These mean:
# - we always keep at least 3 versions.
# - we always keep packages for at least 6 months
# - we always keep at least 100 MB of versions for a package
#  (NOTE: the size does count the dbginfo and all arches)

# Only packages with more than keep_versions versions can have anything
# removed at all; everything else is dropped from consideration here.
candidates = {}
for id, pkg in pkgs.items():
    if len(pkg) <= keep_versions:
        continue
    candidates[id] = pkg
+
# This process brings split packages back together
# and with fuzzy_builddir_match we also bring together version-synced packages
# like firefox and firefox-i18n
#
# NOTE: candidates is mutated inside the iteration, which is why both loops
# break out immediately after a merge and the while restarts from scratch.
merged = True
while merged:
    merged = False
    for id, pkg in candidates.items():
        for id2, other in candidates.items():
            if id == id2: # do not merge myself into itself...
                continue
            if id[0] != id2[0]: # repo must match
                continue
            if pkg.newest_ver() == other.newest_ver() and fuzzy_builddir_match(pkg, other):
                # The merged id keeps the repo plus all package names involved.
                newid = tuple([id[0]] + list(id[1:]) + list(id2[1:]))
                pkg.merge(other)
                if other.builddir() not in pkg.builddirs():
                    pkg.add_builddir(other.builddir())
                candidates[newid] = pkg
                del candidates[id]
                del candidates[id2]
                merged = True
                break
        if merged:
            break
+
# Only packages that are BOTH older than keep_age AND larger than keep_size
# actually get cleaned up.
finalists = {}
for id, pkg in candidates.items():
    if pkg.age() <= keep_age:
        continue
    if pkg.size() <= keep_size:
        continue
    finalists[id] = pkg
    #print(id, len(pkg), pkg.age() / 86400, pkg.size() / (1024*1024))
+
+
# For each finalist, pop versions oldest-first until removing another one
# would violate any of the keep rules. Terminates because every pop shrinks
# len(pkg) towards keep_versions.
removals = {}
for id, pkg in finalists.items():
    # Sorted newest-first, so versions.pop() yields the oldest remaining one.
    versions = sorted(pkg.dates.items(), key=lambda e: e[1], reverse=True)
    compliant = False
    rmlist = []
    while not compliant:
        ver,date = versions.pop()
        rmlist += pkg.files[ver]
        del pkg.files[ver]
        # Re-run the stats on what is left after this removal.
        pkg._analyze()
        if len(pkg) <= keep_versions:
            compliant = True
        if pkg.age() <= keep_age:
            compliant = True
        if pkg.size() <= keep_size:
            compliant = True
    # (builddirs, newest removed version, its date, files to delete);
    # ver/date hold the last version popped above.
    rminfo = (pkg.builddirs(), ver, date, rmlist)
    removals[id] = rminfo
    #print(len(pkg), pkg.age() / 86400, pkg.size() / (1024*1024))
    #print(removals[id])
+
+
# Extend each removal with source archives that are no longer referenced by
# the current PKGBUILD and are not newer than the newest removed version.
for id, info in removals.items():
    # Do not cleanup core sources, they're arranged differently.
    repo = id[0]
    if repo == "core":
        continue
    rmsrc = []
    for dir in info[0]:
        # Files the current PKGBUILD still references must be kept.
        keepfiles = srclist(dir)
        basedir = os.path.basename(dir)
        srcarchives = f"/sources/archives/{repo}/{basedir}"
        # Fix: a (possibly merged/fuzzy-matched) builddir may have no archived
        # sources at all — skip it instead of crashing in os.scandir().
        if not os.path.isdir(srcarchives):
            continue
        with os.scandir(srcarchives) as entries:
            for entry in entries:
                if entry.is_dir():
                    continue
                if entry.name in keepfiles:
                    continue
                # Keep anything newer than the newest version being removed.
                if entry.stat().st_mtime > info[2]:
                    continue
                rmsrc.append(entry)
                info[3].append(entry)
    #print(id, rmsrc)
+
# Show everything that would be deleted, wait for operator confirmation,
# then unlink the files (plus the .sig next to each package file).
if len(removals):
    print(f"Removing old packages/sources for {len(removals)} packages:")
    for id, info in removals.items():
        idstr = id[0] + ":" + ",".join(id[1:])
        print(f"{len(info[3])} files for {idstr} - version {info[1]} and older")
        for e in info[3]:
            print(f"    {e.path}")
    print("Press enter to continue, Ctrl-C to abort.")
    _ = input()

    for id, info in removals.items():
        for e in info[3]:
            # Package files have a detached signature alongside them.
            if e.path.endswith(".pkg.tar.xz"):
                sigfile = e.path + ".sig"
                if os.path.exists(sigfile):
                    os.unlink(sigfile)
            os.unlink(e.path)
    print("Done.")
else:
    print("Nothing to clean up.")
diff --git a/srclist.sh b/srclist.sh
new file mode 100755
index 0000000..532fa36
--- /dev/null
+++ b/srclist.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
# extract the protocol from a source entry - return "local" for local sources
get_protocol() {
	if [[ $1 = *://* ]]; then
		# drop any "filename::" prefix
		local proto="${1#*::}"
		# keep only the part before "://"
		proto="${proto%%://*}"
		# for "proto+uri" schemes (e.g. git+https) keep just the proto
		printf "%s\n" "${proto%%+*}"
	elif [[ $1 = *lp:* ]]; then
		# launchpad entries look like "proto+lp:path"
		local proto="${1#*::}"
		printf "%s\n" "${proto%%+lp:*}"
	else
		printf "%s\n" local
	fi
}
+
# extract the filename from a source entry
get_filename() {
	local netfile=$1

	# if a filename is specified ("name::url"), use it
	if [[ $netfile = *::* ]]; then
		printf "%s\n" "${netfile%%::*}"
		return
	fi

	local proto=$(get_protocol "$netfile")
	local filename

	case $proto in
		bzr|fossil|git|hg|svn)
			# VCS sources: last path component, minus #fragment / ?query
			filename=${netfile%%#*}
			filename=${filename%%\?*}
			filename=${filename%/}
			filename=${filename##*/}
			if [[ $proto = bzr ]]; then
				filename=${filename#*lp:}
			fi
			if [[ $proto = fossil ]]; then
				filename=$filename.fossil
			fi
			if [[ $proto = git ]]; then
				filename=${filename%%.git*}
			fi
			;;
		*)
			# if it is just an URL, we only keep the last component
			filename="${netfile##*/}"
			;;
	esac
	# Fix: this printed "$(unknown)" (running a nonexistent command and
	# emitting nothing) instead of the computed filename.
	printf "%s\n" "$filename"
}
+
+
# For every build directory given on the command line, print one
# "<dirname> <sourcefile>" line per non-local source its PKGBUILD uses.
for arg; do
  # Fix: quote "$arg" so directories with spaces/globs work.
  if [ ! -e "$arg/PKGBUILD" ]; then
    continue
  fi

  # Subshell so the cd does not leak; bail out of it if cd fails.
  (cd "$arg" || exit
  # makepkg runs as the unprivileged builder user; word-splitting of the
  # source list is intentional (one word per source entry).
  for src in $(su builder -c "makepkg --printsrcinfo" | grep "^	source = " | cut -d ' ' -f 3-); do
    p=$(get_protocol "$src")
    if [ "$p" = "local" ]; then
      continue
    fi
    name=$(get_filename "$src")
    echo "$(basename "$(pwd)")" "$name"
  done
  )
done