From d5d74dab010eace0a7e1219c16fe1879063764f0 Mon Sep 17 00:00:00 2001
From: Daniel Alley <dalley@redhat.com>
Date: Wed, 27 May 2026 23:16:11 -0400
Subject: [PATCH] Sync optimization: do existing content check in first stage

Build a cache of existing packages from the previous repository version.
In the event that the repository version being created is substantially
similar to the previous one (very likely), this should reduce the number
of database queries required in the QueryExistingContents stage, which
can discriminate between Content objects that are yet to be saved and
ones which were already saved.

Assisted-By: claude-opus-4.6
---
 pulp_rpm/app/tasks/synchronizing.py | 123 ++++++++++++++++++++--------
 1 file changed, 88 insertions(+), 35 deletions(-)

diff --git a/pulp_rpm/app/tasks/synchronizing.py b/pulp_rpm/app/tasks/synchronizing.py
index 3383e95dd..3afa8c7b6 100644
--- a/pulp_rpm/app/tasks/synchronizing.py
+++ b/pulp_rpm/app/tasks/synchronizing.py
@@ -1388,6 +1388,29 @@ def score_grouping(items):
             "total": total_packages,
         }
         async with ProgressReport(**progress_data) as packages_pb:
+            # Pre-load existing packages from the latest repo version keyed by pkgId.
+            # Cache hits reuse the saved model object, causing QueryExistingContents to
+            # skip them (because _state.adding is False on already-saved objects).
+            def _build_existing_packages_cache():
+                cache = {}  # pkgId -> already-saved Package instance
+                latest_version = self.repository.latest_version()
+                if latest_version:
+                    # Defer expensive metadata that already-synced packages do not need.
+                    for existing_pkg in (
+                        Package.objects.filter(pk__in=latest_version.content.all())
+                        .defer(
+                            "files",
+                            "requires",
+                            "provides",
+                            "changelogs",
+                        )
+                        .iterator()  # read rows without QuerySet-level result caching
+                    ):
+                        cache[existing_pkg.pkgId] = existing_pkg
+                return cache
+
+            existing_packages = await sync_to_async(_build_existing_packages_cache)()
+
             string_cache = {}
             tuple_cache = {}
 
@@ -1410,45 +1433,75 @@ def score_grouping(items):
                 if pkg.name != last_seen_package_name and not use_global_caching:
                     string_cache.clear()
                     tuple_cache.clear()
-                # Implicit: There can be multiple package entries that are completely identical
-                # (same NEVRA, same build time, same checksum / pkgid) and the same or different
-                # location_href. We're not explicitly handling this, the pipeline will deduplicate.
-                package = Package(
-                    **Package.createrepo_to_dict(
-                        pkg, string_cache=string_cache, tuple_cache=tuple_cache
+
+                # If we see a package that's in the cache (generated from latest repo_version),
+                # avoid constructing a new unsaved Package and instead pass the saved one. This
+                # avoids more expensive queries down the line in QueryExistingContents.
+                cached = existing_packages.pop(pkg.pkgId, None)
+                if cached is not None:
+                    base_url = pkg.location_base or self.remote_url
+                    url = urlpath_sanitize(base_url, pkg.location_href)
+                    store_package_for_mirroring(self.repository, cached.pkgId, pkg.location_href)
+                    last_seen_package_name = pkg.name
+                    del pkg
+
+                    artifact = Artifact(size=cached.size_package)
+                    checksum_type = getattr(CHECKSUM_TYPES, cached.checksum_type.upper())
+                    setattr(artifact, checksum_type, cached.pkgId)
+                    da = DeclarativeArtifact(
+                        artifact=artifact,
+                        url=url,
+                        relative_path=cached.location_href,
+                        remote=self.remote,
+                        deferred_download=self.deferred_download,
                     )
-                )
-                # TODO: set signing_keys when we support package signing during sync
-                package.signing_keys = None
-                base_url = pkg.location_base or self.remote_url
-                url = urlpath_sanitize(base_url, package.location_href)
-                last_seen_package_name = pkg.name
-                del pkg  # delete it as soon as we're done with it
-
-                # Location_href is not a property of the Package in isolation [0], and Pulp has
-                # a well defined way of generating the layout/locations on publication time.
-                # We only need to use the original location_href for metadata mirroring
-                # [0] https://github.com/pulp/pulp_rpm/issues/2580
-                original_location_href = package.location_href
-                package.location_href = package.filename
-                store_package_for_mirroring(self.repository, package.pkgId, original_location_href)
-
-                artifact = Artifact(size=package.size_package)
-                checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
-                setattr(artifact, checksum_type, package.pkgId)
-                da = DeclarativeArtifact(
-                    artifact=artifact,
-                    url=url,
-                    relative_path=package.location_href,
-                    remote=self.remote,
-                    deferred_download=self.deferred_download,
-                )
-                dc = DeclarativeContent(content=package, d_artifacts=[da])
-                dc.extra_data = defaultdict(list)
+                    dc = DeclarativeContent(content=cached, d_artifacts=[da])
+                    dc.extra_data = defaultdict(list)
+                else:
+                    # Implicit: There can be multiple package entries that are completely
+                    # identical (same NEVRA, same build time, same checksum / pkgid) and the
+                    # same or different location_href. We're not explicitly handling this, the
+                    # pipeline will deduplicate.
+                    package = Package(
+                        **Package.createrepo_to_dict(
+                            pkg, string_cache=string_cache, tuple_cache=tuple_cache
+                        )
+                    )
+                    # TODO: set signing_keys when we support package signing during sync
+                    package.signing_keys = None
+                    base_url = pkg.location_base or self.remote_url
+                    url = urlpath_sanitize(base_url, package.location_href)
+                    last_seen_package_name = pkg.name
+                    del pkg  # delete & free the memory as soon as we're done with it
+
+                    # Location_href is not a property of the Package in isolation [0], and
+                    # Pulp has a well defined way of generating the layout/locations on
+                    # publication time. We only need to use the original location_href for
+                    # metadata mirroring.
+                    # [0] https://github.com/pulp/pulp_rpm/issues/2580
+                    original_location_href = package.location_href
+                    package.location_href = package.filename
+                    store_package_for_mirroring(
+                        self.repository, package.pkgId, original_location_href
+                    )
+
+                    artifact = Artifact(size=package.size_package)
+                    checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
+                    setattr(artifact, checksum_type, package.pkgId)
+                    da = DeclarativeArtifact(
+                        artifact=artifact,
+                        url=url,
+                        relative_path=package.location_href,
+                        remote=self.remote,
+                        deferred_download=self.deferred_download,
+                    )
+                    dc = DeclarativeContent(content=package, d_artifacts=[da])
+                    dc.extra_data = defaultdict(list)
 
                 # find if a package relates to a modulemd
                 if dc.content.nevra in self.nevra_to_module.keys():
-                    dc.content.is_modular = True
+                    if dc.content._state.adding:  # don't edit existing packages though
+                        dc.content.is_modular = True
                     for dc_modulemd in self.nevra_to_module[dc.content.nevra]:
                         dc.extra_data["modulemd_relation"].append(dc_modulemd)
                         dc_modulemd.extra_data["package_relation"].append(dc)