riverqueue
diff --git a/‎client.go‎
Lines changed: 52 additions & 3 deletions b/‎client.go‎
Lines changed: 52 additions & 3 deletions
diff --git a/‎client_test.go‎
Lines changed: 35 additions & 0 deletions b/‎client_test.go‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎internal/leadership/elector.go‎
Lines changed: 20 additions & 13 deletions b/‎internal/leadership/elector.go‎
Lines changed: 20 additions & 13 deletions
diff --git a/‎internal/leadership/elector_test.go‎
Lines changed: 3 additions & 0 deletions b/‎internal/leadership/elector_test.go‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎internal/maintenance/job_cleaner.go‎
Lines changed: 12 additions & 0 deletions b/‎internal/maintenance/job_cleaner.go‎
Lines changed: 12 additions & 0 deletions
@@ -9,6 +9,7 @@ import (
 	"log/slog"
 	"os"
 	"regexp"
+	"slices"
 	"strings"
 	"sync"
 	"time"
@@ -209,6 +210,40 @@ type Config struct {
 	// Jobs may have their own specific hooks by implementing JobArgsWithHooks.
 	Hooks []rivertype.Hook
 
+	// LeaderDomain is an optional "domain" string to use for leader election.
+	// Different clients sharing the same River schema can elect multiple
+	// leaders as long as they're using different domains, with one leader
+	// elected per domain.
+	//
+	// Setting this value also triggers the related behavior that maintenance
+	// services start to only operate on the queues they're configured on. So
+	// for example, given client1 handling queueA and queueB and client2
+	// handling queueC and queueD, whichever client is elected leader will end
+	// up running all maintenance services for all queues (queueA, queueB,
+	// queueC, and queueD). But if client1 is using domain "domain1" and client2
+	// is using domain "domain2", then client1 (elected in domain1) will only
+	// run maintenance services on queueA and queueB, while client2 (elected in
+	// domain2) will run maintenance services on queueC and queueD.
+	//
+	// A warning though that River *does not protect against configuration
+	// mistakes*. If client1 on domain1 is configured for queueA and queueB, and
+	// client2 on domain2 is *also* configured for queueA and queueB, then both
+	// clients may end up running maintenance services on the same queues at the
+	// same time. It's the caller's responsibility to ensure that doesn't
+	// happen.
+	//
+	// Certain maintenance services that aren't queue-related like the reindexer
+	// will continue to run on all leaders regardless of domain. If using this
+	// feature, it's a good idea to configure ReindexerSchedule on all but a
+	// single leader domain to river.NeverSchedule().
+	//
+	// In general, most River users should not need LeaderDomain, and when
+	// running multiple Rivers may want to consider using multiple databases and
+	// multiple schemas instead.
+	//
+	// Defaults to "default".
+	LeaderDomain string
+
 	// Logger is the structured logger to use for logging purposes. If none is
 	// specified, logs will be emitted to STDOUT with messages at warn level
 	// or higher.
@@ -415,6 +450,7 @@ func (c *Config) WithDefaults() *Config {
 		Hooks:                       c.Hooks,
 		JobInsertMiddleware:         c.JobInsertMiddleware,
 		JobTimeout:                  cmp.Or(c.JobTimeout, JobTimeoutDefault),
+		LeaderDomain:                c.LeaderDomain,
 		Logger:                      logger,
 		MaxAttempts:                 cmp.Or(c.MaxAttempts, MaxAttemptsDefault),
 		Middleware:                  c.Middleware,
@@ -840,6 +876,7 @@ func NewClient[TTx any](driver riverdriver.Driver[TTx], config *Config) (*Client
 
 		client.elector = leadership.NewElector(archetype, driver.GetExecutor(), client.notifier, &leadership.Config{
 			ClientID: config.ID,
+			Domain:   config.LeaderDomain,
 			Schema:   config.Schema,
 		})
 		client.services = append(client.services, client.elector)
@@ -860,6 +897,14 @@ func NewClient[TTx any](driver riverdriver.Driver[TTx], config *Config) (*Client
 			client.services = append(client.services, pluginPilot.PluginServices()...)
 		}
 
+		// It's important for queuesIncluded to be `nil` in case it's not in use
+		// for the various driver queries to work correctly.
+		var queuesIncluded []string
+		if config.LeaderDomain != "" && config.LeaderDomain != leadership.DomainDefault && len(config.Queues) > 0 {
+			queuesIncluded = maputil.Keys(config.Queues)
+			slices.Sort(queuesIncluded)
+		}
+
 		//
 		// Maintenance services
 		//
@@ -872,6 +917,7 @@ func NewClient[TTx any](driver riverdriver.Driver[TTx], config *Config) (*Client
 				CompletedJobRetentionPeriod: config.CompletedJobRetentionPeriod,
 				DiscardedJobRetentionPeriod: config.DiscardedJobRetentionPeriod,
 				QueuesExcluded:              client.pilot.JobCleanerQueuesExcluded(),
+				QueuesIncluded:              queuesIncluded,
 				Schema:                      config.Schema,
 				Timeout:                     config.JobCleanerTimeout,
 			}, driver.GetExecutor())
@@ -882,6 +928,7 @@ func NewClient[TTx any](driver riverdriver.Driver[TTx], config *Config) (*Client
 		{
 			jobRescuer := maintenance.NewRescuer(archetype, &maintenance.JobRescuerConfig{
 				ClientRetryPolicy: config.RetryPolicy,
+				QueuesIncluded:    queuesIncluded,
 				RescueAfter:       config.RescueStuckJobsAfter,
 				Schema:            config.Schema,
 				WorkUnitFactoryFunc: func(kind string) workunit.WorkUnitFactory {
@@ -897,9 +944,10 @@ func NewClient[TTx any](driver riverdriver.Driver[TTx], config *Config) (*Client
 
 		{
 			jobScheduler := maintenance.NewJobScheduler(archetype, &maintenance.JobSchedulerConfig{
-				Interval:     config.schedulerInterval,
-				NotifyInsert: client.maybeNotifyInsertForQueues,
-				Schema:       config.Schema,
+				Interval:       config.schedulerInterval,
+				NotifyInsert:   client.maybeNotifyInsertForQueues,
+				QueuesIncluded: queuesIncluded,
+				Schema:         config.Schema,
 			}, driver.GetExecutor())
 			maintenanceServices = append(maintenanceServices, jobScheduler)
 			client.testSignals.jobScheduler = &jobScheduler.TestSignals
@@ -925,6 +973,7 @@ func NewClient[TTx any](driver riverdriver.Driver[TTx], config *Config) (*Client
 
 		{
 			queueCleaner := maintenance.NewQueueCleaner(archetype, &maintenance.QueueCleanerConfig{
+				QueuesIncluded:  queuesIncluded,
 				RetentionPeriod: maintenance.QueueRetentionPeriodDefault,
 				Schema:          config.Schema,
 			}, driver.GetExecutor())
 
@@ -259,6 +259,41 @@ func Test_Client_Common(t *testing.T) {
 		riversharedtest.WaitOrTimeout(t, workedChan)
 	})
 
+	t.Run("Leadership_AlternateLeaderDomain", func(t *testing.T) {
+		t.Parallel()
+
+		var client1 *Client[pgx.Tx]
+		{
+			config, bundle := setupConfig(t)
+			config.ReindexerSchedule = &neverSchedule{}
+
+			var err error
+			client1, err = NewClient(bundle.driver, config)
+			require.NoError(t, err)
+			client1.testSignals.Init(t)
+		}
+
+		var client2 *Client[pgx.Tx]
+		{
+			config, bundle := setupConfig(t)
+			config.LeaderDomain = "alternate_domain"
+			config.Schema = client1.config.Schema
+			config.ReindexerSchedule = &neverSchedule{}
+
+			var err error
+			client2, err = NewClient(bundle.driver, config)
+			require.NoError(t, err)
+			client2.testSignals.Init(t)
+		}
+
+		startClient(ctx, t, client1)
+		startClient(ctx, t, client2)
+
+		// Both elected
+		client1.testSignals.electedLeader.WaitOrTimeout()
+		client2.testSignals.electedLeader.WaitOrTimeout()
+	})
+
 	t.Run("Queues_Add_WhenClientWontExecuteJobs", func(t *testing.T) {
 		t.Parallel()
 
 
@@ -22,6 +22,8 @@ import (
 	"github.com/riverqueue/river/rivershared/util/testutil"
 )
 
+const DomainDefault = "default"
+
 const (
 	electIntervalDefault           = 5 * time.Second
 	electIntervalJitterDefault     = 1 * time.Second
@@ -82,6 +84,7 @@ func (ts *electorTestSignals) Init(tb testutil.TestingTB) {
 
 type Config struct {
 	ClientID            string
+	Domain              string
 	ElectInterval       time.Duration // period on which each elector attempts elect even without having received a resignation notification
 	ElectIntervalJitter time.Duration
 	Schema              string
@@ -121,6 +124,7 @@ func NewElector(archetype *baseservice.Archetype, exec riverdriver.Executor, not
 	return baseservice.Init(archetype, &Elector{
 		config: (&Config{
 			ClientID:            config.ClientID,
+			Domain:              cmp.Or(config.Domain, DomainDefault),
 			ElectInterval:       cmp.Or(config.ElectInterval, electIntervalDefault),
 			ElectIntervalJitter: cmp.Or(config.ElectIntervalJitter, electIntervalJitterDefault),
 			Schema:              config.Schema,
@@ -143,9 +147,9 @@ func (e *Elector) Start(ctx context.Context) error {
 
 	var sub *notifier.Subscription
 	if e.notifier == nil {
-		e.Logger.DebugContext(ctx, e.Name+": No notifier configured; starting in poll mode", "client_id", e.config.ClientID)
+		e.Logger.DebugContext(ctx, e.Name+": No notifier configured; starting in poll mode", "client_id", e.config.ClientID, "domain", e.config.Domain)
 	} else {
-		e.Logger.DebugContext(ctx, e.Name+": Listening for leadership changes", "client_id", e.config.ClientID, "topic", notifier.NotificationTopicLeadership)
+		e.Logger.DebugContext(ctx, e.Name+": Listening for leadership changes", "client_id", e.config.ClientID, "domain", e.config.Domain, "topic", notifier.NotificationTopicLeadership)
 		var err error
 		sub, err = e.notifier.Listen(ctx, notifier.NotificationTopicLeadership, func(topic notifier.NotificationTopic, payload string) {
 			e.handleLeadershipNotification(ctx, topic, payload)
@@ -180,7 +184,7 @@ func (e *Elector) Start(ctx context.Context) error {
 				return
 			}
 
-			e.Logger.DebugContext(ctx, e.Name+": Gained leadership", "client_id", e.config.ClientID)
+			e.Logger.DebugContext(ctx, e.Name+": Gained leadership", "client_id", e.config.ClientID, "domain", e.config.Domain)
 			e.testSignals.GainedLeadership.Signal(struct{}{})
 
 			err := e.keepLeadershipLoop(ctx)
@@ -193,7 +197,7 @@ func (e *Elector) Start(ctx context.Context) error {
 					continue // lost leadership reelection; unusual but not a problem; don't log
 				}
 
-				e.Logger.ErrorContext(ctx, e.Name+": Error keeping leadership", "client_id", e.config.ClientID, "err", err)
+				e.Logger.ErrorContext(ctx, e.Name+": Error keeping leadership", "client_id", e.config.ClientID, "domain", e.config.Domain, "err", err)
 			}
 		}
 	}()
@@ -205,10 +209,11 @@ func (e *Elector) attemptGainLeadershipLoop(ctx context.Context) error {
 	var attempt int
 	for {
 		attempt++
-		e.Logger.DebugContext(ctx, e.Name+": Attempting to gain leadership", "client_id", e.config.ClientID)
+		e.Logger.DebugContext(ctx, e.Name+": Attempting to gain leadership", "client_id", e.config.ClientID, "domain", e.config.Domain)
 
 		elected, err := attemptElectOrReelect(ctx, e.exec, false, &riverdriver.LeaderElectParams{
 			LeaderID: e.config.ClientID,
+			Name:     e.config.Domain,
 			Now:      e.Time.NowUTCOrNil(),
 			Schema:   e.config.Schema,
 			TTL:      e.leaderTTL(),
@@ -229,7 +234,7 @@ func (e *Elector) attemptGainLeadershipLoop(ctx context.Context) error {
 
 		attempt = 0
 
-		e.Logger.DebugContext(ctx, e.Name+": Leadership bid was unsuccessful (not an error)", "client_id", e.config.ClientID)
+		e.Logger.DebugContext(ctx, e.Name+": Leadership bid was unsuccessful (not an error)", "client_id", e.config.ClientID, "domain", e.config.Domain)
 		e.testSignals.DeniedLeadership.Signal(struct{}{})
 
 		select {
@@ -254,17 +259,17 @@ func (e *Elector) attemptGainLeadershipLoop(ctx context.Context) error {
 func (e *Elector) handleLeadershipNotification(ctx context.Context, topic notifier.NotificationTopic, payload string) {
 	if topic != notifier.NotificationTopicLeadership {
 		// This should not happen unless the notifier is broken.
-		e.Logger.ErrorContext(ctx, e.Name+": Received unexpected notification", "client_id", e.config.ClientID, "topic", topic, "payload", payload)
+		e.Logger.ErrorContext(ctx, e.Name+": Received unexpected notification", "client_id", e.config.ClientID, "domain", e.config.Domain, "topic", topic, "payload", payload)
 		return
 	}
 
 	notification := DBNotification{}
 	if err := json.Unmarshal([]byte(payload), &notification); err != nil {
-		e.Logger.ErrorContext(ctx, e.Name+": Unable to unmarshal leadership notification", "client_id", e.config.ClientID, "err", err)
+		e.Logger.ErrorContext(ctx, e.Name+": Unable to unmarshal leadership notification", "client_id", e.config.ClientID, "domain", e.config.Domain, "err", err)
 		return
 	}
 
-	e.Logger.DebugContext(ctx, e.Name+": Received notification from notifier", "action", notification.Action, "client_id", e.config.ClientID)
+	e.Logger.DebugContext(ctx, e.Name+": Received notification from notifier", "action", notification.Action, "client_id", e.config.ClientID, "domain", e.config.Domain)
 
 	// Do an initial context check so in case context is done, it always takes
 	// precedence over sending a leadership notification.
@@ -359,7 +364,7 @@ func (e *Elector) keepLeadershipLoop(ctx context.Context) error {
 		case <-e.requestResignChan:
 			// Receive a notification telling current leader to resign.
 
-			e.Logger.InfoContext(ctx, e.Name+": Current leader received forced resignation", "client_id", e.config.ClientID)
+			e.Logger.InfoContext(ctx, e.Name+": Current leader received forced resignation", "client_id", e.config.ClientID, "domain", e.config.Domain)
 
 			if !timer.Stop() {
 				<-timer.C
@@ -383,10 +388,11 @@ func (e *Elector) keepLeadershipLoop(ctx context.Context) error {
 			// Reelect timer expired; attempt reelection below.
 		}
 
-		e.Logger.DebugContext(ctx, e.Name+": Current leader attempting reelect", "client_id", e.config.ClientID)
+		e.Logger.DebugContext(ctx, e.Name+": Current leader attempting reelect", "client_id", e.config.ClientID, "domain", e.config.Domain)
 
 		reelected, err := attemptElectOrReelect(ctx, e.exec, true, &riverdriver.LeaderElectParams{
 			LeaderID: e.config.ClientID,
+			Name:     e.config.Domain,
 			Now:      e.Time.NowUTCOrNil(),
 			Schema:   e.config.Schema,
 			TTL:      e.leaderTTL(),
@@ -424,7 +430,7 @@ func (e *Elector) keepLeadershipLoop(ctx context.Context) error {
 // always surrendered in a timely manner so it can be picked up quickly by
 // another client, even in the event of a cancellation.
 func (e *Elector) attemptResignLoop(ctx context.Context) {
-	e.Logger.DebugContext(ctx, e.Name+": Attempting to resign leadership", "client_id", e.config.ClientID)
+	e.Logger.DebugContext(ctx, e.Name+": Attempting to resign leadership", "client_id", e.config.ClientID, "domain", e.config.Domain)
 
 	// Make a good faith attempt to resign, even in the presence of errors, but
 	// don't keep hammering if it doesn't work. In case a resignation failure,
@@ -469,7 +475,7 @@ func (e *Elector) attemptResign(ctx context.Context, attempt int) error {
 	}
 
 	if resigned {
-		e.Logger.DebugContext(ctx, e.Name+": Resigned leadership successfully", "client_id", e.config.ClientID)
+		e.Logger.DebugContext(ctx, e.Name+": Resigned leadership successfully", "client_id", e.config.ClientID, "domain", e.config.Domain)
 		e.testSignals.ResignedLeadership.Signal(struct{}{})
 	}
 
@@ -484,6 +490,7 @@ func (e *Elector) errorSlogArgs(err error, attempt int, sleepDuration time.Durat
 	return []any{
 		slog.Int("attempt", attempt),
 		slog.String("client_id", e.config.ClientID),
+		slog.String("domain", e.config.Domain),
 		slog.String("err", err.Error()),
 		slog.String("sleep_duration", sleepDuration.String()),
 	}
 
@@ -422,6 +422,7 @@ func TestAttemptElectOrReelect(t *testing.T) {
 
 		elected, err := attemptElectOrReelect(ctx, bundle.exec, false, &riverdriver.LeaderElectParams{
 			LeaderID: clientID,
+			Name:     DomainDefault,
 			TTL:      leaderTTL,
 			Schema:   "",
 		})
@@ -451,6 +452,7 @@ func TestAttemptElectOrReelect(t *testing.T) {
 		// the transaction.
 		elected, err := attemptElectOrReelect(ctx, bundle.exec, true, &riverdriver.LeaderElectParams{
 			LeaderID: clientID,
+			Name:     DomainDefault,
 			TTL:      30 * time.Second,
 			Schema:   "",
 		})
@@ -478,6 +480,7 @@ func TestAttemptElectOrReelect(t *testing.T) {
 
 		elected, err := attemptElectOrReelect(ctx, bundle.exec, true, &riverdriver.LeaderElectParams{
 			LeaderID: "different-client-id",
+			Name:     DomainDefault,
 			TTL:      leaderTTL,
 			Schema:   "",
 		})
 
@@ -56,6 +56,10 @@ type JobCleanerConfig struct {
 	// QueuesExcluded are queues that'll be excluded from cleaning.
 	QueuesExcluded []string
 
+	// QueuesIncluded are queues that'll be included in cleaning. If set, only
+	// these queues will be cleaned. If nil, all queues are cleaned.
+	QueuesIncluded []string
+
 	// Schema where River tables are located. Empty string omits schema, causing
 	// Postgres to default to `search_path`.
 	Schema string
@@ -79,6 +83,12 @@ func (c *JobCleanerConfig) mustValidate() *JobCleanerConfig {
 	if c.Interval <= 0 {
 		panic("JobCleanerConfig.Interval must be above zero")
 	}
+	if c.QueuesExcluded != nil && len(c.QueuesExcluded) == 0 {
+		panic("JobCleanerConfig.QueuesExcluded should be either nil or a non-empty slice")
+	}
+	if c.QueuesIncluded != nil && len(c.QueuesIncluded) == 0 {
+		panic("JobCleanerConfig.QueuesIncluded should be either nil or a non-empty slice")
+	}
 	if c.Timeout <= 0 {
 		panic("JobCleanerConfig.Timeout must be above zero")
 	}
@@ -117,6 +127,7 @@ func NewJobCleaner(archetype *baseservice.Archetype, config *JobCleanerConfig, e
 			CompletedJobRetentionPeriod: cmp.Or(config.CompletedJobRetentionPeriod, riversharedmaintenance.CompletedJobRetentionPeriodDefault),
 			DiscardedJobRetentionPeriod: cmp.Or(config.DiscardedJobRetentionPeriod, riversharedmaintenance.DiscardedJobRetentionPeriodDefault),
 			QueuesExcluded:              config.QueuesExcluded,
+			QueuesIncluded:              config.QueuesIncluded,
 			Interval:                    cmp.Or(config.Interval, riversharedmaintenance.JobCleanerIntervalDefault),
 			Schema:                      config.Schema,
 			Timeout:                     cmp.Or(config.Timeout, riversharedmaintenance.JobCleanerTimeoutDefault),
@@ -205,6 +216,7 @@ func (s *JobCleaner) runOnce(ctx context.Context) (*jobCleanerRunOnceResult, err
 				DiscardedFinalizedAtHorizon: time.Now().Add(-s.Config.DiscardedJobRetentionPeriod),
 				Max:                         s.batchSize(),
 				QueuesExcluded:              s.Config.QueuesExcluded,
+				QueuesIncluded:              s.Config.QueuesIncluded,
 				Schema:                      s.Config.Schema,
 			})
 			if err != nil {