rudderlabs
diff --git a/go.mod b/go.mod
Lines changed: 1 addition & 1 deletion
diff --git a/go.sum b/go.sum
Lines changed: 2 additions & 2 deletions
diff --git a/router/handle.go b/router/handle.go
Lines changed: 5 additions & 4 deletions
diff --git a/router/handle_lifecycle.go b/router/handle_lifecycle.go
Lines changed: 4 additions & 1 deletion
diff --git a/router/partition_worker.go b/router/partition_worker.go
Lines changed: 51 additions & 19 deletions
diff --git a/router/router_test.go b/router/router_test.go
Lines changed: 12 additions & 9 deletions
diff --git a/router/types.go b/router/types.go
Lines changed: 26 additions & 24 deletions
@@ -88,7 +88,7 @@ require (
 	github.com/rudderlabs/bing-ads-go-sdk v0.2.3
 	github.com/rudderlabs/compose-test v0.1.3
 	github.com/rudderlabs/keydb v1.2.0
-	github.com/rudderlabs/rudder-go-kit v0.63.5
+	github.com/rudderlabs/rudder-go-kit v0.64.0
 	github.com/rudderlabs/rudder-observability-kit v0.0.5
 	github.com/rudderlabs/rudder-schemas v0.7.0
 	github.com/rudderlabs/rudder-transformer/go v0.0.0-20250707171833-9cd525669b1b
 
@@ -1201,8 +1201,8 @@ github.com/rudderlabs/keydb v1.2.0 h1:LMwWezUh3C+xheNirHNMhUlcb09ZH0Alo6bDDwwMaN
 github.com/rudderlabs/keydb v1.2.0/go.mod h1:ZYouneft71uF85OTUGS5cI9DoVdsfKDItgUTGNli9Mk=
 github.com/rudderlabs/parquet-go v0.0.3 h1:/zgRj929pGKHsthc0kw8stVEcFu1JUcpxDRlhxjSLic=
 github.com/rudderlabs/parquet-go v0.0.3/go.mod h1:WmwBOdvwpXl2aZGRk3NxxgzC/DaWGfax3jrCRhKhtSo=
-github.com/rudderlabs/rudder-go-kit v0.63.5 h1:SpeaZY89Q7okKP/cN2O09I+7m/Z9XhRwMAX+LvPcVtA=
-github.com/rudderlabs/rudder-go-kit v0.63.5/go.mod h1:58UhsuOVycglmqOhfBHMppAowoX4ANrdUeugS8qG0EM=
+github.com/rudderlabs/rudder-go-kit v0.64.0 h1:DuRDsgcn4bpEbh4/gPJ44mqgvjrjqzIJFfD4r8cm/o8=
+github.com/rudderlabs/rudder-go-kit v0.64.0/go.mod h1:58UhsuOVycglmqOhfBHMppAowoX4ANrdUeugS8qG0EM=
 github.com/rudderlabs/rudder-observability-kit v0.0.5 h1:s/+zsqdmpYG2LuWitFqQ2aIYFf67B7akJ3yxX4/KtXc=
 github.com/rudderlabs/rudder-observability-kit v0.0.5/go.mod h1:rL0zi374TMMx6YHzFxYyPItjl90iOaKxy1fdFCcq2DQ=
 github.com/rudderlabs/rudder-schemas v0.7.0 h1:hKShHYpbIldE1Q591vodI6iaAZ/IUOyC1DqUUJZysNU=
 
@@ -73,7 +73,8 @@ type Handle struct {
 	eventOrderHalfEnabledStateDuration config.ValueLoader[time.Duration]
 	deliveryThrottlerTimeout           config.ValueLoader[time.Duration]
 	drainConcurrencyLimit              config.ValueLoader[int]
-	workerInputBufferSize              int
+	maxNoOfJobsPerChannel              int // maximum capacity of each worker channel (hard capacity limit of the underlying go channel)
+	noOfJobsPerChannel                 int // requested capacity of each worker channel (important when job buffering is being calculated using the standard method)
 	saveDestinationResponse            bool
 	saveDestinationResponseOverride    config.ValueLoader[bool]
 	reportJobsdbPayload                config.ValueLoader[bool]
@@ -151,7 +152,7 @@ func (rt *Handle) activePartitions(ctx context.Context) []string {
 
 // pickup picks up jobs from the jobsDB for the provided partition and returns the number of jobs picked up and whether the limits were reached or not
 // picked up jobs are distributed to the workers
-func (rt *Handle) pickup(ctx context.Context, partition string, workers []*worker, pickupBatchSizeGauge stats.Gauge) (pickupCount int, limitsReached bool) {
+func (rt *Handle) pickup(ctx context.Context, partition string, workers []*worker, pickupBatchSizeGauge Gauge[int]) (pickupCount int, limitsReached bool) {
 	// pickup limiter with dynamic priority
 	start := time.Now()
 	var discardedCount int
@@ -210,7 +211,7 @@ func (rt *Handle) pickup(ctx context.Context, partition string, workers []*worke
 	}
 
 	type reservedJob struct {
-		slot        *workerSlot
+		slot        *reservedSlot
 		job         *jobsdb.JobT
 		drainReason string
 		parameters  routerutils.JobParameters
@@ -599,7 +600,7 @@ func (rt *Handle) getQueryParams(partition string, pickUpCount int) jobsdb.GetQu
 }
 
 // workerJobSlot pairs the slot reserved for a job with that job's drain reason.
 // NOTE(review): drainReason is presumably empty when the job is not being drained — confirm against findWorkerSlot.
 type workerJobSlot struct {
-	slot        *workerSlot
+	slot        *reservedSlot
 	drainReason string
 }
 
 
@@ -106,7 +106,8 @@ func (rt *Handle) Setup(
 	}
 	rt.guaranteeUserEventOrder = getRouterConfigBool("guaranteeUserEventOrder", rt.destType, true)
 	rt.noOfWorkers = getRouterConfigInt("noOfWorkers", destType, 64)
-	rt.workerInputBufferSize = getRouterConfigInt("noOfJobsPerChannel", destType, 1000)
+	rt.maxNoOfJobsPerChannel = getRouterConfigInt("maxNoOfJobsPerChannel", destType, 10000)
+	rt.noOfJobsPerChannel = getRouterConfigInt("noOfJobsPerChannel", destType, 1000)
 	// Explicitly control destination types for which we want to support batching
 	// Avoiding stale configurations still having KAFKA batching enabled to cause issues with later versions of rudder-server
 	batchingSupportedDestinations := config.GetStringSliceVar([]string{"AM"}, "Router.batchingSupportedDestinations")
@@ -333,6 +334,8 @@ func (rt *Handle) setupReloadableVars() {
 	rt.reloadableConfig.failingJobsPenaltySleep = config.GetReloadableDurationVar(2000, time.Millisecond, getRouterConfigKeys("failingJobsPenaltySleep", rt.destType)...)
 	rt.reloadableConfig.failingJobsPenaltyThreshold = config.GetReloadableFloat64Var(0.6, getRouterConfigKeys("failingJobsPenaltyThreshold", rt.destType)...)
 	rt.reloadableConfig.oauthV2ExpirationTimeDiff = config.GetReloadableDurationVar(5, time.Minute, getRouterConfigKeys("oauth.expirationTimeDiff", rt.destType)...)
+	rt.reloadableConfig.enableExperimentalBufferSizeCalculator = config.GetReloadableBoolVar(false, getRouterConfigKeys("enableExperimentalBufferSizeCalculator", rt.destType)...)
+	rt.reloadableConfig.experimentalBufferSizeScalingFactor = config.GetReloadableFloat64Var(2.0, getRouterConfigKeys("experimentalBufferSizeScalingFactor", rt.destType)...)
 	rt.diagnosisTickerTime = config.GetDurationVar(60, time.Second, "Diagnostics.routerTimePeriod", "Diagnostics.routerTimePeriodInS")
 	rt.netClientTimeout = config.GetDurationVar(10, time.Second,
 		"Router."+rt.destType+".httpTimeout",
 
@@ -9,6 +9,8 @@ import (
 
 	"github.com/rudderlabs/rudder-go-kit/logger"
 	"github.com/rudderlabs/rudder-go-kit/stats"
+	"github.com/rudderlabs/rudder-go-kit/stats/metric"
+	kitsync "github.com/rudderlabs/rudder-go-kit/sync"
 	"github.com/rudderlabs/rudder-server/utils/cache"
 	"github.com/rudderlabs/rudder-server/utils/crash"
 	"github.com/rudderlabs/rudder-server/utils/misc"
@@ -18,33 +20,63 @@ import (
 // A partition worker uses multiple workers internally to process the jobs that are being picked up asynchronously.
 func newPartitionWorker(ctx context.Context, rt *Handle, partition string) *partitionWorker {
 	pw := &partitionWorker{
-		logger:    rt.logger.Child("p-" + partition),
-		rt:        rt,
-		partition: partition,
-		ctx:       ctx,
+		logger:               rt.logger.Child("p-" + partition),
+		rt:                   rt,
+		partition:            partition,
+		ctx:                  ctx,
+		pickupBatchSizeGauge: newGaugeWithLastValue[int](stats.Default.NewTaggedStat("router_pickup_batch_size_gauge", stats.GaugeType, stats.Tags{"destType": rt.destType, "partition": partition})),
 	}
 	pw.g, _ = errgroup.WithContext(context.Background())
 	pw.workers = make([]*worker, rt.noOfWorkers)
+	deliveryTimeStat := stats.Default.NewTaggedStat("router_delivery_time", stats.TimerType, stats.Tags{"destType": rt.destType})
+	routerDeliveryLatencyStat := stats.Default.NewTaggedStat("router_delivery_latency", stats.TimerType, stats.Tags{"destType": rt.destType})
+	routerProxyStat := stats.Default.NewTaggedStat("router_proxy_latency", stats.TimerType, stats.Tags{"destType": rt.destType})
+
+	bufferCapacityStat := stats.Default.NewTaggedStat("router_worker_buffer_capacity", stats.HistogramType, stats.Tags{"destType": rt.destType, "partition": partition})
+	bufferSizeStat := stats.Default.NewTaggedStat("router_worker_buffer_size", stats.HistogramType, stats.Tags{"destType": rt.destType, "partition": partition})
+
 	for i := 0; i < rt.noOfWorkers; i++ {
 		ctx, cancelFunc := context.WithCancel(context.Background())
+		workLoopThroughput := metric.NewSimpleMovingAverage(20)
+		workLoopThroughputStat := stats.Default.NewTaggedStat("router_worker_work_loop_throughput", stats.HistogramType, stats.Tags{"destType": rt.destType, "partition": partition})
 		worker := &worker{
-			logger:                    pw.logger.Child("w-" + strconv.Itoa(i)),
-			partition:                 partition,
-			id:                        i,
-			ctx:                       ctx,
-			cancelFunc:                cancelFunc,
-			inputCh:                   make(chan workerJob, rt.workerInputBufferSize),
+			logger:     pw.logger.Child("w-" + strconv.Itoa(i)),
+			partition:  partition,
+			id:         i,
+			ctx:        ctx,
+			cancelFunc: cancelFunc,
+			workerBuffer: newWorkerBuffer(
+				rt.maxNoOfJobsPerChannel,
+				newBufferSizeCalculatorSwitcher(
+					rt.reloadableConfig.enableExperimentalBufferSizeCalculator,
+					pw.pickupBatchSizeGauge,
+					rt.noOfWorkers,
+					rt.reloadableConfig.noOfJobsToBatchInAWorker,
+					workLoopThroughput,
+					rt.reloadableConfig.experimentalBufferSizeScalingFactor,
+					rt.noOfJobsPerChannel,
+				),
+				&workerBufferStats{
+					onceEvery:       kitsync.NewOnceEvery(5 * time.Second),
+					currentCapacity: bufferCapacityStat,
+					currentSize:     bufferSizeStat,
+				}),
 			barrier:                   rt.barrier,
 			rt:                        rt,
-			deliveryTimeStat:          stats.Default.NewTaggedStat("router_delivery_time", stats.TimerType, stats.Tags{"destType": rt.destType}),
-			routerDeliveryLatencyStat: stats.Default.NewTaggedStat("router_delivery_latency", stats.TimerType, stats.Tags{"destType": rt.destType}),
-			routerProxyStat:           stats.Default.NewTaggedStat("router_proxy_latency", stats.TimerType, stats.Tags{"destType": rt.destType}),
+			deliveryTimeStat:          deliveryTimeStat,
+			routerDeliveryLatencyStat: routerDeliveryLatencyStat,
+			routerProxyStat:           routerProxyStat,
 			deliveryLatencyStatsCache: cache.NewStatsCache(func(labels deliveryMetricLabels) stats.Measurement {
 				return stats.Default.NewTaggedStat("transformer_outgoing_request_latency", stats.TimerType, labels.ToStatTags())
 			}),
 			deliveryCountStatsCache: cache.NewStatsCache(func(labels deliveryMetricLabels) stats.Measurement {
 				return stats.Default.NewTaggedStat("transformer_outgoing_request_count", stats.CountType, labels.ToStatTags())
 			}),
+			workLoopThroughput: newSmaHistogram(
+				workLoopThroughput,
+				workLoopThroughputStat,
+				kitsync.NewOnceEvery(10*time.Second),
+			),
 		}
 		pw.workers[i] = worker
 
@@ -66,9 +98,10 @@ type partitionWorker struct {
 	partition string
 
 	// state
-	ctx     context.Context
-	g       *errgroup.Group // group against which all the workers are spawned
-	workers []*worker       // workers that are responsible for processing the jobs
+	ctx                  context.Context
+	g                    *errgroup.Group         // group against which all the workers are spawned
+	pickupBatchSizeGauge GaugeWithLastValue[int] // gauge to track the pickup batch size used in the last pickup iteration
+	workers              []*worker               // workers that are responsible for processing the jobs
 
 	pickupCount   int  // number of jobs picked up by the workers in the last iteration
 	limitsReached bool // whether the limits were reached in the last iteration
@@ -77,8 +110,7 @@ type partitionWorker struct {
 // Work picks up jobs for the partitioned worker and returns whether it worked or not
 func (pw *partitionWorker) Work() bool {
 	start := time.Now()
-	var pickupBatchSizeGauge stats.Gauge = stats.Default.NewTaggedStat("router_pickup_batch_size_gauge", stats.GaugeType, stats.Tags{"destType": pw.rt.destType, "partition": pw.partition})
-	pw.pickupCount, pw.limitsReached = pw.rt.pickup(pw.ctx, pw.partition, pw.workers, pickupBatchSizeGauge)
+	pw.pickupCount, pw.limitsReached = pw.rt.pickup(pw.ctx, pw.partition, pw.workers, pw.pickupBatchSizeGauge)
 	// the following stats are used to track the total time taken for the pickup process and the number of jobs picked up
 	stats.Default.NewTaggedStat("router_generator_loop", stats.TimerType, stats.Tags{"destType": pw.rt.destType}).Since(start)
 	stats.Default.NewTaggedStat("router_generator_events", stats.CountType, stats.Tags{"destType": pw.rt.destType, "partition": pw.partition}).Count(pw.pickupCount)
@@ -101,7 +133,7 @@ func (pw *partitionWorker) SleepDurations() (min, max time.Duration) {
 func (pw *partitionWorker) Stop() {
 	// Shut every worker down in two steps: cancel its context, then close its input.
 	for _, worker := range pw.workers {
 		worker.cancelFunc()
-		close(worker.inputCh)
+		worker.workerBuffer.Close()
 	}
 	// Block until everything spawned against the group has returned; the group's error is deliberately discarded.
 	_ = pw.g.Wait()
 }
@@ -20,6 +20,7 @@ import (
 
 	"github.com/rudderlabs/rudder-go-kit/config"
 	"github.com/rudderlabs/rudder-go-kit/logger"
+	"github.com/rudderlabs/rudder-go-kit/stats/metric"
 	"github.com/rudderlabs/rudder-server/admin"
 	backendconfig "github.com/rudderlabs/rudder-server/backend-config"
 	"github.com/rudderlabs/rudder-server/enterprise/reporting"
@@ -262,7 +263,8 @@ func TestBackoff(t *testing.T) {
 			logger:                logger.NOP,
 			backgroundCtx:         context.Background(),
 			noOfWorkers:           1,
-			workerInputBufferSize: 3,
+			maxNoOfJobsPerChannel: 3,
+			noOfJobsPerChannel:    3,
 			barrier:               barrier,
 			reloadableConfig: &reloadableConfig{
 				maxFailedCountForJob: config.SingleValueLoader(3),
@@ -278,13 +280,14 @@ func TestBackoff(t *testing.T) {
 			},
 		}
 		workers := []*worker{{
-			logger:  logger.NOP,
-			inputCh: make(chan workerJob, 3),
-			barrier: barrier,
+			logger:             logger.NOP,
+			workerBuffer:       newSimpleWorkerBuffer(3),
+			barrier:            barrier,
+			workLoopThroughput: metric.NewSimpleMovingAverage(1),
 		}}
 		t.Run("eventorder disabled", func(t *testing.T) {
 			r.guaranteeUserEventOrder = false
-			workers[0].inputReservations = 0
+			workers[0].workerBuffer = newSimpleWorkerBuffer(3)
 
 			slot, err := r.findWorkerSlot(context.Background(), workers, backoffJob, parameters, map[eventorder.BarrierKey]struct{}{})
 			require.Nil(t, slot)
@@ -317,7 +320,7 @@ func TestBackoff(t *testing.T) {
 
 		t.Run("eventorder enabled", func(t *testing.T) {
 			r.guaranteeUserEventOrder = true
-			workers[0].inputReservations = 0
+			workers[0].workerBuffer = newSimpleWorkerBuffer(3)
 
 			slot, err := r.findWorkerSlot(context.Background(), workers, backoffJob, parameters, map[eventorder.BarrierKey]struct{}{})
 			require.Nil(t, slot)
@@ -353,7 +356,7 @@ func TestBackoff(t *testing.T) {
 		t.Run("eventorder enabled with drain job", func(t *testing.T) {
 			r.drainer = &drainer{drain: true, reason: "drain job due to some reason"}
 			r.guaranteeUserEventOrder = true
-			workers[0].inputReservations = 0
+			workers[0].workerBuffer = newSimpleWorkerBuffer(3)
 
 			slot, err := r.findWorkerSlot(context.Background(), workers, backoffJob, parameters, map[eventorder.BarrierKey]struct{}{})
 			require.NotNil(t, slot)
@@ -422,7 +425,7 @@ func TestBackoff(t *testing.T) {
 
 		t.Run("job not blocked after event ordering is disabled(destinationID level)", func(t *testing.T) {
 			r.guaranteeUserEventOrder = true
-			workers[0].inputReservations = 0
+			workers[0].workerBuffer = newSimpleWorkerBuffer(3)
 			job := &jobsdb.JobT{
 				JobID:      1,
 				Parameters: []byte(`{"destination_id": "destination"}`),
@@ -458,7 +461,7 @@ func TestBackoff(t *testing.T) {
 
 		t.Run("job not blocked after event ordering is disabled(workspaceID level)", func(t *testing.T) {
 			r.guaranteeUserEventOrder = true
-			workers[0].inputReservations = 0
+			workers[0].workerBuffer = newSimpleWorkerBuffer(3)
 			job := &jobsdb.JobT{
 				JobID:      1,
 				Parameters: []byte(`{"destination_id": "destination"}`),
 
@@ -47,28 +47,30 @@ type JobResponse struct {
 }
 
 // reloadableConfig groups the router settings that are held as config.ValueLoader values,
 // so each setting is read through its loader rather than stored as a plain value.
 type reloadableConfig struct {
-	jobQueryBatchSize                 config.ValueLoader[int]
-	maxJobQueryBatchSize              config.ValueLoader[int] // absolute max limit on job query batch size when adapting based on throttling limits
-	updateStatusBatchSize             config.ValueLoader[int]
-	readSleep                         config.ValueLoader[time.Duration]
-	maxStatusUpdateWait               config.ValueLoader[time.Duration]
-	minRetryBackoff                   config.ValueLoader[time.Duration]
-	maxRetryBackoff                   config.ValueLoader[time.Duration]
-	jobsBatchTimeout                  config.ValueLoader[time.Duration]
-	failingJobsPenaltyThreshold       config.ValueLoader[float64]
-	failingJobsPenaltySleep           config.ValueLoader[time.Duration]
-	noOfJobsToBatchInAWorker          config.ValueLoader[int]
-	jobsDBCommandTimeout              config.ValueLoader[time.Duration]
-	jobdDBMaxRetries                  config.ValueLoader[int]
-	maxFailedCountForJob              config.ValueLoader[int]
-	maxFailedCountForSourcesJob       config.ValueLoader[int]
-	payloadLimit                      config.ValueLoader[int64]
-	retryTimeWindow                   config.ValueLoader[time.Duration]
-	sourcesRetryTimeWindow            config.ValueLoader[time.Duration]
-	pickupFlushInterval               config.ValueLoader[time.Duration]
-	maxDSQuerySize                    config.ValueLoader[int]
-	transformerProxy                  config.ValueLoader[bool]
-	skipRtAbortAlertForTransformation config.ValueLoader[bool] // represents if event delivery(via transformerProxy) should be alerted via router-aborted-count alert def
-	skipRtAbortAlertForDelivery       config.ValueLoader[bool] // represents if transformation(router or batch) should be alerted via router-aborted-count alert def
-	oauthV2ExpirationTimeDiff         config.ValueLoader[time.Duration]
+	jobQueryBatchSize                      config.ValueLoader[int]
+	maxJobQueryBatchSize                   config.ValueLoader[int] // absolute max limit on job query batch size when adapting based on throttling limits
+	updateStatusBatchSize                  config.ValueLoader[int]
+	readSleep                              config.ValueLoader[time.Duration]
+	maxStatusUpdateWait                    config.ValueLoader[time.Duration]
+	minRetryBackoff                        config.ValueLoader[time.Duration]
+	maxRetryBackoff                        config.ValueLoader[time.Duration]
+	jobsBatchTimeout                       config.ValueLoader[time.Duration]
+	failingJobsPenaltyThreshold            config.ValueLoader[float64]
+	failingJobsPenaltySleep                config.ValueLoader[time.Duration]
+	noOfJobsToBatchInAWorker               config.ValueLoader[int]
+	jobsDBCommandTimeout                   config.ValueLoader[time.Duration]
+	jobdDBMaxRetries                       config.ValueLoader[int]
+	maxFailedCountForJob                   config.ValueLoader[int]
+	maxFailedCountForSourcesJob            config.ValueLoader[int]
+	payloadLimit                           config.ValueLoader[int64]
+	retryTimeWindow                        config.ValueLoader[time.Duration]
+	sourcesRetryTimeWindow                 config.ValueLoader[time.Duration]
+	pickupFlushInterval                    config.ValueLoader[time.Duration]
+	maxDSQuerySize                         config.ValueLoader[int]
+	transformerProxy                       config.ValueLoader[bool]
+	skipRtAbortAlertForTransformation      config.ValueLoader[bool] // represents if event delivery(via transformerProxy) should be alerted via router-aborted-count alert def; NOTE(review): this text reads as if it belongs to skipRtAbortAlertForDelivery — confirm
+	skipRtAbortAlertForDelivery            config.ValueLoader[bool] // represents if transformation(router or batch) should be alerted via router-aborted-count alert def; NOTE(review): this text reads as if it belongs to skipRtAbortAlertForTransformation — confirm
+	oauthV2ExpirationTimeDiff              config.ValueLoader[time.Duration]
+	enableExperimentalBufferSizeCalculator config.ValueLoader[bool]    // whether to use the experimental worker buffer size calculator or not
+	experimentalBufferSizeScalingFactor    config.ValueLoader[float64] // scaling factor to scale up the buffer size in the experimental calculator
 }