Add constant generation setting

pgmpablo157321 · pgmpablo157321 · commit d2bc6960ac9b · 2025-03-10T16:05:20.000-05:00
diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc
@@ -310,6 +310,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
                      &TestSettings::server_max_async_queries)
       .def_readwrite("server_num_issue_query_threads",
                      &TestSettings::server_num_issue_query_threads)
+      .def_readwrite("server_constant_gen",
+                     &TestSettings::server_constant_gen)
       .def_readwrite("offline_expected_qps",
                      &TestSettings::offline_expected_qps)
       .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms)
diff --git a/loadgen/demos/py_demo_constant_gen.py b/loadgen/demos/py_demo_constant_gen.py
@@ -0,0 +1,75 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import threading
+import time
+
+from absl import app
+import mlperf_loadgen
+
+
+def load_samples_to_ram(query_samples):
+    """QSL load callback for the demo: ignores the request and does nothing."""
+    del query_samples  # Intentionally unused.
+
+
+def unload_samples_from_ram(query_samples):
+    """QSL unload callback for the demo: ignores the request and does nothing."""
+    del query_samples  # Intentionally unused.
+
+
+def process_query_async(query_samples):
+    """Sleep 1 ms, then complete each sample with a response of (id, 0, 0)."""
+    time.sleep(0.001)
+    make_response = mlperf_loadgen.QuerySampleResponse
+    completed = [make_response(sample.id, 0, 0) for sample in query_samples]
+    mlperf_loadgen.QuerySamplesComplete(completed)
+
+
+def issue_query(query_samples):  # Each call starts its own worker thread.
+    threading.Thread(target=process_query_async, args=(query_samples,)).start()
+
+
+def flush_queries():
+    """No-op flush callback: this demo never buffers queries."""
+
+
+def main(argv):
+    """Run a Server performance-only test with server_constant_gen enabled."""
+    del argv  # Unused.
+    cfg = mlperf_loadgen.TestSettings()
+    cfg.scenario = mlperf_loadgen.TestScenario.Server
+    cfg.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    cfg.server_target_qps = 100
+    cfg.server_target_latency_ns = 100000000
+    cfg.min_query_count = 100
+    cfg.min_duration_ms = 10000
+    cfg.server_constant_gen = True
+
+    system = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    loader, unloader = load_samples_to_ram, unload_samples_from_ram
+    library = mlperf_loadgen.ConstructQSL(1024, 128, loader, unloader)
+    mlperf_loadgen.StartTest(system, library, cfg)
+    mlperf_loadgen.DestroyQSL(library)
+    mlperf_loadgen.DestroySUT(system)
+
+
+if __name__ == "__main__":  # Only when executed as a script, not on import.
+    app.run(main)
diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc
@@ -207,6 +207,13 @@ auto ScheduleDistribution<TestScenario::Server>(double qps) {
   };
 }
 
+/// \brief Sampler returning a fixed 1/qps-second interval; the RNG is unused.
+auto ScheduleConstantDistribution(double qps) {
+  const auto period = std::chrono::duration_cast<std::chrono::nanoseconds>(
+      std::chrono::duration<double>(1.0 / qps));
+  return [period](auto& /*gen*/) { return period; };  // Same call shape as before.
+}
+
 /// \brief Selects samples for the accuracy mode.
 template <TestMode mode>
 auto SampleDistribution(size_t sample_count, size_t stride, std::mt19937* rng) {
@@ -310,8 +317,9 @@ std::vector<QueryMetadata> GenerateQueries(
   auto sample_distribution_equal_issue = SampleDistributionEqualIssue(
       min_queries, loaded_samples.size(), &sample_rng);
 
-  auto schedule_distribution =
-      ScheduleDistribution<scenario>(settings.target_qps);
+  TestScenario temp_scenario = scenario;
+  auto schedule_distribution = ScheduleDistribution<scenario>(settings.target_qps);
+  auto schedule_constant_distribution = ScheduleConstantDistribution(settings.target_qps);
 
   // When sample_concatenate_permutation is turned on, pad to a multiple of the
   // complete dataset to ensure fairness.
@@ -397,7 +405,11 @@ std::vector<QueryMetadata> GenerateQueries(
     }
     queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
     prev_timestamp = timestamp;
-    timestamp += schedule_distribution(schedule_rng);
+    if (settings.server_constant_gen && (scenario == TestScenario::Server)){
+      timestamp += schedule_constant_distribution(schedule_rng);
+    } else {
+      timestamp += schedule_distribution(schedule_rng);
+    }
     // In equal_issue mode, the min_queries will be bumped up by a multiple of
     // the dataset size if the test time has not met the threshold.
     if (enable_equal_issue && (queries.size() >= min_queries) &&
diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h
@@ -169,6 +169,10 @@ struct TestSettings {
   /// StartTest() will be used to call IssueQuery(). See also
   /// mlperf::RegisterIssueQueryThread().
   uint64_t server_num_issue_query_threads = 0;
+  /// \brief If this flag is set to true, the time between samples generated
+  /// by LoadGen in the server scenario is constant. Otherwise, the time
+  /// between samples follows an exponential distribution.
+  bool server_constant_gen = false;
   /**@}*/
 
   // ==================================
diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc
@@ -53,6 +53,7 @@ TestSettingsInternal::TestSettingsInternal(
       use_token_latencies(requested.use_token_latencies),
       server_ttft_latency(requested.server_ttft_latency),
       server_tpot_latency(requested.server_tpot_latency),
+      server_constant_gen(requested.server_constant_gen),
       infer_token_latencies(requested.infer_token_latencies),
       token_latency_scaling_factor(requested.token_latency_scaling_factor) {
   // Target QPS, target latency, and max_async_queries.
@@ -305,6 +306,8 @@ void LogRequestedTestSettings(const TestSettings &s) {
                    s.server_max_async_queries);
         MLPERF_LOG(detail, "requested_server_num_issue_query_threads",
                    s.server_num_issue_query_threads);
+        MLPERF_LOG(detail, "requested_server_constant_gen",
+                   s.server_constant_gen);
         break;
       case TestScenario::Offline:
         MLPERF_LOG(detail, "requested_offline_expected_qps",
@@ -452,6 +455,8 @@ void TestSettingsInternal::LogEffectiveSettings() const {
                s.performance_sample_count);
     MLPERF_LOG(detail, "effective_sample_concatenate_permutation",
                s.sample_concatenate_permutation);
+    MLPERF_LOG(detail, "effective_server_constant_gen",
+               s.server_constant_gen);
 #else
     detail("");
     detail("Effective Settings:");
@@ -772,6 +777,8 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model,
     server_coalesce_queries = (val == 0) ? false : true;
   if (lookupkv(model, "Server", "max_async_queries", &val, nullptr))
     server_max_async_queries = int(val);
+  if (lookupkv(model, "Server", "constant_gen", &val, nullptr))
+    server_constant_gen = (val == 0) ? false : true;
 
   lookupkv(model, scenario, "min_duration", &min_duration_ms, nullptr);
   lookupkv(model, scenario, "max_duration", &max_duration_ms, nullptr);
diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h
@@ -85,6 +85,7 @@ struct TestSettingsInternal {
   bool use_token_latencies = false;
   int64_t server_ttft_latency;
   int64_t server_tpot_latency;
+  bool server_constant_gen;
 
   bool infer_token_latencies = false;
   int64_t token_latency_scaling_factor;