diff --git a/ax/benchmark/benchmark.py b/ax/benchmark/benchmark.py index 029889722bd..d32c3ed8d3f 100644 --- a/ax/benchmark/benchmark.py +++ b/ax/benchmark/benchmark.py @@ -27,10 +27,7 @@ import numpy as np from ax.benchmark.benchmark_method import BenchmarkMethod -from ax.benchmark.benchmark_problem import ( - BenchmarkProblemProtocol, - BenchmarkProblemWithKnownOptimum, -) +from ax.benchmark.benchmark_problem import BenchmarkProblem from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult from ax.benchmark.metrics.base import BenchmarkMetricBase, GroundTruthMetricMixin from ax.core.experiment import Experiment @@ -53,16 +50,14 @@ def compute_score_trace( optimization_trace: np.ndarray, num_baseline_trials: int, - problem: BenchmarkProblemProtocol, + problem: BenchmarkProblem, ) -> np.ndarray: """Computes a score trace from the optimization trace.""" # Use the first GenerationStep's best found point as baseline. Sometimes (ex. in # a timeout) the first GenerationStep will not have not completed and we will not # have enough trials; in this case we do not score. - if (len(optimization_trace) <= num_baseline_trials) or not isinstance( - problem, BenchmarkProblemWithKnownOptimum - ): + if len(optimization_trace) <= num_baseline_trials: return np.full(len(optimization_trace), np.nan) optimum = problem.optimal_value baseline = optimization_trace[num_baseline_trials - 1] @@ -77,7 +72,7 @@ def compute_score_trace( def _create_benchmark_experiment( - problem: BenchmarkProblemProtocol, method_name: str + problem: BenchmarkProblem, method_name: str ) -> Experiment: """Creates an empty experiment for the given problem and method. @@ -117,7 +112,7 @@ def _create_benchmark_experiment( def benchmark_replication( - problem: BenchmarkProblemProtocol, + problem: BenchmarkProblem, method: BenchmarkMethod, seed: int, ) -> BenchmarkResult: @@ -192,7 +187,7 @@ def benchmark_replication( def benchmark_one_method_problem( - problem: BenchmarkProblemProtocol, + problem: BenchmarkProblem, method: BenchmarkMethod, seeds: Iterable[int], ) -> AggregatedBenchmarkResult: @@ -205,7 +200,7 @@ def benchmark_one_method_problem( def benchmark_multiple_problems_methods( - problems: Iterable[BenchmarkProblemProtocol], + problems: Iterable[BenchmarkProblem], methods: Iterable[BenchmarkMethod], seeds: Iterable[int], ) -> List[AggregatedBenchmarkResult]: @@ -222,7 +217,7 @@ def benchmark_multiple_problems_methods( def make_ground_truth_metrics( - problem: BenchmarkProblemProtocol, + problem: BenchmarkProblem, include_tracking_metrics: bool = True, ) -> Dict[str, Metric]: """Makes a ground truth version for each metric defined on the problem. diff --git a/ax/benchmark/benchmark_problem.py b/ax/benchmark/benchmark_problem.py index b3f8e29531a..0c152586037 100644 --- a/ax/benchmark/benchmark_problem.py +++ b/ax/benchmark/benchmark_problem.py @@ -5,13 +5,8 @@ # pyre-strict -# NOTE: Do not add `from __future__ import annotations` to this file. Adding -# `annotations` postpones evaluation of types and will break FBLearner's usage of -# `BenchmarkProblem` as return type annotation, used for serialization and rendering -# in the UI. 
- -import abc -from typing import Any, Dict, List, Optional, Protocol, runtime_checkable, Type, Union +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Type, Union from ax.benchmark.metrics.base import BenchmarkMetricBase @@ -30,8 +25,11 @@ from ax.core.types import ComparisonOp from ax.utils.common.base import Base from ax.utils.common.typeutils import checked_cast -from botorch.test_functions.base import BaseTestProblem, ConstrainedBaseTestProblem -from botorch.test_functions.multi_objective import MultiObjectiveTestProblem +from botorch.test_functions.base import ( + BaseTestProblem, + ConstrainedBaseTestProblem, + MultiObjectiveTestProblem, +) from botorch.test_functions.synthetic import SyntheticTestFunction @@ -51,376 +49,245 @@ def _get_name( return f"{base_name}{observed_noise}{dim_str}" -@runtime_checkable -class BenchmarkProblemProtocol(Protocol): +@dataclass(kw_only=True, repr=True) +class BenchmarkProblem(Base): """ - Specifies the interface any benchmark problem must adhere to. - - Classes implementing this interface include BenchmarkProblem, - SurrogateBenchmarkProblem, and MOOSurrogateBenchmarkProblem. + Problem against which diffrent methods can be benchmarked. + + Defines how data is generated, the objective (via the OptimizationConfig), + and the SearchSpace. + + Args: + name: Can be generated programmatically with `_get_name`. + optimization_config: Defines the objective of optimizaiton. + num_trials: Number of optimization iterations to run. BatchTrials count + as one trial. + observe_noise_stds: If boolean, whether the standard deviation of the + observation noise is observed for all metrics. If a dictionary, + whether noise levels are observed on a per-metric basis. + has_ground_truth: Whether the Runner produces underlying ground truth + values, which are not observed in real noisy problems but may be + known in benchmarks. + tracking_metrics: Tracking metrics are not optimized, and for the + purpose of benchmarking, they will not be fit. The ground truth may + be provided as `tracking_metrics`. + optimal_value: The best ground-truth objective value. Hypervolume for + multi-objective problems. If the best value is not known, it is + conventional to set it to a value that is almost certainly better + than the best value, so that a benchmark's score will not exceed 100%. + search_space: The search space. + runner: The Runner that will be used to generate data for the problem, + including any ground-truth data stored as tracking metrics. """ name: str - search_space: SearchSpace optimization_config: OptimizationConfig num_trials: int - tracking_metrics: List[BenchmarkMetricBase] - is_noiseless: bool # If True, evaluations are deterministic - observe_noise_stds: Union[ - bool, Dict[str, bool] - ] # Whether we observe the observation noise level - has_ground_truth: bool # if True, evals (w/o synthetic noise) are determinstic - - @abc.abstractproperty - def runner(self) -> Runner: - pass # pragma: no cover - - -@runtime_checkable -class BenchmarkProblemWithKnownOptimum(Protocol): + observe_noise_stds: Union[bool, Dict[str, bool]] = False + has_ground_truth: bool = True + tracking_metrics: List[BenchmarkMetricBase] = field(default_factory=list) optimal_value: float + search_space: SearchSpace = field(repr=False) + runner: Runner = field(repr=False) + is_noiseless: bool -class BenchmarkProblem(Base): - """Benchmark problem, represented in terms of Ax search space, optimization - config, and runner. 
- """ - - def __init__( - self, - name: str, - search_space: SearchSpace, - optimization_config: OptimizationConfig, - runner: Runner, - num_trials: int, - is_noiseless: bool = False, - observe_noise_stds: Union[bool, Dict[str, bool]] = False, - has_ground_truth: bool = False, - tracking_metrics: Optional[List[BenchmarkMetricBase]] = None, - ) -> None: - self.name = name - self.search_space = search_space - self.optimization_config = optimization_config - self._runner = runner - self.num_trials = num_trials - self.is_noiseless = is_noiseless - self.observe_noise_stds = observe_noise_stds - self.has_ground_truth = has_ground_truth - self.tracking_metrics: List[BenchmarkMetricBase] = tracking_metrics or [] - - @property - def runner(self) -> Runner: - return self._runner - - @classmethod - def from_botorch( - cls, - test_problem_class: Type[BaseTestProblem], - test_problem_kwargs: Dict[str, Any], - lower_is_better: bool, - num_trials: int, - observe_noise_sd: bool = False, - ) -> "BenchmarkProblem": - """ - Create a BenchmarkProblem from a BoTorch BaseTestProblem using - specialized Metrics and Runners. The test problem's result will be - computed on the Runner and retrieved by the Metric. - - Args: - test_problem_class: The BoTorch test problem class which will be used - to define the `search_space`, `optimization_config`, and `runner`. - test_problem_kwargs: Keyword arguments used to instantiate the - `test_problem_class`. - num_trials: Simply the `num_trials` of the `BenchmarkProblem` created. - observe_noise_sd: Whether the standard deviation of the observation noise is - observed or not (in which case it must be inferred by the model). - This is separate from whether synthetic noise is added to the - problem, which is controlled by the `noise_std` of the test problem. - """ - - # pyre-fixme [45]: Invalid class instantiation - test_problem = test_problem_class(**test_problem_kwargs) - is_constrained = isinstance(test_problem, ConstrainedBaseTestProblem) - - search_space = SearchSpace( - parameters=[ - RangeParameter( - name=f"x{i}", - parameter_type=ParameterType.FLOAT, - lower=lower, - upper=upper, - ) - for i, (lower, upper) in enumerate(test_problem._bounds) - ] - ) - - dim = test_problem_kwargs.get("dim", None) - name = _get_name( - test_problem=test_problem, observe_noise_sd=observe_noise_sd, dim=dim - ) - - # TODO: Support constrained MOO problems. - objective = Objective( - metric=BenchmarkMetric( - name=name, - lower_is_better=lower_is_better, - observe_noise_sd=observe_noise_sd, - outcome_index=0, - ), - minimize=lower_is_better, - ) - - outcome_names = [name] - outcome_constraints = [] - - # NOTE: Currently we don't support the case where only some of the - # outcomes have noise levels observed. 
- - if is_constrained: - for i in range(test_problem.num_constraints): - outcome_name = f"constraint_slack_{i}" - outcome_constraints.append( - OutcomeConstraint( - metric=BenchmarkMetric( - name=outcome_name, - lower_is_better=False, # positive slack = feasible - observe_noise_sd=observe_noise_sd, - outcome_index=i, - ), - op=ComparisonOp.GEQ, - bound=0.0, - relative=False, - ) - ) - outcome_names.append(outcome_name) - - optimization_config = OptimizationConfig( - objective=objective, - outcome_constraints=outcome_constraints, - ) - - return cls( - name=name, - search_space=search_space, - optimization_config=optimization_config, - runner=BotorchTestProblemRunner( - test_problem_class=test_problem_class, - test_problem_kwargs=test_problem_kwargs, - outcome_names=outcome_names, - ), - num_trials=num_trials, - observe_noise_stds=observe_noise_sd, - is_noiseless=test_problem.noise_std in (None, 0.0), - has_ground_truth=True, # all synthetic problems have ground truth - ) - - def __repr__(self) -> str: - """ - Return a string representation that includes only the attributes that - print nicely and contain information likely to be useful. - """ - return ( - f"{self.__class__.__name__}(" - f"name={self.name}, " - f"optimization_config={self.optimization_config}, " - f"num_trials={self.num_trials}, " - f"is_noiseless={self.is_noiseless}, " - f"observe_noise_stds={self.observe_noise_stds}, " - f"has_ground_truth={self.has_ground_truth}, " - f"tracking_metrics={self.tracking_metrics})" - ) +def create_single_objective_problem_from_botorch( + test_problem_class: Type[SyntheticTestFunction], + test_problem_kwargs: Dict[str, Any], + lower_is_better: bool, + num_trials: int, + observe_noise_sd: bool = False, +) -> BenchmarkProblem: + """ + Create a `BenchmarkProblem` whose `optimization_config` is a + `SingleObjectiveOptimizationConfig` a BoTorch SyntheticTestFunction using + specialized Metrics and Runners for benchmarking. The test problem's result + will be computed on the Runner and retrieved by the Metric. + + Args: + test_problem_class: The BoTorch test problem class which will be used + to define the `search_space`, `optimization_config`, and `runner`. + test_problem_kwargs: Keyword arguments used to instantiate the + `test_problem_class`. + num_trials: Simply the `num_trials` of the `BenchmarkProblem` created. + observe_noise_sd: Whether the standard deviation of the observation noise is + observed or not (in which case it must be inferred by the model). + This is separate from whether synthetic noise is added to the + problem, which is controlled by the `noise_std` of the test problem. + """ + # pyre-fixme [45]: Invalid class instantiation + test_problem = test_problem_class(**test_problem_kwargs) + is_constrained = isinstance(test_problem, ConstrainedBaseTestProblem) + + search_space = SearchSpace( + parameters=[ + RangeParameter( + name=f"x{i}", + parameter_type=ParameterType.FLOAT, + lower=lower, + upper=upper, + ) + for i, (lower, upper) in enumerate(test_problem._bounds) + ] + ) + dim = test_problem_kwargs.get("dim", None) + name = _get_name( + test_problem=test_problem, observe_noise_sd=observe_noise_sd, dim=dim + ) -class SingleObjectiveBenchmarkProblem(BenchmarkProblem): - """The most basic BenchmarkProblem, with a single objective and a known optimal - value. - """ + # TODO: Support constrained MOO problems. 
- def __init__( - self, - optimal_value: float, - *, - name: str, - search_space: SearchSpace, - optimization_config: OptimizationConfig, - runner: Runner, - num_trials: int, - is_noiseless: bool = False, - observe_noise_stds: Union[bool, Dict[str, bool]] = False, - has_ground_truth: bool = False, - tracking_metrics: Optional[List[BenchmarkMetricBase]] = None, - ) -> None: - super().__init__( + objective = Objective( + metric=BenchmarkMetric( name=name, - search_space=search_space, - optimization_config=optimization_config, - runner=runner, - num_trials=num_trials, - is_noiseless=is_noiseless, - observe_noise_stds=observe_noise_stds, - has_ground_truth=has_ground_truth, - tracking_metrics=tracking_metrics, - ) - self.optimal_value = optimal_value - - @classmethod - def from_botorch_synthetic( - cls, - test_problem_class: Type[SyntheticTestFunction], - test_problem_kwargs: Dict[str, Any], - lower_is_better: bool, - num_trials: int, - observe_noise_sd: bool = False, - ) -> "SingleObjectiveBenchmarkProblem": - """Create a BenchmarkProblem from a BoTorch BaseTestProblem using specialized - Metrics and Runners. The test problem's result will be computed on the Runner - and retrieved by the Metric. - """ - - # pyre-fixme [45]: Invalid class instantiation - test_problem = test_problem_class(**test_problem_kwargs) - - problem = BenchmarkProblem.from_botorch( - test_problem_class=test_problem_class, - test_problem_kwargs=test_problem_kwargs, lower_is_better=lower_is_better, - num_trials=num_trials, observe_noise_sd=observe_noise_sd, - ) - - dim = test_problem_kwargs.get("dim", None) - name = _get_name( - test_problem=test_problem, observe_noise_sd=observe_noise_sd, dim=dim - ) - - return cls( - name=name, - search_space=problem.search_space, - optimization_config=problem.optimization_config, - runner=problem.runner, - num_trials=num_trials, - is_noiseless=problem.is_noiseless, - observe_noise_stds=problem.observe_noise_stds, - has_ground_truth=problem.has_ground_truth, - optimal_value=test_problem.optimal_value, - ) + outcome_index=0, + ), + minimize=lower_is_better, + ) + + outcome_names = [name] + outcome_constraints = [] + + # NOTE: Currently we don't support the case where only some of the + # outcomes have noise levels observed. 
+ + if is_constrained: + for i in range(test_problem.num_constraints): + outcome_name = f"constraint_slack_{i}" + outcome_constraints.append( + OutcomeConstraint( + metric=BenchmarkMetric( + name=outcome_name, + lower_is_better=False, # positive slack = feasible + observe_noise_sd=observe_noise_sd, + outcome_index=i, + ), + op=ComparisonOp.GEQ, + bound=0.0, + relative=False, + ) + ) + outcome_names.append(outcome_name) + + optimization_config = OptimizationConfig( + objective=objective, + outcome_constraints=outcome_constraints, + ) + optimal_value = ( + test_problem.max_hv + if isinstance(test_problem, MultiObjectiveTestProblem) + else test_problem.optimal_value + ) + return BenchmarkProblem( + name=name, + search_space=search_space, + optimization_config=optimization_config, + runner=BotorchTestProblemRunner( + test_problem_class=test_problem_class, + test_problem_kwargs=test_problem_kwargs, + outcome_names=outcome_names, + ), + num_trials=num_trials, + observe_noise_stds=observe_noise_sd, + is_noiseless=test_problem.noise_std in (None, 0.0), + has_ground_truth=True, # all synthetic problems have ground truth + optimal_value=optimal_value, + ) +@dataclass(kw_only=True, repr=True) class MultiObjectiveBenchmarkProblem(BenchmarkProblem): """ A `BenchmarkProblem` that supports multiple objectives. For multi-objective problems, `optimal_value` indicates the maximum hypervolume attainable with the given `reference_point`. - """ - def __init__( - self, - optimal_value: float, - reference_point: List[float], - *, - name: str, - search_space: SearchSpace, - optimization_config: OptimizationConfig, - runner: Runner, - num_trials: int, - is_noiseless: bool = False, - observe_noise_stds: Union[bool, Dict[str, bool]] = False, - has_ground_truth: bool = False, - tracking_metrics: Optional[List[BenchmarkMetricBase]] = None, - ) -> None: - self.optimal_value = optimal_value - self.reference_point = reference_point - super().__init__( - name=name, - search_space=search_space, - optimization_config=optimization_config, - runner=runner, - num_trials=num_trials, - is_noiseless=is_noiseless, - observe_noise_stds=observe_noise_stds, - has_ground_truth=has_ground_truth, - tracking_metrics=tracking_metrics, - ) + For argument descriptions, see `BenchmarkProblem`; it additionally takes a `runner` + and a `reference_point`. + """ - @classmethod - def from_botorch_multi_objective( - cls, - test_problem_class: Type[MultiObjectiveTestProblem], - test_problem_kwargs: Dict[str, Any], - # TODO: Figure out whether we should use `lower_is_better` here. - num_trials: int, - observe_noise_sd: bool = False, - ) -> "MultiObjectiveBenchmarkProblem": - """Create a BenchmarkProblem from a BoTorch BaseTestProblem using specialized - Metrics and Runners. The test problem's result will be computed on the Runner - once per trial and each Metric will retrieve its own result by index. - """ - - # pyre-fixme [45]: Invalid class instantiation - test_problem = test_problem_class(**test_problem_kwargs) - - problem = BenchmarkProblem.from_botorch( - test_problem_class=test_problem_class, - test_problem_kwargs=test_problem_kwargs, - lower_is_better=True, # Seems like we always assume minimization for MOO? 
- num_trials=num_trials, - observe_noise_sd=observe_noise_sd, - ) + reference_point: List[float] + optimization_config: MultiObjectiveOptimizationConfig - dim = test_problem_kwargs.get("dim", None) - name = _get_name( - test_problem=test_problem, observe_noise_sd=observe_noise_sd, dim=dim - ) - n_obj = test_problem.num_objectives - if not observe_noise_sd: - noise_sds = [None] * n_obj - elif isinstance(test_problem.noise_std, list): - noise_sds = test_problem.noise_std - else: - noise_sds = [checked_cast(float, test_problem.noise_std or 0.0)] * n_obj - - metrics = [ - BenchmarkMetric( - name=f"{name}_{i}", - lower_is_better=True, - observe_noise_sd=observe_noise_sd, - outcome_index=i, - ) - for i, noise_sd in enumerate(noise_sds) - ] - optimization_config = MultiObjectiveOptimizationConfig( - objective=MultiObjective( - objectives=[ - Objective(metric=metric, minimize=True) for metric in metrics - ] - ), - objective_thresholds=[ - ObjectiveThreshold( - metric=metric, - bound=test_problem.ref_point[i].item(), - relative=False, - op=ComparisonOp.LEQ, - ) - for i, metric in enumerate(metrics) - ], +def create_multi_objective_problem_from_botorch( + test_problem_class: Type[MultiObjectiveTestProblem], + test_problem_kwargs: Dict[str, Any], + # TODO: Figure out whether we should use `lower_is_better` here. + num_trials: int, + observe_noise_sd: bool = False, +) -> MultiObjectiveBenchmarkProblem: + """Create a BenchmarkProblem from a BoTorch BaseTestProblem using specialized + Metrics and Runners. The test problem's result will be computed on the Runner + once per trial and each Metric will retrieve its own result by index. + """ + if issubclass(test_problem_class, ConstrainedBaseTestProblem): + raise NotImplementedError( + "Constrained multi-objective problems are not supported." ) - return cls( - name=name, - search_space=problem.search_space, - optimization_config=optimization_config, - runner=problem.runner, - num_trials=num_trials, - is_noiseless=problem.is_noiseless, - observe_noise_stds=observe_noise_sd, - has_ground_truth=problem.has_ground_truth, - optimal_value=test_problem.max_hv, - reference_point=test_problem._ref_point, + # pyre-fixme [45]: Invalid class instantiation + test_problem = test_problem_class(**test_problem_kwargs) + + problem = create_single_objective_problem_from_botorch( + # pyre-fixme [6]: Passing a multi-objective problem where a + # single-objective problem is expected. + test_problem_class=test_problem_class, + test_problem_kwargs=test_problem_kwargs, + lower_is_better=True, # Seems like we always assume minimization for MOO? 
+ num_trials=num_trials, + observe_noise_sd=observe_noise_sd, + ) + + name = problem.name + + n_obj = test_problem.num_objectives + if not observe_noise_sd: + noise_sds = [None] * n_obj + elif isinstance(test_problem.noise_std, list): + noise_sds = test_problem.noise_std + else: + noise_sds = [checked_cast(float, test_problem.noise_std or 0.0)] * n_obj + + metrics = [ + BenchmarkMetric( + name=f"{name}_{i}", + lower_is_better=True, + observe_noise_sd=observe_noise_sd, + outcome_index=i, ) - - @property - def maximum_hypervolume(self) -> float: - return self.optimal_value + for i, noise_sd in enumerate(noise_sds) + ] + optimization_config = MultiObjectiveOptimizationConfig( + objective=MultiObjective( + objectives=[Objective(metric=metric, minimize=True) for metric in metrics] + ), + objective_thresholds=[ + ObjectiveThreshold( + metric=metric, + bound=test_problem.ref_point[i].item(), + relative=False, + op=ComparisonOp.LEQ, + ) + for i, metric in enumerate(metrics) + ], + ) + + return MultiObjectiveBenchmarkProblem( + name=name, + search_space=problem.search_space, + optimization_config=optimization_config, + runner=problem.runner, + num_trials=num_trials, + is_noiseless=problem.is_noiseless, + observe_noise_stds=observe_noise_sd, + has_ground_truth=problem.has_ground_truth, + optimal_value=test_problem.max_hv, + reference_point=test_problem._ref_point, + ) diff --git a/ax/benchmark/problems/hpo/pytorch_cnn.py b/ax/benchmark/problems/hpo/pytorch_cnn.py index 0a5db5dbb8b..15da5b9d30e 100644 --- a/ax/benchmark/problems/hpo/pytorch_cnn.py +++ b/ax/benchmark/problems/hpo/pytorch_cnn.py @@ -9,7 +9,7 @@ import pandas as pd import torch -from ax.benchmark.benchmark_problem import SingleObjectiveBenchmarkProblem +from ax.benchmark.benchmark_problem import BenchmarkProblem from ax.core.base_trial import BaseTrial, TrialStatus from ax.core.data import Data from ax.core.metric import Metric, MetricFetchE, MetricFetchResult @@ -26,7 +26,7 @@ from torch.utils.data import DataLoader, Dataset -class PyTorchCNNBenchmarkProblem(SingleObjectiveBenchmarkProblem): +class PyTorchCNNBenchmarkProblem(BenchmarkProblem): @equality_typechecker def __eq__(self, other: Base) -> bool: if not isinstance(other, PyTorchCNNBenchmarkProblem): diff --git a/ax/benchmark/problems/registry.py b/ax/benchmark/problems/registry.py index 4c6521af1a6..a7bc652a7b9 100644 --- a/ax/benchmark/problems/registry.py +++ b/ax/benchmark/problems/registry.py @@ -11,8 +11,8 @@ from ax.benchmark.benchmark_problem import ( BenchmarkProblem, - MultiObjectiveBenchmarkProblem, - SingleObjectiveBenchmarkProblem, + create_multi_objective_problem_from_botorch, + create_single_objective_problem_from_botorch, ) from ax.benchmark.problems.hd_embedding import embed_higher_dimension from ax.benchmark.problems.hpo.torchvision import PyTorchCNNTorchvisionBenchmarkProblem @@ -29,7 +29,7 @@ class BenchmarkProblemRegistryEntry: BENCHMARK_PROBLEM_REGISTRY = { "ackley4": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.Ackley, "test_problem_kwargs": {"dim": 4}, @@ -39,7 +39,7 @@ class BenchmarkProblemRegistryEntry: }, ), "branin": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.Branin, "test_problem_kwargs": {}, @@ -49,7 +49,7 @@ class 
BenchmarkProblemRegistryEntry: }, ), "branin_currin": BenchmarkProblemRegistryEntry( - factory_fn=MultiObjectiveBenchmarkProblem.from_botorch_multi_objective, + factory_fn=create_multi_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": BraninCurrin, "test_problem_kwargs": {}, @@ -59,7 +59,7 @@ class BenchmarkProblemRegistryEntry: ), "branin_currin30": BenchmarkProblemRegistryEntry( factory_fn=lambda n, num_trials: embed_higher_dimension( - problem=MultiObjectiveBenchmarkProblem.from_botorch_multi_objective( + problem=create_multi_objective_problem_from_botorch( test_problem_class=BraninCurrin, test_problem_kwargs={}, num_trials=num_trials, @@ -70,7 +70,7 @@ class BenchmarkProblemRegistryEntry: factory_kwargs={"n": 30, "num_trials": 30}, ), "griewank4": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.Griewank, "test_problem_kwargs": {"dim": 4}, @@ -80,7 +80,7 @@ class BenchmarkProblemRegistryEntry: }, ), "hartmann3": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.Hartmann, "test_problem_kwargs": {"dim": 3}, @@ -90,7 +90,7 @@ class BenchmarkProblemRegistryEntry: }, ), "hartmann6": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.Hartmann, "test_problem_kwargs": {"dim": 6}, @@ -101,7 +101,7 @@ class BenchmarkProblemRegistryEntry: ), "hartmann30": BenchmarkProblemRegistryEntry( factory_fn=lambda n, num_trials: embed_higher_dimension( - problem=SingleObjectiveBenchmarkProblem.from_botorch_synthetic( + problem=create_single_objective_problem_from_botorch( test_problem_class=synthetic.Hartmann, test_problem_kwargs={"dim": 6}, lower_is_better=True, @@ -131,7 +131,7 @@ class BenchmarkProblemRegistryEntry: factory_kwargs={"num_trials": 50, "observe_noise_sd": False}, ), "levy4": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.Levy, "test_problem_kwargs": {"dim": 4}, @@ -141,7 +141,7 @@ class BenchmarkProblemRegistryEntry: }, ), "powell4": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.Powell, "test_problem_kwargs": {"dim": 4}, @@ -151,7 +151,7 @@ class BenchmarkProblemRegistryEntry: }, ), "rosenbrock4": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.Rosenbrock, "test_problem_kwargs": {"dim": 4}, @@ -161,7 +161,7 @@ class BenchmarkProblemRegistryEntry: }, ), "six_hump_camel": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.SixHumpCamel, "test_problem_kwargs": {}, @@ -171,7 +171,7 @@ class BenchmarkProblemRegistryEntry: }, ), "three_hump_camel": BenchmarkProblemRegistryEntry( - 
factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.ThreeHumpCamel, "test_problem_kwargs": {}, @@ -182,7 +182,7 @@ class BenchmarkProblemRegistryEntry: ), # Problems where we observe the noise level "branin_observed_noise": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.Branin, "test_problem_kwargs": {}, @@ -192,7 +192,7 @@ class BenchmarkProblemRegistryEntry: }, ), "branin_currin_observed_noise": BenchmarkProblemRegistryEntry( - factory_fn=MultiObjectiveBenchmarkProblem.from_botorch_multi_objective, + factory_fn=create_multi_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": BraninCurrin, "test_problem_kwargs": {}, @@ -202,7 +202,7 @@ class BenchmarkProblemRegistryEntry: ), "branin_currin30_observed_noise": BenchmarkProblemRegistryEntry( factory_fn=lambda n, num_trials: embed_higher_dimension( - problem=MultiObjectiveBenchmarkProblem.from_botorch_multi_objective( + problem=create_multi_objective_problem_from_botorch( test_problem_class=BraninCurrin, test_problem_kwargs={}, num_trials=num_trials, @@ -213,7 +213,7 @@ class BenchmarkProblemRegistryEntry: factory_kwargs={"n": 30, "num_trials": 30}, ), "hartmann6_observed_noise": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.Hartmann, "test_problem_kwargs": {"dim": 6}, @@ -224,7 +224,7 @@ class BenchmarkProblemRegistryEntry: ), "hartmann30_observed_noise": BenchmarkProblemRegistryEntry( factory_fn=lambda n, num_trials: embed_higher_dimension( - problem=SingleObjectiveBenchmarkProblem.from_botorch_synthetic( + problem=create_single_objective_problem_from_botorch( test_problem_class=synthetic.Hartmann, test_problem_kwargs={"dim": 6}, lower_is_better=True, @@ -240,7 +240,7 @@ class BenchmarkProblemRegistryEntry: factory_kwargs={"num_trials": 25, "observe_noise_sd": True}, ), "constrained_gramacy_observed_noise": BenchmarkProblemRegistryEntry( - factory_fn=SingleObjectiveBenchmarkProblem.from_botorch_synthetic, + factory_fn=create_single_objective_problem_from_botorch, factory_kwargs={ "test_problem_class": synthetic.ConstrainedGramacy, "test_problem_kwargs": {}, diff --git a/ax/benchmark/problems/surrogate.py b/ax/benchmark/problems/surrogate.py index 4f08ecb84f8..ffb2930b3a8 100644 --- a/ax/benchmark/problems/surrogate.py +++ b/ax/benchmark/problems/surrogate.py @@ -4,226 +4,3 @@ # LICENSE file in the root directory of this source tree. 
# pyre-strict - -from typing import Callable, Dict, List, Optional, Tuple, Union - -from ax.benchmark.metrics.base import BenchmarkMetricBase - -from ax.benchmark.runners.surrogate import SurrogateRunner -from ax.core.optimization_config import ( - MultiObjectiveOptimizationConfig, - OptimizationConfig, -) -from ax.core.runner import Runner -from ax.core.search_space import SearchSpace -from ax.modelbridge.torch import TorchModelBridge -from ax.utils.common.base import Base -from ax.utils.common.equality import equality_typechecker -from ax.utils.common.typeutils import checked_cast, not_none -from botorch.utils.datasets import SupervisedDataset - - -class SurrogateBenchmarkProblemBase(Base): - """ - Base class for SOOSurrogateBenchmarkProblem and MOOSurrogateBenchmarkProblem. - - Its `runner` is created lazily, when `runner` is accessed or `set_runner` is - called, to defer construction of the surrogate and downloading of datasets. - """ - - def __init__( - self, - *, - name: str, - search_space: SearchSpace, - optimization_config: OptimizationConfig, - num_trials: int, - outcome_names: List[str], - observe_noise_stds: Union[bool, Dict[str, bool]] = False, - noise_stds: Union[float, Dict[str, float]] = 0.0, - get_surrogate_and_datasets: Optional[ - Callable[[], Tuple[TorchModelBridge, List[SupervisedDataset]]] - ] = None, - tracking_metrics: Optional[List[BenchmarkMetricBase]] = None, - _runner: Optional[Runner] = None, - ) -> None: - """Construct a `SurrogateBenchmarkProblemBase` instance. - - Args: - name: The name of the benchmark problem. - search_space: The search space to optimize over. - optimization_config: THe optimization config for the problem. - num_trials: The number of trials to run. - outcome_names: The names of the metrics the benchmark problem - produces outcome observations for. - observe_noise_stds: Whether or not to observe the observation noise - level for each metric. If True/False, observe the the noise standard - deviation for all/no metrics. If a dictionary, specify this for - individual metrics (metrics not appearing in the dictionary will - be assumed to not provide observation noise levels). - noise_stds: The standard deviation(s) of the observation noise(s). - If a single value is provided, it is used for all metrics. Providing - a dictionary allows specifying different noise levels for different - metrics (metrics not appearing in the dictionary will be assumed to - be noiseless - but not necessarily be known to the problem to be - noiseless). - get_surrogate_and_datasets: A factory function that retunrs the Surrogate - and a list of datasets to be used by the surrogate. - tracking_metrics: Additional tracking metrics to compute during the - optimization (not used to inform the optimization). - """ - - if get_surrogate_and_datasets is None and _runner is None: - raise ValueError( - "Either `get_surrogate_and_datasets` or `_runner` required." 
- ) - self.name = name - self.search_space = search_space - self.optimization_config = optimization_config - self.num_trials = num_trials - self.outcome_names = outcome_names - self.observe_noise_stds = observe_noise_stds - self.noise_stds = noise_stds - self.get_surrogate_and_datasets = get_surrogate_and_datasets - self.tracking_metrics: List[BenchmarkMetricBase] = tracking_metrics or [] - self._runner = _runner - - @property - def is_noiseless(self) -> bool: - if self.noise_stds is None: - return True - if isinstance(self.noise_stds, float): - return self.noise_stds == 0.0 - return all(std == 0.0 for std in checked_cast(dict, self.noise_stds).values()) - - @property - def has_ground_truth(self) -> bool: - # All surrogate-based problems have a ground truth - return True - - @equality_typechecker - def __eq__(self, other: Base) -> bool: - if type(other) is not type(self): - return False - - # Checking the whole datasets' equality here would be too expensive to be - # worth it; just check names instead - return self.name == other.name - - def set_runner(self) -> None: - surrogate, datasets = not_none(self.get_surrogate_and_datasets)() - - self._runner = SurrogateRunner( - name=self.name, - surrogate=surrogate, - datasets=datasets, - search_space=self.search_space, - outcome_names=self.outcome_names, - noise_stds=self.noise_stds, - ) - - @property - def runner(self) -> Runner: - if self._runner is None: - self.set_runner() - return not_none(self._runner) - - def __repr__(self) -> str: - """ - Return a string representation that includes only the attributes that - print nicely and contain information likely to be useful. - """ - return ( - f"{self.__class__.__name__}(" - f"name={self.name}, " - f"optimization_config={self.optimization_config}, " - f"num_trials={self.num_trials}, " - f"is_noiseless={self.is_noiseless}, " - f"observe_noise_stds={self.observe_noise_stds}, " - f"noise_stds={self.noise_stds}, " - f"tracking_metrics={self.tracking_metrics})" - ) - - -class SOOSurrogateBenchmarkProblem(SurrogateBenchmarkProblemBase): - """ - Has the same attributes/properties as a `MultiObjectiveBenchmarkProblem`, - but its runner is not constructed until needed, to allow for deferring - constructing the surrogate and downloading data. The surrogate is only - defined when `runner` is accessed or `set_runner` is called. - """ - - def __init__( - self, - optimal_value: float, - *, - name: str, - search_space: SearchSpace, - optimization_config: OptimizationConfig, - num_trials: int, - outcome_names: List[str], - observe_noise_stds: Union[bool, Dict[str, bool]] = False, - noise_stds: Union[float, Dict[str, float]] = 0.0, - get_surrogate_and_datasets: Optional[ - Callable[[], Tuple[TorchModelBridge, List[SupervisedDataset]]] - ] = None, - tracking_metrics: Optional[List[BenchmarkMetricBase]] = None, - _runner: Optional[Runner] = None, - ) -> None: - super().__init__( - name=name, - search_space=search_space, - optimization_config=optimization_config, - num_trials=num_trials, - outcome_names=outcome_names, - observe_noise_stds=observe_noise_stds, - noise_stds=noise_stds, - get_surrogate_and_datasets=get_surrogate_and_datasets, - tracking_metrics=tracking_metrics, - _runner=_runner, - ) - self.optimal_value = optimal_value - - -class MOOSurrogateBenchmarkProblem(SurrogateBenchmarkProblemBase): - """ - Has the same attributes/properties as a `MultiObjectiveBenchmarkProblem`, - but its runner is not constructed until needed, to allow for deferring - constructing the surrogate and downloading data. 
The surrogate is only - defined when `runner` is accessed or `set_runner` is called. - """ - - optimization_config: MultiObjectiveOptimizationConfig - - def __init__( - self, - optimal_value: float, - reference_point: List[float], - *, - name: str, - search_space: SearchSpace, - optimization_config: MultiObjectiveOptimizationConfig, - num_trials: int, - outcome_names: List[str], - observe_noise_stds: Union[bool, Dict[str, bool]] = False, - noise_stds: Union[float, Dict[str, float]] = 0.0, - get_surrogate_and_datasets: Optional[ - Callable[[], Tuple[TorchModelBridge, List[SupervisedDataset]]] - ] = None, - tracking_metrics: Optional[List[BenchmarkMetricBase]] = None, - _runner: Optional[Runner] = None, - ) -> None: - super().__init__( - name=name, - search_space=search_space, - optimization_config=optimization_config, - num_trials=num_trials, - outcome_names=outcome_names, - observe_noise_stds=observe_noise_stds, - noise_stds=noise_stds, - get_surrogate_and_datasets=get_surrogate_and_datasets, - tracking_metrics=tracking_metrics, - _runner=_runner, - ) - self.reference_point = reference_point - self.optimal_value = optimal_value diff --git a/ax/benchmark/problems/synthetic/discretized/mixed_integer.py b/ax/benchmark/problems/synthetic/discretized/mixed_integer.py index b7bcaeb9080..4f72e18fffb 100644 --- a/ax/benchmark/problems/synthetic/discretized/mixed_integer.py +++ b/ax/benchmark/problems/synthetic/discretized/mixed_integer.py @@ -20,7 +20,7 @@ from typing import Dict, List, Optional, Tuple, Type, Union -from ax.benchmark.benchmark_problem import SingleObjectiveBenchmarkProblem +from ax.benchmark.benchmark_problem import BenchmarkProblem from ax.benchmark.metrics.benchmark import BenchmarkMetric from ax.benchmark.runners.botorch_test import BotorchTestProblemRunner from ax.core.objective import Objective @@ -47,7 +47,7 @@ def _get_problem_from_common_inputs( num_trials: int, optimal_value: float, test_problem_bounds: Optional[List[Tuple[float, float]]] = None, -) -> SingleObjectiveBenchmarkProblem: +) -> BenchmarkProblem: """This is a helper that deduplicates common bits of the below problems. Args: @@ -111,7 +111,7 @@ def _get_problem_from_common_inputs( outcome_names=[metric_name], modified_bounds=bounds, ) - return SingleObjectiveBenchmarkProblem( + return BenchmarkProblem( name=benchmark_name + ("_observed_noise" if observe_noise_sd else ""), search_space=search_space, optimization_config=optimization_config, @@ -128,7 +128,7 @@ def get_discrete_hartmann( num_trials: int = 50, observe_noise_sd: bool = False, bounds: Optional[List[Tuple[float, float]]] = None, -) -> SingleObjectiveBenchmarkProblem: +) -> BenchmarkProblem: """6D Hartmann problem where first 4 dimensions are discretized.""" dim_int = 4 if bounds is None: @@ -160,7 +160,7 @@ def get_discrete_ackley( num_trials: int = 50, observe_noise_sd: bool = False, bounds: Optional[List[Tuple[float, float]]] = None, -) -> SingleObjectiveBenchmarkProblem: +) -> BenchmarkProblem: """13D Ackley problem where first 10 dimensions are discretized. This also restricts Ackley evaluation bounds to [0, 1]. 
@@ -193,7 +193,7 @@ def get_discrete_rosenbrock( num_trials: int = 50, observe_noise_sd: bool = False, bounds: Optional[List[Tuple[float, float]]] = None, -) -> SingleObjectiveBenchmarkProblem: +) -> BenchmarkProblem: """10D Rosenbrock problem where first 6 dimensions are discretized.""" dim_int = 6 if bounds is None: diff --git a/ax/benchmark/problems/synthetic/hss/jenatton.py b/ax/benchmark/problems/synthetic/hss/jenatton.py index f424db30904..f545ac39400 100644 --- a/ax/benchmark/problems/synthetic/hss/jenatton.py +++ b/ax/benchmark/problems/synthetic/hss/jenatton.py @@ -5,7 +5,7 @@ # pyre-strict -from ax.benchmark.benchmark_problem import SingleObjectiveBenchmarkProblem +from ax.benchmark.benchmark_problem import BenchmarkProblem from ax.benchmark.metrics.jenatton import JenattonMetric from ax.core.objective import Objective from ax.core.optimization_config import OptimizationConfig @@ -17,7 +17,7 @@ def get_jenatton_benchmark_problem( num_trials: int = 50, observe_noise_sd: bool = False, -) -> SingleObjectiveBenchmarkProblem: +) -> BenchmarkProblem: search_space = HierarchicalSearchSpace( parameters=[ ChoiceParameter( @@ -65,7 +65,7 @@ def get_jenatton_benchmark_problem( name = "Jenatton" + ("_observed_noise" if observe_noise_sd else "") - return SingleObjectiveBenchmarkProblem( + return BenchmarkProblem( name=name, search_space=search_space, optimization_config=optimization_config, diff --git a/ax/benchmark/runners/surrogate.py b/ax/benchmark/runners/surrogate.py index f64a5d1dd15..f685307d83a 100644 --- a/ax/benchmark/runners/surrogate.py +++ b/ax/benchmark/runners/surrogate.py @@ -6,7 +6,7 @@ # pyre-strict import warnings -from typing import Any, Dict, Iterable, List, Optional, Set, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from ax.benchmark.runners.base import BenchmarkRunner @@ -15,20 +15,27 @@ from ax.core.observation import ObservationFeatures from ax.core.search_space import SearchSpace from ax.modelbridge.torch import TorchModelBridge +from ax.utils.common.base import Base +from ax.utils.common.equality import equality_typechecker from ax.utils.common.serialization import TClassDecoderRegistry, TDecoderRegistry from botorch.utils.datasets import SupervisedDataset +from pyre_extensions import assert_is_instance, none_throws from torch import Tensor class SurrogateRunner(BenchmarkRunner): def __init__( self, + *, name: str, - surrogate: TorchModelBridge, - datasets: List[SupervisedDataset], search_space: SearchSpace, outcome_names: List[str], + surrogate: Optional[TorchModelBridge] = None, + datasets: Optional[List[SupervisedDataset]] = None, noise_stds: Union[float, Dict[str, float]] = 0.0, + get_surrogate_and_datasets: Optional[ + Callable[[], Tuple[TorchModelBridge, List[SupervisedDataset]]] + ] = None, ) -> None: """Runner for surrogate benchmark problems. @@ -45,15 +52,42 @@ def __init__( is added to all outputs. Alternatively, a dictionary mapping outcome names to noise standard deviations can be provided to specify different noise levels for different outputs. + get_surrogate_and_datasets: Function that returns the surrogate and + datasets, to allow for lazy construction. If + `get_surrogate_and_datasets` is not provided, `surrogate` and + `datasets` must be provided, and vice versa. """ + if get_surrogate_and_datasets is None and ( + surrogate is None or datasets is None + ): + raise ValueError( + "If get_surrogate_and_datasets is provided, surrogate and " + "datasets must not be provided, and vice versa." 
+ ) + self.get_surrogate_and_datasets = get_surrogate_and_datasets self.name = name - self.surrogate = surrogate + self._surrogate = surrogate self._outcome_names = outcome_names - self.datasets = datasets + self._datasets = datasets self.search_space = search_space self.noise_stds = noise_stds self.statuses: Dict[int, TrialStatus] = {} + def set_surrogate_and_datasets(self) -> None: + self._surrogate, self._datasets = none_throws(self.get_surrogate_and_datasets)() + + @property + def surrogate(self) -> TorchModelBridge: + if self.get_surrogate_and_datasets is not None: + self.set_surrogate_and_datasets() + return none_throws(self._surrogate) + + @property + def datasets(self) -> List[SupervisedDataset]: + if self.get_surrogate_and_datasets is not None: + self.set_surrogate_and_datasets() + return none_throws(self._datasets) + @property def outcome_names(self) -> List[str]: return self._outcome_names @@ -131,3 +165,22 @@ def deserialize_init_args( class_decoder_registry: Optional[TClassDecoderRegistry] = None, ) -> Dict[str, Any]: return {} + + @property + def is_noiseless(self) -> bool: + if self.noise_stds is None: + return True + if isinstance(self.noise_stds, float): + return self.noise_stds == 0.0 + return all( + std == 0.0 for std in assert_is_instance(self.noise_stds, dict).values() + ) + + @equality_typechecker + def __eq__(self, other: Base) -> bool: + if type(other) is not type(self): + return False + + # Checking the whole datasets' equality here would be too expensive to be + # worth it; just check names instead + return self.name == other.name diff --git a/ax/benchmark/tests/problems/test_mixed_integer_problems.py b/ax/benchmark/tests/problems/test_mixed_integer_problems.py index 717beb3aabd..fa6cb400515 100644 --- a/ax/benchmark/tests/problems/test_mixed_integer_problems.py +++ b/ax/benchmark/tests/problems/test_mixed_integer_problems.py @@ -58,9 +58,6 @@ def test_problems(self) -> None: ).test_problem._bounds, expected_bounds, ) - print(f"{name=}") - print(f"{problem.optimal_value=}") - print(f"{problem_cls().optimal_value=}") self.assertGreaterEqual(problem.optimal_value, problem_cls().optimal_value) # Test that they match correctly to the original problems. 
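As a usage sketch for the new factory-based API that the registry entries above now call directly (not part of the patch itself; the test-problem classes and keyword values below are illustrative, chosen to match values that appear elsewhere in this diff):

from ax.benchmark.benchmark_problem import (
    create_multi_objective_problem_from_botorch,
    create_single_objective_problem_from_botorch,
)
from botorch.test_functions.multi_objective import BraninCurrin
from botorch.test_functions.synthetic import Hartmann

# Replaces SingleObjectiveBenchmarkProblem.from_botorch_synthetic(...);
# returns the plain BenchmarkProblem dataclass.
hartmann6 = create_single_objective_problem_from_botorch(
    test_problem_class=Hartmann,
    test_problem_kwargs={"dim": 6},
    lower_is_better=True,
    num_trials=50,
    observe_noise_sd=False,
)
# Synthetic problems get ground truth, and no noise_std means noiseless.
assert hartmann6.has_ground_truth and hartmann6.is_noiseless

# Replaces MultiObjectiveBenchmarkProblem.from_botorch_multi_objective(...);
# still returns a MultiObjectiveBenchmarkProblem, now built via this function.
branin_currin = create_multi_objective_problem_from_botorch(
    test_problem_class=BraninCurrin,
    test_problem_kwargs={},
    num_trials=30,
    observe_noise_sd=False,
)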
diff --git a/ax/benchmark/tests/problems/test_surrogate_problems.py b/ax/benchmark/tests/problems/test_surrogate_problems.py index 6d54784e0e0..febdb0332ac 100644 --- a/ax/benchmark/tests/problems/test_surrogate_problems.py +++ b/ax/benchmark/tests/problems/test_surrogate_problems.py @@ -8,53 +8,40 @@ import numpy as np from ax.benchmark.benchmark import compute_score_trace -from ax.benchmark.benchmark_problem import BenchmarkProblemProtocol -from ax.core.runner import Runner +from ax.benchmark.benchmark_problem import BenchmarkProblem from ax.utils.common.testutils import TestCase from ax.utils.testing.benchmark_stubs import get_moo_surrogate, get_soo_surrogate +from pyre_extensions import assert_is_instance class TestSurrogateProblems(TestCase): - def test_conforms_to_protocol(self) -> None: + def setUp(self) -> None: + super().setUp() + # print max output so errors in 'repr' can be fully shown + self.maxDiff = None + + def test_conforms_to_api(self) -> None: sbp = get_soo_surrogate() - self.assertIsInstance(sbp, BenchmarkProblemProtocol) + self.assertIsInstance(sbp, BenchmarkProblem) mbp = get_moo_surrogate() - self.assertIsInstance(mbp, BenchmarkProblemProtocol) + self.assertIsInstance(mbp, BenchmarkProblem) - def test_lazy_instantiation(self) -> None: + def test_repr(self) -> None: - # test instantiation from init sbp = get_soo_surrogate() - # test __repr__ method expected_repr = ( - "SOOSurrogateBenchmarkProblem(name=test, " + "BenchmarkProblem(name='test', " "optimization_config=OptimizationConfig(objective=Objective(metric_name=" '"branin", ' "minimize=False), " - "outcome_constraints=[]), num_trials=6, is_noiseless=True, " - "observe_noise_stds=True, noise_stds=0.0, tracking_metrics=[])" + "outcome_constraints=[]), num_trials=6, " + "observe_noise_stds=True, has_ground_truth=True, " + "tracking_metrics=[], optimal_value=0.0, is_noiseless=True)" ) self.assertEqual(repr(sbp), expected_repr) - self.assertIsNone(sbp._runner) - # sets runner - self.assertIsInstance(sbp.runner, Runner) - - self.assertIsNotNone(sbp._runner) - self.assertIsNotNone(sbp.runner) - - # repeat for MOO - sbp = get_moo_surrogate() - - self.assertIsNone(sbp._runner) - # sets runner - self.assertIsInstance(sbp.runner, Runner) - - self.assertIsNotNone(sbp._runner) - self.assertIsNotNone(sbp.runner) - def test_compute_score_trace(self) -> None: soo_problem = get_soo_surrogate() score_trace = compute_score_trace( diff --git a/ax/benchmark/tests/runners/test_surrogate_runner.py b/ax/benchmark/tests/runners/test_surrogate_runner.py index 0fdf4e65154..b9eb7681bbc 100644 --- a/ax/benchmark/tests/runners/test_surrogate_runner.py +++ b/ax/benchmark/tests/runners/test_surrogate_runner.py @@ -8,10 +8,12 @@ from unittest.mock import MagicMock import torch -from ax.benchmark.problems.surrogate import SurrogateRunner +from ax.benchmark.runners.surrogate import SurrogateRunner from ax.core.parameter import ParameterType, RangeParameter from ax.core.search_space import SearchSpace +from ax.modelbridge.torch import TorchModelBridge from ax.utils.common.testutils import TestCase +from ax.utils.testing.benchmark_stubs import get_soo_surrogate class TestSurrogateRunner(TestCase): @@ -43,3 +45,14 @@ def test_surrogate_runner(self) -> None: self.assertIs(runner.surrogate, surrogate) self.assertEqual(runner.outcome_names, ["dummy_metric"]) self.assertEqual(runner.noise_stds, noise_std) + + def test_lazy_instantiation(self) -> None: + problem = get_soo_surrogate() + + self.assertIsNone(problem.runner._surrogate) + 
self.assertIsNone(problem.runner._datasets) + + # sets datasets and surrogate + self.assertIsInstance(problem.runner.surrogate, TorchModelBridge) + self.assertIsNotNone(problem.runner._surrogate) + self.assertIsNotNone(problem.runner._datasets) diff --git a/ax/benchmark/tests/test_benchmark.py b/ax/benchmark/tests/test_benchmark.py index dc24816effb..3d0ae2eeda3 100644 --- a/ax/benchmark/tests/test_benchmark.py +++ b/ax/benchmark/tests/test_benchmark.py @@ -21,12 +21,13 @@ BenchmarkMethod, get_benchmark_scheduler_options, ) -from ax.benchmark.benchmark_problem import SingleObjectiveBenchmarkProblem +from ax.benchmark.benchmark_problem import create_single_objective_problem_from_botorch from ax.benchmark.benchmark_result import BenchmarkResult from ax.benchmark.methods.modular_botorch import get_sobol_botorch_modular_acquisition from ax.benchmark.metrics.base import GroundTruthMetricMixin from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric from ax.benchmark.problems.registry import get_problem +from ax.core.optimization_config import MultiObjectiveOptimizationConfig from ax.modelbridge.generation_strategy import GenerationNode, GenerationStrategy from ax.modelbridge.model_spec import ModelSpec from ax.modelbridge.registry import Models @@ -36,7 +37,6 @@ from ax.utils.common.testutils import TestCase from ax.utils.common.typeutils import checked_cast, not_none from ax.utils.testing.benchmark_stubs import ( - get_constrained_multi_objective_benchmark_problem, get_moo_surrogate, get_multi_objective_benchmark_problem, get_single_objective_benchmark_problem, @@ -162,9 +162,10 @@ def test_make_ground_truth_optimization_config(self) -> None: gt_opt_cfg = make_ground_truth_optimization_config(experiment) self.assertIs(gt_opt_cfg.objective.metric, gt_metric) - # Test behavior with MOO problem and outcome constraints - problem = get_constrained_multi_objective_benchmark_problem( - observe_noise_sd=False + # Test behavior with MOO problem + problem = get_multi_objective_benchmark_problem(observe_noise_sd=False) + self.assertIsInstance( + problem.optimization_config, MultiObjectiveOptimizationConfig ) experiment = _create_benchmark_experiment( problem=problem, method_name="test_method" @@ -300,7 +301,6 @@ def test_replication_sobol_surrogate(self) -> None: ("moo", get_moo_surrogate()), ]: with self.subTest(name, problem=problem): - surrogate, datasets = not_none(problem.get_surrogate_and_datasets)() res = benchmark_replication(problem=problem, method=method, seed=0) self.assertEqual( @@ -439,7 +439,7 @@ def test_benchmark_multiple_problems_methods(self) -> None: self.assertTrue((agg.score_trace[col] <= 100).all()) def test_timeout(self) -> None: - problem = SingleObjectiveBenchmarkProblem.from_botorch_synthetic( + problem = create_single_objective_problem_from_botorch( test_problem_class=Branin, test_problem_kwargs={}, lower_is_better=True, diff --git a/ax/benchmark/tests/test_benchmark_problem.py b/ax/benchmark/tests/test_benchmark_problem.py index b6de743528c..ccdc7faddde 100644 --- a/ax/benchmark/tests/test_benchmark_problem.py +++ b/ax/benchmark/tests/test_benchmark_problem.py @@ -8,15 +8,15 @@ from typing import List, Optional, Union from ax.benchmark.benchmark_problem import ( - MultiObjectiveBenchmarkProblem, - SingleObjectiveBenchmarkProblem, + create_multi_objective_problem_from_botorch, + create_single_objective_problem_from_botorch, ) from ax.benchmark.metrics.benchmark import BenchmarkMetric from ax.benchmark.runners.botorch_test import 
BotorchTestProblemRunner
 from ax.core.types import ComparisonOp
 from ax.utils.common.testutils import TestCase
 from ax.utils.common.typeutils import checked_cast
-from botorch.test_functions.multi_objective import BraninCurrin
+from botorch.test_functions.multi_objective import BraninCurrin, ConstrainedBraninCurrin
 from botorch.test_functions.synthetic import (
     Ackley,
     ConstrainedGramacy,
@@ -27,9 +27,14 @@


 class TestBenchmarkProblem(TestCase):
+    def setUp(self) -> None:
+        # Print full output, so that any differences in 'repr' output are shown
+        self.maxDiff = None
+        super().setUp()
+
     def test_single_objective_from_botorch(self) -> None:
         for botorch_test_problem in [Ackley(), ConstrainedHartmann(dim=6)]:
-            test_problem = SingleObjectiveBenchmarkProblem.from_botorch_synthetic(
+            test_problem = create_single_objective_problem_from_botorch(
                 test_problem_class=botorch_test_problem.__class__,
                 test_problem_kwargs={},
                 lower_is_better=True,
@@ -77,15 +82,16 @@ def test_single_objective_from_botorch(self) -> None:
                     test_problem.optimization_config.outcome_constraints, []
                 )
                 expected_repr = (
-                    "SingleObjectiveBenchmarkProblem(name=Ackley, "
+                    "SingleObjectiveBenchmarkProblem(name='Ackley', "
                     "optimization_config=OptimizationConfig(objective=Objective("
                     'metric_name="Ackley", '
                     "minimize=True), outcome_constraints=[]), "
                     "num_trials=1, "
-                    "is_noiseless=True, "
                     "observe_noise_stds=False, "
                     "has_ground_truth=True, "
-                    "tracking_metrics=[])"
+                    "tracking_metrics=[], "
+                    "optimal_value=0.0, "
+                    "is_noiseless=True)"
                 )
             else:
                 outcome_constraint = (
@@ -96,16 +102,17 @@ def test_single_objective_from_botorch(self) -> None:
                 self.assertFalse(outcome_constraint.relative)
                 self.assertEqual(outcome_constraint.bound, 0.0)
                 expected_repr = (
-                    "SingleObjectiveBenchmarkProblem(name=ConstrainedHartmann, "
+                    "SingleObjectiveBenchmarkProblem(name='ConstrainedHartmann', "
                     "optimization_config=OptimizationConfig(objective=Objective("
                     'metric_name="ConstrainedHartmann", minimize=True), '
                     "outcome_constraints=[OutcomeConstraint(constraint_slack_0"
                     " >= 0.0)]), "
                     "num_trials=1, "
-                    "is_noiseless=True, "
                     "observe_noise_stds=False, "
                     "has_ground_truth=True, "
-                    "tracking_metrics=[])"
+                    "tracking_metrics=[], "
+                    "optimal_value=-3.32237, "
+                    "is_noiseless=True)"
                 )

             self.assertEqual(repr(test_problem), expected_repr)
@@ -124,7 +131,7 @@ def test_constrained_from_botorch(
         objective_noise_std: Optional[float],
         constraint_noise_std: Optional[Union[float, List[float]]],
     ) -> None:
-        ax_problem = SingleObjectiveBenchmarkProblem.from_botorch_synthetic(
+        ax_problem = create_single_objective_problem_from_botorch(
            test_problem_class=ConstrainedGramacy,
            test_problem_kwargs={
                "noise_std": objective_noise_std,
@@ -160,12 +167,10 @@ def test_constrained_from_botorch(

     def test_moo_from_botorch(self) -> None:
         test_problem = BraninCurrin()
-        branin_currin_problem = (
-            MultiObjectiveBenchmarkProblem.from_botorch_multi_objective(
-                test_problem_class=test_problem.__class__,
-                test_problem_kwargs={},
-                num_trials=1,
-            )
+        branin_currin_problem = create_multi_objective_problem_from_botorch(
+            test_problem_class=test_problem.__class__,
+            test_problem_kwargs={},
+            num_trials=1,
         )

         # Test search space
@@ -197,8 +202,19 @@ def test_moo_from_botorch(self) -> None:
         self.assertEqual(branin_currin_problem.optimal_value, test_problem._max_hv)
         self.assertEqual(branin_currin_problem.reference_point, test_problem._ref_point)

+    def test_moo_from_botorch_constrained(self) -> None:
+        with self.assertRaisesRegex(
+            NotImplementedError,
+            "Constrained multi-objective problems are not supported.",
+        ):
+            create_multi_objective_problem_from_botorch(
+                test_problem_class=ConstrainedBraninCurrin,
+                test_problem_kwargs={},
+                num_trials=1,
+            )
+
     def test_maximization_problem(self) -> None:
-        test_problem = SingleObjectiveBenchmarkProblem.from_botorch_synthetic(
+        test_problem = create_single_objective_problem_from_botorch(
             test_problem_class=Cosine8,
             lower_is_better=False,
             num_trials=1,
diff --git a/ax/storage/json_store/encoders.py b/ax/storage/json_store/encoders.py
index e22f641427d..9fac153a7d8 100644
--- a/ax/storage/json_store/encoders.py
+++ b/ax/storage/json_store/encoders.py
@@ -14,7 +14,6 @@
 from ax.benchmark.benchmark_problem import (
     BenchmarkProblem,
     MultiObjectiveBenchmarkProblem,
-    SingleObjectiveBenchmarkProblem,
 )
 from ax.benchmark.problems.hpo.torchvision import PyTorchCNNTorchvisionBenchmarkProblem
 from ax.core import ObservationFeatures
@@ -170,24 +169,6 @@
     }


-def single_objective_benchmark_problem_to_dict(
-    soo_benchmark_problem: SingleObjectiveBenchmarkProblem,
-) -> Dict[str, Any]:
-    return {
-        "__type": soo_benchmark_problem.__class__.__name__,
-        "name": soo_benchmark_problem.name,
-        "search_space": soo_benchmark_problem.search_space,
-        "optimization_config": soo_benchmark_problem.optimization_config,
-        "runner": soo_benchmark_problem.runner,
-        "num_trials": soo_benchmark_problem.num_trials,
-        "is_noiseless": soo_benchmark_problem.is_noiseless,
-        "observe_noise_stds": soo_benchmark_problem.observe_noise_stds,
-        "has_ground_truth": soo_benchmark_problem.has_ground_truth,
-        "tracking_metrics": soo_benchmark_problem.tracking_metrics,
-        "optimal_value": soo_benchmark_problem.optimal_value,
-    }
-
-
 def trial_to_dict(trial: Trial) -> Dict[str, Any]:
     """Convert Ax trial to a dictionary."""
     return {
diff --git a/ax/storage/json_store/registry.py b/ax/storage/json_store/registry.py
index 9813e14fd7d..393cddb8e32 100644
--- a/ax/storage/json_store/registry.py
+++ b/ax/storage/json_store/registry.py
@@ -14,7 +14,6 @@
 from ax.benchmark.benchmark_problem import (
     BenchmarkProblem,
     MultiObjectiveBenchmarkProblem,
-    SingleObjectiveBenchmarkProblem,
 )
 from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult
 from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric
@@ -157,7 +156,6 @@
     runner_to_dict,
     scalarized_objective_to_dict,
     search_space_to_dict,
-    single_objective_benchmark_problem_to_dict,
     sum_parameter_constraint_to_dict,
     surrogate_to_dict,
     threshold_early_stopping_strategy_to_dict,
@@ -256,7 +254,6 @@
     ScalarizedObjective: scalarized_objective_to_dict,
     SearchSpace: search_space_to_dict,
     SingleDiagnosticBestModelSelector: best_model_selector_to_dict,
-    SingleObjectiveBenchmarkProblem: single_objective_benchmark_problem_to_dict,
     HierarchicalSearchSpace: search_space_to_dict,
     SumConstraint: sum_parameter_constraint_to_dict,
     Surrogate: surrogate_to_dict,
@@ -382,7 +379,6 @@
     "SchedulerOptions": SchedulerOptions,
     "SearchSpace": SearchSpace,
     "SingleDiagnosticBestModelSelector": SingleDiagnosticBestModelSelector,
-    "SingleObjectiveBenchmarkProblem": SingleObjectiveBenchmarkProblem,
     "SklearnDataset": SklearnDataset,
     "SklearnMetric": SklearnMetric,
     "SklearnModelType": SklearnModelType,
diff --git a/ax/storage/json_store/tests/test_json_store.py b/ax/storage/json_store/tests/test_json_store.py
index 3540cd3bcf3..ff56180c988 100644
--- a/ax/storage/json_store/tests/test_json_store.py
+++ b/ax/storage/json_store/tests/test_json_store.py
@@ -46,7 +46,6 @@
 from ax.utils.common.testutils import TestCase
 from ax.utils.testing.benchmark_stubs import (
     get_aggregated_benchmark_result,
-    get_benchmark_problem,
     get_benchmark_result,
     get_multi_objective_benchmark_problem,
     get_single_objective_benchmark_problem,
@@ -142,7 +141,7 @@
     ("AugmentedHartmannMetric", get_augmented_hartmann_metric),
     ("BatchTrial", get_batch_trial),
     ("BenchmarkMethod", get_sobol_gpei_benchmark_method),
-    ("BenchmarkProblem", get_benchmark_problem),
+    ("BenchmarkProblem", get_single_objective_benchmark_problem),
     ("BenchmarkResult", get_benchmark_result),
     ("BoTorchModel", get_botorch_model),
     ("BoTorchModel", get_botorch_model_with_default_acquisition_class),
diff --git a/ax/utils/testing/benchmark_stubs.py b/ax/utils/testing/benchmark_stubs.py
index 170334b0258..890b68723a3 100644
--- a/ax/utils/testing/benchmark_stubs.py
+++ b/ax/utils/testing/benchmark_stubs.py
@@ -12,14 +12,12 @@
 from ax.benchmark.benchmark_method import BenchmarkMethod
 from ax.benchmark.benchmark_problem import (
     BenchmarkProblem,
+    create_multi_objective_problem_from_botorch,
+    create_single_objective_problem_from_botorch,
     MultiObjectiveBenchmarkProblem,
-    SingleObjectiveBenchmarkProblem,
 )
 from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult
-from ax.benchmark.problems.surrogate import (
-    MOOSurrogateBenchmarkProblem,
-    SOOSurrogateBenchmarkProblem,
-)
+from ax.benchmark.runners.surrogate import SurrogateRunner
 from ax.core.experiment import Experiment
 from ax.core.optimization_config import (
     MultiObjectiveOptimizationConfig,
@@ -43,21 +41,12 @@
 from botorch.test_functions.synthetic import Branin


-def get_benchmark_problem() -> BenchmarkProblem:
-    return BenchmarkProblem.from_botorch(
-        test_problem_class=Branin,
-        test_problem_kwargs={},
-        lower_is_better=True,
-        num_trials=4,
-    )
-
-
 def get_single_objective_benchmark_problem(
     observe_noise_sd: bool = False,
     num_trials: int = 4,
     test_problem_kwargs: Optional[Dict[str, Any]] = None,
-) -> SingleObjectiveBenchmarkProblem:
-    return SingleObjectiveBenchmarkProblem.from_botorch_synthetic(
+) -> BenchmarkProblem:
+    return create_single_objective_problem_from_botorch(
         test_problem_class=Branin,
         test_problem_kwargs=test_problem_kwargs or {},
         lower_is_better=True,
@@ -69,7 +58,7 @@
 def get_multi_objective_benchmark_problem(
     observe_noise_sd: bool = False, num_trials: int = 4
 ) -> MultiObjectiveBenchmarkProblem:
-    return MultiObjectiveBenchmarkProblem.from_botorch_multi_objective(
+    return create_multi_objective_problem_from_botorch(
         test_problem_class=BraninCurrin,
         test_problem_kwargs={},
         num_trials=num_trials,
@@ -80,7 +69,7 @@
 def get_constrained_multi_objective_benchmark_problem(
     observe_noise_sd: bool = False, num_trials: int = 4
 ) -> MultiObjectiveBenchmarkProblem:
-    return MultiObjectiveBenchmarkProblem.from_botorch_multi_objective(
+    return create_multi_objective_problem_from_botorch(
         test_problem_class=ConstrainedBraninCurrin,
         test_problem_kwargs={},
         num_trials=num_trials,
@@ -101,7 +90,7 @@ def get_sobol_benchmark_method() -> BenchmarkMethod:
     )


-def get_soo_surrogate() -> SOOSurrogateBenchmarkProblem:
+def get_soo_surrogate() -> BenchmarkProblem:
     experiment = get_branin_experiment(with_completed_trial=True)
     surrogate = TorchModelBridge(
         experiment=experiment,
@@ -110,21 +99,27 @@ def get_soo_surrogate() -> SOOSurrogateBenchmarkProblem:
         data=experiment.lookup_data(),
         transforms=[],
     )
-    return SOOSurrogateBenchmarkProblem(
+    runner = SurrogateRunner(
+        name="test",
+        search_space=experiment.search_space,
+        outcome_names=["branin"],
+        get_surrogate_and_datasets=lambda: (surrogate, []),
+    )
+    return BenchmarkProblem(
         name="test",
         search_space=experiment.search_space,
         optimization_config=checked_cast(
             OptimizationConfig, experiment.optimization_config
         ),
         num_trials=6,
-        outcome_names=["branin"],
         observe_noise_stds=True,
-        get_surrogate_and_datasets=lambda: (surrogate, []),
         optimal_value=0.0,
+        runner=runner,
+        is_noiseless=runner.is_noiseless,
     )


-def get_moo_surrogate() -> MOOSurrogateBenchmarkProblem:
+def get_moo_surrogate() -> MultiObjectiveBenchmarkProblem:
     experiment = get_branin_experiment_with_multi_objective(with_completed_trial=True)
     surrogate = TorchModelBridge(
         experiment=experiment,
@@ -133,18 +128,25 @@ def get_moo_surrogate() -> MOOSurrogateBenchmarkProblem:
         data=experiment.lookup_data(),
         transforms=[],
     )
-    return MOOSurrogateBenchmarkProblem(
+
+    runner = SurrogateRunner(
+        name="test",
+        search_space=experiment.search_space,
+        outcome_names=["branin_a", "branin_b"],
+        get_surrogate_and_datasets=lambda: (surrogate, []),
+    )
+    return MultiObjectiveBenchmarkProblem(
         name="test",
         search_space=experiment.search_space,
         optimization_config=checked_cast(
             MultiObjectiveOptimizationConfig, experiment.optimization_config
         ),
         num_trials=10,
-        outcome_names=["branin_a", "branin_b"],
         observe_noise_stds=True,
-        get_surrogate_and_datasets=lambda: (surrogate, []),
         optimal_value=1.0,
         reference_point=[],
+        runner=runner,
+        is_noiseless=runner.is_noiseless,
     )
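Not part of the patch: a minimal usage sketch of the refactored factory API exercised by the stubs above. It assumes only the import paths, keyword arguments, and attribute names that appear in this diff (e.g. create_single_objective_problem_from_botorch, optimal_value, is_noiseless); exact signatures may differ.

# Illustrative sketch only, not included in the diff.
from ax.benchmark.benchmark_problem import (
    BenchmarkProblem,
    create_single_objective_problem_from_botorch,
)
from botorch.test_functions.synthetic import Branin

# The module-level factory replaces the removed
# SingleObjectiveBenchmarkProblem.from_botorch_synthetic constructor and
# returns a BenchmarkProblem instance directly.
problem: BenchmarkProblem = create_single_objective_problem_from_botorch(
    test_problem_class=Branin,
    test_problem_kwargs={},
    lower_is_better=True,
    num_trials=4,
)
# Fields that previously lived on the protocol/subclasses are now plain
# attributes of the returned problem, e.g.:
print(problem.name, problem.optimal_value, problem.is_noiseless)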