pydvl.value.least_core.montecarlo ¶

montecarlo_least_core ¶

montecarlo_least_core(
    u: Utility,
    n_iterations: int,
    *,
    n_jobs: int = 1,
    parallel_backend: Optional[ParallelBackend] = None,
    config: Optional[ParallelConfig] = None,
    non_negative_subsidy: bool = False,
    solver_options: Optional[dict] = None,
    progress: bool = False,
    seed: Optional[Seed] = None
) -> ValuationResult

Computes approximate Least Core values using a Monte Carlo approach.

\[ \begin{array}{lll} \text{minimize} & \displaystyle{e} & \\ \text{subject to} & \displaystyle\sum_{i\in N} x_{i} = v(N) & \\ & \displaystyle\sum_{i\in S} x_{i} + e \geq v(S) & , \forall S \in \{S_1, S_2, \dots, S_m \overset{\mathrm{iid}}{\sim} U(2^N) \} \end{array} \]

Where:

\(U(2^N)\) is the uniform distribution over the powerset of \(N\).
\(m\) is the number of subsets that will be sampled and whose utility will be computed and used to compute the data values.

PARAMETER	DESCRIPTION
`u`	Utility object with model, data, and scoring function TYPE: `Utility`
`n_iterations`	total number of iterations to use TYPE: `int`
`n_jobs`	number of jobs across which to distribute the computation TYPE: `int` DEFAULT: `1`
`parallel_backend`	Parallel backend instance to use for parallelizing computations. If `None`, use JoblibParallelBackend backend. See the Parallel Backends package for available options. TYPE: `Optional[ParallelBackend]` DEFAULT: `None`
`config`	(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc. TYPE: `Optional[ParallelConfig]` DEFAULT: `None`
`non_negative_subsidy`	If True, the least core subsidy \(e\) is constrained to be non-negative. TYPE: `bool` DEFAULT: `False`
`solver_options`	Dictionary of options that will be used to select a solver and to configure it. Refer to cvxpy's documentation for all possible options. TYPE: `Optional[dict]` DEFAULT: `None`
`progress`	If True, shows a tqdm progress bar TYPE: `bool` DEFAULT: `False`
`seed`	Either an instance of a numpy random number generator or a seed for it. TYPE: `Optional[Seed]` DEFAULT: `None`

RETURNS	DESCRIPTION
`ValuationResult`	Object with the data values and the least core value.

Changed in version 0.9.0

Deprecated config argument and added a parallel_backend argument to allow users to pass the Parallel Backend instance directly.

Source code in src/pydvl/value/least_core/montecarlo.py

@deprecated(
    target=True,
    args_mapping={"config": "config"},
    deprecated_in="0.9.0",
    remove_in="0.10.0",
)
def montecarlo_least_core(
    u: Utility,
    n_iterations: int,
    *,
    n_jobs: int = 1,
    parallel_backend: Optional[ParallelBackend] = None,
    config: Optional[ParallelConfig] = None,
    non_negative_subsidy: bool = False,
    solver_options: Optional[dict] = None,
    progress: bool = False,
    seed: Optional[Seed] = None,
) -> ValuationResult:
    r"""Computes approximate Least Core values using a Monte Carlo approach.

    $$
    \begin{array}{lll}
    \text{minimize} & \displaystyle{e} & \\
    \text{subject to} & \displaystyle\sum_{i\in N} x_{i} = v(N) & \\
    & \displaystyle\sum_{i\in S} x_{i} + e \geq v(S) & ,
    \forall S \in \{S_1, S_2, \dots, S_m \overset{\mathrm{iid}}{\sim} U(2^N) \}
    \end{array}
    $$

    Where:

    * $U(2^N)$ is the uniform distribution over the powerset of $N$.
    * $m$ is the number of subsets that will be sampled and whose utility will
      be computed and used to compute the data values.

    Args:
        u: Utility object with model, data, and scoring function
        n_iterations: total number of iterations to use
        n_jobs: number of jobs across which to distribute the computation
        parallel_backend: Parallel backend instance to use
            for parallelizing computations. If `None`,
            use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.
            See the [Parallel Backends][pydvl.parallel.backends] package
            for available options.
        config: (**DEPRECATED**) Object configuring parallel computation,
            with cluster address, number of cpus, etc.
        non_negative_subsidy: If True, the least core subsidy $e$ is constrained
            to be non-negative.
        solver_options: Dictionary of options that will be used to select a solver
            and to configure it. Refer to [cvxpy's
            documentation](https://www.cvxpy.org/tutorial/advanced/index.html#setting-solver-options)
            for all possible options.
        progress: If True, shows a tqdm progress bar
        seed: Either an instance of a numpy random number generator or a seed for it.

    Returns:
        Object with the data values and the least core value.

    !!! tip "Changed in version 0.9.0"
        Deprecated `config` argument and added a `parallel_backend`
        argument to allow users to pass the Parallel Backend instance
        directly.
    """
    problem = mclc_prepare_problem(
        u,
        n_iterations,
        n_jobs=n_jobs,
        parallel_backend=parallel_backend,
        config=config,
        progress=progress,
        seed=seed,
    )
    return lc_solve_problem(
        problem,
        u=u,
        algorithm="montecarlo_least_core",
        non_negative_subsidy=non_negative_subsidy,
        solver_options=solver_options,
    )

mclc_prepare_problem ¶

mclc_prepare_problem(
    u: Utility,
    n_iterations: int,
    *,
    n_jobs: int = 1,
    parallel_backend: Optional[ParallelBackend] = None,
    config: Optional[ParallelConfig] = None,
    progress: bool = False,
    seed: Optional[Seed] = None
) -> LeastCoreProblem

Prepares a linear problem by sampling subsets of the data. Use this to separate the problem preparation from the solving with lc_solve_problem(). Useful for parallel execution of multiple experiments.

See montecarlo_least_core for argument descriptions.

Changed in version 0.9.0

Deprecated config argument and added a parallel_backend argument to allow users to pass the Parallel Backend instance directly.

Source code in src/pydvl/value/least_core/montecarlo.py

@deprecated(
    target=True,
    args_mapping={"config": "config"},
    deprecated_in="0.9.0",
    remove_in="0.10.0",
)
def mclc_prepare_problem(
    u: Utility,
    n_iterations: int,
    *,
    n_jobs: int = 1,
    parallel_backend: Optional[ParallelBackend] = None,
    config: Optional[ParallelConfig] = None,
    progress: bool = False,
    seed: Optional[Seed] = None,
) -> LeastCoreProblem:
    """Prepares a linear problem by sampling subsets of the data. Use this to
    separate the problem preparation from the solving with
    [lc_solve_problem()][pydvl.value.least_core.common.lc_solve_problem]. Useful
    for parallel execution of multiple experiments.

    See
    [montecarlo_least_core][pydvl.value.least_core.montecarlo.montecarlo_least_core]
    for argument descriptions.

    !!! note "Changed in version 0.9.0"
        Deprecated `config` argument and added a `parallel_backend`
        argument to allow users to pass the Parallel Backend instance
        directly.
    """
    n = len(u.data)

    if n_iterations < n:
        warnings.warn(
            f"Number of iterations '{n_iterations}' is smaller the size of the dataset '{n}'. "
            f"This is not optimal because in the worst case we need at least '{n}' constraints "
            "to satisfy the individual rationality condition."
        )

    if n_iterations > 2**n:
        warnings.warn(
            f"Passed n_iterations is greater than the number subsets! "
            f"Setting it to 2^{n}",
            RuntimeWarning,
        )
        n_iterations = 2**n

    parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)

    iterations_per_job = max(
        1, n_iterations // parallel_backend.effective_n_jobs(n_jobs)
    )

    map_reduce_job: MapReduceJob["Utility", "LeastCoreProblem"] = MapReduceJob(
        inputs=u,
        map_func=_montecarlo_least_core,
        reduce_func=_reduce_func,
        map_kwargs=dict(n_iterations=iterations_per_job, progress=progress),
        n_jobs=n_jobs,
        parallel_backend=parallel_backend,
    )

    return map_reduce_job(seed=seed)