pydvl.valuation.methods._utility_values_and_sample_masks ¶

compute_utility_values_and_sample_masks ¶

compute_utility_values_and_sample_masks(
    utility: UtilityBase,
    sampler: IndexSampler,
    n_samples: int,
    progress: bool,
    extra_samples: Iterable[SampleT] | None = None,
) -> Tuple[NDArray[float_], NDArray[bool_]]

Calculate utility values and sample masks on samples in parallel.

Creating the utility evaluations and sample masks is the computational bottleneck of several data valuation algorithms, for example, least-core and group-testing.

PARAMETER	DESCRIPTION
`utility`	Utility object with model, data and scoring function. TYPE: `UtilityBase`
`sampler`	The sampler to use for the valuation. TYPE: `IndexSampler`
`n_samples`	The number of samples to use from the sampler. TYPE: `int`
`progress`	Whether to show a progress bar. TYPE: `bool`
`extra_samples`	Additional samples to evaluate. For example, this can be used to calculate the total utility of the dataset in parallel with evaluating the utility on the samples. Defaults to None. TYPE: `Iterable[SampleT] \| None` DEFAULT: `None`

RETURNS	DESCRIPTION
`Tuple[NDArray[float_], NDArray[bool_]]`	A tuple containing the utility values and the sample masks.

RAISES	DESCRIPTION
`ValueError`	If the utility object does not have training data.

Source code in src/pydvl/valuation/methods/_utility_values_and_sample_masks.py

def compute_utility_values_and_sample_masks(
    utility: UtilityBase,
    sampler: IndexSampler,
    n_samples: int,
    progress: bool,
    extra_samples: Iterable[SampleT] | None = None,
) -> Tuple[NDArray[np.float64], NDArray[np.bool_]]:
    """Calculate utility values and sample masks on samples in parallel.

    Creating the utility evaluations and sample masks is the computational bottleneck
    of several data valuation algorithms, for example, least-core and group-testing.

    Batches are drawn from `sampler.generate_batches()` until the sampler reports
    at least `n_samples` generated samples. Any `extra_samples` are then appended,
    grouped into batches of `sampler.batch_size`. Each batch is dispatched as one
    task to `Parallel`.

    Args:
        utility: Utility object with model, data and scoring function.
        sampler: The sampler to use for the valuation.
        n_samples: The number of samples to use from the sampler.
        progress: Whether to show a progress bar.
        extra_samples: Additional samples to evaluate. For example, this can be used
            to calculate the total utility of the dataset in parallel with evaluating
            the utility on the samples. Defaults to None.

    Returns:
        A tuple containing the utility values and the sample masks. The values are a
            1-D array with one entry per evaluated sample. The masks are a 2-D
            boolean array with one row per evaluated sample and one column per
            training index, where `True` marks the positions listed in the sample's
            subset. If no sample is evaluated, both arrays are empty and the mask
            array has shape `(0, n_obs)`.

    Raises:
        ValueError: If the utility object does not have training data.

    """
    if utility.training_data is None:
        raise ValueError("Utility object must have training data.")

    indices = utility.training_data.indices
    n_obs = len(indices)

    batch_size = sampler.batch_size
    n_batches = math.ceil(n_samples / batch_size)

    def _create_mask_and_utility_values(
        batch: Iterable[SampleT],
    ) -> tuple[List[NDArray[BoolDType]], List[float]]:
        """Convert sampled indices to boolean masks and calculate utility on each
        sample in batch."""
        masks: List[NDArray[BoolDType]] = []
        u_values: List[float] = []
        for sample in batch:
            # One boolean row per sample: True at every position in the subset.
            m = np.full(n_obs, False)
            m[sample.subset.astype(int)] = True
            masks.append(m)
            u_values.append(utility(sample))

        return masks, u_values

    # Stop pulling batches once the sampler's own counter reaches n_samples.
    generator = cast(
        BatchGenerator,
        takewhile(
            lambda _: sampler.n_samples < n_samples,
            sampler.generate_batches(indices),
        ),
    )

    if extra_samples is not None:
        generator = cast(
            BatchGenerator, chain(generator, batched(extra_samples, batch_size))
        )

    # NOTE(review): the total is only a hint for the progress bar. It does not
    # account for batches coming from `extra_samples`, and whether `- 1` matches
    # the number of sampler batches depends on when the sampler updates
    # `n_samples` -- confirm against the sampler implementation.
    generator_with_progress = cast(
        BatchGenerator,
        tqdm(
            generator,
            disable=not progress,
            total=n_batches - 1,
            position=0,
        ),
    )

    with Parallel(return_as="generator") as parallel:
        results = parallel(
            delayed(_create_mask_and_utility_values)(batch)
            for batch in generator_with_progress
        )

        masks: List[NDArray[BoolDType]] = []
        u_values: List[float] = []
        # Consume the lazy results while the parallel context is still open.
        for m, v in results:
            masks.extend(m)
            u_values.extend(v)

    if not masks:
        # Stacking an empty list raises an unrelated ValueError; return
        # well-shaped empty arrays instead.
        return np.array(u_values), np.zeros((0, n_obs), dtype=np.bool_)

    # `np.vstack` replaces `np.row_stack`, a deprecated alias since NumPy 2.0.
    return np.array(u_values), np.vstack(masks)