
pydvl.reporting.scores

compute_removal_score

compute_removal_score(
    u: ModelUtility,
    values: ValuationResult,
    training_data: Dataset,
    percentages: NDArray[float_] | Iterable[float],
    *,
    remove_best: bool = False,
    progress: bool = False,
) -> dict[float, float]

Fits a model and computes its score on a test set after incrementally removing a percentage of data points from the training set, based on their values.
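
A minimal usage sketch is shown below. The names `utility`, `result` and `train` are placeholders for a ModelUtility, a ValuationResult produced by some valuation method, and the training Dataset; how they are constructed depends on your setup and is not shown here.

import numpy as np

from pydvl.reporting.scores import compute_removal_score

# Placeholders (assumed to exist from an earlier valuation run):
#   utility: ModelUtility wrapping a model, test data and a scorer
#   result:  ValuationResult with one value per training point
#   train:   Dataset the values refer to
scores = compute_removal_score(
    u=utility,
    values=result,
    training_data=train,
    percentages=np.arange(0.0, 0.5, 0.05),  # remove 0%, 5%, ..., 45% of the data
    remove_best=False,  # drop the lowest-valued points first
    progress=True,
)
# scores[0.1] is the test score after removing the 10% lowest-valued points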

PARAMETER DESCRIPTION

u: Utility object with model, test data, and scoring function.
    TYPE: ModelUtility

values: Data values of the data points in the training set.
    TYPE: ValuationResult

training_data: Dataset from which to remove data points.
    TYPE: Dataset

percentages: Sequence of removal percentages. Each value must lie in the range [0.0, 1.0).
    TYPE: NDArray[float_] | Iterable[float]

remove_best: If True, removes data points in order of decreasing valuation.
    TYPE: bool    DEFAULT: False

progress: If True, displays a progress bar.
    TYPE: bool    DEFAULT: False

RETURNS DESCRIPTION

dict[float, float]: Dictionary mapping each removal percentage to the model's test score after removal.
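
The returned mapping can be plotted directly to inspect how the test score changes as more points are removed. A minimal sketch with matplotlib, using illustrative numbers rather than real results:

import matplotlib.pyplot as plt

scores = {0.0: 0.92, 0.1: 0.90, 0.2: 0.85, 0.3: 0.78}  # illustrative values only

plt.plot(list(scores.keys()), list(scores.values()), marker="o")
plt.xlabel("Fraction of training data removed")
plt.ylabel("Test score")
plt.show()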

Source code in src/pydvl/reporting/scores.py
def compute_removal_score(
    u: ModelUtility,
    values: ValuationResult,
    training_data: Dataset,
    percentages: NDArray[np.float_] | Iterable[float],
    *,
    remove_best: bool = False,
    progress: bool = False,
) -> dict[float, float]:
    """Fits a model and computes its score on a test set after incrementally removing
    a percentage of data points from the training set, based on their values.

    Args:
        u: Utility object with model, test data, and scoring function.
        values: Data values of data instances in the training set.
        training_data: Dataset from which to remove data points.
        percentages: Sequence of removal percentages.
        remove_best: If True, removes data points in order of decreasing valuation.
        progress: If True, display a progress bar.

    Returns:
        Dictionary that maps the percentages to their respective scores.
    """
    u = u.with_dataset(training_data)

    # Sanity checks
    if np.any([x >= 1.0 or x < 0.0 for x in percentages]):
        raise ValueError("All percentages should be in the range [0.0, 1.0)")

    if len(values) != len(training_data):
        raise ValueError(
            f"The number of values, {len(values)}, should be equal to the number of data points, {len(training_data)}"
        )

    scores = {}

    # We sort in descending order if we want to remove the best values
    values.sort(reverse=remove_best)

    for pct in tqdm(percentages, disable=not progress, desc="Removal Scores"):
        n_removal = int(pct * len(training_data))
        indices = values.indices[n_removal:]
        score = u(Sample(idx=None, subset=indices))
        scores[pct] = score
    return scores
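
To make the removal step explicit, here is a small standalone numpy sketch of the sorting and slicing logic, independent of pyDVL and using made-up values:

import numpy as np

data_values = np.array([0.3, -0.1, 0.7, 0.05, 0.4])  # toy values for 5 training points
order = np.argsort(data_values)  # ascending: lowest-valued points first
pct = 0.4  # remove 40% of the points
n_removal = int(pct * len(data_values))  # -> 2 points dropped
kept = np.sort(order[n_removal:])  # indices of the points that remain
print(kept)  # [0 2 4]: the three highest-valued points survive

With remove_best=True the values are sorted in descending order instead, so the same slice drops the highest-valued points first.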