Source code for gobbli.inspect.evaluate

from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union

import altair as alt
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

from gobbli.util import (
    as_multiclass,
    as_multilabel,
    escape_line_delimited_text,
    is_multilabel,
    multilabel_to_indicator_df,
    pred_prob_to_pred_label,
    pred_prob_to_pred_multilabel,
    truncate_text,
)


@dataclass
class ClassificationError:
    """
    Describes an error in classification.  Reports the original text,
    the true label, and the predicted probability.

    Args:
      X: The original text.
      y_true: The true label(s).
      y_pred_proba: The model predicted probability for each class.
    """

    X: str
    y_true: Union[str, List[str]]
    y_pred_proba: Dict[str, float]

    @property
    def y_pred(self) -> str:
        """
        Returns:
          The class with the highest predicted probability for this observation.
        """
        return max(self.y_pred_proba, key=lambda k: self.y_pred_proba[k])

    def y_pred_multilabel(self, threshold: float = 0.5) -> List[str]:
        """
        Args:
          threshold: The predicted probability threshold for predictions.

        Returns:
          The predicted labels for this observation (predicted probability greater
          than the given threshold).
        """
        return pred_prob_to_pred_multilabel(self.y_pred_proba, threshold)


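# A minimal usage sketch (editor addition, not part of gobbli) showing how a
# ClassificationError exposes its predictions.  The text, labels, and
# probabilities below are invented for illustration.
def _example_classification_error() -> None:
    error = ClassificationError(
        X="the quick brown fox",
        y_true="animal",
        y_pred_proba={"animal": 0.35, "plant": 0.60, "mineral": 0.05},
    )
    # The single highest-probability class ("plant" here, despite the true label)
    print(error.y_pred)
    # All labels whose predicted probability exceeds the threshold
    print(error.y_pred_multilabel(threshold=0.5))

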
MetricFunc = Callable[[Sequence[str], pd.DataFrame], float]
"""
A function used to calculate some metric.  It should accept true values (y_true)
and predicted values (y_pred), each given either as a sequence of labels
(multiclass) or an indicator dataframe of shape (n_samples, n_classes)
(multilabel), and it should output a real number.
"""

DEFAULT_METRICS: Dict[str, MetricFunc] = {
    "Weighted F1 Score": lambda y_true, y_pred: f1_score(
        y_true, y_pred, average="weighted"
    ),
    "Weighted Precision Score": lambda y_true, y_pred: precision_score(
        y_true, y_pred, average="weighted"
    ),
    "Weighted Recall Score": lambda y_true, y_pred: recall_score(
        y_true, y_pred, average="weighted"
    ),
    "Accuracy": lambda y_true, y_pred: accuracy_score(y_true, y_pred),
}
"""
The default set of metrics to evaluate classification models with.  Users may
want to extend this.
"""


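# A minimal sketch (editor addition, not part of gobbli) of how DEFAULT_METRICS
# might be extended with a custom metric and passed to ClassificationEvaluation
# via its ``metric_funcs`` argument.  The metric name "Macro F1 Score" and the
# helper function name are illustrative only.
def _example_extended_metrics() -> Dict[str, MetricFunc]:
    extended_metrics = dict(DEFAULT_METRICS)
    extended_metrics["Macro F1 Score"] = lambda y_true, y_pred: f1_score(
        y_true, y_pred, average="macro"
    )
    return extended_metrics

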
@dataclass
class ClassificationEvaluation:
    """
    Provides several methods for evaluating the results from a classification problem.

    Args:
      labels: The set of unique labels in the dataset.
      X: The list of texts that were classified.
      y_true: The true labels for the dataset.
      y_pred_proba: A dataframe containing a row for each observation in X and a
        column for each label in the training data.  Cells are predicted
        probabilities.
    """

    labels: List[str]
    X: List[str]
    y_true: Union[List[str], List[List[str]]]
    y_pred_proba: pd.DataFrame
    metric_funcs: Optional[Dict[str, Callable[[Sequence, Sequence], float]]] = None

    def __post_init__(self):
        if not len(self.y_true) == self.y_pred_proba.shape[0]:
            raise ValueError(
                "y_true and y_pred_proba must have the same number of observations"
            )
        self.multilabel = is_multilabel(self.y_true)

    @property
    def y_true_multiclass(self) -> List[str]:
        return as_multiclass(self.y_true, self.multilabel)

    @property
    def y_true_multilabel(self) -> pd.DataFrame:
        return multilabel_to_indicator_df(
            as_multilabel(self.y_true, self.multilabel), self.labels
        )

    @property
    def y_pred_multiclass(self) -> List[str]:
        """
        Returns:
          Predicted class for each observation (assuming multiclass context).
        """
        return pred_prob_to_pred_label(self.y_pred_proba)

    @property
    def y_pred_multilabel(self) -> pd.DataFrame:
        """
        Returns:
          Indicator dataframe containing a 0 if each label wasn't predicted and 1
          if it was for each observation.
        """
        return pred_prob_to_pred_multilabel(self.y_pred_proba).astype("int")

    def metrics(self) -> Dict[str, float]:
        """
        Returns:
          A dictionary containing various metrics of model performance on the test
          dataset.
        """
        metric_funcs = self.metric_funcs
        if metric_funcs is None:
            metric_funcs = DEFAULT_METRICS

        if self.multilabel:
            y_true: Union[List[str], pd.DataFrame] = self.y_true_multilabel
            y_pred: Union[List[str], pd.DataFrame] = self.y_pred_multilabel
        else:
            y_true = self.y_true_multiclass
            y_pred = self.y_pred_multiclass

        return {
            name: metric_func(y_true, y_pred)
            for name, metric_func in metric_funcs.items()
        }

    def metrics_report(self) -> str:
        """
        Returns:
          A nicely formatted human-readable report describing metrics of model
          performance on the test dataset.
        """
        metric_string = "\n".join(
            f"{name}: {metric}" for name, metric in self.metrics().items()
        )

        if self.multilabel:
            y_true: Union[pd.DataFrame, List[str]] = self.y_true_multilabel
            y_pred: Union[pd.DataFrame, List[str]] = self.y_pred_multilabel
            # Since these are indicator dataframes, the "labels" are indices
            labels: Union[List[str], List[int]] = list(range(len(self.labels)))
        else:
            y_true = self.y_true_multiclass
            y_pred = self.y_pred_multiclass
            # Since these are lists of labels, the "labels" are the strings themselves
            labels = self.labels

        return (
            "Metrics:\n"
            "--------\n"
            f"{metric_string}\n\n"
            "Classification Report:\n"
            "----------------------\n"
            f"{classification_report(y_true, y_pred, labels=labels, target_names=self.labels)}\n"
        )

    def plot(self, sample_size: Optional[int] = None) -> alt.Chart:
        """
        Args:
          sample_size: Optional number of points to sample for the plot.  Unsampled
            plots may be difficult to save due to their size.

        Returns:
          An Altair chart visualizing predicted probabilities and true classes to
          visually identify where errors are being made.
        """
        # Since multilabel is a generalization of the multiclass paradigm, implement
        # this visualization the same for multiclass and multilabel using the
        # multilabel format
        pred_prob_df = self.y_pred_proba
        true_df = self.y_true_multilabel

        if sample_size is not None:
            # Avoid errors due to sample being larger than the population if the
            # number of observations is smaller than the sample size
            pred_prob_df = pred_prob_df.sample(
                n=min(sample_size, pred_prob_df.shape[0])
            )
            true_df = true_df.iloc[pred_prob_df.index]

        charts = []

        if self.multilabel:
            legend_label = "Has Label"
        else:
            legend_label = "Belongs to Class"

        for label in self.labels:
            # Plot the predicted probabilities for the given label for all observations
            plot_df = (
                pred_prob_df[[label]]
                .rename({label: "Predicted Probability"}, axis="columns")
                .join(
                    true_df[[label]]
                    .astype("bool")
                    .rename({label: legend_label}, axis="columns")
                )
            )

            charts.append(
                alt.layer(
                    alt.Chart(plot_df, title=label, height=40)
                    .mark_circle(size=8)
                    .encode(
                        x=alt.X(
                            "Predicted Probability",
                            type="quantitative",
                            title=None,
                            scale=alt.Scale(domain=(0.0, 1.0)),
                        ),
                        y=alt.Y(
                            "jitter",
                            type="quantitative",
                            title=None,
                            axis=alt.Axis(
                                values=[0], ticks=True, grid=False, labels=False
                            ),
                            scale=alt.Scale(),
                        ),
                        color=alt.Color(legend_label, type="nominal"),
                    )
                    .transform_calculate(
                        # Generate Gaussian jitter with a Box-Muller transform
                        jitter="sqrt(-2*log(random()))*cos(2*PI*random())/32"
                    )
                    .properties(height=40)
                )
            )

        return alt.vconcat(*charts)

    def errors_for_label(self, label: str, k: int = 10):
        """
        Output the biggest mistakes for the given class by the classifier.

        Args:
          label: The label to return errors for.
          k: The number of results to return for each of false positives and false
            negatives.

        Returns:
          A 2-tuple.  The first element is a list of the top ``k`` false positives,
          and the second element is a list of the top ``k`` false negatives.
        """
        pred_label = self.y_pred_multilabel[label].astype("bool")
        true_label = self.y_true_multilabel[label].astype("bool")

        # Order false positives/false negatives by the degree of the error;
        # i.e. we want the false positives with highest predicted probability first
        # and false negatives with lowest predicted probability first.
        # Take the top `k` of each.
        false_positives = (
            self.y_pred_proba.loc[pred_label & ~true_label]
            .sort_values(by=label, ascending=False)
            .iloc[:k]
        )
        false_negatives = (
            self.y_pred_proba.loc[~pred_label & true_label]
            .sort_values(by=label, ascending=True)
            .iloc[:k]
        )

        def create_classification_errors(
            y_pred_proba: pd.DataFrame,
        ) -> List[ClassificationError]:
            classification_errors = []
            for ndx, row in y_pred_proba.iterrows():
                classification_errors.append(
                    ClassificationError(
                        X=self.X[ndx],
                        y_true=self.y_true[ndx],
                        y_pred_proba=row.to_dict(),
                    )
                )
            return classification_errors

        return (
            create_classification_errors(false_positives),
            create_classification_errors(false_negatives),
        )

    def errors(
        self, k: int = 10
    ) -> Dict[str, Tuple[List[ClassificationError], List[ClassificationError]]]:
        """
        Output the biggest mistakes for each class by the classifier.

        Args:
          k: The number of results to return for each of false positives and false
            negatives.

        Returns:
          A dictionary whose keys are label names and values are 2-tuples.  The
          first element is a list of the top ``k`` false positives, and the second
          element is a list of the top ``k`` false negatives.
        """
        errors = {}
        for label in self.labels:
            errors[label] = self.errors_for_label(label, k=k)
        return errors

    def errors_report(self, k: int = 10) -> str:
        """
        Args:
          k: The number of results to return for each of false positives and false
            negatives.

        Returns:
          A nicely-formatted human-readable report describing the biggest mistakes
          made by the classifier for each class.
        """
        errors = self.errors(k=k)
        output = "Errors Report\n" "-------------\n\n"

        for label, (false_positives, false_negatives) in errors.items():

            def make_errors_str(errors: List[ClassificationError]) -> str:
                if self.multilabel:
                    return "\n".join(
                        (
                            f"Correct Value: {label in e.y_true}\n"
                            f"Predicted Probability: {e.y_pred_proba[label]}\n"
                            f"Text: {truncate_text(escape_line_delimited_text(e.X), 500)}\n"
                        )
                        for e in errors
                    )
                else:
                    return "\n".join(
                        (
                            f"True Class: {e.y_true}\n"
                            f"Predicted Class: {e.y_pred} (Probability: {e.y_pred_proba[e.y_pred]})\n"
                            f"Text: {truncate_text(escape_line_delimited_text(e.X), 500)}\n"
                        )
                        for e in errors
                    )

            false_positives_str = make_errors_str(false_positives)
            if len(false_positives_str) == 0:
                false_positives_str = "None"

            false_negatives_str = make_errors_str(false_negatives)
            if len(false_negatives_str) == 0:
                false_negatives_str = "None"

            header_name = "CLASS" if self.multilabel else "LABEL"

            output += (
                " -------\n"
                f"| {header_name}: {label}\n"
                " -------\n\n"
                "False Positives\n"
                "***************\n\n"
                f"{false_positives_str}\n\n"
                "False Negatives\n"
                "***************\n\n"
                f"{false_negatives_str}\n\n"
            )

        return output
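

# End-to-end usage sketch (editor addition, not part of gobbli).  It builds a
# ClassificationEvaluation from a small set of invented texts, labels, and
# predicted probabilities, then exercises metrics(), metrics_report(),
# errors_report(), and plot().  In real use, ``y_pred_proba`` would come from a
# trained model's prediction output, and the output filename is arbitrary.
if __name__ == "__main__":
    labels = ["negative", "positive"]
    X = ["terrible service", "loved it", "would not recommend", "great value"]
    y_true = ["negative", "positive", "negative", "positive"]
    y_pred_proba = pd.DataFrame(
        {
            "negative": [0.9, 0.2, 0.4, 0.1],
            "positive": [0.1, 0.8, 0.6, 0.9],
        }
    )

    evaluation = ClassificationEvaluation(
        labels=labels, X=X, y_true=y_true, y_pred_proba=y_pred_proba
    )

    # Dictionary mapping metric names (from DEFAULT_METRICS) to values
    print(evaluation.metrics())

    # Human-readable summary including sklearn's classification_report
    print(evaluation.metrics_report())

    # Biggest false positives/false negatives per label, formatted as text
    print(evaluation.errors_report(k=2))

    # Altair chart of predicted probabilities colored by true class; saving to
    # HTML lets it be viewed in a browser
    evaluation.plot(sample_size=1000).save("evaluation_plot.html")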