Skip to content

Bias Metrics - API Reference

Auto-generated documentation for bias metric classes.

warprec.evaluation.metrics.bias.aclt.ACLT

Bases: UserAverageTopKMetric

ACLT (Average Coverage of Long-Tail items) is a metric that evaluates the extent to which a recommendation system provides recommendations from the long-tail of item popularity. The long-tail is determined based on a given popularity percentile threshold.

This metric is designed to assess recommendation diversity by counting how many of each user's top-k recommendations belong to the long tail (the count is not divided by k; see APLT for the normalized proportion). A higher ACLT value indicates a system that effectively recommends less popular items.

Attributes:

Name Type Description
long_tail Tensor

The lookup tensor of long tail items.

Parameters:

Name Type Description Default
k int

The cutoff for recommendations.

required
num_users int

Number of users in the training set.

required
item_interactions Tensor

The counts for item interactions in training set.

required
pop_ratio float

The percentile considered popular.

0.8
dist_sync_on_step bool

Torchmetrics parameter.

False
**kwargs Any

The keyword argument dictionary.

{}
Source code in warprec/evaluation/metrics/bias/aclt.py
@metric_registry.register("ACLT")
class ACLT(UserAverageTopKMetric):
    """Average Coverage of Long-Tail items (ACLT).

    Measures how far the top-k recommendations reach into the long tail of
    item popularity. The long tail is derived from the training interaction
    counts and a popularity percentile threshold.

    The per-user score produced by :meth:`compute_scores` is the raw number of
    long-tail items among the user's top-k recommendations (it is not divided
    by ``k``). A higher value means more less-popular items are recommended.

    Attributes:
        long_tail (Tensor): The lookup tensor of long tail items.

    Args:
        k (int): The cutoff for recommendations.
        num_users (int): Number of users in the training set.
        item_interactions (Tensor): The counts for item interactions in training set.
        pop_ratio (float): The percentile considered popular.
        dist_sync_on_step (bool): Torchmetrics parameter.
        **kwargs (Any): Extra keyword arguments, forwarded to the parent class.
    """

    _REQUIRED_COMPONENTS: Set[MetricBlock] = {
        MetricBlock.BINARY_RELEVANCE,
        MetricBlock.VALID_USERS,
        MetricBlock.TOP_K_INDICES,
    }

    long_tail: Tensor

    def __init__(
        self,
        k: int,
        num_users: int,
        item_interactions: Tensor,
        pop_ratio: float = 0.8,
        dist_sync_on_step: bool = False,
        **kwargs: Any,
    ):
        super().__init__(
            k=k, num_users=num_users, dist_sync_on_step=dist_sync_on_step, **kwargs
        )
        self.pop_ratio = pop_ratio

        # Only the long-tail half of the head/tail split is needed here.
        _head_items, tail_items = self.compute_head_tail(
            item_interactions, self.pop_ratio
        )
        self.register_buffer("long_tail", tail_items)

    def compute_scores(
        self, preds: Tensor, target: Tensor, top_k_rel: Tensor, **kwargs: Any
    ) -> Tensor:
        """Return, for each row, the number of top-k items found in the long tail."""
        # The top-k block is delivered through kwargs under a k-specific key.
        local_top_k = kwargs.get(f"top_{self.k}_indices")
        candidate_items = kwargs.get("item_indices")

        # Translate the top-k indices to global item ids before the lookup.
        global_top_k = self.remap_indices(local_top_k, candidate_items)

        # Membership mask summed along the k dimension -> one count per row.
        tail_mask = torch.isin(global_top_k, self.long_tail)
        return tail_mask.sum(dim=1).float()

    @property
    def name(self):
        """The name of the metric."""
        # A non-default popularity ratio is made visible in the reported name.
        if self.pop_ratio != 0.8:
            return f"ACLT[Pop{int(self.pop_ratio * 100)}%]"
        return self.__class__.__name__

name property

The name of the metric.

warprec.evaluation.metrics.bias.aplt.APLT

Bases: UserAverageTopKMetric

APLT (Average Proportion of Long-Tail items) is a metric that evaluates the proportion of long-tail items present in the top-k recommendations. Unlike ACLT, which focuses on the number of long-tail recommendations, APLT normalizes by the total number of recommended items, providing a proportional measure.

This metric helps analyze how well a recommendation system balances diversity by incorporating less popular items into recommendations while maintaining relevance.

Attributes:

Name Type Description
long_tail Tensor

The lookup tensor of long tail items.

Parameters:

Name Type Description Default
k int

The cutoff for recommendations.

required
num_users int

Number of users in the training set.

required
item_interactions Tensor

The counts for item interactions in training set.

required
pop_ratio float

The percentile considered popular.

0.8
dist_sync_on_step bool

Torchmetrics parameter.

False
**kwargs Any

The keyword argument dictionary.

{}
Source code in warprec/evaluation/metrics/bias/aplt.py
@metric_registry.register("APLT")
class APLT(UserAverageTopKMetric):
    """Average Proportion of Long-Tail items (APLT).

    Evaluates the share of long-tail items inside the top-k recommendations.
    Unlike ACLT, which reports the number of long-tail recommendations, APLT
    divides that number by ``k``, giving a proportional measure.

    The metric helps analyze how well a recommender balances diversity by
    surfacing less popular items.

    Attributes:
        long_tail (Tensor): The lookup tensor of long tail items.

    Args:
        k (int): The cutoff for recommendations.
        num_users (int): Number of users in the training set.
        item_interactions (Tensor): The counts for item interactions in training set.
        pop_ratio (float): The percentile considered popular.
        dist_sync_on_step (bool): Torchmetrics parameter.
        **kwargs (Any): The keyword argument dictionary (accepted but not
            forwarded to the parent class).
    """

    _REQUIRED_COMPONENTS: Set[MetricBlock] = {
        MetricBlock.BINARY_RELEVANCE,
        MetricBlock.VALID_USERS,
        MetricBlock.TOP_K_INDICES,
    }

    long_tail: Tensor

    def __init__(
        self,
        k: int,
        num_users: int,
        item_interactions: Tensor,
        pop_ratio: float = 0.8,
        dist_sync_on_step: bool = False,
        **kwargs: Any,
    ):
        super().__init__(k=k, num_users=num_users, dist_sync_on_step=dist_sync_on_step)
        self.pop_ratio = pop_ratio

        # Only the long-tail half of the head/tail split is needed here.
        _head_items, tail_items = self.compute_head_tail(
            item_interactions, self.pop_ratio
        )
        self.register_buffer("long_tail", tail_items)

    def compute_scores(
        self, preds: Tensor, target: Tensor, top_k_rel: Tensor, **kwargs: Any
    ) -> Tensor:
        """Return, for each row, the fraction of top-k items found in the long tail."""
        # The top-k block is delivered through kwargs under a k-specific key.
        local_top_k = kwargs.get(f"top_{self.k}_indices")
        candidate_items = kwargs.get("item_indices")

        # Translate the top-k indices to global item ids before the lookup.
        global_top_k = self.remap_indices(local_top_k, candidate_items)

        # Long-tail hits per row, normalized by the cutoff k.
        tail_hits = torch.isin(global_top_k, self.long_tail).sum(dim=1)
        return tail_hits.float() / self.k

    @property
    def name(self):
        """The name of the metric."""
        # A non-default popularity ratio is made visible in the reported name.
        if self.pop_ratio != 0.8:
            return f"APLT[Pop{int(self.pop_ratio * 100)}%]"
        return self.__class__.__name__

name property

The name of the metric.

warprec.evaluation.metrics.bias.arp.ARP

Bases: UserAverageTopKMetric

ARP (Average Recommendation Popularity) is a metric that evaluates the average popularity of the top-k recommendations.

Attributes:

Name Type Description
pop Tensor

The lookup tensor of item popularity.

Parameters:

Name Type Description Default
k int

The cutoff for recommendations.

required
num_users int

Number of users in the training set.

required
item_interactions Tensor

The counts for item interactions in training set.

required
dist_sync_on_step bool

Torchmetrics parameter.

False
**kwargs Any

The keyword argument dictionary.

{}
Source code in warprec/evaluation/metrics/bias/arp.py
@metric_registry.register("ARP")
class ARP(UserAverageTopKMetric):
    """Average Recommendation Popularity (ARP).

    Evaluates the average popularity of the items placed in the top-k
    recommendations.

    Attributes:
        pop (Tensor): The lookup tensor of item popularity.

    Args:
        k (int): The cutoff for recommendations.
        num_users (int): Number of users in the training set.
        item_interactions (Tensor): The counts for item interactions in training set.
        dist_sync_on_step (bool): Torchmetrics parameter.
        **kwargs (Any): The keyword argument dictionary (accepted but not
            forwarded to the parent class).
    """

    _REQUIRED_COMPONENTS: Set[MetricBlock] = {
        MetricBlock.BINARY_RELEVANCE,
        MetricBlock.VALID_USERS,
        MetricBlock.TOP_K_INDICES,
    }

    pop: Tensor

    def __init__(
        self,
        k: int,
        num_users: int,
        item_interactions: Tensor,
        dist_sync_on_step: bool = False,
        **kwargs: Any,
    ):
        super().__init__(k=k, num_users=num_users, dist_sync_on_step=dist_sync_on_step)

        # Popularity lookup is derived once from the training interaction counts.
        popularity = self.compute_popularity(item_interactions)
        self.register_buffer("pop", popularity)

    def compute_scores(
        self, preds: Tensor, target: Tensor, top_k_rel: Tensor, **kwargs: Any
    ) -> Tensor:
        """Return, for each row, the mean popularity of its top-k items."""
        # The top-k block is delivered through kwargs under a k-specific key.
        local_top_k = kwargs.get(f"top_{self.k}_indices")
        candidate_items = kwargs.get("item_indices")

        # Translate the top-k indices to global item ids before the lookup.
        global_top_k = self.remap_indices(local_top_k, candidate_items)

        # Gather popularity per recommended item, then average along dim 1.
        top_k_popularity = self.pop[global_top_k]
        return top_k_popularity.mean(dim=1).float()

warprec.evaluation.metrics.bias.pop_reo.PopREO

Bases: TopKMetric

Popularity-based Ranking-based Equal Opportunity (PopREO) metric.

This metric evaluates the fairness of a recommender system by comparing the proportion of recommended items from the short head (most popular items) and long tail (less popular items) to their respective proportions in the ground truth. It calculates the standard deviation of these proportions divided by their mean, providing a measure of how equally the system recommends items across different popularity groups.

Attributes:

Name Type Description
short_head Tensor

The lookup tensor of short head items.

long_tail Tensor

The lookup tensor of long tail items.

short_recs Tensor

The short head recommendations.

long_recs Tensor

The long tail recommendations.

short_gt Tensor

The short head items in the target.

long_gt Tensor

The long tail items in the target.

Parameters:

Name Type Description Default
k int

The cutoff for recommendations.

required
item_interactions Tensor

The counts for item interactions in training set.

required
pop_ratio float

The percentile considered popular.

0.8
dist_sync_on_step bool

Torchmetrics parameter.

False
**kwargs Any

The keyword argument dictionary.

{}
Source code in warprec/evaluation/metrics/bias/pop_reo.py
@metric_registry.register("PopREO")
class PopREO(TopKMetric):
    """Popularity-based Ranking-based Equal Opportunity (PopREO) metric.

    This metric evaluates the fairness of a recommender system by comparing the
    number of recommended items from the short head (most popular items) and
    long tail (less popular items) to the number of ground-truth items in the
    same group. The final value is the population standard deviation of the two
    per-group ratios divided by their mean.

    Attributes:
        short_head (Tensor): The lookup tensor of short head items.
        long_tail (Tensor): The lookup tensor of long tail items.
        short_recs (Tensor): The short head recommendations.
        long_recs (Tensor): The long tail recommendations.
        short_gt (Tensor): The short head items in the target.
        long_gt (Tensor): The long tail items in the target.

    Args:
        k (int): The cutoff for recommendations.
        item_interactions (Tensor): The counts for item interactions in training set.
        pop_ratio (float): The percentile considered popular.
        dist_sync_on_step (bool): Torchmetrics parameter.
        **kwargs (Any): The keyword argument dictionary (accepted but not
            forwarded to the parent class).
    """

    _REQUIRED_COMPONENTS: Set[MetricBlock] = {
        MetricBlock.BINARY_RELEVANCE,
        MetricBlock.TOP_K_INDICES,
    }

    short_head: Tensor
    long_tail: Tensor
    short_recs: Tensor
    long_recs: Tensor
    short_gt: Tensor
    long_gt: Tensor

    def __init__(
        self,
        k: int,
        item_interactions: Tensor,
        pop_ratio: float = 0.8,
        dist_sync_on_step: bool = False,
        **kwargs: Any,
    ):
        super().__init__(k, dist_sync_on_step)
        self.pop_ratio = pop_ratio

        # Running counters, summed across processes in distributed runs.
        self.add_state("short_recs", default=torch.tensor(0.0), dist_reduce_fx="sum")
        self.add_state("long_recs", default=torch.tensor(0.0), dist_reduce_fx="sum")
        self.add_state("short_gt", default=torch.tensor(0.0), dist_reduce_fx="sum")
        self.add_state("long_gt", default=torch.tensor(0.0), dist_reduce_fx="sum")

        # Add short head and long tail items as buffer
        sh, lt = self.compute_head_tail(item_interactions, self.pop_ratio)
        self.register_buffer("short_head", sh)
        self.register_buffer("long_tail", lt)

    def update(self, preds: Tensor, **kwargs: Any):
        """Accumulate per-group recommendation and ground-truth counts for a batch.

        Args:
            preds (Tensor): The model predictions (not read by this method).
            **kwargs (Any): Must provide ``binary_relevance`` and
                ``top_{k}_indices``; ``item_indices`` is optional.
        """
        target = kwargs.get("binary_relevance")
        top_k_indices = kwargs.get(f"top_{self.k}_indices")
        item_indices = kwargs.get("item_indices")

        # Remap top_k_indices to global
        top_k_indices = self.remap_indices(top_k_indices, item_indices)

        # Extract positive item indices from target
        if item_indices is not None:
            # Target columns are positions into item_indices, so translate
            # them to the item ids stored there.
            rows, cols = target.nonzero(as_tuple=True)
            positive_indices = item_indices[rows, cols]
        else:  # Full evaluation: the column index is used as the item id
            _, positive_indices = target.nonzero(as_tuple=True)

        # Accumulate short head and long tail recommendations
        self.short_recs += torch.isin(top_k_indices, self.short_head).sum().float()
        self.long_recs += torch.isin(top_k_indices, self.long_tail).sum().float()
        self.short_gt += torch.isin(positive_indices, self.short_head).sum().float()
        self.long_gt += torch.isin(positive_indices, self.long_tail).sum().float()

    def compute(self):
        """Computes the final metric value.

        Returns:
            dict: A single-entry mapping ``{self.name: value}`` where ``value``
            is a Python float. The value is ``0.0`` when either group has no
            ground-truth items or when both per-group ratios are zero.
        """
        # A group without ground-truth items makes its ratio undefined.
        # Always return the same {name: float} shape as the regular path.
        if self.short_gt == 0 or self.long_gt == 0:
            return {self.name: 0.0}

        pr_short = self.short_recs / self.short_gt
        pr_long = self.long_recs / self.long_gt
        pr = torch.stack([pr_short, pr_long])

        # Nothing recommended in either group: avoid 0 / 0 -> NaN.
        mean_pr = torch.mean(pr)
        if mean_pr == 0:
            return {self.name: 0.0}

        # std(unbiased=False) matches numpy std by default for population std
        pop_reo = (torch.std(pr, unbiased=False) / mean_pr).item()
        return {self.name: pop_reo}

    @property
    def name(self):
        """The name of the metric."""
        if self.pop_ratio == 0.8:
            return self.__class__.__name__
        return f"PopREO[Pop{int(self.pop_ratio * 100)}%]"

name property

The name of the metric.

warprec.evaluation.metrics.bias.pop_rsp.PopRSP

Bases: TopKMetric

Popularity-based Ranking-based Statistical Parity (PopRSP) metric.

This metric evaluates the disparity in recommendation performance between popular (short head) and less popular (long tail) items. It calculates the standard deviation of precision across these two groups, normalized by their mean, to assess the balance in recommendation exposure.

Attributes:

Name Type Description
short_head Tensor

The lookup tensor of short head items.

long_tail Tensor

The lookup tensor of long tail items.

total_short Tensor

The total number of short head items.

total_long Tensor

The total number of long tail items.

short_recs Tensor

The short head recommendations.

long_recs Tensor

The long tail recommendations.

Parameters:

Name Type Description Default
k int

The cutoff for recommendations.

required
item_interactions Tensor

The counts for item interactions in training set.

required
pop_ratio float

The percentile considered popular.

0.8
dist_sync_on_step bool

Torchmetrics parameter.

False
**kwargs Any

The keyword argument dictionary.

{}
Source code in warprec/evaluation/metrics/bias/pop_rsp.py
@metric_registry.register("PopRSP")
class PopRSP(TopKMetric):
    """Popularity-based Ranking-based Statistical Parity (PopRSP) metric.

    This metric evaluates the disparity in recommendation exposure
    between popular (short head) and less popular (long tail) items.
    For each group it divides the number of recommendations falling in
    the group by the number of items in the group, then reports the
    population standard deviation of the two ratios normalized by
    their mean.

    Attributes:
        short_head (Tensor): The lookup tensor of short head items.
        long_tail (Tensor): The lookup tensor of long tail items.
        total_short (Tensor): The total number of short head items.
        total_long (Tensor): The total number of long tail items.
        short_recs (Tensor): The short head recommendations.
        long_recs (Tensor): The long tail recommendations.

    Args:
        k (int): The cutoff for recommendations.
        item_interactions (Tensor): The counts for item interactions in training set.
        pop_ratio (float): The percentile considered popular.
        dist_sync_on_step (bool): Torchmetrics parameter.
        **kwargs (Any): The keyword argument dictionary (accepted but not
            forwarded to the parent class).
    """

    _REQUIRED_COMPONENTS: Set[MetricBlock] = {
        MetricBlock.BINARY_RELEVANCE,
        MetricBlock.TOP_K_INDICES,
    }

    short_head: Tensor
    long_tail: Tensor
    total_short: Tensor
    total_long: Tensor
    short_recs: Tensor
    long_recs: Tensor

    def __init__(
        self,
        k: int,
        item_interactions: Tensor,
        pop_ratio: float = 0.8,
        dist_sync_on_step: bool = False,
        **kwargs: Any,
    ):
        super().__init__(k, dist_sync_on_step)
        self.pop_ratio = pop_ratio

        # Running counters, summed across processes in distributed runs.
        self.add_state("short_recs", default=torch.tensor(0.0), dist_reduce_fx="sum")
        self.add_state("long_recs", default=torch.tensor(0.0), dist_reduce_fx="sum")

        # Add short head and long tail items as buffer
        sh, lt = self.compute_head_tail(item_interactions, self.pop_ratio)
        self.register_buffer("short_head", sh)
        self.register_buffer("long_tail", lt)

        # Store the total number of items in each group
        self.register_buffer("total_short", torch.tensor(len(sh), dtype=torch.float))
        self.register_buffer("total_long", torch.tensor(len(lt), dtype=torch.float))

    def update(self, preds: Tensor, **kwargs: Any):
        """Accumulate per-group recommendation counts for a batch.

        Args:
            preds (Tensor): The model predictions (not read by this method).
            **kwargs (Any): Must provide ``top_{k}_indices``;
                ``item_indices`` is optional.
        """
        top_k_indices = kwargs.get(f"top_{self.k}_indices")
        item_indices = kwargs.get("item_indices")

        # Remap top_k_indices to global
        top_k_indices = self.remap_indices(top_k_indices, item_indices)

        # Accumulate short head and long tail recommendations
        self.short_recs += torch.isin(top_k_indices, self.short_head).sum().float()
        self.long_recs += torch.isin(top_k_indices, self.long_tail).sum().float()

    def compute(self):
        """Computes the final metric value.

        Returns:
            dict: A single-entry mapping ``{self.name: value}`` where ``value``
            is a Python float. The value is ``0.0`` when either group is empty
            or when the mean of the two ratios is zero.
        """
        # Handle division by zero; report a plain float like the regular path.
        if self.total_short == 0 or self.total_long == 0:
            return {self.name: 0.0}

        pr_short = self.short_recs / self.total_short
        pr_long = self.long_recs / self.total_long
        pr = torch.stack([pr_short, pr_long])

        # Handle the case where mean is zero
        mean_pr = torch.mean(pr)
        if mean_pr == 0:
            return {self.name: 0.0}

        # unbiased=False -> population standard deviation over the two groups
        pop_rsp = torch.std(pr, unbiased=False) / mean_pr
        return {self.name: pop_rsp.item()}

    @property
    def name(self):
        """The name of the metric."""
        if self.pop_ratio == 0.8:
            return self.__class__.__name__
        return f"PopRSP[Pop{int(self.pop_ratio * 100)}%]"

name property

The name of the metric.

compute()

Computes the final metric value.

Source code in warprec/evaluation/metrics/bias/pop_rsp.py
def compute(self):
    """Computes the final metric value.

    Returns:
        dict: A single-entry mapping ``{self.name: value}`` where ``value``
        is a Python float. The value is ``0.0`` when either group is empty
        or when the mean of the two ratios is zero.
    """
    # Handle division by zero; report a plain float like the regular path.
    if self.total_short == 0 or self.total_long == 0:
        return {self.name: 0.0}

    pr_short = self.short_recs / self.total_short
    pr_long = self.long_recs / self.total_long
    pr = torch.stack([pr_short, pr_long])

    # Handle the case where mean is zero
    mean_pr = torch.mean(pr)
    if mean_pr == 0:
        return {self.name: 0.0}

    # unbiased=False -> population standard deviation over the two groups
    pop_rsp = torch.std(pr, unbiased=False) / mean_pr
    return {self.name: pop_rsp.item()}