Skip to content

Rapidata benchmark

RapidataBenchmark #

RapidataBenchmark(
    name: str, id: str, openapi_service: OpenAPIService
)

An instance of a Rapidata benchmark.

Used to interact with a specific benchmark in the Rapidata system, such as retrieving prompts and evaluating models.

Parameters:

Name Type Description Default
name str

The name that will be used to identify the benchmark on the overview.

required
id str

The id of the benchmark.

required
openapi_service OpenAPIService

The OpenAPI service to use to interact with the Rapidata API.

required
Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def __init__(self, name: str, id: str, openapi_service: OpenAPIService):
    # Identity of the benchmark plus the service every API call goes through.
    self.name = name
    self.id = id
    self._openapi_service = openapi_service

    # Per-prompt state. The four lists below are appended to in lockstep by
    # add_prompts, so they stay aligned by index.
    self.__identifiers: list[str] = []
    self.__prompts: list[str | None] = []
    self.__prompt_assets: list[str | None] = []
    self.__tags: list[list[str]] = []
    # Server-side English translations; reset to empty by add_prompts.
    self.__english_prompts: list[str | None] = []

    # Other locally held collections, all starting out empty.
    self.__leaderboards: list["RapidataLeaderboard"] = []
    self.__participants: list[BenchmarkParticipant] = []

    # Web page of this benchmark, derived from the service's environment.
    environment = self._openapi_service.environment
    self.__benchmark_page: str = (
        f"https://app.{environment}/mri/benchmarks/{self.id}"
    )

    # Helper that performs the actual prompt uploads for this benchmark.
    self._prompt_uploader = BenchmarkPromptUploader(id, openapi_service)

prompts property #

prompts: list[str | None]

Returns the prompts as originally provided, in the order they were registered.

english_prompts property #

english_prompts: list[str | None]

Returns the prompts translated to English, aligned by index with prompts.

The translations are produced server-side, so accessing this after add_prompts triggers a one-off re-fetch of the prompt set.

prompt_assets property #

prompt_assets: list[str | None]

Returns the prompt assets that are registered for the benchmark.

tags property #

tags: list[list[str]]

Returns the tags that are registered for the benchmark.

leaderboards property #

leaderboards: list[RapidataLeaderboard]

Returns the leaderboards that are registered for the benchmark.

participants property #

participants: list[BenchmarkParticipant]

Returns the participants that are registered for the benchmark.

add_prompts #

add_prompts(
    identifiers: Optional[list[str]] = None,
    prompts: Optional[list[str | None] | list[str]] = None,
    prompt_assets: Optional[
        list[str | None] | list[str]
    ] = None,
    tags: Optional[
        list[list[str] | None] | list[list[str]]
    ] = None,
) -> None

Adds one or more prompts to the benchmark. Everything is matched up by the indexes of the lists.

At least one of prompts or identifiers must be provided, and at least one of prompts or prompt_assets must be provided.

The prompts are uploaded concurrently. A failed upload does not abort the rest: every prompt is attempted, failures are logged, and only the prompts that succeeded are registered.

Parameters:

Name Type Description Default
identifiers Optional[list[str]]

The identifiers of the prompts/assets/tags that will be used to match up the media. If not provided, it will use the prompts as the identifiers.

None
prompts Optional[list[str | None] | list[str]]

The prompts that will be registered for the benchmark.

None
prompt_assets Optional[list[str | None] | list[str]]

The prompt assets that will be registered for the benchmark.

None
tags Optional[list[list[str] | None] | list[list[str]]]

The tags that will be associated with the prompts to use for filtering the leaderboard results. They will NOT be shown to the users.

None
Example
benchmark.add_prompts(
    identifiers=["id1", "id2"],
    prompts=["prompt 1", "prompt 2"],
    prompt_assets=["https://assets.rapidata.ai/prompt_1.jpg", "https://assets.rapidata.ai/prompt_2.jpg"],
    tags=[["tag1", "tag2"], ["tag2"]],
)
Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def add_prompts(
    self,
    identifiers: Optional[list[str]] = None,
    prompts: Optional[list[str | None] | list[str]] = None,
    prompt_assets: Optional[list[str | None] | list[str]] = None,
    tags: Optional[list[list[str] | None] | list[list[str]]] = None,
) -> None:
    """
    Registers one or more prompts on the benchmark. The lists are parallel:
    entry ``i`` of every list describes the same prompt.

    At least one of prompts or identifiers must be provided, and at least one
    of prompts or prompt_assets must be provided.

    The prompts are uploaded concurrently. A failed upload does not abort the
    rest: every prompt is attempted, failures are logged, and only the prompts
    that succeeded are registered.

    Args:
        identifiers: The identifiers of the prompts/assets/tags that will be used to match up the media. If not provided, the prompts are used as the identifiers.
        prompts: The prompts that will be registered for the benchmark.
        prompt_assets: The prompt assets that will be registered for the benchmark.
        tags: The tags that will be associated with the prompts to use for filtering the leaderboard results. They will NOT be shown to the users.

    Raises:
        ValueError: If an argument has the wrong shape or element type, if
            identifiers (or prompts standing in for them) are not unique, if
            the lists differ in length, or if an identifier is already
            registered on the benchmark.

    Example:
        ```python
        benchmark.add_prompts(
            identifiers=["id1", "id2"],
            prompts=["prompt 1", "prompt 2"],
            prompt_assets=["https://assets.rapidata.ai/prompt_1.jpg", "https://assets.rapidata.ai/prompt_2.jpg"],
            tags=[["tag1", "tag2"], ["tag2"]],
        )
        ```
    """
    with tracer.start_as_current_span("RapidataBenchmark.add_prompts"):
        # --- Element-type validation (only for arguments that were given) ---
        if prompts:
            prompts_well_formed = isinstance(prompts, list) and all(
                text is None or isinstance(text, str) for text in prompts
            )
            if not prompts_well_formed:
                raise ValueError("Prompts must be a list of strings or None.")

        if prompt_assets:
            assets_well_formed = isinstance(prompt_assets, list) and all(
                asset is None or isinstance(asset, str) for asset in prompt_assets
            )
            if not assets_well_formed:
                raise ValueError("Media assets must be a list of strings or None.")

        if identifiers:
            identifiers_well_formed = isinstance(identifiers, list) and all(
                isinstance(key, str) for key in identifiers
            )
            if not identifiers_well_formed:
                raise ValueError("Identifiers must be a list of strings.")

        if identifiers and len(identifiers) != len(set(identifiers)):
            raise ValueError("Identifiers must be unique.")

        if tags is not None:
            if not isinstance(tags, list):
                raise ValueError("Tags must be a list of lists of strings or None.")

            for tag_group in tags:
                # A None entry means "no tags for this prompt" and is allowed.
                if tag_group is None:
                    continue
                if isinstance(tag_group, list) and all(
                    isinstance(label, str) for label in tag_group
                ):
                    continue
                raise ValueError(
                    "Tags must be a list of lists of strings or None."
                )

        # --- Required-argument combinations ---
        if not (identifiers or prompts):
            raise ValueError(
                "At least one of identifiers or prompts must be provided."
            )

        if not (prompts or prompt_assets):
            raise ValueError(
                "At least one of prompts or media assets must be provided."
            )

        # --- Without explicit identifiers the prompts double as identifiers ---
        if not identifiers:
            assert prompts is not None
            if len(prompts) != len(set(prompts)):
                raise ValueError(
                    "Prompts must be unique. Otherwise use identifiers."
                )
            if any(text is None for text in prompts):
                raise ValueError(
                    "Prompts must not be None. Otherwise use identifiers."
                )

            identifiers = cast(list[str], prompts)

        assert identifiers is not None

        expected_length = len(identifiers)

        # --- Pad every omitted column with None so the columns can be zipped ---
        if not prompts:
            prompts = cast(list[str | None], [None] * expected_length)

        if not prompt_assets:
            prompt_assets = cast(list[str | None], [None] * expected_length)

        if not tags:
            tags = cast(list[list[str] | None], [None] * expected_length)

        if any(
            len(column) != expected_length
            for column in (prompts, prompt_assets, tags)
        ):
            raise ValueError(
                "Identifiers, prompts, media assets, and tags must have the same length or set to None."
            )

        # Read `self.identifiers` exactly once: the property getter re-fetches
        # over HTTP while the cache is empty, so it must not be evaluated per
        # identifier. A set also makes each membership test O(1).
        existing_identifiers = set(self.identifiers)
        already_registered = []
        for key in identifiers:
            if key in existing_identifiers:
                already_registered.append(key)
        if already_registered:
            raise ValueError(
                f"Identifiers already exist in the benchmark: {already_registered}"
            )

        # --- Build one upload record per row; a missing tag list becomes [] ---
        to_upload = []
        for key, text, asset, tag_group in zip(
            identifiers, prompts, prompt_assets, tags
        ):
            to_upload.append(
                BenchmarkPrompt(
                    key, text, asset, [] if tag_group is None else tag_group
                )
            )

        # Only the prompts returned by the uploader are recorded locally.
        for uploaded in self._prompt_uploader.upload_many(to_upload):
            self.__identifiers.append(uploaded.identifier)
            self.__prompts.append(uploaded.prompt)
            self.__prompt_assets.append(uploaded.prompt_asset)
            self.__tags.append(uploaded.tags)

        # The English translation is produced server-side and is unknown for
        # the just-added prompts. Clear it so the next access lazily re-fetches
        # the prompt set, while the rest of the cache stays intact.
        self.__english_prompts = []

create_leaderboard #

create_leaderboard(
    name: str,
    instruction: str,
    show_prompt: bool = False,
    show_prompt_asset: bool = False,
    inverse_ranking: bool = False,
    level_of_detail: LevelOfDetail | None = None,
    min_responses_per_matchup: int | None = None,
    audience_id: str | RapidataAudienceBase | None = None,
    settings: Sequence["RapidataSetting"] | None = None,
) -> RapidataLeaderboard

Creates a new leaderboard for the benchmark.

Parameters:

Name Type Description Default
name str

The name of the leaderboard. (not shown to the users)

required
instruction str

The instruction decides how the models will be evaluated.

required
show_prompt bool

Whether to show the prompt to the users. (default: False)

False
show_prompt_asset bool

Whether to show the prompt asset to the users. (only works if the prompt asset is a URL) (default: False)

False
inverse_ranking bool

Whether to inverse the ranking of the leaderboard. (if the question is inversed, e.g. "Which video is worse?")

False
level_of_detail LevelOfDetail | None

The level of detail of the leaderboard. This will affect how many comparisons are done per model evaluation. One of: 'debug', 'low', 'medium', 'high', 'very high'. (default: None, server decides)

None
min_responses_per_matchup int | None

The minimum number of responses required to be considered for the leaderboard. (default: 3)

None
audience_id str | RapidataAudienceBase | None

The audience that should answer the leaderboard. Pass either the audience id, a RapidataAudience (dimension audience), or a RapidataFilteredAudience (derived via RapidataAudience.filter). Defaults to the global audience when not specified.

None
settings Sequence['RapidataSetting'] | None

The settings that should be applied to the leaderboard. Will determine the behavior of the tasks on the leaderboard. (default: [])

None
Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def create_leaderboard(
    self,
    name: str,
    instruction: str,
    show_prompt: bool = False,
    show_prompt_asset: bool = False,
    inverse_ranking: bool = False,
    level_of_detail: LevelOfDetail | None = None,
    min_responses_per_matchup: int | None = None,
    audience_id: str | RapidataAudienceBase | None = None,
    settings: Sequence["RapidataSetting"] | None = None,
) -> RapidataLeaderboard:
    """
    Creates a new leaderboard on this benchmark and returns a handle to it.

    Args:
        name: The name of the leaderboard. (not shown to the users)
        instruction: The instruction decides how the models will be evaluated.
        show_prompt: Whether to show the prompt to the users. (default: False)
        show_prompt_asset: Whether to show the prompt asset to the users. (only works if the prompt asset is a URL) (default: False)
        inverse_ranking: Whether to invert the ranking of the leaderboard. (if the question is inverted, e.g. "Which video is worse?")
        level_of_detail: The level of detail of the leaderboard. This will affect how many comparisons are done per model evaluation. One of: 'debug', 'low', 'medium', 'high', 'very high'. (default: None, server decides)
        min_responses_per_matchup: The minimum number of responses required to be considered for the leaderboard. Must be at least 3 when given. (default: None)
        audience_id: The audience that should answer the leaderboard. Pass either the audience id, a :class:`RapidataAudience` (dimension audience), or a :class:`RapidataFilteredAudience` (derived via :py:meth:`RapidataAudience.filter`). Defaults to the global audience when not specified.
        settings: The settings that should be applied to the leaderboard. Will determine the behavior of the tasks on the leaderboard. (default: None, no settings are sent)

    Returns:
        The RapidataLeaderboard built from the arguments and the API response.

    Raises:
        ValueError: If level_of_detail is not one of the allowed strings, or
            min_responses_per_matchup is not an integer of at least 3.
    """
    from rapidata.api_client.models.create_leaderboard_endpoint_input import (
        CreateLeaderboardEndpointInput,
    )
    from rapidata.rapidata_client.audience._audience_base import (
        RapidataAudienceBase,
    )
    from rapidata.rapidata_client.benchmark._detail_mapper import DetailMapper
    from rapidata.rapidata_client.benchmark.leaderboard.rapidata_leaderboard import (
        RapidataLeaderboard,
    )

    with tracer.start_as_current_span("RapidataBenchmark.create_leaderboard"):
        # --- Argument validation ---
        if level_of_detail is not None:
            allowed_levels = LevelOfDetail.__args__
            if not (
                isinstance(level_of_detail, str)
                and level_of_detail in allowed_levels
            ):
                raise ValueError(
                    "Level of detail must be a string and one of: "
                    + ", ".join(allowed_levels)
                )

        if min_responses_per_matchup is not None:
            if (
                not isinstance(min_responses_per_matchup, int)
                or min_responses_per_matchup < 3
            ):
                raise ValueError(
                    "Min responses per matchup must be an integer and at least 3"
                )

        # An audience object is reduced to its id; a plain id (or None)
        # passes through unchanged.
        if isinstance(audience_id, RapidataAudienceBase):
            resolved_audience_id = audience_id.id
        else:
            resolved_audience_id = audience_id

        logger.info(
            "Creating leaderboard %s with instruction %s, show_prompt %s, show_prompt_asset %s, inverse_ranking %s, level_of_detail %s, min_responses_per_matchup %s, audience_id %s, settings %s",
            name,
            instruction,
            show_prompt,
            show_prompt_asset,
            inverse_ranking,
            level_of_detail,
            min_responses_per_matchup,
            resolved_audience_id,
            settings,
        )

        post_leaderboard = (
            self._openapi_service.leaderboard.leaderboard_api.leaderboard_post
        )

        # Optional payload pieces: None is sent when the caller gave nothing.
        if level_of_detail is not None:
            response_budget = DetailMapper.get_budget(level_of_detail)
        else:
            response_budget = None

        if settings:
            feature_flags = [setting._to_feature_flag() for setting in settings]
        else:
            feature_flags = None

        request_body = CreateLeaderboardEndpointInput(
            benchmarkId=self.id,
            name=name,
            instruction=instruction,
            showPrompt=show_prompt,
            showPromptAsset=show_prompt_asset,
            isInversed=inverse_ranking,
            minResponses=min_responses_per_matchup,
            responseBudget=response_budget,
            audienceId=resolved_audience_id,
            featureFlags=feature_flags,
        )
        leaderboard_result = post_leaderboard(
            create_leaderboard_endpoint_input=request_body
        )

        # Sanity check that the API attached the leaderboard to this benchmark.
        assert (
            leaderboard_result.benchmark_id == self.id
        ), "The leaderboard was not created for the correct benchmark."

        logger.info("Leaderboard created with id %s", leaderboard_result.id)

        # Budget and minimum responses come from the API response, not from
        # the arguments, so server-side defaults are reflected.
        return RapidataLeaderboard(
            name,
            instruction,
            show_prompt,
            show_prompt_asset,
            inverse_ranking,
            leaderboard_result.response_budget,
            leaderboard_result.min_responses,
            self.id,
            leaderboard_result.id,
            self._openapi_service,
        )

evaluate_model #

evaluate_model(
    name: str,
    media: list[str],
    identifiers: list[str] | None = None,
    prompts: list[str] | None = None,
    data_type: Literal["media", "text"] = "media",
) -> None

Evaluates a model on the benchmark across all leaderboards.

prompts or identifiers must be provided to match the media.

Parameters:

Name Type Description Default
name str

The name of the model.

required
media list[str]

The generated media or text that will be used to evaluate the model.

required
identifiers list[str] | None

The identifiers that correspond to the media. The order of the identifiers must match the order of the media.

The identifiers that are used must be registered for the benchmark. To see the registered identifiers, use the identifiers property.

None
prompts list[str] | None

The prompts that correspond to the media. The order of the prompts must match the order of the media.

None
data_type Literal['media', 'text']

The type of data being provided. Use "media" for images/videos/audio (default) or "text" for text content.

'media'
Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def evaluate_model(
    self,
    name: str,
    media: list[str],
    identifiers: list[str] | None = None,
    prompts: list[str] | None = None,
    data_type: Literal["media", "text"] = "media",
) -> None:
    """
    Adds a model to the benchmark and immediately submits it for evaluation
    across all leaderboards.

    This is `add_model` followed by `run()` on the returned participant.
    prompts or identifiers must be provided to match the media.

    Args:
        name: The name of the model.
        media: The generated media or text that will be used to evaluate the model.
        identifiers: The identifiers that correspond to the media. The order of the identifiers must match the order of the media.\n
            The identifiers that are used must be registered for the benchmark. To see the registered identifiers, use the identifiers property.
        prompts: The prompts that correspond to the media. The order of the prompts must match the order of the media.
        data_type: The type of data being provided. Use "media" for images/videos/audio (default) or "text" for text content.
    """
    with tracer.start_as_current_span("RapidataBenchmark.evaluate_model"):
        # Create the participant and upload its media, then submit it.
        self.add_model(
            name=name,
            media=media,
            identifiers=identifiers,
            prompts=prompts,
            data_type=data_type,
        ).run()

add_model #

add_model(
    name: str,
    media: list[str],
    identifiers: list[str] | None = None,
    prompts: list[str] | None = None,
    data_type: Literal["media", "text"] = "media",
) -> BenchmarkParticipant

Adds a model to the benchmark without immediately submitting it for evaluation.

This method creates a participant, uploads media, but does NOT submit the participant. Use participant.run() or benchmark.run() to submit afterwards.

Parameters:

Name Type Description Default
name str

The name of the model.

required
media list[str]

The generated media or text that will be used to evaluate the model.

required
identifiers list[str] | None

The identifiers that correspond to the media. The order of the identifiers must match the order of the media.

The identifiers that are used must be registered for the benchmark. To see the registered identifiers, use the identifiers property.

None
prompts list[str] | None

The prompts that correspond to the media. The order of the prompts must match the order of the media.

None
data_type Literal['media', 'text']

The type of data being provided. Use "media" for images/videos/audio (default) or "text" for text content.

'media'

Returns:

Type Description
BenchmarkParticipant

The created BenchmarkParticipant instance.

Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def add_model(
    self,
    name: str,
    media: list[str],
    identifiers: list[str] | None = None,
    prompts: list[str] | None = None,
    data_type: Literal["media", "text"] = "media",
) -> BenchmarkParticipant:
    """Adds a model to the benchmark without immediately submitting it for evaluation.

    This method creates a participant, uploads media, but does NOT submit the participant.
    Use `participant.run()` or `benchmark.run()` to submit afterwards.

    Args:
        name: The name of the model.
        media: The generated media or text that will be used to evaluate the model.
        identifiers: The identifiers that correspond to the media. The order of the identifiers must match the order of the media.\n
            The identifiers that are used must be registered for the benchmark. To see the registered identifiers, use the identifiers property.
        prompts: The prompts that correspond to the media. The order of the prompts must match the order of the media.
            Mutually exclusive with identifiers.
        data_type: The type of data being provided. Use "media" for images/videos/audio (default) or "text" for text content.

    Returns:
        The created BenchmarkParticipant instance.

    Raises:
        ValueError: If media is empty, if neither or both of identifiers and
            prompts are given, if the lengths differ, or if an
            identifier/prompt is not registered on the benchmark.
        RuntimeError: If not a single upload succeeded.
    """
    from rapidata.api_client.models.create_benchmark_participant_endpoint_input import (
        CreateBenchmarkParticipantEndpointInput,
    )
    from rapidata.rapidata_client.benchmark.participant.participant import (
        BenchmarkParticipant,
    )

    with tracer.start_as_current_span("RapidataBenchmark.add_model"):
        if not media:
            raise ValueError("Media must be a non-empty list of strings")

        if not identifiers and not prompts:
            raise ValueError("Identifiers or prompts must be provided.")

        if identifiers and prompts:
            raise ValueError(
                "Identifiers and prompts cannot be provided at the same time. Use one or the other."
            )

        if not identifiers:
            # Prompts stand in for identifiers when none were given.
            assert prompts is not None
            identifiers = prompts

        if len(media) != len(identifiers):
            raise ValueError(
                "Media and identifiers/prompts must have the same length"
            )

        # Read `self.identifiers` exactly once: the property getter re-fetches
        # over HTTP while the cache is empty, so evaluating it inside the
        # membership test would issue one request per identifier on a fresh
        # benchmark. A set also makes each lookup O(1).
        registered_identifiers = set(self.identifiers)
        if not all(
            identifier in registered_identifiers for identifier in identifiers
        ):
            raise ValueError(
                "All identifiers/prompts must be in the registered identifiers/prompts list. To see the registered identifiers/prompts, use the identifiers/prompts property."
            )

        participant_result = self._openapi_service.leaderboard.benchmark_api.benchmark_benchmark_id_participants_post(
            benchmark_id=self.id,
            create_benchmark_participant_endpoint_input=CreateBenchmarkParticipantEndpointInput(
                name=name,
            ),
        )

        # The participant now exists server-side, so the cached participant
        # list is out of date from here on. Clear it immediately rather than
        # after the upload, so a failing upload cannot leave a stale cache
        # behind; the next access re-fetches.
        self.__participants = []

        logger.info(f"Participant created: {participant_result.participant_id}")

        participant = BenchmarkParticipant(
            name,
            participant_result.participant_id,
            self._openapi_service,
            self.id,
        )

        with tracer.start_as_current_span("upload_media_for_participant"):
            logger.info(
                f"Uploading {len(media)} media assets to participant {participant.id}"
            )

            successful_uploads, failed_uploads = participant.upload_media(
                media,
                identifiers,
                data_type=data_type,
            )

            total_uploads = len(media)
            success_rate = (
                (len(successful_uploads) / total_uploads * 100)
                if total_uploads > 0
                else 0
            )
            logger.info(
                f"Upload complete: {len(successful_uploads)} successful, {len(failed_uploads)} failed ({success_rate:.1f}% success rate)"
            )

            # Partial failure is tolerated (logged only); total failure is not.
            if failed_uploads:
                logger.error(f"Failed uploads for media: {failed_uploads}")
                logger.warning(
                    "Some uploads failed. The model evaluation may be incomplete."
                )

            if len(successful_uploads) == 0:
                raise RuntimeError(
                    "No uploads were successful. The model evaluation will not be completed."
                )

        return participant

run #

run() -> None

Submits all participants that are in CREATED state.

This is a convenience method to submit all unsubmitted participants at once.

Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def run(self) -> None:
    """Submits every participant that is still in `CREATED` state.

    Convenience wrapper that calls `run()` on each unsubmitted participant and
    then clears the participant cache.
    """
    from rapidata.api_client.models.participant_status import ParticipantStatus

    with tracer.start_as_current_span("RapidataBenchmark.run"):
        # Collect the unsubmitted participants first so the count can be
        # logged before anything is submitted.
        created = []
        for candidate in self.participants:
            if candidate.status == ParticipantStatus.CREATED:
                created.append(candidate)

        logger.info(f"Submitting {len(created)} participants in CREATED state")
        for pending in created:
            pending.run()

        # Clear the cache so the next access re-fetches.
        self.__participants = []

view #

view() -> None

Opens the benchmark page in the default web browser. If no browser can be opened, the URL is printed instead.

Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def view(self) -> None:
    """
    Opens the benchmark page in the web browser.

    If no browser could be opened, the percent-encoded URL is printed so it
    can be opened manually.
    """

    logger.info("Opening benchmark page in browser...")
    page_url = self.__benchmark_page
    if webbrowser.open(page_url):
        return

    # Fallback: print the URL. Reserved URL characters are listed as safe so
    # they stay as they are and only the remaining characters are escaped.
    encoded_url = urllib.parse.quote(page_url, safe="%/:=&?~#+!$,;'@()*[]")
    managed_print(
        Fore.RED
        + f"Please open this URL in your browser: '{encoded_url}'"
        + Fore.RESET
    )

get_overall_standings #

get_overall_standings(
    tags: Optional[list[str]] = None,
    leaderboard_ids: Optional[list[str]] = None,
) -> DataFrame

Returns an aggregated elo table of all leaderboards in the benchmark.

Parameters:

Name Type Description Default
tags Optional[list[str]]

Filter standings by these tags. If None, all tags are considered.

None
leaderboard_ids Optional[list[str]]

Filter to only include matchups from these leaderboards. If None, all leaderboards are considered.

None
Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def get_overall_standings(
    self,
    tags: Optional[list[str]] = None,
    leaderboard_ids: Optional[list[str]] = None,
) -> pd.DataFrame:
    """
    Returns an aggregated elo table of all leaderboards in the benchmark.

    Args:
        tags: Filter standings by these tags. If None, all tags are considered.
        leaderboard_ids: Filter to only include matchups from these leaderboards. If None, all leaderboards are considered.

    Returns:
        A pandas DataFrame with one row per participant and the columns
        ``name``, ``wins``, ``total_matches`` and ``score`` (rounded to two
        decimals, or None when the participant has no score). The columns are
        present even when there are no participants.
    """
    import pandas as pd

    # Fixed column set so an empty result still has the documented columns;
    # without it `pd.DataFrame([])` has none and `df["score"]` raises KeyError.
    columns = ["name", "wins", "total_matches", "score"]

    with tracer.start_as_current_span("get_overall_standings"):
        participants = self._openapi_service.leaderboard.benchmark_api.benchmark_benchmark_id_standings_get(
            benchmark_id=self.id,
            tags=tags,
            leaderboard_ids=leaderboard_ids,
        )

        standings = [
            {
                "name": participant.name,
                "wins": participant.wins,
                "total_matches": participant.total_matches,
                "score": (
                    round(participant.score, 2)
                    if participant.score is not None
                    else None
                ),
            }
            for participant in participants.items
        ]

        return pd.DataFrame(standings, columns=columns)

get_win_loss_matrix #

get_win_loss_matrix(
    tags: Optional[list[str]] = None,
    participant_ids: Optional[list[str]] = None,
    leaderboard_ids: Optional[list[str]] = None,
    use_weighted_scoring: Optional[bool] = None,
) -> DataFrame

Returns the win/loss matrix for all participants across all leaderboards in the benchmark.

The matrix shows pairwise comparison results where each cell [i, j] represents the number of wins participant i has against participant j.

Parameters:

Name Type Description Default
tags Optional[list[str]]

Filter matchups by these tags. If None, all matchups are considered.

None
participant_ids Optional[list[str]]

Filter to only include these participants.

None
leaderboard_ids Optional[list[str]]

Filter to only include matchups from these leaderboards.

None
use_weighted_scoring Optional[bool]

Whether to use weighted scoring for the matrix calculation.

None

Returns:

Type Description
DataFrame

A pandas DataFrame with participants as both index and columns, containing the pairwise win counts.

Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def get_win_loss_matrix(
    self,
    tags: Optional[list[str]] = None,
    participant_ids: Optional[list[str]] = None,
    leaderboard_ids: Optional[list[str]] = None,
    use_weighted_scoring: Optional[bool] = None,
) -> pd.DataFrame:
    """
    Returns the pairwise win/loss matrix of the benchmark's participants
    across all leaderboards.

    Cell [i, j] holds the number of wins participant i has against
    participant j.

    Args:
        tags: Filter matchups by these tags. If None, all matchups are considered.
        participant_ids: Filter to only include these participants.
        leaderboard_ids: Filter to only include matchups from these leaderboards.
        use_weighted_scoring: Whether to use weighted scoring for the matrix calculation.

    Returns:
        A pandas DataFrame with participants as both index and columns,
        containing the pairwise win counts.
    """
    import pandas as pd

    with tracer.start_as_current_span("get_win_loss_matrix"):
        benchmark_api = self._openapi_service.leaderboard.benchmark_api
        matrix = benchmark_api.benchmark_benchmark_id_matrix_get(
            benchmark_id=self.id,
            tags=tags,
            participant_ids=participant_ids,
            leaderboard_ids=leaderboard_ids,
            use_weighted_scoring=use_weighted_scoring,
        )

        # The response already carries the cells and both axis labels; wrap
        # them in a DataFrame without further processing.
        cells = matrix.data
        row_labels = pd.Index(matrix.index)
        column_labels = pd.Index(matrix.columns)
        return pd.DataFrame(data=cells, index=row_labels, columns=column_labels)