
Rapidata benchmark

RapidataBenchmark #

RapidataBenchmark(
    name: str, id: str, openapi_service: OpenAPIService
)

An instance of a Rapidata benchmark.

Used to interact with a specific benchmark in the Rapidata system, such as retrieving prompts and evaluating models.

Parameters:

    name (str, required): The name that will be used to identify the benchmark on the overview.
    id (str, required): The id of the benchmark.
    openapi_service (OpenAPIService, required): The OpenAPI service to use to interact with the Rapidata API.

Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def __init__(self, name: str, id: str, openapi_service: OpenAPIService):
    self.name = name
    self.id = id
    self.__openapi_service = openapi_service
    self.__prompts: list[str | None] = []
    self.__prompt_assets: list[str | None] = []
    self.__leaderboards: list[RapidataLeaderboard] = []
    self.__identifiers: list[str] = []
    self.__tags: list[list[str]] = []
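
A minimal construction sketch. The import path is inferred from the source location above and should be treated as an assumption; in practice the instance is usually obtained through higher-level Rapidata client code rather than built by hand, and `service` below stands in for an already-configured OpenAPIService.

# Import path inferred from the source path shown above (assumption).
from rapidata.rapidata_client.benchmark.rapidata_benchmark import RapidataBenchmark

# `service` is a placeholder for an already-configured OpenAPIService instance;
# constructing it is outside the scope of this page.
benchmark = RapidataBenchmark(
    name="image-gen-eval",    # shown on the benchmark overview
    id="bm_123",              # illustrative benchmark id
    openapi_service=service,
)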

prompts property #

prompts: list[str | None]

Returns the prompts that are registered for the benchmark.

prompt_assets property #

prompt_assets: list[str | None]

Returns the prompt assets that are registered for the benchmark.

tags property #

tags: list[list[str]]

Returns the tags that are registered for the benchmark.

leaderboards property #

leaderboards: list[RapidataLeaderboard]

Returns the leaderboards that are registered for the benchmark.
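
The prompts, prompt_assets and tags lists are appended in parallel by add_prompt (see below), so they share one index order with the registered identifiers. A small inspection sketch, assuming `benchmark` is an existing RapidataBenchmark:

# The identifiers property is the one referenced by evaluate_model below,
# even though it is not listed in this excerpt.
for identifier, prompt, asset, tag_list in zip(
    benchmark.identifiers,
    benchmark.prompts,
    benchmark.prompt_assets,
    benchmark.tags,
):
    print(identifier, prompt, asset, tag_list)

for leaderboard in benchmark.leaderboards:
    print(leaderboard)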

add_prompt #

add_prompt(
    identifier: str,
    prompt: str | None = None,
    asset: str | None = None,
    tags: Optional[list[str]] = None,
)

Adds a prompt to the benchmark.

Parameters:

    identifier (str, required): The identifier of the prompt/asset/tags; it is used to match up the media when a model is evaluated.
    prompt (str | None, default None): The prompt that will be used to evaluate the model.
    asset (str | None, default None): The asset that will be used to evaluate the model, provided as a link to the asset.
    tags (Optional[list[str]], default None): Tags can be used to filter the leaderboard results. They will NOT be shown to the users.

Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def add_prompt(
    self,
    identifier: str,
    prompt: str | None = None,
    asset: str | None = None,
    tags: Optional[list[str]] = None,
):
    """
    Adds a prompt to the benchmark.

    Args:
        identifier: The identifier of the prompt/asset/tags that will be used to match up the media.
        prompt: The prompt that will be used to evaluate the model.
        asset: The asset that will be used to evaluate the model. Provided as a link to the asset.
        tags: The tags can be used to filter the leaderboard results. They will NOT be shown to the users.
    """
    if tags is None:
        tags = []

    if not isinstance(identifier, str):
        raise ValueError("Identifier must be a string.")

    if prompt is None and asset is None:
        raise ValueError("Prompt or asset must be provided.")

    if prompt is not None and not isinstance(prompt, str):
        raise ValueError("Prompt must be a string.")

    if asset is not None and not isinstance(asset, str):
        raise ValueError("Asset must be a string. That is the link to the asset.")

    if identifier in self.identifiers:
        raise ValueError("Identifier already exists in the benchmark.")

    if asset is not None and not re.match(r"^https?://", asset):
        raise ValueError("Asset must be a link to the asset.")

    if tags is not None and (
        not isinstance(tags, list) or not all(isinstance(tag, str) for tag in tags)
    ):
        raise ValueError("Tags must be a list of strings.")

    self.__identifiers.append(identifier)

    self.__tags.append(tags)
    self.__prompts.append(prompt)
    self.__prompt_assets.append(asset)

    self.__openapi_service.benchmark_api.benchmark_benchmark_id_prompt_post(
        benchmark_id=self.id,
        submit_prompt_model=SubmitPromptModel(
            identifier=identifier,
            prompt=prompt,
            promptAsset=(
                SubmitPromptModelPromptAsset(
                    UrlAssetInput(_t="UrlAssetInput", url=asset)
                )
                if asset is not None
                else None
            ),
            tags=tags,
        ),
    )
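
A usage sketch for add_prompt with illustrative values, assuming `benchmark` is an existing RapidataBenchmark. Each call needs a unique identifier and at least one of prompt or asset; assets must be provided as http(s) links.

# Text-only prompt with tags (tags are only used for filtering results, never shown to users)
benchmark.add_prompt(
    identifier="sunset_city",
    prompt="A photorealistic sunset over a city skyline",
    tags=["photorealism", "landscape"],
)

# Asset-based prompt, provided as a link to the asset
benchmark.add_prompt(
    identifier="ref_portrait",
    asset="https://example.com/reference/portrait.png",
)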

create_leaderboard #

create_leaderboard(
    name: str,
    instruction: str,
    show_prompt: bool = False,
    show_prompt_asset: bool = False,
    inverse_ranking: bool = False,
    level_of_detail: Literal[
        "low", "medium", "high", "very high"
    ] = "low",
    min_responses_per_matchup: int = 3,
) -> RapidataLeaderboard

Creates a new leaderboard for the benchmark.

Parameters:

    name (str, required): The name of the leaderboard (not shown to the users).
    instruction (str, required): The instruction that decides how the models will be evaluated.
    show_prompt (bool, default False): Whether to show the prompt to the users.
    show_prompt_asset (bool, default False): Whether to show the prompt asset to the users (only works if the prompt asset is a URL).
    inverse_ranking (bool, default False): Whether to invert the ranking of the leaderboard (use this if the question is inverted, e.g. "Which video is worse?").
    level_of_detail (Literal['low', 'medium', 'high', 'very high'], default 'low'): The level of detail of the leaderboard. This affects how many comparisons are done per model evaluation.
    min_responses_per_matchup (int, default 3): The minimum number of responses per matchup required to be considered for the leaderboard.

Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def create_leaderboard(
    self,
    name: str,
    instruction: str,
    show_prompt: bool = False,
    show_prompt_asset: bool = False,
    inverse_ranking: bool = False,
    level_of_detail: Literal["low", "medium", "high", "very high"] = "low",
    min_responses_per_matchup: int = 3,
) -> RapidataLeaderboard:
    """
    Creates a new leaderboard for the benchmark.

    Args:
        name: The name of the leaderboard. (not shown to the users)
        instruction: The instruction decides how the models will be evaluated.
        show_prompt: Whether to show the prompt to the users. (default: False)
        show_prompt_asset: Whether to show the prompt asset to the users. (only works if the prompt asset is a URL) (default: False)
        inverse_ranking: Whether to invert the ranking of the leaderboard. (if the question is inverted, e.g. "Which video is worse?")
        level_of_detail: The level of detail of the leaderboard. This will affect how many comparisons are done per model evaluation. (default: "low")
        min_responses_per_matchup: The minimum number of responses required to be considered for the leaderboard. (default: 3)
    """
    if not isinstance(min_responses_per_matchup, int):
        raise ValueError("Min responses per matchup must be an integer")

    if min_responses_per_matchup < 3:
        raise ValueError("Min responses per matchup must be at least 3")

    leaderboard_result = self.__openapi_service.leaderboard_api.leaderboard_post(
        create_leaderboard_model=CreateLeaderboardModel(
            benchmarkId=self.id,
            name=name,
            instruction=instruction,
            showPrompt=show_prompt,
            showPromptAsset=show_prompt_asset,
            isInversed=inverse_ranking,
            minResponses=min_responses_per_matchup,
            responseBudget=DetailMapper.get_budget(level_of_detail),
        )
    )

    assert (
        leaderboard_result.benchmark_id == self.id
    ), "The leaderboard was not created for the correct benchmark."

    return RapidataLeaderboard(
        name,
        instruction,
        show_prompt,
        show_prompt_asset,
        inverse_ranking,
        leaderboard_result.response_budget,
        min_responses_per_matchup,
        leaderboard_result.id,
        self.__openapi_service,
    )
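
A usage sketch for create_leaderboard with illustrative values, assuming `benchmark` is an existing RapidataBenchmark with prompts registered.

leaderboard = benchmark.create_leaderboard(
    name="realism-preference-v1",          # internal name, not shown to users
    instruction="Which image looks more realistic?",
    show_prompt=True,                      # display the registered prompt to users
    level_of_detail="medium",              # more comparisons per model evaluation than "low"
    min_responses_per_matchup=3,           # must be an integer of at least 3
)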

evaluate_model #

evaluate_model(
    name: str, media: list[str], identifiers: list[str]
) -> None

Evaluates a model on the benchmark across all leaderboards.

Parameters:

    name (str, required): The name of the model.
    media (list[str], required): The generated images/videos that will be used to evaluate the model.
    identifiers (list[str], required): The identifiers that correspond to the media. The order of the identifiers must match the order of the media, and every identifier used must be registered for the benchmark. To see the registered identifiers, use the identifiers property.

Source code in src/rapidata/rapidata_client/benchmark/rapidata_benchmark.py
def evaluate_model(
    self, name: str, media: list[str], identifiers: list[str]
) -> None:
    """
    Evaluates a model on the benchmark across all leaderboards.

    Args:
        name: The name of the model.
        media: The generated images/videos that will be used to evaluate the model.
        identifiers: The identifiers that correspond to the media. The order of the identifiers must match the order of the media.
            The identifiers that are used must be registered for the benchmark. To see the registered identifiers, use the identifiers property.
    """
    if not media:
        raise ValueError("Media must be a non-empty list of strings")

    if len(media) != len(identifiers):
        raise ValueError("Media and identifiers must have the same length")

    if not all(identifier in self.identifiers for identifier in identifiers):
        raise ValueError(
            "All identifiers must be in the registered identifiers list. To see the registered identifiers, use the identifiers property.\
\nTo see the prompts that are associated with the identifiers, use the prompts property."
        )

    # happens before the creation of the participant to ensure all media paths are valid
    assets: list[MediaAsset] = []
    for media_path in media:
        assets.append(MediaAsset(media_path))

    participant_result = self.__openapi_service.benchmark_api.benchmark_benchmark_id_participants_post(
        benchmark_id=self.id,
        create_benchmark_participant_model=CreateBenchmarkParticipantModel(
            name=name,
        ),
    )

    logger.info(f"Participant created: {participant_result.participant_id}")

    participant = BenchmarkParticipant(
        name, participant_result.participant_id, self.__openapi_service
    )

    successful_uploads, failed_uploads = participant.upload_media(
        assets,
        identifiers,
    )

    total_uploads = len(assets)
    success_rate = (
        (len(successful_uploads) / total_uploads * 100) if total_uploads > 0 else 0
    )
    logger.info(
        f"Upload complete: {len(successful_uploads)} successful, {len(failed_uploads)} failed ({success_rate:.1f}% success rate)"
    )

    if failed_uploads:
        logger.error(
            f"Failed uploads for media: {[asset.path for asset in failed_uploads]}"
        )
        logger.warning(
            "Some uploads failed. The model evaluation may be incomplete."
        )

    if len(successful_uploads) == 0:
        raise RuntimeError(
            "No uploads were successful. The model evaluation will not be completed."
        )

    self.__openapi_service.participant_api.participants_participant_id_submit_post(
        participant_id=participant_result.participant_id
    )
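
A usage sketch for evaluate_model, assuming `benchmark` is an existing RapidataBenchmark and the identifiers below were previously registered via add_prompt; the media paths are illustrative.

benchmark.evaluate_model(
    name="my-model-v2",
    media=[
        "outputs/sunset_city.png",    # order must match the identifiers list
        "outputs/ref_portrait.png",
    ],
    identifiers=[
        "sunset_city",
        "ref_portrait",
    ],
)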