Tasks

Module for task-related functions and classes.

base_task

Base module for tasks.

BaseTask

Bases: ABC

Abstract base class for tasks in the promptolution library.

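A minimal sketch of a custom task (hypothetical: ExactMatchTask and its scoring rule are invented for illustration, assuming BaseTask is importable from the source path shown below). A concrete subclass only needs to implement _evaluate, which returns one score per datapoint:

from typing import List

import pandas as pd

from promptolution.tasks.base_task import BaseTask


class ExactMatchTask(BaseTask):
    """Hypothetical task: score 1.0 when a prediction matches the label exactly (case-insensitive)."""

    def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[float]:
        # One score per datapoint, aligned with xs, ys, and preds.
        return [float(pred.strip().lower() == y.strip().lower()) for y, pred in zip(ys, preds)]


df = pd.DataFrame({"text": ["2 + 2 =", "The sky is"], "label": ["4", "blue"]})
task = ExactMatchTask(df, x_column="text", y_column="label", task_description="Complete the statement.")
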
Source code in promptolution/tasks/base_task.py
class BaseTask(ABC):
    """Abstract base class for tasks in the promptolution library."""

    def __init__(
        self,
        df: pd.DataFrame,
        x_column: str,
        y_column: Optional[str] = None,
        task_description: Optional[str] = None,
        n_subsamples: int = 30,
        eval_strategy: "EvalStrategy" = "full",
        seed: int = 42,
        config: Optional["ExperimentConfig"] = None,
    ) -> None:
        """Initialize the BaseTask.

        Args:
            df (pd.DataFrame): The input DataFrame containing the data.
            x_column (str): Name of the column containing input texts.
            y_column (Optional[str]): Name of the column containing labels/ground truth (if applicable).
            task_description (str): Description of the task.
            n_subsamples (int): Number of subsamples to use for evaluation.
            eval_strategy (Literal): Subsampling strategy ("full", "subsample", "sequential_block", "random_block", "evaluated").
            seed (int): Random seed for reproducibility.
            config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
        """
        self.df = df
        self.x_column = x_column
        self.y_column = y_column
        self.task_description = task_description
        self.n_subsamples = n_subsamples
        self.eval_strategy = eval_strategy
        self.seed = seed

        super().__init__()
        if config is not None:
            config.apply_to(self)

        self.xs: List[str] = df[self.x_column].values.astype(str).tolist()
        self.has_y = y_column is not None
        if self.has_y and y_column is not None:
            self.ys: List[str] = df[y_column].values.astype(str).tolist()
        else:
            # If no y_column is provided, create a dummy y array
            self.ys = [""] * len(self.xs)

        self.block_idx = 0
        self.n_blocks = len(self.xs) // self.n_subsamples if self.n_subsamples > 0 else 1
        self.rng = np.random.default_rng(seed)

        self.eval_cache: Dict[Tuple[str, str, str], float] = {}  # (prompt, x, y): scores per datapoint
        self.seq_cache: Dict[Tuple[str, str, str], str] = {}  # (prompt, x, y): generating sequence per datapoint

    def subsample(self, eval_strategy: Optional["EvalStrategy"] = None) -> Tuple[List[str], List[str]]:
        """Subsample the dataset based on the specified parameters.

        Args:
            eval_strategy (EvalStrategy, optional): Subsampling strategy to use instead of self.eval_strategy. Defaults to None.

        Returns:
            Tuple[List[str], List[str]]: Subsampled input data and labels.
        """
        if eval_strategy is None:
            eval_strategy = self.eval_strategy

        if eval_strategy in ["full", "evaluated"]:
            return self.xs, self.ys
        elif eval_strategy == "subsample":
            indices = self.rng.choice(len(self.xs), min(self.n_subsamples, len(self.xs)), replace=False)
            return [self.xs[i] for i in indices], [self.ys[i] for i in indices]
        elif eval_strategy == "random_block":
            block_id = self.rng.integers(0, self.n_blocks)
            start_idx = block_id * self.n_subsamples
            end_idx = min((block_id + 1) * self.n_subsamples, len(self.xs))
            indices = np.arange(start_idx, end_idx)
            return [self.xs[i] for i in indices], [self.ys[i] for i in indices]
        elif eval_strategy == "sequential_block":
            start_idx = self.block_idx * self.n_subsamples
            end_idx = min((self.block_idx + 1) * self.n_subsamples, len(self.xs))
            indices = np.arange(start_idx, end_idx)
            return [self.xs[i] for i in indices], [self.ys[i] for i in indices]
        else:
            raise ValueError(f"Unknown subsampling strategy: '{eval_strategy}'")

    def _prepare_batch(
        self,
        prompts: List[str],
        xs: List[str],
        ys: List[str],
        eval_strategy: Literal["full", "subsample", "sequential_block", "random_block", "evaluated"] = "full",
    ) -> List[Tuple[str, str, str]]:
        """Generates (prompt, x, y) keys that require prediction.

        Returns keys not found in eval_cache.
        """
        if eval_strategy == "evaluated":
            return []
        keys_to_predict = []
        for prompt in prompts:
            for x, y in zip(xs, ys):
                cache_key = (prompt, x, str(y))
                if cache_key not in self.eval_cache:
                    keys_to_predict.append(cache_key)
        return keys_to_predict

    def _collect_results_from_cache(
        self,
        prompts: List[str],
        xs: List[str],
        ys: List[str],
        return_agg_scores: bool,
        return_seq: bool,
    ) -> Union[List[float], List[List[float]], Tuple[List[List[float]], List[List[str]]]]:
        """Collects all results for the current batch from the cache and formats them."""
        assert not (return_agg_scores and return_seq), "Cannot return both aggregated scores and sequences"

        scores = []
        seqs = []

        for prompt in prompts:
            datapoint_scores = []
            datapoint_seqs = []
            for x, y in zip(xs, ys):
                cache_key = (prompt, x, y)
                datapoint_scores.append(self.eval_cache[cache_key])
                if return_seq:
                    datapoint_seqs.append(self.seq_cache.get(cache_key, ""))
            scores.append(datapoint_scores)
            if return_seq:
                seqs.append(datapoint_seqs)

        if return_agg_scores:
            agg_scores = [np.nanmean(s).item() for s in scores]
            return agg_scores

        return scores if not return_seq else (scores, seqs)

    @abstractmethod
    def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[float]:
        """Abstract method to calculate the scores for predictions.

        This method should be implemented by subclasses based on their specific evaluation logic.
        """
        raise NotImplementedError

    @overload
    def evaluate(
        self,
        prompts: List[str],
        predictor: "BasePredictor",
        system_prompts: Optional[Union[str, List[str]]] = None,
        return_agg_scores: Literal[True] = True,
        return_seq: Literal[False] = False,
        eval_strategy: Optional["EvalStrategy"] = None,
    ) -> List[float]:
        ...

    @overload
    def evaluate(
        self,
        prompts: List[str],
        predictor: "BasePredictor",
        system_prompts: Optional[Union[str, List[str]]] = None,
        return_agg_scores: Literal[False] = False,
        return_seq: Literal[False] = False,
        eval_strategy: Optional["EvalStrategy"] = None,
    ) -> List[List[float]]:
        ...

    @overload
    def evaluate(
        self,
        prompts: List[str],
        predictor: "BasePredictor",
        system_prompts: Optional[Union[str, List[str]]] = None,
        return_agg_scores: Literal[False] = False,
        return_seq: Literal[True] = True,
        eval_strategy: Optional["EvalStrategy"] = None,
    ) -> Tuple[List[List[float]], List[List[str]]]:
        ...

    @overload
    def evaluate(
        self,
        prompts: str,
        predictor: "BasePredictor",
        system_prompts: Optional[Union[str, List[str]]] = None,
        return_agg_scores: Literal[True] = True,
        return_seq: Literal[False] = False,
        eval_strategy: Optional["EvalStrategy"] = None,
    ) -> List[float]:
        ...

    @overload
    def evaluate(
        self,
        prompts: str,
        predictor: "BasePredictor",
        system_prompts: Optional[Union[str, List[str]]] = None,
        return_agg_scores: Literal[False] = False,
        return_seq: Literal[False] = False,
        eval_strategy: Optional["EvalStrategy"] = None,
    ) -> List[List[float]]:
        ...

    @overload
    def evaluate(
        self,
        prompts: str,
        predictor: "BasePredictor",
        system_prompts: Optional[Union[str, List[str]]] = None,
        return_agg_scores: Literal[False] = False,
        return_seq: Literal[True] = True,
        eval_strategy: Optional["EvalStrategy"] = None,
    ) -> Tuple[List[List[float]], List[List[str]]]:
        ...

    def evaluate(
        self,
        prompts: Union[str, List[str]],
        predictor: "BasePredictor",
        system_prompts: Optional[Union[str, List[str]]] = None,
        return_agg_scores: bool = True,
        return_seq: bool = False,
        eval_strategy: Optional["EvalStrategy"] = None,
    ) -> Union[List[float], List[List[float]], Tuple[List[List[float]], List[List[str]]]]:
        """Evaluate a set of prompts using a given predictor.

        This method orchestrates subsampling, prediction, caching, and result collection.

        Note: Cannot return both aggregated scores and sequences (assertion will fail).
        """
        assert not (return_agg_scores and return_seq), "Cannot return both aggregated scores and sequences"

        seqs: List[str] = []

        prompts = [prompts] if isinstance(prompts, str) else prompts
        eval_strategy = eval_strategy or self.eval_strategy
        xs, ys = self.subsample(eval_strategy=eval_strategy)
        batches = self._prepare_batch(prompts, xs, ys, eval_strategy=eval_strategy)
        (prompts_to_evaluate, xs_to_evaluate, ys_to_evaluate) = ([], [], []) if not batches else zip(*batches)

        if prompts_to_evaluate:
            preds_seqs = predictor.predict(
                prompts=list(prompts_to_evaluate),
                xs=list(xs_to_evaluate),
                system_prompts=system_prompts,
                return_seq=return_seq,
            )
        else:
            preds_seqs = ([], []) if return_seq else []

        if return_seq:
            preds, seqs = preds_seqs if isinstance(preds_seqs, tuple) else (preds_seqs, [])
        else:
            preds = preds_seqs

        scores: List[float] = self._evaluate(list(xs_to_evaluate), list(ys_to_evaluate), preds)
        for i, cache_key in enumerate(batches):
            self.eval_cache[cache_key] = scores[i]
            if return_seq:
                self.seq_cache[cache_key] = seqs[i]

        return self._collect_results_from_cache(
            prompts,
            xs,
            ys,
            return_agg_scores,
            return_seq,
        )

    def pop_datapoints(self, n: Optional[int] = None, frac: Optional[float] = None) -> pd.DataFrame:
        """Pop a number of datapoints from the dataset.

        Args:
            n (int, optional): Number of datapoints to pop. Defaults to None.
            frac (float, optional): Fraction of datapoints to pop. Defaults to None.

        Returns:
            pd.DataFrame: DataFrame containing the popped datapoints.
        """
        assert n is None or frac is None, "Only one of n or frac can be specified."
        if n is not None:
            indices = self.rng.choice(len(self.xs), n, replace=False)
        elif frac is not None:
            indices = self.rng.choice(len(self.xs), int(len(self.xs) * frac), replace=False)
        else:
            raise ValueError("Either n or frac must be specified.")

        popped_xs = [self.xs[i] for i in indices]
        popped_ys = [self.ys[i] for i in indices]
        df_popped = pd.DataFrame({self.x_column: popped_xs, self.y_column: popped_ys})

        self.xs = [x for i, x in enumerate(self.xs) if i not in indices]
        self.ys = [y for i, y in enumerate(self.ys) if i not in indices]

        # Update n_blocks and block_idx based on the new dataset size
        self.n_blocks = len(self.xs) // self.n_subsamples if self.n_subsamples > 0 else 1
        self.block_idx = min(self.block_idx, self.n_blocks - 1) if self.n_blocks > 0 else 0

        # Clear cache for popped items (optional, but good practice if memory is a concern)
        keys_to_remove = []
        for key in self.eval_cache:
            if key[1] in popped_xs and key[2] in popped_ys:  # Check if the x and y correspond to popped data
                keys_to_remove.append(key)
        for key in keys_to_remove:
            self.eval_cache.pop(key, None)
            self.seq_cache.pop(key, None)

        return df_popped

    def increment_block_idx(self) -> None:
        """Increment the block index for subsampling.

        Raises:
            ValueError: If the eval_strategy does not contain "block".
        """
        if "block" not in self.eval_strategy:
            raise ValueError("Block increment is only valid for block subsampling.")
        self.block_idx += 1
        if self.n_blocks > 0:  # Ensure n_blocks is not zero to avoid division by zero
            self.block_idx %= self.n_blocks
        else:
            self.block_idx = 0  # If no blocks, reset to 0

    def reset_block_idx(self) -> None:
        """Reset the block index for subsampling.

        Raises:
            ValueError: If the eval_strategy does not contain "block".
        """
        if "block" not in self.eval_strategy:
            raise ValueError("Block reset is only valid for block subsampling.")
        self.block_idx = 0

__init__(df, x_column, y_column=None, task_description=None, n_subsamples=30, eval_strategy='full', seed=42, config=None)

Initialize the BaseTask.

Parameters:

    df (DataFrame, required): The input DataFrame containing the data.
    x_column (str, required): Name of the column containing input texts.
    y_column (Optional[str], default None): Name of the column containing labels/ground truth (if applicable).
    task_description (str, default None): Description of the task.
    n_subsamples (int, default 30): Number of subsamples to use for evaluation.
    eval_strategy (Literal, default 'full'): Subsampling strategy ("full", "subsample", "sequential_block", "random_block", "evaluated").
    seed (int, default 42): Random seed for reproducibility.
    config (ExperimentConfig, default None): Configuration for the task, overriding defaults.

Source code in promptolution/tasks/base_task.py
def __init__(
    self,
    df: pd.DataFrame,
    x_column: str,
    y_column: Optional[str] = None,
    task_description: Optional[str] = None,
    n_subsamples: int = 30,
    eval_strategy: "EvalStrategy" = "full",
    seed: int = 42,
    config: Optional["ExperimentConfig"] = None,
) -> None:
    """Initialize the BaseTask.

    Args:
        df (pd.DataFrame): The input DataFrame containing the data.
        x_column (str): Name of the column containing input texts.
        y_column (Optional[str]): Name of the column containing labels/ground truth (if applicable).
        task_description (str): Description of the task.
        n_subsamples (int): Number of subsamples to use for evaluation.
        eval_strategy (Literal): Subsampling strategy ("full", "subsample", "sequential_block", "random_block", "evaluated").
        seed (int): Random seed for reproducibility.
        config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
    """
    self.df = df
    self.x_column = x_column
    self.y_column = y_column
    self.task_description = task_description
    self.n_subsamples = n_subsamples
    self.eval_strategy = eval_strategy
    self.seed = seed

    super().__init__()
    if config is not None:
        config.apply_to(self)

    self.xs: List[str] = df[self.x_column].values.astype(str).tolist()
    self.has_y = y_column is not None
    if self.has_y and y_column is not None:
        self.ys: List[str] = df[y_column].values.astype(str).tolist()
    else:
        # If no y_column is provided, create a dummy y array
        self.ys = [""] * len(self.xs)

    self.block_idx = 0
    self.n_blocks = len(self.xs) // self.n_subsamples if self.n_subsamples > 0 else 1
    self.rng = np.random.default_rng(seed)

    self.eval_cache: Dict[Tuple[str, str, str], float] = {}  # (prompt, x, y): scores per datapoint
    self.seq_cache: Dict[Tuple[str, str, str], str] = {}  # (prompt, x, y): generating sequence per datapoint

evaluate(prompts, predictor, system_prompts=None, return_agg_scores=True, return_seq=False, eval_strategy=None)

evaluate(prompts: List[str], predictor: BasePredictor, system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[True] = True, return_seq: Literal[False] = False, eval_strategy: Optional[EvalStrategy] = None) -> List[float]
evaluate(prompts: List[str], predictor: BasePredictor, system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, return_seq: Literal[False] = False, eval_strategy: Optional[EvalStrategy] = None) -> List[List[float]]
evaluate(prompts: List[str], predictor: BasePredictor, system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, return_seq: Literal[True] = True, eval_strategy: Optional[EvalStrategy] = None) -> Tuple[List[List[float]], List[List[str]]]
evaluate(prompts: str, predictor: BasePredictor, system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[True] = True, return_seq: Literal[False] = False, eval_strategy: Optional[EvalStrategy] = None) -> List[float]
evaluate(prompts: str, predictor: BasePredictor, system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, return_seq: Literal[False] = False, eval_strategy: Optional[EvalStrategy] = None) -> List[List[float]]
evaluate(prompts: str, predictor: BasePredictor, system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, return_seq: Literal[True] = True, eval_strategy: Optional[EvalStrategy] = None) -> Tuple[List[List[float]], List[List[str]]]

Evaluate a set of prompts using a given predictor.

This method orchestrates subsampling, prediction, caching, and result collection.

Note: Cannot return both aggregated scores and sequences (assertion will fail).

Source code in promptolution/tasks/base_task.py
def evaluate(
    self,
    prompts: Union[str, List[str]],
    predictor: "BasePredictor",
    system_prompts: Optional[Union[str, List[str]]] = None,
    return_agg_scores: bool = True,
    return_seq: bool = False,
    eval_strategy: Optional["EvalStrategy"] = None,
) -> Union[List[float], List[List[float]], Tuple[List[List[float]], List[List[str]]]]:
    """Evaluate a set of prompts using a given predictor.

    This method orchestrates subsampling, prediction, caching, and result collection.

    Note: Cannot return both aggregated scores and sequences (assertion will fail).
    """
    assert not (return_agg_scores and return_seq), "Cannot return both aggregated scores and sequences"

    seqs: List[str] = []

    prompts = [prompts] if isinstance(prompts, str) else prompts
    eval_strategy = eval_strategy or self.eval_strategy
    xs, ys = self.subsample(eval_strategy=eval_strategy)
    batches = self._prepare_batch(prompts, xs, ys, eval_strategy=eval_strategy)
    (prompts_to_evaluate, xs_to_evaluate, ys_to_evaluate) = ([], [], []) if not batches else zip(*batches)

    if prompts_to_evaluate:
        preds_seqs = predictor.predict(
            prompts=list(prompts_to_evaluate),
            xs=list(xs_to_evaluate),
            system_prompts=system_prompts,
            return_seq=return_seq,
        )
    else:
        preds_seqs = ([], []) if return_seq else []

    if return_seq:
        preds, seqs = preds_seqs if isinstance(preds_seqs, tuple) else (preds_seqs, [])
    else:
        preds = preds_seqs

    scores: List[float] = self._evaluate(list(xs_to_evaluate), list(ys_to_evaluate), preds)
    for i, cache_key in enumerate(batches):
        self.eval_cache[cache_key] = scores[i]
        if return_seq:
            self.seq_cache[cache_key] = seqs[i]

    return self._collect_results_from_cache(
        prompts,
        xs,
        ys,
        return_agg_scores,
        return_seq,
    )

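A usage sketch for evaluate (assumptions: task is any concrete task instance and predictor is an already constructed BasePredictor; neither construction is shown here):

prompts = [
    "Classify the sentiment of the text.",
    "Decide whether the following review is positive or negative.",
]

# One aggregated score per prompt (mean over the evaluated datapoints).
agg_scores = task.evaluate(prompts, predictor)

# Per-datapoint scores instead of aggregates.
per_datapoint_scores = task.evaluate(prompts, predictor, return_agg_scores=False)

# Per-datapoint scores plus the generating sequences (cannot be combined with aggregation).
scores, seqs = task.evaluate(prompts, predictor, return_agg_scores=False, return_seq=True)
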
increment_block_idx()

Increment the block index for subsampling.

Raises:

    ValueError: If the eval_strategy does not contain "block".

Source code in promptolution/tasks/base_task.py
def increment_block_idx(self) -> None:
    """Increment the block index for subsampling.

    Raises:
        ValueError: If the eval_strategy does not contain "block".
    """
    if "block" not in self.eval_strategy:
        raise ValueError("Block increment is only valid for block subsampling.")
    self.block_idx += 1
    if self.n_blocks > 0:  # Ensure n_blocks is not zero to avoid division by zero
        self.block_idx %= self.n_blocks
    else:
        self.block_idx = 0  # If no blocks, reset to 0

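A sketch of block-wise evaluation, assuming the task was created with eval_strategy="sequential_block" and that prompts and predictor exist as in the evaluate example above:

task.reset_block_idx()
for _ in range(task.n_blocks):
    block_scores = task.evaluate(prompts, predictor)  # evaluates the current block only
    task.increment_block_idx()  # advances to the next block, wrapping around after the last one
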
pop_datapoints(n=None, frac=None)

Pop a number of datapoints from the dataset.

Parameters:

    n (int, default None): Number of datapoints to pop.
    frac (float, default None): Fraction of datapoints to pop.

Returns:

    pd.DataFrame: DataFrame containing the popped datapoints.

Source code in promptolution/tasks/base_task.py
def pop_datapoints(self, n: Optional[int] = None, frac: Optional[float] = None) -> pd.DataFrame:
    """Pop a number of datapoints from the dataset.

    Args:
        n (int, optional): Number of datapoints to pop. Defaults to None.
        frac (float, optional): Fraction of datapoints to pop. Defaults to None.

    Returns:
        pd.DataFrame: DataFrame containing the popped datapoints.
    """
    assert n is None or frac is None, "Only one of n or frac can be specified."
    if n is not None:
        indices = self.rng.choice(len(self.xs), n, replace=False)
    elif frac is not None:
        indices = self.rng.choice(len(self.xs), int(len(self.xs) * frac), replace=False)
    else:
        raise ValueError("Either n or frac must be specified.")

    popped_xs = [self.xs[i] for i in indices]
    popped_ys = [self.ys[i] for i in indices]
    df_popped = pd.DataFrame({self.x_column: popped_xs, self.y_column: popped_ys})

    self.xs = [x for i, x in enumerate(self.xs) if i not in indices]
    self.ys = [y for i, y in enumerate(self.ys) if i not in indices]

    # Update n_blocks and block_idx based on the new dataset size
    self.n_blocks = len(self.xs) // self.n_subsamples if self.n_subsamples > 0 else 1
    self.block_idx = min(self.block_idx, self.n_blocks - 1) if self.n_blocks > 0 else 0

    # Clear cache for popped items (optional, but good practice if memory is a concern)
    keys_to_remove = []
    for key in self.eval_cache:
        if key[1] in popped_xs and key[2] in popped_ys:  # Check if the x and y correspond to popped data
            keys_to_remove.append(key)
    for key in keys_to_remove:
        self.eval_cache.pop(key, None)
        self.seq_cache.pop(key, None)

    return df_popped

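A brief usage sketch, assuming task is an existing task instance:

# Hold out 10% of the datapoints, e.g. as a separate validation split.
holdout_df = task.pop_datapoints(frac=0.1)

# Or remove an exact number of datapoints.
sample_df = task.pop_datapoints(n=5)
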
reset_block_idx()

Reset the block index for subsampling.

Raises:

    ValueError: If the eval_strategy does not contain "block".

Source code in promptolution/tasks/base_task.py
def reset_block_idx(self) -> None:
    """Reset the block index for subsampling.

    Raises:
        ValueError: If the eval_strategy does not contain "block".
    """
    if "block" not in self.eval_strategy:
        raise ValueError("Block reset is only valid for block subsampling.")
    self.block_idx = 0

subsample(eval_strategy=None)

Subsample the dataset based on the specified parameters.

Parameters:

    eval_strategy (EvalStrategy, default None): Subsampling strategy to use instead of self.eval_strategy.

Returns:

    Tuple[List[str], List[str]]: Subsampled input data and labels.

Source code in promptolution/tasks/base_task.py
def subsample(self, eval_strategy: Optional["EvalStrategy"] = None) -> Tuple[List[str], List[str]]:
    """Subsample the dataset based on the specified parameters.

    Args:
        eval_strategy (EvalStrategy, optional): Subsampling strategy to use instead of self.eval_strategy. Defaults to None.

    Returns:
        Tuple[List[str], List[str]]: Subsampled input data and labels.
    """
    if eval_strategy is None:
        eval_strategy = self.eval_strategy

    if eval_strategy in ["full", "evaluated"]:
        return self.xs, self.ys
    elif eval_strategy == "subsample":
        indices = self.rng.choice(len(self.xs), min(self.n_subsamples, len(self.xs)), replace=False)
        return [self.xs[i] for i in indices], [self.ys[i] for i in indices]
    elif eval_strategy == "random_block":
        block_id = self.rng.integers(0, self.n_blocks)
        start_idx = block_id * self.n_subsamples
        end_idx = min((block_id + 1) * self.n_subsamples, len(self.xs))
        indices = np.arange(start_idx, end_idx)
        return [self.xs[i] for i in indices], [self.ys[i] for i in indices]
    elif eval_strategy == "sequential_block":
        start_idx = self.block_idx * self.n_subsamples
        end_idx = min((self.block_idx + 1) * self.n_subsamples, len(self.xs))
        indices = np.arange(start_idx, end_idx)
        return [self.xs[i] for i in indices], [self.ys[i] for i in indices]
    else:
        raise ValueError(f"Unknown subsampling strategy: '{eval_strategy}'")

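A quick sketch of calling subsample directly, assuming task is an existing task instance:

# Draw a random subsample (at most n_subsamples datapoints) without changing task.eval_strategy.
xs_sub, ys_sub = task.subsample(eval_strategy="subsample")

# Take the current sequential block instead.
xs_block, ys_block = task.subsample(eval_strategy="sequential_block")
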
classification_tasks

Module for classification tasks.

ClassificationTask

Bases: BaseTask

A class representing a classification task in the promptolution library.

This class handles the loading and management of classification datasets, as well as the evaluation of predictors on these datasets.

Source code in promptolution/tasks/classification_tasks.py
class ClassificationTask(BaseTask):
    """A class representing a classification task in the promptolution library.

    This class handles the loading and management of classification datasets,
    as well as the evaluation of predictors on these datasets.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        task_description: Optional[str] = None,
        x_column: str = "x",
        y_column: str = "y",
        n_subsamples: int = 30,
        eval_strategy: Literal["full", "subsample", "sequential_block", "random_block"] = "full",
        seed: int = 42,
        metric: Callable[[Any, Any], float] = accuracy_score,
        config: Optional["ExperimentConfig"] = None,
    ) -> None:
        """Initialize the ClassificationTask from a pandas DataFrame.

        Args:
            df (pd.DataFrame): Input DataFrame containing the data
            task_description (str): Description of the task
            x_column (str, optional): Name of the column containing input texts. Defaults to "x".
            y_column (str, optional): Name of the column containing labels. Defaults to "y".
            n_subsamples (int, optional): Number of subsamples to use for evaluation. Defaults to 30.
            eval_strategy (str, optional): Subsampling strategy to use. Options:
                - "full": Uses the entire dataset for evaluation.
                - "evaluated": Uses only previously evaluated datapoints from the cache.
                - "subsample": Randomly selects n_subsamples datapoints without replacement.
                - "sequential_block": Uses a block of n_subsamples consecutive datapoints, advancing through blocks sequentially.
                - "random_block": Randomly selects a block of n_subsamples consecutive datapoints.
                Defaults to "full".
            seed (int, optional): Random seed for reproducibility. Defaults to 42.
            metric (Callable, optional): Metric to use for evaluation. Defaults to accuracy_score.
            config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
        """
        self.metric = metric
        super().__init__(
            df=df,
            x_column=x_column,
            y_column=y_column,
            task_description=task_description,
            n_subsamples=n_subsamples,
            eval_strategy=eval_strategy,
            seed=seed,
            config=config,
        )
        self.ys: List[str] = (
            df[self.y_column].str.lower().values.tolist()
        )  # Ensure y values are lowercase for consistent comparison
        self.classes = np.unique(self.ys)

    def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[float]:
        """Calculate the score for each prediction."""
        scores = []
        for pred, y in zip(preds, ys):
            scores.append(self.metric([y], [pred]))
        return scores

__init__(df, task_description=None, x_column='x', y_column='y', n_subsamples=30, eval_strategy='full', seed=42, metric=accuracy_score, config=None)

Initialize the ClassificationTask from a pandas DataFrame.

Parameters:

    df (DataFrame, required): Input DataFrame containing the data.
    task_description (str, default None): Description of the task.
    x_column (str, default 'x'): Name of the column containing input texts.
    y_column (str, default 'y'): Name of the column containing labels.
    n_subsamples (int, default 30): Number of subsamples to use for evaluation.
    eval_strategy (str, default 'full'): Subsampling strategy to use. Options:
        - "full": Uses the entire dataset for evaluation.
        - "evaluated": Uses only previously evaluated datapoints from the cache.
        - "subsample": Randomly selects n_subsamples datapoints without replacement.
        - "sequential_block": Uses a block of n_subsamples consecutive datapoints, advancing through blocks sequentially.
        - "random_block": Randomly selects a block of n_subsamples consecutive datapoints.
    seed (int, default 42): Random seed for reproducibility.
    metric (Callable, default accuracy_score): Metric to use for evaluation.
    config (ExperimentConfig, default None): Configuration for the task, overriding defaults.

Source code in promptolution/tasks/classification_tasks.py
def __init__(
    self,
    df: pd.DataFrame,
    task_description: Optional[str] = None,
    x_column: str = "x",
    y_column: str = "y",
    n_subsamples: int = 30,
    eval_strategy: Literal["full", "subsample", "sequential_block", "random_block"] = "full",
    seed: int = 42,
    metric: Callable[[Any, Any], float] = accuracy_score,
    config: Optional["ExperimentConfig"] = None,
) -> None:
    """Initialize the ClassificationTask from a pandas DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame containing the data
        task_description (str): Description of the task
        x_column (str, optional): Name of the column containing input texts. Defaults to "x".
        y_column (str, optional): Name of the column containing labels. Defaults to "y".
        n_subsamples (int, optional): Number of subsamples to use for evaluation. Defaults to 30.
        eval_strategy (str, optional): Subsampling strategy to use. Options:
            - "full": Uses the entire dataset for evaluation.
            - "evaluated": Uses only previously evaluated datapoints from the cache.
            - "subsample": Randomly selects n_subsamples datapoints without replacement.
            - "sequential_block": Uses a block of n_subsamples consecutive datapoints, advancing through blocks sequentially.
            - "random_block": Randomly selects a block of n_subsamples consecutive datapoints.
            Defaults to "full".
        seed (int, optional): Random seed for reproducibility. Defaults to 42.
        metric (Callable, optional): Metric to use for evaluation. Defaults to accuracy_score.
        config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
    """
    self.metric = metric
    super().__init__(
        df=df,
        x_column=x_column,
        y_column=y_column,
        task_description=task_description,
        n_subsamples=n_subsamples,
        eval_strategy=eval_strategy,
        seed=seed,
        config=config,
    )
    self.ys: List[str] = (
        df[self.y_column].str.lower().values.tolist()
    )  # Ensure y values are lowercase for consistent comparison
    self.classes = np.unique(self.ys)

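A minimal usage sketch (the DataFrame contents are invented; accuracy_score is the default metric):

import pandas as pd

from promptolution.tasks.classification_tasks import ClassificationTask

df = pd.DataFrame(
    {
        "x": ["Great movie!", "Terrible plot.", "Loved every minute.", "Would not recommend."],
        "y": ["positive", "negative", "positive", "negative"],
    }
)
task = ClassificationTask(df, task_description="Classify the sentiment as positive or negative.")
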
judge_tasks

Module for judge tasks.

JudgeTask

Bases: BaseTask

Task that evaluates a predictor using an LLM as a judge, optionally accepting a ground truth.

Source code in promptolution/tasks/judge_tasks.py
class JudgeTask(BaseTask):
    """Task that evaluates a predictor using an LLM as a judge, optionally accepting a ground truth."""

    def __init__(
        self,
        df: pd.DataFrame,
        judge_llm: "BaseLLM",
        x_column: str = "x",
        y_column: Optional[str] = None,
        task_description: Optional[str] = None,
        n_subsamples: int = 30,
        eval_strategy: "EvalStrategy" = "full",
        seed: int = 42,
        judge_prompt: Optional[str] = None,
        min_score: float = -5.0,
        max_score: float = 5.0,
        config: Optional["ExperimentConfig"] = None,
    ):
        """Initialize the JudgeTask.

        Args:
            df (pd.DataFrame): The input DataFrame containing the data.
            judge_llm (BaseLLM): The LLM judging the predictions.
            x_column (str): Name of the column containing input texts.
            y_column (Optional[str]): Name of the column containing labels/ground truth (if applicable).
            task_description (Optional[str]): Description of the task, passed to the Judge-LLM and Meta-LLM.
            n_subsamples (int): Number of subsamples to use for evaluation.
            eval_strategy (EvalStrategy): Subsampling strategy to use for evaluation.
            seed (int): Random seed for reproducibility.
            judge_prompt (Optional[str]): Custom prompt for the judge. Note: The score of the Judge will be extracted inside <final_score> tags.
            min_score (float): Minimum score for evaluation.
            max_score (float): Maximum score for evaluation.
            config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
        """
        if judge_prompt is None:
            judge_prompt = JUDGE_PROMPT_WITH_GROUND_TRUTH if y_column else JUDGE_PROMPT_WITHOUT_GROUND_TRUTH
        self.judge_prompt = judge_prompt
        self.min_score = min_score
        self.max_score = max_score

        super().__init__(
            df=df,
            x_column=x_column,
            y_column=y_column,
            task_description=task_description,
            n_subsamples=n_subsamples,
            eval_strategy=eval_strategy,
            seed=seed,
            config=config,
        )
        self.judge_llm = judge_llm

    def _construct_judge_prompt(self, x: str, pred: str, y: Optional[str] = None) -> str:
        """Constructs the judge prompt based on whether ground truth is available."""
        if y is not None:
            prompt = self.judge_prompt.replace("{ground_truth}", str(y))
        else:
            prompt = self.judge_prompt

        task_description = self.task_description or ""
        prompt = prompt.replace("{task}", task_description).replace("{input}", x).replace("{prediction}", pred)
        return prompt

    def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[float]:
        """Calculate the score for each prediction using the LLM judge."""
        prompts: List[str] = []
        for x, y, pred in zip(xs, ys, preds):
            judge_prompt = self._construct_judge_prompt(x, pred, y)
            prompts.append(judge_prompt)
        judge_responses = self.judge_llm.get_response(prompts)
        scores_str = extract_from_tag(judge_responses, "<final_score>", "</final_score>")
        scores = []
        for score_str, judge_response in zip(scores_str, judge_responses):
            try:
                # only numeric chars, - or . are allowed
                score_str = "".join(filter(lambda c: c.isdigit() or c in "-.", score_str))
                score = float(score_str)
                # normalize from [min_score, max_score] to [0, 1]
                score = (score - self.min_score) / (self.max_score - self.min_score)
                score = max(0.0, min(1.0, score))
            except ValueError:
                logger.warning(f"Failed to parse score '{score_str}' as float. Defaulting to a score of 0.0.")
                score = 0.0

            scores.append(score)

        return scores

__init__(df, judge_llm, x_column='x', y_column=None, task_description=None, n_subsamples=30, eval_strategy='full', seed=42, judge_prompt=None, min_score=-5.0, max_score=5.0, config=None)

Initialize the JudgeTask.

Parameters:

    df (DataFrame, required): The input DataFrame containing the data.
    judge_llm (BaseLLM, required): The LLM judging the predictions.
    x_column (str, default 'x'): Name of the column containing input texts.
    y_column (Optional[str], default None): Name of the column containing labels/ground truth (if applicable).
    task_description (Optional[str], default None): Description of the task, passed to the Judge-LLM and Meta-LLM.
    n_subsamples (int, default 30): Number of subsamples to use for evaluation.
    eval_strategy (EvalStrategy, default 'full'): Subsampling strategy to use for evaluation.
    seed (int, default 42): Random seed for reproducibility.
    judge_prompt (Optional[str], default None): Custom prompt for the judge. Note: the judge's score is extracted from <final_score> tags.
    min_score (float, default -5.0): Minimum score for evaluation.
    max_score (float, default 5.0): Maximum score for evaluation.
    config (ExperimentConfig, default None): Configuration for the task, overriding defaults.

Source code in promptolution/tasks/judge_tasks.py
def __init__(
    self,
    df: pd.DataFrame,
    judge_llm: "BaseLLM",
    x_column: str = "x",
    y_column: Optional[str] = None,
    task_description: Optional[str] = None,
    n_subsamples: int = 30,
    eval_strategy: "EvalStrategy" = "full",
    seed: int = 42,
    judge_prompt: Optional[str] = None,
    min_score: float = -5.0,
    max_score: float = 5.0,
    config: Optional["ExperimentConfig"] = None,
):
    """Initialize the JudgeTask.

    Args:
        df (pd.DataFrame): The input DataFrame containing the data.
        judge_llm (BaseLLM): The LLM judging the predictions.
        x_column (str): Name of the column containing input texts.
        y_column (Optional[str]): Name of the column containing labels/ground truth (if applicable).
        task_description (Optional[str]): Description of the task, passed to the Judge-LLM and Meta-LLM.
        n_subsamples (int): Number of subsamples to use for evaluation.
        eval_strategy (EvalStrategy): Subsampling strategy to use for evaluation.
        seed (int): Random seed for reproducibility.
        judge_prompt (Optional[str]): Custom prompt for the judge. Note: The score of the Judge will be extracted inside <final_score> tags.
        min_score (float): Minimum score for evaluation.
        max_score (float): Maximum score for evaluation.
        config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
    """
    if judge_prompt is None:
        judge_prompt = JUDGE_PROMPT_WITH_GROUND_TRUTH if y_column else JUDGE_PROMPT_WITHOUT_GROUND_TRUTH
    self.judge_prompt = judge_prompt
    self.min_score = min_score
    self.max_score = max_score

    super().__init__(
        df=df,
        x_column=x_column,
        y_column=y_column,
        task_description=task_description,
        n_subsamples=n_subsamples,
        eval_strategy=eval_strategy,
        seed=seed,
        config=config,
    )
    self.judge_llm = judge_llm

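A hedged construction sketch; judge_llm is a placeholder for any BaseLLM instance (its construction depends on the chosen backend and is not shown here):

import pandas as pd

from promptolution.tasks.judge_tasks import JudgeTask

judge_llm = ...  # placeholder: any BaseLLM instance

df = pd.DataFrame({"x": ["Summarize: The cat sat on the mat."]})
task = JudgeTask(
    df,
    judge_llm=judge_llm,
    task_description="Write a one-sentence summary.",  # no y_column, so the judge scores without ground truth
)
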
reward_tasks

Module for Reward tasks.

RewardTask

Bases: BaseTask

A task that evaluates a predictor using a reward function.

This task takes a DataFrame, a column name for input data, and a reward function. The reward function takes in a prediction as input and returns a scalar reward.

Source code in promptolution/tasks/reward_tasks.py
class RewardTask(BaseTask):
    """A task that evaluates a predictor using a reward function.

    This task takes a DataFrame, a column name for input data, and a reward function.
    The reward function takes in a prediction as input and returns a scalar reward.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        reward_function: Callable[[str], float],
        x_column: str = "x",
        task_description: Optional[str] = None,
        n_subsamples: int = 30,
        eval_strategy: "EvalStrategy" = "full",
        seed: int = 42,
        config: Optional["ExperimentConfig"] = None,
    ) -> None:
        """Initialize the RewardTask.

        Args:
            df (pd.DataFrame): Input DataFrame containing the data.
            reward_function (Callable): Function that takes a prediction and returns a reward score. Note: The optimizers aim to maximize.
            x_column (str, optional): Name of the column containing input texts. Defaults to "x".
            task_description (str, optional): Description of the task.
            n_subsamples (int, optional): Number of subsamples to use. Defaults to 30.
            eval_strategy (str, optional): Subsampling strategy to use. Defaults to "full".
            seed (int, optional): Random seed for reproducibility. Defaults to 42.
            config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
        """
        self.reward_function = reward_function
        super().__init__(
            df=df,
            x_column=x_column,
            task_description=task_description,
            n_subsamples=n_subsamples,
            eval_strategy=eval_strategy,
            seed=seed,
            config=config,
        )

    def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[float]:
        """Calculate the score for each prediction using the reward function."""
        rewards = [self.reward_function(pred) for pred in preds]
        return rewards

__init__(df, reward_function, x_column='x', task_description=None, n_subsamples=30, eval_strategy='full', seed=42, config=None)

Initialize the RewardTask.

Parameters:

    df (DataFrame, required): Input DataFrame containing the data.
    reward_function (Callable, required): Function that takes a prediction and returns a reward score. Note: the optimizers aim to maximize.
    x_column (str, default 'x'): Name of the column containing input texts.
    task_description (str, default None): Description of the task.
    n_subsamples (int, default 30): Number of subsamples to use.
    eval_strategy (str, default 'full'): Subsampling strategy to use.
    seed (int, default 42): Random seed for reproducibility.
    config (ExperimentConfig, default None): Configuration for the task, overriding defaults.

Source code in promptolution/tasks/reward_tasks.py
def __init__(
    self,
    df: pd.DataFrame,
    reward_function: Callable[[str], float],
    x_column: str = "x",
    task_description: Optional[str] = None,
    n_subsamples: int = 30,
    eval_strategy: "EvalStrategy" = "full",
    seed: int = 42,
    config: Optional["ExperimentConfig"] = None,
) -> None:
    """Initialize the RewardTask.

    Args:
        df (pd.DataFrame): Input DataFrame containing the data.
        reward_function (Callable): Function that takes a prediction and returns a reward score. Note: The optimizers aim to maximize.
        x_column (str, optional): Name of the column containing input texts. Defaults to "x".
        task_description (str, optional): Description of the task.
        n_subsamples (int, optional): Number of subsamples to use. Defaults to 30.
        eval_strategy (str, optional): Subsampling strategy to use. Defaults to "full".
        seed (int, optional): Random seed for reproducibility. Defaults to 42.
        config (ExperimentConfig, optional): Configuration for the task, overriding defaults.
    """
    self.reward_function = reward_function
    super().__init__(
        df=df,
        x_column=x_column,
        task_description=task_description,
        n_subsamples=n_subsamples,
        eval_strategy=eval_strategy,
        seed=seed,
        config=config,
    )
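
A minimal usage sketch with an invented reward function (the optimizers maximize the returned score):

import pandas as pd

from promptolution.tasks.reward_tasks import RewardTask


def length_reward(pred: str) -> float:
    """Toy reward: fraction of a 100-character budget that the prediction uses, capped at 1.0."""
    return min(len(pred) / 100.0, 1.0)


df = pd.DataFrame({"x": ["Describe the weather today.", "Explain photosynthesis briefly."]})
task = RewardTask(df, reward_function=length_reward, task_description="Answer concisely.")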