LLMs

Module for Large Language Models.

get_llm(model_id, *args, **kwargs)

Factory function to create and return a language model instance based on the provided model_id.

This function supports four types of language models:

1. DummyLLM: a mock LLM for testing purposes.
2. LocalLLM: for running models locally via Hugging Face Transformers.
3. VLLM: for running models with the vLLM library.
4. APILLM: for API-based models (the default when the model_id matches no other type).

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model_id` | `str` | Identifier for the model to use. Special cases: `"dummy"` for DummyLLM, `"local-{model_name}"` for LocalLLM, `"vllm-{model_name}"` for VLLM, any other string for APILLM. | required |
| `*args` | | Variable length argument list passed to the LLM constructor. | `()` |
| `**kwargs` | | Arbitrary keyword arguments passed to the LLM constructor. | `{}` |

Returns:

An instance of DummyLLM, LocalLLM, VLLM, or APILLM, selected by the model_id.

Source code in promptolution/llms/__init__.py
def get_llm(model_id: str, *args, **kwargs):
    """Factory function to create and return a language model instance based on the provided model_id.

    This function supports four types of language models:
    1. DummyLLM: A mock LLM for testing purposes.
    2. LocalLLM: For running models locally.
    3. VLLM: For running models using the vLLM library.
    4. APILLM: For API-based models (default if not matching other types).

    Args:
        model_id (str): Identifier for the model to use. Special cases:
                        - "dummy" for DummyLLM
                        - "local-{model_name}" for LocalLLM
                        - "vllm-{model_name}" for VLLM
                        - Any other string for APILLM
        *args: Variable length argument list passed to the LLM constructor.
        **kwargs: Arbitrary keyword arguments passed to the LLM constructor.

    Returns:
        An instance of DummyLLM, LocalLLM, VLLM, or APILLM based on the model_id.
    """
    if model_id == "dummy":
        return DummyLLM(*args, **kwargs)
    if "local" in model_id:
        model_id = "-".join(model_id.split("-")[1:])
        return LocalLLM(model_id, *args, **kwargs)
    if "vllm" in model_id:
        model_id = "-".join(model_id.split("-")[1:])
        return VLLM(model_id, *args, **kwargs)
    return APILLM(model_id, *args, **kwargs)
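
A minimal usage sketch; the model identifiers and API key below are illustrative placeholders:

```python
from promptolution.llms import get_llm

dummy = get_llm("dummy")                                        # DummyLLM, no external dependencies
local = get_llm("local-meta-llama/Meta-Llama-3-8B-Instruct")    # LocalLLM via transformers
served = get_llm("vllm-meta-llama/Meta-Llama-3-8B-Instruct")    # VLLM engine
api = get_llm("gpt-4o-mini", token="YOUR_API_KEY")              # APILLM (OpenAI in this case)

print(dummy.get_response(["Classify this sentence."]))
```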

api_llm

Module to interface with various language models through their respective APIs.

APILLM

Bases: BaseLLM

A class to interface with various language models through their respective APIs.

This class supports Claude (Anthropic), GPT (OpenAI), and LLaMA (DeepInfra) models. It handles API key management, model initialization, and provides methods for both synchronous and asynchronous inference.

Attributes:

| Name | Type | Description |
|------|------|-------------|
| `model` | | The initialized language model instance. |

Methods:

| Name | Description |
|------|-------------|
| `get_response` | Synchronously get responses for a list of prompts. |
| `get_response_async` | Asynchronously get responses for a list of prompts. |

Source code in promptolution/llms/api_llm.py
class APILLM(BaseLLM):
    """A class to interface with various language models through their respective APIs.

    This class supports Claude (Anthropic), GPT (OpenAI), and LLaMA (DeepInfra) models.
    It handles API key management, model initialization, and provides methods for
    both synchronous and asynchronous inference.

    Attributes:
        model: The initialized language model instance.

    Methods:
        get_response: Synchronously get responses for a list of prompts.
        get_response_async: Asynchronously get responses for a list of prompts.
    """

    def __init__(self, model_id: str, token: str = None, **kwargs: Any):
        """Initialize the APILLM with a specific model.

        Args:
            model_id (str): Identifier for the model to use.
            token (str): API key for the model.

        Raises:
            ValueError: If an unknown model identifier is provided.
        """
        super().__init__()
        if "claude" in model_id:
            self.model = ChatAnthropic(model=model_id, api_key=token)
        elif "gpt" in model_id:
            self.model = ChatOpenAI(model=model_id, api_key=token)
        else:
            self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token)

    def _get_response(self, prompts: List[str]) -> List[str]:
        """Get responses for a list of prompts in a synchronous manner.

        This method includes retry logic for handling connection errors and rate limits.

        Args:
            prompts (list[str]): List of input prompts.

        Returns:
            list[str]: List of model responses.

        Raises:
            requests.exceptions.ConnectionError: If max retries are exceeded.
        """
        max_retries = 100
        delay = 3
        attempts = 0

        nest_asyncio.apply()

        while attempts < max_retries:
            try:
                responses = asyncio.run(self.get_response_async(prompts))
                return responses
            except requests.exceptions.ConnectionError as e:
                attempts += 1
                logger.critical(
                    f"Connection error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds..."
                )
                time.sleep(delay)
            except openai.RateLimitError as e:
                attempts += 1
                logger.critical(
                    f"Rate limit error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds..."
                )
                time.sleep(delay)

        # If the loop exits, it means max retries were reached
        raise requests.exceptions.ConnectionError("Max retries exceeded. Connection could not be established.")

    async def get_response_async(self, prompts: list[str], max_concurrent_calls=200) -> list[str]:
        """Asynchronously get responses for a list of prompts.

        This method uses a semaphore to limit the number of concurrent API calls.

        Args:
            prompts (list[str]): List of input prompts.
            max_concurrent_calls (int): Maximum number of concurrent API calls allowed.

        Returns:
            list[str]: List of model responses.
        """
        semaphore = asyncio.Semaphore(max_concurrent_calls)
        tasks = []

        for prompt in prompts:
            tasks.append(invoke_model(prompt, self.model, semaphore))

        responses = await asyncio.gather(*tasks)
        return responses
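
A minimal usage sketch; the model name and token are placeholders, and the call requires network access to the corresponding provider:

```python
llm = APILLM("gpt-4o-mini", token="YOUR_OPENAI_API_KEY")
responses = llm.get_response(["Summarize the goal of prompt optimization in one sentence."])
print(responses[0])
```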

__init__(model_id, token=None, **kwargs)

Initialize the APILLM with a specific model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model_id` | `str` | Identifier for the model to use. | required |
| `token` | `str` | API key for the model. | `None` |

Raises:

| Type | Description |
|------|-------------|
| `ValueError` | If an unknown model identifier is provided. |

Source code in promptolution/llms/api_llm.py
def __init__(self, model_id: str, token: str = None, **kwargs: Any):
    """Initialize the APILLM with a specific model.

    Args:
        model_id (str): Identifier for the model to use.
        token (str): API key for the model.

    Raises:
        ValueError: If an unknown model identifier is provided.
    """
    super().__init__()
    if "claude" in model_id:
        self.model = ChatAnthropic(model=model_id, api_key=token)
    elif "gpt" in model_id:
        self.model = ChatOpenAI(model=model_id, api_key=token)
    else:
        self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token)

get_response_async(prompts, max_concurrent_calls=200) async

Asynchronously get responses for a list of prompts.

This method uses a semaphore to limit the number of concurrent API calls.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `prompts` | `list[str]` | List of input prompts. | required |
| `max_concurrent_calls` | `int` | Maximum number of concurrent API calls allowed. | `200` |

Returns:

| Type | Description |
|------|-------------|
| `list[str]` | List of model responses. |

Source code in promptolution/llms/api_llm.py
async def get_response_async(self, prompts: list[str], max_concurrent_calls=200) -> list[str]:
    """Asynchronously get responses for a list of prompts.

    This method uses a semaphore to limit the number of concurrent API calls.

    Args:
        prompts (list[str]): List of input prompts.
        max_concurrent_calls (int): Maximum number of concurrent API calls allowed.

    Returns:
        list[str]: List of model responses.
    """
    semaphore = asyncio.Semaphore(max_concurrent_calls)
    tasks = []

    for prompt in prompts:
        tasks.append(invoke_model(prompt, self.model, semaphore))

    responses = await asyncio.gather(*tasks)
    return responses
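
Callers that already run inside an event loop can await the asynchronous path directly. A minimal sketch, assuming an initialized APILLM instance bound to `llm`:

```python
import asyncio

async def main():
    # Limit concurrency to 10 in-flight requests for this batch.
    responses = await llm.get_response_async(
        ["Rewrite this prompt to be more specific.", "List two evaluation criteria."],
        max_concurrent_calls=10,
    )
    for r in responses:
        print(r)

asyncio.run(main())
```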

invoke_model(prompt, model, semaphore) async

Asynchronously invoke a language model with retry logic.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `prompt` | `str` | The input prompt for the model. | required |
| `model` | | The language model to invoke. | required |
| `semaphore` | `Semaphore` | Semaphore to limit concurrent calls. | required |

Returns:

| Type | Description |
|------|-------------|
| `str` | The model's response content. |

Raises:

| Type | Description |
|------|-------------|
| `ChatDeepInfraException` | If all retry attempts fail. |

Source code in promptolution/llms/api_llm.py
async def invoke_model(prompt, model, semaphore):
    """Asynchronously invoke a language model with retry logic.

    Args:
        prompt (str): The input prompt for the model.
        model: The language model to invoke.
        semaphore (asyncio.Semaphore): Semaphore to limit concurrent calls.

    Returns:
        str: The model's response content.

    Raises:
        ChatDeepInfraException: If all retry attempts fail.
    """
    async with semaphore:
        max_retries = 100
        delay = 3
        attempts = 0

        while attempts < max_retries:
            try:
                response = await model.ainvoke([HumanMessage(content=prompt)])
                return response.content
            except ChatDeepInfraException as e:
                print(f"DeepInfra error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds...")
                attempts += 1
                await asyncio.sleep(delay)
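
invoke_model is normally called through get_response_async, but it can also be awaited directly. A minimal sketch, assuming an initialized APILLM instance `llm` whose underlying chat model is `llm.model`:

```python
import asyncio

async def main():
    semaphore = asyncio.Semaphore(5)  # allow at most 5 concurrent calls
    content = await invoke_model("What does a semaphore do here?", llm.model, semaphore)
    print(content)

asyncio.run(main())
```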

base_llm

Base module for LLMs in the promptolution library.

BaseLLM

Bases: ABC

Abstract base class for Language Models in the promptolution library.

This class defines the interface that all concrete LLM implementations should follow.

Methods:

| Name | Description |
|------|-------------|
| `get_response` | An abstract method that should be implemented by subclasses to generate responses for given prompts. |

Source code in promptolution/llms/base_llm.py
class BaseLLM(ABC):
    """Abstract base class for Language Models in the promptolution library.

    This class defines the interface that all concrete LLM implementations should follow.

    Methods:
        get_response: An abstract method that should be implemented by subclasses
                      to generate responses for given prompts.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the LLM."""
        self.input_token_count = 0
        self.output_token_count = 0

    def get_token_count(self):
        """Get the current count of input and output tokens.

        Returns:
            dict: A dictionary containing the input and output token counts.
        """
        return {
            "input_tokens": self.input_token_count,
            "output_tokens": self.output_token_count,
            "total_tokens": self.input_token_count + self.output_token_count,
        }

    def reset_token_count(self):
        """Reset the token counters to zero."""
        self.input_token_count = 0
        self.output_token_count = 0

    def update_token_count(self, inputs: List[str], outputs: List[str]):
        """Update the token count based on the given inputs and outputs.

        Args:
            inputs (List[str]): A list of input prompts.
            outputs (List[str]): A list of generated responses.
        """
        logger.warning("Token count is approximated using word count split by whitespace, not an actual tokenizer.")
        input_tokens = sum([len(i.split()) for i in inputs])
        output_tokens = sum([len(o.split()) for o in outputs])
        self.input_token_count += input_tokens
        self.output_token_count += output_tokens

    def get_response(self, prompts: str) -> str:
        """Generate responses for the given prompts.

        This method calls the _get_response method to generate responses
        for the given prompts. It also updates the token count for the
        input and output tokens.

        Args:
            prompts (str or List[str]): Input prompt(s). If a single string is provided,
                                        it's converted to a list containing that string.

        Returns:
            List[str]: A list of generated responses, one for each input prompt.
        """
        if isinstance(prompts, str):
            prompts = [prompts]
        responses = self._get_response(prompts)
        self.update_token_count(prompts, responses)

        return responses

    @abstractmethod
    def _get_response(self, prompts: List[str]) -> List[str]:
        """Generate responses for the given prompts.

        This method should be implemented by subclasses to define how
        the LLM generates responses.

        Args:
            prompts (List[str]): A list of input prompts.

        Returns:
            List[str]: A list of generated responses corresponding to the input prompts.
        """
        pass
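
A minimal sketch of a concrete subclass; EchoLLM is a hypothetical example, not part of the library, shown only to illustrate which method must be overridden:

```python
from typing import List

class EchoLLM(BaseLLM):
    """Toy LLM that echoes prompts back, useful for wiring tests."""

    def _get_response(self, prompts: List[str]) -> List[str]:
        # The base class handles list conversion and token bookkeeping.
        return [f"echo: {p}" for p in prompts]

llm = EchoLLM()
print(llm.get_response("hello"))  # ['echo: hello']
print(llm.get_token_count())      # counts approximated by whitespace splitting
```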

__init__(*args, **kwargs)

Initialize the LLM.

Source code in promptolution/llms/base_llm.py
def __init__(self, *args, **kwargs):
    """Initialize the LLM."""
    self.input_token_count = 0
    self.output_token_count = 0

get_response(prompts)

Generate responses for the given prompts.

This method calls the _get_response method to generate responses for the given prompts. It also updates the token count for the input and output tokens.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `prompts` | `str or List[str]` | Input prompt(s). If a single string is provided, it's converted to a list containing that string. | required |

Returns:

| Type | Description |
|------|-------------|
| `List[str]` | A list of generated responses, one for each input prompt. |

Source code in promptolution/llms/base_llm.py
def get_response(self, prompts: str) -> str:
    """Generate responses for the given prompts.

    This method calls the _get_response method to generate responses
    for the given prompts. It also updates the token count for the
    input and output tokens.

    Args:
        prompts (str or List[str]): Input prompt(s). If a single string is provided,
                                    it's converted to a list containing that string.

    Returns:
        List[str]: A list of generated responses, one for each input prompt.
    """
    if isinstance(prompts, str):
        prompts = [prompts]
    responses = self._get_response(prompts)
    self.update_token_count(prompts, responses)

    return responses

get_token_count()

Get the current count of input and output tokens.

Returns:

| Type | Description |
|------|-------------|
| `dict` | A dictionary containing the input and output token counts. |

Source code in promptolution/llms/base_llm.py
def get_token_count(self):
    """Get the current count of input and output tokens.

    Returns:
        dict: A dictionary containing the input and output token counts.
    """
    return {
        "input_tokens": self.input_token_count,
        "output_tokens": self.output_token_count,
        "total_tokens": self.input_token_count + self.output_token_count,
    }

reset_token_count()

Reset the token counters to zero.

Source code in promptolution/llms/base_llm.py
def reset_token_count(self):
    """Reset the token counters to zero."""
    self.input_token_count = 0
    self.output_token_count = 0

update_token_count(inputs, outputs)

Update the token count based on the given inputs and outputs.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `inputs` | `List[str]` | A list of input prompts. | required |
| `outputs` | `List[str]` | A list of generated responses. | required |

Source code in promptolution/llms/base_llm.py
def update_token_count(self, inputs: List[str], outputs: List[str]):
    """Update the token count based on the given inputs and outputs.

    Args:
        inputs (List[str]): A list of input prompts.
        outputs (List[str]): A list of generated responses.
    """
    logger.warning("Token count is approximated using word count split by whitespace, not an actual tokenizer.")
    input_tokens = sum([len(i.split()) for i in inputs])
    output_tokens = sum([len(o.split()) for o in outputs])
    self.input_token_count += input_tokens
    self.output_token_count += output_tokens
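
A minimal sketch of the token bookkeeping, assuming any initialized BaseLLM subclass bound to `llm`:

```python
llm.reset_token_count()
llm.get_response(["What is the capital of France?"])

counts = llm.get_token_count()
# Approximate counts (whitespace-split words, not tokenizer tokens).
print(counts["input_tokens"], counts["output_tokens"], counts["total_tokens"])
```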

DummyLLM

Bases: BaseLLM

A dummy implementation of the BaseLLM for testing purposes.

This class generates random responses for given prompts, simulating the behavior of a language model without actually performing any complex natural language processing.

Source code in promptolution/llms/base_llm.py
class DummyLLM(BaseLLM):
    """A dummy implementation of the BaseLLM for testing purposes.

    This class generates random responses for given prompts, simulating
    the behavior of a language model without actually performing any
    complex natural language processing.
    """

    def _get_response(self, prompts: str) -> str:
        """Generate random responses for the given prompts.

        This method creates silly, random responses enclosed in <prompt> tags.
        It's designed for testing and demonstration purposes.

        Args:
            prompts (str or List[str]): Input prompt(s). If a single string is provided,
                                        it's converted to a list containing that string.

        Returns:
            List[str]: A list of randomly generated responses, one for each input prompt.
        """
        if isinstance(prompts, str):
            prompts = [prompts]
        results = []
        for p in prompts:
            r = np.random.rand()
            if r < 0.3:
                results += [f"Joooo wazzuppp <prompt>hier gehts los {r} </prompt> {p}"]
            elif 0.3 <= r < 0.6:
                results += [f"was das hier? <prompt>peter lustig{r}</prompt> {p}"]
            else:
                results += [f"hier ist ein <prompt>test{r}</prompt> {p}"]

        return results
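
Because DummyLLM wraps part of each response in <prompt> tags, it is convenient for testing downstream parsing without any model or API. A minimal sketch:

```python
import re

llm = DummyLLM()
responses = llm.get_response(["optimize this instruction"])

# Extract the text between the <prompt> tags, as an optimizer might.
match = re.search(r"<prompt>(.*?)</prompt>", responses[0], re.DOTALL)
print(match.group(1) if match else responses[0])
```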

local_llm

Module for running language models locally using the Hugging Face Transformers library.

LocalLLM

Bases: BaseLLM

A class for running language models locally using the Hugging Face Transformers library.

This class sets up a text generation pipeline with specified model parameters and provides a method to generate responses for given prompts.

Attributes:

| Name | Type | Description |
|------|------|-------------|
| `pipeline` | `Pipeline` | The text generation pipeline. |

Methods:

| Name | Description |
|------|-------------|
| `get_response` | Generate responses for a list of prompts. |

Source code in promptolution/llms/local_llm.py
class LocalLLM(BaseLLM):
    """A class for running language models locally using the Hugging Face Transformers library.

    This class sets up a text generation pipeline with specified model parameters
    and provides a method to generate responses for given prompts.

    Attributes:
        pipeline (transformers.Pipeline): The text generation pipeline.

    Methods:
        get_response: Generate responses for a list of prompts.
    """

    def __init__(self, model_id: str, batch_size=8):
        """Initialize the LocalLLM with a specific model.

        Args:
            model_id (str): The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b").
            batch_size (int, optional): The batch size for text generation. Defaults to 8.

        Note:
            This method sets up a text generation pipeline with bfloat16 precision,
            automatic device mapping, and specific generation parameters.
        """
        super().__init__()

        self.pipeline = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
            max_new_tokens=256,
            batch_size=batch_size,
            num_return_sequences=1,
            return_full_text=False,
        )
        self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id
        self.pipeline.tokenizer.padding_side = "left"

    def _get_response(self, prompts: list[str]):
        """Generate responses for a list of prompts using the local language model.

        Args:
            prompts (list[str]): A list of input prompts.

        Returns:
            list[str]: A list of generated responses corresponding to the input prompts.

        Note:
            This method uses torch.no_grad() for inference to reduce memory usage.
            It handles both single and batch inputs, ensuring consistent output format.
        """
        with torch.no_grad():
            response = self.pipeline(prompts, pad_token_id=self.pipeline.tokenizer.eos_token_id)

        if len(response) != 1:
            response = [r[0] if isinstance(r, list) else r for r in response]

        response = [r["generated_text"] for r in response]
        return response

    def __del__(self):
        """Cleanup method to delete the pipeline and free up GPU memory."""
        try:
            del self.pipeline
            torch.cuda.empty_cache()
        except Exception as e:
            logger.warning(f"Error during LocalLLM cleanup: {e}")

__del__()

Cleanup method to delete the pipeline and free up GPU memory.

Source code in promptolution/llms/local_llm.py
def __del__(self):
    """Cleanup method to delete the pipeline and free up GPU memory."""
    try:
        del self.pipeline
        torch.cuda.empty_cache()
    except Exception as e:
        logger.warning(f"Error during LocalLLM cleanup: {e}")

__init__(model_id, batch_size=8)

Initialize the LocalLLM with a specific model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model_id` | `str` | The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b"). | required |
| `batch_size` | `int`, optional | The batch size for text generation. | `8` |

Note

This method sets up a text generation pipeline with bfloat16 precision, automatic device mapping, and specific generation parameters.

Source code in promptolution/llms/local_llm.py
def __init__(self, model_id: str, batch_size=8):
    """Initialize the LocalLLM with a specific model.

    Args:
        model_id (str): The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b").
        batch_size (int, optional): The batch size for text generation. Defaults to 8.

    Note:
        This method sets up a text generation pipeline with bfloat16 precision,
        automatic device mapping, and specific generation parameters.
    """
    super().__init__()

    self.pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
        max_new_tokens=256,
        batch_size=batch_size,
        num_return_sequences=1,
        return_full_text=False,
    )
    self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id
    self.pipeline.tokenizer.padding_side = "left"

vllm

Module for running language models locally using the vLLM library.

VLLM

Bases: BaseLLM

A class for running language models using the vLLM library.

This class sets up a vLLM inference engine with specified model parameters and provides a method to generate responses for given prompts.

Attributes:

| Name | Type | Description |
|------|------|-------------|
| `llm` | `LLM` | The vLLM inference engine. |
| `tokenizer` | `PreTrainedTokenizer` | The tokenizer for the model. |
| `sampling_params` | `SamplingParams` | Parameters for text generation. |

Methods:

| Name | Description |
|------|-------------|
| `get_response` | Generate responses for a list of prompts. |
| `update_token_count` | Update the token count based on the given inputs and outputs. |

Source code in promptolution/llms/vllm.py
class VLLM(BaseLLM):
    """A class for running language models using the vLLM library.

    This class sets up a vLLM inference engine with specified model parameters
    and provides a method to generate responses for given prompts.

    Attributes:
        llm (vllm.LLM): The vLLM inference engine.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
        sampling_params (vllm.SamplingParams): Parameters for text generation.

    Methods:
        get_response: Generate responses for a list of prompts.
        update_token_count: Update the token count based on the given inputs and outputs.
    """

    def __init__(
        self,
        model_id: str,
        batch_size: int | None = None,
        max_generated_tokens: int = 256,
        temperature: float = 0.1,
        top_p: float = 0.9,
        model_storage_path: str | None = None,
        dtype: str = "auto",
        tensor_parallel_size: int = 1,
        gpu_memory_utilization: float = 0.95,
        max_model_len: int = 2048,
        trust_remote_code: bool = False,
        seed: int = 42,
        **kwargs,
    ):
        """Initialize the VLLM with a specific model.

        Args:
            model_id (str): The identifier of the model to use.
            batch_size (int, optional): The batch size for text generation. Defaults to None (derived from GPU memory).
            max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256.
            temperature (float, optional): Sampling temperature. Defaults to 0.1.
            top_p (float, optional): Top-p sampling parameter. Defaults to 0.9.
            model_storage_path (str, optional): Directory to store the model. Defaults to None.
            dtype (str, optional): Data type for model weights. Defaults to "auto".
            tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1.
            gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95.
            max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048.
            trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False.
            seed (int, optional): Random seed for the model. Defaults to 42.
            **kwargs: Additional keyword arguments to pass to the LLM class initialization.

        Note:
            This method sets up a vLLM engine with specified parameters for efficient inference.
        """
        super().__init__()

        self.dtype = dtype
        self.tensor_parallel_size = tensor_parallel_size
        self.gpu_memory_utilization = gpu_memory_utilization
        self.max_model_len = max_model_len
        self.trust_remote_code = trust_remote_code

        # Configure sampling parameters
        self.sampling_params = SamplingParams(
            temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, seed=seed
        )

        # Initialize the vLLM engine with both explicit parameters and any additional kwargs
        llm_params = {
            "model": model_id,
            "tokenizer": model_id,
            "dtype": self.dtype,
            "tensor_parallel_size": self.tensor_parallel_size,
            "gpu_memory_utilization": self.gpu_memory_utilization,
            "max_model_len": self.max_model_len,
            "download_dir": model_storage_path,
            "trust_remote_code": self.trust_remote_code,
            "seed": seed,
            **kwargs,
        }

        self.llm = LLM(**llm_params)

        if batch_size is None:
            gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks
            block_size = self.llm.llm_engine.model_executor.cache_config.block_size
            self.batch_size = int((gpu_blocks * block_size / self.max_model_len) * 0.95)
            logger.info(f"Batch size set to {self.batch_size} based on GPU memory.")
        else:
            self.batch_size = batch_size

        # Initialize tokenizer separately for potential pre-processing
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    def _get_response(self, inputs: list[str]):
        """Generate responses for a list of prompts using the vLLM engine.

        Args:
            prompts (list[str]): A list of input prompts.

        Returns:
            list[str]: A list of generated responses corresponding to the input prompts.

        Note:
            This method uses vLLM's batched generation capabilities for efficient inference.
            It also counts input and output tokens.
        """
        prompts = [
            self.tokenizer.apply_chat_template(
                [
                    {
                        "role": "system",
                        "content": "You are a helpful assistant.",
                    },
                    {"role": "user", "content": input},
                ],
                tokenize=False,
            )
            for input in inputs
        ]

        # generate responses for self.batch_size prompts at the same time
        all_responses = []
        for i in range(0, len(prompts), self.batch_size):
            batch = prompts[i : i + self.batch_size]
            outputs = self.llm.generate(batch, self.sampling_params)
            responses = [output.outputs[0].text for output in outputs]

            all_responses.extend(responses)

        return all_responses

    def update_token_count(self, inputs: List[str], outputs: List[str]):
        """Update the token count based on the given inputs and outputs.

            Uses the tokenizer to count the tokens.

        Args:
            inputs (List[str]): A list of input prompts.
            outputs (List[str]): A list of generated responses.
        """
        for input in inputs:
            self.input_token_count += len(self.tokenizer.encode(input))

        for output in outputs:
            self.output_token_count += len(self.tokenizer.encode(output))

    def __del__(self):
        """Cleanup method to delete the LLM instance and free up GPU memory."""
        del self.llm
        torch.cuda.empty_cache()
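
A minimal usage sketch; the model identifier is illustrative, and vLLM requires a CUDA-capable GPU:

```python
# Model identifier is a placeholder; batch_size is derived automatically when omitted.
llm = VLLM(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    max_generated_tokens=128,
    temperature=0.2,
    max_model_len=2048,
)
responses = llm.get_response(["List three prompt optimization strategies."])
print(responses[0])
```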

__del__()

Cleanup method to delete the LLM instance and free up GPU memory.

Source code in promptolution/llms/vllm.py
def __del__(self):
    """Cleanup method to delete the LLM instance and free up GPU memory."""
    del self.llm
    torch.cuda.empty_cache()

__init__(model_id, batch_size=None, max_generated_tokens=256, temperature=0.1, top_p=0.9, model_storage_path=None, dtype='auto', tensor_parallel_size=1, gpu_memory_utilization=0.95, max_model_len=2048, trust_remote_code=False, seed=42, **kwargs)

Initialize the VLLM with a specific model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model_id` | `str` | The identifier of the model to use. | required |
| `batch_size` | `int`, optional | The batch size for text generation. If None, it is derived automatically from GPU memory. | `None` |
| `max_generated_tokens` | `int` | Maximum number of tokens to generate. | `256` |
| `temperature` | `float` | Sampling temperature. | `0.1` |
| `top_p` | `float` | Top-p sampling parameter. | `0.9` |
| `model_storage_path` | `str`, optional | Directory to store the model. | `None` |
| `dtype` | `str` | Data type for model weights. | `'auto'` |
| `tensor_parallel_size` | `int` | Number of GPUs for tensor parallelism. | `1` |
| `gpu_memory_utilization` | `float` | Fraction of GPU memory to use. | `0.95` |
| `max_model_len` | `int` | Maximum sequence length for the model. | `2048` |
| `trust_remote_code` | `bool` | Whether to trust remote code. | `False` |
| `seed` | `int` | Random seed for the model. | `42` |
| `**kwargs` | | Additional keyword arguments to pass to the LLM class initialization. | `{}` |

Note

This method sets up a vLLM engine with the specified parameters for efficient inference.

Source code in promptolution/llms/vllm.py
def __init__(
    self,
    model_id: str,
    batch_size: int | None = None,
    max_generated_tokens: int = 256,
    temperature: float = 0.1,
    top_p: float = 0.9,
    model_storage_path: str | None = None,
    dtype: str = "auto",
    tensor_parallel_size: int = 1,
    gpu_memory_utilization: float = 0.95,
    max_model_len: int = 2048,
    trust_remote_code: bool = False,
    seed: int = 42,
    **kwargs,
):
    """Initialize the VLLM with a specific model.

    Args:
        model_id (str): The identifier of the model to use.
        batch_size (int, optional): The batch size for text generation. Defaults to None (derived from GPU memory).
        max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256.
        temperature (float, optional): Sampling temperature. Defaults to 0.1.
        top_p (float, optional): Top-p sampling parameter. Defaults to 0.9.
        model_storage_path (str, optional): Directory to store the model. Defaults to None.
        dtype (str, optional): Data type for model weights. Defaults to "auto".
        tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1.
        gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95.
        max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048.
        trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False.
        seed (int, optional): Random seed for the model. Defaults to 42.
        **kwargs: Additional keyword arguments to pass to the LLM class initialization.

    Note:
        This method sets up a vLLM engine with specified parameters for efficient inference.
    """
    super().__init__()

    self.dtype = dtype
    self.tensor_parallel_size = tensor_parallel_size
    self.gpu_memory_utilization = gpu_memory_utilization
    self.max_model_len = max_model_len
    self.trust_remote_code = trust_remote_code

    # Configure sampling parameters
    self.sampling_params = SamplingParams(
        temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, seed=seed
    )

    # Initialize the vLLM engine with both explicit parameters and any additional kwargs
    llm_params = {
        "model": model_id,
        "tokenizer": model_id,
        "dtype": self.dtype,
        "tensor_parallel_size": self.tensor_parallel_size,
        "gpu_memory_utilization": self.gpu_memory_utilization,
        "max_model_len": self.max_model_len,
        "download_dir": model_storage_path,
        "trust_remote_code": self.trust_remote_code,
        "seed": seed,
        **kwargs,
    }

    self.llm = LLM(**llm_params)

    if batch_size is None:
        gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks
        block_size = self.llm.llm_engine.model_executor.cache_config.block_size
        self.batch_size = int((gpu_blocks * block_size / self.max_model_len) * 0.95)
        logger.info(f"Batch size set to {self.batch_size} based on GPU memory.")
    else:
        self.batch_size = batch_size

    # Initialize tokenizer separately for potential pre-processing
    self.tokenizer = AutoTokenizer.from_pretrained(model_id)
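
When batch_size is None, the constructor derives it from the KV-cache capacity reported by vLLM. A worked example with illustrative numbers:

```python
# Illustrative values only; the real numbers come from the vLLM cache config.
gpu_blocks = 2048    # num_gpu_blocks
block_size = 16      # tokens per KV-cache block
max_model_len = 2048

batch_size = int((gpu_blocks * block_size / max_model_len) * 0.95)
print(batch_size)  # 15: roughly 95% of the sequences that fit in the KV cache
```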

update_token_count(inputs, outputs)

Update the token count based on the given inputs and outputs.

Uses the tokenizer to count the tokens.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `inputs` | `List[str]` | A list of input prompts. | required |
| `outputs` | `List[str]` | A list of generated responses. | required |

Source code in promptolution/llms/vllm.py
def update_token_count(self, inputs: List[str], outputs: List[str]):
    """Update the token count based on the given inputs and outputs.

        Uses the tokenizer to count the tokens.

    Args:
        inputs (List[str]): A list of input prompts.
        outputs (List[str]): A list of generated responses.
    """
    for input in inputs:
        self.input_token_count += len(self.tokenizer.encode(input))

    for output in outputs:
        self.output_token_count += len(self.tokenizer.encode(output))