LLMs

Module for Large Language Models.

api_llm

Module to interface with various language models through their respective APIs.

APILLM

Bases: BaseLLM

A class to interface with language models through their respective APIs.

This class provides a unified interface for making API calls to language models using the OpenAI client library. It handles rate limiting through semaphores and supports both synchronous and asynchronous operations.

Attributes:

model_id (str): Identifier for the model to use.
client (AsyncOpenAI): The initialized API client.
max_tokens (int): Maximum number of tokens in model responses.
semaphore (Semaphore): Semaphore to limit concurrent API calls.

Source code in promptolution/llms/api_llm.py
class APILLM(BaseLLM):
    """A class to interface with language models through their respective APIs.

    This class provides a unified interface for making API calls to language models
    using the OpenAI client library. It handles rate limiting through semaphores
    and supports both synchronous and asynchronous operations.

    Attributes:
        model_id (str): Identifier for the model to use.
        client (AsyncOpenAI): The initialized API client.
        max_tokens (int): Maximum number of tokens in model responses.
        semaphore (asyncio.Semaphore): Semaphore to limit concurrent API calls.
    """

    def __init__(
        self,
        api_url: str = None,
        model_id: str = None,
        api_key: str = None,
        max_concurrent_calls=50,
        max_tokens=512,
        config: "ExperimentConfig" = None,
    ):
        """Initialize the APILLM with a specific model and API configuration.

        Args:
            api_url (str): The base URL for the API endpoint.
            model_id (str): Identifier for the model to use.
            api_key (str, optional): API key for authentication. Defaults to None.
            max_concurrent_calls (int, optional): Maximum number of concurrent API calls. Defaults to 50.
            max_tokens (int, optional): Maximum number of tokens in model responses. Defaults to 512.
            config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults.

        Raises:
            ImportError: If required libraries are not installed.
        """
        if not import_successful:
            raise ImportError(
                "Could not import at least one of the required libraries: openai, asyncio. "
                "Please ensure they are installed in your environment."
            )

        self.api_url = api_url
        self.model_id = model_id
        self.api_key = api_key
        self.max_concurrent_calls = max_concurrent_calls
        self.max_tokens = max_tokens

        super().__init__(config=config)
        self.client = AsyncOpenAI(base_url=self.api_url, api_key=self.api_key)
        self.semaphore = asyncio.Semaphore(self.max_concurrent_calls)

    def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]:
        # Setup for async execution in sync context
        loop = asyncio.get_event_loop()
        responses = loop.run_until_complete(self._get_response_async(prompts, system_prompts))
        return responses

    async def _get_response_async(self, prompts: List[str], system_prompts: List[str]) -> List[str]:
        tasks = [
            _invoke_model(prompt, system_prompt, self.max_tokens, self.model_id, self.client, self.semaphore)
            for prompt, system_prompt in zip(prompts, system_prompts)
        ]
        responses = await asyncio.gather(*tasks)
        return [response.choices[0].message.content for response in responses]
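
A minimal usage sketch. The endpoint URL, model identifier, and API key below are placeholders, not values prescribed by this library:

from promptolution.llms.api_llm import APILLM

llm = APILLM(
    api_url="https://api.example.com/v1",  # placeholder endpoint
    model_id="my-model",                   # placeholder model identifier
    api_key="my-api-key",                  # placeholder key
    max_concurrent_calls=10,
    max_tokens=256,
)

# get_response (inherited from BaseLLM) accepts a single prompt or a list of prompts
# and always returns a list of responses.
responses = llm.get_response(["Summarize the plot of Hamlet in one sentence."])
print(responses[0])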

__init__(api_url=None, model_id=None, api_key=None, max_concurrent_calls=50, max_tokens=512, config=None)

Initialize the APILLM with a specific model and API configuration.

Parameters:

api_url (str, default None): The base URL for the API endpoint.
model_id (str, default None): Identifier for the model to use.
api_key (str, default None): API key for authentication.
max_concurrent_calls (int, default 50): Maximum number of concurrent API calls.
max_tokens (int, default 512): Maximum number of tokens in model responses.
config (ExperimentConfig, default None): Configuration for the LLM, overriding defaults.

Raises:

ImportError: If required libraries are not installed.

Source code in promptolution/llms/api_llm.py
def __init__(
    self,
    api_url: str = None,
    model_id: str = None,
    api_key: str = None,
    max_concurrent_calls=50,
    max_tokens=512,
    config: "ExperimentConfig" = None,
):
    """Initialize the APILLM with a specific model and API configuration.

    Args:
        api_url (str): The base URL for the API endpoint.
        model_id (str): Identifier for the model to use.
        api_key (str, optional): API key for authentication. Defaults to None.
        max_concurrent_calls (int, optional): Maximum number of concurrent API calls. Defaults to 50.
        max_tokens (int, optional): Maximum number of tokens in model responses. Defaults to 512.
        config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults.

    Raises:
        ImportError: If required libraries are not installed.
    """
    if not import_successful:
        raise ImportError(
            "Could not import at least one of the required libraries: openai, asyncio. "
            "Please ensure they are installed in your environment."
        )

    self.api_url = api_url
    self.model_id = model_id
    self.api_key = api_key
    self.max_concurrent_calls = max_concurrent_calls
    self.max_tokens = max_tokens

    super().__init__(config=config)
    self.client = AsyncOpenAI(base_url=self.api_url, api_key=self.api_key)
    self.semaphore = asyncio.Semaphore(self.max_concurrent_calls)

base_llm

Base module for LLMs in the promptolution library.

BaseLLM

Bases: ABC

Abstract base class for Language Models in the promptolution library.

This class defines the interface that all concrete LLM implementations should follow. It's designed to track which configuration parameters are actually used.

Attributes:

config (LLMModelConfig): Configuration for the language model.
input_token_count (int): Count of input tokens processed.
output_token_count (int): Count of output tokens generated.

Source code in promptolution/llms/base_llm.py
class BaseLLM(ABC):
    """Abstract base class for Language Models in the promptolution library.

    This class defines the interface that all concrete LLM implementations should follow.
    It's designed to track which configuration parameters are actually used.

    Attributes:
        config (LLMModelConfig): Configuration for the language model.
        input_token_count (int): Count of input tokens processed.
        output_token_count (int): Count of output tokens generated.
    """

    def __init__(self, config: "ExperimentConfig" = None):
        """Initialize the LLM with a configuration or direct parameters.

        This constructor supports both config-based and direct parameter initialization
        for backward compatibility.

        Args:
            config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults.
        """
        if config is not None:
            config.apply_to(self)
        # Initialize token counters
        self.input_token_count = 0
        self.output_token_count = 0

    def get_token_count(self):
        """Get the current count of input and output tokens.

        Returns:
            dict: A dictionary containing the input and output token counts.
        """
        return {
            "input_tokens": self.input_token_count,
            "output_tokens": self.output_token_count,
            "total_tokens": self.input_token_count + self.output_token_count,
        }

    def reset_token_count(self):
        """Reset the token counters to zero."""
        self.input_token_count = 0
        self.output_token_count = 0

    def update_token_count(self, inputs: List[str], outputs: List[str]):
        """Update the token count based on the given inputs and outputs.

        It uses a simple tokenization method (splitting by whitespace) to count tokens in the base class.

        Args:
            inputs (List[str]): A list of input prompts.
            outputs (List[str]): A list of generated responses.
        """
        input_tokens = sum([len(i.split()) for i in inputs])
        output_tokens = sum([len(o.split()) for o in outputs])
        self.input_token_count += input_tokens
        self.output_token_count += output_tokens

    def get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
        """Generate responses for the given prompts.

        This method calls the _get_response method to generate responses
        for the given prompts. It also updates the token count for the
        input and output tokens.

        Args:
            prompts (str or List[str]): Input prompt(s). If a single string is provided,
                                        it's converted to a list containing that string.
            system_prompts (Optional, str or List[str]): System prompt(s) to provide context to the model.

        Returns:
            List[str]: A list of generated responses, one for each input prompt.
        """
        if system_prompts is None:
            system_prompts = DEFAULT_SYS_PROMPT
        if isinstance(prompts, str):
            prompts = [prompts]
        if isinstance(system_prompts, str):
            system_prompts = [system_prompts] * len(prompts)
        responses = self._get_response(prompts, system_prompts)
        self.update_token_count(prompts + system_prompts, responses)

        return responses

    def set_generation_seed(self, seed: int):
        """Set the random seed for reproducibility per request.

        Args:
            seed (int): Random seed value.
        """
        pass

    @abstractmethod
    def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]:
        """Generate responses for the given prompts.

        This method should be implemented by subclasses to define how
        the LLM generates responses.

        Args:
            prompts (List[str]): A list of input prompts.
            system_prompts (List[str]): A list of system prompts to provide context to the model.

        Returns:
            List[str]: A list of generated responses corresponding to the input prompts.
        """
        raise NotImplementedError
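
As a sketch of the contract a concrete subclass has to fulfil, the hypothetical EchoLLM below implements only _get_response; prompt normalization and token counting are inherited from BaseLLM:

from typing import List

from promptolution.llms.base_llm import BaseLLM


class EchoLLM(BaseLLM):
    """Toy subclass used for illustration: it simply echoes each prompt back."""

    def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]:
        # A real implementation would call a model here; we just echo the prompts.
        return [f"Echo: {p}" for p in prompts]


llm = EchoLLM()
print(llm.get_response("Hello"))  # ['Echo: Hello'] -- a single string is wrapped into a list
print(llm.get_token_count())      # whitespace-based counts tracked by the base class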

__init__(config=None)

Initialize the LLM with a configuration or direct parameters.

This constructor supports both config-based and direct parameter initialization for backward compatibility.

Parameters:

config (ExperimentConfig, default None): Configuration for the LLM, overriding defaults.

Source code in promptolution/llms/base_llm.py
def __init__(self, config: "ExperimentConfig" = None):
    """Initialize the LLM with a configuration or direct parameters.

    This constructor supports both config-based and direct parameter initialization
    for backward compatibility.

    Args:
        config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults.
    """
    if config is not None:
        config.apply_to(self)
    # Initialize token counters
    self.input_token_count = 0
    self.output_token_count = 0

get_response(prompts, system_prompts=None)

Generate responses for the given prompts.

This method calls the _get_response method to generate responses for the given prompts. It also updates the token count for the input and output tokens.

Parameters:

prompts (str or List[str], required): Input prompt(s). If a single string is provided, it's converted to a list containing that string.
system_prompts (str or List[str], default None): System prompt(s) to provide context to the model.

Returns:

List[str]: A list of generated responses, one for each input prompt.

Source code in promptolution/llms/base_llm.py
def get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
    """Generate responses for the given prompts.

    This method calls the _get_response method to generate responses
    for the given prompts. It also updates the token count for the
    input and output tokens.

    Args:
        prompts (str or List[str]): Input prompt(s). If a single string is provided,
                                    it's converted to a list containing that string.
        system_prompts (Optional, str or List[str]): System prompt(s) to provide context to the model.

    Returns:
        List[str]: A list of generated responses, one for each input prompt.
    """
    if system_prompts is None:
        system_prompts = DEFAULT_SYS_PROMPT
    if isinstance(prompts, str):
        prompts = [prompts]
    if isinstance(system_prompts, str):
        system_prompts = [system_prompts] * len(prompts)
    responses = self._get_response(prompts, system_prompts)
    self.update_token_count(prompts + system_prompts, responses)

    return responses
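
For instance, a single system prompt is broadcast to every prompt in the batch before _get_response is called (continuing the hypothetical EchoLLM subclass sketched above):

prompts = ["Translate 'cat' to German.", "Translate 'dog' to German."]

# The one system prompt is repeated once per input prompt.
responses = llm.get_response(prompts, system_prompts="You are a terse translator.")
print(len(responses))  # 2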

get_token_count()

Get the current count of input and output tokens.

Returns:

dict: A dictionary containing the input and output token counts.

Source code in promptolution/llms/base_llm.py
def get_token_count(self):
    """Get the current count of input and output tokens.

    Returns:
        dict: A dictionary containing the input and output token counts.
    """
    return {
        "input_tokens": self.input_token_count,
        "output_tokens": self.output_token_count,
        "total_tokens": self.input_token_count + self.output_token_count,
    }

reset_token_count()

Reset the token counters to zero.

Source code in promptolution/llms/base_llm.py
def reset_token_count(self):
    """Reset the token counters to zero."""
    self.input_token_count = 0
    self.output_token_count = 0

set_generation_seed(seed)

Set the random seed for reproducibility per request.

Parameters:

seed (int, required): Random seed value.

Source code in promptolution/llms/base_llm.py
def set_generation_seed(self, seed: int):
    """Set the random seed for reproducibility per request.

    Args:
        seed (int): Random seed value.
    """
    pass

update_token_count(inputs, outputs)

Update the token count based on the given inputs and outputs.

It uses a simple tokenization method (splitting by whitespace) to count tokens in the base class.

Parameters:

inputs (List[str], required): A list of input prompts.
outputs (List[str], required): A list of generated responses.

Source code in promptolution/llms/base_llm.py
def update_token_count(self, inputs: List[str], outputs: List[str]):
    """Update the token count based on the given inputs and outputs.

    It uses a simple tokenization method (splitting by whitespace) to count tokens in the base class.

    Args:
        inputs (List[str]): A list of input prompts.
        outputs (List[str]): A list of generated responses.
    """
    input_tokens = sum([len(i.split()) for i in inputs])
    output_tokens = sum([len(o.split()) for o in outputs])
    self.input_token_count += input_tokens
    self.output_token_count += output_tokens
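
A small illustration of the whitespace-based approximation used here; the counts depend only on splitting by spaces, not on a real tokenizer (EchoLLM is the hypothetical subclass sketched earlier):

llm = EchoLLM()
llm.update_token_count(["one two three"], ["four five"])
print(llm.get_token_count())  # {'input_tokens': 3, 'output_tokens': 2, 'total_tokens': 5}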

local_llm

Module for running LLMs locally using the Hugging Face Transformers library.

LocalLLM

Bases: BaseLLM

A class for running language models locally using the Hugging Face Transformers library.

This class sets up a text generation pipeline with specified model parameters and provides a method to generate responses for given prompts.

Attributes:

pipeline (Pipeline): The text generation pipeline.

Methods:

get_response: Generate responses for a list of prompts.

Source code in promptolution/llms/local_llm.py
class LocalLLM(BaseLLM):
    """A class for running language models locally using the Hugging Face Transformers library.

    This class sets up a text generation pipeline with specified model parameters
    and provides a method to generate responses for given prompts.

    Attributes:
        pipeline (transformers.Pipeline): The text generation pipeline.

    Methods:
        get_response: Generate responses for a list of prompts.
    """

    def __init__(self, model_id: str, batch_size: int = 8, config: "ExperimentConfig" = None):
        """Initialize the LocalLLM with a specific model.

        Args:
            model_id (str): The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b").
            batch_size (int, optional): The batch size for text generation. Defaults to 8.
            config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults.

        Note:
            This method sets up a text generation pipeline with bfloat16 precision,
            automatic device mapping, and specific generation parameters.
        """
        if not imports_successful:
            raise ImportError(
                "Could not import at least one of the required libraries: torch, transformers. "
                "Please ensure they are installed in your environment."
            )
        self.pipeline = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
            max_new_tokens=256,
            batch_size=batch_size,
            num_return_sequences=1,
            return_full_text=False,
        )
        self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id
        self.pipeline.tokenizer.padding_side = "left"
        super().__init__(config)

    def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]:
        """Generate responses for a list of prompts using the local language model.

        Args:
            prompts (list[str]): A list of input prompts.
            system_prompts (list[str]): A list of system prompts to guide the model's behavior.

        Returns:
            list[str]: A list of generated responses corresponding to the input prompts.

        Note:
            This method uses torch.no_grad() for inference to reduce memory usage.
            It handles both single and batch inputs, ensuring consistent output format.
        """
        inputs = []
        for prompt, sys_prompt in zip(prompts, system_prompts):
            inputs.append([{"role": "system", "prompt": sys_prompt}, {"role": "user", "prompt": prompt}])

        with torch.no_grad():
            response = self.pipeline(inputs, pad_token_id=self.pipeline.tokenizer.eos_token_id)

        if len(response) != 1:
            response = [r[0] if isinstance(r, list) else r for r in response]

        response = [r["generated_text"] for r in response]
        return response

    def __del__(self):
        """Cleanup method to delete the pipeline and free up GPU memory."""
        del self.pipeline
        torch.cuda.empty_cache()
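
A minimal usage sketch. The model identifier is a placeholder; any Hugging Face text-generation model with a chat template should work, given sufficient GPU memory:

from promptolution.llms.local_llm import LocalLLM

llm = LocalLLM(model_id="meta-llama/Llama-3.2-1B-Instruct", batch_size=4)  # placeholder model id
responses = llm.get_response(
    ["Name three prime numbers."],
    system_prompts=["You are a concise assistant."],
)
print(responses[0])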

__del__()

Cleanup method to delete the pipeline and free up GPU memory.

Source code in promptolution/llms/local_llm.py
def __del__(self):
    """Cleanup method to delete the pipeline and free up GPU memory."""
    del self.pipeline
    torch.cuda.empty_cache()

__init__(model_id, batch_size=8, config=None)

Initialize the LocalLLM with a specific model.

Parameters:

model_id (str, required): The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b").
batch_size (int, default 8): The batch size for text generation.
config (ExperimentConfig, default None): Configuration for the LLM, overriding defaults.

Note

This method sets up a text generation pipeline with bfloat16 precision, automatic device mapping, and specific generation parameters.

Source code in promptolution/llms/local_llm.py
def __init__(self, model_id: str, batch_size: int = 8, config: "ExperimentConfig" = None):
    """Initialize the LocalLLM with a specific model.

    Args:
        model_id (str): The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b").
        batch_size (int, optional): The batch size for text generation. Defaults to 8.
        config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults.

    Note:
        This method sets up a text generation pipeline with bfloat16 precision,
        automatic device mapping, and specific generation parameters.
    """
    if not imports_successful:
        raise ImportError(
            "Could not import at least one of the required libraries: torch, transformers. "
            "Please ensure they are installed in your environment."
        )
    self.pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
        max_new_tokens=256,
        batch_size=batch_size,
        num_return_sequences=1,
        return_full_text=False,
    )
    self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id
    self.pipeline.tokenizer.padding_side = "left"
    super().__init__(config)

vllm

Module for running language models locally using the vLLM library.

VLLM

Bases: BaseLLM

A class for running language models using the vLLM library.

This class sets up a vLLM inference engine with specified model parameters and provides a method to generate responses for given prompts.

Attributes:

llm (LLM): The vLLM inference engine.
tokenizer (PreTrainedTokenizer): The tokenizer for the model.
sampling_params (SamplingParams): Parameters for text generation.

Methods:

get_response: Generate responses for a list of prompts.
update_token_count: Update the token count based on the given inputs and outputs.

Source code in promptolution/llms/vllm.py
class VLLM(BaseLLM):
    """A class for running language models using the vLLM library.

    This class sets up a vLLM inference engine with specified model parameters
    and provides a method to generate responses for given prompts.

    Attributes:
        llm (vllm.LLM): The vLLM inference engine.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
        sampling_params (vllm.SamplingParams): Parameters for text generation.

    Methods:
        get_response: Generate responses for a list of prompts.
        update_token_count: Update the token count based on the given inputs and outputs.
    """

    def __init__(
        self,
        model_id: str,
        batch_size: int | None = None,
        max_generated_tokens: int = 256,
        temperature: float = 0.1,
        top_p: float = 0.9,
        model_storage_path: str | None = None,
        dtype: str = "auto",
        tensor_parallel_size: int = 1,
        gpu_memory_utilization: float = 0.95,
        max_model_len: int = 2048,
        trust_remote_code: bool = False,
        seed: int = 42,
        llm_kwargs: dict = None,
        config: "ExperimentConfig" = None,
    ):
        """Initialize the VLLM with a specific model.

        Args:
            model_id (str): The identifier of the model to use.
            batch_size (int, optional): The batch size for text generation. Defaults to None, in which case it is derived from available GPU memory.
            max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256.
            temperature (float, optional): Sampling temperature. Defaults to 0.1.
            top_p (float, optional): Top-p sampling parameter. Defaults to 0.9.
            model_storage_path (str, optional): Directory to store the model. Defaults to None.
            dtype (str, optional): Data type for model weights. Defaults to "auto".
            tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1.
            gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95.
            max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048.
            trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False.
            seed (int, optional): Random seed for the model. Defaults to 42.
            llm_kwargs (dict, optional): Additional keyword arguments for the LLM. Defaults to None.
            config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults.

        Note:
            This method sets up a vLLM engine with specified parameters for efficient inference.
        """
        if not imports_successful:
            raise ImportError(
                "Could not import at least one of the required libraries: transformers, vllm. "
                "Please ensure they are installed in your environment."
            )

        self.dtype = dtype
        self.tensor_parallel_size = tensor_parallel_size
        self.gpu_memory_utilization = gpu_memory_utilization
        self.max_model_len = max_model_len
        self.trust_remote_code = trust_remote_code

        # Configure sampling parameters
        self.sampling_params = SamplingParams(
            temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, seed=seed
        )

        if llm_kwargs is None:
            llm_kwargs = {}
        # Initialize the vLLM engine with both explicit parameters and any additional kwargs
        llm_params = {
            "model": model_id,
            "tokenizer": model_id,
            "dtype": self.dtype,
            "tensor_parallel_size": self.tensor_parallel_size,
            "gpu_memory_utilization": self.gpu_memory_utilization,
            "max_model_len": self.max_model_len,
            "download_dir": model_storage_path,
            "trust_remote_code": self.trust_remote_code,
            "seed": seed,
            **llm_kwargs,
        }

        self.llm = LLM(**llm_params)

        if batch_size is None:
            cache_config = self.llm.llm_engine.model_executor.cache_config
            self.batch_size = int((cache_config.gpu_blocks * cache_config.block_size / self.max_model_len) * 0.95)
            logger.info(f"πŸš€ Batch size set to {self.batch_size} based on GPU memory.")
        else:
            self.batch_size = batch_size

        # Initialize tokenizer separately for potential pre-processing
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

        super().__init__(config)

    def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]:
        """Generate responses for a list of prompts using the vLLM engine.

        Args:
            prompts (list[str]): A list of input prompts.
            system_prompts (list[str]): A list of system prompts to guide the model's behavior.

        Returns:
            list[str]: A list of generated responses corresponding to the input prompts.

        Note:
            This method uses vLLM's batched generation capabilities for efficient inference.
            It also counts input and output tokens.
        """
        prompts = [
            self.tokenizer.apply_chat_template(
                [
                    {
                        "role": "system",
                        "content": sys_prompt,
                    },
                    {"role": "user", "content": prompt},
                ],
                tokenize=False,
                add_generation_prompt=True,
            )
            for prompt, sys_prompt in zip(prompts, system_prompts)
        ]

        # generate responses for self.batch_size prompts at the same time
        all_responses = []
        for i in range(0, len(prompts), self.batch_size):
            batch = prompts[i : i + self.batch_size]
            outputs = self.llm.generate(batch, self.sampling_params)
            responses = [output.outputs[0].text for output in outputs]

            all_responses.extend(responses)

        return all_responses

    def update_token_count(self, inputs: List[str], outputs: List[str]):
        """Update the token count based on the given inputs and outputs.

        Uses the tokenizer to count the tokens.

        Args:
            inputs (List[str]): A list of input prompts.
            outputs (List[str]): A list of generated responses.
        """
        for input in inputs:
            self.input_token_count += len(self.tokenizer.encode(input))

        for output in outputs:
            self.output_token_count += len(self.tokenizer.encode(output))

    def set_generation_seed(self, seed):
        """Set the random seed for text generation.

        Args:
            seed (int): Random seed for text generation.
        """
        self.sampling_params.seed = seed
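
A minimal usage sketch, assuming a single GPU and a placeholder model identifier:

from promptolution.llms.vllm import VLLM

llm = VLLM(
    model_id="Qwen/Qwen2.5-1.5B-Instruct",  # placeholder model id
    max_generated_tokens=128,
    temperature=0.2,
    max_model_len=2048,
)
llm.set_generation_seed(123)  # per-request reproducibility
responses = llm.get_response(["List two prime numbers greater than 100."])
print(responses[0])
print(llm.get_token_count())  # counted with the model's tokenizer, not by whitespace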

__init__(model_id, batch_size=None, max_generated_tokens=256, temperature=0.1, top_p=0.9, model_storage_path=None, dtype='auto', tensor_parallel_size=1, gpu_memory_utilization=0.95, max_model_len=2048, trust_remote_code=False, seed=42, llm_kwargs=None, config=None)

Initialize the VLLM with a specific model.

Parameters:

model_id (str, required): The identifier of the model to use.
batch_size (int, default None): The batch size for text generation. If None, it is derived from available GPU memory.
max_generated_tokens (int, default 256): Maximum number of tokens to generate.
temperature (float, default 0.1): Sampling temperature.
top_p (float, default 0.9): Top-p sampling parameter.
model_storage_path (str, default None): Directory to store the model.
dtype (str, default "auto"): Data type for model weights.
tensor_parallel_size (int, default 1): Number of GPUs for tensor parallelism.
gpu_memory_utilization (float, default 0.95): Fraction of GPU memory to use.
max_model_len (int, default 2048): Maximum sequence length for the model.
trust_remote_code (bool, default False): Whether to trust remote code.
seed (int, default 42): Random seed for the model.
llm_kwargs (dict, default None): Additional keyword arguments for the LLM.
config (ExperimentConfig, default None): Configuration for the LLM, overriding defaults.

Note

This method sets up a vLLM engine with specified parameters for efficient inference.

Source code in promptolution/llms/vllm.py
def __init__(
    self,
    model_id: str,
    batch_size: int | None = None,
    max_generated_tokens: int = 256,
    temperature: float = 0.1,
    top_p: float = 0.9,
    model_storage_path: str | None = None,
    dtype: str = "auto",
    tensor_parallel_size: int = 1,
    gpu_memory_utilization: float = 0.95,
    max_model_len: int = 2048,
    trust_remote_code: bool = False,
    seed: int = 42,
    llm_kwargs: dict = None,
    config: "ExperimentConfig" = None,
):
    """Initialize the VLLM with a specific model.

    Args:
        model_id (str): The identifier of the model to use.
        batch_size (int, optional): The batch size for text generation. Defaults to None, in which case it is derived from available GPU memory.
        max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256.
        temperature (float, optional): Sampling temperature. Defaults to 0.1.
        top_p (float, optional): Top-p sampling parameter. Defaults to 0.9.
        model_storage_path (str, optional): Directory to store the model. Defaults to None.
        dtype (str, optional): Data type for model weights. Defaults to "auto".
        tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1.
        gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95.
        max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048.
        trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False.
        seed (int, optional): Random seed for the model. Defaults to 42.
        llm_kwargs (dict, optional): Additional keyword arguments for the LLM. Defaults to None.
        config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults.

    Note:
        This method sets up a vLLM engine with specified parameters for efficient inference.
    """
    if not imports_successful:
        raise ImportError(
            "Could not import at least one of the required libraries: transformers, vllm. "
            "Please ensure they are installed in your environment."
        )

    self.dtype = dtype
    self.tensor_parallel_size = tensor_parallel_size
    self.gpu_memory_utilization = gpu_memory_utilization
    self.max_model_len = max_model_len
    self.trust_remote_code = trust_remote_code

    # Configure sampling parameters
    self.sampling_params = SamplingParams(
        temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, seed=seed
    )

    if llm_kwargs is None:
        llm_kwargs = {}
    # Initialize the vLLM engine with both explicit parameters and any additional kwargs
    llm_params = {
        "model": model_id,
        "tokenizer": model_id,
        "dtype": self.dtype,
        "tensor_parallel_size": self.tensor_parallel_size,
        "gpu_memory_utilization": self.gpu_memory_utilization,
        "max_model_len": self.max_model_len,
        "download_dir": model_storage_path,
        "trust_remote_code": self.trust_remote_code,
        "seed": seed,
        **llm_kwargs,
    }

    self.llm = LLM(**llm_params)

    if batch_size is None:
        cache_config = self.llm.llm_engine.model_executor.cache_config
        self.batch_size = int((cache_config.gpu_blocks * cache_config.block_size / self.max_model_len) * 0.95)
        logger.info(f"πŸš€ Batch size set to {self.batch_size} based on GPU memory.")
    else:
        self.batch_size = batch_size

    # Initialize tokenizer separately for potential pre-processing
    self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    super().__init__(config)

set_generation_seed(seed)

Set the random seed for text generation.

Parameters:

seed (int, required): Random seed for text generation.

Source code in promptolution/llms/vllm.py
def set_generation_seed(self, seed):
    """Set the random seed for text generation.

    Args:
        seed (int): Random seed for text generation.
    """
    self.sampling_params.seed = seed

update_token_count(inputs, outputs)

Update the token count based on the given inputs and outputs.

Uses the tokenizer to count the tokens.

Parameters:

inputs (List[str], required): A list of input prompts.
outputs (List[str], required): A list of generated responses.
Source code in promptolution/llms/vllm.py
def update_token_count(self, inputs: List[str], outputs: List[str]):
    """Update the token count based on the given inputs and outputs.

    Uses the tokenizer to count the tokens.

    Args:
        inputs (List[str]): A list of input prompts.
        outputs (List[str]): A list of generated responses.
    """
    for input in inputs:
        self.input_token_count += len(self.tokenizer.encode(input))

    for output in outputs:
        self.output_token_count += len(self.tokenizer.encode(output))