LLMs

Module for Large Language Models.

get_llm(model_id, *args, **kwargs)

Factory function to create and return a language model instance based on the provided model_id.

This function supports four types of language models:

1. DummyLLM: a mock LLM for testing purposes.
2. LocalLLM: for running models locally via Hugging Face Transformers.
3. VLLM: for running models with the vLLM library.
4. APILLM: for API-based models (the default when the model_id matches no other type).

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model_id` | `str` | Identifier for the model to use. Special cases: `"dummy"` for DummyLLM, `"local-{model_name}"` for LocalLLM, `"vllm-{model_name}"` for VLLM, any other string for APILLM. | required |
| `*args` | | Variable length argument list passed to the LLM constructor. | `()` |
| `**kwargs` | | Arbitrary keyword arguments passed to the LLM constructor. | `{}` |

Returns:

An instance of DummyLLM, LocalLLM, VLLM, or APILLM, selected by the model_id.

Source code in promptolution/llms/__init__.py
def get_llm(model_id: str, *args, **kwargs):
    """Factory function to create and return a language model instance based on the provided model_id.

    This function supports four types of language models:
    1. DummyLLM: A mock LLM for testing purposes.
    2. LocalLLM: For running models locally.
    3. VLLM: For running models using the vLLM library.
    4. APILLM: For API-based models (default if not matching other types).

    Args:
        model_id (str): Identifier for the model to use. Special cases:
                        - "dummy" for DummyLLM
                        - "local-{model_name}" for LocalLLM
                        - "vllm-{model_name}" for VLLM
                        - Any other string for APILLM
        *args: Variable length argument list passed to the LLM constructor.
        **kwargs: Arbitrary keyword arguments passed to the LLM constructor.

    Returns:
        An instance of DummyLLM, LocalLLM, VLLM, or APILLM based on the model_id.
    """
    if model_id == "dummy":
        return DummyLLM(*args, **kwargs)
    if "local" in model_id:
        model_id = "-".join(model_id.split("-")[1:])
        return LocalLLM(model_id, *args, **kwargs)
    if "vllm" in model_id:
        model_id = "-".join(model_id.split("-")[1:])
        return VLLM(model_id, *args, **kwargs)
    return APILLM(model_id, *args, **kwargs)
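
A minimal usage sketch; the model identifiers and API key below are illustrative placeholders:

```python
from promptolution.llms import get_llm

dummy = get_llm("dummy")                                        # DummyLLM, no external dependencies
local = get_llm("local-meta-llama/Meta-Llama-3-8B-Instruct")    # LocalLLM via transformers
served = get_llm("vllm-meta-llama/Meta-Llama-3-8B-Instruct")    # VLLM engine
api = get_llm("gpt-4o-mini", token="YOUR_API_KEY")              # APILLM (OpenAI in this case)

print(dummy.get_response(["Classify this sentence."]))
```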

api_llm

Module to interface with various language models through their respective APIs.

APILLM

Bases: BaseLLM

A class to interface with various language models through their respective APIs.

This class supports Claude (Anthropic), GPT (OpenAI), and LLaMA (DeepInfra) models. It handles API key management, model initialization, and provides methods for both synchronous and asynchronous inference.

Attributes:

| Name | Type | Description |
|------|------|-------------|
| `model` | | The initialized language model instance. |

Methods:

| Name | Description |
|------|-------------|
| `get_response` | Synchronously get responses for a list of prompts. |
| `get_response_async` | Asynchronously get responses for a list of prompts. |

Source code in promptolution/llms/api_llm.py
class APILLM(BaseLLM):
    """A class to interface with various language models through their respective APIs.

    This class supports Claude (Anthropic), GPT (OpenAI), and LLaMA (DeepInfra) models.
    It handles API key management, model initialization, and provides methods for
    both synchronous and asynchronous inference.

    Attributes:
        model: The initialized language model instance.

    Methods:
        get_response: Synchronously get responses for a list of prompts.
        get_response_async: Asynchronously get responses for a list of prompts.
    """

    def __init__(self, model_id: str, token: str = None, **kwargs: Any):
        """Initialize the APILLM with a specific model.

        Args:
            model_id (str): Identifier for the model to use.
            token (str): API key for the model.

        Raises:
            ValueError: If an unknown model identifier is provided.
        """
        super().__init__()
        if "claude" in model_id:
            self.model = ChatAnthropic(model=model_id, api_key=token)
        elif "gpt" in model_id:
            self.model = ChatOpenAI(model=model_id, api_key=token)
        else:
            self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token)

    def _get_response(self, prompts: List[str]) -> List[str]:
        """Get responses for a list of prompts in a synchronous manner.

        This method includes retry logic for handling connection errors and rate limits.

        Args:
            prompts (list[str]): List of input prompts.

        Returns:
            list[str]: List of model responses.

        Raises:
            requests.exceptions.ConnectionError: If max retries are exceeded.
        """
        max_retries = 100
        delay = 3
        attempts = 0

        nest_asyncio.apply()

        while attempts < max_retries:
            try:
                responses = asyncio.run(self.get_response_async(prompts))
                return responses
            except requests.exceptions.ConnectionError as e:
                attempts += 1
                logger.critical(
                    f"Connection error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds..."
                )
                time.sleep(delay)
            except openai.RateLimitError as e:
                attempts += 1
                logger.critical(
                    f"Rate limit error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds..."
                )
                time.sleep(delay)

        # If the loop exits, it means max retries were reached
        raise requests.exceptions.ConnectionError("Max retries exceeded. Connection could not be established.")

    async def get_response_async(self, prompts: list[str], max_concurrent_calls=200) -> list[str]:
        """Asynchronously get responses for a list of prompts.

        This method uses a semaphore to limit the number of concurrent API calls.

        Args:
            prompts (list[str]): List of input prompts.
            max_concurrent_calls (int): Maximum number of concurrent API calls allowed.

        Returns:
            list[str]: List of model responses.
        """
        semaphore = asyncio.Semaphore(max_concurrent_calls)
        tasks = []

        for prompt in prompts:
            tasks.append(invoke_model(prompt, self.model, semaphore))

        responses = await asyncio.gather(*tasks)
        return responses
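
A minimal usage sketch; the model name and token are placeholders, and the call requires network access to the corresponding provider:

```python
llm = APILLM("gpt-4o-mini", token="YOUR_OPENAI_API_KEY")
responses = llm.get_response(["Summarize the goal of prompt optimization in one sentence."])
print(responses[0])
```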

__init__(model_id, token=None, **kwargs)

Initialize the APILLM with a specific model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model_id` | `str` | Identifier for the model to use. | required |
| `token` | `str` | API key for the model. | `None` |

Raises:

| Type | Description |
|------|-------------|
| `ValueError` | If an unknown model identifier is provided. |

Source code in promptolution/llms/api_llm.py
def __init__(self, model_id: str, token: str = None, **kwargs: Any):
    """Initialize the APILLM with a specific model.

    Args:
        model_id (str): Identifier for the model to use.
        token (str): API key for the model.

    Raises:
        ValueError: If an unknown model identifier is provided.
    """
    super().__init__()
    if "claude" in model_id:
        self.model = ChatAnthropic(model=model_id, api_key=token)
    elif "gpt" in model_id:
        self.model = ChatOpenAI(model=model_id, api_key=token)
    else:
        self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token)

get_response_async(prompts, max_concurrent_calls=200) async

Asynchronously get responses for a list of prompts.

This method uses a semaphore to limit the number of concurrent API calls.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `prompts` | `list[str]` | List of input prompts. | required |
| `max_concurrent_calls` | `int` | Maximum number of concurrent API calls allowed. | `200` |

Returns:

| Type | Description |
|------|-------------|
| `list[str]` | List of model responses. |

Source code in promptolution/llms/api_llm.py
async def get_response_async(self, prompts: list[str], max_concurrent_calls=200) -> list[str]:
    """Asynchronously get responses for a list of prompts.

    This method uses a semaphore to limit the number of concurrent API calls.

    Args:
        prompts (list[str]): List of input prompts.
        max_concurrent_calls (int): Maximum number of concurrent API calls allowed.

    Returns:
        list[str]: List of model responses.
    """
    semaphore = asyncio.Semaphore(max_concurrent_calls)
    tasks = []

    for prompt in prompts:
        tasks.append(invoke_model(prompt, self.model, semaphore))

    responses = await asyncio.gather(*tasks)
    return responses
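
Callers that already run inside an event loop can await the asynchronous path directly. A minimal sketch, assuming an initialized APILLM instance bound to `llm`:

```python
import asyncio

async def main():
    # Limit concurrency to 10 in-flight requests for this batch.
    responses = await llm.get_response_async(
        ["Rewrite this prompt to be more specific.", "List two evaluation criteria."],
        max_concurrent_calls=10,
    )
    for r in responses:
        print(r)

asyncio.run(main())
```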

invoke_model(prompt, model, semaphore) async

Asynchronously invoke a language model with retry logic.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `prompt` | `str` | The input prompt for the model. | required |
| `model` | | The language model to invoke. | required |
| `semaphore` | `Semaphore` | Semaphore to limit concurrent calls. | required |

Returns:

| Type | Description |
|------|-------------|
| `str` | The model's response content. |

Raises:

| Type | Description |
|------|-------------|
| `ChatDeepInfraException` | If all retry attempts fail. |

Source code in promptolution/llms/api_llm.py
async def invoke_model(prompt, model, semaphore):
    """Asynchronously invoke a language model with retry logic.

    Args:
        prompt (str): The input prompt for the model.
        model: The language model to invoke.
        semaphore (asyncio.Semaphore): Semaphore to limit concurrent calls.

    Returns:
        str: The model's response content.

    Raises:
        ChatDeepInfraException: If all retry attempts fail.
    """
    async with semaphore:
        max_retries = 100
        delay = 3
        attempts = 0

        while attempts < max_retries:
            try:
                response = await model.ainvoke([HumanMessage(content=prompt)])
                return response.content
            except ChatDeepInfraException as e:
                print(f"DeepInfra error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds...")
                attempts += 1
                await asyncio.sleep(delay)
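
invoke_model is normally called through get_response_async, but it can also be awaited directly. A minimal sketch, assuming an initialized APILLM instance `llm` whose underlying chat model is `llm.model`:

```python
import asyncio

async def main():
    semaphore = asyncio.Semaphore(5)  # allow at most 5 concurrent calls
    content = await invoke_model("What does a semaphore do here?", llm.model, semaphore)
    print(content)

asyncio.run(main())
```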

base_llm

Base module for LLMs in the promptolution library.

BaseLLM

Bases: ABC

Abstract base class for Language Models in the promptolution library.

This class defines the interface that all concrete LLM implementations should follow.

Methods:

| Name | Description |
|------|-------------|
| `get_response` | An abstract method that should be implemented by subclasses to generate responses for given prompts. |

Source code in promptolution/llms/base_llm.py
class BaseLLM(ABC):
    """Abstract base class for Language Models in the promptolution library.

    This class defines the interface that all concrete LLM implementations should follow.

    Methods:
        get_response: An abstract method that should be implemented by subclasses
                      to generate responses for given prompts.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the LLM."""
        self.input_token_count = 0
        self.output_token_count = 0

    def get_token_count(self):
        """Get the current count of input and output tokens.

        Returns:
            dict: A dictionary containing the input and output token counts.
        """
        return {
            "input_tokens": self.input_token_count,
            "output_tokens": self.output_token_count,
            "total_tokens": self.input_token_count + self.output_token_count,
        }

    def reset_token_count(self):
        """Reset the token counters to zero."""
        self.input_token_count = 0
        self.output_token_count = 0

    def update_token_count(self, inputs: List[str], outputs: List[str]):
        """Update the token count based on the given inputs and outputs.

        Args:
            inputs (List[str]): A list of input prompts.
            outputs (List[str]): A list of generated responses.
        """
        logger.warning("Token count is approximated using word count split by whitespace, not an actual tokenizer.")
        input_tokens = sum([len(i.split()) for i in inputs])
        output_tokens = sum([len(o.split()) for o in outputs])
        self.input_token_count += input_tokens
        self.output_token_count += output_tokens

    def get_response(self, prompts: str) -> str:
        """Generate responses for the given prompts.

        This method calls the _get_response method to generate responses
        for the given prompts. It also updates the token count for the
        input and output tokens.

        Args:
            prompts (str or List[str]): Input prompt(s). If a single string is provided,
                                        it's converted to a list containing that string.

        Returns:
            List[str]: A list of generated responses, one for each input prompt.
        """
        if isinstance(prompts, str):
            prompts = [prompts]
        responses = self._get_response(prompts)
        self.update_token_count(prompts, responses)

        return responses

    @abstractmethod
    def _get_response(self, prompts: List[str]) -> List[str]:
        """Generate responses for the given prompts.

        This method should be implemented by subclasses to define how
        the LLM generates responses.

        Args:
            prompts (List[str]): A list of input prompts.

        Returns:
            List[str]: A list of generated responses corresponding to the input prompts.
        """
        pass
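
A minimal sketch of a concrete subclass; EchoLLM is a hypothetical example, not part of the library, shown only to illustrate which method must be overridden:

```python
from typing import List

class EchoLLM(BaseLLM):
    """Toy LLM that echoes prompts back, useful for wiring tests."""

    def _get_response(self, prompts: List[str]) -> List[str]:
        # The base class handles list conversion and token bookkeeping.
        return [f"echo: {p}" for p in prompts]

llm = EchoLLM()
print(llm.get_response("hello"))  # ['echo: hello']
print(llm.get_token_count())      # counts approximated by whitespace splitting
```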

__init__(*args, **kwargs)

Initialize the LLM.

Source code in promptolution/llms/base_llm.py
def __init__(self, *args, **kwargs):
    """Initialize the LLM."""
    self.input_token_count = 0
    self.output_token_count = 0

get_response(prompts)

Generate responses for the given prompts.

This method calls the _get_response method to generate responses for the given prompts. It also updates the token count for the input and output tokens.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `prompts` | `str or List[str]` | Input prompt(s). If a single string is provided, it's converted to a list containing that string. | required |

Returns:

| Type | Description |
|------|-------------|
| `List[str]` | A list of generated responses, one for each input prompt. |

Source code in promptolution/llms/base_llm.py
def get_response(self, prompts: str) -> str:
    """Generate responses for the given prompts.

    This method calls the _get_response method to generate responses
    for the given prompts. It also updates the token count for the
    input and output tokens.

    Args:
        prompts (str or List[str]): Input prompt(s). If a single string is provided,
                                    it's converted to a list containing that string.

    Returns:
        List[str]: A list of generated responses, one for each input prompt.
    """
    if isinstance(prompts, str):
        prompts = [prompts]
    responses = self._get_response(prompts)
    self.update_token_count(prompts, responses)

    return responses

get_token_count()

Get the current count of input and output tokens.

Returns:

| Type | Description |
|------|-------------|
| `dict` | A dictionary containing the input and output token counts. |

Source code in promptolution/llms/base_llm.py
def get_token_count(self):
    """Get the current count of input and output tokens.

    Returns:
        dict: A dictionary containing the input and output token counts.
    """
    return {
        "input_tokens": self.input_token_count,
        "output_tokens": self.output_token_count,
        "total_tokens": self.input_token_count + self.output_token_count,
    }

reset_token_count()

Reset the token counters to zero.

Source code in promptolution/llms/base_llm.py
def reset_token_count(self):
    """Reset the token counters to zero."""
    self.input_token_count = 0
    self.output_token_count = 0

update_token_count(inputs, outputs)

Update the token count based on the given inputs and outputs.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `inputs` | `List[str]` | A list of input prompts. | required |
| `outputs` | `List[str]` | A list of generated responses. | required |

Source code in promptolution/llms/base_llm.py
def update_token_count(self, inputs: List[str], outputs: List[str]):
    """Update the token count based on the given inputs and outputs.

    Args:
        inputs (List[str]): A list of input prompts.
        outputs (List[str]): A list of generated responses.
    """
    logger.warning("Token count is approximated using word count split by whitespace, not an actual tokenizer.")
    input_tokens = sum([len(i.split()) for i in inputs])
    output_tokens = sum([len(o.split()) for o in outputs])
    self.input_token_count += input_tokens
    self.output_token_count += output_tokens
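
A minimal sketch of the token bookkeeping, assuming any initialized BaseLLM subclass bound to `llm`:

```python
llm.reset_token_count()
llm.get_response(["What is the capital of France?"])

counts = llm.get_token_count()
# Approximate counts (whitespace-split words, not tokenizer tokens).
print(counts["input_tokens"], counts["output_tokens"], counts["total_tokens"])
```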

DummyLLM

Bases: BaseLLM

A dummy implementation of the BaseLLM for testing purposes.

This class generates random responses for given prompts, simulating the behavior of a language model without actually performing any complex natural language processing.

Source code in promptolution/llms/base_llm.py
class DummyLLM(BaseLLM):
    """A dummy implementation of the BaseLLM for testing purposes.

    This class generates random responses for given prompts, simulating
    the behavior of a language model without actually performing any
    complex natural language processing.
    """

    def _get_response(self, prompts: str) -> str:
        """Generate random responses for the given prompts.

        This method creates silly, random responses enclosed in <prompt> tags.
        It's designed for testing and demonstration purposes.

        Args:
            prompts (str or List[str]): Input prompt(s). If a single string is provided,
                                        it's converted to a list containing that string.

        Returns:
            List[str]: A list of randomly generated responses, one for each input prompt.
        """
        if isinstance(prompts, str):
            prompts = [prompts]
        results = []
        for p in prompts:
            r = np.random.rand()
            if r < 0.3:
                results += [f"Joooo wazzuppp <prompt>hier gehts los {r} </prompt> {p}"]
            elif 0.3 <= r < 0.6:
                results += [f"was das hier? <prompt>peter lustig{r}</prompt> {p}"]
            else:
                results += [f"hier ist ein <prompt>test{r}</prompt> {p}"]

        return results
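
Because DummyLLM wraps part of each response in <prompt> tags, it is convenient for testing downstream parsing without any model or API. A minimal sketch:

```python
import re

llm = DummyLLM()
responses = llm.get_response(["optimize this instruction"])

# Extract the text between the <prompt> tags, as an optimizer might.
match = re.search(r"<prompt>(.*?)</prompt>", responses[0], re.DOTALL)
print(match.group(1) if match else responses[0])
```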

local_llm

Module for running language models locally using the Hugging Face Transformers library.

LocalLLM

Bases: BaseLLM

A class for running language models locally using the Hugging Face Transformers library.

This class sets up a text generation pipeline with specified model parameters and provides a method to generate responses for given prompts.

Attributes:

| Name | Type | Description |
|------|------|-------------|
| `pipeline` | `Pipeline` | The text generation pipeline. |

Methods:

| Name | Description |
|------|-------------|
| `get_response` | Generate responses for a list of prompts. |

Source code in promptolution/llms/local_llm.py
class LocalLLM(BaseLLM):
    """A class for running language models locally using the Hugging Face Transformers library.

    This class sets up a text generation pipeline with specified model parameters
    and provides a method to generate responses for given prompts.

    Attributes:
        pipeline (transformers.Pipeline): The text generation pipeline.

    Methods:
        get_response: Generate responses for a list of prompts.
    """

    def __init__(self, model_id: str, batch_size=8):
        """Initialize the LocalLLM with a specific model.

        Args:
            model_id (str): The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b").
            batch_size (int, optional): The batch size for text generation. Defaults to 8.

        Note:
            This method sets up a text generation pipeline with bfloat16 precision,
            automatic device mapping, and specific generation parameters.
        """
        super().__init__()

        self.pipeline = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
            max_new_tokens=256,
            batch_size=batch_size,
            num_return_sequences=1,
            return_full_text=False,
        )
        self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id
        self.pipeline.tokenizer.padding_side = "left"

    def _get_response(self, prompts: list[str]):
        """Generate responses for a list of prompts using the local language model.

        Args:
            prompts (list[str]): A list of input prompts.

        Returns:
            list[str]: A list of generated responses corresponding to the input prompts.

        Note:
            This method uses torch.no_grad() for inference to reduce memory usage.
            It handles both single and batch inputs, ensuring consistent output format.
        """
        with torch.no_grad():
            response = self.pipeline(prompts, pad_token_id=self.pipeline.tokenizer.eos_token_id)

        if len(response) != 1:
            response = [r[0] if isinstance(r, list) else r for r in response]

        response = [r["generated_text"] for r in response]
        return response

    def __del__(self):
        """Cleanup method to delete the pipeline and free up GPU memory."""
        try:
            del self.pipeline
            torch.cuda.empty_cache()
        except Exception as e:
            logger.warning(f"Error during LocalLLM cleanup: {e}")

__del__()

Cleanup method to delete the pipeline and free up GPU memory.

Source code in promptolution/llms/local_llm.py
def __del__(self):
    """Cleanup method to delete the pipeline and free up GPU memory."""
    try:
        del self.pipeline
        torch.cuda.empty_cache()
    except Exception as e:
        logger.warning(f"Error during LocalLLM cleanup: {e}")

__init__(model_id, batch_size=8)

Initialize the LocalLLM with a specific model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model_id` | `str` | The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b"). | required |
| `batch_size` | `int`, optional | The batch size for text generation. | `8` |

Note

This method sets up a text generation pipeline with bfloat16 precision, automatic device mapping, and specific generation parameters.

Source code in promptolution/llms/local_llm.py
def __init__(self, model_id: str, batch_size=8):
    """Initialize the LocalLLM with a specific model.

    Args:
        model_id (str): The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b").
        batch_size (int, optional): The batch size for text generation. Defaults to 8.

    Note:
        This method sets up a text generation pipeline with bfloat16 precision,
        automatic device mapping, and specific generation parameters.
    """
    super().__init__()

    self.pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
        max_new_tokens=256,
        batch_size=batch_size,
        num_return_sequences=1,
        return_full_text=False,
    )
    self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id
    self.pipeline.tokenizer.padding_side = "left"

vllm

Module for running language models locally using the vLLM library.

VLLM

Bases: BaseLLM

A class for running language models using the vLLM library.

This class sets up a vLLM inference engine with specified model parameters and provides a method to generate responses for given prompts.

Attributes:

| Name | Type | Description |
|------|------|-------------|
| `llm` | `LLM` | The vLLM inference engine. |
| `tokenizer` | `PreTrainedTokenizer` | The tokenizer for the model. |
| `sampling_params` | `SamplingParams` | Parameters for text generation. |

Methods:

| Name | Description |
|------|-------------|
| `get_response` | Generate responses for a list of prompts. |
| `update_token_count` | Update the token count based on the given inputs and outputs. |

Source code in promptolution/llms/vllm.py
class VLLM(BaseLLM):
    """A class for running language models using the vLLM library.

    This class sets up a vLLM inference engine with specified model parameters
    and provides a method to generate responses for given prompts.

    Attributes:
        llm (vllm.LLM): The vLLM inference engine.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
        sampling_params (vllm.SamplingParams): Parameters for text generation.

    Methods:
        get_response: Generate responses for a list of prompts.
        update_token_count: Update the token count based on the given inputs and outputs.
    """

    def __init__(
        self,
        model_id: str,
        batch_size: int | None = None,
        max_generated_tokens: int = 256,
        temperature: float = 0.1,
        top_p: float = 0.9,
        model_storage_path: str | None = None,
        dtype: str = "auto",
        tensor_parallel_size: int = 1,
        gpu_memory_utilization: float = 0.95,
        max_model_len: int = 2048,
        trust_remote_code: bool = False,
        seed: int = 42,
        **kwargs,
    ):
        """Initialize the VLLM with a specific model.

        Args:
            model_id (str): The identifier of the model to use.
            batch_size (int, optional): The batch size for text generation. Defaults to None (derived from GPU memory).
            max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256.
            temperature (float, optional): Sampling temperature. Defaults to 0.1.
            top_p (float, optional): Top-p sampling parameter. Defaults to 0.9.
            model_storage_path (str, optional): Directory to store the model. Defaults to None.
            dtype (str, optional): Data type for model weights. Defaults to "auto".
            tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1.
            gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95.
            max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048.
            trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False.
            seed (int, optional): Random seed for the model. Defaults to 42.
            **kwargs: Additional keyword arguments to pass to the LLM class initialization.

        Note:
            This method sets up a vLLM engine with specified parameters for efficient inference.
        """
        super().__init__()

        self.dtype = dtype
        self.tensor_parallel_size = tensor_parallel_size
        self.gpu_memory_utilization = gpu_memory_utilization
        self.max_model_len = max_model_len
        self.trust_remote_code = trust_remote_code

        # Configure sampling parameters
        self.sampling_params = SamplingParams(
            temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, seed=seed
        )

        # Initialize the vLLM engine with both explicit parameters and any additional kwargs
        llm_params = {
            "model": model_id,
            "tokenizer": model_id,
            "dtype": self.dtype,
            "tensor_parallel_size": self.tensor_parallel_size,
            "gpu_memory_utilization": self.gpu_memory_utilization,
            "max_model_len": self.max_model_len,
            "download_dir": model_storage_path,
            "trust_remote_code": self.trust_remote_code,
            "seed": seed,
            **kwargs,
        }

        self.llm = LLM(**llm_params)

        if batch_size is None:
            gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks
            block_size = self.llm.llm_engine.model_executor.cache_config.block_size
            self.batch_size = int((gpu_blocks * block_size / self.max_model_len) * 0.95)
            logger.info(f"Batch size set to {self.batch_size} based on GPU memory.")
        else:
            self.batch_size = batch_size

        # Initialize tokenizer separately for potential pre-processing
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    def _get_response(self, inputs: list[str]):
        """Generate responses for a list of prompts using the vLLM engine.

        Args:
            prompts (list[str]): A list of input prompts.

        Returns:
            list[str]: A list of generated responses corresponding to the input prompts.

        Note:
            This method uses vLLM's batched generation capabilities for efficient inference.
            It also counts input and output tokens.
        """
        prompts = [
            self.tokenizer.apply_chat_template(
                [
                    {
                        "role": "system",
                        "content": "You are a helpful assistant.",
                    },
                    {"role": "user", "content": input},
                ],
                tokenize=False,
            )
            for input in inputs
        ]

        # generate responses for self.batch_size prompts at the same time
        all_responses = []
        for i in range(0, len(prompts), self.batch_size):
            batch = prompts[i : i + self.batch_size]
            outputs = self.llm.generate(batch, self.sampling_params)
            responses = [output.outputs[0].text for output in outputs]

            all_responses.extend(responses)

        return all_responses

    def update_token_count(self, inputs: List[str], outputs: List[str]):
        """Update the token count based on the given inputs and outputs.

            Uses the tokenizer to count the tokens.

        Args:
            inputs (List[str]): A list of input prompts.
            outputs (List[str]): A list of generated responses.
        """
        for input in inputs:
            self.input_token_count += len(self.tokenizer.encode(input))

        for output in outputs:
            self.output_token_count += len(self.tokenizer.encode(output))

    def __del__(self):
        """Cleanup method to delete the LLM instance and free up GPU memory."""
        del self.llm
        torch.cuda.empty_cache()
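
A minimal usage sketch; the model identifier is illustrative, and vLLM requires a CUDA-capable GPU:

```python
# Model identifier is a placeholder; batch_size is derived automatically when omitted.
llm = VLLM(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    max_generated_tokens=128,
    temperature=0.2,
    max_model_len=2048,
)
responses = llm.get_response(["List three prompt optimization strategies."])
print(responses[0])
```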

__del__()

Cleanup method to delete the LLM instance and free up GPU memory.

Source code in promptolution/llms/vllm.py
def __del__(self):
    """Cleanup method to delete the LLM instance and free up GPU memory."""
    del self.llm
    torch.cuda.empty_cache()

__init__(model_id, batch_size=None, max_generated_tokens=256, temperature=0.1, top_p=0.9, model_storage_path=None, dtype='auto', tensor_parallel_size=1, gpu_memory_utilization=0.95, max_model_len=2048, trust_remote_code=False, seed=42, **kwargs)

Initialize the VLLM with a specific model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model_id` | `str` | The identifier of the model to use. | required |
| `batch_size` | `int`, optional | The batch size for text generation. If None, it is derived automatically from GPU memory. | `None` |
| `max_generated_tokens` | `int` | Maximum number of tokens to generate. | `256` |
| `temperature` | `float` | Sampling temperature. | `0.1` |
| `top_p` | `float` | Top-p sampling parameter. | `0.9` |
| `model_storage_path` | `str`, optional | Directory to store the model. | `None` |
| `dtype` | `str` | Data type for model weights. | `'auto'` |
| `tensor_parallel_size` | `int` | Number of GPUs for tensor parallelism. | `1` |
| `gpu_memory_utilization` | `float` | Fraction of GPU memory to use. | `0.95` |
| `max_model_len` | `int` | Maximum sequence length for the model. | `2048` |
| `trust_remote_code` | `bool` | Whether to trust remote code. | `False` |
| `seed` | `int` | Random seed for the model. | `42` |
| `**kwargs` | | Additional keyword arguments to pass to the LLM class initialization. | `{}` |

Note

This method sets up a vLLM engine with the specified parameters for efficient inference.

Source code in promptolution/llms/vllm.py
def __init__(
    self,
    model_id: str,
    batch_size: int | None = None,
    max_generated_tokens: int = 256,
    temperature: float = 0.1,
    top_p: float = 0.9,
    model_storage_path: str | None = None,
    dtype: str = "auto",
    tensor_parallel_size: int = 1,
    gpu_memory_utilization: float = 0.95,
    max_model_len: int = 2048,
    trust_remote_code: bool = False,
    seed: int = 42,
    **kwargs,
):
    """Initialize the VLLM with a specific model.

    Args:
        model_id (str): The identifier of the model to use.
        batch_size (int, optional): The batch size for text generation. Defaults to None (derived from GPU memory).
        max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256.
        temperature (float, optional): Sampling temperature. Defaults to 0.1.
        top_p (float, optional): Top-p sampling parameter. Defaults to 0.9.
        model_storage_path (str, optional): Directory to store the model. Defaults to None.
        dtype (str, optional): Data type for model weights. Defaults to "auto".
        tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1.
        gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95.
        max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048.
        trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False.
        seed (int, optional): Random seed for the model. Defaults to 42.
        **kwargs: Additional keyword arguments to pass to the LLM class initialization.

    Note:
        This method sets up a vLLM engine with specified parameters for efficient inference.
    """
    super().__init__()

    self.dtype = dtype
    self.tensor_parallel_size = tensor_parallel_size
    self.gpu_memory_utilization = gpu_memory_utilization
    self.max_model_len = max_model_len
    self.trust_remote_code = trust_remote_code

    # Configure sampling parameters
    self.sampling_params = SamplingParams(
        temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, seed=seed
    )

    # Initialize the vLLM engine with both explicit parameters and any additional kwargs
    llm_params = {
        "model": model_id,
        "tokenizer": model_id,
        "dtype": self.dtype,
        "tensor_parallel_size": self.tensor_parallel_size,
        "gpu_memory_utilization": self.gpu_memory_utilization,
        "max_model_len": self.max_model_len,
        "download_dir": model_storage_path,
        "trust_remote_code": self.trust_remote_code,
        "seed": seed,
        **kwargs,
    }

    self.llm = LLM(**llm_params)

    if batch_size is None:
        gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks
        block_size = self.llm.llm_engine.model_executor.cache_config.block_size
        self.batch_size = int((gpu_blocks * block_size / self.max_model_len) * 0.95)
        logger.info(f"Batch size set to {self.batch_size} based on GPU memory.")
    else:
        self.batch_size = batch_size

    # Initialize tokenizer separately for potential pre-processing
    self.tokenizer = AutoTokenizer.from_pretrained(model_id)
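
When batch_size is None, the constructor derives it from the KV-cache capacity reported by vLLM. A worked example with illustrative numbers:

```python
# Illustrative values only; the real numbers come from the vLLM cache config.
gpu_blocks = 2048    # num_gpu_blocks
block_size = 16      # tokens per KV-cache block
max_model_len = 2048

batch_size = int((gpu_blocks * block_size / max_model_len) * 0.95)
print(batch_size)  # 15: roughly 95% of the sequences that fit in the KV cache
```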

update_token_count(inputs, outputs)

Update the token count based on the given inputs and outputs.

Uses the tokenizer to count the tokens.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `inputs` | `List[str]` | A list of input prompts. | required |
| `outputs` | `List[str]` | A list of generated responses. | required |

Source code in promptolution/llms/vllm.py
def update_token_count(self, inputs: List[str], outputs: List[str]):
    """Update the token count based on the given inputs and outputs.

        Uses the tokenizer to count the tokens.

    Args:
        inputs (List[str]): A list of input prompts.
        outputs (List[str]): A list of generated responses.
    """
    for input in inputs:
        self.input_token_count += len(self.tokenizer.encode(input))

    for output in outputs:
        self.output_token_count += len(self.tokenizer.encode(output))