llama

Llama generation module.

Classes:

  • LlamaGen

    Llama Generation class.

  • OllamaGen

    Ollama Generation class for local inference via ollama-python.

  • OllamaOpenAIGen

    Ollama generation via Ollama's OpenAI-compatible client API.

LlamaGen

LlamaGen(
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Optional[Cache] = None,
    logs: dict[str, Any] = DEFAULT_LOGS,
)

Bases: GenerationBase

Llama Generation class.

Methods:

  • generate

    Generate text using a Llama model with language support.

Source code in src/rago/generation/base.py
def __init__(
    self,
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Optional[Cache] = None,
    logs: dict[str, Any] = DEFAULT_LOGS,
) -> None:
    """Initialize Generation class."""
    if logs is DEFAULT_LOGS:
        logs = {}
    super().__init__(api_key=api_key, cache=cache, logs=logs)

    self.model_name: str = (
        model_name if model_name is not None else self.default_model_name
    )
    self.output_max_length: int = (
        output_max_length or self.default_output_max_length
    )
    self.temperature: float = (
        temperature
        if temperature is not None
        else self.default_temperature
    )

    self.prompt_template: str = (
        prompt_template or self.default_prompt_template
    )
    self.structured_output: Optional[Type[BaseModel]] = structured_output
    if api_params is DEFAULT_API_PARAMS:
        api_params = deepcopy(self.default_api_params or {})

    self.system_message = system_message
    self.api_params = api_params

    if device not in ['cpu', 'cuda', 'auto']:
        raise Exception(
            f'Device {device} not supported. Options: cpu, cuda, auto.'
        )

    cuda_available = torch.cuda.is_available()
    self.device_name: str = (
        'cpu' if device == 'cpu' or not cuda_available else 'cuda'
    )
    self.device = torch.device(self.device_name)

    self._validate()
    self._setup()
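
The constructor above resolves every unset argument to a class-level default and maps device='auto' to CUDA only when it is available. The following is a minimal construction sketch, not taken from the library's documentation; the import path is inferred from the source layout (src/rago/generation/llama.py), and the model name, template, and parameter values are illustrative assumptions.

from rago.generation.llama import LlamaGen

# All values below are illustrative; the real defaults come from the class
# attributes (default_model_name, default_temperature, ...) used in __init__.
gen = LlamaGen(
    model_name='meta-llama/Llama-3.2-1B',  # assumed Hugging Face model id
    temperature=0.4,
    prompt_template='Question: {query}\nContext: {context}\nAnswer:',
    output_max_length=300,
    device='auto',  # falls back to 'cpu' when torch.cuda.is_available() is False
)

Note that generate fills the template with .format(query=..., context=...), so a custom prompt_template needs both the {query} and {context} placeholders.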

generate

generate(query: str, context: list[str]) -> str

Generate text using a Llama model with language support.

Source code in src/rago/generation/llama.py
def generate(self, query: str, context: list[str]) -> str:
    """Generate text using Llama model with language support."""
    input_text = self.prompt_template.format(
        query=query, context=' '.join(context)
    )

    # Detect and set the language code for multilingual models (optional)
    language = str(detect(query)) or 'en'
    self.tokenizer.lang_code = language

    api_params = (
        self.api_params if self.api_params else self.default_api_params
    )

    # Generate the response with adjusted parameters

    model_params = dict(
        text_inputs=input_text,
        max_new_tokens=self.output_max_length,
        do_sample=True,
        temperature=self.temperature,
        eos_token_id=self.tokenizer.eos_token_id,
        **api_params,
    )
    response = self.generator(**model_params)

    self.logs['model_params'] = model_params

    # Extract and return the answer only
    answer = str(response[0].get('generated_text', ''))
    # Strip off any redundant text after the answer itself
    return answer.split('Answer:')[-1].strip()
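
A brief usage sketch for generate: it fills the prompt template, runs the underlying generation pipeline, and keeps only the text after the final 'Answer:' marker. The instance gen is the one from the construction sketch above; the query and context strings are made up for illustration.

query = 'What is the capital of France?'
context = [
    'France is a country in Western Europe.',
    'The capital of France is Paris.',
]

answer = gen.generate(query=query, context=context)
# `answer` is the text after the last 'Answer:' marker, whitespace-stripped.
print(answer)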

OllamaGen

OllamaGen(
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Optional[Cache] = None,
    logs: dict[str, Any] = DEFAULT_LOGS,
)

Bases: GenerationBase

Ollama Generation class for local inference via ollama-python.

Methods:

  • generate

    Generate text by sending a prompt to the local Ollama model.

Source code in src/rago/generation/base.py
def __init__(
    self,
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Optional[Cache] = None,
    logs: dict[str, Any] = DEFAULT_LOGS,
) -> None:
    """Initialize Generation class."""
    if logs is DEFAULT_LOGS:
        logs = {}
    super().__init__(api_key=api_key, cache=cache, logs=logs)

    self.model_name: str = (
        model_name if model_name is not None else self.default_model_name
    )
    self.output_max_length: int = (
        output_max_length or self.default_output_max_length
    )
    self.temperature: float = (
        temperature
        if temperature is not None
        else self.default_temperature
    )

    self.prompt_template: str = (
        prompt_template or self.default_prompt_template
    )
    self.structured_output: Optional[Type[BaseModel]] = structured_output
    if api_params is DEFAULT_API_PARAMS:
        api_params = deepcopy(self.default_api_params or {})

    self.system_message = system_message
    self.api_params = api_params

    if device not in ['cpu', 'cuda', 'auto']:
        raise Exception(
            f'Device {device} not supported. Options: cpu, cuda, auto.'
        )

    cuda_available = torch.cuda.is_available()
    self.device_name: str = (
        'cpu' if device == 'cpu' or not cuda_available else 'cuda'
    )
    self.device = torch.device(self.device_name)

    self._validate()
    self._setup()

generate

generate(query: str, context: list[str]) -> str | BaseModel

Generate text by sending a prompt to the local Ollama model.

Parameters:

  • query (str) –

    The user query.

  • context (list[str]) –

    Augmented context strings.

Returns:

  • str

    The generated response text.

Source code in src/rago/generation/llama.py
def generate(self, query: str, context: list[str]) -> str | BaseModel:
    """
    Generate text by sending a prompt to the local Ollama model.

    Parameters
    ----------
    query : str
        The user query.
    context : list[str]
        Augmented context strings.

    Returns
    -------
    str
        The generated response text.
    """
    input_text = self.prompt_template.format(
        query=query,
        context=' '.join(context),
    )

    messages = []
    if self.system_message:
        messages.append({'role': 'system', 'content': self.system_message})
    messages.append({'role': 'user', 'content': input_text})

    params = {
        'model': self.model_name,
        'messages': messages,
        **(self.api_params or {}),
    }
    response = self.model.chat(**params)
    return str(response.message.content).strip()
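
OllamaGen.generate builds a chat message list (optionally prefixed with the system message) and forwards it to the local Ollama client stored in self.model. Below is a minimal sketch, assuming a local Ollama server is running, the model has already been pulled (e.g. with `ollama pull llama3.2`), and the import path matches the source layout; the model name and strings are illustrative.

from rago.generation.llama import OllamaGen

gen = OllamaGen(
    model_name='llama3.2',  # assumed to be pulled locally beforehand
    system_message='Answer using only the provided context.',
)

answer = gen.generate(
    query='Which planet is known as the Red Planet?',
    context=['Mars is often called the Red Planet.'],
)
print(answer)  # plain string; this path does not parse structured output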

OllamaOpenAIGen

OllamaOpenAIGen(
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Optional[Cache] = None,
    logs: dict[str, Any] = DEFAULT_LOGS,
)

Bases: OpenAIGen

Ollama generation via Ollama's OpenAI-compatible client API.

Methods:

  • generate

    Generate text using OpenAI's API with dynamic model support.

Source code in src/rago/generation/base.py
def __init__(
    self,
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Optional[Cache] = None,
    logs: dict[str, Any] = DEFAULT_LOGS,
) -> None:
    """Initialize Generation class."""
    if logs is DEFAULT_LOGS:
        logs = {}
    super().__init__(api_key=api_key, cache=cache, logs=logs)

    self.model_name: str = (
        model_name if model_name is not None else self.default_model_name
    )
    self.output_max_length: int = (
        output_max_length or self.default_output_max_length
    )
    self.temperature: float = (
        temperature
        if temperature is not None
        else self.default_temperature
    )

    self.prompt_template: str = (
        prompt_template or self.default_prompt_template
    )
    self.structured_output: Optional[Type[BaseModel]] = structured_output
    if api_params is DEFAULT_API_PARAMS:
        api_params = deepcopy(self.default_api_params or {})

    self.system_message = system_message
    self.api_params = api_params

    if device not in ['cpu', 'cuda', 'auto']:
        raise Exception(
            f'Device {device} not supported. Options: cpu, cuda, auto.'
        )

    cuda_available = torch.cuda.is_available()
    self.device_name: str = (
        'cpu' if device == 'cpu' or not cuda_available else 'cuda'
    )
    self.device = torch.device(self.device_name)

    self._validate()
    self._setup()

generate

generate(query: str, context: list[str]) -> str | BaseModel

Generate text using OpenAI's API with dynamic model support.

Source code in src/rago/generation/openai.py
def generate(
    self,
    query: str,
    context: list[str],
) -> str | BaseModel:
    """Generate text using OpenAI's API with dynamic model support."""
    input_text = self.prompt_template.format(
        query=query, context=' '.join(context)
    )

    if not self.model:
        raise Exception('The model was not created.')

    messages = []
    if self.system_message:
        messages.append({'role': 'system', 'content': self.system_message})
    messages.append({'role': 'user', 'content': input_text})

    model_params = dict(
        model=self.model_name,
        messages=messages,
        max_tokens=self.output_max_length,
        temperature=self.temperature,
        **self.api_params,
    )

    if self.structured_output:
        model_params['response_model'] = self.structured_output

    response = self.model.chat.completions.create(**model_params)

    self.logs['model_params'] = model_params

    has_choices = hasattr(response, 'choices')

    if has_choices and isinstance(response.choices, list):
        return cast(str, response.choices[0].message.content.strip())
    return cast(BaseModel, response)
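
Because OllamaOpenAIGen inherits this generate implementation from OpenAIGen, passing structured_output attaches a response_model to the request and the call returns a parsed model instead of a string. The following is a minimal sketch under stated assumptions: the client created in _setup (not shown here) points at a local Ollama endpoint that speaks the OpenAI chat API and accepts the response_model keyword; the schema, model name, and import path are illustrative.

from pydantic import BaseModel

from rago.generation.llama import OllamaOpenAIGen


class ShortAnswer(BaseModel):
    """Illustrative schema; any Pydantic model can be used as structured_output."""

    text: str


gen = OllamaOpenAIGen(
    model_name='llama3.2',          # assumed to be available on the local Ollama server
    structured_output=ShortAnswer,  # generate() then returns a ShortAnswer, not a str
)

result = gen.generate(
    query='What color is a clear daytime sky?',
    context=['On a clear day the sky appears blue.'],
)
print(result)

Without structured_output, the same call returns the stripped text of the first choice, as in the final lines of the source above.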