llama

Llama generation module.

Classes:

  • LlamaGen

    Llama Generation class.

  • OllamaGen

    Ollama Generation class for local inference via ollama-python.

  • OllamaOpenAIGen

    Ollama generation through the OpenAI-compatible chat client.

LlamaGen

LlamaGen(
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Cache | None = None,
    logs: dict[str, Any] | None = None,
)

Bases: GenerationBase

Llama Generation class.
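
A minimal usage sketch, assuming LlamaGen is importable from rago.generation (it is defined in src/rago/generation/llama.py) and that the Hugging Face model id below is available locally; the model id and sample inputs are illustrative assumptions, not defaults documented on this page:

from rago.generation import LlamaGen

gen = LlamaGen(
    model_name='meta-llama/Llama-3.2-1B',  # assumed checkpoint; any supported Llama model id works
    temperature=0.5,
    output_max_length=256,
    device='auto',  # resolves to 'cuda' when available, otherwise 'cpu'
)
answer = gen.generate(
    query='What is retrieval-augmented generation?',
    data=['RAG combines a retriever with a generator over a document store.'],
)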

Methods:

  • apply

    Apply attached configuration to the step.

  • generate

    Generate text using Llama model with language support.

  • process

    Generate a result from the current pipeline content.

Source code in src/rago/generation/base.py
def __init__(
    self,
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Cache | None = None,
    logs: dict[str, Any] | None = None,
) -> None:
    super().__init__()
    self.api_key = api_key
    self.cache = cache
    self.logs = logs if logs is not None else {}

    self.model_name = (
        model_name if model_name is not None else self.default_model_name
    )
    self.output_max_length = (
        output_max_length or self.default_output_max_length
    )
    self.temperature = (
        temperature
        if temperature is not None
        else self.default_temperature
    )
    self.prompt_template = prompt_template or self.default_prompt_template
    self.structured_output = structured_output
    if api_params is DEFAULT_API_PARAMS:
        api_params = deepcopy(self.default_api_params or {})

    self.system_message = system_message
    self.api_params = api_params

    if device not in ['cpu', 'cuda', 'auto']:
        raise Exception(
            f'Device {device} not supported. Options: cpu, cuda, auto.'
        )

    cuda_available = torch.cuda.is_available()
    self.device_name = (
        'cpu' if device == 'cpu' or not cuda_available else 'cuda'
    )
    self.device = torch.device(self.device_name)

    self._validate()
    self._load_optional_modules()
    self._setup()

apply

apply(parameters: Any) -> None

Apply attached configuration to the step.

Source code in src/rago/base.py
def apply(self, parameters: Any) -> None:
    """Apply attached configuration to the step."""
    if parameters is None:
        return

    if _is_cache_backend(parameters):
        self.cache = parameters
        return

    if _is_vector_db(parameters):
        setattr(self, 'db', parameters)
        return

    if _is_text_splitter(parameters):
        setattr(self, 'splitter', parameters)
        return

    for key, value in config_to_dict(parameters).items():
        if key == 'cache':
            self.cache = value
        elif key == 'logs':
            self.logs = value if value is not None else {}
        else:
            setattr(self, key, value)
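
apply() accepts either a recognized backend object or a configuration object. A short sketch with hypothetical objects (my_cache and my_config are placeholders, not names from this package):

# A cache backend is stored on self.cache; a vector DB would land on self.db
# and a text splitter on self.splitter.
gen.apply(my_cache)

# Anything else is treated as configuration and copied field by field,
# e.g. an object whose fields include temperature, system_message, or logs.
gen.apply(my_config)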

generate

generate(query: str, data: list[str]) -> str

Generate text using Llama model with language support.

Source code in src/rago/generation/llama.py
def generate(self, query: str, data: list[str]) -> str:
    """Generate text using Llama model with language support."""
    input_text = self._format_prompt(query, data)

    # Detect and set the language code for multilingual models (optional)
    language = str(self._detect(query)) or 'en'
    self.tokenizer.lang_code = language

    api_params = (
        self.api_params if self.api_params else self.default_api_params
    )

    # Generate the response with adjusted parameters

    model_params = dict(
        text_inputs=input_text,
        max_new_tokens=self.output_max_length,
        do_sample=True,
        temperature=self.temperature,
        eos_token_id=self.tokenizer.eos_token_id,
        **api_params,
    )
    response = self.generator(**model_params)

    # self.logs['model_params'] = model_params

    # Extract and return the answer only
    answer = str(response[0].get('generated_text', ''))
    # Strip off any redundant text after the answer itself
    return answer.split('Answer:')[-1].strip()
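
The final split assumes the formatted prompt ends with an 'Answer:' cue, so only the completion after that marker is kept. A plain string illustration, independent of the model:

raw = 'Question: What is RAG?\nAnswer: A retrieval-augmented pipeline.'
raw.split('Answer:')[-1].strip()  # -> 'A retrieval-augmented pipeline.'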

process

process(inp: Input) -> Output

Generate a result from the current pipeline content.

Source code in src/rago/generation/base.py
def process(self, inp: Input) -> Output:
    """Generate a result from the current pipeline content."""
    query = str(inp.query)
    data = [
        str(item)
        for item in ensure_list(
            inp.get('content', inp.get('data', inp.get('source')))
        )
    ]
    result = self.generate(query, data)
    output = Output.from_input(inp)
    output.result = result
    output.content = _serialize_generation_result(result)
    output.data = output.content
    return output
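
The nested get() calls give 'content' the highest priority, then 'data', then 'source'. Illustrated with a plain dict (Input itself may implement get() differently):

lookup = {'data': ['doc A'], 'source': ['doc B']}
lookup.get('content', lookup.get('data', lookup.get('source')))  # -> ['doc A']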

OllamaGen

OllamaGen(
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Cache | None = None,
    logs: dict[str, Any] | None = None,
)

Bases: GenerationBase

Ollama Generation class for local inference via ollama-python.
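
A minimal sketch, assuming a local Ollama server is running and the model tag below has already been pulled (e.g. ollama pull llama3.2); extra request options can be passed through api_params:

gen = OllamaGen(
    model_name='llama3.2',  # assumed local model tag
    temperature=0.2,
    output_max_length=256,
    api_params={'options': {'top_p': 0.9}},  # merged into the Ollama chat options
)
answer = gen.generate(
    query='Summarize the context in one sentence.',
    data=['Ollama serves local models behind a simple chat API.'],
)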

Methods:

  • apply

    Apply attached configuration to the step.

  • generate

    Generate text by sending a prompt to the local Ollama model.

  • process

    Generate a result from the current pipeline content.

Source code in src/rago/generation/base.py
def __init__(
    self,
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Cache | None = None,
    logs: dict[str, Any] | None = None,
) -> None:
    super().__init__()
    self.api_key = api_key
    self.cache = cache
    self.logs = logs if logs is not None else {}

    self.model_name = (
        model_name if model_name is not None else self.default_model_name
    )
    self.output_max_length = (
        output_max_length or self.default_output_max_length
    )
    self.temperature = (
        temperature
        if temperature is not None
        else self.default_temperature
    )
    self.prompt_template = prompt_template or self.default_prompt_template
    self.structured_output = structured_output
    if api_params is DEFAULT_API_PARAMS:
        api_params = deepcopy(self.default_api_params or {})

    self.system_message = system_message
    self.api_params = api_params

    if device not in ['cpu', 'cuda', 'auto']:
        raise Exception(
            f'Device {device} not supported. Options: cpu, cuda, auto.'
        )

    cuda_available = torch.cuda.is_available()
    self.device_name = (
        'cpu' if device == 'cpu' or not cuda_available else 'cuda'
    )
    self.device = torch.device(self.device_name)

    self._validate()
    self._load_optional_modules()
    self._setup()

apply

apply(parameters: Any) -> None

Apply attached configuration to the step.

Source code in src/rago/base.py
def apply(self, parameters: Any) -> None:
    """Apply attached configuration to the step."""
    if parameters is None:
        return

    if _is_cache_backend(parameters):
        self.cache = parameters
        return

    if _is_vector_db(parameters):
        setattr(self, 'db', parameters)
        return

    if _is_text_splitter(parameters):
        setattr(self, 'splitter', parameters)
        return

    for key, value in config_to_dict(parameters).items():
        if key == 'cache':
            self.cache = value
        elif key == 'logs':
            self.logs = value if value is not None else {}
        else:
            setattr(self, key, value)

generate

generate(query: str, data: list[str]) -> str | BaseModel

Generate text by sending a prompt to the local Ollama model.

Parameters:

  • query (str) –

    The user query.

  • data (list[str]) –

    Augmented context strings.

Returns:

  • str

    The generated response text.

Source code in src/rago/generation/llama.py
def generate(self, query: str, data: list[str]) -> str | BaseModel:
    """
    Generate text by sending a prompt to the local Ollama model.

    Parameters
    ----------
    query : str
        The user query.
    data : list[str]
        Augmented context strings.

    Returns
    -------
    str
        The generated response text.
    """
    input_text = self._format_prompt(query, data)

    messages = []
    if self.system_message:
        messages.append({'role': 'system', 'content': self.system_message})
    messages.append({'role': 'user', 'content': input_text})

    request_params = copy(self.api_params or {})
    options = copy(request_params.pop('options', {}))
    options.setdefault('temperature', self.temperature)
    options.setdefault('num_predict', self.output_max_length)

    params = {
        'model': self.model_name,
        'messages': messages,
        'options': options,
        **request_params,
    }
    response = self.model.chat(**params)
    return str(response.message.content).strip()
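
Tracing the request assembly above: with api_params={'options': {'top_p': 0.9}}, temperature 0.2, and output_max_length 256, the chat call receives options merged as follows (a plain-Python sketch of the same logic):

options = {'top_p': 0.9}
options.setdefault('temperature', 0.2)
options.setdefault('num_predict', 256)
# options is now {'top_p': 0.9, 'temperature': 0.2, 'num_predict': 256}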

process

process(inp: Input) -> Output

Generate a result from the current pipeline content.

Source code in src/rago/generation/base.py
def process(self, inp: Input) -> Output:
    """Generate a result from the current pipeline content."""
    query = str(inp.query)
    data = [
        str(item)
        for item in ensure_list(
            inp.get('content', inp.get('data', inp.get('source')))
        )
    ]
    result = self.generate(query, data)
    output = Output.from_input(inp)
    output.result = result
    output.content = _serialize_generation_result(result)
    output.data = output.content
    return output

OllamaOpenAIGen

OllamaOpenAIGen(
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Cache | None = None,
    logs: dict[str, Any] | None = None,
)

Bases: OpenAIGen

Ollama generation through the OpenAI-compatible chat client.
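
Despite reusing the OpenAI-style chat.completions interface, inference is still served by Ollama. A minimal construction sketch (the model tag is an assumption, and any endpoint configuration is handled by the class's setup, which is not shown on this page):

gen = OllamaOpenAIGen(model_name='llama3.2')
answer = gen.generate(
    query='What does this class wrap?',
    data=['Ollama also exposes an OpenAI-compatible API.'],
)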

Methods:

  • apply

    Apply attached configuration to the step.

  • generate

    Generate text using OpenAI's API with dynamic model support.

  • process

    Generate a result from the current pipeline content.

Source code in src/rago/generation/base.py
def __init__(
    self,
    model_name: Optional[str] = None,
    temperature: Optional[float] = None,
    prompt_template: str = '',
    output_max_length: int = 500,
    device: str = 'auto',
    structured_output: Optional[Type[BaseModel]] = None,
    system_message: str = '',
    api_params: dict[str, Any] = DEFAULT_API_PARAMS,
    api_key: str = '',
    cache: Cache | None = None,
    logs: dict[str, Any] | None = None,
) -> None:
    super().__init__()
    self.api_key = api_key
    self.cache = cache
    self.logs = logs if logs is not None else {}

    self.model_name = (
        model_name if model_name is not None else self.default_model_name
    )
    self.output_max_length = (
        output_max_length or self.default_output_max_length
    )
    self.temperature = (
        temperature
        if temperature is not None
        else self.default_temperature
    )
    self.prompt_template = prompt_template or self.default_prompt_template
    self.structured_output = structured_output
    if api_params is DEFAULT_API_PARAMS:
        api_params = deepcopy(self.default_api_params or {})

    self.system_message = system_message
    self.api_params = api_params

    if device not in ['cpu', 'cuda', 'auto']:
        raise Exception(
            f'Device {device} not supported. Options: cpu, cuda, auto.'
        )

    cuda_available = torch.cuda.is_available()
    self.device_name = (
        'cpu' if device == 'cpu' or not cuda_available else 'cuda'
    )
    self.device = torch.device(self.device_name)

    self._validate()
    self._load_optional_modules()
    self._setup()

apply

apply(parameters: Any) -> None

Apply attached configuration to the step.

Source code in src/rago/base.py
def apply(self, parameters: Any) -> None:
    """Apply attached configuration to the step."""
    if parameters is None:
        return

    if _is_cache_backend(parameters):
        self.cache = parameters
        return

    if _is_vector_db(parameters):
        setattr(self, 'db', parameters)
        return

    if _is_text_splitter(parameters):
        setattr(self, 'splitter', parameters)
        return

    for key, value in config_to_dict(parameters).items():
        if key == 'cache':
            self.cache = value
        elif key == 'logs':
            self.logs = value if value is not None else {}
        else:
            setattr(self, key, value)

generate

generate(query: str, data: list[str]) -> str | BaseModel

Generate text using OpenAI's API with dynamic model support.

Source code in src/rago/generation/openai.py
def generate(
    self,
    query: str,
    data: list[str],
) -> str | BaseModel:
    """Generate text using OpenAI's API with dynamic model support."""
    input_text = self._format_prompt(query, data)

    if not self.model:
        raise Exception('The model was not created.')

    messages = []
    if self.system_message:
        messages.append({'role': 'system', 'content': self.system_message})
    messages.append({'role': 'user', 'content': input_text})

    model_params = dict(
        model=self.model_name,
        messages=messages,
        max_tokens=self.output_max_length,
        temperature=self.temperature,
        **self.api_params,
    )

    if self.structured_output:
        model_params['response_model'] = self.structured_output

    response = self.model.chat.completions.create(**model_params)

    # self.logs['model_params'] = model_params

    has_choices = hasattr(response, 'choices')

    if has_choices and isinstance(response.choices, list):
        return cast(str, response.choices[0].message.content.strip())
    return cast(BaseModel, response)
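
When structured_output is set, it is forwarded as response_model, which matches the calling convention of instructor-style patched clients rather than the stock openai package; whether _setup configures such a client is not shown on this page. A hedged sketch under that assumption:

from pydantic import BaseModel

class ShortAnswer(BaseModel):
    answer: str
    confidence: float

gen = OllamaOpenAIGen(
    model_name='llama3.2',          # assumed local model tag
    structured_output=ShortAnswer,  # forwarded as response_model
)
result = gen.generate('Is water wet?', ['Context snippet about water.'])
# result is expected to be a ShortAnswer instance rather than a plain string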

process

process(inp: Input) -> Output

Generate a result from the current pipeline content.

Source code in src/rago/generation/base.py
def process(self, inp: Input) -> Output:
    """Generate a result from the current pipeline content."""
    query = str(inp.query)
    data = [
        str(item)
        for item in ensure_list(
            inp.get('content', inp.get('data', inp.get('source')))
        )
    ]
    result = self.generate(query, data)
    output = Output.from_input(inp)
    output.result = result
    output.content = _serialize_generation_result(result)
    output.data = output.content
    return output