
Ollama Integration

This guide shows how to integrate Ladger with Ollama for tracking usage of self-hosted models like Llama, Mistral, and CodeLlama.

Why Track Ollama?

Even though Ollama models are “free” (no API costs), tracking usage helps you:

  • Monitor infrastructure costs (GPU time, electricity)
  • Compare performance across models
  • Identify optimization opportunities
  • Maintain consistent observability across providers

Installation

npm install @ladger/sdk ollama
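
You'll also need the Ollama server running locally with at least one model pulled, for example:

ollama pull llama2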

Basic Setup

import { LadgerTracer } from '@ladger/sdk';
import { Ollama } from 'ollama';

const tracer = new LadgerTracer({
  apiKey: process.env.LADGER_API_KEY!,
  flowName: 'ollama-assistant',
});

const ollama = new Ollama({
  host: 'http://localhost:11434', // Default Ollama host
});
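
If Ollama runs on a different machine, point host at that address instead (the hostname below is just a placeholder):

const remoteOllama = new Ollama({
  host: 'http://gpu-server:11434', // hypothetical remote machine running Ollama
});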

Chat Completions

Basic Chat

async function chat(message: string, model = 'llama2') {
  return tracer.trace('ollama-chat', async (span) => {
    const response = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
    });

    span.setAttributes({
      'ollama.model': model,
      'ollama.total_duration': response.total_duration,
      'ollama.load_duration': response.load_duration,
      'ollama.eval_duration': response.eval_duration,
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
      // No API cost, but can track compute cost
      costUsd: 0,
    });

    return response.message.content;
  });
}
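
A minimal call, assuming the llama2 model has already been pulled:

const answer = await chat('Why is the sky blue?');
console.log(answer);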

Streaming

async function chatStream(message: string, model = 'llama2') {
  return tracer.trace('ollama-stream', async (span) => {
    let inputTokens = 0;
    let outputTokens = 0;
    let content = '';

    const stream = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
      stream: true,
    });

    for await (const chunk of stream) {
      content += chunk.message.content;
      process.stdout.write(chunk.message.content);

      // Final chunk contains usage
      if (chunk.done) {
        inputTokens = chunk.prompt_eval_count || 0;
        outputTokens = chunk.eval_count || 0;
      }
    }

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens,
      outputTokens,
    });

    return content;
  });
}

Generate (Completion)

async function generate(prompt: string, model = 'codellama') {
  return tracer.trace('ollama-generate', async (span) => {
    const response = await ollama.generate({
      model,
      prompt,
    });

    span.setAttributes({
      'ollama.model': model,
      'ollama.context_size': response.context?.length || 0,
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
    });

    return response.response;
  });
}
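
For example, a code-generation prompt against the default codellama model (assuming it has been pulled):

const fn = await generate('Write a TypeScript function that reverses a string.');
console.log(fn);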

Embeddings

async function createEmbedding(text: string, model = 'nomic-embed-text') {
  return tracer.trace('ollama-embedding', async (span) => {
    const response = await ollama.embeddings({
      model,
      prompt: text,
    });

    span.setAttributes({
      'embedding.model': model,
      'embedding.dimensions': response.embedding.length,
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: text.split(/\s+/).length, // Approximate token count
    });

    return response.embedding;
  });
}
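
If you embed documents in batches, you can record the whole batch under one span and sum the approximate token counts. A minimal sketch, reusing the same word-count approximation as above (the helper name embedBatch is just for illustration):

async function embedBatch(texts: string[], model = 'nomic-embed-text') {
  return tracer.trace('ollama-embedding-batch', async (span) => {
    const vectors: number[][] = [];
    let approxTokens = 0;

    for (const text of texts) {
      const response = await ollama.embeddings({ model, prompt: text });
      vectors.push(response.embedding);
      approxTokens += text.split(/\s+/).length; // Same rough word-count approximation
    }

    span.setAttributes({
      'embedding.model': model,
      'embedding.batch_size': texts.length,
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: approxTokens,
    });

    return vectors;
  });
}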

Tracking Compute Costs

For self-hosted models, you might want to track infrastructure costs:

// Example: Estimate GPU cost per token
const GPU_COST_PER_HOUR = 0.50; // $0.50/hour for GPU rental
const TOKENS_PER_SECOND = 30; // Approximate throughput

function estimateComputeCost(inputTokens: number, outputTokens: number): number {
  const totalTokens = inputTokens + outputTokens;
  const seconds = totalTokens / TOKENS_PER_SECOND;
  const hours = seconds / 3600;
  return hours * GPU_COST_PER_HOUR;
}

async function chatWithCost(message: string, model = 'llama2') {
  return tracer.trace('ollama-chat-costed', async (span) => {
    const response = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
    });

    const computeCost = estimateComputeCost(
      response.prompt_eval_count || 0,
      response.eval_count || 0
    );

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
      costUsd: computeCost, // Estimated compute cost
    });

    return response.message.content;
  });
}
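
To sanity-check the estimate: a 1,000-token exchange at 30 tokens/second takes about 33 seconds of GPU time, which is roughly 0.0093 hours, or about $0.0046 at $0.50/hour.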

Performance Tracking

Track performance metrics specific to local inference:

async function chatWithMetrics(message: string, model = 'llama2') {
  return tracer.trace('ollama-metrics', async (span) => {
    const startTime = Date.now();

    const response = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
    });

    const endTime = Date.now();
    const wallClockMs = endTime - startTime;

    // Convert nanoseconds to milliseconds
    const totalDurationMs = response.total_duration / 1_000_000;
    const loadDurationMs = response.load_duration / 1_000_000;
    const evalDurationMs = response.eval_duration / 1_000_000;
    const promptEvalMs = response.prompt_eval_duration / 1_000_000;

    span.setAttributes({
      // Model info
      'ollama.model': model,

      // Token counts
      'ollama.prompt_tokens': response.prompt_eval_count,
      'ollama.completion_tokens': response.eval_count,

      // Timing (in ms)
      'ollama.wall_clock_ms': wallClockMs,
      'ollama.total_duration_ms': totalDurationMs,
      'ollama.load_duration_ms': loadDurationMs,
      'ollama.prompt_eval_ms': promptEvalMs,
      'ollama.eval_duration_ms': evalDurationMs,

      // Performance metrics
      'ollama.tokens_per_second': response.eval_count / (evalDurationMs / 1000),
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
    });

    return response.message.content;
  });
}
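
A note on these fields: load_duration covers loading the model into memory (significant on a cold start, near zero once the model is resident), prompt_eval_duration covers processing the prompt, and eval_duration covers generating the response, so the tokens_per_second figure above measures decode throughput rather than end-to-end latency.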

Model Comparison

Model         Size    Use Case                Speed
llama2:7b     7B      General chat            Fast
llama2:13b    13B     Better reasoning        Medium
llama2:70b    70B     Complex tasks           Slow
codellama     7-34B   Code generation         Medium
mistral       7B      Efficient general use   Fast
mixtral       8x7B    High quality            Medium
phi           2.7B    Lightweight             Very Fast
neural-chat   7B      Conversational          Fast

Complete Example

import 'dotenv/config';
import { LadgerTracer } from '@ladger/sdk';
import { Ollama } from 'ollama';

const tracer = new LadgerTracer({
  apiKey: process.env.LADGER_API_KEY!,
  flowName: 'ollama-demo',
  debug: true,
});

const ollama = new Ollama();

// Cost estimation for self-hosted inference
const COST_CONFIG = {
  gpuCostPerHour: 0.50,
  baselineTokensPerSecond: 30,
  modelSpeedMultipliers: {
    'llama2:7b': 1.5,
    'llama2:13b': 1.0,
    'llama2:70b': 0.3,
    'mistral': 1.8,
    'codellama': 1.2,
  } as Record<string, number>,
};

function estimateCost(model: string, totalTokens: number): number {
  const speedMultiplier = COST_CONFIG.modelSpeedMultipliers[model] || 1.0;
  const tokensPerSecond = COST_CONFIG.baselineTokensPerSecond * speedMultiplier;
  const seconds = totalTokens / tokensPerSecond;
  const hours = seconds / 3600;
  return hours * COST_CONFIG.gpuCostPerHour;
}

async function intelligentChat(message: string) {
  return tracer.trace('intelligent-chat', async (parentSpan) => {
    // Step 1: Quick classification with small model
    const complexity = await tracer.trace('classify', async (span) => {
      const response = await ollama.chat({
        model: 'phi',
        messages: [{
          role: 'user',
          content: `Rate this query complexity as "simple" or "complex": "${message}"`,
        }],
      });

      span.recordCost({
        provider: 'ollama',
        model: 'phi',
        inputTokens: response.prompt_eval_count,
        outputTokens: response.eval_count,
        costUsd: estimateCost('phi',
          (response.prompt_eval_count || 0) + (response.eval_count || 0)
        ),
      });

      return response.message.content.toLowerCase().includes('complex')
        ? 'complex'
        : 'simple';
    }, { parent: parentSpan });

    parentSpan.setAttributes({ complexity });

    // Step 2: Route to appropriate model
    const model = complexity === 'complex' ? 'llama2:13b' : 'llama2:7b';

    // Step 3: Generate response
    return tracer.trace('respond', async (span) => {
      const response = await ollama.chat({
        model,
        messages: [{ role: 'user', content: message }],
      });

      const totalTokens = (response.prompt_eval_count || 0) + (response.eval_count || 0);

      span.setAttributes({
        'model.selected': model,
        'ollama.tokens_per_second': response.eval_count /
          (response.eval_duration / 1_000_000_000),
      });

      span.recordCost({
        provider: 'ollama',
        model,
        inputTokens: response.prompt_eval_count,
        outputTokens: response.eval_count,
        costUsd: estimateCost(model, totalTokens),
      });

      return response.message.content;
    }, { parent: parentSpan });
  });
}

async function main() {
  console.log('Simple query:');
  console.log(await intelligentChat('What is 2+2?'));

  console.log('\nComplex query:');
  console.log(await intelligentChat('Explain the theory of relativity and its implications for GPS satellites.'));

  await tracer.shutdown();
}

main();

Next Steps