Ollama Integration
This guide shows how to integrate Ladger with Ollama for tracking usage of self-hosted models like Llama, Mistral, and CodeLlama.
Why Track Ollama?
Even though Ollama models are “free” (no API costs), tracking usage helps you:
- Monitor infrastructure costs (GPU time, electricity)
- Compare performance across models
- Identify optimization opportunities
- Maintain consistent observability across providers
Installation
```bash
npm install @ladger/sdk ollama
# or
yarn add @ladger/sdk ollama
# or
pnpm add @ladger/sdk ollama
```

Basic Setup
```typescript
import { LadgerTracer } from '@ladger/sdk';
import { Ollama } from 'ollama';

const tracer = new LadgerTracer({
  apiKey: process.env.LADGER_API_KEY!,
  flowName: 'ollama-assistant',
});

const ollama = new Ollama({
  host: 'http://localhost:11434', // Default Ollama host
});
```

Chat Completions
Basic Chat
```typescript
async function chat(message: string, model = 'llama2') {
  return tracer.trace('ollama-chat', async (span) => {
    const response = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
    });

    span.setAttributes({
      'ollama.model': model,
      'ollama.total_duration': response.total_duration,
      'ollama.load_duration': response.load_duration,
      'ollama.eval_duration': response.eval_duration,
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
      // No API cost, but can track compute cost
      costUsd: 0,
    });

    return response.message.content;
  });
}
```

Streaming
```typescript
async function chatStream(message: string, model = 'llama2') {
  return tracer.trace('ollama-stream', async (span) => {
    let inputTokens = 0;
    let outputTokens = 0;
    let content = '';

    const stream = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
      stream: true,
    });

    for await (const chunk of stream) {
      content += chunk.message.content;
      process.stdout.write(chunk.message.content);

      // Final chunk contains usage
      if (chunk.done) {
        inputTokens = chunk.prompt_eval_count || 0;
        outputTokens = chunk.eval_count || 0;
      }
    }

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens,
      outputTokens,
    });

    return content;
  });
}
```

Generate (Completion)
```typescript
async function generate(prompt: string, model = 'codellama') {
  return tracer.trace('ollama-generate', async (span) => {
    const response = await ollama.generate({
      model,
      prompt,
    });

    span.setAttributes({
      'ollama.model': model,
      'ollama.context_size': response.context?.length || 0,
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
    });

    return response.response;
  });
}
```

Embeddings
```typescript
async function createEmbedding(text: string, model = 'nomic-embed-text') {
  return tracer.trace('ollama-embedding', async (span) => {
    const response = await ollama.embeddings({
      model,
      prompt: text,
    });

    span.setAttributes({
      'embedding.model': model,
      'embedding.dimensions': response.embedding.length,
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: text.split(/\s+/).length, // Approximate token count
    });

    return response.embedding;
  });
}
```
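The traced embeddings can be used like any other embedding vectors. As a minimal usage sketch (the `cosineSimilarity` and `rankDocuments` helpers below are illustrations, not part of the SDK), you can rank documents against a query:

```typescript
// Illustrative helper: cosine similarity between two embedding vectors
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

// Rank documents against a query using the traced createEmbedding() above
async function rankDocuments(query: string, documents: string[]) {
  const queryEmbedding = await createEmbedding(query);
  const scored = [];
  for (const doc of documents) {
    const docEmbedding = await createEmbedding(doc);
    scored.push({ doc, score: cosineSimilarity(queryEmbedding, docEmbedding) });
  }
  return scored.sort((a, b) => b.score - a.score);
}
```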
Tracking Compute Costs
For self-hosted models, you might want to track infrastructure costs:
```typescript
// Example: Estimate GPU cost per token
const GPU_COST_PER_HOUR = 0.50; // $0.50/hour for GPU rental
const TOKENS_PER_SECOND = 30;   // Approximate throughput

function estimateComputeCost(inputTokens: number, outputTokens: number): number {
  const totalTokens = inputTokens + outputTokens;
  const seconds = totalTokens / TOKENS_PER_SECOND;
  const hours = seconds / 3600;
  return hours * GPU_COST_PER_HOUR;
}

async function chatWithCost(message: string, model = 'llama2') {
  return tracer.trace('ollama-chat-costed', async (span) => {
    const response = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
    });

    const computeCost = estimateComputeCost(
      response.prompt_eval_count || 0,
      response.eval_count || 0
    );

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
      costUsd: computeCost, // Estimated compute cost
    });

    return response.message.content;
  });
}
```
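If you also want to fold electricity into the estimate (one of the infrastructure costs mentioned earlier), a rough figure can be derived from the GPU's power draw and Ollama's reported `total_duration`. The wattage and price constants below are assumptions for illustration only:

```typescript
// Rough electricity estimate from measured inference time (assumed figures)
const GPU_WATTS = 300;                // Assumed average GPU power draw during inference
const ELECTRICITY_USD_PER_KWH = 0.15; // Assumed local electricity price

function estimateElectricityCost(totalDurationNs: number): number {
  const hours = totalDurationNs / 1_000_000_000 / 3600; // Ollama durations are in nanoseconds
  const kWh = (GPU_WATTS / 1000) * hours;
  return kWh * ELECTRICITY_USD_PER_KWH;
}

// Example: combine with the compute estimate above
// const costUsd = estimateComputeCost(inTok, outTok) + estimateElectricityCost(response.total_duration);
```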
Performance Tracking
Track performance metrics specific to local inference:
```typescript
async function chatWithMetrics(message: string, model = 'llama2') {
  return tracer.trace('ollama-metrics', async (span) => {
    const startTime = Date.now();

    const response = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
    });

    const endTime = Date.now();
    const wallClockMs = endTime - startTime;

    // Convert nanoseconds to milliseconds
    const totalDurationMs = response.total_duration / 1_000_000;
    const loadDurationMs = response.load_duration / 1_000_000;
    const evalDurationMs = response.eval_duration / 1_000_000;
    const promptEvalMs = response.prompt_eval_duration / 1_000_000;

    span.setAttributes({
      // Model info
      'ollama.model': model,

      // Token counts
      'ollama.prompt_tokens': response.prompt_eval_count,
      'ollama.completion_tokens': response.eval_count,

      // Timing (in ms)
      'ollama.wall_clock_ms': wallClockMs,
      'ollama.total_duration_ms': totalDurationMs,
      'ollama.load_duration_ms': loadDurationMs,
      'ollama.prompt_eval_ms': promptEvalMs,
      'ollama.eval_duration_ms': evalDurationMs,

      // Performance metrics
      'ollama.tokens_per_second': response.eval_count / (evalDurationMs / 1000),
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
    });

    return response.message.content;
  });
}
```
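Per-request metrics are most useful when aggregated, for example to compare throughput across models over time. A minimal in-memory sketch (not part of the SDK) might look like this:

```typescript
// Minimal in-memory throughput tracker (illustration only)
const throughput: Record<string, { totalTokens: number; totalSeconds: number }> = {};

function recordThroughput(model: string, evalCount: number, evalDurationNs: number) {
  const entry = throughput[model] ?? { totalTokens: 0, totalSeconds: 0 };
  entry.totalTokens += evalCount;
  entry.totalSeconds += evalDurationNs / 1_000_000_000;
  throughput[model] = entry;
}

function averageTokensPerSecond(model: string): number {
  const entry = throughput[model];
  return entry ? entry.totalTokens / entry.totalSeconds : 0;
}

// e.g. recordThroughput(model, response.eval_count, response.eval_duration);
```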
Popular Ollama Models

| Model | Size | Use Case | Speed |
|---|---|---|---|
| llama2:7b | 7B | General chat | Fast |
| llama2:13b | 13B | Better reasoning | Medium |
| llama2:70b | 70B | Complex tasks | Slow |
| codellama | 7-34B | Code generation | Medium |
| mistral | 7B | Efficient general use | Fast |
| mixtral | 8x7B | High quality | Medium |
| phi | 2.7B | Lightweight | Very Fast |
| neural-chat | 7B | Conversational | Fast |
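Which of these models you can actually route to depends on what has been pulled locally. As a small sketch, you can check installed models with the client's `list()` method and fall back when a preferred model is missing (the `pickAvailableModel` helper is illustrative only):

```typescript
// Check which models are installed locally before routing to them.
// Installed names may carry tags (e.g. "llama2:latest"), hence the prefix check.
async function pickAvailableModel(preferred: string, fallback = 'llama2') {
  const { models } = await ollama.list();
  const installed = models.map((m) => m.name);
  return installed.some((name) => name.startsWith(preferred)) ? preferred : fallback;
}

// const model = await pickAvailableModel('mixtral');
```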
Complete Example
```typescript
import 'dotenv/config';
import { LadgerTracer } from '@ladger/sdk';
import { Ollama } from 'ollama';

const tracer = new LadgerTracer({
  apiKey: process.env.LADGER_API_KEY!,
  flowName: 'ollama-demo',
  debug: true,
});

const ollama = new Ollama();

// Cost estimation for self-hosted inference
const COST_CONFIG = {
  gpuCostPerHour: 0.50,
  baselineTokensPerSecond: 30,
  modelSpeedMultipliers: {
    'llama2:7b': 1.5,
    'llama2:13b': 1.0,
    'llama2:70b': 0.3,
    'mistral': 1.8,
    'codellama': 1.2,
  } as Record<string, number>,
};

function estimateCost(model: string, totalTokens: number): number {
  const speedMultiplier = COST_CONFIG.modelSpeedMultipliers[model] || 1.0;
  const tokensPerSecond = COST_CONFIG.baselineTokensPerSecond * speedMultiplier;
  const seconds = totalTokens / tokensPerSecond;
  const hours = seconds / 3600;
  return hours * COST_CONFIG.gpuCostPerHour;
}

async function intelligentChat(message: string) {
  return tracer.trace('intelligent-chat', async (parentSpan) => {
    // Step 1: Quick classification with small model
    const complexity = await tracer.trace('classify', async (span) => {
      const response = await ollama.chat({
        model: 'phi',
        messages: [{
          role: 'user',
          content: `Rate this query complexity as "simple" or "complex": "${message}"`,
        }],
      });

      span.recordCost({
        provider: 'ollama',
        model: 'phi',
        inputTokens: response.prompt_eval_count,
        outputTokens: response.eval_count,
        costUsd: estimateCost('phi',
          (response.prompt_eval_count || 0) + (response.eval_count || 0)
        ),
      });

      return response.message.content.toLowerCase().includes('complex')
        ? 'complex'
        : 'simple';
    }, { parent: parentSpan });

    parentSpan.setAttributes({ complexity });

    // Step 2: Route to appropriate model
    const model = complexity === 'complex' ? 'llama2:13b' : 'llama2:7b';

    // Step 3: Generate response
    return tracer.trace('respond', async (span) => {
      const response = await ollama.chat({
        model,
        messages: [{ role: 'user', content: message }],
      });

      const totalTokens = (response.prompt_eval_count || 0) + (response.eval_count || 0);

      span.setAttributes({
        'model.selected': model,
        'ollama.tokens_per_second': response.eval_count / (response.eval_duration / 1_000_000_000),
      });

      span.recordCost({
        provider: 'ollama',
        model,
        inputTokens: response.prompt_eval_count,
        outputTokens: response.eval_count,
        costUsd: estimateCost(model, totalTokens),
      });

      return response.message.content;
    }, { parent: parentSpan });
  });
}

async function main() {
  console.log('Simple query:');
  console.log(await intelligentChat('What is 2+2?'));

  console.log('\nComplex query:');
  console.log(await intelligentChat('Explain the theory of relativity and its implications for GPS satellites.'));

  await tracer.shutdown();
}

main();
```

Next Steps
- Compare with OpenAI Integration
- Learn about Cost Analysis across providers