Ollama Integration
This guide shows how to integrate Ladger with Ollama for tracking usage of self-hosted models like Llama, Mistral, and CodeLlama.
Why Track Ollama?
Even though Ollama models are “free” (no API costs), tracking usage helps you:
- Monitor infrastructure costs (GPU time, electricity)
- Compare performance across models
- Identify optimization opportunities
- Maintain consistent observability across providers
Installation
```bash
npm install @ladger/sdk ollama
# or
yarn add @ladger/sdk ollama
# or
pnpm add @ladger/sdk ollama
```

Basic Setup
```typescript
import { LadgerTracer } from '@ladger/sdk';
import { Ollama } from 'ollama';

const tracer = new LadgerTracer({
  apiKey: process.env.LADGER_API_KEY!,
  flowName: 'ollama-assistant',
});

const ollama = new Ollama({
  host: 'http://localhost:11434', // Default Ollama host
});
```

Chat Completions
Basic Chat
```typescript
async function chat(message: string, model = 'llama2') {
  return tracer.trace('ollama-chat', async (span) => {
    const response = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
    });

    span.setAttributes({
      'ollama.model': model,
      'ollama.total_duration': response.total_duration,
      'ollama.load_duration': response.load_duration,
      'ollama.eval_duration': response.eval_duration,
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
      // No API cost, but can track compute cost
      costUsd: 0,
    });

    return response.message.content;
  });
}
```

Streaming
```typescript
async function chatStream(message: string, model = 'llama2') {
  return tracer.trace('ollama-stream', async (span) => {
    let inputTokens = 0;
    let outputTokens = 0;
    let content = '';

    const stream = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
      stream: true,
    });

    for await (const chunk of stream) {
      content += chunk.message.content;
      process.stdout.write(chunk.message.content);

      // Final chunk contains usage
      if (chunk.done) {
        inputTokens = chunk.prompt_eval_count || 0;
        outputTokens = chunk.eval_count || 0;
      }
    }

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens,
      outputTokens,
    });

    return content;
  });
}
```

Generate (Completion)
```typescript
async function generate(prompt: string, model = 'codellama') {
  return tracer.trace('ollama-generate', async (span) => {
    const response = await ollama.generate({
      model,
      prompt,
    });

    span.setAttributes({
      'ollama.model': model,
      'ollama.context_size': response.context?.length || 0,
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
    });

    return response.response;
  });
}
```

Embeddings
```typescript
async function createEmbedding(text: string, model = 'nomic-embed-text') {
  return tracer.trace('ollama-embedding', async (span) => {
    const response = await ollama.embeddings({
      model,
      prompt: text,
    });

    span.setAttributes({
      'embedding.model': model,
      'embedding.dimensions': response.embedding.length,
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: text.split(/\s+/).length, // Approximate token count
    });

    return response.embedding;
  });
}
```
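The traced embeddings can be used like any other embedding vectors. As a minimal usage sketch (the `cosineSimilarity` and `rankDocuments` helpers below are illustrations, not part of the SDK), you can rank documents against a query:

```typescript
// Illustrative helper: cosine similarity between two embedding vectors
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

// Rank documents against a query using the traced createEmbedding() above
async function rankDocuments(query: string, documents: string[]) {
  const queryEmbedding = await createEmbedding(query);
  const scored = [];
  for (const doc of documents) {
    const docEmbedding = await createEmbedding(doc);
    scored.push({ doc, score: cosineSimilarity(queryEmbedding, docEmbedding) });
  }
  return scored.sort((a, b) => b.score - a.score);
}
```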
Tracking Compute Costs
For self-hosted models, you might want to track infrastructure costs:
```typescript
// Example: Estimate GPU cost per token
const GPU_COST_PER_HOUR = 0.50; // $0.50/hour for GPU rental
const TOKENS_PER_SECOND = 30;   // Approximate throughput

function estimateComputeCost(inputTokens: number, outputTokens: number): number {
  const totalTokens = inputTokens + outputTokens;
  const seconds = totalTokens / TOKENS_PER_SECOND;
  const hours = seconds / 3600;
  return hours * GPU_COST_PER_HOUR;
}

async function chatWithCost(message: string, model = 'llama2') {
  return tracer.trace('ollama-chat-costed', async (span) => {
    const response = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
    });

    const computeCost = estimateComputeCost(
      response.prompt_eval_count || 0,
      response.eval_count || 0
    );

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
      costUsd: computeCost, // Estimated compute cost
    });

    return response.message.content;
  });
}
```
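If you also want to fold electricity into the estimate (one of the infrastructure costs mentioned earlier), a rough figure can be derived from the GPU's power draw and Ollama's reported `total_duration`. The wattage and price constants below are assumptions for illustration only:

```typescript
// Rough electricity estimate from measured inference time (assumed figures)
const GPU_WATTS = 300;                // Assumed average GPU power draw during inference
const ELECTRICITY_USD_PER_KWH = 0.15; // Assumed local electricity price

function estimateElectricityCost(totalDurationNs: number): number {
  const hours = totalDurationNs / 1_000_000_000 / 3600; // Ollama durations are in nanoseconds
  const kWh = (GPU_WATTS / 1000) * hours;
  return kWh * ELECTRICITY_USD_PER_KWH;
}

// Example: combine with the compute estimate above
// const costUsd = estimateComputeCost(inTok, outTok) + estimateElectricityCost(response.total_duration);
```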
Performance Tracking
Track performance metrics specific to local inference:
```typescript
async function chatWithMetrics(message: string, model = 'llama2') {
  return tracer.trace('ollama-metrics', async (span) => {
    const startTime = Date.now();

    const response = await ollama.chat({
      model,
      messages: [{ role: 'user', content: message }],
    });

    const endTime = Date.now();
    const wallClockMs = endTime - startTime;

    // Convert nanoseconds to milliseconds
    const totalDurationMs = response.total_duration / 1_000_000;
    const loadDurationMs = response.load_duration / 1_000_000;
    const evalDurationMs = response.eval_duration / 1_000_000;
    const promptEvalMs = response.prompt_eval_duration / 1_000_000;

    span.setAttributes({
      // Model info
      'ollama.model': model,

      // Token counts
      'ollama.prompt_tokens': response.prompt_eval_count,
      'ollama.completion_tokens': response.eval_count,

      // Timing (in ms)
      'ollama.wall_clock_ms': wallClockMs,
      'ollama.total_duration_ms': totalDurationMs,
      'ollama.load_duration_ms': loadDurationMs,
      'ollama.prompt_eval_ms': promptEvalMs,
      'ollama.eval_duration_ms': evalDurationMs,

      // Performance metrics
      'ollama.tokens_per_second': response.eval_count / (evalDurationMs / 1000),
    });

    span.recordCost({
      provider: 'ollama',
      model,
      inputTokens: response.prompt_eval_count,
      outputTokens: response.eval_count,
    });

    return response.message.content;
  });
}
```
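Per-request metrics are most useful when aggregated, for example to compare throughput across models over time. A minimal in-memory sketch (not part of the SDK) might look like this:

```typescript
// Minimal in-memory throughput tracker (illustration only)
const throughput: Record<string, { totalTokens: number; totalSeconds: number }> = {};

function recordThroughput(model: string, evalCount: number, evalDurationNs: number) {
  const entry = throughput[model] ?? { totalTokens: 0, totalSeconds: 0 };
  entry.totalTokens += evalCount;
  entry.totalSeconds += evalDurationNs / 1_000_000_000;
  throughput[model] = entry;
}

function averageTokensPerSecond(model: string): number {
  const entry = throughput[model];
  return entry ? entry.totalTokens / entry.totalSeconds : 0;
}

// e.g. recordThroughput(model, response.eval_count, response.eval_duration);
```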
Popular Ollama Models

| Model | Size | Use Case | Speed |
|---|---|---|---|
| llama2:7b | 7B | General chat | Fast |
| llama2:13b | 13B | Better reasoning | Medium |
| llama2:70b | 70B | Complex tasks | Slow |
| codellama | 7-34B | Code generation | Medium |
| mistral | 7B | Efficient general use | Fast |
| mixtral | 8x7B | High quality | Medium |
| phi | 2.7B | Lightweight | Very Fast |
| neural-chat | 7B | Conversational | Fast |
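Which of these models you can actually route to depends on what has been pulled locally. As a small sketch, you can check installed models with the client's `list()` method and fall back when a preferred model is missing (the `pickAvailableModel` helper is illustrative only):

```typescript
// Check which models are installed locally before routing to them.
// Installed names may carry tags (e.g. "llama2:latest"), hence the prefix check.
async function pickAvailableModel(preferred: string, fallback = 'llama2') {
  const { models } = await ollama.list();
  const installed = models.map((m) => m.name);
  return installed.some((name) => name.startsWith(preferred)) ? preferred : fallback;
}

// const model = await pickAvailableModel('mixtral');
```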
Complete Example
```typescript
import 'dotenv/config';
import { LadgerTracer } from '@ladger/sdk';
import { Ollama } from 'ollama';

const tracer = new LadgerTracer({
  apiKey: process.env.LADGER_API_KEY!,
  flowName: 'ollama-demo',
  debug: true,
});

const ollama = new Ollama();

// Cost estimation for self-hosted inference
const COST_CONFIG = {
  gpuCostPerHour: 0.50,
  baselineTokensPerSecond: 30,
  modelSpeedMultipliers: {
    'llama2:7b': 1.5,
    'llama2:13b': 1.0,
    'llama2:70b': 0.3,
    'mistral': 1.8,
    'codellama': 1.2,
  } as Record<string, number>,
};

function estimateCost(model: string, totalTokens: number): number {
  const speedMultiplier = COST_CONFIG.modelSpeedMultipliers[model] || 1.0;
  const tokensPerSecond = COST_CONFIG.baselineTokensPerSecond * speedMultiplier;
  const seconds = totalTokens / tokensPerSecond;
  const hours = seconds / 3600;
  return hours * COST_CONFIG.gpuCostPerHour;
}

async function intelligentChat(message: string) {
  return tracer.trace('intelligent-chat', async (parentSpan) => {
    // Step 1: Quick classification with small model
    const complexity = await tracer.trace('classify', async (span) => {
      const response = await ollama.chat({
        model: 'phi',
        messages: [{
          role: 'user',
          content: `Rate this query complexity as "simple" or "complex": "${message}"`,
        }],
      });

      span.recordCost({
        provider: 'ollama',
        model: 'phi',
        inputTokens: response.prompt_eval_count,
        outputTokens: response.eval_count,
        costUsd: estimateCost('phi',
          (response.prompt_eval_count || 0) + (response.eval_count || 0)
        ),
      });

      return response.message.content.toLowerCase().includes('complex')
        ? 'complex'
        : 'simple';
    }, { parent: parentSpan });

    parentSpan.setAttributes({ complexity });

    // Step 2: Route to appropriate model
    const model = complexity === 'complex' ? 'llama2:13b' : 'llama2:7b';

    // Step 3: Generate response
    return tracer.trace('respond', async (span) => {
      const response = await ollama.chat({
        model,
        messages: [{ role: 'user', content: message }],
      });

      const totalTokens = (response.prompt_eval_count || 0) + (response.eval_count || 0);

      span.setAttributes({
        'model.selected': model,
        'ollama.tokens_per_second': response.eval_count / (response.eval_duration / 1_000_000_000),
      });

      span.recordCost({
        provider: 'ollama',
        model,
        inputTokens: response.prompt_eval_count,
        outputTokens: response.eval_count,
        costUsd: estimateCost(model, totalTokens),
      });

      return response.message.content;
    }, { parent: parentSpan });
  });
}

async function main() {
  console.log('Simple query:');
  console.log(await intelligentChat('What is 2+2?'));

  console.log('\nComplex query:');
  console.log(await intelligentChat('Explain the theory of relativity and its implications for GPS satellites.'));

  await tracer.shutdown();
}

main();
```

Next Steps
- Compare with OpenAI Integration
- Learn about Cost Analysis across providers