Performance Tuning Guide
Optimize Trinity applications for maximum performance. This guide covers SIMD optimization, memory management, VSA operation tuning, and benchmarking best practices.
Overview
Trinity's balanced ternary architecture provides unique performance characteristics:
| Aspect | Performance | Notes |
|---|---|---|
| Memory Density | 1.58 bits/trit | 20x more compact than float32 |
| Compute | Add-only operations | No multiplication needed for binding |
| SIMD Potential | High | Ternary operations vectorize well |
| Cache Efficiency | Excellent | Packed representation reduces cache misses |
SIMD Optimization
Understanding SIMD in Trinity
SIMD (Single Instruction, Multiple Data) allows processing multiple trits simultaneously. Zig's @Vector() type is the key to unlocking this performance.
Basic SIMD Operations
Vectorized Binding
const std = @import("std");
const vsa = @import("trinity/vsa");
// Scalar version (slow)
// Scalar reference implementation: VSA binding is elementwise trit
// multiplication, so one multiply per trit, no vectorization.
fn bindScalar(a: []const i2, b: []const i2, result: []i2) void {
    for (0..a.len) |idx| {
        result[idx] = a[idx] * b[idx];
    }
}
// SIMD version (8x faster on AVX2)
// SIMD version (8x faster on AVX2): multiplies 16 trits per iteration,
// then finishes any trailing a.len % 16 trits with a scalar loop.
fn bindSIMD(a: []const i2, b: []const i2, result: []i2) void {
    const lanes = 16; // comptime-known, so slices below coerce to *[16]i2
    const Vec = @Vector(lanes, i2);
    const full_chunks = a.len / lanes;

    var chunk: usize = 0;
    while (chunk < full_chunks) : (chunk += 1) {
        const base = chunk * lanes;
        const lhs: Vec = a[base..][0..lanes].*;
        const rhs: Vec = b[base..][0..lanes].*;
        result[base..][0..lanes].* = lhs * rhs;
    }

    // Scalar tail for lengths that are not a multiple of 16.
    for (full_chunks * lanes..a.len) |idx| {
        result[idx] = a[idx] * b[idx];
    }
}
Vectorized Similarity Calculation
// Cosine similarity with SIMD
// Cosine similarity with SIMD.
// NOTE: dividing the dot product by a.len equals cosine similarity only for
// dense ternary vectors (every trit is +/-1); sparse vectors (with zeros)
// would need true magnitude normalization.
fn cosineSimilaritySIMD(a: []const i2, b: []const i2) f64 {
    std.debug.assert(a.len == b.len);
    if (a.len == 0) return 0.0; // avoid 0/0 for empty input

    const Vec = @Vector(16, i32); // Use i32 to avoid overflow
    const chunks = a.len / 16;
    var dot_vec: Vec = @splat(0);
    var i: usize = 0;
    while (i < chunks) : (i += 1) {
        // @intCast needs an int/vector operand, not a [16]i2 array:
        // load the chunk as a trit vector first, then widen to i32 lanes.
        const va: Vec = @intCast(@as(@Vector(16, i2), a[i * 16 ..][0..16].*));
        const vb: Vec = @intCast(@as(@Vector(16, i2), b[i * 16 ..][0..16].*));
        dot_vec += va * vb;
    }
    // Horizontal reduction of the 16 lanes to a scalar.
    var dot_sum: i32 = @reduce(.Add, dot_vec);
    // Scalar tail for lengths that are not a multiple of 16.
    for (chunks * 16..a.len) |j| {
        dot_sum += @as(i32, a[j]) * @as(i32, b[j]);
    }
    return @as(f64, @floatFromInt(dot_sum)) / @as(f64, @floatFromInt(a.len));
}
SIMD Optimization Tips
| Tip | Benefit | Example |
|---|---|---|
| Align to 16/32 bytes | Prevents cross-cache-line loads | var data: [1024]i2 align(32) = undefined (Zig uses align(N), not C++ alignas) |
| Use power-of-2 sizes | Enables loop unrolling | Process 16/32/64 trits at once |
| Prefetch memory | Hides latency | @prefetch(&data[i + 8], .{}) — Zig's @prefetch takes a pointer plus an options struct |
| Avoid branches | Keeps vector pipeline full | Zig has no ternary operator — use if expressions or @select for branchless lane selection |
| Batch operations | Amortizes overhead | Process 1000+ trits at once |
Compiler Hints
// Tell Zig to vectorize
// Vectorized binding with runtime safety disabled in the hot path.
// NOTE: there is no `@setOptimizationMode` builtin in Zig — the optimization
// level comes from the build mode (`zig build -Doptimize=ReleaseFast`).
fn optimizedBind(a: []const i2, b: []const i2, result: []i2) void {
    @setRuntimeSafety(false); // Disable bounds checking in this scope
    const lanes = 32;
    const Vec = @Vector(lanes, i2);
    const chunks = a.len / lanes;
    var i: usize = 0;
    while (i < chunks) : (i += 1) {
        const va: Vec = a[i * lanes ..][0..lanes].*;
        const vb: Vec = b[i * lanes ..][0..lanes].*;
        result[i * lanes ..][0..lanes].* = va * vb;
    }
    // Scalar tail for the trailing a.len % 32 trits.
    for (chunks * lanes..a.len) |j| {
        result[j] = a[j] * b[j];
    }
}
Memory Management
HybridBigInt Memory Efficiency
Trinity's HybridBigInt uses a packed representation that's 20x more memory-efficient than float32 arrays.
Memory Comparison
| Representation | 10,000 Trits | Memory |
|---|---|---|
[]f32 | 10,000 × 32-bit | 40 KB |
[]i8 (ternary) | 10,000 × 8-bit | 10 KB |
HybridBigInt (packed) | 10,000 × 1.58-bit | 2 KB |
Pool Allocation for Frequent Operations
/// Fixed-size pool of pre-sized VSA vectors, avoiding per-operation heap
/// allocation in hot loops. Not thread-safe.
const VsaPool = struct {
    const VEC_SIZE = 10000;
    const POOL_SIZE = 100;

    allocator: std.mem.Allocator,
    // null = free slot; non-null = vector currently handed out via acquire().
    pool: [POOL_SIZE]?vsa.HybridBigInt,

    fn init(allocator: std.mem.Allocator) VsaPool {
        return .{
            .allocator = allocator,
            .pool = [_]?vsa.HybridBigInt{null} ** POOL_SIZE,
        };
    }

    /// Hand out a vector stored in the first free slot.
    /// Returns error.PoolExhausted when all POOL_SIZE slots are in use.
    fn acquire(pool: *VsaPool) !*vsa.HybridBigInt {
        // Original captured an unused index `i`, which Zig rejects;
        // iterate over slots only.
        for (&pool.pool) |*slot| {
            if (slot.* == null) {
                slot.* = try vsa.HybridBigInt.init(pool.allocator, VEC_SIZE);
                return &slot.*.?;
            }
        }
        return error.PoolExhausted;
    }

    /// Return a vector previously obtained from acquire(). Identified by
    /// pointer identity (acquire returns a pointer into the pool), so the
    /// slot is found *before* deinit — the original deinit'd first and then
    /// compared against freed state.
    fn release(pool: *VsaPool, vec: *vsa.HybridBigInt) void {
        for (&pool.pool) |*slot| {
            if (slot.*) |*held| {
                if (held == vec) {
                    held.deinit(pool.allocator);
                    slot.* = null;
                    return;
                }
            }
        }
    }
};
Cache-Friendly Data Structures
Structure of Arrays vs. Array of Structures
// BAD: Array of Structures (cache misses)
// Renamed from the duplicate `TrinaryVectorSoA` — two structs in one scope
// cannot share a name, and this one is the Array-of-Structures counterexample.
const TrinaryVectorAoS = struct {
    // One heap object per vector means pointer chasing and scattered cache lines.
    data: []vsa.HybridBigInt,
};
// GOOD: Structure of Arrays (cache friendly)
const TrinaryVectorSoA = struct {
    // All trits live in one contiguous buffer, so sequential scans stream
    // through whole cache lines instead of chasing per-vector allocations.
    trits: []i2,
    length: usize,

    /// Release the trit buffer; the struct itself is a value type.
    fn deinit(self: *const TrinaryVectorSoA, allocator: std.mem.Allocator) void {
        allocator.free(self.trits);
    }
};
Memory Profiling
# Build with memory profiling
zig build -Drelease -Dmemory-profile
# Run with memory tracker
./zig-out/bin/tri --profile-memory
# Analyze heap usage
./zig-out/bin/tri --profile-heap > heap.log
zig tools/analyze-profile heap.log
VSA Operation Optimization
Batch Binding
// Process multiple bindings in one pass
/// Bind vectors[i] with keys[i] into results[i] in one pass for better cache
/// locality. All three slices must have the same length.
/// The original referenced an undeclared `allocator`; it is now an explicit
/// parameter, matching Zig convention (allocating functions take Allocator).
/// On error, any results already initialized here are deinitialized again.
fn batchBind(
    allocator: std.mem.Allocator,
    vectors: []const vsa.HybridBigInt,
    keys: []const vsa.HybridBigInt,
    results: []vsa.HybridBigInt,
) !void {
    std.debug.assert(vectors.len == keys.len and vectors.len == results.len);

    // Pre-allocate all results; clean up partial work on the error path.
    var initialized: usize = 0;
    errdefer for (results[0..initialized]) |*r| r.deinit(allocator);
    for (0..vectors.len) |i| {
        results[i] = try vsa.HybridBigInt.init(allocator, vectors[i].len);
        initialized += 1;
    }
    // Batch process (better cache locality).
    for (0..vectors.len) |i| {
        _ = try vsa.bind(&vectors[i], &keys[i], &results[i]);
    }
}
Similarity Search Optimization
// Use spatial partitioning for faster nearest-neighbor search
/// Locality-sensitive hashing index for approximate nearest-neighbor search.
/// Each of `num_tables` hash tables maps a bucket hash to a heap-allocated
/// list of vector indices. `computeHash` is defined elsewhere in this file.
const LshTable = struct {
    tables: []std.AutoHashMap(u64, []usize),
    num_tables: usize,
    num_hashes: usize,

    fn init(allocator: std.mem.Allocator, num_tables: usize, num_hashes: usize) !LshTable {
        const tables = try allocator.alloc(std.AutoHashMap(u64, []usize), num_tables);
        for (tables) |*table| {
            table.* = std.AutoHashMap(u64, []usize).init(allocator);
        }
        return .{
            .tables = tables,
            .num_tables = num_tables,
            .num_hashes = num_hashes,
        };
    }

    /// Record vector `idx` in every table's bucket for `vec`.
    fn insert(lsh: *LshTable, allocator: std.mem.Allocator, idx: usize, vec: *const vsa.HybridBigInt) !void {
        for (0..lsh.num_tables) |t| {
            const hash = computeHash(vec, t);
            const entry = try lsh.tables[t].getOrPut(hash);
            if (!entry.found_existing) {
                // Empty sentinel slice — points at a comptime constant, never freed.
                entry.value_ptr.* = &[_]usize{};
            }
            // Grow-by-one append. The original leaked the previous list on
            // every insert; free it after copying (len 0 means the sentinel).
            const old = entry.value_ptr.*;
            const new_list = try allocator.alloc(usize, old.len + 1);
            @memcpy(new_list[0..old.len], old);
            new_list[old.len] = idx;
            entry.value_ptr.* = new_list;
            if (old.len > 0) allocator.free(old);
        }
    }

    /// Return the index of the stored vector most similar to `query`, or null
    /// if no candidate bucket matched. `allocator` (scratch for the candidate
    /// list) and `vectors` (the indexed corpus) were undeclared globals in the
    /// original and are now explicit parameters.
    fn findNearest(
        lsh: *LshTable,
        allocator: std.mem.Allocator,
        query: *const vsa.HybridBigInt,
        vectors: []const vsa.HybridBigInt,
    ) !?usize {
        var candidates = std.ArrayList(usize).init(allocator);
        defer candidates.deinit();
        // Union of every table's bucket for the query hash.
        for (0..lsh.num_tables) |t| {
            const hash = computeHash(query, t);
            if (lsh.tables[t].get(hash)) |indices| {
                try candidates.appendSlice(indices);
            }
        }
        // Filter candidates by actual similarity (LSH gives false positives).
        var best_idx: ?usize = null;
        var best_sim: f64 = 0.0;
        for (candidates.items) |idx| {
            const sim = try vsa.cosineSimilarity(query, &vectors[idx]);
            if (sim > best_sim) {
                best_sim = sim;
                best_idx = idx;
            }
        }
        return best_idx;
    }
};
Permutation Caching
// Cache frequently used permutations
// Module-level mutable cache of permutation results.
// NOTE(review): starts `undefined` — must be initialized (e.g. in main)
// before the first call to getCachedPermutation, and entries are never
// evicted, so the cache grows without bound.
var perm_cache: std.AutoHashMap(usize, vsa.HybridBigInt) = undefined;

/// Return the permutation of `vec` by `count`, memoizing results.
/// On a cache hit the caller receives a clone of the cached vector; on a miss
/// the fresh result is returned and a clone is stored. Caller owns the return.
/// NOTE(review): the key XORs the vector's data pointer with `count` — this
/// assumes `vec.ptr` exists on HybridBigInt and stays stable for the vector's
/// lifetime (a freed-and-reused allocation would alias a stale entry), and
/// `ptr ^ count` can collide for distinct (vec, count) pairs — TODO confirm.
fn getCachedPermutation(vec: *const vsa.HybridBigInt, count: usize) !vsa.HybridBigInt {
    const key = @intFromPtr(vec.ptr) ^ count;
    if (perm_cache.get(key)) |cached| {
        // presumably clone() cannot fail (no `try` here) — verify against the
        // HybridBigInt API.
        return cached.clone();
    }
    const result = try vsa.permute(vec, count);
    try perm_cache.put(key, result.clone());
    return result;
}
Benchmarking Guidelines
Microbenchmarking Template
const std = @import("std");
const vsa = @import("trinity/vsa");
/// Microbenchmark vsa.bind(): warm up, time `iterations` calls, and print
/// total time, average ns/op, and ops/sec to stderr.
fn benchmarkBind(allocator: std.mem.Allocator, iterations: usize) !void {
    // Timer must be `var`: lap()/read()/reset() take *Timer, so the original
    // `const timer` did not compile.
    var timer = try std.time.Timer.start();
    // Setup
    var vec_a = try vsa.HybridBigInt.random(allocator, 10000);
    defer vec_a.deinit(allocator);
    var vec_b = try vsa.HybridBigInt.random(allocator, 10000);
    defer vec_b.deinit(allocator);
    var result = try vsa.HybridBigInt.init(allocator, 10000);
    defer result.deinit(allocator);
    // Warmup: fill caches and train the branch predictor before measuring.
    for (0..100) |_| {
        _ = try vsa.bind(&vec_a, &vec_b, &result);
    }
    if (iterations == 0) return; // avoid dividing by zero below
    // Benchmark: reset once, run, read once. The original mixed lap() (which
    // resets the timer) with read() and subtracted them, giving wrong elapsed.
    timer.reset();
    for (0..iterations) |_| {
        _ = try vsa.bind(&vec_a, &vec_b, &result);
    }
    const elapsed_ns = timer.read();
    // Results
    const avg_ns = @as(f64, @floatFromInt(elapsed_ns)) / @as(f64, @floatFromInt(iterations));
    const ops_per_sec = 1_000_000_000.0 / avg_ns;
    std.debug.print(
        \\bind() Benchmark:
        \\ Iterations: {d}
        \\ Total time: {d:.2} ms
        \\ Avg/op: {d:.3} ns
        \\ Ops/sec: {d:.0}
        \\
    , .{ iterations, @as(f64, @floatFromInt(elapsed_ns)) / 1_000_000.0, avg_ns, ops_per_sec });
}
Performance Regression Testing
# Create baseline
zig build bench --baseline
# Compare with current
zig build bench --compare
# Output:
# bind(): +2.3% (was 45.2 ns/op, now 46.3 ns/op) [REGRESSION]
# similarity(): -1.8% (was 32.1 ns/op, now 31.5 ns/op) [IMPROVEMENT]
Benchmarking Best Practices
| Practice | Why | Example |
|---|---|---|
| Warmup iterations | CPU cache and branch prediction | Run 100+ iterations before measuring |
| Statistical significance | Variance in measurements | Use 1000+ iterations, repeat 5+ times |
| Isolate variables | Measure one thing at a time | Don't benchmark bind+similarity together |
| Use realistic data | Synthetic data can mislead | Use actual corpus data |
| Check assembly | Verify compiler optimization | objdump -d ./zig-out/bin/tri (objdump is a system tool; zig has no objdump subcommand) |
Advanced Techniques
Multi-threading for Batch Operations
/// Bind vectors[i] with keys[i] into results[i] across CPU threads.
/// Fixes vs. the original: `results` was an undeclared global (now a
/// parameter); the nested worker captured outer slices, which Zig nested
/// functions cannot do (slices are now passed as arguments); the fixed
/// 16-entry thread array could be overrun when the CPU count exceeds 16.
fn parallelBind(
    vectors: []const vsa.HybridBigInt,
    keys: []const vsa.HybridBigInt,
    results: []vsa.HybridBigInt,
) !void {
    const Worker = struct {
        // Errors cannot cross the thread boundary; log and abandon the chunk.
        fn run(vecs: []const vsa.HybridBigInt, ks: []const vsa.HybridBigInt, outs: []vsa.HybridBigInt) void {
            for (vecs, ks, outs) |*v, *k, *o| {
                _ = vsa.bind(v, k, o) catch |err| {
                    std.log.err("parallelBind worker failed: {s}", .{@errorName(err)});
                    return;
                };
            }
        }
    };

    var threads: [16]std.Thread = undefined;
    const cpu_count = try std.Thread.getCpuCount();
    // Never more threads than slots, and never more threads than work items.
    const num_threads = @min(@min(cpu_count, threads.len), @max(vectors.len, 1));
    const chunk_size = (vectors.len + num_threads - 1) / num_threads; // ceil
    var spawned: usize = 0;
    var start: usize = 0;
    while (start < vectors.len) : (start += chunk_size) {
        const end = @min(start + chunk_size, vectors.len);
        threads[spawned] = try std.Thread.spawn(
            .{},
            Worker.run,
            .{ vectors[start..end], keys[start..end], results[start..end] },
        );
        spawned += 1;
    }
    for (threads[0..spawned]) |t| {
        t.join();
    }
}
GPU Offloading (Future)
// Pseudo-code for GPU acceleration
/// Illustrative pseudo-code only — the `gpu` module and `bindKernel` do not
/// exist, and `results` / `gpu_results` are undeclared; this sketch will not
/// compile as written.
fn gpuBindBatch(vectors: []vsa.HybridBigInt, keys: []vsa.HybridBigInt) !void {
    // 1. Copy data to GPU
    const gpu_vectors = try gpu.copyToGpu(vectors);
    defer gpu.free(gpu_vectors);
    const gpu_keys = try gpu.copyToGpu(keys);
    defer gpu.free(gpu_keys);
    // 2. Launch kernel
    try gpu.launchKernel(bindKernel, .{ gpu_vectors, gpu_keys });
    // 3. Copy results back
    // NOTE(review): a real API would allocate a device output buffer in step 1
    // and take a destination slice as a parameter; `results` / `gpu_results`
    // currently come from nowhere.
    try gpu.copyFromGpu(results, gpu_results);
}
Performance Checklist
Use this checklist before deploying to production:
- All hot paths use SIMD operations
- Memory is aligned to cache line boundaries
- Object pools for frequent allocations
- Benchmark suite covers critical paths
- Performance regression tests pass
- Memory usage is stable (no leaks)
- CPU utilization is >70% (not bottlenecked)
- Cache hit rate is >80%
Further Reading
- VSA API Reference — Core operations and signatures
- Benchmarks — Performance metrics and comparisons
- JIT Performance Guide — Just-in-time compilation
- Memory Efficiency Report — Detailed analysis
Need more performance tips? Check the community forum or open a GitHub issue.