Skip to main content

Performance Tuning Guide

Optimize Trinity applications for maximum performance. This guide covers SIMD optimization, memory management, VSA operation tuning, and benchmarking best practices.

Overview

Trinity's balanced ternary architecture provides unique performance characteristics:

AspectPerformanceNotes
Memory Density1.58 bits/trit20x more compact than float32
ComputeAdd-only operationsNo multiplication needed for binding
SIMD PotentialHighTernary operations vectorize well
Cache EfficiencyExcellentPacked representation reduces cache misses

SIMD Optimization

Understanding SIMD in Trinity

SIMD (Single Instruction, Multiple Data) allows processing multiple trits simultaneously. Zig's @Vector() type is the key to unlocking this performance.

Basic SIMD Operations

Vectorized Binding

const std = @import("std");
const vsa = @import("trinity/vsa");

// Scalar reference implementation of element-wise binding (slow baseline).
// Binding two balanced-ternary vectors is per-trit multiplication; values
// stay in {-1, 0, 1}, so the i2 products never overflow.
fn bindScalar(a: []const i2, b: []const i2, result: []i2) void {
    var idx: usize = 0;
    while (idx < a.len) : (idx += 1) {
        result[idx] = a[idx] * b[idx];
    }
}

// SIMD version (8x faster on AVX2): multiplies 16 trits per vector operation.
// Fixed-size array chunks (`[0..16].*`) coerce to @Vector and back, so the
// main loop is branch-free; a scalar tail covers non-multiple-of-16 lengths.
fn bindSIMD(a: []const i2, b: []const i2, result: []i2) void {
    const Lanes = @Vector(16, i2); // process 16 trits at once
    const full_chunks = a.len / 16;

    var chunk: usize = 0;
    while (chunk < full_chunks) : (chunk += 1) {
        const base = chunk * 16;
        const lhs: Lanes = a[base..][0..16].*;
        const rhs: Lanes = b[base..][0..16].*;
        result[base..][0..16].* = lhs * rhs;
    }

    // Scalar tail for the trailing a.len % 16 elements.
    var j: usize = full_chunks * 16;
    while (j < a.len) : (j += 1) {
        result[j] = a[j] * b[j];
    }
}

Vectorized Similarity Calculation

// Cosine similarity with SIMD.
//
// Accumulates the ternary dot product 16 lanes at a time and divides by the
// vector length. NOTE(review): for dense bipolar (+1/-1) vectors dot/len
// equals cosine similarity; vectors containing zeros would need true norms
// -- confirm against the scalar reference before relying on sparse inputs.
fn cosineSimilaritySIMD(a: []const i2, b: []const i2) f64 {
    const Vec = @Vector(16, i32); // i32 lanes so the accumulated dot product cannot overflow
    const len = a.len / 16;

    var dotVec: Vec = @splat(0);
    var i: usize = 0;

    while (i < len) : (i += 1) {
        // @intCast cannot be applied to an array: coerce each 16-trit array
        // chunk to a narrow vector first, then widen element-wise to i32.
        const narrow_a: @Vector(16, i2) = a[i * 16 ..][0..16].*;
        const narrow_b: @Vector(16, i2) = b[i * 16 ..][0..16].*;
        const va: Vec = @intCast(narrow_a);
        const vb: Vec = @intCast(narrow_b);
        dotVec += va * vb;
    }

    // Horizontal reduction of the lane accumulators to a scalar.
    var dotSum: i32 = @reduce(.Add, dotVec);

    // Scalar tail for lengths that are not a multiple of 16.
    for (len * 16..a.len) |j| {
        dotSum += @as(i32, a[j]) * @as(i32, b[j]);
    }

    return @as(f64, @floatFromInt(dotSum)) / @as(f64, @floatFromInt(a.len));
}

SIMD Optimization Tips

TipBenefitExample
Align to 16/32 bytesPrevents cross-cache-line loadsvar data: [1024]i2 align(32) (Zig align attribute; alignas is C++)
Use power-of-2 sizesEnables loop unrollingProcess 16/32/64 trits at once
Prefetch memoryHides latency@prefetch(&data[i + 8], .{}) (takes a pointer and options)
Avoid branchesKeeps vector pipeline fullUse @select or branchless arithmetic instead of if (Zig has no ternary operator)
Batch operationsAmortizes overheadProcess 1000+ trits at once

Compiler Hints

// Tell Zig to generate the fastest possible code for this hot path.
// NOTE(review): @setOptimizationMode is not a Zig builtin -- optimization
// mode is selected per build (-OReleaseFast etc.), not per function, so that
// line has been removed.
fn optimizedBind(a: []const i2, b: []const i2, result: []i2) void {
    // Disables bounds/overflow checks inside this function only. Callers
    // must guarantee a.len == b.len == result.len.
    @setRuntimeSafety(false);

    const Vec = @Vector(32, i2);

    // Placeholder discards until the real body lands (keeps the stub compiling).
    _ = Vec;
    _ = a;
    _ = b;
    _ = result;
    // ... implementation
}

Memory Management

HybridBigInt Memory Efficiency

Trinity's HybridBigInt uses a packed representation that's 20x more memory-efficient than float32 arrays.

Memory Comparison

Representation10,000 TritsMemory
[]f3210,000 × 32-bit40 KB
[]i8 (ternary)10,000 × 8-bit10 KB
HybridBigInt (packed)10,000 × 1.58-bit2 KB

Pool Allocation for Frequent Operations

/// Fixed-capacity pool of pre-sized VSA vectors, avoiding per-operation
/// allocation in hot loops. Not thread-safe.
const VsaPool = struct {
    const VEC_SIZE = 10000;
    const POOL_SIZE = 100;

    allocator: std.mem.Allocator,
    // null = slot free; non-null = slot currently handed out via acquire().
    pool: [POOL_SIZE]?vsa.HybridBigInt,

    fn init(allocator: std.mem.Allocator) VsaPool {
        return .{
            .allocator = allocator,
            .pool = [_]?vsa.HybridBigInt{null} ** POOL_SIZE,
        };
    }

    /// Returns a pointer to a freshly initialized vector stored inside the
    /// pool, or error.PoolExhausted when all slots are taken.
    /// (The original loop captured an unused index `i`, which Zig rejects.)
    fn acquire(pool: *VsaPool) !*vsa.HybridBigInt {
        // Find a free slot.
        for (&pool.pool) |*slot| {
            if (slot.* == null) {
                slot.* = try vsa.HybridBigInt.init(pool.allocator, VEC_SIZE);
                return &slot.*.?;
            }
        }
        return error.PoolExhausted;
    }

    /// Deinitializes `vec` and marks its slot free again. Identification is
    /// by pointer identity -- acquire() returned a pointer into the pool, so
    /// comparing addresses is both correct and cheap. (The original compared
    /// structs with `==`, which Zig does not define for structs.)
    fn release(pool: *VsaPool, vec: *vsa.HybridBigInt) void {
        vec.deinit(pool.allocator);
        for (&pool.pool) |*slot| {
            if (slot.*) |*held| {
                if (held == vec) {
                    slot.* = null;
                    return;
                }
            }
        }
    }
};

Cache-Friendly Data Structures

Structure of Arrays vs. Array of Structures

// BAD: Array of Structures (cache misses).
// Renamed from TrinaryVectorSoA: the original used the same name as the
// "GOOD" struct below (a duplicate-declaration compile error) and the name
// contradicted the comment -- this one is the AoS layout.
const TrinaryVectorAoS = struct {
    data: []vsa.HybridBigInt,
};

// GOOD: Structure of Arrays (cache friendly)
const TrinaryVectorSoA = struct {
    // Store trits contiguously so sequential scans touch adjacent cache lines.
    trits: []i2,
    // Logical trit count -- presumably equals trits.len; TODO confirm whether
    // the real type distinguishes capacity from length.
    length: usize,

    // Frees the trit storage. The struct itself is a value type, so there is
    // nothing else to release.
    fn deinit(self: *const TrinaryVectorSoA, allocator: std.mem.Allocator) void {
        allocator.free(self.trits);
    }
};

Memory Profiling

# Build with memory profiling
zig build -Drelease -Dmemory-profile

# Run with memory tracker
./zig-out/bin/tri --profile-memory

# Analyze heap usage
./zig-out/bin/tri --profile-heap > heap.log
zig tools/analyze-profile heap.log

VSA Operation Optimization

Batch Binding

// Process multiple bindings in one pass for better cache locality.
// The original referenced an `allocator` that was not in scope; it is now an
// explicit parameter, per Zig convention. `keys` and `results` must be the
// same length as `vectors`. On success the caller owns the vectors written
// into `results` and must deinit them.
fn batchBind(
    allocator: std.mem.Allocator,
    vectors: []const vsa.HybridBigInt,
    keys: []const vsa.HybridBigInt,
    results: []vsa.HybridBigInt,
) !void {
    std.debug.assert(keys.len == vectors.len and results.len == vectors.len);

    // Pre-allocate all results first so the bind loop below does no allocation.
    for (0..vectors.len) |i| {
        results[i] = try vsa.HybridBigInt.init(allocator, vectors[i].len);
    }

    // Batch process (better cache locality).
    for (0..vectors.len) |i| {
        _ = try vsa.bind(&vectors[i], &keys[i], &results[i]);
    }
}

Similarity Search Optimization

// Use spatial partitioning (locality-sensitive hashing) for faster
// nearest-neighbor search. Relies on a `computeHash(vec, table_index)`
// helper defined elsewhere in the project.
const LshTable = struct {
    // One hash map per table: bucket hash -> allocated slice of vector indices.
    tables: []std.AutoHashMap(u64, []usize),
    num_tables: usize,
    num_hashes: usize,

    fn init(allocator: std.mem.Allocator, num_tables: usize, num_hashes: usize) !LshTable {
        var tables = try allocator.alloc(std.AutoHashMap(u64, []usize), num_tables);
        for (tables) |*table| {
            table.* = std.AutoHashMap(u64, []usize).init(allocator);
        }
        return .{
            .tables = tables,
            .num_tables = num_tables,
            .num_hashes = num_hashes,
        };
    }

    /// Frees every bucket's index slice, the maps, and the table array.
    fn deinit(lsh: *LshTable, allocator: std.mem.Allocator) void {
        for (lsh.tables) |*table| {
            var it = table.valueIterator();
            while (it.next()) |list| {
                if (list.len > 0) allocator.free(list.*);
            }
            table.deinit();
        }
        allocator.free(lsh.tables);
    }

    /// Records vector `idx` in every table's bucket for `vec`.
    fn insert(lsh: *LshTable, allocator: std.mem.Allocator, idx: usize, vec: *const vsa.HybridBigInt) !void {
        for (0..lsh.num_tables) |t| {
            const hash = computeHash(vec, t);
            const entry = try lsh.tables[t].getOrPut(hash);
            if (!entry.found_existing) {
                entry.value_ptr.* = &[_]usize{};
            }
            // Grow the bucket by one. The previous slice must be freed or it
            // leaks on every append (the original never freed it); the empty
            // literal placeholder was never allocated, so skip freeing len 0.
            const old = entry.value_ptr.*;
            const new_list = try allocator.alloc(usize, old.len + 1);
            @memcpy(new_list[0..old.len], old);
            new_list[old.len] = idx;
            if (old.len > 0) allocator.free(old);
            entry.value_ptr.* = new_list;
        }
    }

    /// Returns the index of the most similar vector among bucket candidates,
    /// or null when no candidate exceeds similarity 0. The original
    /// referenced `allocator` and `vectors` that were not in scope; both are
    /// now explicit parameters.
    fn findNearest(
        lsh: *LshTable,
        allocator: std.mem.Allocator,
        query: *const vsa.HybridBigInt,
        vectors: []const vsa.HybridBigInt,
    ) !?usize {
        var candidates = std.ArrayList(usize).init(allocator);
        defer candidates.deinit();

        // Union of bucket members across all tables (may contain duplicates;
        // harmless since we only track the running best).
        for (0..lsh.num_tables) |t| {
            const hash = computeHash(query, t);
            if (lsh.tables[t].get(hash)) |indices| {
                try candidates.appendSlice(indices);
            }
        }

        // Filter by actual similarity.
        var best_idx: ?usize = null;
        var best_sim: f64 = 0.0;

        for (candidates.items) |idx| {
            const sim = try vsa.cosineSimilarity(query, &vectors[idx]);
            if (sim > best_sim) {
                best_sim = sim;
                best_idx = idx;
            }
        }

        return best_idx;
    }
};

Permutation Caching

// Cache frequently used permutations.
// NOTE(review): `perm_cache` is global mutable state initialized to
// undefined -- something must call .init() before first use; verify at the
// call sites. Keying on the vector's address means a reallocated vector at
// the same address would return a stale permutation -- confirm acceptable.
var perm_cache: std.AutoHashMap(usize, vsa.HybridBigInt) = undefined;

// Returns a permuted copy of `vec` rotated `count` times, serving repeat
// requests from the cache. Caller appears to own the returned vector
// (a clone / fresh result each call) -- TODO confirm against HybridBigInt.
fn getCachedPermutation(vec: *const vsa.HybridBigInt, count: usize) !vsa.HybridBigInt {
    // Cheap cache key: vector address XOR shift count. Assumes vec.ptr is a
    // stable data pointer field on HybridBigInt -- TODO confirm.
    const key = @intFromPtr(vec.ptr) ^ count;

    if (perm_cache.get(key)) |cached| {
        // presumably clone() is infallible here; if it returns an error
        // union this needs `try` -- verify against the vsa API.
        return cached.clone();
    }

    // Miss: compute once, store a clone, hand the original back.
    const result = try vsa.permute(vec, count);
    try perm_cache.put(key, result.clone());
    return result;
}

Benchmarking Guidelines

Microbenchmarking Template

const std = @import("std");
const vsa = @import("trinity/vsa");

/// Microbenchmark for vsa.bind(): reports total time, average ns/op, and
/// ops/sec over `iterations` runs after a 100-iteration warmup.
fn benchmarkBind(allocator: std.mem.Allocator, iterations: usize) !void {
    // Timer must be `var`: lap()/reset() mutate its internal state
    // (the original `const timer` would not compile).
    var timer = try std.time.Timer.start();

    // Setup. Vectors are `var` so deinit can take them by mutable pointer.
    var vec_a = try vsa.HybridBigInt.random(allocator, 10000);
    defer vec_a.deinit(allocator);
    var vec_b = try vsa.HybridBigInt.random(allocator, 10000);
    defer vec_b.deinit(allocator);
    var result = try vsa.HybridBigInt.init(allocator, 10000);
    defer result.deinit(allocator);

    // Warmup: prime caches and the branch predictor before measuring.
    for (0..100) |_| {
        _ = try vsa.bind(&vec_a, &vec_b, &result);
    }

    // Benchmark. reset() discards the warmup time; the original computed
    // `end - start` = benchmark - warmup, which is wrong and can underflow
    // the unsigned nanosecond count when warmup took longer.
    timer.reset();
    for (0..iterations) |_| {
        _ = try vsa.bind(&vec_a, &vec_b, &result);
    }
    const elapsed_ns = timer.read();

    // Results
    const avg_ns = @as(f64, @floatFromInt(elapsed_ns)) / @as(f64, @floatFromInt(iterations));
    const ops_per_sec = 1_000_000_000.0 / avg_ns;

    std.debug.print(
        \\bind() Benchmark:
        \\ Iterations: {d}
        \\ Total time: {d:.2} ms
        \\ Avg/op: {d:.3} ns
        \\ Ops/sec: {d:.0}
        \\
    , .{ iterations, @as(f64, @floatFromInt(elapsed_ns)) / 1_000_000.0, avg_ns, ops_per_sec });
}

Performance Regression Testing

# Create baseline
zig build bench --baseline

# Compare with current
zig build bench --compare

# Output:
# bind(): +2.3% (was 45.2 ns/op, now 46.3 ns/op) [REGRESSION]
# similarity(): -1.8% (was 32.1 ns/op, now 31.5 ns/op) [IMPROVEMENT]

Benchmarking Best Practices

PracticeWhyExample
Warmup iterationsCPU cache and branch predictionRun 100+ iterations before measuring
Statistical significanceVariance in measurementsUse 1000+ iterations, repeat 5+ times
Isolate variablesMeasure one thing at a timeDon't benchmark bind+similarity together
Use realistic dataSynthetic data can misleadUse actual corpus data
Check assemblyVerify compiler optimizationllvm-objdump -d binary (Zig has no built-in objdump subcommand)

Advanced Techniques

Multi-threading for Batch Operations

/// Binds vectors[i] with keys[i] into results[i] across CPU threads.
/// Fixes vs. the original: Zig functions do not capture enclosing scope, so
/// the worker now receives the slices as arguments; `results` (referenced
/// but never declared) is an explicit parameter; and the thread count is
/// clamped so it cannot overflow the fixed thread array or exceed the work.
fn parallelBind(
    vectors: []const vsa.HybridBigInt,
    keys: []const vsa.HybridBigInt,
    results: []vsa.HybridBigInt,
) !void {
    if (vectors.len == 0) return;

    const max_threads = 16; // capacity of the stack-allocated thread array
    const cpu_count = try std.Thread.getCpuCount();
    const num_threads = @min(@max(cpu_count, 1), @min(max_threads, vectors.len));
    const chunk_size = vectors.len / num_threads;

    // Worker is a plain function: everything it touches arrives as a parameter.
    const Worker = struct {
        fn run(
            vecs: []const vsa.HybridBigInt,
            ks: []const vsa.HybridBigInt,
            out: []vsa.HybridBigInt,
            start: usize,
            end: usize,
        ) void {
            for (start..end) |j| {
                // Best-effort within a worker; surface failures via the log
                // rather than tearing down the whole batch.
                _ = vsa.bind(&vecs[j], &ks[j], &out[j]) catch |err| {
                    std.log.err("parallelBind worker failed: {s}", .{@errorName(err)});
                };
            }
        }
    };

    var threads: [max_threads]std.Thread = undefined;

    for (0..num_threads) |i| {
        const start = i * chunk_size;
        // Last thread absorbs the remainder of the division.
        const end = if (i == num_threads - 1) vectors.len else (i + 1) * chunk_size;
        threads[i] = try std.Thread.spawn(.{}, Worker.run, .{ vectors, keys, results, start, end });
    }

    for (threads[0..num_threads]) |t| {
        t.join();
    }
}

GPU Offloading (Future)

// Pseudo-code for GPU acceleration.
// NOTE(review): illustrative only -- `gpu`, `bindKernel`, `results`, and
// `gpu_results` are not defined anywhere in this guide, so this does not
// compile; it sketches the intended copy-launch-copy shape.
fn gpuBindBatch(vectors: []vsa.HybridBigInt, keys: []vsa.HybridBigInt) !void {
    // 1. Copy data to GPU (device buffers are freed on scope exit).
    const gpu_vectors = try gpu.copyToGpu(vectors);
    defer gpu.free(gpu_vectors);
    const gpu_keys = try gpu.copyToGpu(keys);
    defer gpu.free(gpu_keys);

    // 2. Launch kernel
    try gpu.launchKernel(bindKernel, .{ gpu_vectors, gpu_keys });

    // 3. Copy results back
    try gpu.copyFromGpu(results, gpu_results);
}

Performance Checklist

Use this checklist before deploying to production:

  • All hot paths use SIMD operations
  • Memory is aligned to cache line boundaries
  • Object pools for frequent allocations
  • Benchmark suite covers critical paths
  • Performance regression tests pass
  • Memory usage is stable (no leaks)
  • CPU utilization is >70% (not bottlenecked)
  • Cache hit rate is >80%

Further Reading


Need more performance tips? Check the community forum or open a GitHub issue.