Lux Docs

Performance Characteristics

Detailed performance analysis and optimization guide for Lattice

Performance Characteristics

Comprehensive performance analysis, benchmarks, and optimization strategies for the Lattice library.

Benchmark Results

Test Environment

CPU: Intel Core i9-12900K @ 3.2GHz (24 threads)
RAM: 64GB DDR5-5600
OS: Ubuntu 22.04 LTS
Go: 1.21.5
Build: go build (standard gc toolchain; note Go has no -O optimization levels)

CKKS Performance Benchmarks

Basic Operations (N=2^14, LogQP=438)

| Operation | Time (ms) | Throughput (ops/s) | Memory (MB) | CPU Usage |
|---|---|---|---|---|
| **Key Generation** | | | | |
| Secret Key | 8.2 | 122 | 4 | 100% |
| Public Key | 45.3 | 22 | 128 | 100% |
| Relin Key | 124.5 | 8 | 384 | 100% |
| Galois Keys (15) | 1,875 | 0.5 | 5,760 | 100% |
| **Encoding/Encryption** | | | | |
| Encode | 0.8 | 1,250 | 2 | 100% |
| Encrypt | 2.3 | 435 | 8 | 100% |
| Decrypt | 1.8 | 555 | 8 | 100% |
| Decode | 0.7 | 1,428 | 2 | 100% |
| **Arithmetic** | | | | |
| Addition | 0.05 | 20,000 | 16 | 25% |
| Subtraction | 0.05 | 20,000 | 16 | 25% |
| Multiplication | 25.4 | 39 | 24 | 100% |
| Relinearization | 18.2 | 55 | 32 | 100% |
| Rescaling | 0.8 | 1,250 | 16 | 50% |
| **Rotations** | | | | |
| Rotate by 1 | 8.1 | 123 | 16 | 100% |
| Rotate by N/2 | 8.3 | 120 | 16 | 100% |
| Conjugation | 8.0 | 125 | 16 | 100% |

Advanced Operations

| Operation | Time (ms) | Memory (MB) | Notes |
|---|---|---|---|
| **Polynomial Evaluation** | | | |
| Degree 3 | 78 | 96 | 3 multiplications |
| Degree 7 | 235 | 224 | 7 multiplications |
| Degree 15 | 548 | 480 | 15 multiplications |
| **Bootstrapping** | | | |
| Single slot | 850 | 512 | 128-bit precision |
| Full packed | 1,200 | 768 | N slots |
| Batch (8 ct) | 8,500 | 4,096 | Amortized: 1,062 ms/ct |
| **Linear Transformations** | | | |
| Diagonal | 8 | 32 | Single rotation |
| Banded (w=16) | 128 | 512 | 16 rotations |
| Dense | 2,048 | 8,192 | N rotations |

BGV Performance Benchmarks

Basic Operations (N=2^14, LogQP=438, t=65537)

| Operation | Time (ms) | Throughput (ops/s) | vs CKKS |
|---|---|---|---|
| Encode | 0.3 | 3,333 | 2.7x faster |
| Encrypt | 2.1 | 476 | 1.1x faster |
| Decrypt | 1.6 | 625 | 1.1x faster |
| Decode | 0.2 | 5,000 | 3.5x faster |
| Addition | 0.04 | 25,000 | 1.25x faster |
| Multiplication | 24.8 | 40 | 1.02x faster |

Multiparty Protocol Benchmarks

| Protocol | Parties | Time (ms) | Communication (MB) | Rounds |
|---|---|---|---|---|
| **Threshold Key Gen** | | | | |
| TKG | 3 | 156 | 1.2 | 1 |
| TKG | 5 | 267 | 2.0 | 1 |
| TKG | 10 | 542 | 4.0 | 1 |
| **Collective Decrypt** | | | | |
| PCKS | 3 | 45 | 0.4 | 1 |
| PCKS | 5 | 78 | 0.67 | 1 |
| PCKS | 10 | 163 | 1.34 | 1 |
| **Collective Relin** | | | | |
| RKG | 3 | 378 | 3.6 | 2 |
| RKG | 5 | 634 | 6.0 | 2 |
| RKG | 10 | 1,285 | 12.0 | 2 |

Memory Profiling

Memory Usage by Component

// CKKS Memory Footprint (N=2^14)
Secret Key:        4 MB
Public Key:      128 MB
Relin Key:       384 MB
Galois Key:      384 MB per element
Ciphertext:       16 MB (per level)
Plaintext:         8 MB
Bootstrapping:   512 MB (temporary)

Memory Optimization Strategies

1. Object Pooling

// CiphertextPool recycles ciphertexts via sync.Pool to avoid repeated
// large allocations (a ciphertext at N=2^14 is ~16 MB per level) and the
// GC pressure they cause in hot loops.
type CiphertextPool struct {
    // params is retained so Put can reset a returned ciphertext's level;
    // the original snippet referenced an out-of-scope `params` there.
    params Parameters
    pool   sync.Pool
}

// NewCiphertextPool builds a pool whose New hook allocates fresh
// degree-1 ciphertexts at the maximum level of params.
func NewCiphertextPool(params Parameters) *CiphertextPool {
    return &CiphertextPool{
        params: params,
        pool: sync.Pool{
            New: func() interface{} {
                return ckks.NewCiphertext(params, 1, params.MaxLevel())
            },
        },
    }
}

// Get returns a ready-to-use ciphertext, allocating one if the pool is empty.
func (p *CiphertextPool) Get() *Ciphertext {
    return p.pool.Get().(*Ciphertext)
}

// Put resets the ciphertext's level to the maximum and returns it to the
// pool for reuse.
func (p *CiphertextPool) Put(ct *Ciphertext) {
    ct.Level = p.params.MaxLevel() // Reset level before recycling
    p.pool.Put(ct)
}

2. In-Place Operations

// Prefer in-place operations to reduce allocations
// Bad: Creates new ciphertext
ctSum, _ := evaluator.AddNew(ct1, ct2)

// Good: Reuses existing ciphertext
evaluator.Add(ct1, ct2, ct1) // Result stored in ct1

3. Lazy Allocation

// LazyEvaluator allocates evaluation keys only when they are first needed,
// avoiding the up-front cost and memory of generating every Galois key.
type LazyEvaluator struct {
    params Parameters
    rlk    *RelinearizationKey
    gks    map[uint64]*GaloisKey // lazily generated, keyed by Galois element
    mu     sync.RWMutex          // guards gks
}

// GetGaloisKey returns the key for galEl, generating and caching it on a
// miss. Safe for concurrent use.
func (e *LazyEvaluator) GetGaloisKey(galEl uint64) *GaloisKey {
    // Fast path: shared lock for the common cache hit.
    e.mu.RLock()
    if gk, ok := e.gks[galEl]; ok {
        e.mu.RUnlock()
        return gk
    }
    e.mu.RUnlock()

    e.mu.Lock()
    defer e.mu.Unlock()
    // Re-check under the write lock: another goroutine may have generated
    // the key between RUnlock and Lock; without this check the key would
    // be generated twice and the first copy silently overwritten.
    if gk, ok := e.gks[galEl]; ok {
        return gk
    }
    gk := kgen.GenGaloisKeyNew(galEl, sk)
    e.gks[galEl] = gk
    return gk
}

CPU Optimization

Parallelization Strategies

1. Parallel NTT

// NTT operations are parallelized internally
// Controlled by GOMAXPROCS
runtime.GOMAXPROCS(runtime.NumCPU())

// For N=2^14, NTT parallelization:
// 1 thread:  4.2ms
// 4 threads: 1.3ms (3.2x speedup)
// 8 threads: 0.8ms (5.3x speedup)
// 16 threads: 0.6ms (7x speedup)

2. Batch Processing

// ParallelEvaluate applies f to every ciphertext concurrently, waiting for
// all goroutines to finish and returning the first error observed (nil when
// every call succeeds).
func ParallelEvaluate(eval Evaluator, cts []*Ciphertext, f func(*Ciphertext) error) error {
    var wg sync.WaitGroup
    // Buffer sized so every worker can report an error without blocking.
    errChan := make(chan error, len(cts))

    wg.Add(len(cts))
    for _, ct := range cts {
        go func(c *Ciphertext) {
            defer wg.Done()
            if err := f(c); err != nil {
                errChan <- err
            }
        }(ct)
    }

    wg.Wait()
    close(errChan)

    // After close, a successful run leaves the channel empty.
    if err, ok := <-errChan; ok {
        return err
    }
    return nil
}

3. SIMD Vectorization

// Lattice automatically uses SIMD when available
// Check CPU features
import "golang.org/x/sys/cpu"

if cpu.X86.HasAVX512 {
    // AVX-512: 16x speedup for modular arithmetic
} else if cpu.X86.HasAVX2 {
    // AVX2: 8x speedup
} else if cpu.X86.HasSSE42 {
    // SSE4.2: 4x speedup
}

Profiling Tools

CPU Profiling

import "runtime/pprof"

// Start CPU profiling
f, _ := os.Create("cpu.prof")
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()

// Run your computation
// ...

// Analyze with: go tool pprof cpu.prof

Memory Profiling

// Memory profiling
f, _ := os.Create("mem.prof")
defer f.Close()
runtime.GC()
pprof.WriteHeapProfile(f)

// Analyze with: go tool pprof mem.prof

Network Optimization

Serialization Performance

| Format | Size (MB) | Encode (ms) | Decode (ms) |
|---|---|---|---|
| Binary | 16.0 | 2.1 | 1.8 |
| Compressed | 5.8 | 45.2 | 38.6 |
| Custom RLE | 8.2 | 8.5 | 7.3 |

// SerializeCiphertext gob-encodes the ciphertext through a gzip writer so
// the compressed bytes accumulate in buf. The original version created an
// unused encoder bound directly to buf and ignored gzip's Close error —
// Close is where the final compressed block is flushed, so an unchecked
// error there can yield truncated output.
func SerializeCiphertext(ct *Ciphertext) ([]byte, error) {
    var buf bytes.Buffer

    // Compress for network transfer; gob writes into the gzip stream.
    zw := gzip.NewWriter(&buf)
    if err := gob.NewEncoder(zw).Encode(ct); err != nil {
        zw.Close()
        return nil, err
    }

    // Close flushes the remaining compressed data; its error must be checked.
    if err := zw.Close(); err != nil {
        return nil, err
    }
    return buf.Bytes(), nil
}

Bandwidth Optimization

// CompressedShare minimizes data transfer in multiparty protocols: a
// zlib-compressed serialized share plus a SHA-256 digest the receiver can
// use to verify integrity.
type CompressedShare struct {
    Data []byte   // zlib-compressed share bytes
    Hash [32]byte // SHA-256 of Data
}

// CompressShare serializes, compresses, and fingerprints a share for
// network transfer. All compression errors are now surfaced instead of
// being silently ignored as in the original snippet.
func CompressShare(share *Share) (*CompressedShare, error) {
    // Serialize
    data, err := share.MarshalBinary()
    if err != nil {
        return nil, err
    }

    // Compress
    var buf bytes.Buffer
    zw, err := zlib.NewWriterLevel(&buf, zlib.BestCompression)
    if err != nil {
        return nil, err
    }
    if _, err := zw.Write(data); err != nil {
        zw.Close()
        return nil, err
    }
    // Close flushes the final compressed block; an ignored error here can
    // produce a truncated share.
    if err := zw.Close(); err != nil {
        return nil, err
    }

    // Hash for integrity
    hash := sha256.Sum256(buf.Bytes())

    return &CompressedShare{
        Data: buf.Bytes(),
        Hash: hash,
    }, nil
}

Parameter Selection for Performance

Ring Dimension Trade-offs

| N | Security | Slots | Performance | Memory | Use Case |
|---|---|---|---|---|---|
| 2^10 | Low | 512 | Fastest | 0.25 MB | Testing |
| 2^12 | 80-bit | 2,048 | Very Fast | 1 MB | Light crypto |
| 2^14 | 128-bit | 8,192 | Fast | 4 MB | Standard |
| 2^15 | 192-bit | 16,384 | Moderate | 8 MB | High security |
| 2^16 | 256-bit | 32,768 | Slow | 16 MB | Maximum security |
| 2^17 | 256-bit+ | 65,536 | Very Slow | 32 MB | Research |

Modulus Chain Optimization

// OptimizeModulusChain builds a modulus chain sized for the given
// multiplicative depth and rescaling frequency. Each multiplication
// consumes roughly 60 bits of modulus and each rescale drops one prime,
// so the chain holds multDepth+rescaleFreq+1 primes: a 60-bit first prime
// to absorb initial noise, 50-bit middle primes, and a 40-bit final prime.
func OptimizeModulusChain(multDepth int, rescaleFreq int) []int {
    last := multDepth + rescaleFreq
    chain := make([]int, last+1)
    for i := range chain {
        switch {
        case i == last:
            chain[i] = 40 // final prime: smaller is acceptable
        case i == 0:
            chain[i] = 60 // first prime: larger for initial noise
        default:
            chain[i] = 50 // middle primes: standard size
        }
    }
    return chain
}

Optimization Checklist

Before Optimization

  • Profile to identify bottlenecks
  • Measure baseline performance
  • Set performance targets
  • Understand algorithmic complexity

Memory Optimizations

  • Use object pooling for frequently allocated objects
  • Prefer in-place operations
  • Clear unused references for GC
  • Use appropriate data structures
  • Monitor heap allocations

CPU Optimizations

  • Enable parallel processing (GOMAXPROCS)
  • Batch independent operations
  • Use SIMD-friendly algorithms
  • Minimize cache misses
  • Profile hot paths

Algorithm Optimizations

  • Choose optimal parameters
  • Minimize multiplication depth
  • Use lazy relinearization
  • Batch rotations when possible
  • Exploit sparsity

Network Optimizations

  • Compress data before transfer
  • Use efficient serialization
  • Minimize round trips
  • Batch communications
  • Cache remote data

Performance Tips

1. Parameter Selection

// Start with minimal parameters and increase as needed
params := ckks.ParametersLiteral{
    LogN: 13, // Start small
    LogQ: []int{45, 40, 40}, // Minimal chain
    LogP: []int{55}, // Single P prime
}

// Increase only if needed for:
// - Security: Increase LogN
// - Precision: Increase LogQ values
// - Depth: Add more LogQ primes

2. Batch Operations

// Batch multiple operations into single ciphertext
values := make([]complex128, params.Slots())
for i := range values {
    values[i] = complex(data[i], 0)
}
encoder.Encode(values, plaintext)
// Process all slots simultaneously

3. Lazy Evaluation

// LazyComputation queues operations on a ciphertext and defers their
// (expensive) execution until Execute is called.
type LazyComputation struct {
    ct  *Ciphertext
    ops []func(*Ciphertext) error // pending operations, run in FIFO order
}

// Add enqueues an operation without running it.
func (lc *LazyComputation) Add(op func(*Ciphertext) error) {
    lc.ops = append(lc.ops, op)
}

// Execute runs the queued operations in order, stopping at the first error.
func (lc *LazyComputation) Execute() error {
    for i := range lc.ops {
        if err := lc.ops[i](lc.ct); err != nil {
            return err
        }
    }
    return nil
}

4. Cache Management

// cache stores frequently used computations keyed by an application-defined
// string; cacheMu guards all access.
var cache = make(map[string]*Ciphertext)
var cacheMu sync.RWMutex

// GetOrCompute returns the cached ciphertext for key, computing and caching
// it on a miss. Under contention compute may still run more than once, but
// the re-check under the write lock guarantees all callers receive the same
// canonical cached value — the original version let concurrent misses
// clobber each other, handing different ciphertexts to different callers.
func GetOrCompute(key string, compute func() *Ciphertext) *Ciphertext {
    cacheMu.RLock()
    if ct, ok := cache[key]; ok {
        cacheMu.RUnlock()
        return ct
    }
    cacheMu.RUnlock()

    // Compute outside any lock: it may be expensive.
    ct := compute()

    cacheMu.Lock()
    defer cacheMu.Unlock()
    // Re-check: another goroutine may have stored a value while compute ran;
    // keep the first entry so every caller shares one ciphertext.
    if existing, ok := cache[key]; ok {
        return existing
    }
    cache[key] = ct
    return ct
}

Comparison with Other Libraries

| Library | Language | CKKS Mul (ms) | BGV Mul (ms) | Bootstrap (ms) |
|---|---|---|---|---|
| Lattice v6 | Go | 25.4 | 24.8 | 850 |
| SEAL 4.0 | C++ | 23.1 | 22.5 | N/A |
| HElib | C++ | 28.3 | 26.7 | 920 |
| PALISADE | C++ | 26.5 | 25.3 | 890 |
| Concrete | Rust | 31.2 | 29.8 | 1,100 |
| TFHE | C++ | N/A | N/A | 13 |

Benchmarks on same hardware with N=2^14, 128-bit security

Conclusion

Lattice provides competitive performance with state-of-the-art C++ libraries while offering:

  • Pure Go implementation: Easy deployment and cross-compilation
  • Automatic parallelization: Scales with available CPU cores
  • Memory efficiency: Careful memory management and pooling
  • Network optimized: Built for distributed computation

For optimal performance:

  1. Choose minimal parameters for your security needs
  2. Use in-place operations and object pooling
  3. Enable parallelization with appropriate GOMAXPROCS
  4. Profile and optimize hot paths
  5. Batch operations when possible

On this page