Performance Optimization Guide

Advanced techniques for optimizing data operations in RustyBT.

Memory Management

1. Chunked Processing

import polars as pl

def process_large_dataset(file_path, chunk_size=100_000):
    """Process a large dataset in fixed-size chunks."""
    df_lazy = pl.scan_parquet(file_path)
    total_rows = df_lazy.select(pl.len()).collect().item()

    for offset in range(0, total_rows, chunk_size):
        chunk = df_lazy.slice(offset, chunk_size).collect()
        process_chunk(chunk)
        del chunk  # Drop the reference so the memory can be reclaimed

2. Column Selection

# Bad: Load all columns
df = pl.read_parquet('data.parquet')

# Good: Load only needed columns
df = pl.read_parquet('data.parquet', columns=['timestamp', 'close'])

3. Streaming Mode

# Use Polars streaming for large datasets
df = (pl.scan_parquet('large_file.parquet')
    .filter(pl.col('volume') > 1_000_000)
    .select(['symbol', 'close'])
    .collect(streaming=True)  # Process in streaming mode
)

Query Optimization

1. Filter Early

# Bad: Filter after loading
df = pl.read_parquet('data.parquet')
df_filtered = df.filter(pl.col('symbol') == 'AAPL')

# Good: Filter during load (predicate pushdown)
df = (pl.scan_parquet('data.parquet')
    .filter(pl.col('symbol') == 'AAPL')
    .collect()
)
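
To confirm that the filter really is pushed down into the Parquet scan, print the optimized query plan. A minimal check using LazyFrame.explain(), with 'data.parquet' standing in for your own file:

# Inspect the optimized plan; the filter should appear as part of the
# Parquet scan node rather than as a separate step after it
plan = (pl.scan_parquet('data.parquet')
    .filter(pl.col('symbol') == 'AAPL')
    .explain()
)
print(plan)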

2. Index Usage

# Polars has no indexes; instead, mark an already-sorted column as sorted
# so lookups can take fast (binary-search) paths
df = df.set_sorted('timestamp')

# Equality filters on the sorted column avoid a full scan
result = df.filter(pl.col('timestamp') == target_date)
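
For point lookups you can also binary-search the sorted column directly. A small sketch using Series.search_sorted, assuming target_date is present in the column:

# Explicit binary search on the sorted column (O(log n))
idx = df.get_column('timestamp').search_sorted(target_date)
row = df.slice(idx, 1)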

Parallel Execution

1. Polars Parallelism

# Polars parallelizes most operations across all available cores by default.
# The pool size can only be changed via the POLARS_MAX_THREADS environment
# variable, and it must be set before polars is first imported.
import os
os.environ['POLARS_MAX_THREADS'] = '8'

import polars as pl
print(pl.thread_pool_size())  # Confirm the configured pool size

# Operations such as group_by/agg run in parallel automatically
df = (pl.scan_parquet('data.parquet')
    .group_by('symbol')
    .agg(pl.col('close').mean())
    .collect()
)

2. Multi-Processing

from multiprocessing import Pool

def process_symbol(symbol):
    return expensive_computation(symbol)

# Process symbols in parallel across worker processes.
# The __main__ guard keeps this safe under the "spawn" start method.
if __name__ == '__main__':
    with Pool(processes=8) as pool:
        results = pool.map(process_symbol, symbols)

I/O Optimization

1. Compression Selection

Compression   Speed    Ratio   Use Case
snappy        ⚡⚡⚡    1.5x    Fast reads
lz4           ⚡⚡⚡    1.5x    Fast reads
zstd          ⚡⚡      3x      Storage
gzip          ⚡        3x      Maximum compression
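
The codec is a one-line choice when writing Parquet with Polars. A quick sketch, with hypothetical output paths:

# zstd: good balance for data that sits in long-term storage
df.write_parquet('bars_cold.parquet', compression='zstd', compression_level=3)

# snappy: favors read speed over file size
df.write_parquet('bars_hot.parquet', compression='snappy')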

2. Partitioning

# Partition by symbol for efficient filtering
writer.write_dataset(
    df,
    '/data/partitioned',
    partition_by=['symbol']
)

# Fast symbol-specific reads: only the AAPL partition is touched
df_aapl = pl.read_parquet('/data/partitioned/symbol=AAPL/*.parquet')
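
Because the partition column is encoded in the directory names (hive style, symbol=AAPL), a lazy scan can also prune partitions for you. A minimal sketch, assuming the layout written above:

# hive_partitioning exposes 'symbol' as a column and prunes non-matching directories
df_aapl = (pl.scan_parquet('/data/partitioned/**/*.parquet', hive_partitioning=True)
    .filter(pl.col('symbol') == 'AAPL')
    .collect()
)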

See Caching and Troubleshooting for more.