Feature Showcase
This tutorial demonstrates all major cachepy features. It corresponds to the
01_cachepy_showcase.ipynb notebook.
Setup
import os, sys, time, shutil
from pathlib import Path
from cachepy import cache_file, cache_tree_nodes, cache_tree_reset
from cachepy.cache_file import (
cache_prune, cache_stats, fast_file_hash,
_file_state_cache, load_config,
)
DEMO_CACHE = Path("demo_cache")
def fresh_cache():
    """Return the demo cache directory after clearing earlier cache state.

    Removes ``DEMO_CACHE`` from disk when it exists, then calls
    ``cache_tree_reset()`` and empties ``_file_state_cache`` so that each
    tutorial section starts from a clean slate.

    Returns:
        The ``DEMO_CACHE`` path, ready to be passed to ``@cache_file``.
    """
    if DEMO_CACHE.exists():
        shutil.rmtree(DEMO_CACHE)
    # The in-memory state is reset unconditionally, even when there was
    # nothing on disk to delete.
    cache_tree_reset()
    _file_state_cache.clear()
    return DEMO_CACHE
1. Basic Caching
Wrap any function with @cache_file(cache_dir). The first call executes normally;
subsequent calls with the same arguments return instantly from disk.
cache_dir = fresh_cache()


@cache_file(cache_dir)
def slow_computation(n):
    """Simulate an expensive computation."""
    time.sleep(1)  # stand-in for real work so the cache hit is noticeable
    return sum(i**2 for i in range(n))


result1 = slow_computation(10_000)  # ~1s
result2 = slow_computation(10_000)  # instant (cache hit)
result3 = slow_computation(5_000)  # ~1s (different args)
Takeaway: Same args = cache hit. Different args = new computation.
2. Argument Normalization
cachepy normalizes how arguments are passed — positional, named, or with explicit defaults all resolve to the same cache key.
cache_dir = fresh_cache()


@cache_file(cache_dir)
def add(a, b, c=0):
    """Return a + b + c, printing a marker whenever the body really runs."""
    print(f" -> executing add({a}, {b}, {c})")
    return a + b + c


# The marker line is printed only for the calls labelled "executes".
add(1, 2)  # executes
add(a=1, b=2)  # cache hit
add(b=2, a=1)  # cache hit
add(1, 2, c=0)  # cache hit
add(1, 2, c=10)  # executes (different args)
3. kwargs Order Independence
For **kwargs functions, keyword argument order is ignored (sorted internally).
cache_dir = fresh_cache()


@cache_file(cache_dir)
def config_hash(**kwargs):
    """Return a string rendering of the keyword arguments, sorted by key."""
    return str(sorted(kwargs.items()))


config_hash(alpha=0.1, beta=0.9)  # executes
config_hash(beta=0.9, alpha=0.1)  # cache hit
4. File Dependency Tracking
When arguments point to files, cachepy hashes file content (not just the path).
cache_dir = fresh_cache()
data_file = Path("demo_data.csv")
data_file.write_text("gene,expr\nTP53,10.5\nBRCA1,8.2\n")


@cache_file(cache_dir, file_args=["fpath"])
def parse_csv(fpath):
    """Parse a small comma-separated file into a list of row dicts.

    The first line is the header; every following line becomes a dict that
    maps header names to that line's (string) values.
    """
    lines = Path(fpath).read_text().strip().split("\n")
    header = lines[0].split(",")
    return [dict(zip(header, line.split(","))) for line in lines[1:]]


parse_csv(str(data_file))  # executes
parse_csv(str(data_file))  # cache hit (same content)

# Rewrite the file with one extra row.
data_file.write_text("gene,expr\nTP53,10.5\nBRCA1,8.2\nEGFR,15.3\n")
# NOTE(review): presumably drops cachepy's memoized file state so the new
# content is re-hashed on the next call; confirm against the library.
_file_state_cache.clear()
parse_csv(str(data_file))  # cache miss (file changed)
5. Body Change Detection
Redefining a function changes its hash, automatically invalidating the cache.
cache_dir = fresh_cache()


@cache_file(cache_dir)
def transform(x):
    """Double x (first version of the function)."""
    return x * 2


transform(5)  # returns 10


# Deliberate redefinition under the same name: only the body differs.
@cache_file(cache_dir)
def transform(x):
    """Triple x (second version of the function)."""
    return x * 3  # body changed


transform(5)  # returns 15 (new cache entry)
6. Version Parameter
Manually bump version to invalidate the cache without touching the function body.
@cache_file(cache_dir, version="1.0")
def predict(x):
    """Return x * 42 (cached under version 1.0)."""
    return x * 42


predict(10)  # executes


# Same name and same body; only the version string differs.
@cache_file(cache_dir, version="2.0")
def predict(x):
    """Return x * 42 (cached under version 2.0)."""
    return x * 42


predict(10)  # cache miss — different version
7. Force & Skip Save
Control caching per-call:
- _force=True — always re-execute (ignores cache)
- _skip_save=True — execute but don't write to disk
@cache_file(cache_dir)
def fetch(query):
    """Return the query with a timestamp, so re-execution is observable."""
    return {"query": query, "ts": time.time()}


r1 = fetch("TP53")  # executes
r2 = fetch("TP53")  # cache hit
r3 = fetch("TP53", _force=True)  # forced re-run
fetch("BRCA1", _skip_save=True)  # dry run (no file written)
8. External Dependencies
Declare files or variables that should invalidate the cache when they change.
# depends_on_files
config = Path("demo_config.yml")
config.write_text("threshold: 0.05\n")


@cache_file(cache_dir, depends_on_files=[str(config)])
def analyze(x):
    """Return x squared; the config file is declared as a dependency."""
    return x ** 2


analyze(5)  # executes
config.write_text("threshold: 0.01\n")
# NOTE(review): presumably needed so the rewritten config is re-hashed
# rather than served from cachepy's in-memory file state; confirm.
_file_state_cache.clear()
analyze(5)  # cache miss — config changed


# depends_on_vars
@cache_file(cache_dir, depends_on_vars={"schema": "v3"})
def process(x):
    """Return x + 1; the ``schema`` variable is declared as a dependency."""
    return x + 1
9. Environment Variables
Track environment variables — the cache invalidates when they change.
os.environ["GENOME_BUILD"] = "hg38"


@cache_file(cache_dir, env_vars=["GENOME_BUILD"])
def align(reads):
    """Return a label combining the reads name and the current genome build."""
    build = os.environ.get("GENOME_BUILD", "unknown")
    return f"aligned_{reads}_to_{build}"


align("sample1")  # executes
os.environ["GENOME_BUILD"] = "hg19"
align("sample1")  # cache miss
10. Verbose Mode
import logging

# Route INFO-level records to the console with a recognizable prefix.
logging.basicConfig(level=logging.INFO, format="[cachepy] %(message)s")


@cache_file(cache_dir, verbose=True)
def compute(x):
    """Return x doubled."""
    return x * 2


compute(1)  # logs: first execution
compute(1)  # logs: cache hit
11. Cache Statistics & Pruning
# Summarize the cache directory; the result is indexed below by the
# 'n_entries' and 'total_size_mb' keys.
stats = cache_stats(cache_dir)
print(f"Entries: {stats['n_entries']} | Size: {stats['total_size_mb']:.2f} MB")
# NOTE(review): days_old=0 presumably prunes every entry regardless of age;
# confirm against the cache_prune documentation.
cache_prune(cache_dir, days_old=0)
12. Dependency Graph
When cached functions call other cached functions, cachepy tracks the call graph.
@cache_file(cache_dir)
def load_data(path):
    """Return fixed sample data; ``path`` only serves as the cache argument."""
    return [1, 2, 3, 4, 5]


@cache_file(cache_dir)
def normalize(data):
    """Return ``data`` shifted so that its mean is zero."""
    mean = sum(data) / len(data)
    return [x - mean for x in data]


@cache_file(cache_dir)
def pipeline(path):
    """Load the data for ``path`` and normalize it."""
    return normalize(load_data(path))


pipeline('input.csv')

# Only the node records are needed here, so iterate the values directly.
for node in cache_tree_nodes().values():
    print(f" {node['fname']} parents={len(node.get('parents', []))}")
13. Recursive Functions
Recursive cached functions automatically memoize sub-problems.
@cache_file(cache_dir)
def fib(n):
    """Return the n-th Fibonacci number, with fib(0) == 0 and fib(1) == 1."""
    if n <= 1:
        return n
    # The recursive calls go through the cached wrapper.
    return fib(n - 1) + fib(n - 2)


fib(10)  # executes fib 11 times (once per n in 0..10)
fib(10)  # 0 executions (all cached)
14. Error Handling
Failed functions don't leave stale cache entries or broken graph nodes.
@cache_file(cache_dir)
def risky(x):
    """Return the square root of x.

    Raises:
        ValueError: if x is negative.
    """
    if x < 0:
        raise ValueError("negative")
    return x ** 0.5


risky(4)  # succeeds, cached
risky(-1)  # raises ValueError, nothing cached
15. YAML Configuration
# load_config is already imported in the Setup section; it is repeated here
# so this snippet can be read on its own.
from cachepy.cache_file import load_config
# Write a small YAML file into the current working directory.
config_path = Path("demo_cachepy.yml")
config_path.write_text(
"cache_dir: /tmp/my_project_cache\n"
"backend: pickle\n"
"verbose: true\n"
)
# Read the settings back from the file just written.
cfg = load_config(config_path)
16. Speed Benchmark
Cache overhead is constant (~1-3 ms) regardless of original computation time. See the notebook for the full benchmark with plots.