This section provides code examples, configuration parameters, and an API reference.
class EMAScorer:
    """Track per-position, per-head attention importance with an
    exponential moving average (EMA) and propose eviction candidates.

    scores[p, h] holds the EMA of the attention mass that head h has
    placed on position p across updates.
    """

    def __init__(self, num_positions, num_heads, alpha=0.1):
        # alpha: EMA smoothing factor — larger values weight the most
        # recent attention step more heavily.
        self.alpha = alpha
        self.scores = np.zeros((num_positions, num_heads))

    def update(self, attention_weights):
        """Fold one attention step into the EMA.

        attention_weights: array of shape [num_heads, num_positions].
        """
        # Transpose to [num_positions, num_heads] so it aligns with
        # the layout of self.scores.
        self.scores = (
            self.alpha * attention_weights.T +
            (1 - self.alpha) * self.scores
        )

    def get_eviction_candidates(self, n, anchor_zone=100):
        """Return up to n lowest-scoring positions, never from the anchor zone.

        Fix over the naive version: when n exceeds the number of evictable
        positions, the +inf anchor-zone entries sit at the tail of argsort
        and would leak into the result — truncate to the evictable count
        so protected positions are never returned.
        """
        # Aggregate across heads with max: a position is important if
        # ANY head still attends to it strongly.
        agg_scores = self.scores.max(axis=1)  # new array — safe to mutate
        # Protect the anchor zone (earliest positions) from eviction.
        agg_scores[:anchor_zone] = float('inf')
        evictable = max(len(agg_scores) - anchor_zone, 0)
        # n lowest-scoring positions, capped at what is actually evictable.
        return np.argsort(agg_scores)[:min(n, evictable)]
def compute_prefetch_priority(current_pos, cache_positions, ema_scores):
    """Rank cached positions for prefetch, highest priority first.

    Priority blends positional locality (a RoPE-style decay with the
    distance from current_pos) with the strongest per-head EMA score of
    each position, weighted 60/40.

    Returns a list of (position, priority) pairs sorted descending by
    priority.
    """
    def _priority(pos):
        # RoPE-style locality factor: decays with distance from the
        # current position.
        locality = 1.0 / (1 + abs(current_pos - pos) / 100)  # RoPE decay
        # Strongest EMA attention score across heads for this position.
        attention = ema_scores[pos].max()
        return 0.6 * locality + 0.4 * attention

    ranked = [(pos, _priority(pos)) for pos in cache_positions]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
The cache policy accepts the following configuration parameters (defaults shown):

| Parameter | Default | Description |
|---|---|---|
| ema_alpha | 0.1 | EMA decay rate |
| anchor_zone_size | 100 | Protected positions |
| prefetch_depth | 2 | Layers to prefetch ahead |
| eviction_batch | 64 | Entries per eviction |
| tier1_capacity | 256 GB | CXL DRAM per endpoint |
# Example: wiring the CXL endpoint cache into vLLM as the KV-cache backend.
from vllm import LLM
from endpoint_cache import EndpointCache
# Spread KV-cache entries across four CXL memory endpoints, using the
# EMA + anchor-zone ("ema_rope") eviction/prefetch policy configured below.
cache = EndpointCache(
endpoints=["cxl://ep0", "cxl://ep1", "cxl://ep2", "cxl://ep3"],
policy="ema_rope",
config={"ema_alpha": 0.1, "anchor_zone": 100}
)
# NOTE(review): max_model_len=128000 assumes the serving stack extends the
# model's native context window — confirm against the deployment config.
llm = LLM(
model="meta-llama/Llama-2-70b",
kv_cache_backend=cache,
max_model_len=128000,
gpu_memory_utilization=0.9
)