93 lines
1.9 KiB
Python
93 lines
1.9 KiB
Python
"""Inference module for continuous batching.
|
|
|
|
Layers:
|
|
- core/: Core inference loop (cache, executor, scheduler, task)
|
|
- api/: HTTP protocol handlers (OpenAI, Anthropic)
|
|
- engine.py: Facade (InferenceEngine), Value Object (GenerationRequest)
|
|
- sample.py: Strategy pattern (TemperatureStrategy, TopKStrategy, TopPStrategy)
|
|
"""
|
|
|
|
from astrai.inference.api import (
|
|
AnthropicHandler,
|
|
AnthropicMessage,
|
|
ChatCompletionRequest,
|
|
ChatMessage,
|
|
MessagesRequest,
|
|
OpenAIHandler,
|
|
ProtocolHandler,
|
|
StopChecker,
|
|
StreamContext,
|
|
app,
|
|
run_server,
|
|
)
|
|
from astrai.inference.core import (
|
|
STOP,
|
|
Allocator,
|
|
Executor,
|
|
InferenceScheduler,
|
|
KVCache,
|
|
KvcacheView,
|
|
PagePool,
|
|
PrefixCache,
|
|
Storage,
|
|
Task,
|
|
TaskManager,
|
|
TaskStatus,
|
|
TaskTable,
|
|
page_hash,
|
|
)
|
|
from astrai.inference.engine import (
|
|
GenerationRequest,
|
|
InferenceEngine,
|
|
)
|
|
from astrai.inference.sample import (
|
|
BaseSamplingStrategy,
|
|
SamplingPipeline,
|
|
TemperatureStrategy,
|
|
TopKStrategy,
|
|
TopPStrategy,
|
|
sample,
|
|
)
|
|
|
|
__all__ = [
    # Names exported by a star-import of this module. Every entry is bound by
    # one of the ``from astrai.inference.<submodule> import ...`` statements
    # in this file; entries are grouped by area, in the original order.
    #
    # astrai.inference.engine -- facade and request value object
    "InferenceEngine",
    "GenerationRequest",
    # astrai.inference.core -- scheduler, executor, and task types
    "InferenceScheduler",
    "Executor",
    "STOP",
    "Task",
    "TaskManager",
    "TaskStatus",
    # astrai.inference.core -- cache-related types and helpers
    "Allocator",
    "KVCache",
    "KvcacheView",
    "PagePool",
    "PrefixCache",
    "Storage",
    "TaskTable",
    "page_hash",
    # astrai.inference.sample -- sampling strategies (Strategy pattern)
    "sample",
    "BaseSamplingStrategy",
    "TemperatureStrategy",
    "TopKStrategy",
    "TopPStrategy",
    "SamplingPipeline",
    # astrai.inference.api -- protocol handlers
    "ProtocolHandler",
    "StopChecker",
    "StreamContext",
    "AnthropicHandler",
    "OpenAIHandler",
    # astrai.inference.api -- server-facing names
    "ChatMessage",
    "ChatCompletionRequest",
    "AnthropicMessage",
    "MessagesRequest",
    "app",
    "run_server",
]
|