"""Inference module for continuous batching.

Layers:
- engine.py: Facade (InferenceEngine), Value Objects (GenerationParams, GenerationRequest)
- scheduler.py: Continuous-batching loop, Task state machine, TaskStatus enum
- cache.py: Object Pool (SlotAllocator), PrefixCacheManager
- sampling.py: Strategy pattern (TemperatureStrategy, TopKStrategy, TopPStrategy)
- server.py: FastAPI HTTP server (OpenAI-compatible endpoints)

This package module re-exports names from engine, sampling and scheduler only;
it does not import cache or server.
"""

from astrai.inference.engine import (
    GenerationParams,
    GenerationRequest,
    InferenceEngine,
)
from astrai.inference.sampling import (
    BaseSamplingStrategy,
    SamplingPipeline,
    TemperatureStrategy,
    TopKStrategy,
    TopPStrategy,
    sample,
)
from astrai.inference.scheduler import (
    InferenceScheduler,
    Task,
    TaskStatus,
)

# Public API of the package: every name listed here is imported above.
__all__ = [
    # Engine / Requests
    "InferenceEngine",
    "GenerationRequest",
    "GenerationParams",
    # Scheduler
    "InferenceScheduler",
    "Task",
    "TaskStatus",
    # Sampling (Strategy pattern)
    "sample",
    "BaseSamplingStrategy",
    "TemperatureStrategy",
    "TopKStrategy",
    "TopPStrategy",
    "SamplingPipeline",
]