AstrAI/astrai/config/preprocess_config.py

58 lines
1.5 KiB
Python

"""Pipeline configuration for JSONL preprocessing.
Supports single-sequence (SFT/pretrain) and multi-output (DPO/GRPO)
modes, both driven declaratively through ``input.sections`` or
``input.sources``.
"""
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from astrai.config.base import BaseConfig
@dataclass
class InputConfig(BaseConfig):
"""Declarative input mapping.
Single-output mode (backward-compatible)::
{"input": {"sections": [{"field": "messages", ...}]}}
Multi-output mode (DPO / GRPO)::
{"input": {"sources": {
"chosen": {"sections": [{"field": "chosen", ...}]},
"rejected": {"sections": [{"field": "rejected", ...}]},
}}}
"""
sections: Optional[List[Dict]] = None
sources: Optional[Dict[str, Dict]] = None
@dataclass
class ProcessingConfig(BaseConfig):
max_seq_len: int = 2048
min_chars: int = 50
max_chars: int = 2_000_000
max_items: Optional[int] = None
@dataclass
class OutputConfig(BaseConfig):
domain_key: Optional[str] = None
storage_format: str = "bin"
max_tokens_per_shard: int = 100_000_000
dtype: Dict[str, str] = field(default_factory=dict)
@dataclass
class PipelineConfig(BaseConfig):
version: int = 1
input: InputConfig = field(default_factory=InputConfig)
mask: Dict[str, str] = field(default_factory=dict)
mask_default: str = "mask"
preprocessing: ProcessingConfig = field(default_factory=ProcessingConfig)
output: OutputConfig = field(default_factory=OutputConfig)