diff --git a/README.md b/README.md index 676722c..2a01ef0 100644 --- a/README.md +++ b/README.md @@ -201,7 +201,7 @@ curl http://localhost:8000/health Check out the demos in the `scripts/demo/` folder: ```bash -# Download pre‑processed data (required before running demos) +# Download model weights (required before running demos) python scripts/demo/download.py # Interactive streaming chat diff --git a/assets/docs/README-zh-CN.md b/assets/docs/README-zh-CN.md index 5d336eb..fa3e36d 100644 --- a/assets/docs/README-zh-CN.md +++ b/assets/docs/README-zh-CN.md @@ -207,7 +207,7 @@ curl http://localhost:8000/health 查看 `scripts/demo/` 文件夹中的演示: ```bash -# 下载预处理数据(运行演示前必需) +# 下载模型权重(运行演示前必需) python scripts/demo/download.py # 交互式流式聊天 diff --git a/assets/docs/architecture.md b/assets/docs/architecture.md index 02ba923..f32e29c 100644 --- a/assets/docs/architecture.md +++ b/assets/docs/architecture.md @@ -352,16 +352,11 @@ classDiagram +build(item, config, tokenizer) Optional[dict] } - class ChatMaskBuilder { - +build(item, config, tokenizer) Optional[dict] - } - - class InstructionMaskBuilder { - +build(item, config, tokenizer) Optional[dict] - } - - class TextMaskBuilder { + class SectionedMaskBuilder { + +SectionRenderer renderer +build(item, config, tokenizer) Optional[dict] + +_build_single(item, config, tokenizer) Optional[dict] + +_build_multi(item, sources_spec, config, tokenizer) Optional[dict] } class Pipeline { @@ -370,8 +365,12 @@ classDiagram +str output_dir +str tokenizer_path +BaseMaskBuilder mask_builder + +PackingStrategy _packer + +PositionIdStrategy _position_id + +StoreWriter _writer +transform(item) Optional[dict] +run() + +_flush(domains, shard_idx) } } @@ -841,7 +840,7 @@ classDiagram class ResponseBuilder { <> - +prepare(request, tokenizer) Tuple[str, GenContext, List[str]] + +prepare(request, engine) Tuple[str, GenContext, List[str]] +format_stream_start(ctx) List[str] +format_chunk(token) str +format_stream_end(ctx, stop) List[str] @@ -849,7 +848,7 @@ 
classDiagram } class OpenAIResponseBuilder { - +prepare(request, tokenizer) Tuple + +prepare(request, engine) Tuple +format_stream_start(ctx) List[str] +format_chunk(token) str +format_stream_end(ctx, stop) List[str] @@ -857,7 +856,7 @@ classDiagram } class AnthropicResponseBuilder { - +prepare(request, tokenizer) Tuple + +prepare(request, engine) Tuple +format_stream_start(ctx) List[str] +format_chunk(token) str +format_stream_end(ctx, stop) List[str] @@ -1034,7 +1033,6 @@ classDiagram BaseSamplingStrategy <|-- TemperatureStrategy BaseSamplingStrategy <|-- TopKStrategy BaseSamplingStrategy <|-- TopPStrategy - BaseSamplingStrategy <|-- SamplingPipeline ParallelModel <|-- RowParallelLinear ParallelModel <|-- ColumnParallelLinear AutoModel <|-- AutoRegressiveLM @@ -1063,9 +1061,7 @@ classDiagram BaseExecutor <|-- FSDPExecutor ResponseBuilder <|-- OpenAIResponseBuilder ResponseBuilder <|-- AnthropicResponseBuilder - BaseMaskBuilder <|-- ChatMaskBuilder - BaseMaskBuilder <|-- InstructionMaskBuilder - BaseMaskBuilder <|-- TextMaskBuilder + BaseMaskBuilder <|-- SectionedMaskBuilder %% --- Composition (strong ownership, part destroyed with whole) --- KVCache *-- PagePool @@ -1162,7 +1158,7 @@ classDiagram | Module | Components | Description | |--------|------------|-------------| | **astrai.config** | BaseConfig, BaseModelConfig, AutoRegressiveLMConfig, EncoderConfig, ConfigFactory, TrainConfig, PipelineConfig, InputConfig, ProcessingConfig, OutputConfig | Configuration management (to_dict/from_dict, to_file/from_file, from_json/to_json) | -| **astrai.preprocessing** | BaseMaskBuilder, MaskBuilderFactory, ChatMaskBuilder, InstructionMaskBuilder, TextMaskBuilder, Pipeline, filter_by_length, dedup_signature | Declarative JSON-driven data preprocessing | +| **astrai.preprocessing** | BaseMaskBuilder, MaskBuilderFactory, SectionedMaskBuilder, Pipeline, filter_by_length, PackingStrategy, PackingStrategyFactory, PositionIdStrategy, PositionIdStrategyFactory, StoreWriter, 
StoreWriterFactory | Declarative JSON-driven data preprocessing | | **astrai.dataset** | BaseDataset–GRPODataset, Store–MmapStore, StoreFactory, ResumableDistributedSampler, DatasetFactory | Dataset loading and management | | **astrai.serialization** | Checkpoint | Model serialization | | **astrai.model** | AutoModel, AutoRegressiveLM, EmbeddingEncoder, DecoderBlock, GQA, MLA, MLP, DeepSeekMoE, AttnFactory, FFNFactory, RMSNorm, Linear, RotaryEmbedding, Embedding | Neural network model | diff --git a/assets/docs/dataflow.md b/assets/docs/dataflow.md index 38facbd..df5f599 100644 --- a/assets/docs/dataflow.md +++ b/assets/docs/dataflow.md @@ -26,7 +26,7 @@ H5 backend supports shared memory via `.share_memory_()`. Bin (mmap) uses OS pag | Type | Storage Keys | |------|-------------| | `seq` | `sequence` (→ input_ids, target_ids via offset-by-1) | -| `sft` | `sequence`, `loss_mask` | +| `sft` | `sequence`, `loss_mask`, `position_ids` | | `dpo` | `chosen`, `rejected`, `chosen_mask`, `rejected_mask` | | `grpo` | `prompts`, `responses`, `masks`, `rewards` | diff --git a/assets/docs/params.md b/assets/docs/params.md index e3bf04f..2f663e4 100644 --- a/assets/docs/params.md +++ b/assets/docs/params.md @@ -48,6 +48,27 @@ | `--start_epoch` | Resume from epoch (0 = from scratch) | 0 | | `--start_batch` | Resume from batch iteration | 0 | +### Validation + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--val_split` | Fraction of the training dataset to hold out for validation (e.g. 0.05) | None | +| `--val_step` | Number of optimizer steps between validation runs | 1000 | + +### Logging + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--log_dir` | Directory for metric logs | checkpoint/logs | +| `--log_interval` | Number of batch iterations between metric logs | 100 | +| `--metrics` | Metrics to log (e.g. 
--metrics loss lr val_loss) | ["loss", "lr"] | + +### Gradient Checkpointing + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--gradient_checkpointing` | Enable activation checkpointing for DecoderBlock modules | False | + ### Distributed Training | Parameter | Description | Default | @@ -56,6 +77,9 @@ | `--parallel_mode` | Parallel strategy (`none`, `ddp`, or `fsdp`) | none | | `--device_type` | Device type | cuda | | `--start_method` | Multiprocessing start method (`spawn`, `fork`, `forkserver`) | spawn | +| `--backend` | Distributed training backend | nccl | +| `--master_addr` | Master node address | localhost | +| `--master_port` | Master node port | 29500 | ### Strategy-specific diff --git a/assets/docs/preprocessing.md b/assets/docs/preprocessing.md index 84a5e1e..de825b7 100644 --- a/assets/docs/preprocessing.md +++ b/assets/docs/preprocessing.md @@ -243,6 +243,9 @@ When `sources` is set, `sections` is ignored. | `min_chars` | int | `50` | Skip text-mode items shorter than this | | `max_chars` | int | `2000000` | Skip text-mode items longer than this | | `max_items` | int or null | `null` | Stop after N documents | +| `packing_strategy` | str | `"simple"` | Packing strategy: `"simple"`, `"bfd"`, `"bfd_split"` | +| `max_packed_len` | int | `8192` | Maximum length of a packed bin | +| `truncation_mode` | str | `"keep_start"` | How to truncate sequences: `"keep_start"` or `"keep_end"` | ### `output` @@ -252,6 +255,7 @@ When `sources` is set, `sections` is ignored. | `storage_format` | str | `"bin"` | `"bin"` (mmap) or `"h5"` | | `max_tokens_per_shard` | int | `100000000` | Flush threshold in cumulative tokens | | `dtype` | dict[str, str] | `{}` | Per-key tensor dtype override (e.g. `{"loss_mask": "bool"}`) | +| `position_ids_mode` | str | `"none"` | How to compute position_ids: `"none"`, `"doc_reset"`, `"continuous"` | ---