diff --git a/README.md b/README.md
index 676722c..2a01ef0 100644
--- a/README.md
+++ b/README.md
@@ -201,7 +201,7 @@ curl http://localhost:8000/health
 Check out the demos in the `scripts/demo/` folder:
 
 ```bash
-# Download pre‑processed data (required before running demos)
+# Download model weights (required before running demos)
 python scripts/demo/download.py
 
 # Interactive streaming chat
diff --git a/assets/docs/README-zh-CN.md b/assets/docs/README-zh-CN.md
index 5d336eb..fa3e36d 100644
--- a/assets/docs/README-zh-CN.md
+++ b/assets/docs/README-zh-CN.md
@@ -207,7 +207,7 @@ curl http://localhost:8000/health
 查看 `scripts/demo/` 文件夹中的演示：
 
 ```bash
-# 下载预处理数据（运行演示前必需）
+# 下载模型权重（运行演示前必需）
 python scripts/demo/download.py
 
 # 交互式流式聊天
diff --git a/assets/docs/architecture.md b/assets/docs/architecture.md
index 02ba923..f32e29c 100644
--- a/assets/docs/architecture.md
+++ b/assets/docs/architecture.md
@@ -352,16 +352,11 @@ classDiagram
             +build(item, config, tokenizer) Optional[dict]
         }
 
-        class ChatMaskBuilder {
-            +build(item, config, tokenizer) Optional[dict]
-        }
-
-        class InstructionMaskBuilder {
-            +build(item, config, tokenizer) Optional[dict]
-        }
-
-        class TextMaskBuilder {
+        class SectionedMaskBuilder {
+            +SectionRenderer renderer
             +build(item, config, tokenizer) Optional[dict]
+            +_build_single(item, config, tokenizer) Optional[dict]
+            +_build_multi(item, sources_spec, config, tokenizer) Optional[dict]
         }
 
         class Pipeline {
@@ -370,8 +365,12 @@ classDiagram
             +str output_dir
             +str tokenizer_path
             +BaseMaskBuilder mask_builder
+            +PackingStrategy _packer
+            +PositionIdStrategy _position_id
+            +StoreWriter _writer
             +transform(item) Optional[dict]
             +run()
+            +_flush(domains, shard_idx)
         }
     }
 
@@ -841,7 +840,7 @@ classDiagram
 
         class ResponseBuilder {
             <<abstract>>
-            +prepare(request, tokenizer) Tuple[str, GenContext, List[str]]
+            +prepare(request, engine) Tuple[str, GenContext, List[str]]
             +format_stream_start(ctx) List[str]
             +format_chunk(token) str
             +format_stream_end(ctx, stop) List[str]
@@ -849,7 +848,7 @@ classDiagram
         }
 
         class OpenAIResponseBuilder {
-            +prepare(request, tokenizer) Tuple
+            +prepare(request, engine) Tuple
             +format_stream_start(ctx) List[str]
             +format_chunk(token) str
             +format_stream_end(ctx, stop) List[str]
@@ -857,7 +856,7 @@ classDiagram
         }
 
         class AnthropicResponseBuilder {
-            +prepare(request, tokenizer) Tuple
+            +prepare(request, engine) Tuple
             +format_stream_start(ctx) List[str]
             +format_chunk(token) str
             +format_stream_end(ctx, stop) List[str]
@@ -1034,7 +1033,6 @@ classDiagram
     BaseSamplingStrategy <|-- TemperatureStrategy
     BaseSamplingStrategy <|-- TopKStrategy
     BaseSamplingStrategy <|-- TopPStrategy
-    BaseSamplingStrategy <|-- SamplingPipeline
     ParallelModel <|-- RowParallelLinear
     ParallelModel <|-- ColumnParallelLinear
     AutoModel <|-- AutoRegressiveLM
@@ -1063,9 +1061,7 @@ classDiagram
     BaseExecutor <|-- FSDPExecutor
     ResponseBuilder <|-- OpenAIResponseBuilder
     ResponseBuilder <|-- AnthropicResponseBuilder
-    BaseMaskBuilder <|-- ChatMaskBuilder
-    BaseMaskBuilder <|-- InstructionMaskBuilder
-    BaseMaskBuilder <|-- TextMaskBuilder
+    BaseMaskBuilder <|-- SectionedMaskBuilder
 
     %% --- Composition (strong ownership, part destroyed with whole) ---
     KVCache *-- PagePool
@@ -1162,7 +1158,7 @@ classDiagram
 | Module | Components | Description |
 |--------|------------|-------------|
 | **astrai.config** | BaseConfig, BaseModelConfig, AutoRegressiveLMConfig, EncoderConfig, ConfigFactory, TrainConfig, PipelineConfig, InputConfig, ProcessingConfig, OutputConfig | Configuration management (to_dict/from_dict, to_file/from_file, from_json/to_json) |
-| **astrai.preprocessing** | BaseMaskBuilder, MaskBuilderFactory, ChatMaskBuilder, InstructionMaskBuilder, TextMaskBuilder, Pipeline, filter_by_length, dedup_signature | Declarative JSON-driven data preprocessing |
+| **astrai.preprocessing** | BaseMaskBuilder, MaskBuilderFactory, SectionedMaskBuilder, Pipeline, filter_by_length, PackingStrategy, PackingStrategyFactory, PositionIdStrategy, PositionIdStrategyFactory, StoreWriter, StoreWriterFactory | Declarative JSON-driven data preprocessing |
 | **astrai.dataset** | BaseDataset–GRPODataset, Store–MmapStore, StoreFactory, ResumableDistributedSampler, DatasetFactory | Dataset loading and management |
 | **astrai.serialization** | Checkpoint | Model serialization |
 | **astrai.model** | AutoModel, AutoRegressiveLM, EmbeddingEncoder, DecoderBlock, GQA, MLA, MLP, DeepSeekMoE, AttnFactory, FFNFactory, RMSNorm, Linear, RotaryEmbedding, Embedding | Neural network model |
diff --git a/assets/docs/dataflow.md b/assets/docs/dataflow.md
index 38facbd..df5f599 100644
--- a/assets/docs/dataflow.md
+++ b/assets/docs/dataflow.md
@@ -26,7 +26,7 @@ H5 backend supports shared memory via `.share_memory_()`. Bin (mmap) uses OS pag
 | Type | Storage Keys |
 |------|-------------|
 | `seq` | `sequence` (→ input_ids, target_ids via offset-by-1) |
-| `sft` | `sequence`, `loss_mask` |
+| `sft` | `sequence`, `loss_mask`, `position_ids` |
 | `dpo` | `chosen`, `rejected`, `chosen_mask`, `rejected_mask` |
 | `grpo` | `prompts`, `responses`, `masks`, `rewards` |
 
diff --git a/assets/docs/params.md b/assets/docs/params.md
index e3bf04f..2f663e4 100644
--- a/assets/docs/params.md
+++ b/assets/docs/params.md
@@ -48,6 +48,27 @@
 | `--start_epoch` | Resume from epoch (0 = from scratch) | 0 |
 | `--start_batch` | Resume from batch iteration | 0 |
 
+### Validation
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `--val_split` | Fraction of the training dataset to hold out for validation (e.g. 0.05) | None |
+| `--val_step` | Number of optimizer steps between validation runs | 1000 |
+
+### Logging
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `--log_dir` | Directory for metric logs | checkpoint/logs |
+| `--log_interval` | Number of batch iterations between metric logs | 100 |
+| `--metrics` | Metrics to log (e.g. `--metrics loss lr val_loss`) | ["loss", "lr"] |
+
+### Gradient Checkpointing
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `--gradient_checkpointing` | Enable activation checkpointing for `DecoderBlock` modules | False |
+
 ### Distributed Training
 
 | Parameter | Description | Default |
@@ -56,6 +77,9 @@
 | `--parallel_mode` | Parallel strategy (`none`, `ddp`, or `fsdp`) | none |
 | `--device_type` | Device type | cuda |
 | `--start_method` | Multiprocessing start method (`spawn`, `fork`, `forkserver`) | spawn |
+| `--backend` | Distributed training backend | nccl |
+| `--master_addr` | Master node address | localhost |
+| `--master_port` | Master node port | 29500 |
 
 ### Strategy-specific
 
diff --git a/assets/docs/preprocessing.md b/assets/docs/preprocessing.md
index 84a5e1e..de825b7 100644
--- a/assets/docs/preprocessing.md
+++ b/assets/docs/preprocessing.md
@@ -243,6 +243,9 @@ When `sources` is set, `sections` is ignored.
 | `min_chars` | int | `50` | Skip text-mode items shorter than this |
 | `max_chars` | int | `2000000` | Skip text-mode items longer than this |
 | `max_items` | int or null | `null` | Stop after N documents |
+| `packing_strategy` | str | `"simple"` | Packing strategy: `"simple"`, `"bfd"`, `"bfd_split"` |
+| `max_packed_len` | int | `8192` | Maximum length of a packed bin |
+| `truncation_mode` | str | `"keep_start"` | How to truncate sequences: `"keep_start"` or `"keep_end"` |
 
 ### `output`
 
@@ -252,6 +255,7 @@ When `sources` is set, `sections` is ignored.
 | `storage_format` | str | `"bin"` | `"bin"` (mmap) or `"h5"` |
 | `max_tokens_per_shard` | int | `100000000` | Flush threshold in cumulative tokens |
 | `dtype` | dict[str, str] | `{}` | Per-key tensor dtype override (e.g. `{"loss_mask": "bool"}`) |
+| `position_ids_mode` | str | `"none"` | How to compute `position_ids`: `"none"`, `"doc_reset"`, `"continuous"` |
 
 ---