refactor : 合并 data config docstring 并实现 BFD 打包策略

- 将 ProcessingConfig/OutputConfig 参数描述合并到类级 docstring - Pipeline 支持 packing_strategy/truncation_mode，新增 bfd 打包
2026-06-05 17:41:51 +08:00 · 2026-06-05 17:41:51 +08:00 · 3057741de9
parent acd1103bd0
commit 3057741de9
2 changed files with 130 additions and 7 deletions
--- a/astrai/config/preprocess_config.py
+++ b/astrai/config/preprocess_config.py
@ -33,25 +33,70 @@ class InputConfig(BaseConfig):

@dataclass
 class ProcessingConfig(BaseConfig):
+    """Processing configuration.
+
+    Parameters
+    ----------
+    max_seq_len : int
+        Maximum sequence length (default: 2048).
+    min_chars : int
+        Minimum number of characters to keep (default: 50).
+    max_chars : int
+        Maximum number of characters to keep (default: 2_000_000).
+    max_items : Optional[int]
+        Maximum number of items to process (default: None, unlimited).
+    packing_strategy : str
+        How to pack sequences into a contiguous stream.
+
+        - ``"simple"``: sequential concatenation (default, backward compatible).
+        - ``"bfd"``: best-fit decreasing bin packing, minimises wasted tokens.
+        - ``"bfd_split"``: BFD with over-length sequences split into chunks.
+    max_packed_len : int
+        Maximum length of a packed bin. Sequences longer than this are
+        truncated or split depending on ``packing_strategy`` (default: 8192).
+    truncation_mode : str
+        How to truncate sequences longer than ``max_packed_len``.
+
+        - ``"keep_start"``: keep the first ``max_packed_len`` tokens (default).
+        - ``"keep_end"``: keep the last ``max_packed_len`` tokens.
+    """
+
    max_seq_len: int = 2048
    min_chars: int = 50
    max_chars: int = 2_000_000
    max_items: Optional[int] = None
+    packing_strategy: str = "simple"
+    max_packed_len: int = 8192
+    truncation_mode: str = "keep_start"


@dataclass
 class OutputConfig(BaseConfig):
+    """Output configuration.
+
+    Parameters
+    ----------
+    domain_key : Optional[str]
+        Domain key for the output store (default: None).
+    storage_format : str
+        Storage format, one of ``"bin"``, ``"jsonl"`` (default: ``"bin"``).
+    max_tokens_per_shard : int
+        Maximum tokens per shard before splitting (default: 100_000_000).
+    dtype : Dict[str, str]
+        Per-key dtype overrides, e.g. ``{"input_ids": "int32"}`` (default: {}).
+    position_ids_mode : Optional[str]
+        How to compute position_ids in packed sequences.
+
+        - ``None`` / ``"none"``: do not generate (backward compatible).
+        - ``"doc_reset"``: reset to 0 at each document boundary.
+        - ``"continuous"``: sequential 0, 1, 2, ... (pretrain, single doc).
+    """
+
    domain_key: Optional[str] = None
    storage_format: str = "bin"
    max_tokens_per_shard: int = 100_000_000
    dtype: Dict[str, str] = field(default_factory=dict)
    position_ids_mode: Optional[str] = None
-    """How to compute position_ids in packed sequences.
-
-    - ``None`` / ``"none"``: do not generate (backward compatible).
-    - ``"doc_reset"``: reset to 0 at each document boundary.
-    - ``"continuous"``: sequential 0, 1, 2, ... (pretrain, single doc).
-    """


@dataclass
--- a/astrai/preprocessing/pipeline.py
+++ b/astrai/preprocessing/pipeline.py
@ -8,7 +8,7 @@ import json
 import os
 from collections import defaultdict
 from itertools import chain
-from typing import Optional
+from typing import List, Optional, Tuple

 import torch
 import tqdm
@ -35,6 +35,65 @@ def filter_by_length(text: str, min_len: int = 50, max_len: int = 2_000_000) ->
    return min_len <= len(text) <= max_len


+def _truncate(seq: list, max_len: int, mode: str) -> list:
+    if len(seq) <= max_len:
+        return seq
+    if mode == "keep_end":
+        return seq[-max_len:]
+    return seq[:max_len]
+
+
+def pack_sequences(
+    sequences: List[list],
+    max_packed_len: int,
+    strategy: str,
+    truncation_mode: str,
+) -> List[Tuple[int, int]]:
+    """Pack *sequences* into bins and return a reorder plan.
+
+    Returns a list of ``(orig_idx, truncated_length)`` in flush order.
+    All keys (sequence, loss_mask, …) must be reordered and truncated
+    identically according to this plan.
+
+    Supported *strategy* values:
+
+    - ``"simple"``: sequential, no reordering.
+    - ``"bfd"``: best-fit decreasing bin packing.
+    """
+    n = len(sequences)
+    if strategy == "simple":
+        return [(i, min(len(sequences[i]), max_packed_len)) for i in range(n)]
+
+    order = sorted(range(n), key=lambda i: len(sequences[i]), reverse=True)
+    bins: List[List[int]] = []
+    bin_lengths: List[int] = []
+
+    for orig_idx in order:
+        seq_len = min(len(sequences[orig_idx]), max_packed_len)
+
+        best_bin = None
+        best_remain = max_packed_len + 1
+        for i, bl in enumerate(bin_lengths):
+            remain = max_packed_len - bl
+            if seq_len <= remain < best_remain:
+                best_remain = remain
+                best_bin = i
+
+        if best_bin is not None:
+            bins[best_bin].append(orig_idx)
+            bin_lengths[best_bin] += seq_len
+        else:
+            bins.append([orig_idx])
+            bin_lengths.append(seq_len)
+
+    plan: List[Tuple[int, int]] = []
+    for bin_indices in bins:
+        for orig_idx in bin_indices:
+            plan.append((orig_idx, min(len(sequences[orig_idx]), max_packed_len)))
+
+    return plan
+
+
 class Pipeline:
    """Tokenization pipeline driven by a declarative :class:`PipelineConfig`.

@ -145,6 +204,25 @@ class Pipeline:
        for domain, keys in domains.items():
            idx = shard_idx[domain]
            chunk_dir = os.path.join(self.output_dir, domain)
+
+            pp = self.config.preprocessing
+            if pp.packing_strategy != "simple" and "sequence" in keys:
+                plan = pack_sequences(
+                    keys["sequence"],
+                    pp.max_packed_len,
+                    pp.packing_strategy,
+                    pp.truncation_mode,
+                )
+                reordered = defaultdict(list)
+                for orig_idx, truncated_len in plan:
+                    for k, vals in keys.items():
+                        reordered[k].append(
+                            _truncate(
+                                vals[orig_idx], pp.max_packed_len, pp.truncation_mode
+                            )
+                        )
+                keys = reordered
+
            tensors = {}
            for key, ids_list in keys.items():
                dt = _STR_TO_DTYPE.get(