3 changed files with 18 additions and 44 deletions
--- a/astrai/inference/cache.py
+++ b/astrai/inference/cache.py
@@ -170,13 +170,15 @@ class PagedCache:
            written += chunk

    def gather(self, layer_id: int, page_table: Tensor) -> Tuple[Tensor, Tensor]:
-        # page_table: [batch, max_pages] with -1 padding for tasks with fewer pages.
-        # clamp(min=0) maps -1 to page 0 (irrelevant data) — truncated by CacheView total_len.
-        safe = page_table.clamp(min=0)
-        k = self.k_cache[layer_id, safe]
-        v = self.v_cache[layer_id, safe]
-        k = k.flatten(1, 2)
-        v = v.flatten(1, 2)
+        k_parts, v_parts = [], []
+        for pi in range(page_table.size(1)):
+            phys_pages = page_table[:, pi]
+            if not (phys_pages >= 0).any():
+                break
+            k_parts.append(self.k_cache[layer_id, phys_pages])
+            v_parts.append(self.v_cache[layer_id, phys_pages])
+        k = torch.cat(k_parts, dim=1)
+        v = torch.cat(v_parts, dim=1)
        return k, v


--- a/astrai/inference/scheduler.py
+++ b/astrai/inference/scheduler.py
@@ -147,13 +147,6 @@ class InferenceScheduler:
        if len(prompt_ids) > self.max_prompt_len:
            prompt_ids = prompt_ids[-self.max_prompt_len :]

-        if len(prompt_ids) >= self.max_seq_len:
-            if stream_callback:
-                stream_callback(STOP)
-            return task_id
-
-        max_tokens = min(max_tokens, self.max_seq_len - len(prompt_ids))
-
        task = Task(
            task_id=task_id,
            prompt_ids=prompt_ids,
@@ -196,10 +189,7 @@ class InferenceScheduler:
    def _remove_finished_tasks(self) -> None:
        finished = []
        for task in self.active_tasks:
-            if task.status == TaskStatus.ABORTED:
-                task.finish_time = time.time()
-                finished.append(task)
-            elif task.is_finished(self.tokenizer.stop_ids):
+            if task.is_finished(self.tokenizer.stop_ids):
                task.status = TaskStatus.FINISHED
                task.finish_time = time.time()
                finished.append(task)
@@ -213,9 +203,7 @@ class InferenceScheduler:
                task._pages_freed = True

        self.active_tasks = [
-            t
-            for t in self.active_tasks
-            if t.status not in (TaskStatus.FINISHED, TaskStatus.ABORTED)
+            t for t in self.active_tasks if t.status != TaskStatus.FINISHED
        ]

    def _refill_active_batch(self) -> None:
@@ -266,9 +254,7 @@ class InferenceScheduler:

        seq_len = prompt_len - start_pos
        input_ids = torch.empty(batch_sz, seq_len, dtype=torch.long, device=self.device)
-        input_mask = torch.ones(
-            batch_sz, prompt_len, dtype=torch.bool, device=self.device
-        )
+        input_mask = torch.ones(batch_sz, seq_len, dtype=torch.bool, device=self.device)

        for i, t in enumerate(tasks):
            input_ids[i] = torch.tensor(
@@ -294,22 +280,11 @@ class InferenceScheduler:
            return

        tasks = sorted(tasks, key=lambda t: t.task_id)
-
-        valid: List[Task] = []
-        for t in tasks:
-            if self._maybe_alloc_page(t, start_pos):
-                valid.append(t)
-            else:
-                t.status = TaskStatus.ABORTED
-                if t.stream_callback:
-                    t.stream_callback(STOP)
-
-        if not valid:
-            return
-
-        tasks = valid
        batch_sz = len(tasks)

+        for t in tasks:
+            self._maybe_alloc_page(t, start_pos)
+
        input_ids = torch.tensor(
            [t.output_ids[-1] if t.output_ids else t.prompt_ids[-1] for t in tasks],
            dtype=torch.long,
@@ -359,15 +334,14 @@ class InferenceScheduler:
        rows = [t.page_table + [-1] * (max_pages - t.n_pages) for t in tasks]
        return torch.tensor(rows, dtype=torch.long, device=self.device)

-    def _maybe_alloc_page(self, task: Task, pos: int) -> bool:
+    def _maybe_alloc_page(self, task: Task, pos: int) -> None:
        needed = self._n_pages_for(pos + 1)
        while task.n_pages < needed:
            p = self.page_cache.alloc()
            if p < 0:
-                return False
+                break
            task.page_table.append(p)
            task.n_pages += 1
-        return True

    def _run_generation_loop(self) -> None:
        try:
--- a/astrai/model/transformer.py
+++ b/astrai/model/transformer.py
@@ -29,9 +29,7 @@ def process_attention_mask(

    if seq_mask is None:
        if start_pos != 0:
-            seq_mask = torch.ones(
-                (1, start_pos + seq_len), dtype=torch.bool, device=device
-            )
+            seq_mask = torch.ones((1, seq_len), dtype=torch.bool, device=device)
        else:
            return None