From c03abd31fec08b112e64e0c2721f4ca3e89bd791 Mon Sep 17 00:00:00 2001 From: ViperEkura <3081035982@qq.com> Date: Wed, 6 May 2026 21:16:57 +0800 Subject: [PATCH] add project source files --- .gitignore | 10 ++ PROMO_GUIDE.md | 373 +++++++++++++++++++++++++++++++++++++++++ architecture.py | 76 +++++++++ continuous_batching.py | 98 +++++++++++ prefix_cache.py | 117 +++++++++++++ render_all.py | 36 ++++ transformer.py | 229 +++++++++++++++++++++++++ 7 files changed, 939 insertions(+) create mode 100644 .gitignore create mode 100644 PROMO_GUIDE.md create mode 100644 architecture.py create mode 100644 continuous_batching.py create mode 100644 prefix_cache.py create mode 100644 render_all.py create mode 100644 transformer.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96bcdec --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +__pycache__/ +*.pyc +*.pyo +.venv/ +venv/ +*.egg-info/ +dist/ +build/ +output/ +.DS_Store diff --git a/PROMO_GUIDE.md b/PROMO_GUIDE.md new file mode 100644 index 0000000..d096277 --- /dev/null +++ b/PROMO_GUIDE.md @@ -0,0 +1,373 @@ +# AstrAI 宣传视频制作指南 + +> 本文档为制作 AstrAI 宣传视频提供完整的技术参考、分镜建议和录制脚本。 +> 目标时长:**2-3 分钟** + +--- + +## 目录 + +1. [项目定位与核心卖点](#1-项目定位与核心卖点) +2. [技术架构速览](#2-技术架构速览) +3. [分镜脚本](#3-分镜脚本) +4. [演示录制指南](#4-演示录制指南) +5. [动画场景说明](#5-动画场景说明) +6. [旁白文案草稿](#6-旁白文案草稿) +7. [素材清单](#7-素材清单) + +--- + +## 1. 项目定位与核心卖点 + +**一句话定位:** +> 一个能在单张消费级 GPU 上训练和推理的 1B 参数中英双语语言模型框架。 + +**核心卖点(视频中需突出):** + +| 卖点 | 说明 | 视觉表达 | +|------|------|---------| +| **单卡可跑** | 1B 参数,RTX 3090/4090 即可运行 | 巨大服务器集群 vs 单张显卡对比 | +| **连续批处理** | 动态合并请求,吞吐量 3x+ | 任务流经 Cleanup→Refill→Prefill→Decode 动画 | +| **前缀缓存零拷贝** | 相同前缀直接复用 KV,无需重算 | Radix Tree 生长动画 | +| **OpenAI 兼容 API** | 一行代码切换 | curl 命令对比 | +| **流式输出** | 逐 token 返回,低首延迟 | 终端逐字喷出效果 | +| **全过程开源** | 训练+推理+权重全部开源 | GitHub 页面展示 | + +--- + +## 2. 
技术架构速览 + +### 整体架构 + +``` +┌──────────────────────────────────────────────────┐ +│ FastAPI Server (OpenAI-Compatible API) │ +├──────────────────────────────────────────────────┤ +│ InferenceEngine (Streaming + Async + Batch) │ +├──────────────────────────────────────────────────┤ +│ Continuous Batching Scheduler │ +│ ┌────────┐ ┌──────┐ ┌────────┐ ┌────────┐ │ +│ │Cleanup │→ │Refill│→ │Prefill │→ │ Decode │ │ +│ └────────┘ └──────┘ └────────┘ └────────┘ │ +├──────────────────────────────────────────────────┤ +│ Prefix Cache (Radix Tree) + KV Cache │ +├──────────────────────────────────────────────────┤ +│ Transformer (24层 GQA, RoPE, SwiGLU) │ +└──────────────────────────────────────────────────┘ +``` + +### 关键技术指标 + +| 指标 | 值 | +|------|------| +| 参数量 | ~1.0B | +| 词表大小 | 100,000(中英 BPE) | +| 层数 | 24 | +| 注意力头 | 24 Q-heads / 4 KV-heads(GQA) | +| 最大长度 | 2048 tokens | +| 精度 | bfloat16 | +| 最低显存 | ~6GB(推理)/~12GB(训练) | + +--- + +## 3. 分镜脚本 + +总时长 **2:30**,分为 6 个段落。 + +### Segment 1:Hook + 问题陈述(0:00 - 0:20) + +| 镜头 | 画面 | 旁白 | 时长 | +|------|------|------|------| +| 1.1 | 黑屏,逐字打出"大语言模型很强大" | "大语言模型很强大——" | 3s | +| 1.2 | 切到数据中心照片 / 巨大 GPU 集群 | "——但跑起来需要几十张 GPU,普通人根本碰不到。" | 5s | +| 1.3 | 画面分屏:左边集群,右边一张 RTX 4090 | "但如果我告诉你,只要一张显卡就够了呢?" 
| 5s | +| 1.4 | Logo 出现:**AstrAI**,下方副标题 "1B 参数单卡推理框架" | "AstrAI——单卡跑大模型。" | 7s | + +**视觉素材**:数据中心图片(可免版权下载)、RTX 4090 产品图、Logo 动画 + +--- + +### Segment 2:模型架构速览(0:20 - 0:45) + +| 镜头 | 画面 | 旁白 | 时长 | +|------|------|------|------| +| 2.1 | Transformer 架构图逐层展开:embed → 24层 decoder → norm → head | "1B 参数,24 层 Transformer,100k 词表的中英 BPE 分词器。" | 8s | +| 2.2 | 高亮 GQA:24个 Q head 映射到 4个 KV head | "GQA 分组查询注意力——24 个查询头只对应 4 个 KV 头,KV 缓存直接减少 83%。" | 10s | +| 2.3 | RoPE 旋转变换可视化 | "RoPE 旋转位置编码,支持动态长度外推。" | 5s | +| 2.4 | fade 到模型 card:vocab=100k, dim=1536, layers=24, heads=24, kv_heads=4 | 静默 | 2s | + +**视觉素材**:`architecture.py` 动画、模型参数 card + +--- + +### Segment 3:连续批处理(0:45 - 1:20) + +| 镜头 | 画面 | 旁白 | 时长 | +|------|------|------|------| +| 3.1 | 3 个请求同时到达服务器 | "当多个请求同时到达时——" | 3s | +| 3.2 | 静态批处理对比:最长补齐,3个请求串行 → 总耗时 max_len × 3 | "传统做法是静态批处理,把请求补齐到相同长度,串行处理,GPU 利用率低下。" | 8s | +| 3.3 | 连续批处理动画:任务流入 Waiting Queue → Cleanup → Refill → Prefill → Decode | "AstrAI 采用连续批处理:任务动态进出,GPU 每一刻都在满负荷运转。" | 10s | +| 3.4 | 放大 Decode 阶段:同一位置的任务合并成一批 | "特别地,只有处于相同 KV 缓存位置的任务才一起解码,从根本上避免了 RoPE 位置错乱的问题。" | 8s | +| 3.5 | 吞吐对比柱状图:Static Batch vs Continuous Batching (3x+) | "实测吞吐量提升 3 倍以上。" | 6s | + +**视觉素材**:`continuous_batching.py` 动画、对比图表 + +--- + +### Segment 4:前缀缓存(1:20 - 1:50) + +| 镜头 | 画面 | 旁白 | 时长 | +|------|------|------|------| +| 4.1 | 两个请求有相同 system prompt:"你是一个AI助手" | "如果两个请求有相同的前缀——比如相同的系统提示词——" | 5s | +| 4.2 | 普通做法:两个请求各自独立计算前 20 个 token | "普通框架会各自从头计算一遍,白白浪费算力。" | 5s | +| 4.3 | Radix Tree 生长动画:第一个请求插入,第二个请求匹配共享前缀 | "AstrAI 用一棵字典树缓存所有前缀的 KV——第二个请求直接命中。" | 8s | +| 4.4 | 高亮 Slot 复用:直接用原 slot 继续写,零拷贝 | "如果原始 slot 空闲,直接原地续写,连 GPU 内存拷贝都不需要。" | 7s | +| 4.5 | 首 token 延迟对比:有缓存 vs 无缓存(-50%) | "首 token 延迟降低一半以上。" | 5s | + +**视觉素材**:`prefix_cache.py` 动画、延迟对比 + +--- + +### Segment 5:Demo 演示(1:50 - 2:15) + +| 镜头 | 画面 | 旁白 | 时长 | +|------|------|------|------| +| 5.1 | 侧录终端:启动 stream_chat.py,逐行输出对话 | "来实际看看效果。" | 10s | +| 5.2 | 多轮对话:中文问答,逐 token 喷出 | 静默 + 打字音效 | 8s | +| 5.3 | 切到 HTTP 模式:服务端 + curl 
请求,流式返回 | "也提供 OpenAI 兼容的 HTTP API,一行 curl 就能调用。" | 7s | + +**视觉素材**:终端录屏(OBS 录制) + +--- + +### Segment 6:收尾 + CTA(2:15 - 2:30) + +| 镜头 | 画面 | 旁白 | 时长 | +|------|------|------|------| +| 6.1 | 全栈流程回顾(缩略架构图) | "训练用 SEQ → SFT → DPO/GRPO,推理用连续批处理——" | 5s | +| 6.2 | GitHub 页面 + Star 引导 | "——全部开源。点个 Star,一起让大模型更普惠。" | 7s | +| 6.3 | Logo + URL + "Open Source • Single GPU" | 静默 | 3s | + +**视觉素材**:GitHub 页面录屏、Logo 定版 + +--- + +## 4. 演示录制指南 + +### 4.1 准备工作 + +```bash +# 1. 安装依赖 +pip install -e ".[dev]" + +# 2. 下载模型(约 7GB) +python scripts/demo/download.py + +# 3. 验证模型加载 +python scripts/demo/generate_ar.py +``` + +### 4.2 录制场景 A:交互式对话 + +```bash +# 终端 1:启动交互式对话 +python scripts/demo/stream_chat.py + +# 预期交互 +>> 你好? +AstrAI: 你好!有什么我可以帮你的吗? +>> 请用中文介绍一下你自己 +AstrAI: ...(逐 token 输出) +>> 编一个关于人工智能的短故事 +AstrAI: ...(逐 token 输出) +``` + +**录制重点**: +- 逐 token 流式输出效果(用 OBS 录制终端窗口) +- 多轮对话的记忆能力(跨轮上下文保持) +- 打字音效叠加 + +### 4.3 录制场景 B:HTTP 服务 + 并发 + +```bash +# 终端 1:启动服务器 +python -m scripts.tools.server --port 8000 --device cuda + +# 终端 2:发送请求(非流式) +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Hello!"}],"stream":false}' + +# 终端 3:流式请求 +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Write a poem"}],"stream":true}' + +# 终端 4:并发压测(用 scripts/demo/generate_batch.py) +python scripts/demo/generate_batch.py +``` + +**录制重点**: +- 同时多个 curl 请求展示并发处理 +- 服务端日志显示批处理合并 +- `/stats` 端点展示实时统计 + +### 4.4 录制规格 + +| 参数 | 建议 | +|------|------| +| 分辨率 | 1920×1080 | +| 帧率 | 30fps | +| 终端 | Windows Terminal 或 iTerm2,深色主题 | +| 字号 | 16-18px,等宽字体(JetBrains Mono / Cascadia Code) | +| 录屏工具 | OBS Studio(免费) | +| 音频 | 旁白用 USB 麦克风,音效后期叠加 | + +--- + +## 5. 
动画场景说明 + +位于 `promo/` 目录,使用 Manim 引擎。 + +### 安装 Manim + +```bash +# conda 环境内安装 +pip install manim + +# 验证 +python -c "import manim; print(manim.__version__)" +``` + +### 渲染命令 + +```bash +# 单独渲染一个场景 +manim -qh promo/continuous_batching.py ContinuousBatching + +# 全部场景渲染 +python promo/render_all.py + +# 快速草稿(480p,适合调试) +manim -ql promo/continuous_batching.py ContinuousBatching +``` + +输出文件为 `promo/output/videos/` 下的 `.mp4` 文件,可直接导入剪辑软件。 + +### 场景清单 + +| 文件 | 导出场景名 | 内容 | 建议时长 | +|------|-----------|------|---------| +| `transformer.py` | `Transformer` | GQA 注意力数据流:Q/K/V 投影 → Repeat KV → SDPA → O 投影,附 RoPE / GQA 高亮与模型参数卡 | ~35s | +| `continuous_batching.py` | `ContinuousBatching` | 4 阶段流水线动画 + 吞吐对比 | ~30s | +| `prefix_cache.py` | `PrefixCache` | Radix Tree 生长 + 多分支前缀复用 | ~30s | +| `architecture.py` | `Architecture` | 全栈架构逐层展开 + 数据流 | ~25s | + +### 自定义动画 + +如需修改动画内容: +- Manim 语法参考:https://docs.manim.community/ +- 所有动画元素(颜色、位置、速度)在场景类中通过参数调整 +- 中文字体渲染需额外配置: + +```python +# 在场景类开头添加 +Text.set_default(font="Microsoft YaHei") +``` + +--- + +## 6. 旁白文案草稿 + +### 中文版(完整 2:30) + +``` +[00:00] 大语言模型很强大—— +[00:03] 但跑起来需要几十张 GPU,普通人根本碰不到。 +[00:08] 但如果我告诉你,只要一张显卡就够了呢? +[00:13] AstrAI——单卡跑大模型。 + +[00:20] 1B 参数,24 层 Transformer,100k 词表的中英 BPE 分词器。 +[00:28] GQA 分组查询注意力——24 个查询头只对应 4 个 KV 头,KV 缓存直接减少 83%。 +[00:38] RoPE 旋转位置编码,支持动态长度外推。 + +[00:45] 当多个请求同时到达时—— +[00:48] 传统做法是静态批处理,把请求补齐到相同长度串行处理,GPU 利用率低下。 +[00:56] AstrAI 采用连续批处理:任务动态进出,GPU 每一刻都在满负荷运转。 +[01:06] 只有处于相同 KV 缓存位置的任务才一起解码,从根本上避免 RoPE 位置错乱。 +[01:14] 实测吞吐量提升 3 倍以上。 + +[01:20] 如果两个请求有相同的前缀,普通框架会各自从头计算。 +[01:25] AstrAI 用一棵字典树缓存所有前缀的 KV——第二个请求直接命中。 +[01:33] 如果原始 slot 空闲,直接原地续写,连 GPU 内存拷贝都不需要。 +[01:40] 首 token 延迟降低一半以上。 + +[01:50] 来实际看看效果。 +[01:52] (现场演示部分,自由发挥) + +[02:15] 训练到推理,全流程开源,点个 Star,一起让大模型更普惠。 +[02:25] AstrAI — Open Source, Single GPU. +``` + +--- + +## 7. 
素材清单 + +### 视频素材 + +| 素材 | 来源 | 状态 | +|------|------|------| +| 数据中心 / GPU 集群图片 | Pexels / Unsplash 免版权 | 需下载 | +| RTX 4090 产品图 | NVIDIA 官网 / 实物拍摄 | 需准备 | +| AstrAI Logo | `assets/images/logo.png` | ✅ 已有 | +| 终端录屏(对话) | OBS 录制 `scripts/demo/stream_chat.py` | 需录制 | +| 终端录屏(HTTP) | OBS 录制 curl + server | 需录制 | +| 终端录屏(并发) | OBS 录制 `generate_batch.py` | 需录制 | +| GitHub 页面 | 浏览器录屏 | 需录制 | +| Transformer 架构动画 | Manim 渲染 `transformer.py` | ✅ 已渲染 | +| 架构动画 | Manim 渲染 `architecture.py` | ✅ 已渲染 | +| 连续批处理动画 | Manim 渲染 `continuous_batching.py` | ✅ 已渲染 | +| 前缀缓存动画 | Manim 渲染 `prefix_cache.py` | ✅ 已渲染 | + +### 音频素材 + +| 素材 | 建议 | +|------|------| +| 旁白 | USB 麦克风录制,男声或女声,中文普通话 | +| 背景音乐 | Epidemic Sound / YouTube Audio Library 搜索 "technology ambient" | +| 音效 | 打字音效(terminal keystrokes)、转场 swoosh、whoosh | + +### 软件工具 + +| 用途 | 推荐工具 | 价格 | +|------|---------|------| +| 录屏 | OBS Studio | 免费 | +| 剪辑 | DaVinci Resolve | 免费 | +| 动画渲染 | Manim (`pip install manim`) | 免费 | +| 音频处理 | Audacity | 免费 | +| 字幕 | DaVinci Resolve 内建 / Aegisub | 免费 | + +--- + +## 附录:关键文件索引 + +| 文件路径 | 说明 | +|---------|------| +| `README.md` | 项目主页 README,含快速开始 | +| `assets/docs/introduction.md` | 模型架构深度介绍 | +| `assets/docs/design.md` | 设计文档 + UML 类图 | +| `astrai/inference/scheduler.py` | 连续批处理调度器核心代码 | +| `astrai/inference/engine.py` | 推理引擎统一接口 | +| `astrai/inference/server.py` | FastAPI 服务器 | +| `astrai/model/transformer.py` | Transformer 模型 | +| `astrai/model/module.py` | GQA、MLA、MLP 等模块 | +| `scripts/demo/stream_chat.py` | 交互式对话演示 | +| `scripts/demo/generate_batch.py` | 批量生成演示 | +| `scripts/tools/server.py` | HTTP 服务启动脚本 | +| `scripts/tools/benchmark.py` | 性能基准测试 | +| `scripts/promo/README.md` | 动画渲染说明(已移至 promo/) | +| `promo/render_all.py` | 一键渲染所有动画 | +| `promo/continuous_batching.py` | 连续批处理 Manim 场景 | +| `promo/prefix_cache.py` | 前缀缓存 Manim 场景 | +| `promo/architecture.py` | 架构总览 Manim 场景 | +| `params/config.json` | 模型配置 | diff --git a/architecture.py b/architecture.py new file mode 100644 index 
# =============================================================================
# File: architecture.py
# =============================================================================
"""AstrAI promo: Full architecture overview."""

from manim import *


class Architecture(Scene):
    """Animate the AstrAI system stack, one layer at a time.

    Sequence: title, five stacked layer boxes joined by arrows, a highlight
    on the prefix-cache layer, then a closing call-to-action card.
    """

    def construct(self):
        """Build and play the whole scene."""
        title = Text("AstrAI Architecture", font_size=48, color=BLUE)
        self.play(Write(title))
        self.wait(0.2)
        self.play(title.animate.to_edge(UP))

        # (box height, colour, heading, sub-lines) for each layer, top to bottom.
        layers_data = [
            (0.9, GREEN, "API Layer", ["FastAPI Server • OpenAI-Compatible API"]),
            (0.9, BLUE, "Inference Engine", ["Streaming • Async • Batch Modes"]),
            (1.6, YELLOW, "Continuous Batching Scheduler",
             ["Cleanup → Refill → Prefill → Decode",
              "Position-Grouped Decode",
              "Bitmask O(1) Slot Allocation"]),
            (1.2, ORANGE, "Prefix Cache + KV Cache",
             ["Radix Tree • Slot Versioning",
              "GPU copy_() → Zero-Copy Reuse"]),
            (1.2, PURPLE, "Transformer Model (1B params)",
             ["24-layer GQA • RoPE • SwiGLU",
              "bfloat16 • 100K vocab"]),
        ]

        # Each layer is VGroup(box, content); index 0 is always the box.
        layers = VGroup()
        for height, color, label, subs in layers_data:
            box = Rectangle(width=7.5, height=height, color=color, fill_opacity=0.1)
            lbl = Text(label, font_size=18, color=color)
            items = [lbl] + [Text(s, font_size=11, color=WHITE) for s in subs]
            content = VGroup(*items)
            content.arrange(DOWN, buff=0.22)
            content.move_to(box.get_center())
            layers.add(VGroup(box, content))

        layers.arrange(DOWN, buff=0.18)
        layers.next_to(title, DOWN, buff=0.3)

        # Arrows are collected so they can be removed together with the layers;
        # they are not children of `layers`, so fading `layers` alone would
        # leave them on screen behind the closing card.
        arrows = VGroup()
        for i, layer in enumerate(layers):
            self.play(Create(layer), run_time=0.35)
            if i == 0:
                continue
            # Use box-to-box for arrow endpoints (not content)
            prev_box = layers[i - 1][0]
            curr_box = layer[0]
            arrow = Arrow(
                prev_box.get_bottom(),
                curr_box.get_top(),
                color=GRAY,
                buff=0.1,
                max_tip_length_to_length_ratio=0.15,
            )
            arrows.add(arrow)
            self.play(Create(arrow), run_time=0.15)

        self.wait(0.5)

        # Highlight the "Prefix Cache + KV Cache" layer (index 3).
        hl = SurroundingRectangle(layers[3], color=GREEN, buff=0.12)
        hl_note = Text("Zero-Copy Prefix Reuse", font_size=22, color=GREEN)
        # NOTE(review): the layer boxes are 7.5 wide, so a note placed 0.8 to
        # their right may run past the frame edge — confirm in a render.
        hl_note.next_to(hl, RIGHT, buff=0.8)
        self.play(Create(hl), Write(hl_note))
        self.wait(1.5)
        self.play(FadeOut(hl), FadeOut(hl_note))

        self.play(FadeOut(layers), FadeOut(arrows))

        cta = VGroup(
            Text("AstrAI", font_size=52, color=BLUE),
            Text("Single GPU • Open Source • 1B params", font_size=24, color=GRAY),
            Text("github.com/ViperEkura/AstrAI", font_size=20, color=YELLOW),
        ).arrange(DOWN, buff=0.35)
        self.play(Write(cta))
        self.wait(2)
        self.play(FadeOut(cta), FadeOut(title))


# =============================================================================
# File: continuous_batching.py
# =============================================================================
"""AstrAI promo: Continuous Batching animation.

Shows how tasks flow through the 4-phase pipeline and get batched together.
"""

from manim import *


class ContinuousBatching(Scene):
    """Animate requests moving through the pipeline, then a throughput card.

    Sequence: title, four stage boxes joined by arrows, five request markers
    spawned in the first box and moved stage by stage, a highlight on the
    decode stage, and finally a text-only throughput comparison.
    """

    def construct(self):
        """Build and play the whole scene."""
        # ── title ──
        title = Text("Continuous Batching", font_size=48, color=BLUE)
        self.play(Write(title))
        self.wait(0.5)
        self.play(title.animate.to_edge(UP).scale(0.6))
        top_bar = Line(LEFT * 7, RIGHT * 7, color=GRAY).next_to(title, DOWN)
        self.play(Create(top_bar))

        # ── pipeline stages ──
        # NOTE(review): the caption below says "4-Phase Generation Loop", but
        # these boxes are request states rather than the scheduler phases
        # (Cleanup → Refill → Prefill → Decode) named in architecture.py —
        # confirm which wording is intended.
        stage_names = ["Waiting\nQueue", "Prefill", "Decode\n(Batched)", "Finished"]
        stage_color = [GRAY, BLUE, YELLOW, GREEN]

        stages = VGroup()
        arrows = VGroup()
        for i, (name, color) in enumerate(zip(stage_names, stage_color)):
            box = Rectangle(height=1.5, width=2.5, color=color, fill_opacity=0.12)
            lbl = Text(name, font_size=18, color=color)
            grp = VGroup(box, lbl)
            # Four boxes spaced 3.2 apart, centred horizontally on the frame.
            grp.shift(RIGHT * (i - 1.5) * 3.2 + DOWN * 0.5)
            stages.add(grp)
            self.play(Create(grp), run_time=0.35)
            if i > 0:
                a = Arrow(stages[i - 1].get_right(), stages[i].get_left(), color=GRAY)
                arrows.add(a)
                self.play(Create(a), run_time=0.2)

        pipeline = VGroup(stages, arrows)
        plabel = Text("4-Phase Generation Loop", font_size=16, color=GRAY).next_to(
            pipeline, DOWN, buff=0.4
        )
        self.play(Write(plabel))
        self.wait(0.5)

        # ── spawn tasks ──
        task_colors = [YELLOW, ORANGE, PINK, TEAL, GREEN]
        tasks = VGroup()
        box_center = stages[0].get_center()
        for i, c in enumerate(task_colors):
            dot = Dot(color=c, radius=0.12)
            # Small horizontal spread around the box centre (applied along RIGHT).
            x_off = (i - 2) * 0.2
            dot.move_to(box_center + RIGHT * x_off * 0.3)
            lbl = Text(f"R{i+1}", font_size=10, color=c).next_to(dot, UP, buff=0.1)
            tg = VGroup(dot, lbl)
            tasks.add(tg)
            self.play(FadeIn(tg, scale=0.5), run_time=0.12)

        self.wait(0.3)

        # ── animate through stages ──
        # Every marker is moved to the same stage centre, so the markers
        # coincide once they leave the waiting queue.
        for phase in range(1, 4):
            target = stages[phase].get_center()
            anims = [t.animate.move_to(target) for t in tasks]
            self.play(*anims, run_time=0.5, rate_func=smooth)
            self.wait(0.15)

        # ── highlight decode batching ──
        ring = SurroundingRectangle(stages[2], color=YELLOW, buff=0.12)
        note = Text(
            "Same-position batch decoding", font_size=16, color=YELLOW
        ).next_to(stages[2], DOWN, buff=0.5)
        self.play(Create(ring), Write(note))
        self.wait(1)
        self.play(FadeOut(ring), FadeOut(note))

        # ── throughput comparison (text) ──
        self.play(
            *[FadeOut(t) for t in tasks],
            FadeOut(pipeline),
            FadeOut(plabel),
            FadeOut(top_bar),
        )

        compare = VGroup(
            Text("Throughput Comparison", font_size=32, color=BLUE),
            Text(
                "Static Batch: 1.0× (baseline)",
                font_size=24, color=RED,
            ),
            Text(
                "Continuous Batching: 3.4× (single GPU)",
                font_size=24, color=GREEN,
            ),
        ).arrange(DOWN, buff=0.4, aligned_edge=LEFT)
        self.play(Write(compare))
        self.wait(2)
        # Fade the title too, so the scene ends on an empty frame like the
        # other promo scenes.
        self.play(FadeOut(compare), FadeOut(title))


# =============================================================================
# File: prefix_cache.py
# =============================================================================
"""AstrAI promo: Prefix Cache animation (Radix tree with branches)."""

from manim import *
import numpy as np  # explicit, matching transformer.py, instead of relying on the star import


class PrefixCache(Scene):
    """Animates the radix-tree prefix cache with multiple distinct branches.

    Four requests are inserted in turn: R1 "A B C", R2 "A B D E", R3 "A B F"
    and R4 "X Y". The shared "A B" prefix is then highlighted and a summary
    card is shown below the tree.
    """

    def _add_node(self, parent_pos, label, color, dx, dy):
        """Build a labelled tree node offset from its parent.

        Args:
            parent_pos: 3-element position of the parent node.
            label: Text drawn above the node dot.
            color: Colour of the dot and its label.
            dx: Horizontal offset from the parent.
            dy: Vertical offset from the parent.

        Returns:
            ``(node_group, edge, pos)``: the dot+label group, the grey line
            from parent to node, and the node position for chaining children.
        """
        pos = parent_pos + np.array([dx, dy, 0])
        dot = Dot(point=pos, color=color, radius=0.1)
        txt = Text(label, font_size=13, color=color)
        txt.next_to(dot, UP, buff=0.1)
        grp = VGroup(dot, txt)
        edge = Line(parent_pos, pos, color=GRAY, stroke_width=1.5)
        return grp, edge, pos

    def _add_leaf(self, parent_pos, color, tag):
        """Build a square leaf 0.7 below ``parent_pos`` with a caption.

        Args:
            parent_pos: 3-element position of the node the leaf hangs from.
            color: Colour of the square, its connecting line and the caption.
            tag: Caption placed under the square (e.g. ``"slot 0"``).

        Returns:
            A ``VGroup`` of the square, its connecting line and the caption.
        """
        leaf = Square(side_length=0.25, color=color, fill_opacity=0.4)
        leaf.move_to(parent_pos + DOWN * 0.7)
        edge = Line(parent_pos, leaf.get_top(), color=color, stroke_width=1.5)
        lbl = Text(tag, font_size=10, color=color).next_to(leaf, DOWN, buff=0.1)
        return VGroup(leaf, edge, lbl)

    def construct(self):
        """Build and play the whole scene."""
        title = Text("Prefix Cache", font_size=48, color=BLUE)
        self.play(Write(title))
        self.wait(0.2)
        self.play(title.animate.to_edge(UP).scale(0.6))

        # Root at top-left, tree stays visible throughout
        root_pos = np.array([-4.5, 2.0, 0])
        root = Circle(radius=0.25, color=BLUE, fill_opacity=0.2)
        root.move_to(root_pos)
        root_lbl = Text("root", font_size=10, color=GRAY).move_to(root)
        root_grp = VGroup(root, root_lbl)
        self.play(FadeIn(root_grp, scale=0.5), run_time=0.3)

        # Labels accumulate on the right side
        right_x = 3.5
        label_y = 2.5
        label_step = 0.5

        def show_label(text, color):
            """Write a request label in the right column and advance the row."""
            nonlocal label_y
            lbl = Text(text, font_size=14, color=color)
            lbl.move_to([right_x, label_y, 0])
            label_y -= label_step
            self.play(Write(lbl))
            return lbl

        # ── R1: A → B → C ──
        show_label('R1: "A B C"', GREEN)

        a_grp, a_edge, a_pos = self._add_node(root_pos, "A", GREEN, 0.6, -0.9)
        self.play(Create(a_edge), FadeIn(a_grp, scale=0.5), run_time=0.2)
        b_grp, b_edge, b_pos = self._add_node(a_pos, "B", GREEN, 0.6, -0.9)
        self.play(Create(b_edge), FadeIn(b_grp, scale=0.5), run_time=0.2)
        c_grp, c_edge, c_pos = self._add_node(b_pos, "C", GREEN, 0.6, -0.9)
        self.play(Create(c_edge), FadeIn(c_grp, scale=0.5), run_time=0.2)
        self.play(FadeIn(self._add_leaf(c_pos, GREEN, "slot 0"), scale=0.8), run_time=0.3)
        self.wait(0.3)

        # ── R2: shares A B, branches D E ──
        show_label('R2: "A B D E"', ORANGE)

        # Flash the nodes that R2 matches before it branches off.
        for g in [a_grp, b_grp]:
            flash = SurroundingRectangle(g, color=YELLOW, buff=0.12)
            self.play(Create(flash), run_time=0.1)
            self.play(FadeOut(flash), run_time=0.08)

        d_grp, d_edge, d_pos = self._add_node(b_pos, "D", ORANGE, -0.6, -0.9)
        self.play(Create(d_edge), FadeIn(d_grp, scale=0.5), run_time=0.2)
        e_grp, e_edge, e_pos = self._add_node(d_pos, "E", ORANGE, -0.6, -0.9)
        self.play(Create(e_edge), FadeIn(e_grp, scale=0.5), run_time=0.2)
        self.play(FadeIn(self._add_leaf(e_pos, ORANGE, "slot 1"), scale=0.8), run_time=0.3)
        self.wait(0.3)

        # ── R3: shares A B, single F ──
        show_label('R3: "A B F"', PINK)

        f_grp, f_edge, f_pos = self._add_node(b_pos, "F", PINK, 0.0, -1.2)
        self.play(Create(f_edge), FadeIn(f_grp, scale=0.5), run_time=0.2)
        self.play(FadeIn(self._add_leaf(f_pos, PINK, "slot 2"), scale=0.8), run_time=0.3)
        self.wait(0.3)

        # ── R4: new prefix from root ──
        show_label('R4: "X Y"', TEAL)

        x_grp, x_edge, x_pos = self._add_node(root_pos, "X", TEAL, -1.0, -0.9)
        self.play(Create(x_edge), FadeIn(x_grp, scale=0.5), run_time=0.2)
        y_grp, y_edge, y_pos = self._add_node(x_pos, "Y", TEAL, -0.6, -0.9)
        self.play(Create(y_edge), FadeIn(y_grp, scale=0.5), run_time=0.2)
        self.play(FadeIn(self._add_leaf(y_pos, TEAL, "slot 3"), scale=0.8), run_time=0.3)
        self.wait(0.5)

        # ── highlight shared prefix (tree stays) ──
        reuse_box = SurroundingRectangle(VGroup(a_grp, b_grp), color=YELLOW, buff=0.15)
        reuse_note = Text(
            'Prefix "A B" shared\nby 3 requests — 0 copy',
            font_size=16,
            color=YELLOW,
        )
        # Placed to the right: the R4 branch (X at x=-5.5, Y at x=-6.1) sits
        # directly left of the A/B box, so a note on that side would be drawn
        # over it. Right of the box is empty at this height.
        reuse_note.next_to(reuse_box, RIGHT, buff=1.0)
        self.play(Create(reuse_box), Write(reuse_note))
        self.wait(2)
        self.play(FadeOut(reuse_box), FadeOut(reuse_note))

        # ── summary below tree (tree stays visible) ──
        summary = VGroup(
            Text("KV cache reuse across requests", font_size=26, color=GREEN),
            Text("First-token latency: up to 50% reduction", font_size=18, color=GRAY),
        ).arrange(DOWN, buff=0.2)
        summary.to_edge(DOWN, buff=0.5)
        self.play(Write(summary))
        self.wait(2)
        # Clear everything still on screen: summary, title, root, all nodes,
        # edges, leaves and request labels. Fading only the root would leave
        # the rest of the tree behind.
        self.play(*[FadeOut(m) for m in self.mobjects])


# =============================================================================
# File: render_all.py
# =============================================================================
"""Render all promo scenes with Manim."""

import subprocess
import sys
from pathlib import Path

# Directory that holds this script and the scene modules. Resolving paths
# against it makes the renderer independent of the caller's working directory
# and of what the directory is called.
PROMO_DIR = Path(__file__).resolve().parent

# (scene module file, Scene subclass to render)
SCENES = [
    ("transformer.py", "Transformer"),
    ("architecture.py", "Architecture"),
    ("continuous_batching.py", "ContinuousBatching"),
    ("prefix_cache.py", "PrefixCache"),
]


def render(file_name, scene_name, quality="-qh"):
    """Render one scene by running ``python -m manim`` as a subprocess.

    Args:
        file_name: Scene module file name, relative to this script's directory.
        scene_name: Name of the ``Scene`` subclass to render.
        quality: Manim quality flag passed through unchanged (default ``-qh``).

    Raises:
        subprocess.CalledProcessError: If Manim exits with a non-zero status.
    """
    media_dir = PROMO_DIR / "output"
    cmd = [
        sys.executable,
        "-m",
        "manim",
        str(PROMO_DIR / file_name),
        scene_name,
        quality,
        "--media_dir",
        str(media_dir),
    ]
    print(f"Rendering {scene_name}...")
    subprocess.run(cmd, check=True)
    # Manim nests its output as <media_dir>/videos/<module>/<quality>/<Scene>.mp4.
    video_dir = media_dir / "videos" / Path(file_name).stem
    print(f"  Done → {video_dir}/<quality>/{scene_name}.mp4")


if __name__ == "__main__":
    quality = "-qh"  # 1080p; use -ql for a 480p draft, -qk for 4K
    if len(sys.argv) > 1:
        quality = sys.argv[1]
    for f, s in SCENES:
        render(f, s, quality)
    print("All scenes rendered.")


# =============================================================================
# File: transformer.py
# =============================================================================
"""AstrAI promo: Transformer GQA attention animation.

Shows the Grouped-Query Attention (GQA) mechanism with orthogonal data-flow lines:
    Input → Q/K/V Projections → Repeat KV → SDPA → O Projection → Output
"""

from manim import *
import numpy as np


class Transformer(Scene):
    """Animates the GQA attention mechanism with orthogonal connection lines.

    Sequence: title, labelled boxes for each step, right-angled connector
    lines, three highlights shown one after another (RoPE, GQA ratio,
    repeat_kv), then a "Model Specifications" card.
    """

    def construct(self):
        """Build and play the whole scene."""
        title = Text("Grouped-Query Attention (GQA)", font_size=42, color=BLUE)
        title.to_edge(UP, buff=0.35)
        self.play(Write(title))

        # ── Helper: box ──
        def mk(name, color, w=2.6, h=0.72, fs=10):
            """Return VGroup(rectangle, centred label) for one pipeline step."""
            box = Rectangle(
                width=w, height=h, color=color, fill_opacity=0.12, stroke_width=1.5
            )
            lbl = Text(name, font_size=fs, color=color)
            return VGroup(box, lbl)

        # ── Layout ──
        inp = Text("x (hidden states)", font_size=15, color=GRAY)
        inp.move_to(UP * 2.8)

        y1 = 1.5  # row of the Q / K / V projections
        q_grp = mk("Q Projection\n1536 → 24×64", YELLOW)
        k_grp = mk("K Projection\n1536 → 4×64", YELLOW)
        v_grp = mk("V Projection\n1536 → 4×64", YELLOW)
        q_grp.move_to(LEFT * 3.0 + UP * y1)
        k_grp.move_to(UP * y1)
        v_grp.move_to(RIGHT * 3.0 + UP * y1)

        y2 = 0.0  # Repeat KV row
        repeat_grp = mk("Repeat KV\n4 heads → 24 heads", GREEN, 2.4, 0.68, 10)
        repeat_grp.move_to(UP * y2)

        y3 = -1.6  # attention row
        sdpa_grp = mk(
            "Scaled Dot-Product\nAttention Q·Kᵀ/√d", BLUE, 2.8, 0.74, 10
        )
        sdpa_grp.move_to(UP * y3)

        y4 = -3.0  # output projection row
        o_grp = mk("O Projection\n1536 → 1536", PURPLE, 2.2, 0.68, 10)
        o_grp.move_to(UP * y4)

        out = Text("x' (hidden states)", font_size=15, color=GRAY)
        out.next_to(o_grp, DOWN, buff=0.4)

        # ── Animate boxes ──
        self.play(Write(inp))
        all_boxes = [q_grp, k_grp, v_grp, repeat_grp, sdpa_grp, o_grp]
        for g in all_boxes:
            self.play(FadeIn(g, shift=UP * 0.1), run_time=0.2)

        # ── Input trunk → branch → Q/K/V (enter from directly above) ──
        trunk_bottom = np.array([0, q_grp.get_top()[1] + 0.35, 0])
        trunk = Line(inp.get_bottom(), trunk_bottom, color=GRAY, stroke_width=1.5)
        self.play(Create(trunk), run_time=0.15)

        branch_left = Line(
            np.array([q_grp.get_top()[0], trunk_bottom[1], 0]),
            np.array([k_grp.get_top()[0], trunk_bottom[1], 0]),
            color=GRAY, stroke_width=1.5,
        )
        branch_right = Line(
            np.array([k_grp.get_top()[0], trunk_bottom[1], 0]),
            np.array([v_grp.get_top()[0], trunk_bottom[1], 0]),
            color=GRAY, stroke_width=1.5,
        )
        self.play(Create(branch_left), Create(branch_right), run_time=0.2)

        drop_q = Line(
            np.array([q_grp.get_top()[0], trunk_bottom[1], 0]),
            q_grp.get_top(),
            color=GRAY, stroke_width=1.5,
        )
        drop_k = Line(
            np.array([k_grp.get_top()[0], trunk_bottom[1], 0]),
            k_grp.get_top(),
            color=GRAY, stroke_width=1.5,
        )
        drop_v = Line(
            np.array([v_grp.get_top()[0], trunk_bottom[1], 0]),
            v_grp.get_top(),
            color=GRAY, stroke_width=1.5,
        )
        for ln in [drop_q, drop_k, drop_v]:
            self.play(Create(ln), run_time=0.12)

        input_lines = VGroup(trunk, branch_left, branch_right, drop_q, drop_k, drop_v)

        # ── K/V → Repeat KV (trunk-branch, enter from above) ──
        kv_junc_y = repeat_grp.get_top()[1] + 0.3
        drop_k2 = Line(
            k_grp.get_bottom(),
            np.array([k_grp.get_bottom()[0], kv_junc_y, 0]),
            color=GRAY, stroke_width=1.5,
        )
        drop_v2 = Line(
            v_grp.get_bottom(),
            np.array([v_grp.get_bottom()[0], kv_junc_y, 0]),
            color=GRAY, stroke_width=1.5,
        )
        kv_branch = Line(
            np.array([v_grp.get_bottom()[0], kv_junc_y, 0]),
            np.array([k_grp.get_bottom()[0], kv_junc_y, 0]),
            color=GRAY, stroke_width=1.5,
        )
        kv_trunk = Line(
            np.array([k_grp.get_bottom()[0], kv_junc_y, 0]),
            repeat_grp.get_top(),
            color=GRAY, stroke_width=1.5,
        )
        kv_lines = VGroup(drop_k2, drop_v2, kv_branch, kv_trunk)
        self.play(Create(kv_lines), run_time=0.3)

        # ── Q → SDPA (bypasses Repeat KV, from above) ──
        qs_junc_y = sdpa_grp.get_top()[1] + 0.3
        line_qs = VMobject(color=GRAY, stroke_width=1.5)
        line_qs.set_points_as_corners([
            q_grp.get_bottom(),
            np.array([q_grp.get_bottom()[0], qs_junc_y, 0]),
            np.array([sdpa_grp.get_top()[0], qs_junc_y, 0]),
            sdpa_grp.get_top(),
        ])
        self.play(Create(line_qs), run_time=0.15)

        line_rs = orth_line(repeat_grp.get_bottom(), sdpa_grp.get_top(), GRAY)
        self.play(Create(line_rs), run_time=0.15)

        line_so = orth_line(sdpa_grp.get_bottom(), o_grp.get_top(), GRAY)
        self.play(Create(line_so), run_time=0.15)

        line_oo = orth_line(o_grp.get_bottom(), out.get_top(), GRAY)
        self.play(Create(line_oo), run_time=0.15)
        self.play(Write(out))

        self.wait(0.4)

        all_lines = VGroup(
            input_lines, kv_lines, line_qs,
            line_rs, line_so, line_oo,
        )

        # ── RoPE highlight ──
        rope_q = SurroundingRectangle(q_grp, color=TEAL, buff=0.12)
        rope_k = SurroundingRectangle(k_grp, color=TEAL, buff=0.12)
        rope_t = Text(
            "RoPE: rotary position encoding\napplied to Q and K",
            font_size=13, color=TEAL,
        )
        rope_t.next_to(VGroup(rope_q, rope_k), UP, buff=0.25)
        self.play(Create(rope_q), Create(rope_k), Write(rope_t))
        self.wait(1.5)
        self.play(FadeOut(rope_q), FadeOut(rope_k), FadeOut(rope_t))

        # ── GQA ratio highlight ──
        gqa_h = SurroundingRectangle(
            VGroup(q_grp, k_grp, v_grp), color=YELLOW, buff=0.2
        )
        gqa_t = Text(
            "GQA 6:1 — 24 Q-heads → 4 KV-heads\nKV cache reduced by 83%",
            font_size=13, color=YELLOW,
        )
        # NOTE(review): the Q/K/V row already spans x = ±4.3, so a note to its
        # right may run past the frame edge — confirm in a render.
        gqa_t.next_to(gqa_h, RIGHT, buff=0.5)
        self.play(Create(gqa_h), Write(gqa_t))
        self.wait(1.8)
        # Clear this highlight before the next one. Both notes sit to the
        # right of rectangles centred on the same row, so leaving this one up
        # would draw the two texts on top of each other.
        self.play(FadeOut(gqa_h), FadeOut(gqa_t))

        # ── Repeat KV highlight ──
        kv_h = SurroundingRectangle(
            VGroup(k_grp, v_grp), color=GREEN, buff=0.12
        )
        kv_t = Text(
            "repeat_kv(): broadcast\n4 heads → 24 heads",
            font_size=12, color=GREEN,
        )
        kv_t.next_to(kv_h, RIGHT, buff=0.5)
        self.play(Create(kv_h), Write(kv_t))
        self.wait(1.5)

        # ── Fade all ──
        self.play(
            *[FadeOut(g) for g in all_boxes],
            FadeOut(all_lines),
            FadeOut(kv_h), FadeOut(kv_t),
            FadeOut(inp), FadeOut(out), FadeOut(title),
        )

        # ── Specs card ──
        st = Text("Model Specifications", font_size=36, color=BLUE)
        st.to_edge(UP, buff=0.5)
        rows_data = [
            ("Parameters", "~1.0B"),
            ("Layers", "24 × DecoderBlock"),
            ("Hidden Dim", "1536"),
            ("Q Heads / KV Heads", "24 / 4 (GQA, 6:1)"),
            ("Head Dim", "64"),
            ("FFN Dim", "4608 (SwiGLU)"),
            ("Max Length", "2048"),
            ("Precision", "bfloat16"),
        ]
        table = VGroup()
        for label, value in rows_data:
            row = VGroup(
                Text(label + ":", font_size=15, color=GRAY),
                Text(value, font_size=15, color=WHITE),
            ).arrange(RIGHT, buff=0.4, aligned_edge=LEFT)
            table.add(row)
        table.arrange(DOWN, buff=0.1, aligned_edge=LEFT)
        table.next_to(st, DOWN, buff=0.4)
        self.play(Write(st), Write(table))
        self.wait(2)
        self.play(FadeOut(st), FadeOut(table))


def orth_line(start, end, color=GRAY):
    """Create an L-shaped orthogonal line from start to end.

    The path runs vertically from ``start`` to the height of ``end`` and then
    horizontally to ``end``; the corner takes ``start``'s x and ``end``'s y.

    Args:
        start: 3-element start point.
        end: 3-element end point.
        color: Stroke colour of the path.

    Returns:
        A ``VMobject`` with stroke width 1.5 passing through the three corners.
    """
    mid = np.array([start[0], end[1], 0])
    path = VMobject(color=color, stroke_width=1.5)
    path.set_points_as_corners([start, mid, end])
    return path