refactor: 修改转化方法

2026-04-18 20:31:57 +08:00 · 2026-04-18 20:31:57 +08:00 · 6fc0be476f
parent 01867ed9dc
commit 6fc0be476f
2 changed files with 28 additions and 636 deletions
--- a/markdown-converter/SKILL.md
+++ b/markdown-converter/SKILL.md
@@ -1,84 +1,45 @@
 ---
 name: markdown-converter
-description: A versatile Markdown conversion tool that supports converting Markdown to HTML, PNG images, and plain text formats.
-metadata: {"clawdbot":{"emoji":"📝","os":["linux","darwin","win32"]}}
+description: A simple tool to convert Markdown to PNG images using a browser engine.
+metadata: {"clawdbot":{"emoji":"🖼️","os":["linux","darwin","win32"]}}
 ---

-# Markdown Converter
+# Markdown to PNG Converter

-A versatile Markdown conversion tool that supports converting Markdown to HTML, PNG images, and plain text formats.
+A simple tool to convert Markdown documents to PNG images using the Chromium browser engine.

 ## Features

- **HTML Conversion**: High-quality Markdown to HTML conversion
- **PNG Output**: Render Markdown as PNG images
- **CJK Support**: Uses system CJK fonts for Chinese character rendering
- **Code Highlighting**: Syntax highlighting for code blocks
- **Clean Output**: Removes invisible Unicode characters
+- **Browser-based rendering**: High-quality output using a real browser engine
+- **Full CSS support**: Supports complex Markdown with tables, code blocks, etc.
+- **Code highlighting**: Syntax highlighting for code blocks
+- **CJK support**: Uses system fonts for Chinese character rendering

 ## Installation

 ```bash
-pip install matplotlib html2text pygments markdown
+pip install markdown2image playwright && playwright install chromium
 ```

 ## Dependencies

- **matplotlib**: PNG image rendering with excellent CJK support
- **html2text**: HTML and Markdown conversion
- **Pygments**: Code syntax highlighting
- **markdown**: Python Markdown processor
+- **markdown2image**: Markdown to image conversion
+- **playwright**: Browser automation (Chromium)

 ## Usage

 ```bash
-# Convert to HTML
-python scripts/md_convert.py input.md output.html
-
-# Convert to PNG
+# Convert Markdown to PNG
 python scripts/md_convert.py input.md output.png
-
-# Convert to plain text
-python scripts/md_convert.py input.md output.txt
 ```

-## Output Formats
-
-### HTML
- Complete HTML document structure
- Inline CSS styling
- Code syntax highlighting
- Responsive design
-
-### PNG Card
- White card background
- Large title font
- Automatic text wrapping
- CJK character support
-
-### Plain Text
- Plain text output
- Preserves basic formatting
- Removes invisible characters
-
 ## Supported Platforms

- **Windows**: Uses system CJK fonts
- **macOS**: Uses PingFang and other system fonts
- **Linux**: Uses NotoSansCJK and other fonts
-
-## Workflow
-
-```
-Markdown Input
-    ↓
-[html2text / markdown library]
-    ↓
-HTML / PNG / Plain Text
-```
+- **Windows**: Uses the Chromium browser
+- **macOS**: Uses the Chromium browser
+- **Linux**: Uses the Chromium browser

 ## Notes

- PNG rendering requires Chinese fonts to be installed on the system
- Code highlighting requires Pygments support
- Large files may require longer processing time
+- Requires the Chromium browser (installed via `playwright install chromium`)
+- Chinese fonts are supported via system fonts
--- a/markdown-converter/scripts/md_convert.py
+++ b/markdown-converter/scripts/md_convert.py
@@ -1,581 +1,27 @@
-"""Markdown 转换器 - 支持 HTML / PNG / Plain Text 输出
+"""Markdown to PNG converter

 Dependencies:
-  - HTML/PNG 输出: pip install Pillow html2text pygments markdown
+  - pip install markdown2image playwright && playwright install chromium
 """

-import html
-import re
-import sys
 import os
+import sys
 from pathlib import Path

-
-# ============================================================
-# 字体查找
-# ============================================================
-
-def _find_font():
-    """Find a suitable TrueType font across platforms."""
-    candidates = []
-    if sys.platform == "win32":
-        pf = os.environ.get("WINDIR", r"C:\Windows")
-        candidates = [
-            os.path.join(pf, "Fonts", "msyh.ttc"),
-            os.path.join(pf, "Fonts", "msyhbd.ttc"),
-            os.path.join(pf, "Fonts", "simhei.ttf"),
-            os.path.join(pf, "Fonts", "simsun.ttc"),
-        ]
-    elif sys.platform == "darwin":
-        candidates = [
-            "/System/Library/Fonts/PingFang.ttc",
-            "/System/Library/Fonts/STHeiti Light.ttc",
-            "/Library/Fonts/Arial Unicode.ttf",
-        ]
-    else:
-        candidates = [
-            "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
-            "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
-            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
-            "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
-        ]
-    for p in candidates:
-        if os.path.exists(p):
-            return p
-    return None
-
-
-# ============================================================
-# 工具函数
-# ============================================================
-
-def _clean_invisible_chars(text):
-    """清理不可见的 Unicode 字符"""
-    if not text:
-        return ""
-    text = re.sub(r'[\u200b-\u200f\u2028-\u202f\ufeff\u00ad]', '', text)
-    text = re.sub(r'[\xa0\u3000]', ' ', text)
-    text = re.sub(r'[\uff00-\uffef]', '', text)
-    return text
-
-
-def _decode_html_entities(text):
-    """解码 HTML 实体"""
-    entities = {
-        '<': '<',
-        '>': '>',
-        '&': '&',
-        '"': '"',
-        ''': "'",
-        ''': "'",
-    }
-    for k, v in entities.items():
-        text = text.replace(k, v)
-    # 处理特殊引号
-    text = text.replace('"', '"').replace('"', '"')
-    text = text.replace(''', "'").replace(''', "'")
-    return text
-
-
-# ============================================================
-# Markdown 解析
-# ============================================================
-
-def parse_markdown(md_text):
-    """解析 Markdown 文本，提取标题、代码块、段落等元素"""
-    if not md_text:
-        return []
-
-    md_text = _clean_invisible_chars(md_text)
-    lines = md_text.split('\n')
-    elements = []
-    current_paragraph = []
-
-    def flush_paragraph():
-        nonlocal current_paragraph
-        if current_paragraph:
-            text = ' '.join(current_paragraph)
-            if text.strip():
-                elements.append(('paragraph', text.strip()))
-            current_paragraph = []
-
-    for line in lines:
-        stripped = line.strip()
-
-        # 跳过空行
-        if not stripped:
-            flush_paragraph()
-            continue
-
-        # 标题
-        header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
-        if header_match:
-            flush_paragraph()
-            level = len(header_match.group(1))
-            elements.append(('header', level, header_match.group(2).strip()))
-            continue
-
-        # 代码块
-        if stripped.startswith('```'):
-            flush_paragraph()
-            elements.append(('codeblock', stripped[3:].strip()))
-            continue
-
-        # 无序列表
-        list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped)
-        if list_match:
-            flush_paragraph()
-            elements.append(('list_item', list_match.group(1).strip()))
-            continue
-
-        # 有序列表
-        ordered_match = re.match(r'^\d+\.\s+(.+)$', stripped)
-        if ordered_match:
-            flush_paragraph()
-            elements.append(('ordered_item', ordered_match.group(1).strip()))
-            continue
-
-        # 引用
-        if stripped.startswith('>'):
-            flush_paragraph()
-            content = stripped[1:].strip()
-            elements.append(('quote', content))
-            continue
-
-        # 水平线
-        if re.match(r'^[\-\*_]{3,}$', stripped):
-            flush_paragraph()
-            elements.append(('hr',))
-            continue
-
-        # 链接或图片
-        link_match = re.match(r'!?\[([^\]]+)\]\([^\)]+\)', stripped)
-        if link_match:
-            flush_paragraph()
-            elements.append(('link', link_match.group(1)))
-            continue
-
-        # 默认作为段落处理
-        current_paragraph.append(stripped)
-
-    flush_paragraph()
-    return elements
-
-
-# ============================================================
-# 文本换行
-# ============================================================
-
-def _wrap_text(text, font, max_width, draw):
-    """Wrap text to fit within max_width pixels."""
-    lines = []
-    for paragraph in text.split("\n"):
-        if not paragraph.strip():
-            lines.append("")
-            continue
-        current = ""
-        for ch in paragraph:
-            test = current + ch
-            bbox = draw.textbbox((0, 0), test, font=font)
-            if bbox[2] - bbox[0] > max_width and current:
-                lines.append(current)
-                current = ch
-            else:
-                current = test
-        if current:
-            lines.append(current)
-    return lines
-
-
-# ============================================================
-# HTML 转换
-# ============================================================
-
-def markdown_to_html(md_text):
-    """将 Markdown 转换为 HTML"""
-    if not md_text:
-        return ""
-
-    md_text = _clean_invisible_chars(md_text)
-    elements = parse_markdown(md_text)
-
-    try:
-        import html
-    except ImportError:
-        import urllib.parse as html
-
-    try:
-        import markdown
-        from markdown.extensions import codehilite
-
-        md = markdown.Markdown(extensions=['codehilite', 'fenced_code', 'tables'])
-        html_content = md.convert(md_text)
-        return _html_template(html_content)
-    except ImportError:
-        # 降级处理：使用简单的转换
-        return _simple_markdown_to_html(md_text)
-
-
-def _simple_markdown_to_html(md_text):
-    """简单的 Markdown 到 HTML 转换（无外部依赖）"""
-    lines = md_text.split('\n')
-    html_lines = []
-
-    in_codeblock = False
-
-    for line in lines:
-        stripped = line.strip()
-
-        # 代码块开始/结束
-        if stripped.startswith('```'):
-            if in_codeblock:
-                html_lines.append('</code></pre>')
-                in_codeblock = False
-            else:
-                lang = stripped[3:].strip() or ''
-                lang_attr = f' class="language-{lang}"' if lang else ''
-                html_lines.append(f'<pre><code lang="{lang}">')
-                in_codeblock = True
-            continue
-
-        if in_codeblock:
-            html_lines.append(html.escape(line))
-            continue
-
-        # 标题
-        header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
-        if header_match:
-            level = len(header_match.group(1))
-            content = header_match.group(2)
-            html_lines.append(f'<h{level}>{_decode_html_entities(content)}</h{level}>')
-            continue
-
-        # 水平线
-        if re.match(r'^[\-\*_]{3,}$', stripped):
-            html_lines.append('<hr>')
-            continue
-
-        # 引用
-        if stripped.startswith('>'):
-            content = stripped[1:].strip()
-            html_lines.append(f'<blockquote>{_decode_html_entities(content)}</blockquote>')
-            continue
-
-        # 列表项
-        list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped)
-        if list_match:
-            content = list_match.group(1)
-            html_lines.append(f'<li>{_decode_html_entities(content)}</li>')
-            continue
-
-        # 段落
-        if stripped:
-            # 处理粗体和斜体
-            text = _decode_html_entities(stripped)
-            text = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', text)
-            text = re.sub(r'\*(.+?)\*', r'<em>\1</em>', text)
-            text = re.sub(r'__(.+?)__', r'<strong>\1</strong>', text)
-            text = re.sub(r'_(.+?)_', r'<em>\1</em>', text)
-            text = re.sub(r'`(.+?)`', r'<code>\1</code>', text)
-            html_lines.append(f'<p>{text}</p>')
-
-    return _html_template('\n'.join(html_lines))
-
-
-def _html_template(content):
-    """生成完整的 HTML 文档"""
-    return f"""<!DOCTYPE html>
-<html lang="zh-CN">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Markdown Document</title>
-    <style>
-        body {{
-            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
-            line-height: 1.6;
-            max-width: 800px;
-            margin: 0 auto;
-            padding: 20px;
-            color: #333;
-        }}
-        h1, h2, h3, h4, h5, h6 {{
-            margin-top: 1.5em;
-            margin-bottom: 0.5em;
-            font-weight: 600;
-        }}
-        h1 {{ font-size: 2em; border-bottom: 2px solid #333; }}
-        h2 {{ font-size: 1.5em; border-bottom: 1px solid #ddd; }}
-        code {{
-            background: #f4f4f4;
-            padding: 2px 6px;
-            border-radius: 3px;
-            font-family: 'Consolas', 'Monaco', monospace;
-        }}
-        pre {{
-            background: #f4f4f4;
-            padding: 16px;
-            border-radius: 6px;
-            overflow-x: auto;
-        }}
-        pre code {{
-            background: none;
-            padding: 0;
-        }}
-        blockquote {{
-            border-left: 4px solid #ddd;
-            margin: 0;
-            padding-left: 16px;
-            color: #666;
-        }}
-        a {{
-            color: #0066cc;
-        }}
-        hr {{
-            border: none;
-            border-top: 1px solid #ddd;
-            margin: 24px 0;
-        }}
-    </style>
-</head>
-<body>
-{content}
-</body>
-</html>"""
-
-
-# ============================================================
-# PNG 渲染 (使用 matplotlib)
-# ============================================================
-
-def _get_matplotlib_font():
-    """获取支持中文的 matplotlib 字体（通过字体文件路径）"""
-    import matplotlib
-    import matplotlib.font_manager as fm
-    import os
-    import sys
-    
-    # Linux 中文字体路径
-    linux_font_paths = [
-        '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
-        '/usr/share/fonts/opentype/noto/NotoSansSC-Regular.otf',
-        '/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc',
-        '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
-        '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
-        '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf',
-        '/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc',
-        '/usr/share/fonts/truetype/arphic/uming.ttc',
-        '/usr/share/fonts/truetype/arphic/ukai.ttc',
-    ]
-    
-    # Windows 中文字体路径
-    if sys.platform == "win32":
-        windir = os.environ.get("WINDIR", r"C:\Windows")
-        windows_font_paths = [
-            os.path.join(windir, "Fonts", "msyh.ttc"),
-            os.path.join(windir, "Fonts", "msyhbd.ttc"),
-            os.path.join(windir, "Fonts", "simhei.ttf"),
-            os.path.join(windir, "Fonts", "simsun.ttc"),
-            os.path.join(windir, "Fonts", "STHeiti Light.ttc"),
-        ]
-        linux_font_paths.extend(windows_font_paths)
-    
-    # macOS 中文字体路径
-    elif sys.platform == "darwin":
-        mac_font_paths = [
-            '/System/Library/Fonts/PingFang.ttc',
-            '/System/Library/Fonts/STHeiti Light.ttc',
-            '/Library/Fonts/Arial Unicode.ttf',
-            '/System/Library/Fonts/Supplemental/Arial Unicode.ttf',
-        ]
-        linux_font_paths.extend(mac_font_paths)
-    
-    # 查找存在的字体文件
-    for font_path in linux_font_paths:
-        if os.path.exists(font_path):
-            # 清除字体缓存并加载指定字体
-            fm.fontManager.addfont(font_path)
-            font = fm.FontProperties(fname=font_path)
-            # 验证字体可以显示中文
-            return font
-    
-    # 如果没找到，返回 None 让 matplotlib 使用默认
-    return None
+from markdown2image import Markdown2Image


 def markdown_to_png(md_text, img_path):
-    """将 Markdown 渲染为 PNG 图片（使用 matplotlib）"""
-    try:
-        import matplotlib.pyplot as plt
-        import matplotlib.patches as patches
-        import matplotlib.font_manager as fm
-        import matplotlib
-    except ImportError:
-        raise ImportError("matplotlib not installed. Run: pip install matplotlib")
-
-    # 设置非交互式后端
-    matplotlib.use('Agg')
-    
-    # 获取中文字体
-    font = _get_matplotlib_font()
-
-    elements = parse_markdown(md_text)
-    
-    W, PAD = 10, 0.5  # 英寸, 边距
-    FIG_H = 2.0  # 初始高度
-    LINE_H = 0.35  # 每行高度
-    CODE_H = 0.5  # 代码块初始高度
-    
-    # 计算所需高度
-    y = 2.5  # 顶部空间
-    for elem in elements:
-        if elem[0] == 'header':
-            level = elem[1]
-            text = elem[2]
-            chars_per_line = 50 if level <= 2 else 60
-            lines = max(1, len(text) // chars_per_line + 1)
-            y += lines * (0.5 if level <= 2 else 0.4) + 0.2
-        elif elem[0] == 'paragraph':
-            chars_per_line = 60
-            lines = max(1, len(elem[1]) // chars_per_line + 1)
-            y += lines * 0.35 + 0.3
-        elif elem[0] == 'codeblock':
-            lines = elem[1].count('\n') + 2
-            y += lines * 0.3 + 0.2
-        elif elem[0] in ('list_item', 'quote'):
-            chars_per_line = 55
-            lines = max(1, len(elem[1]) // chars_per_line + 1)
-            y += lines * 0.32 + 0.15
-
-    FIG_H = max(8, y)
-    
-    fig, ax = plt.subplots(figsize=(W, FIG_H))
-    fig.patch.set_facecolor('#ffffff')
-    ax.set_facecolor('#ffffff')
-    
-    # 标题栏背景
-    header = patches.Rectangle((0, FIG_H - 1.2), W, 1.0, linewidth=0, facecolor='#2d3748')
-    ax.add_patch(header)
-    
-    # 标题
-    ax.text(0.5, FIG_H - 0.6, 'Markdown Document', 
-            fontsize=18, fontweight='bold', color='white',
-            fontproperties=font, ha='left', va='center')
-    
-    ax.text(0.5, FIG_H - 1.0, 'Converted from Markdown',
-            fontsize=10, color='#888888', fontproperties=font, ha='left', va='center')
-    
-    ax.set_xlim(0, W)
-    ax.set_ylim(0, FIG_H)
-    ax.axis('off')
-    
-    cy = FIG_H - 1.5
-    
-    for elem in elements:
-        if elem[0] == 'header':
-            level = elem[1]
-            text = elem[2]
-            size = 16 if level <= 2 else 14
-            weight = 'bold' if level == 1 else 'normal'
-            color = '#1a1a2e' if level == 1 else '#2d3748'
-            ax.text(PAD, cy, text, fontsize=size, fontweight=weight, 
-                    color=color, fontproperties=font, ha='left', va='top')
-            cy -= size * 0.04 + 0.15
-
-        elif elem[0] == 'paragraph':
-            ax.text(PAD, cy, elem[1], fontsize=11, color='#374151',
-                    fontproperties=font, ha='left', va='top', wrap=True)
-            lines = max(1, len(elem[1]) // 60 + 1)
-            cy -= lines * 0.35 + 0.25
-
-        elif elem[0] == 'codeblock':
-            code_h = max(0.5, (elem[1].count('\n') + 2) * 0.3)
-            code_box = patches.Rectangle((PAD, cy - code_h), W - PAD * 2, code_h,
-                                         linewidth=1, edgecolor='#e0e0e0', facecolor='#f4f4f4')
-            ax.add_patch(code_box)
-            ax.text(PAD + 0.1, cy - 0.15, elem[1][:500], fontsize=9, 
-                    color='#333333', fontfamily='monospace', va='top')
-            cy -= code_h + 0.2
-
-        elif elem[0] == 'list_item':
-            ax.text(PAD, cy, f'\u2022 {elem[1]}', fontsize=11, color='#374151',
-                    fontproperties=font, ha='left', va='top')
-            cy -= 0.35
-
-        elif elem[0] == 'quote':
-            ax.plot([PAD, PAD, PAD + 0.05, PAD + 0.05], 
-                    [cy, cy - 0.4, cy - 0.4, cy - 0.6], 
-                    color='#0066cc', linewidth=2)
-            ax.text(PAD + 0.15, cy - 0.1, elem[1], fontsize=11, color='#666666',
-                    fontproperties=font, ha='left', va='top')
-            cy -= 0.5
-
-        elif elem[0] == 'hr':
-            ax.axhline(y=cy, color='#e0e0e0', linewidth=1, xmin=0.05, xmax=0.95)
-            cy -= 0.3
-
-    plt.tight_layout(pad=0)
-    
+    """将 Markdown 渲染为 PNG 图片"""
    Path(img_path).parent.mkdir(parents=True, exist_ok=True)
-    plt.savefig(img_path, format='png', dpi=150, bbox_inches='tight',
-                facecolor='#ffffff', edgecolor='none')
-    plt.close()
    
+    m2i = Markdown2Image()
+    m2i.b64_decode_and_dump(markdown_text=md_text, output_path=img_path)

-# ============================================================
-# 纯文本转换
-# ============================================================
-
-def markdown_to_text(md_text):
-    """将 Markdown 转换为纯文本"""
-    if not md_text:
-        return ""
-
-    md_text = _clean_invisible_chars(md_text)
-    elements = parse_markdown(md_text)
-
-    lines = []
-
-    for elem in elements:
-        if elem[0] == 'header':
-            level = elem[1]
-            prefix = "#" * level + " "
-            lines.append(f"{prefix}{elem[2]}")
-            lines.append("")
-
-        elif elem[0] == 'paragraph':
-            lines.append(elem[1])
-            lines.append("")
-
-        elif elem[0] == 'codeblock':
-            lines.append("```")
-            lines.append(elem[1])
-            lines.append("```")
-            lines.append("")
-
-        elif elem[0] == 'list_item':
-            lines.append(f"• {elem[1]}")
-
-        elif elem[0] == 'ordered_item':
-            lines.append(f"  {elem[1]}")
-
-        elif elem[0] == 'quote':
-            lines.append(f"> {elem[1]}")
-
-        elif elem[0] == 'hr':
-            lines.append("─" * 50)
-            lines.append("")
-
-    return '\n'.join(lines)
-
-
-# ============================================================
-# 主函数
-# ============================================================

 def main():
    if len(sys.argv) < 3:
-        print("Usage: python md_convert.py <input.md> <output.{html|png|txt}>", file=sys.stderr)
+        print("Usage: python md_convert.py <input.md> <output.png>", file=sys.stderr)
        sys.exit(1)

    input_path = sys.argv[1]
@@ -586,22 +32,7 @@ def main():
        sys.exit(1)

    md_text = Path(input_path).read_text(encoding="utf-8")
-    ext = Path(output_path).suffix.lower()
-
-    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-
-    if ext == ".html":
-        html_content = markdown_to_html(md_text)
-        Path(output_path).write_text(html_content, encoding="utf-8")
-    elif ext == ".png":
-        markdown_to_png(md_text, output_path)
-    elif ext == ".txt":
-        text_content = markdown_to_text(md_text)
-        Path(output_path).write_text(text_content, encoding="utf-8")
-    else:
-        print(f"Error: Unsupported output format: {ext}", file=sys.stderr)
-        print("Supported formats: .html, .png, .txt", file=sys.stderr)
-        sys.exit(1)
+    markdown_to_png(md_text, output_path)

    print(f"Converted: {input_path} -> {output_path}")