diff --git a/markdown-converter/SKILL.md b/markdown-converter/SKILL.md index f178488..47bfa41 100644 --- a/markdown-converter/SKILL.md +++ b/markdown-converter/SKILL.md @@ -1,84 +1,45 @@ --- name: markdown-converter -description: A versatile Markdown conversion tool that supports converting Markdown to HTML, PNG images, and plain text formats. -metadata: {"clawdbot":{"emoji":"📝","os":["linux","darwin","win32"]}} +description: A simple tool to convert Markdown to PNG images using browser engine. +metadata: {"clawdbot":{"emoji":"🖼️","os":["linux","darwin","win32"]}} --- -# Markdown Converter +# Markdown to PNG Converter -A versatile Markdown conversion tool that supports converting Markdown to HTML, PNG images, and plain text formats. +A simple tool to convert Markdown documents to PNG images using Chromium browser engine. ## Features -- **HTML Conversion**: High-quality Markdown to HTML conversion -- **PNG Output**: Render Markdown as PNG images -- **CJK Support**: Uses system CJK fonts for Chinese character rendering -- **Code Highlighting**: Syntax highlighting for code blocks -- **Clean Output**: Removes invisible Unicode characters +- **Browser-based rendering**: High quality output using real browser engine +- **Full CSS support**: Supports complex Markdown with tables, code blocks, etc. 
+- **Code highlighting**: Syntax highlighting for code blocks +- **CJK support**: Uses system fonts for Chinese character rendering ## Installation ```bash -pip install matplotlib html2text pygments markdown +pip install markdown2image playwright && playwright install chromium ``` ## Dependencies -- **matplotlib**: PNG image rendering with excellent CJK support -- **html2text**: HTML and Markdown conversion -- **Pygments**: Code syntax highlighting -- **markdown**: Python Markdown processor +- **markdown2image**: Markdown to image conversion +- **playwright**: Browser automation (Chromium) ## Usage ```bash -# Convert to HTML -python scripts/md_convert.py input.md output.html - -# Convert to PNG +# Convert Markdown to PNG python scripts/md_convert.py input.md output.png - -# Convert to plain text -python scripts/md_convert.py input.md output.txt ``` -## Output Formats - -### HTML -- Complete HTML document structure -- Inline CSS styling -- Code syntax highlighting -- Responsive design - -### PNG Card -- White card background -- Large title font -- Automatic text wrapping -- CJK character support - -### Plain Text -- Plain text output -- Preserves basic formatting -- Removes invisible characters - ## Supported Platforms -- **Windows**: Uses system CJK fonts -- **macOS**: Uses PingFang and other system fonts -- **Linux**: Uses NotoSansCJK and other fonts - -## Workflow - -``` -Markdown Input - ↓ -[html2text / markdown library] - ↓ -HTML / PNG / Plain Text -``` +- **Windows**: Uses Chromium browser +- **macOS**: Uses Chromium browser +- **Linux**: Uses Chromium browser ## Notes -- PNG rendering requires Chinese fonts to be installed on the system -- Code highlighting requires Pygments support -- Large files may require longer processing time \ No newline at end of file +- Requires Chromium browser (installed via `playwright install chromium`) +- Chinese fonts are supported via system fonts diff --git a/markdown-converter/scripts/md_convert.py 
b/markdown-converter/scripts/md_convert.py index b42bfe7..b7a9a17 100644 --- a/markdown-converter/scripts/md_convert.py +++ b/markdown-converter/scripts/md_convert.py @@ -1,581 +1,27 @@ -"""Markdown 转换器 - 支持 HTML / PNG / Plain Text 输出 +"""Markdown to PNG converter Dependencies: - - HTML/PNG 输出: pip install Pillow html2text pygments markdown + - pip install markdown2image playwright && playwright install chromium """ -import html -import re -import sys import os +import sys from pathlib import Path - -# ============================================================ -# 字体查找 -# ============================================================ - -def _find_font(): - """Find a suitable TrueType font across platforms.""" - candidates = [] - if sys.platform == "win32": - pf = os.environ.get("WINDIR", r"C:\Windows") - candidates = [ - os.path.join(pf, "Fonts", "msyh.ttc"), - os.path.join(pf, "Fonts", "msyhbd.ttc"), - os.path.join(pf, "Fonts", "simhei.ttf"), - os.path.join(pf, "Fonts", "simsun.ttc"), - ] - elif sys.platform == "darwin": - candidates = [ - "/System/Library/Fonts/PingFang.ttc", - "/System/Library/Fonts/STHeiti Light.ttc", - "/Library/Fonts/Arial Unicode.ttf", - ] - else: - candidates = [ - "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc", - "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc", - "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", - "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc", - ] - for p in candidates: - if os.path.exists(p): - return p - return None - - -# ============================================================ -# 工具函数 -# ============================================================ - -def _clean_invisible_chars(text): - """清理不可见的 Unicode 字符""" - if not text: - return "" - text = re.sub(r'[\u200b-\u200f\u2028-\u202f\ufeff\u00ad]', '', text) - text = re.sub(r'[\xa0\u3000]', ' ', text) - text = re.sub(r'[\uff00-\uffef]', '', text) - return text - - -def _decode_html_entities(text): - """解码 HTML 实体""" - entities = { - '<': '<', - 
'>': '>', - '&': '&', - '"': '"', - ''': "'", - ''': "'", - } - for k, v in entities.items(): - text = text.replace(k, v) - # 处理特殊引号 - text = text.replace('"', '"').replace('"', '"') - text = text.replace(''', "'").replace(''', "'") - return text - - -# ============================================================ -# Markdown 解析 -# ============================================================ - -def parse_markdown(md_text): - """解析 Markdown 文本,提取标题、代码块、段落等元素""" - if not md_text: - return [] - - md_text = _clean_invisible_chars(md_text) - lines = md_text.split('\n') - elements = [] - current_paragraph = [] - - def flush_paragraph(): - nonlocal current_paragraph - if current_paragraph: - text = ' '.join(current_paragraph) - if text.strip(): - elements.append(('paragraph', text.strip())) - current_paragraph = [] - - for line in lines: - stripped = line.strip() - - # 跳过空行 - if not stripped: - flush_paragraph() - continue - - # 标题 - header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped) - if header_match: - flush_paragraph() - level = len(header_match.group(1)) - elements.append(('header', level, header_match.group(2).strip())) - continue - - # 代码块 - if stripped.startswith('```'): - flush_paragraph() - elements.append(('codeblock', stripped[3:].strip())) - continue - - # 无序列表 - list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped) - if list_match: - flush_paragraph() - elements.append(('list_item', list_match.group(1).strip())) - continue - - # 有序列表 - ordered_match = re.match(r'^\d+\.\s+(.+)$', stripped) - if ordered_match: - flush_paragraph() - elements.append(('ordered_item', ordered_match.group(1).strip())) - continue - - # 引用 - if stripped.startswith('>'): - flush_paragraph() - content = stripped[1:].strip() - elements.append(('quote', content)) - continue - - # 水平线 - if re.match(r'^[\-\*_]{3,}$', stripped): - flush_paragraph() - elements.append(('hr',)) - continue - - # 链接或图片 - link_match = re.match(r'!?\[([^\]]+)\]\([^\)]+\)', stripped) - if link_match: - 
flush_paragraph() - elements.append(('link', link_match.group(1))) - continue - - # 默认作为段落处理 - current_paragraph.append(stripped) - - flush_paragraph() - return elements - - -# ============================================================ -# 文本换行 -# ============================================================ - -def _wrap_text(text, font, max_width, draw): - """Wrap text to fit within max_width pixels.""" - lines = [] - for paragraph in text.split("\n"): - if not paragraph.strip(): - lines.append("") - continue - current = "" - for ch in paragraph: - test = current + ch - bbox = draw.textbbox((0, 0), test, font=font) - if bbox[2] - bbox[0] > max_width and current: - lines.append(current) - current = ch - else: - current = test - if current: - lines.append(current) - return lines - - -# ============================================================ -# HTML 转换 -# ============================================================ - -def markdown_to_html(md_text): - """将 Markdown 转换为 HTML""" - if not md_text: - return "" - - md_text = _clean_invisible_chars(md_text) - elements = parse_markdown(md_text) - - try: - import html - except ImportError: - import urllib.parse as html - - try: - import markdown - from markdown.extensions import codehilite - - md = markdown.Markdown(extensions=['codehilite', 'fenced_code', 'tables']) - html_content = md.convert(md_text) - return _html_template(html_content) - except ImportError: - # 降级处理:使用简单的转换 - return _simple_markdown_to_html(md_text) - - -def _simple_markdown_to_html(md_text): - """简单的 Markdown 到 HTML 转换(无外部依赖)""" - lines = md_text.split('\n') - html_lines = [] - - in_codeblock = False - - for line in lines: - stripped = line.strip() - - # 代码块开始/结束 - if stripped.startswith('```'): - if in_codeblock: - html_lines.append('') - in_codeblock = False - else: - lang = stripped[3:].strip() or '' - lang_attr = f' class="language-{lang}"' if lang else '' - html_lines.append(f'
')
-                in_codeblock = True
-            continue
-
-        if in_codeblock:
-            html_lines.append(html.escape(line))
-            continue
-
-        # 标题
-        header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
-        if header_match:
-            level = len(header_match.group(1))
-            content = header_match.group(2)
-            html_lines.append(f'<h{level}>{_decode_html_entities(content)}</h{level}>')
-            continue
-
-        # 水平线
-        if re.match(r'^[\-\*_]{3,}$', stripped):
-            html_lines.append('
') - continue - - # 引用 - if stripped.startswith('>'): - content = stripped[1:].strip() - html_lines.append(f'
{_decode_html_entities(content)}
') - continue - - # 列表项 - list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped) - if list_match: - content = list_match.group(1) - html_lines.append(f'
  • {_decode_html_entities(content)}
  • ') - continue - - # 段落 - if stripped: - # 处理粗体和斜体 - text = _decode_html_entities(stripped) - text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) - text = re.sub(r'\*(.+?)\*', r'\1', text) - text = re.sub(r'__(.+?)__', r'\1', text) - text = re.sub(r'_(.+?)_', r'\1', text) - text = re.sub(r'`(.+?)`', r'\1', text) - html_lines.append(f'

    {text}

    ') - - return _html_template('\n'.join(html_lines)) - - -def _html_template(content): - """生成完整的 HTML 文档""" - return f""" - - - - - Markdown Document - - - -{content} - -""" - - -# ============================================================ -# PNG 渲染 (使用 matplotlib) -# ============================================================ - -def _get_matplotlib_font(): - """获取支持中文的 matplotlib 字体(通过字体文件路径)""" - import matplotlib - import matplotlib.font_manager as fm - import os - import sys - - # Linux 中文字体路径 - linux_font_paths = [ - '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc', - '/usr/share/fonts/opentype/noto/NotoSansSC-Regular.otf', - '/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc', - '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc', - '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc', - '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf', - '/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc', - '/usr/share/fonts/truetype/arphic/uming.ttc', - '/usr/share/fonts/truetype/arphic/ukai.ttc', - ] - - # Windows 中文字体路径 - if sys.platform == "win32": - windir = os.environ.get("WINDIR", r"C:\Windows") - windows_font_paths = [ - os.path.join(windir, "Fonts", "msyh.ttc"), - os.path.join(windir, "Fonts", "msyhbd.ttc"), - os.path.join(windir, "Fonts", "simhei.ttf"), - os.path.join(windir, "Fonts", "simsun.ttc"), - os.path.join(windir, "Fonts", "STHeiti Light.ttc"), - ] - linux_font_paths.extend(windows_font_paths) - - # macOS 中文字体路径 - elif sys.platform == "darwin": - mac_font_paths = [ - '/System/Library/Fonts/PingFang.ttc', - '/System/Library/Fonts/STHeiti Light.ttc', - '/Library/Fonts/Arial Unicode.ttf', - '/System/Library/Fonts/Supplemental/Arial Unicode.ttf', - ] - linux_font_paths.extend(mac_font_paths) - - # 查找存在的字体文件 - for font_path in linux_font_paths: - if os.path.exists(font_path): - # 清除字体缓存并加载指定字体 - fm.fontManager.addfont(font_path) - font = fm.FontProperties(fname=font_path) - # 验证字体可以显示中文 - return font - - # 如果没找到,返回 None 让 
matplotlib 使用默认 - return None +from markdown2image import Markdown2Image def markdown_to_png(md_text, img_path): - """将 Markdown 渲染为 PNG 图片(使用 matplotlib)""" - try: - import matplotlib.pyplot as plt - import matplotlib.patches as patches - import matplotlib.font_manager as fm - import matplotlib - except ImportError: - raise ImportError("matplotlib not installed. Run: pip install matplotlib") - - # 设置非交互式后端 - matplotlib.use('Agg') - - # 获取中文字体 - font = _get_matplotlib_font() - - elements = parse_markdown(md_text) - - W, PAD = 10, 0.5 # 英寸, 边距 - FIG_H = 2.0 # 初始高度 - LINE_H = 0.35 # 每行高度 - CODE_H = 0.5 # 代码块初始高度 - - # 计算所需高度 - y = 2.5 # 顶部空间 - for elem in elements: - if elem[0] == 'header': - level = elem[1] - text = elem[2] - chars_per_line = 50 if level <= 2 else 60 - lines = max(1, len(text) // chars_per_line + 1) - y += lines * (0.5 if level <= 2 else 0.4) + 0.2 - elif elem[0] == 'paragraph': - chars_per_line = 60 - lines = max(1, len(elem[1]) // chars_per_line + 1) - y += lines * 0.35 + 0.3 - elif elem[0] == 'codeblock': - lines = elem[1].count('\n') + 2 - y += lines * 0.3 + 0.2 - elif elem[0] in ('list_item', 'quote'): - chars_per_line = 55 - lines = max(1, len(elem[1]) // chars_per_line + 1) - y += lines * 0.32 + 0.15 - - FIG_H = max(8, y) - - fig, ax = plt.subplots(figsize=(W, FIG_H)) - fig.patch.set_facecolor('#ffffff') - ax.set_facecolor('#ffffff') - - # 标题栏背景 - header = patches.Rectangle((0, FIG_H - 1.2), W, 1.0, linewidth=0, facecolor='#2d3748') - ax.add_patch(header) - - # 标题 - ax.text(0.5, FIG_H - 0.6, 'Markdown Document', - fontsize=18, fontweight='bold', color='white', - fontproperties=font, ha='left', va='center') - - ax.text(0.5, FIG_H - 1.0, 'Converted from Markdown', - fontsize=10, color='#888888', fontproperties=font, ha='left', va='center') - - ax.set_xlim(0, W) - ax.set_ylim(0, FIG_H) - ax.axis('off') - - cy = FIG_H - 1.5 - - for elem in elements: - if elem[0] == 'header': - level = elem[1] - text = elem[2] - size = 16 if level <= 2 else 14 - 
weight = 'bold' if level == 1 else 'normal' - color = '#1a1a2e' if level == 1 else '#2d3748' - ax.text(PAD, cy, text, fontsize=size, fontweight=weight, - color=color, fontproperties=font, ha='left', va='top') - cy -= size * 0.04 + 0.15 - - elif elem[0] == 'paragraph': - ax.text(PAD, cy, elem[1], fontsize=11, color='#374151', - fontproperties=font, ha='left', va='top', wrap=True) - lines = max(1, len(elem[1]) // 60 + 1) - cy -= lines * 0.35 + 0.25 - - elif elem[0] == 'codeblock': - code_h = max(0.5, (elem[1].count('\n') + 2) * 0.3) - code_box = patches.Rectangle((PAD, cy - code_h), W - PAD * 2, code_h, - linewidth=1, edgecolor='#e0e0e0', facecolor='#f4f4f4') - ax.add_patch(code_box) - ax.text(PAD + 0.1, cy - 0.15, elem[1][:500], fontsize=9, - color='#333333', fontfamily='monospace', va='top') - cy -= code_h + 0.2 - - elif elem[0] == 'list_item': - ax.text(PAD, cy, f'\u2022 {elem[1]}', fontsize=11, color='#374151', - fontproperties=font, ha='left', va='top') - cy -= 0.35 - - elif elem[0] == 'quote': - ax.plot([PAD, PAD, PAD + 0.05, PAD + 0.05], - [cy, cy - 0.4, cy - 0.4, cy - 0.6], - color='#0066cc', linewidth=2) - ax.text(PAD + 0.15, cy - 0.1, elem[1], fontsize=11, color='#666666', - fontproperties=font, ha='left', va='top') - cy -= 0.5 - - elif elem[0] == 'hr': - ax.axhline(y=cy, color='#e0e0e0', linewidth=1, xmin=0.05, xmax=0.95) - cy -= 0.3 - - plt.tight_layout(pad=0) - + """将 Markdown 渲染为 PNG 图片""" Path(img_path).parent.mkdir(parents=True, exist_ok=True) - plt.savefig(img_path, format='png', dpi=150, bbox_inches='tight', - facecolor='#ffffff', edgecolor='none') - plt.close() + + m2i = Markdown2Image() + m2i.b64_decode_and_dump(markdown_text=md_text, output_path=img_path) -# ============================================================ -# 纯文本转换 -# ============================================================ - -def markdown_to_text(md_text): - """将 Markdown 转换为纯文本""" - if not md_text: - return "" - - md_text = _clean_invisible_chars(md_text) - elements = 
parse_markdown(md_text) - - lines = [] - - for elem in elements: - if elem[0] == 'header': - level = elem[1] - prefix = "#" * level + " " - lines.append(f"{prefix}{elem[2]}") - lines.append("") - - elif elem[0] == 'paragraph': - lines.append(elem[1]) - lines.append("") - - elif elem[0] == 'codeblock': - lines.append("```") - lines.append(elem[1]) - lines.append("```") - lines.append("") - - elif elem[0] == 'list_item': - lines.append(f"• {elem[1]}") - - elif elem[0] == 'ordered_item': - lines.append(f" {elem[1]}") - - elif elem[0] == 'quote': - lines.append(f"> {elem[1]}") - - elif elem[0] == 'hr': - lines.append("─" * 50) - lines.append("") - - return '\n'.join(lines) - - -# ============================================================ -# 主函数 -# ============================================================ - def main(): if len(sys.argv) < 3: - print("Usage: python md_convert.py ", file=sys.stderr) + print("Usage: python md_convert.py ", file=sys.stderr) sys.exit(1) input_path = sys.argv[1] @@ -586,25 +32,10 @@ def main(): sys.exit(1) md_text = Path(input_path).read_text(encoding="utf-8") - ext = Path(output_path).suffix.lower() - - Path(output_path).parent.mkdir(parents=True, exist_ok=True) - - if ext == ".html": - html_content = markdown_to_html(md_text) - Path(output_path).write_text(html_content, encoding="utf-8") - elif ext == ".png": - markdown_to_png(md_text, output_path) - elif ext == ".txt": - text_content = markdown_to_text(md_text) - Path(output_path).write_text(text_content, encoding="utf-8") - else: - print(f"Error: Unsupported output format: {ext}", file=sys.stderr) - print("Supported formats: .html, .png, .txt", file=sys.stderr) - sys.exit(1) + markdown_to_png(md_text, output_path) print(f"Converted: {input_path} -> {output_path}") if __name__ == "__main__": - main() \ No newline at end of file + main()