From 6fc0be476ff9675e71250db749bfe0c1ada1c429 Mon Sep 17 00:00:00 2001 From: ViperEkura <3081035982@qq.com> Date: Sat, 18 Apr 2026 20:31:57 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20=E4=BF=AE=E6=94=B9=E8=BD=AC?= =?UTF-8?q?=E5=8C=96=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- markdown-converter/SKILL.md | 73 +-- markdown-converter/scripts/md_convert.py | 591 +---------------------- 2 files changed, 28 insertions(+), 636 deletions(-) diff --git a/markdown-converter/SKILL.md b/markdown-converter/SKILL.md index f178488..47bfa41 100644 --- a/markdown-converter/SKILL.md +++ b/markdown-converter/SKILL.md @@ -1,84 +1,45 @@ --- name: markdown-converter -description: A versatile Markdown conversion tool that supports converting Markdown to HTML, PNG images, and plain text formats. -metadata: {"clawdbot":{"emoji":"📝","os":["linux","darwin","win32"]}} +description: A simple tool to convert Markdown to PNG images using browser engine. +metadata: {"clawdbot":{"emoji":"🖼️","os":["linux","darwin","win32"]}} --- -# Markdown Converter +# Markdown to PNG Converter -A versatile Markdown conversion tool that supports converting Markdown to HTML, PNG images, and plain text formats. +A simple tool to convert Markdown documents to PNG images using Chromium browser engine. ## Features -- **HTML Conversion**: High-quality Markdown to HTML conversion -- **PNG Output**: Render Markdown as PNG images -- **CJK Support**: Uses system CJK fonts for Chinese character rendering -- **Code Highlighting**: Syntax highlighting for code blocks -- **Clean Output**: Removes invisible Unicode characters +- **Browser-based rendering**: High quality output using real browser engine +- **Full CSS support**: Supports complex Markdown with tables, code blocks, etc. +- **Code highlighting**: Syntax highlighting for code blocks +- **CJK support**: Uses system fonts for Chinese character rendering ## Installation ```bash -pip install matplotlib html2text pygments markdown +pip install markdown2image playwright && playwright install chromium ``` ## Dependencies -- **matplotlib**: PNG image rendering with excellent CJK support -- **html2text**: HTML and Markdown conversion -- **Pygments**: Code syntax highlighting -- **markdown**: Python Markdown processor +- **markdown2image**: Markdown to image conversion +- **playwright**: Browser automation (Chromium) ## Usage ```bash -# Convert to HTML -python scripts/md_convert.py input.md output.html - -# Convert to PNG +# Convert Markdown to PNG python scripts/md_convert.py input.md output.png - -# Convert to plain text -python scripts/md_convert.py input.md output.txt ``` -## Output Formats - -### HTML -- Complete HTML document structure -- Inline CSS styling -- Code syntax highlighting -- Responsive design - -### PNG Card -- White card background -- Large title font -- Automatic text wrapping -- CJK character support - -### Plain Text -- Plain text output -- Preserves basic formatting -- Removes invisible characters - ## Supported Platforms -- **Windows**: Uses system CJK fonts -- **macOS**: Uses PingFang and other system fonts -- **Linux**: Uses NotoSansCJK and other fonts - -## Workflow - -``` -Markdown Input - ↓ -[html2text / markdown library] - ↓ -HTML / PNG / Plain Text -``` +- **Windows**: Uses Chromium browser +- **macOS**: Uses Chromium browser +- **Linux**: Uses Chromium browser ## Notes -- PNG rendering requires Chinese fonts to be installed on the system -- Code highlighting requires Pygments support -- Large files may require longer processing time \ No newline at end of file +- Requires Chromium browser (installed via `playwright install chromium`) +- Chinese fonts are supported via system fonts diff --git a/markdown-converter/scripts/md_convert.py b/markdown-converter/scripts/md_convert.py index b42bfe7..b7a9a17 100644 --- a/markdown-converter/scripts/md_convert.py +++ b/markdown-converter/scripts/md_convert.py @@ -1,581 +1,27 @@ -"""Markdown 转换器 - 支持 HTML / PNG / Plain Text 输出 +"""Markdown to PNG converter Dependencies: - - HTML/PNG 输出: pip install Pillow html2text pygments markdown + - pip install markdown2image playwright && playwright install chromium """ -import html -import re -import sys import os +import sys from pathlib import Path - -# ============================================================ -# 字体查找 -# ============================================================ - -def _find_font(): - """Find a suitable TrueType font across platforms.""" - candidates = [] - if sys.platform == "win32": - pf = os.environ.get("WINDIR", r"C:\Windows") - candidates = [ - os.path.join(pf, "Fonts", "msyh.ttc"), - os.path.join(pf, "Fonts", "msyhbd.ttc"), - os.path.join(pf, "Fonts", "simhei.ttf"), - os.path.join(pf, "Fonts", "simsun.ttc"), - ] - elif sys.platform == "darwin": - candidates = [ - "/System/Library/Fonts/PingFang.ttc", - "/System/Library/Fonts/STHeiti Light.ttc", - "/Library/Fonts/Arial Unicode.ttf", - ] - else: - candidates = [ - "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc", - "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc", - "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", - "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc", - ] - for p in candidates: - if os.path.exists(p): - return p - return None - - -# ============================================================ -# 工具函数 -# ============================================================ - -def _clean_invisible_chars(text): - """清理不可见的 Unicode 字符""" - if not text: - return "" - text = re.sub(r'[\u200b-\u200f\u2028-\u202f\ufeff\u00ad]', '', text) - text = re.sub(r'[\xa0\u3000]', ' ', text) - text = re.sub(r'[\uff00-\uffef]', '', text) - return text - - -def _decode_html_entities(text): - """解码 HTML 实体""" - entities = { - '<': '<', - '>': '>', - '&': '&', - '"': '"', - ''': "'", - ''': "'", - } - for k, v in entities.items(): - text = text.replace(k, v) - # 处理特殊引号 - text = text.replace('"', '"').replace('"', '"') - text = text.replace(''', "'").replace(''', "'") - return text - - -# ============================================================ -# Markdown 解析 -# ============================================================ - -def parse_markdown(md_text): - """解析 Markdown 文本,提取标题、代码块、段落等元素""" - if not md_text: - return [] - - md_text = _clean_invisible_chars(md_text) - lines = md_text.split('\n') - elements = [] - current_paragraph = [] - - def flush_paragraph(): - nonlocal current_paragraph - if current_paragraph: - text = ' '.join(current_paragraph) - if text.strip(): - elements.append(('paragraph', text.strip())) - current_paragraph = [] - - for line in lines: - stripped = line.strip() - - # 跳过空行 - if not stripped: - flush_paragraph() - continue - - # 标题 - header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped) - if header_match: - flush_paragraph() - level = len(header_match.group(1)) - elements.append(('header', level, header_match.group(2).strip())) - continue - - # 代码块 - if stripped.startswith('```'): - flush_paragraph() - elements.append(('codeblock', stripped[3:].strip())) - continue - - # 无序列表 - list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped) - if list_match: - flush_paragraph() - elements.append(('list_item', list_match.group(1).strip())) - continue - - # 有序列表 - ordered_match = re.match(r'^\d+\.\s+(.+)$', stripped) - if ordered_match: - flush_paragraph() - elements.append(('ordered_item', ordered_match.group(1).strip())) - continue - - # 引用 - if stripped.startswith('>'): - flush_paragraph() - content = stripped[1:].strip() - elements.append(('quote', content)) - continue - - # 水平线 - if re.match(r'^[\-\*_]{3,}$', stripped): - flush_paragraph() - elements.append(('hr',)) - continue - - # 链接或图片 - link_match = re.match(r'!?\[([^\]]+)\]\([^\)]+\)', stripped) - if link_match: - flush_paragraph() - elements.append(('link', link_match.group(1))) - continue - - # 默认作为段落处理 - current_paragraph.append(stripped) - - flush_paragraph() - return elements - - -# ============================================================ -# 文本换行 -# ============================================================ - -def _wrap_text(text, font, max_width, draw): - """Wrap text to fit within max_width pixels.""" - lines = [] - for paragraph in text.split("\n"): - if not paragraph.strip(): - lines.append("") - continue - current = "" - for ch in paragraph: - test = current + ch - bbox = draw.textbbox((0, 0), test, font=font) - if bbox[2] - bbox[0] > max_width and current: - lines.append(current) - current = ch - else: - current = test - if current: - lines.append(current) - return lines - - -# ============================================================ -# HTML 转换 -# ============================================================ - -def markdown_to_html(md_text): - """将 Markdown 转换为 HTML""" - if not md_text: - return "" - - md_text = _clean_invisible_chars(md_text) - elements = parse_markdown(md_text) - - try: - import html - except ImportError: - import urllib.parse as html - - try: - import markdown - from markdown.extensions import codehilite - - md = markdown.Markdown(extensions=['codehilite', 'fenced_code', 'tables']) - html_content = md.convert(md_text) - return _html_template(html_content) - except ImportError: - # 降级处理:使用简单的转换 - return _simple_markdown_to_html(md_text) - - -def _simple_markdown_to_html(md_text): - """简单的 Markdown 到 HTML 转换(无外部依赖)""" - lines = md_text.split('\n') - html_lines = [] - - in_codeblock = False - - for line in lines: - stripped = line.strip() - - # 代码块开始/结束 - if stripped.startswith('```'): - if in_codeblock: - html_lines.append('') - in_codeblock = False - else: - lang = stripped[3:].strip() or '' - lang_attr = f' class="language-{lang}"' if lang else '' - html_lines.append(f'
')
- in_codeblock = True
- continue
-
- if in_codeblock:
- html_lines.append(html.escape(line))
- continue
-
- # 标题
- header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
- if header_match:
- level = len(header_match.group(1))
- content = header_match.group(2)
- html_lines.append(f'{_decode_html_entities(content)} ')
- continue
-
- # 水平线
- if re.match(r'^[\-\*_]{3,}$', stripped):
- html_lines.append('
')
- continue
-
- # 引用
- if stripped.startswith('>'):
- content = stripped[1:].strip()
- html_lines.append(f'{_decode_html_entities(content)}
')
- continue
-
- # 列表项
- list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped)
- if list_match:
- content = list_match.group(1)
- html_lines.append(f'{_decode_html_entities(content)} ')
- continue
-
- # 段落
- if stripped:
- # 处理粗体和斜体
- text = _decode_html_entities(stripped)
- text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
- text = re.sub(r'\*(.+?)\*', r'\1', text)
- text = re.sub(r'__(.+?)__', r'\1', text)
- text = re.sub(r'_(.+?)_', r'\1', text)
- text = re.sub(r'`(.+?)`', r'\1', text)
- html_lines.append(f'{text}
')
-
- return _html_template('\n'.join(html_lines))
-
-
-def _html_template(content):
- """生成完整的 HTML 文档"""
- return f"""
-
-
-
-
- Markdown Document
-
-
-
-{content}
-
-"""
-
-
-# ============================================================
-# PNG 渲染 (使用 matplotlib)
-# ============================================================
-
-def _get_matplotlib_font():
- """获取支持中文的 matplotlib 字体(通过字体文件路径)"""
- import matplotlib
- import matplotlib.font_manager as fm
- import os
- import sys
-
- # Linux 中文字体路径
- linux_font_paths = [
- '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
- '/usr/share/fonts/opentype/noto/NotoSansSC-Regular.otf',
- '/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc',
- '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
- '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
- '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf',
- '/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc',
- '/usr/share/fonts/truetype/arphic/uming.ttc',
- '/usr/share/fonts/truetype/arphic/ukai.ttc',
- ]
-
- # Windows 中文字体路径
- if sys.platform == "win32":
- windir = os.environ.get("WINDIR", r"C:\Windows")
- windows_font_paths = [
- os.path.join(windir, "Fonts", "msyh.ttc"),
- os.path.join(windir, "Fonts", "msyhbd.ttc"),
- os.path.join(windir, "Fonts", "simhei.ttf"),
- os.path.join(windir, "Fonts", "simsun.ttc"),
- os.path.join(windir, "Fonts", "STHeiti Light.ttc"),
- ]
- linux_font_paths.extend(windows_font_paths)
-
- # macOS 中文字体路径
- elif sys.platform == "darwin":
- mac_font_paths = [
- '/System/Library/Fonts/PingFang.ttc',
- '/System/Library/Fonts/STHeiti Light.ttc',
- '/Library/Fonts/Arial Unicode.ttf',
- '/System/Library/Fonts/Supplemental/Arial Unicode.ttf',
- ]
- linux_font_paths.extend(mac_font_paths)
-
- # 查找存在的字体文件
- for font_path in linux_font_paths:
- if os.path.exists(font_path):
- # 清除字体缓存并加载指定字体
- fm.fontManager.addfont(font_path)
- font = fm.FontProperties(fname=font_path)
- # 验证字体可以显示中文
- return font
-
- # 如果没找到,返回 None 让 matplotlib 使用默认
- return None
+from markdown2image import Markdown2Image
def markdown_to_png(md_text, img_path):
- """将 Markdown 渲染为 PNG 图片(使用 matplotlib)"""
- try:
- import matplotlib.pyplot as plt
- import matplotlib.patches as patches
- import matplotlib.font_manager as fm
- import matplotlib
- except ImportError:
- raise ImportError("matplotlib not installed. Run: pip install matplotlib")
-
- # 设置非交互式后端
- matplotlib.use('Agg')
-
- # 获取中文字体
- font = _get_matplotlib_font()
-
- elements = parse_markdown(md_text)
-
- W, PAD = 10, 0.5 # 英寸, 边距
- FIG_H = 2.0 # 初始高度
- LINE_H = 0.35 # 每行高度
- CODE_H = 0.5 # 代码块初始高度
-
- # 计算所需高度
- y = 2.5 # 顶部空间
- for elem in elements:
- if elem[0] == 'header':
- level = elem[1]
- text = elem[2]
- chars_per_line = 50 if level <= 2 else 60
- lines = max(1, len(text) // chars_per_line + 1)
- y += lines * (0.5 if level <= 2 else 0.4) + 0.2
- elif elem[0] == 'paragraph':
- chars_per_line = 60
- lines = max(1, len(elem[1]) // chars_per_line + 1)
- y += lines * 0.35 + 0.3
- elif elem[0] == 'codeblock':
- lines = elem[1].count('\n') + 2
- y += lines * 0.3 + 0.2
- elif elem[0] in ('list_item', 'quote'):
- chars_per_line = 55
- lines = max(1, len(elem[1]) // chars_per_line + 1)
- y += lines * 0.32 + 0.15
-
- FIG_H = max(8, y)
-
- fig, ax = plt.subplots(figsize=(W, FIG_H))
- fig.patch.set_facecolor('#ffffff')
- ax.set_facecolor('#ffffff')
-
- # 标题栏背景
- header = patches.Rectangle((0, FIG_H - 1.2), W, 1.0, linewidth=0, facecolor='#2d3748')
- ax.add_patch(header)
-
- # 标题
- ax.text(0.5, FIG_H - 0.6, 'Markdown Document',
- fontsize=18, fontweight='bold', color='white',
- fontproperties=font, ha='left', va='center')
-
- ax.text(0.5, FIG_H - 1.0, 'Converted from Markdown',
- fontsize=10, color='#888888', fontproperties=font, ha='left', va='center')
-
- ax.set_xlim(0, W)
- ax.set_ylim(0, FIG_H)
- ax.axis('off')
-
- cy = FIG_H - 1.5
-
- for elem in elements:
- if elem[0] == 'header':
- level = elem[1]
- text = elem[2]
- size = 16 if level <= 2 else 14
- weight = 'bold' if level == 1 else 'normal'
- color = '#1a1a2e' if level == 1 else '#2d3748'
- ax.text(PAD, cy, text, fontsize=size, fontweight=weight,
- color=color, fontproperties=font, ha='left', va='top')
- cy -= size * 0.04 + 0.15
-
- elif elem[0] == 'paragraph':
- ax.text(PAD, cy, elem[1], fontsize=11, color='#374151',
- fontproperties=font, ha='left', va='top', wrap=True)
- lines = max(1, len(elem[1]) // 60 + 1)
- cy -= lines * 0.35 + 0.25
-
- elif elem[0] == 'codeblock':
- code_h = max(0.5, (elem[1].count('\n') + 2) * 0.3)
- code_box = patches.Rectangle((PAD, cy - code_h), W - PAD * 2, code_h,
- linewidth=1, edgecolor='#e0e0e0', facecolor='#f4f4f4')
- ax.add_patch(code_box)
- ax.text(PAD + 0.1, cy - 0.15, elem[1][:500], fontsize=9,
- color='#333333', fontfamily='monospace', va='top')
- cy -= code_h + 0.2
-
- elif elem[0] == 'list_item':
- ax.text(PAD, cy, f'\u2022 {elem[1]}', fontsize=11, color='#374151',
- fontproperties=font, ha='left', va='top')
- cy -= 0.35
-
- elif elem[0] == 'quote':
- ax.plot([PAD, PAD, PAD + 0.05, PAD + 0.05],
- [cy, cy - 0.4, cy - 0.4, cy - 0.6],
- color='#0066cc', linewidth=2)
- ax.text(PAD + 0.15, cy - 0.1, elem[1], fontsize=11, color='#666666',
- fontproperties=font, ha='left', va='top')
- cy -= 0.5
-
- elif elem[0] == 'hr':
- ax.axhline(y=cy, color='#e0e0e0', linewidth=1, xmin=0.05, xmax=0.95)
- cy -= 0.3
-
- plt.tight_layout(pad=0)
-
+ """将 Markdown 渲染为 PNG 图片"""
Path(img_path).parent.mkdir(parents=True, exist_ok=True)
- plt.savefig(img_path, format='png', dpi=150, bbox_inches='tight',
- facecolor='#ffffff', edgecolor='none')
- plt.close()
+
+ m2i = Markdown2Image()
+ m2i.b64_decode_and_dump(markdown_text=md_text, output_path=img_path)
-# ============================================================
-# 纯文本转换
-# ============================================================
-
-def markdown_to_text(md_text):
- """将 Markdown 转换为纯文本"""
- if not md_text:
- return ""
-
- md_text = _clean_invisible_chars(md_text)
- elements = parse_markdown(md_text)
-
- lines = []
-
- for elem in elements:
- if elem[0] == 'header':
- level = elem[1]
- prefix = "#" * level + " "
- lines.append(f"{prefix}{elem[2]}")
- lines.append("")
-
- elif elem[0] == 'paragraph':
- lines.append(elem[1])
- lines.append("")
-
- elif elem[0] == 'codeblock':
- lines.append("```")
- lines.append(elem[1])
- lines.append("```")
- lines.append("")
-
- elif elem[0] == 'list_item':
- lines.append(f"• {elem[1]}")
-
- elif elem[0] == 'ordered_item':
- lines.append(f" {elem[1]}")
-
- elif elem[0] == 'quote':
- lines.append(f"> {elem[1]}")
-
- elif elem[0] == 'hr':
- lines.append("─" * 50)
- lines.append("")
-
- return '\n'.join(lines)
-
-
-# ============================================================
-# 主函数
-# ============================================================
-
def main():
if len(sys.argv) < 3:
- print("Usage: python md_convert.py ", file=sys.stderr)
+ print("Usage: python md_convert.py ", file=sys.stderr)
sys.exit(1)
input_path = sys.argv[1]
@@ -586,25 +32,10 @@ def main():
sys.exit(1)
md_text = Path(input_path).read_text(encoding="utf-8")
- ext = Path(output_path).suffix.lower()
-
- Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-
- if ext == ".html":
- html_content = markdown_to_html(md_text)
- Path(output_path).write_text(html_content, encoding="utf-8")
- elif ext == ".png":
- markdown_to_png(md_text, output_path)
- elif ext == ".txt":
- text_content = markdown_to_text(md_text)
- Path(output_path).write_text(text_content, encoding="utf-8")
- else:
- print(f"Error: Unsupported output format: {ext}", file=sys.stderr)
- print("Supported formats: .html, .png, .txt", file=sys.stderr)
- sys.exit(1)
+ markdown_to_png(md_text, output_path)
print(f"Converted: {input_path} -> {output_path}")
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()