SKILLS/markdown-converter/scripts/md_convert.py

610 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Markdown 转换器 - 支持 HTML / PNG / Plain Text 输出
Dependencies:
- HTML/PNG 输出: pip install Pillow html2text pygments markdown
"""
import html
import re
import sys
import os
from pathlib import Path
# ============================================================
# 字体查找
# ============================================================
def _find_font():
"""Find a suitable TrueType font across platforms."""
candidates = []
if sys.platform == "win32":
pf = os.environ.get("WINDIR", r"C:\Windows")
candidates = [
os.path.join(pf, "Fonts", "msyh.ttc"),
os.path.join(pf, "Fonts", "msyhbd.ttc"),
os.path.join(pf, "Fonts", "simhei.ttf"),
os.path.join(pf, "Fonts", "simsun.ttc"),
]
elif sys.platform == "darwin":
candidates = [
"/System/Library/Fonts/PingFang.ttc",
"/System/Library/Fonts/STHeiti Light.ttc",
"/Library/Fonts/Arial Unicode.ttf",
]
else:
candidates = [
"/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
"/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
"/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
"/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
]
for p in candidates:
if os.path.exists(p):
return p
return None
# ============================================================
# 工具函数
# ============================================================
def _clean_invisible_chars(text):
"""清理不可见的 Unicode 字符"""
if not text:
return ""
text = re.sub(r'[\u200b-\u200f\u2028-\u202f\ufeff\u00ad]', '', text)
text = re.sub(r'[\xa0\u3000]', ' ', text)
text = re.sub(r'[\uff00-\uffef]', '', text)
return text
def _decode_html_entities(text):
"""解码 HTML 实体"""
entities = {
'<': '<',
'>': '>',
'&': '&',
'"': '"',
''': "'",
''': "'",
}
for k, v in entities.items():
text = text.replace(k, v)
# 处理特殊引号
text = text.replace('"', '"').replace('"', '"')
text = text.replace(''', "'").replace(''', "'")
return text
# ============================================================
# Markdown 解析
# ============================================================
def parse_markdown(md_text):
    """Parse Markdown text into a flat list of block elements.

    Each element is a tuple whose first item names the block type:

    * ``('header', level, text)``
    * ``('codeblock', code)`` - the verbatim content of a fenced block,
      lines joined with ``\\n`` (the fence's language tag is not kept)
    * ``('list_item', text)`` / ``('ordered_item', text)``
    * ``('quote', text)``
    * ``('hr',)``
    * ``('link', label)`` - only for a line consisting solely of a link/image
    * ``('paragraph', text)`` - consecutive plain lines joined by spaces

    Args:
        md_text: Markdown source; may be ``None`` or empty.

    Returns:
        The list of element tuples in document order (empty for falsy input).
    """
    if not md_text:
        return []
    md_text = _clean_invisible_chars(md_text)
    elements = []
    paragraph_lines = []
    # None while outside a fenced block; a list collecting raw lines inside.
    code_lines = None

    def flush_paragraph():
        """Emit the buffered plain lines as one paragraph element."""
        if paragraph_lines:
            text = ' '.join(paragraph_lines).strip()
            if text:
                elements.append(('paragraph', text))
            paragraph_lines.clear()

    for line in md_text.split('\n'):
        stripped = line.strip()

        # Fence lines toggle code mode; everything in between is kept
        # verbatim instead of being interpreted as Markdown.
        if stripped.startswith('```'):
            if code_lines is None:
                flush_paragraph()
                code_lines = []
            else:
                elements.append(('codeblock', '\n'.join(code_lines)))
                code_lines = None
            continue
        if code_lines is not None:
            # Preserve indentation; only drop a CR left over from CRLF input.
            code_lines.append(line.rstrip('\r'))
            continue

        # Blank line ends the current paragraph.
        if not stripped:
            flush_paragraph()
            continue

        # ATX header.
        header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
        if header_match:
            flush_paragraph()
            level = len(header_match.group(1))
            elements.append(('header', level, header_match.group(2).strip()))
            continue

        # Unordered list item.
        list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped)
        if list_match:
            flush_paragraph()
            elements.append(('list_item', list_match.group(1).strip()))
            continue

        # Ordered list item.
        ordered_match = re.match(r'^\d+\.\s+(.+)$', stripped)
        if ordered_match:
            flush_paragraph()
            elements.append(('ordered_item', ordered_match.group(1).strip()))
            continue

        # Block quote.
        if stripped.startswith('>'):
            flush_paragraph()
            elements.append(('quote', stripped[1:].strip()))
            continue

        # Horizontal rule.
        if re.match(r'^[\-\*_]{3,}$', stripped):
            flush_paragraph()
            elements.append(('hr',))
            continue

        # Stand-alone link or image. fullmatch keeps lines that merely begin
        # with a link in the paragraph flow so their trailing text survives.
        link_match = re.fullmatch(r'!?\[([^\]]+)\]\([^\)]+\)', stripped)
        if link_match:
            flush_paragraph()
            elements.append(('link', link_match.group(1)))
            continue

        # Anything else is paragraph text.
        paragraph_lines.append(stripped)

    # An unterminated fence still yields the code collected so far.
    if code_lines is not None:
        elements.append(('codeblock', '\n'.join(code_lines)))
    flush_paragraph()
    return elements
# ============================================================
# 文本换行
# ============================================================
def _wrap_text(text, font, max_width, draw):
"""Wrap text to fit within max_width pixels."""
lines = []
for paragraph in text.split("\n"):
if not paragraph.strip():
lines.append("")
continue
current = ""
for ch in paragraph:
test = current + ch
bbox = draw.textbbox((0, 0), test, font=font)
if bbox[2] - bbox[0] > max_width and current:
lines.append(current)
current = ch
else:
current = test
if current:
lines.append(current)
return lines
# ============================================================
# HTML 转换
# ============================================================
def markdown_to_html(md_text):
    """Convert Markdown text to a complete HTML document.

    The third-party ``markdown`` package is used when it can be imported
    (with the ``codehilite``, ``fenced_code`` and ``tables`` extensions);
    otherwise the dependency-free ``_simple_markdown_to_html`` fallback
    produces the document.

    Args:
        md_text: Markdown source; may be ``None`` or empty.

    Returns:
        The HTML document as a string, or ``""`` for falsy input.
    """
    if not md_text:
        return ""
    md_text = _clean_invisible_chars(md_text)
    try:
        import markdown
        # Extension loading may itself raise ImportError, so it stays inside
        # the guarded section and triggers the same fallback.
        converter = markdown.Markdown(extensions=['codehilite', 'fenced_code', 'tables'])
    except ImportError:
        # Degrade gracefully: use the simple built-in converter.
        return _simple_markdown_to_html(md_text)
    return _html_template(converter.convert(md_text))
def _simple_markdown_to_html(md_text):
    """Convert Markdown to an HTML document without external dependencies.

    Supports fenced code blocks, ATX headers, horizontal rules, block
    quotes, unordered and ordered lists, and paragraphs with bold, italic
    and inline-code markup. Code block content is HTML-escaped; other text
    is passed through ``_decode_html_entities`` and inserted as-is, so raw
    HTML in the Markdown source is preserved.

    Args:
        md_text: Markdown source text.

    Returns:
        A full HTML document produced by ``_html_template``.
    """
    html_lines = []
    in_codeblock = False
    open_list = None  # 'ul', 'ol' or None: the list container currently open

    def close_list():
        """Close the currently open list container, if any."""
        nonlocal open_list
        if open_list is not None:
            html_lines.append(f'</{open_list}>')
            open_list = None

    def add_list_item(tag, content):
        """Append an <li>, opening (or switching to) the right container."""
        nonlocal open_list
        if open_list != tag:
            close_list()
            html_lines.append(f'<{tag}>')
            open_list = tag
        html_lines.append(f'<li>{_decode_html_entities(content)}</li>')

    for line in md_text.split('\n'):
        stripped = line.strip()

        # Fence lines open or close a code block.
        if stripped.startswith('```'):
            if in_codeblock:
                html_lines.append('</code></pre>')
                in_codeblock = False
            else:
                close_list()
                lang = stripped[3:].strip()
                # The language tag comes from the document, so escape it
                # before placing it inside an attribute value.
                lang_attr = f' class="language-{html.escape(lang, quote=True)}"' if lang else ''
                html_lines.append(f'<pre><code{lang_attr}>')
                in_codeblock = True
            continue
        if in_codeblock:
            html_lines.append(html.escape(line))
            continue

        # Blank lines carry no content (and do not interrupt a list).
        if not stripped:
            continue

        # List items are handled first among the remaining rules that could
        # match them; none of the header/hr/quote patterns overlap with them.
        list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped)
        if list_match:
            add_list_item('ul', list_match.group(1))
            continue
        ordered_match = re.match(r'^\d+\.\s+(.+)$', stripped)
        if ordered_match:
            add_list_item('ol', ordered_match.group(1))
            continue

        # Any other block ends the current list.
        close_list()

        # Header.
        header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
        if header_match:
            level = len(header_match.group(1))
            content = header_match.group(2)
            html_lines.append(f'<h{level}>{_decode_html_entities(content)}</h{level}>')
            continue

        # Horizontal rule.
        if re.match(r'^[\-\*_]{3,}$', stripped):
            html_lines.append('<hr>')
            continue

        # Block quote.
        if stripped.startswith('>'):
            content = stripped[1:].strip()
            html_lines.append(f'<blockquote>{_decode_html_entities(content)}</blockquote>')
            continue

        # Paragraph with inline emphasis; bold is replaced before italic so
        # that '**' is not consumed by the single-asterisk rule.
        text = _decode_html_entities(stripped)
        text = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', text)
        text = re.sub(r'\*(.+?)\*', r'<em>\1</em>', text)
        text = re.sub(r'__(.+?)__', r'<strong>\1</strong>', text)
        text = re.sub(r'_(.+?)_', r'<em>\1</em>', text)
        text = re.sub(r'`(.+?)`', r'<code>\1</code>', text)
        html_lines.append(f'<p>{text}</p>')

    # Do not leave containers open when the document ends abruptly.
    close_list()
    if in_codeblock:
        html_lines.append('</code></pre>')
    return _html_template('\n'.join(html_lines))
def _html_template(content):
"""生成完整的 HTML 文档"""
return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Markdown Document</title>
<style>
body {{
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 0 auto;
padding: 20px;
color: #333;
}}
h1, h2, h3, h4, h5, h6 {{
margin-top: 1.5em;
margin-bottom: 0.5em;
font-weight: 600;
}}
h1 {{ font-size: 2em; border-bottom: 2px solid #333; }}
h2 {{ font-size: 1.5em; border-bottom: 1px solid #ddd; }}
code {{
background: #f4f4f4;
padding: 2px 6px;
border-radius: 3px;
font-family: 'Consolas', 'Monaco', monospace;
}}
pre {{
background: #f4f4f4;
padding: 16px;
border-radius: 6px;
overflow-x: auto;
}}
pre code {{
background: none;
padding: 0;
}}
blockquote {{
border-left: 4px solid #ddd;
margin: 0;
padding-left: 16px;
color: #666;
}}
a {{
color: #0066cc;
}}
hr {{
border: none;
border-top: 1px solid #ddd;
margin: 24px 0;
}}
</style>
</head>
<body>
{content}
</body>
</html>"""
# ============================================================
# PNG 渲染 (使用 matplotlib)
# ============================================================
def _get_matplotlib_font():
    """Return matplotlib font properties for a CJK-capable font file.

    Well-known font file locations are probed in order: common Linux paths
    first (they simply do not exist on other systems), followed by the
    Windows or macOS locations when running on that platform. The first file
    found is registered with matplotlib's font manager.

    Returns:
        A ``matplotlib.font_manager.FontProperties`` bound to the first
        existing font file, or ``None`` when no candidate exists so that
        matplotlib falls back to its default font.
    """
    import matplotlib.font_manager as fm

    candidates = [
        '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
        '/usr/share/fonts/opentype/noto/NotoSansSC-Regular.otf',
        '/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc',
        '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
        '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf',
        '/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc',
        '/usr/share/fonts/truetype/arphic/uming.ttc',
        '/usr/share/fonts/truetype/arphic/ukai.ttc',
    ]
    if sys.platform == "win32":
        windir = os.environ.get("WINDIR", r"C:\Windows")
        candidates.extend(
            os.path.join(windir, "Fonts", file_name)
            for file_name in (
                "msyh.ttc",
                "msyhbd.ttc",
                "simhei.ttf",
                "simsun.ttc",
                "STHeiti Light.ttc",
            )
        )
    elif sys.platform == "darwin":
        candidates.extend([
            '/System/Library/Fonts/PingFang.ttc',
            '/System/Library/Fonts/STHeiti Light.ttc',
            '/Library/Fonts/Arial Unicode.ttf',
            '/System/Library/Fonts/Supplemental/Arial Unicode.ttf',
        ])

    for font_path in candidates:
        if os.path.exists(font_path):
            # Register the file so matplotlib can resolve it, then refer to
            # it by path; no glyph-coverage check is performed.
            fm.fontManager.addfont(font_path)
            return fm.FontProperties(fname=font_path)
    # Nothing found: let matplotlib use its default font.
    return None
def markdown_to_png(md_text, img_path):
    """Render Markdown as a PNG image (using matplotlib).

    The document is parsed with ``parse_markdown`` and drawn top-down on a
    10-inch-wide figure beneath a fixed title banner. The figure height is
    estimated beforehand from rough characters-per-line heuristics, with a
    minimum of 8 inches.

    Args:
        md_text: Markdown source text.
        img_path: Destination path of the PNG file; missing parent
            directories are created.

    Raises:
        ImportError: If matplotlib is not installed.
    """
    try:
        import matplotlib
        # Pick the non-interactive backend before pyplot is imported so no
        # GUI toolkit or display is required.
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import matplotlib.patches as patches
    except ImportError as exc:
        raise ImportError("matplotlib not installed. Run: pip install matplotlib") from exc

    # CJK-capable font, or None for matplotlib's default.
    font = _get_matplotlib_font()
    elements = parse_markdown(md_text)
    W, PAD = 10, 0.5  # figure width and horizontal margin, in inches

    # Estimate the total height needed (inches), starting with banner space.
    y = 2.5
    for elem in elements:
        kind = elem[0]
        if kind == 'header':
            level = elem[1]
            text = elem[2]
            chars_per_line = 50 if level <= 2 else 60
            lines = max(1, len(text) // chars_per_line + 1)
            y += lines * (0.5 if level <= 2 else 0.4) + 0.2
        elif kind == 'paragraph':
            chars_per_line = 60
            lines = max(1, len(elem[1]) // chars_per_line + 1)
            y += lines * 0.35 + 0.3
        elif kind == 'codeblock':
            lines = elem[1].count('\n') + 2
            y += lines * 0.3 + 0.2
        elif kind in ('list_item', 'ordered_item', 'quote', 'link'):
            chars_per_line = 55
            lines = max(1, len(elem[1]) // chars_per_line + 1)
            y += lines * 0.32 + 0.15
        elif kind == 'hr':
            y += 0.3
    FIG_H = max(8, y)

    fig, ax = plt.subplots(figsize=(W, FIG_H))
    try:
        fig.patch.set_facecolor('#ffffff')
        ax.set_facecolor('#ffffff')
        # Title banner background.
        header = patches.Rectangle((0, FIG_H - 1.2), W, 1.0, linewidth=0, facecolor='#2d3748')
        ax.add_patch(header)
        # Banner title and subtitle.
        ax.text(0.5, FIG_H - 0.6, 'Markdown Document',
                fontsize=18, fontweight='bold', color='white',
                fontproperties=font, ha='left', va='center')
        ax.text(0.5, FIG_H - 1.0, 'Converted from Markdown',
                fontsize=10, color='#888888', fontproperties=font, ha='left', va='center')
        # Data coordinates equal inches, so positions below are in inches.
        ax.set_xlim(0, W)
        ax.set_ylim(0, FIG_H)
        ax.axis('off')

        cy = FIG_H - 1.5  # current drawing position (top of the next element)
        ordinal = 0  # running number within a run of ordered list items
        for elem in elements:
            kind = elem[0]
            # Numbering restarts whenever a run of ordered items is broken.
            ordinal = ordinal + 1 if kind == 'ordered_item' else 0
            if kind == 'header':
                level = elem[1]
                text = elem[2]
                size = 16 if level <= 2 else 14
                weight = 'bold' if level == 1 else 'normal'
                color = '#1a1a2e' if level == 1 else '#2d3748'
                ax.text(PAD, cy, text, fontsize=size, fontweight=weight,
                        color=color, fontproperties=font, ha='left', va='top')
                cy -= size * 0.04 + 0.15
            elif kind == 'paragraph':
                ax.text(PAD, cy, elem[1], fontsize=11, color='#374151',
                        fontproperties=font, ha='left', va='top', wrap=True)
                lines = max(1, len(elem[1]) // 60 + 1)
                cy -= lines * 0.35 + 0.25
            elif kind == 'codeblock':
                code_h = max(0.5, (elem[1].count('\n') + 2) * 0.3)
                code_box = patches.Rectangle((PAD, cy - code_h), W - PAD * 2, code_h,
                                             linewidth=1, edgecolor='#e0e0e0', facecolor='#f4f4f4')
                ax.add_patch(code_box)
                # Only the first 500 characters of a code block are drawn.
                ax.text(PAD + 0.1, cy - 0.15, elem[1][:500], fontsize=9,
                        color='#333333', fontfamily='monospace', va='top')
                cy -= code_h + 0.2
            elif kind == 'list_item':
                ax.text(PAD, cy, f'\u2022 {elem[1]}', fontsize=11, color='#374151',
                        fontproperties=font, ha='left', va='top')
                cy -= 0.35
            elif kind == 'ordered_item':
                ax.text(PAD, cy, f'{ordinal}. {elem[1]}', fontsize=11, color='#374151',
                        fontproperties=font, ha='left', va='top')
                cy -= 0.35
            elif kind == 'quote':
                # Accent mark at the left edge of the quotation.
                ax.plot([PAD, PAD, PAD + 0.05, PAD + 0.05],
                        [cy, cy - 0.4, cy - 0.4, cy - 0.6],
                        color='#0066cc', linewidth=2)
                ax.text(PAD + 0.15, cy - 0.1, elem[1], fontsize=11, color='#666666',
                        fontproperties=font, ha='left', va='top')
                cy -= 0.5
            elif kind == 'link':
                # Stand-alone link/image: show its label in link colour.
                ax.text(PAD, cy, elem[1], fontsize=11, color='#0066cc',
                        fontproperties=font, ha='left', va='top')
                cy -= 0.35
            elif kind == 'hr':
                ax.axhline(y=cy, color='#e0e0e0', linewidth=1, xmin=0.05, xmax=0.95)
                cy -= 0.3

        fig.tight_layout(pad=0)
        Path(img_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(img_path, format='png', dpi=150, bbox_inches='tight',
                    facecolor='#ffffff', edgecolor='none')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)
# ============================================================
# 纯文本转换
# ============================================================
def markdown_to_text(md_text):
    """Convert Markdown to plain text.

    Headers keep their ``#`` prefix, code blocks keep their fences, list
    items get a bullet or a running number, quotes keep the ``>`` prefix,
    horizontal rules become a 50-character line and stand-alone links are
    reduced to their label.

    Args:
        md_text: Markdown source; may be ``None`` or empty.

    Returns:
        The plain-text rendering, or ``""`` for falsy input.
    """
    if not md_text:
        return ""
    # parse_markdown strips invisible characters itself, so no separate
    # clean-up pass is needed here.
    elements = parse_markdown(md_text)
    lines = []
    ordinal = 0  # running number within a run of ordered list items
    for elem in elements:
        kind = elem[0]
        # Numbering restarts whenever a run of ordered items is broken.
        ordinal = ordinal + 1 if kind == 'ordered_item' else 0
        if kind == 'header':
            level = elem[1]
            prefix = "#" * level + " "
            lines.append(f"{prefix}{elem[2]}")
            lines.append("")
        elif kind == 'paragraph':
            lines.append(elem[1])
            lines.append("")
        elif kind == 'codeblock':
            lines.append("```")
            lines.append(elem[1])
            lines.append("```")
            lines.append("")
        elif kind == 'list_item':
            lines.append(f"\u2022 {elem[1]}")
        elif kind == 'ordered_item':
            lines.append(f"{ordinal}. {elem[1]}")
        elif kind == 'quote':
            lines.append(f"> {elem[1]}")
        elif kind == 'link':
            lines.append(elem[1])
        elif kind == 'hr':
            # Box-drawing horizontal line, written as an escape so the
            # character cannot be lost by viewers that hide unusual Unicode.
            lines.append("\u2500" * 50)
            lines.append("")
    return '\n'.join(lines)
# ============================================================
# 主函数
# ============================================================
def main():
    """Command-line entry point: convert a Markdown file to HTML, PNG or text.

    Expects ``sys.argv[1]`` to be the input Markdown file and ``sys.argv[2]``
    the output path; the output format is chosen from the output file's
    extension (``.html``, ``.png`` or ``.txt``). Usage errors, a missing
    input file and an unsupported extension are reported on stderr and end
    the process with exit status 1.
    """
    argv = sys.argv
    if len(argv) < 3:
        print("Usage: python md_convert.py <input.md> <output.{html|png|txt}>", file=sys.stderr)
        sys.exit(1)
    input_path, output_path = argv[1], argv[2]

    if not os.path.exists(input_path):
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    md_text = Path(input_path).read_text(encoding="utf-8")
    target = Path(output_path)
    ext = target.suffix.lower()
    # The output directory is created before the extension is validated.
    target.parent.mkdir(parents=True, exist_ok=True)

    if ext == ".png":
        markdown_to_png(md_text, output_path)
    elif ext in (".html", ".txt"):
        # Both text formats share the same "render then write" shape.
        render = markdown_to_html if ext == ".html" else markdown_to_text
        target.write_text(render(md_text), encoding="utf-8")
    else:
        print(f"Error: Unsupported output format: {ext}", file=sys.stderr)
        print("Supported formats: .html, .png, .txt", file=sys.stderr)
        sys.exit(1)

    print(f"Converted: {input_path} -> {output_path}")


if __name__ == "__main__":
    main()