"""Markdown 转换器 - 支持 HTML / PNG / Plain Text 输出 Dependencies: - HTML/PNG 输出: pip install Pillow html2text pygments markdown """ import html import re import sys import os from pathlib import Path # ============================================================ # 字体查找 # ============================================================ def _find_font(): """Find a suitable TrueType font across platforms.""" candidates = [] if sys.platform == "win32": pf = os.environ.get("WINDIR", r"C:\Windows") candidates = [ os.path.join(pf, "Fonts", "msyh.ttc"), os.path.join(pf, "Fonts", "msyhbd.ttc"), os.path.join(pf, "Fonts", "simhei.ttf"), os.path.join(pf, "Fonts", "simsun.ttc"), ] elif sys.platform == "darwin": candidates = [ "/System/Library/Fonts/PingFang.ttc", "/System/Library/Fonts/STHeiti Light.ttc", "/Library/Fonts/Arial Unicode.ttf", ] else: candidates = [ "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc", "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc", ] for p in candidates: if os.path.exists(p): return p return None # ============================================================ # 工具函数 # ============================================================ def _clean_invisible_chars(text): """清理不可见的 Unicode 字符""" if not text: return "" text = re.sub(r'[\u200b-\u200f\u2028-\u202f\ufeff\u00ad]', '', text) text = re.sub(r'[\xa0\u3000]', ' ', text) text = re.sub(r'[\uff00-\uffef]', '', text) return text def _decode_html_entities(text): """解码 HTML 实体""" entities = { '<': '<', '>': '>', '&': '&', '"': '"', ''': "'", ''': "'", } for k, v in entities.items(): text = text.replace(k, v) # 处理特殊引号 text = text.replace('"', '"').replace('"', '"') text = text.replace(''', "'").replace(''', "'") return text # ============================================================ # Markdown 解析 # ============================================================ def parse_markdown(md_text): """解析 Markdown 文本,提取标题、代码块、段落等元素""" if not md_text: return [] md_text = _clean_invisible_chars(md_text) lines = md_text.split('\n') elements = [] current_paragraph = [] def flush_paragraph(): nonlocal current_paragraph if current_paragraph: text = ' '.join(current_paragraph) if text.strip(): elements.append(('paragraph', text.strip())) current_paragraph = [] for line in lines: stripped = line.strip() # 跳过空行 if not stripped: flush_paragraph() continue # 标题 header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped) if header_match: flush_paragraph() level = len(header_match.group(1)) elements.append(('header', level, header_match.group(2).strip())) continue # 代码块 if stripped.startswith('```'): flush_paragraph() elements.append(('codeblock', stripped[3:].strip())) continue # 无序列表 list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped) if list_match: flush_paragraph() elements.append(('list_item', list_match.group(1).strip())) continue # 有序列表 ordered_match = re.match(r'^\d+\.\s+(.+)$', stripped) if ordered_match: flush_paragraph() elements.append(('ordered_item', ordered_match.group(1).strip())) continue # 引用 if stripped.startswith('>'): flush_paragraph() content = stripped[1:].strip() elements.append(('quote', content)) continue # 水平线 if re.match(r'^[\-\*_]{3,}$', stripped): flush_paragraph() elements.append(('hr',)) continue # 链接或图片 link_match = re.match(r'!?\[([^\]]+)\]\([^\)]+\)', stripped) if link_match: flush_paragraph() elements.append(('link', link_match.group(1))) continue # 默认作为段落处理 current_paragraph.append(stripped) flush_paragraph() return elements # ============================================================ # 文本换行 # ============================================================ def _wrap_text(text, font, max_width, draw): """Wrap text to fit within max_width pixels.""" lines = [] for paragraph in text.split("\n"): if not paragraph.strip(): lines.append("") continue current = "" for ch in paragraph: test = current + ch bbox = draw.textbbox((0, 0), test, font=font) if bbox[2] - bbox[0] > max_width and current: lines.append(current) current = ch else: current = test if current: lines.append(current) return lines # ============================================================ # HTML 转换 # ============================================================ def markdown_to_html(md_text): """将 Markdown 转换为 HTML""" if not md_text: return "" md_text = _clean_invisible_chars(md_text) elements = parse_markdown(md_text) try: import html except ImportError: import urllib.parse as html try: import markdown from markdown.extensions import codehilite md = markdown.Markdown(extensions=['codehilite', 'fenced_code', 'tables']) html_content = md.convert(md_text) return _html_template(html_content) except ImportError: # 降级处理:使用简单的转换 return _simple_markdown_to_html(md_text) def _simple_markdown_to_html(md_text): """简单的 Markdown 到 HTML 转换(无外部依赖)""" lines = md_text.split('\n') html_lines = [] in_codeblock = False for line in lines: stripped = line.strip() # 代码块开始/结束 if stripped.startswith('```'): if in_codeblock: html_lines.append('') in_codeblock = False else: lang = stripped[3:].strip() or '' lang_attr = f' class="language-{lang}"' if lang else '' html_lines.append(f'
')
                in_codeblock = True
            continue

        if in_codeblock:
            html_lines.append(html.escape(line))
            continue

        # 标题
        header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
        if header_match:
            level = len(header_match.group(1))
            content = header_match.group(2)
            html_lines.append(f'{_decode_html_entities(content)}')
            continue

        # 水平线
        if re.match(r'^[\-\*_]{3,}$', stripped):
            html_lines.append('
') continue # 引用 if stripped.startswith('>'): content = stripped[1:].strip() html_lines.append(f'
{_decode_html_entities(content)}
') continue # 列表项 list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped) if list_match: content = list_match.group(1) html_lines.append(f'
  • {_decode_html_entities(content)}
  • ') continue # 段落 if stripped: # 处理粗体和斜体 text = _decode_html_entities(stripped) text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) text = re.sub(r'\*(.+?)\*', r'\1', text) text = re.sub(r'__(.+?)__', r'\1', text) text = re.sub(r'_(.+?)_', r'\1', text) text = re.sub(r'`(.+?)`', r'\1', text) html_lines.append(f'

    {text}

    ') return _html_template('\n'.join(html_lines)) def _html_template(content): """生成完整的 HTML 文档""" return f""" Markdown Document {content} """ # ============================================================ # PNG 渲染 (使用 matplotlib) # ============================================================ def _get_matplotlib_font(): """获取支持中文的 matplotlib 字体(通过字体文件路径)""" import matplotlib import matplotlib.font_manager as fm import os import sys # Linux 中文字体路径 linux_font_paths = [ '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc', '/usr/share/fonts/opentype/noto/NotoSansSC-Regular.otf', '/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc', '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc', '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc', '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf', '/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc', '/usr/share/fonts/truetype/arphic/uming.ttc', '/usr/share/fonts/truetype/arphic/ukai.ttc', ] # Windows 中文字体路径 if sys.platform == "win32": windir = os.environ.get("WINDIR", r"C:\Windows") windows_font_paths = [ os.path.join(windir, "Fonts", "msyh.ttc"), os.path.join(windir, "Fonts", "msyhbd.ttc"), os.path.join(windir, "Fonts", "simhei.ttf"), os.path.join(windir, "Fonts", "simsun.ttc"), os.path.join(windir, "Fonts", "STHeiti Light.ttc"), ] linux_font_paths.extend(windows_font_paths) # macOS 中文字体路径 elif sys.platform == "darwin": mac_font_paths = [ '/System/Library/Fonts/PingFang.ttc', '/System/Library/Fonts/STHeiti Light.ttc', '/Library/Fonts/Arial Unicode.ttf', '/System/Library/Fonts/Supplemental/Arial Unicode.ttf', ] linux_font_paths.extend(mac_font_paths) # 查找存在的字体文件 for font_path in linux_font_paths: if os.path.exists(font_path): # 清除字体缓存并加载指定字体 fm.fontManager.addfont(font_path) font = fm.FontProperties(fname=font_path) # 验证字体可以显示中文 return font # 如果没找到,返回 None 让 matplotlib 使用默认 return None def markdown_to_png(md_text, img_path): """将 Markdown 渲染为 PNG 图片(使用 matplotlib)""" try: import matplotlib.pyplot as plt import matplotlib.patches as patches import matplotlib.font_manager as fm import matplotlib except ImportError: raise ImportError("matplotlib not installed. Run: pip install matplotlib") # 设置非交互式后端 matplotlib.use('Agg') # 获取中文字体 font = _get_matplotlib_font() elements = parse_markdown(md_text) W, PAD = 10, 0.5 # 英寸, 边距 FIG_H = 2.0 # 初始高度 LINE_H = 0.35 # 每行高度 CODE_H = 0.5 # 代码块初始高度 # 计算所需高度 y = 2.5 # 顶部空间 for elem in elements: if elem[0] == 'header': level = elem[1] text = elem[2] chars_per_line = 50 if level <= 2 else 60 lines = max(1, len(text) // chars_per_line + 1) y += lines * (0.5 if level <= 2 else 0.4) + 0.2 elif elem[0] == 'paragraph': chars_per_line = 60 lines = max(1, len(elem[1]) // chars_per_line + 1) y += lines * 0.35 + 0.3 elif elem[0] == 'codeblock': lines = elem[1].count('\n') + 2 y += lines * 0.3 + 0.2 elif elem[0] in ('list_item', 'quote'): chars_per_line = 55 lines = max(1, len(elem[1]) // chars_per_line + 1) y += lines * 0.32 + 0.15 FIG_H = max(8, y) fig, ax = plt.subplots(figsize=(W, FIG_H)) fig.patch.set_facecolor('#ffffff') ax.set_facecolor('#ffffff') # 标题栏背景 header = patches.Rectangle((0, FIG_H - 1.2), W, 1.0, linewidth=0, facecolor='#2d3748') ax.add_patch(header) # 标题 ax.text(0.5, FIG_H - 0.6, 'Markdown Document', fontsize=18, fontweight='bold', color='white', fontproperties=font, ha='left', va='center') ax.text(0.5, FIG_H - 1.0, 'Converted from Markdown', fontsize=10, color='#888888', fontproperties=font, ha='left', va='center') ax.set_xlim(0, W) ax.set_ylim(0, FIG_H) ax.axis('off') cy = FIG_H - 1.5 for elem in elements: if elem[0] == 'header': level = elem[1] text = elem[2] size = 16 if level <= 2 else 14 weight = 'bold' if level == 1 else 'normal' color = '#1a1a2e' if level == 1 else '#2d3748' ax.text(PAD, cy, text, fontsize=size, fontweight=weight, color=color, fontproperties=font, ha='left', va='top') cy -= size * 0.04 + 0.15 elif elem[0] == 'paragraph': ax.text(PAD, cy, elem[1], fontsize=11, color='#374151', fontproperties=font, ha='left', va='top', wrap=True) lines = max(1, len(elem[1]) // 60 + 1) cy -= lines * 0.35 + 0.25 elif elem[0] == 'codeblock': code_h = max(0.5, (elem[1].count('\n') + 2) * 0.3) code_box = patches.Rectangle((PAD, cy - code_h), W - PAD * 2, code_h, linewidth=1, edgecolor='#e0e0e0', facecolor='#f4f4f4') ax.add_patch(code_box) ax.text(PAD + 0.1, cy - 0.15, elem[1][:500], fontsize=9, color='#333333', fontfamily='monospace', va='top') cy -= code_h + 0.2 elif elem[0] == 'list_item': ax.text(PAD, cy, f'\u2022 {elem[1]}', fontsize=11, color='#374151', fontproperties=font, ha='left', va='top') cy -= 0.35 elif elem[0] == 'quote': ax.plot([PAD, PAD, PAD + 0.05, PAD + 0.05], [cy, cy - 0.4, cy - 0.4, cy - 0.6], color='#0066cc', linewidth=2) ax.text(PAD + 0.15, cy - 0.1, elem[1], fontsize=11, color='#666666', fontproperties=font, ha='left', va='top') cy -= 0.5 elif elem[0] == 'hr': ax.axhline(y=cy, color='#e0e0e0', linewidth=1, xmin=0.05, xmax=0.95) cy -= 0.3 plt.tight_layout(pad=0) Path(img_path).parent.mkdir(parents=True, exist_ok=True) plt.savefig(img_path, format='png', dpi=150, bbox_inches='tight', facecolor='#ffffff', edgecolor='none') plt.close() # ============================================================ # 纯文本转换 # ============================================================ def markdown_to_text(md_text): """将 Markdown 转换为纯文本""" if not md_text: return "" md_text = _clean_invisible_chars(md_text) elements = parse_markdown(md_text) lines = [] for elem in elements: if elem[0] == 'header': level = elem[1] prefix = "#" * level + " " lines.append(f"{prefix}{elem[2]}") lines.append("") elif elem[0] == 'paragraph': lines.append(elem[1]) lines.append("") elif elem[0] == 'codeblock': lines.append("```") lines.append(elem[1]) lines.append("```") lines.append("") elif elem[0] == 'list_item': lines.append(f"• {elem[1]}") elif elem[0] == 'ordered_item': lines.append(f" {elem[1]}") elif elem[0] == 'quote': lines.append(f"> {elem[1]}") elif elem[0] == 'hr': lines.append("─" * 50) lines.append("") return '\n'.join(lines) # ============================================================ # 主函数 # ============================================================ def main(): if len(sys.argv) < 3: print("Usage: python md_convert.py ", file=sys.stderr) sys.exit(1) input_path = sys.argv[1] output_path = sys.argv[2] if not os.path.exists(input_path): print(f"Error: Input file not found: {input_path}", file=sys.stderr) sys.exit(1) md_text = Path(input_path).read_text(encoding="utf-8") ext = Path(output_path).suffix.lower() Path(output_path).parent.mkdir(parents=True, exist_ok=True) if ext == ".html": html_content = markdown_to_html(md_text) Path(output_path).write_text(html_content, encoding="utf-8") elif ext == ".png": markdown_to_png(md_text, output_path) elif ext == ".txt": text_content = markdown_to_text(md_text) Path(output_path).write_text(text_content, encoding="utf-8") else: print(f"Error: Unsupported output format: {ext}", file=sys.stderr) print("Supported formats: .html, .png, .txt", file=sys.stderr) sys.exit(1) print(f"Converted: {input_path} -> {output_path}") if __name__ == "__main__": main()