# SKILLS/markdown-converter/scripts/md_convert.py

"""Markdown 转换器 - 支持 HTML / PNG / Plain Text 输出

Dependencies:
  - HTML/PNG 输出: pip install Pillow html2text pygments markdown
"""

import html
import re
import sys
import os
from pathlib import Path


# ============================================================
# 字体查找
# ============================================================

def _find_font():
    """Find a suitable TrueType font across platforms."""
    candidates = []
    if sys.platform == "win32":
        pf = os.environ.get("WINDIR", r"C:\Windows")
        candidates = [
            os.path.join(pf, "Fonts", "msyh.ttc"),
            os.path.join(pf, "Fonts", "msyhbd.ttc"),
            os.path.join(pf, "Fonts", "simhei.ttf"),
            os.path.join(pf, "Fonts", "simsun.ttc"),
        ]
    elif sys.platform == "darwin":
        candidates = [
            "/System/Library/Fonts/PingFang.ttc",
            "/System/Library/Fonts/STHeiti Light.ttc",
            "/Library/Fonts/Arial Unicode.ttf",
        ]
    else:
        candidates = [
            "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
            "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
        ]
    for p in candidates:
        if os.path.exists(p):
            return p
    return None


# ============================================================
# 工具函数
# ============================================================

def _clean_invisible_chars(text):
    """清理不可见的 Unicode 字符"""
    if not text:
        return ""
    text = re.sub(r'[\u200b-\u200f\u2028-\u202f\ufeff\u00ad]', '', text)
    text = re.sub(r'[\xa0\u3000]', ' ', text)
    text = re.sub(r'[\uff00-\uffef]', '', text)
    return text


def _decode_html_entities(text):
    """解码 HTML 实体"""
    entities = {
        '<': '<',
        '>': '>',
        '&': '&',
        '"': '"',
        ''': "'",
        ''': "'",
    }
    for k, v in entities.items():
        text = text.replace(k, v)
    # 处理特殊引号
    text = text.replace('"', '"').replace('"', '"')
    text = text.replace(''', "'").replace(''', "'")
    return text


# ============================================================
# Markdown 解析
# ============================================================

def parse_markdown(md_text):
    """Parse Markdown text into a flat list of block-level element tuples.

    Each line is classified by the first rule that matches, in this order:

        ('header', level, text)   ``#`` .. ``######`` headings
        ('codeblock', code)       fenced block; ``code`` is the fenced lines
                                  joined with newlines (the info string after
                                  the opening fence is not kept)
        ('list_item', text)       ``-``, ``*`` or ``+`` bullets
        ('ordered_item', text)    ``1.`` style items (the number is dropped)
        ('quote', text)           lines starting with ``>``
        ('hr',)                   three or more ``-``, ``*`` or ``_``
        ('link', label)           a line that is nothing but one link/image
        ('paragraph', text)       consecutive remaining lines joined by spaces

    Args:
        md_text: Markdown source. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of tuples as described above, in document order.
    """
    if not md_text:
        return []

    md_text = _clean_invisible_chars(md_text)
    elements = []
    paragraph_parts = []
    # Lines of the fenced code block currently being read, or None when the
    # parser is outside a fence.
    code_lines = None

    def flush_paragraph():
        if paragraph_parts:
            text = ' '.join(paragraph_parts).strip()
            if text:
                elements.append(('paragraph', text))
            paragraph_parts.clear()

    for line in md_text.split('\n'):
        stripped = line.strip()

        # Inside a fence nothing is interpreted as Markdown: lines are
        # collected verbatim (indentation kept) until the closing fence.
        if code_lines is not None:
            if stripped.startswith('```'):
                elements.append(('codeblock', '\n'.join(code_lines)))
                code_lines = None
            else:
                code_lines.append(line.rstrip())
            continue

        # Blank line: ends the current paragraph.
        if not stripped:
            flush_paragraph()
            continue

        # Heading
        header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
        if header_match:
            flush_paragraph()
            level = len(header_match.group(1))
            elements.append(('header', level, header_match.group(2).strip()))
            continue

        # Opening code fence
        if stripped.startswith('```'):
            flush_paragraph()
            code_lines = []
            continue

        # Unordered list item
        list_match = re.match(r'^[\-\*+]\s+(.+)$', stripped)
        if list_match:
            flush_paragraph()
            elements.append(('list_item', list_match.group(1).strip()))
            continue

        # Ordered list item
        ordered_match = re.match(r'^\d+\.\s+(.+)$', stripped)
        if ordered_match:
            flush_paragraph()
            elements.append(('ordered_item', ordered_match.group(1).strip()))
            continue

        # Block quote
        if stripped.startswith('>'):
            flush_paragraph()
            elements.append(('quote', stripped[1:].strip()))
            continue

        # Horizontal rule
        if re.match(r'^[\-\*_]{3,}$', stripped):
            flush_paragraph()
            elements.append(('hr',))
            continue

        # Stand-alone link or image. fullmatch is required so that a line
        # such as "[docs](url) explains this" stays a paragraph instead of
        # being reduced to its label with the rest of the sentence lost.
        link_match = re.fullmatch(r'!?\[([^\]]+)\]\([^\)]+\)', stripped)
        if link_match:
            flush_paragraph()
            elements.append(('link', link_match.group(1)))
            continue

        # Anything else belongs to the current paragraph.
        paragraph_parts.append(stripped)

    # A fence that was never closed still yields its content.
    if code_lines is not None:
        elements.append(('codeblock', '\n'.join(code_lines)))
    flush_paragraph()
    return elements


# ============================================================
# 文本换行
# ============================================================

def _wrap_text(text, font, max_width, draw):
    """Wrap text to fit within max_width pixels."""
    lines = []
    for paragraph in text.split("\n"):
        if not paragraph.strip():
            lines.append("")
            continue
        current = ""
        for ch in paragraph:
            test = current + ch
            bbox = draw.textbbox((0, 0), test, font=font)
            if bbox[2] - bbox[0] > max_width and current:
                lines.append(current)
                current = ch
            else:
                current = test
        if current:
            lines.append(current)
    return lines


# ============================================================
# HTML 转换
# ============================================================

def markdown_to_html(md_text):
    """Convert Markdown text to a complete HTML document.

    The third-party ``markdown`` package is used when it can be imported
    (with the ``codehilite``, ``fenced_code`` and ``tables`` extensions).
    If an ``ImportError`` is raised, the built-in
    ``_simple_markdown_to_html`` converter is used instead.

    Args:
        md_text: Markdown source. ``None`` or an empty string yields ``""``.

    Returns:
        The HTML document as a string.
    """
    if not md_text:
        return ""

    md_text = _clean_invisible_chars(md_text)

    try:
        import markdown

        # Extensions are referenced by name; the package resolves them itself.
        converter = markdown.Markdown(extensions=['codehilite', 'fenced_code', 'tables'])
        html_content = converter.convert(md_text)
        return _html_template(html_content)
    except ImportError:
        # Degraded mode: dependency-free converter.
        return _simple_markdown_to_html(md_text)


def _simple_markdown_to_html(md_text):
    """Convert Markdown to an HTML document without third-party packages.

    Supported constructs: fenced code blocks, ``#`` headings, horizontal
    rules, single-line block quotes, unordered and ordered list items, and
    paragraphs (one ``<p>`` per non-blank line). Bold, italic and inline code
    are rendered inside headings, quotes, list items and paragraphs.

    Args:
        md_text: Markdown source text.

    Returns:
        The complete HTML document produced by ``_html_template``.
    """
    html_lines = []
    # Escaped lines of the fenced block being read, or None outside a fence.
    code_lines = None
    code_open_tag = ''
    # Tag name ('ul' / 'ol') of the list currently open, or None.
    open_list = None

    def render_inline(raw):
        # re.split with one capture group alternates plain text (even
        # indexes) with code-span contents (odd indexes), so emphasis
        # markers inside back-ticks are never interpreted.
        pieces = re.split(r'`(.+?)`', raw)
        for idx, piece in enumerate(pieces):
            if idx % 2:
                pieces[idx] = f'<code>{html.escape(piece, quote=False)}</code>'
                continue
            # NOTE(review): entities are decoded and the text is emitted
            # without re-escaping, so raw HTML in the Markdown source is
            # passed through - confirm that is acceptable for untrusted input.
            piece = _decode_html_entities(piece)
            piece = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', piece)
            piece = re.sub(r'\*(.+?)\*', r'<em>\1</em>', piece)
            # Underscore emphasis must not start or end inside a word,
            # otherwise identifiers such as my_var_name get italicised.
            piece = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'<strong>\1</strong>', piece)
            piece = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'<em>\1</em>', piece)
            pieces[idx] = piece
        return ''.join(pieces)

    def close_list():
        nonlocal open_list
        if open_list is not None:
            html_lines.append(f'</{open_list}>')
            open_list = None

    for line in md_text.split('\n'):
        stripped = line.strip()

        # Inside a fence: collect escaped lines until the closing fence. The
        # block is emitted as a single string so that no newline follows
        # <code>, which would otherwise render as a blank first line.
        if code_lines is not None:
            if stripped.startswith('```'):
                html_lines.append(code_open_tag + '\n'.join(code_lines) + '</code></pre>')
                code_lines = None
            else:
                code_lines.append(html.escape(line.rstrip('\r')))
            continue

        if not stripped:
            continue

        # Opening fence; the first word of the info string is the language.
        if stripped.startswith('```'):
            close_list()
            info = stripped[3:].split()
            lang_attr = f' class="language-{html.escape(info[0])}"' if info else ''
            code_open_tag = f'<pre><code{lang_attr}>'
            code_lines = []
            continue

        # Heading
        header_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
        if header_match:
            close_list()
            level = len(header_match.group(1))
            html_lines.append(f'<h{level}>{render_inline(header_match.group(2))}</h{level}>')
            continue

        # Horizontal rule
        if re.match(r'^[\-\*_]{3,}$', stripped):
            close_list()
            html_lines.append('<hr>')
            continue

        # Block quote
        if stripped.startswith('>'):
            close_list()
            html_lines.append(f'<blockquote>{render_inline(stripped[1:].strip())}</blockquote>')
            continue

        # List items: wrapped in <ul>/<ol>, opened on the first item and
        # closed by the next non-list element (blank lines do not close it).
        bullet_match = re.match(r'^[\-\*+]\s+(.+)$', stripped)
        ordered_match = None if bullet_match else re.match(r'^\d+\.\s+(.+)$', stripped)
        item_match = bullet_match or ordered_match
        if item_match:
            wanted = 'ul' if bullet_match else 'ol'
            if open_list != wanted:
                close_list()
                html_lines.append(f'<{wanted}>')
                open_list = wanted
            html_lines.append(f'<li>{render_inline(item_match.group(1))}</li>')
            continue

        # Paragraph
        close_list()
        html_lines.append(f'<p>{render_inline(stripped)}</p>')

    # Close whatever is still open at end of input.
    if code_lines is not None:
        html_lines.append(code_open_tag + '\n'.join(code_lines) + '</code></pre>')
    close_list()

    return _html_template('\n'.join(html_lines))


def _html_template(content):
    """生成完整的 HTML 文档"""
    return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Markdown Document</title>
    <style>
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            line-height: 1.6;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            color: #333;
        }}
        h1, h2, h3, h4, h5, h6 {{
            margin-top: 1.5em;
            margin-bottom: 0.5em;
            font-weight: 600;
        }}
        h1 {{ font-size: 2em; border-bottom: 2px solid #333; }}
        h2 {{ font-size: 1.5em; border-bottom: 1px solid #ddd; }}
        code {{
            background: #f4f4f4;
            padding: 2px 6px;
            border-radius: 3px;
            font-family: 'Consolas', 'Monaco', monospace;
        }}
        pre {{
            background: #f4f4f4;
            padding: 16px;
            border-radius: 6px;
            overflow-x: auto;
        }}
        pre code {{
            background: none;
            padding: 0;
        }}
        blockquote {{
            border-left: 4px solid #ddd;
            margin: 0;
            padding-left: 16px;
            color: #666;
        }}
        a {{
            color: #0066cc;
        }}
        hr {{
            border: none;
            border-top: 1px solid #ddd;
            margin: 24px 0;
        }}
    </style>
</head>
<body>
{content}
</body>
</html>"""


# ============================================================
# PNG 渲染 (使用 matplotlib)
# ============================================================

def _get_matplotlib_font():
    """Return matplotlib ``FontProperties`` for the first font file found.

    A list of known CJK font file locations is probed in order: the Linux
    paths first (on every platform), followed by the Windows or macOS paths
    when running on that platform. The first file that exists is registered
    with matplotlib's font manager and returned as ``FontProperties``.

    Returns:
        ``FontProperties`` for the first existing candidate, or ``None`` when
        none of the candidates exists (matplotlib then uses its default font).
    """
    import matplotlib.font_manager as fm

    # Linux locations; these are always checked before the platform extras.
    candidates = [
        '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
        '/usr/share/fonts/opentype/noto/NotoSansSC-Regular.otf',
        '/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc',
        '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
        '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf',
        '/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc',
        '/usr/share/fonts/truetype/arphic/uming.ttc',
        '/usr/share/fonts/truetype/arphic/ukai.ttc',
    ]

    if sys.platform == "win32":
        windows_root = os.environ.get("WINDIR", r"C:\Windows")
        candidates += [
            os.path.join(windows_root, "Fonts", file_name)
            for file_name in (
                "msyh.ttc",
                "msyhbd.ttc",
                "simhei.ttf",
                "simsun.ttc",
                "STHeiti Light.ttc",
            )
        ]
    elif sys.platform == "darwin":
        candidates += [
            '/System/Library/Fonts/PingFang.ttc',
            '/System/Library/Fonts/STHeiti Light.ttc',
            '/Library/Fonts/Arial Unicode.ttf',
            '/System/Library/Fonts/Supplemental/Arial Unicode.ttf',
        ]

    chosen = next((path for path in candidates if os.path.exists(path)), None)
    if chosen is None:
        return None

    # Register the file with the font manager, then refer to it by path.
    fm.fontManager.addfont(chosen)
    return fm.FontProperties(fname=chosen)


def markdown_to_png(md_text, img_path):
    """Render Markdown as a PNG image using matplotlib.

    The Markdown is parsed with ``parse_markdown`` and each element is drawn
    top-down on a single figure below a fixed title bar. The figure height is
    estimated from the element text lengths before drawing.

    Args:
        md_text: Markdown source text.
        img_path: Destination file path; missing parent directories are
            created.

    Raises:
        ImportError: If matplotlib is not installed.
    """
    try:
        import matplotlib
    except ImportError as err:
        raise ImportError("matplotlib not installed. Run: pip install matplotlib") from err

    # Select the non-interactive backend before pyplot is imported so that no
    # GUI toolkit is needed.
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches

    font = _get_matplotlib_font()
    elements = parse_markdown(md_text)

    W, PAD = 10, 0.5  # figure width and left/right margin, in inches

    def estimated_lines(text, chars_per_line):
        return max(1, len(text) // chars_per_line + 1)

    # ---- estimate the required figure height -------------------------------
    y = 2.5  # space reserved for the title bar
    for elem in elements:
        kind = elem[0]
        if kind == 'header':
            level, text = elem[1], elem[2]
            lines = estimated_lines(text, 50 if level <= 2 else 60)
            y += lines * (0.5 if level <= 2 else 0.4) + 0.2
        elif kind == 'paragraph':
            y += estimated_lines(elem[1], 60) * 0.35 + 0.3
        elif kind == 'codeblock':
            y += (elem[1].count('\n') + 2) * 0.3 + 0.2
        elif kind in ('list_item', 'ordered_item', 'quote', 'link'):
            y += estimated_lines(elem[1], 55) * 0.32 + 0.15
        elif kind == 'hr':
            y += 0.3

    FIG_H = max(8, y)

    fig, ax = plt.subplots(figsize=(W, FIG_H))
    try:
        fig.patch.set_facecolor('#ffffff')
        ax.set_facecolor('#ffffff')

        # Title bar background
        header = patches.Rectangle((0, FIG_H - 1.2), W, 1.0, linewidth=0, facecolor='#2d3748')
        ax.add_patch(header)

        # Title and subtitle
        ax.text(0.5, FIG_H - 0.6, 'Markdown Document',
                fontsize=18, fontweight='bold', color='white',
                fontproperties=font, ha='left', va='center')
        ax.text(0.5, FIG_H - 1.0, 'Converted from Markdown',
                fontsize=10, color='#888888', fontproperties=font, ha='left', va='center')

        ax.set_xlim(0, W)
        ax.set_ylim(0, FIG_H)
        ax.axis('off')

        cy = FIG_H - 1.5  # y position of the next element
        ordered_index = 0  # running number within a run of ordered items

        for elem in elements:
            kind = elem[0]
            # The parser drops the original numbers, so consecutive ordered
            # items are renumbered from 1; any other element restarts the count.
            ordered_index = ordered_index + 1 if kind == 'ordered_item' else 0

            if kind == 'header':
                level, text = elem[1], elem[2]
                size = 16 if level <= 2 else 14
                weight = 'bold' if level == 1 else 'normal'
                color = '#1a1a2e' if level == 1 else '#2d3748'
                ax.text(PAD, cy, text, fontsize=size, fontweight=weight,
                        color=color, fontproperties=font, ha='left', va='top')
                cy -= size * 0.04 + 0.15

            elif kind == 'paragraph':
                ax.text(PAD, cy, elem[1], fontsize=11, color='#374151',
                        fontproperties=font, ha='left', va='top', wrap=True)
                cy -= estimated_lines(elem[1], 60) * 0.35 + 0.25

            elif kind == 'codeblock':
                code_h = max(0.5, (elem[1].count('\n') + 2) * 0.3)
                code_box = patches.Rectangle((PAD, cy - code_h), W - PAD * 2, code_h,
                                             linewidth=1, edgecolor='#e0e0e0', facecolor='#f4f4f4')
                ax.add_patch(code_box)
                # Only the first 500 characters of the code are drawn.
                ax.text(PAD + 0.1, cy - 0.15, elem[1][:500], fontsize=9,
                        color='#333333', fontfamily='monospace', va='top')
                cy -= code_h + 0.2

            elif kind == 'list_item':
                ax.text(PAD, cy, f'\u2022 {elem[1]}', fontsize=11, color='#374151',
                        fontproperties=font, ha='left', va='top')
                cy -= 0.35

            elif kind == 'ordered_item':
                ax.text(PAD, cy, f'{ordered_index}. {elem[1]}', fontsize=11, color='#374151',
                        fontproperties=font, ha='left', va='top')
                cy -= 0.35

            elif kind == 'link':
                # Stand-alone link/image line: only its label is available.
                ax.text(PAD, cy, elem[1], fontsize=11, color='#0066cc',
                        fontproperties=font, ha='left', va='top')
                cy -= 0.35

            elif kind == 'quote':
                ax.plot([PAD, PAD, PAD + 0.05, PAD + 0.05],
                        [cy, cy - 0.4, cy - 0.4, cy - 0.6],
                        color='#0066cc', linewidth=2)
                ax.text(PAD + 0.15, cy - 0.1, elem[1], fontsize=11, color='#666666',
                        fontproperties=font, ha='left', va='top')
                cy -= 0.5

            elif kind == 'hr':
                ax.axhline(y=cy, color='#e0e0e0', linewidth=1, xmin=0.05, xmax=0.95)
                cy -= 0.3

        fig.tight_layout(pad=0)

        Path(img_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(img_path, format='png', dpi=150, bbox_inches='tight',
                    facecolor='#ffffff', edgecolor='none')
    finally:
        # Always release this figure, even when drawing or saving fails.
        plt.close(fig)


# ============================================================
# 纯文本转换
# ============================================================

def markdown_to_text(md_text):
    """Convert Markdown to plain text.

    The text is parsed with ``parse_markdown`` (which also strips invisible
    characters) and each element is written as follows: headings keep their
    ``#`` prefix, code blocks keep their fences, bullets become ``•``, ordered
    items are numbered, quotes keep ``>``, stand-alone links are reduced to
    their label and horizontal rules become a line of 50 ``─`` characters.

    Args:
        md_text: Markdown source. ``None`` or an empty string yields ``""``.

    Returns:
        The plain-text rendering, lines joined with ``\\n``.
    """
    if not md_text:
        return ""

    lines = []
    ordered_index = 0  # running number within a run of ordered items

    for elem in parse_markdown(md_text):
        kind = elem[0]
        # The parser drops the original numbers, so consecutive ordered items
        # are renumbered from 1; any other element restarts the count.
        ordered_index = ordered_index + 1 if kind == 'ordered_item' else 0

        if kind == 'header':
            prefix = "#" * elem[1] + " "
            lines.append(f"{prefix}{elem[2]}")
            lines.append("")

        elif kind == 'paragraph':
            lines.append(elem[1])
            lines.append("")

        elif kind == 'codeblock':
            lines.append("```")
            lines.append(elem[1])
            lines.append("```")
            lines.append("")

        elif kind == 'list_item':
            lines.append(f"• {elem[1]}")

        elif kind == 'ordered_item':
            lines.append(f"{ordered_index}. {elem[1]}")

        elif kind == 'quote':
            lines.append(f"> {elem[1]}")

        elif kind == 'link':
            # Stand-alone link/image line: keep its label instead of
            # dropping the line from the output.
            lines.append(elem[1])
            lines.append("")

        elif kind == 'hr':
            lines.append("─" * 50)
            lines.append("")

    return '\n'.join(lines)


# ============================================================
# 主函数
# ============================================================

def main():
    """Command-line entry point: convert one Markdown file.

    Reads ``sys.argv[1]`` as the UTF-8 input file and ``sys.argv[2]`` as the
    output path; the output suffix (``.html``, ``.png`` or ``.txt``) selects
    the converter. Exits with status 1 on a usage error, a missing input file
    or an unsupported suffix.
    """
    argv = sys.argv
    if len(argv) < 3:
        print("Usage: python md_convert.py <input.md> <output.{html|png|txt}>", file=sys.stderr)
        sys.exit(1)

    input_path, output_path = argv[1], argv[2]

    if not os.path.exists(input_path):
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    destination = Path(output_path)
    md_text = Path(input_path).read_text(encoding="utf-8")
    ext = destination.suffix.lower()

    # The output directory is created before the suffix is validated.
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Converters that return a string which is then written as UTF-8 text.
    text_converters = {".html": markdown_to_html, ".txt": markdown_to_text}

    if ext in text_converters:
        destination.write_text(text_converters[ext](md_text), encoding="utf-8")
    elif ext == ".png":
        # The PNG renderer writes the file itself.
        markdown_to_png(md_text, output_path)
    else:
        print(f"Error: Unsupported output format: {ext}", file=sys.stderr)
        print("Supported formats: .html, .png, .txt", file=sys.stderr)
        sys.exit(1)

    print(f"Converted: {input_path} -> {output_path}")


if __name__ == "__main__":
    main()