lijiarui.github.io/_publish_posts.py at master · lijiarui/lijiarui.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#!/usr/bin/env python3
"""Scan posts/*.md and publish them as blog posts.

Each .md file should have YAML-style frontmatter (flat `key: value` lines only;
`slug`, `description` and `draft` are optional extra keys):
  ---
  title: 标题
  date: 2026-04-27
  category: thought
  tags: 创业, AI
  ---

  正文 markdown 内容...

The script:
1. Generates a Hexo-format HTML stub at <category>/<slug>.html
2. Registers / updates the entry in content.json (dedup by path)

Then run `_rewrite_posts.py` to convert to new layout, then `_build_pages.py`
to rebuild listings. Or just run `build.py` to chain all three.
"""
import json
import re
from html import escape
from pathlib import Path

import markdown as md_lib

# Directory containing this script; every input and output path is built from it.
ROOT = Path(__file__).parent
# Source markdown posts are globbed from <ROOT>/posts by main().
POSTS_DIR = ROOT / "posts"

# Categories a post may declare in its frontmatter; main() skips files with any
# other value. The category is also the sub-directory the HTML is written to.
VALID_CATEGORIES = {"thought", "reading", "chatbot", "project", "saas",
                    "interview", "microsoft", "presentation"}

# Hexo-format HTML stub for one post, filled in with str.format() by main().
# Placeholders: {title}, {date_short}, {word_count}, {body_html}, {tags_html}.
# main() HTML-escapes title, date and tag names; {body_html} is inserted as
# already-rendered markup.
HEXO_TEMPLATE = """<!DOCTYPE HTML>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>{title} - 李佳芮de博客</title>
<link rel="stylesheet" href="/css/site.css">
</head>
<body>
<article class="post" itemscope itemtype="http://schema.org/BlogPosting">
  <div class="post-header">
    <div class="post-author clearfix">
      <p>
        <span class="label">作者</span>
        <a href="/" target="_blank">李佳芮</a>
        <span title="最后编辑于&nbsp;{date_short}">{date_short}</span>
      </p>
    </div>
    <h2 class="post-title">{title}</h2>
    <div class="post-meta">本文共计{word_count}个字</div>
  </div>
  <div class="post-content markdown-body">
{body_html}
  </div>
  <div class="post-tags">标签：
{tags_html}
  </div>
</article>
</body>
</html>
"""


def parse_frontmatter(text):
    """Split a post into frontmatter metadata and markdown body.

    The frontmatter is a block of flat ``key: value`` lines opened by ``---``
    at the start of the text (leading whitespace and a UTF-8 BOM are ignored)
    and closed by the next line consisting solely of ``---``. If no such line
    exists, the first ``---`` found anywhere is used as the closing delimiter.

    Args:
        text: Full contents of a post file.

    Returns:
        A ``(meta, body)`` tuple. ``meta`` maps lower-cased keys to values
        with surrounding quotes removed; blank lines, ``#`` comment lines and
        lines without a colon are ignored. ``body`` is the remaining text,
        stripped. When the opening or closing delimiter is missing, ``meta``
        is empty and ``body`` is the whole stripped text.
    """
    # A BOM is not whitespace for str.lstrip(), so it would otherwise hide the
    # opening delimiter of files saved by BOM-writing editors.
    text = text.lstrip("\ufeff")
    head = text.lstrip()
    if not head.startswith("---"):
        return {}, text.strip()
    rest = head[3:]

    # Prefer a closing delimiter on its own line, so a "---" inside a value
    # (e.g. `title: A --- B`) does not end the frontmatter early.
    closing = re.search(r"^---[ \t]*\r?$", rest, flags=re.M)
    if closing:
        fm = rest[:closing.start()]
        body = rest[closing.end():]
    elif "---" in rest:
        # Fallback keeps accepting delimiters that are not on their own line.
        fm, body = rest.split("---", 1)
    else:
        return {}, text.strip()

    meta = {}
    for line in fm.split("\n"):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if ":" in line:
            # Split on the first colon only, so values may contain colons.
            k, v = line.split(":", 1)
            meta[k.strip().lower()] = v.strip().strip('"').strip("'")
    return meta, body.strip()


def md_to_plain_text(md_body):
    """Reduce markdown source to a rough, single-line plain-text string.

    The substitutions run in order: fenced code blocks, inline code and
    images are dropped, links are replaced by their link text, the characters
    ``# > * _ -`` are removed, and every whitespace run becomes one space.
    The result is stripped of leading and trailing whitespace.
    """
    substitutions = (
        (r"```.*?```", "", re.S),                # fenced code blocks
        (r"`[^`]+`", "", 0),                     # inline code spans
        (r"!\[[^\]]*\]\([^)]+\)", "", 0),        # images
        (r"\[([^\]]+)\]\([^)]+\)", r"\1", 0),    # links -> link text
        (r"[#>*_\-]", "", 0),                    # leftover markup characters
        (r"\s+", " ", 0),                        # collapse whitespace
    )
    plain = md_body
    for pattern, replacement, flags in substitutions:
        plain = re.sub(pattern, replacement, plain, flags=flags)
    return plain.strip()


def slugify_title(s):
    """Turn a title or file stem into a lower-case, hyphen-separated slug.

    Runs of characters other than word characters, the CJK range in the
    pattern below, and ``-`` become a single hyphen; repeated hyphens are
    collapsed and hyphens at either end are trimmed. Returns ``"post"`` when
    nothing is left.
    """
    dashed = re.sub(r"[^\w一-鿿-]+", "-", s)
    collapsed = re.sub(r"-+", "-", dashed)
    slug = collapsed.strip("-").lower()
    if slug:
        return slug
    return "post"


def main():
    """Publish every eligible markdown file in ``posts/``.

    For each ``posts/*.md`` file (except ``TEMPLATE.md`` and names starting
    with ``_``) this parses the frontmatter, writes a Hexo-format HTML stub to
    ``<category>/<slug>.html`` under ROOT, and appends or replaces the entry
    with the same ``path`` in ``content.json``. Files whose frontmatter marks
    them as ``draft`` are unpublished instead: their content.json entry and
    any previously rendered HTML are removed. Duplicate ``path`` entries
    already present in content.json are dropped first.

    Returns None. Progress and skip reasons are printed to stdout. If the
    ``posts/`` directory does not exist, nothing is read or written.
    """
    if not POSTS_DIR.exists():
        print(f"posts/ directory not found at {POSTS_DIR}")
        return

    cj_path = ROOT / "content.json"
    # Explicit UTF-8: the file holds non-ASCII text (ensure_ascii=False below),
    # so the locale default encoding must not be relied on.
    data = json.loads(cj_path.read_text(encoding="utf-8"))

    # Dedup posts by path, keeping the first occurrence. Historical content.json
    # may already contain duplicates that the {path: i} map below cannot fix on
    # its own (the dict only prevents *new* duplicates, not existing ones).
    seen_paths = set()
    deduped = []
    removed = 0
    for p in data["posts"]:
        if p["path"] in seen_paths:
            removed += 1
            continue
        seen_paths.add(p["path"])
        deduped.append(p)
    if removed:
        print(f"  · removed {removed} duplicate post entr{'y' if removed == 1 else 'ies'} from content.json")
        data["posts"] = deduped

    # path -> index into data["posts"], used to replace entries in place.
    existing_paths = {p["path"]: i for i, p in enumerate(data["posts"])}

    converter = md_lib.Markdown(extensions=["extra", "sane_lists"])

    published = 0
    skipped = 0
    md_files = sorted(POSTS_DIR.glob("*.md"))

    for fp in md_files:
        if fp.name == "TEMPLATE.md" or fp.name.startswith("_"):
            skipped += 1
            continue
        text = fp.read_text(encoding="utf-8")
        meta, body_md = parse_frontmatter(text)

        # Required fields
        title = meta.get("title")
        date = meta.get("date")
        category = meta.get("category", "thought").lower()

        if not title:
            print(f"  ✗ {fp.name}: missing title in frontmatter, skipped")
            skipped += 1
            continue
        is_draft = str(meta.get("draft", "")).lower() in ("true", "yes", "1")
        if category not in VALID_CATEGORIES:
            print(f"  ✗ {fp.name}: invalid category '{category}' "
                  f"(must be one of {sorted(VALID_CATEGORIES)}), skipped")
            skipped += 1
            continue

        # Date: from frontmatter or filename prefix YYYY-MM-DD
        if not date:
            m = re.match(r"^(\d{4}-\d{2}-\d{2})", fp.stem)
            if m:
                date = m.group(1)
            else:
                print(f"  ✗ {fp.name}: missing date in frontmatter and filename, skipped")
                skipped += 1
                continue

        # Only the first 10 characters are kept; the time of day is fixed.
        date_short = date[:10]
        date_iso = f"{date_short}T10:00:00.000Z"

        # Slug: explicit frontmatter slug = canonical filename stem.
        # If no slug, derive from filename (with date prefix logic for new posts).
        explicit_slug = meta.get("slug")
        if explicit_slug:
            # Treat as canonical full stem — no date prefix munging
            slug = explicit_slug
            path = f"{category}/{slug}.html"
        else:
            # Filename-based: if filename already has date prefix (any digit count for m/d), use as-is
            stem = fp.stem
            if re.match(r"^\d{4}-\d{1,2}-\d{1,2}([-_].+)?$", stem):
                slug = stem
                path = f"{category}/{stem}.html"
            else:
                slug = slugify_title(stem)
                path = f"{category}/{date_short}-{slug}.html"

        if is_draft:
            print(f"  · {fp.name}: draft, skipped")
            skipped += 1
            # Remove from content.json if previously published, and clean up
            # any stale rendered html so the listing pages don't link to a 404.
            if path in existing_paths:
                idx = existing_paths.pop(path)
                del data["posts"][idx]
                # Deleting shifts later indices, so rebuild the lookup.
                existing_paths = {p["path"]: i for i, p in enumerate(data["posts"])}
            stale = ROOT / path
            if stale.exists():
                stale.unlink()
            continue

        # Tags: split on ASCII or full-width commas, dropping empty items.
        tags = []
        if meta.get("tags"):
            tags = [t.strip() for t in re.split(r"[,，]", meta["tags"]) if t.strip()]

        body_html = converter.convert(body_md)
        # Clear converter state before the next document is converted.
        converter.reset()
        plain = md_to_plain_text(body_md)

        # Description (SEO): from frontmatter, or auto-generate from body
        description = meta.get("description") or ""
        if not description:
            description = plain[:150].replace("\n", " ").strip()
            if len(plain) > 150:
                description = description.rstrip() + "…"

        tags_html = "\n".join(
            f'    <a href="/tags/{escape(t)}/">{escape(t)}</a>' for t in tags
        )

        html = HEXO_TEMPLATE.format(
            title=escape(title),
            date_short=escape(date_short),
            word_count=len(plain),
            body_html=body_html,
            tags_html=tags_html,
        )

        out_path = ROOT / path
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(html, encoding="utf-8")

        # content.json entry
        post_entry = {
            "title": title,
            "slug": slug,
            "date": date_iso,
            "updated": date_iso,
            "comments": True,
            "path": path,
            "link": "",
            "permalink": f"https://rui.juzi.bot/{path}",
            "excerpt": "",
            "description": description,
            "text": plain,
            "categories": [{
                "name": category,
                "slug": category,
                "permalink": f"https://rui.juzi.bot/categories/{category}/",
            }],
            "tags": [{
                "name": t,
                "slug": t,
                "permalink": f"https://rui.juzi.bot/tags/{t}/",
            } for t in tags],
            "keywords": [{
                "name": category,
                "slug": category,
                "permalink": f"https://rui.juzi.bot/categories/{category}/",
            }],
        }

        # Replace an existing entry in place, otherwise append a new one.
        if path in existing_paths:
            data["posts"][existing_paths[path]] = post_entry
        else:
            data["posts"].append(post_entry)
            existing_paths[path] = len(data["posts"]) - 1
        published += 1

    cj_path.write_text(
        json.dumps(data, ensure_ascii=False, separators=(",", ":")),
        encoding="utf-8",
    )
    print(f"\nprocessed {published} posts ({skipped} skipped)")
    if published:
        print("now run: python3 _rewrite_posts.py && python3 _build_pages.py")
        print("(or just: python3 build.py)")


# Run the publisher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()