-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy path_publish_posts.py
More file actions
275 lines (236 loc) · 8.68 KB
/
Copy path_publish_posts.py
File metadata and controls
275 lines (236 loc) · 8.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#!/usr/bin/env python3
"""Scan posts/*.md and publish them as blog posts.
Each .md file should have YAML frontmatter:
---
title: 标题
date: 2026-04-27
category: thought
tags: 创业, AI
---
正文 markdown 内容...
The script:
1. Generates a Hexo-format HTML stub at <category>/<slug>.html
2. Registers / updates the entry in content.json (dedup by path)
Then run `_rewrite_posts.py` to convert to new layout, then `_build_pages.py`
to rebuild listings. Or just run `build.py` to chain all three.
"""
import json
import re
from html import escape
from pathlib import Path
import markdown as md_lib
# Directory containing this script; content.json and the generated HTML files
# are resolved relative to it.
ROOT = Path(__file__).parent
# Source directory that main() scans for *.md posts.
POSTS_DIR = ROOT / "posts"
# Categories a post may declare in its frontmatter; main() skips any post
# whose (lower-cased) category is not in this set.
VALID_CATEGORIES = {"thought", "reading", "chatbot", "project", "saas",
"interview", "microsoft", "presentation"}
# HTML page skeleton filled in by main() through str.format().
# Placeholders: {title}, {date_short} (used twice), {word_count},
# {body_html} and {tags_html}.  main() passes title and date_short through
# html.escape(); body_html is inserted as rendered by the markdown converter.
HEXO_TEMPLATE = """<!DOCTYPE HTML>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>{title} - 李佳芮de博客</title>
<link rel="stylesheet" href="/css/site.css">
</head>
<body>
<article class="post" itemscope itemtype="http://schema.org/BlogPosting">
<div class="post-header">
<div class="post-author clearfix">
<p>
<span class="label">作者</span>
<a href="/" target="_blank">李佳芮</a>
<span title="最后编辑于 {date_short}">{date_short}</span>
</p>
</div>
<h2 class="post-title">{title}</h2>
<div class="post-meta">本文共计{word_count}个字</div>
</div>
<div class="post-content markdown-body">
{body_html}
</div>
<div class="post-tags">标签:
{tags_html}
</div>
</article>
</body>
</html>
"""
def parse_frontmatter(text):
    """Split a post into its frontmatter mapping and its markdown body.

    The frontmatter is a leading block opened by ``---`` and closed by a
    later ``---``.  A closing delimiter that sits on a line of its own is
    preferred, so a value that merely contains ``---`` (for example
    ``title: a---b``) does not cut the block short.  When no such line
    exists, the first ``---`` found anywhere is used as the delimiter.

    Only flat ``key: value`` lines are understood; this is not a YAML
    parser.  Keys are lower-cased, values are stripped of surrounding
    whitespace and quote characters, and blank lines or lines starting
    with ``#`` are ignored.

    Args:
        text: Full contents of a post file.

    Returns:
        A ``(meta, body)`` tuple.  When there is no complete frontmatter
        block, ``meta`` is empty and ``body`` is the whole stripped text.
    """
    # A UTF-8 BOM is not whitespace, so it has to be removed explicitly;
    # otherwise a BOM-prefixed file never looks like it has frontmatter.
    stripped = text.lstrip("\ufeff").lstrip()
    if not stripped.startswith("---"):
        return {}, stripped.strip()

    rest = stripped[3:]
    # Prefer a delimiter alone on its line (trailing blanks / CR tolerated).
    closing = re.search(r"^---[ \t]*\r?$", rest, flags=re.M)
    if closing:
        fm, body = rest[:closing.start()], rest[closing.end():]
    elif "---" in rest:
        # Legacy fallback: first "---" anywhere closes the block.
        fm, body = rest.split("---", 1)
    else:
        return {}, stripped.strip()

    meta = {}
    for line in fm.split("\n"):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if ":" in line:
            # Split on the first colon only so values may contain colons.
            k, v = line.split(":", 1)
            meta[k.strip().lower()] = v.strip().strip('"').strip("'")
    return meta, body.strip()
def md_to_plain_text(md_body):
    """Reduce markdown source to a rough plain-text string.

    The substitutions run strictly in the order listed: fenced code blocks
    and inline code spans are dropped, images are dropped, links are
    replaced by their label, the characters ``# > * _ -`` are removed, and
    every run of whitespace collapses to a single space.

    Args:
        md_body: Markdown text of a post body.

    Returns:
        The stripped plain-text approximation.
    """
    # (pattern, replacement, flags) -- order matters, e.g. images must be
    # removed before the generic link rule sees their "[...](...)" part.
    passes = (
        (r"```.*?```", "", re.S),
        (r"`[^`]+`", "", 0),
        (r"!\[[^\]]*\]\([^)]+\)", "", 0),
        (r"\[([^\]]+)\]\([^)]+\)", r"\1", 0),
        (r"[#>*_\-]", "", 0),
        (r"\s+", " ", 0),
    )
    plain = md_body
    for pattern, replacement, flags in passes:
        plain = re.sub(pattern, replacement, plain, flags=flags)
    return plain.strip()
def slugify_title(s):
    """Turn a title or file stem into a lower-case, dash-separated slug.

    Every run of characters outside the allowed set becomes one dash,
    repeated dashes are collapsed, and leading/trailing dashes are trimmed.

    Args:
        s: Text to convert.

    Returns:
        The slug, or ``"post"`` when nothing usable remains.
    """
    dashed = re.sub(r"[^\w一-鿿-]+", "-", s)
    collapsed = re.sub(r"-+", "-", dashed)
    slug = collapsed.strip("-").lower()
    if slug:
        return slug
    return "post"
def main():
    """Publish every eligible ``posts/*.md`` file and sync ``content.json``.

    For each markdown file (``TEMPLATE.md`` and names starting with ``_``
    are skipped) this parses the frontmatter, validates title, category and
    date, derives the output path, renders the body into ``HEXO_TEMPLATE``
    and writes it under ``ROOT``, then inserts or replaces the entry with
    the same ``path`` in ``content.json``.  Posts flagged as ``draft`` are
    removed from ``content.json`` and their rendered HTML is deleted.

    ``content.json`` is read and written explicitly as UTF-8.  It is dumped
    with ``ensure_ascii=False``, so relying on the platform default
    encoding could fail or corrupt non-ASCII text.

    Returns:
        None.  Progress and a final summary are printed to stdout.
    """
    if not POSTS_DIR.exists():
        print(f"posts/ directory not found at {POSTS_DIR}")
        return

    cj_path = ROOT / "content.json"
    data = json.loads(cj_path.read_text(encoding="utf-8"))

    # Dedup posts by path, keeping the first occurrence. Historical content.json
    # may already contain duplicates that the {path: i} map below cannot fix on
    # its own (the dict only prevents *new* duplicates, not existing ones).
    seen_paths = set()
    deduped = []
    removed = 0
    for p in data["posts"]:
        if p["path"] in seen_paths:
            removed += 1
            continue
        seen_paths.add(p["path"])
        deduped.append(p)
    if removed:
        print(f" · removed {removed} duplicate post entr{'y' if removed == 1 else 'ies'} from content.json")
    data["posts"] = deduped

    # path -> index into data["posts"], used to update entries in place.
    existing_paths = {p["path"]: i for i, p in enumerate(data["posts"])}
    converter = md_lib.Markdown(extensions=["extra", "sane_lists"])
    published = 0
    skipped = 0
    md_files = sorted(POSTS_DIR.glob("*.md"))

    for fp in md_files:
        if fp.name == "TEMPLATE.md" or fp.name.startswith("_"):
            skipped += 1
            continue

        text = fp.read_text(encoding="utf-8")
        meta, body_md = parse_frontmatter(text)

        # Required fields
        title = meta.get("title")
        date = meta.get("date")
        category = meta.get("category", "thought").lower()
        if not title:
            print(f" ✗ {fp.name}: missing title in frontmatter, skipped")
            skipped += 1
            continue
        is_draft = str(meta.get("draft", "")).lower() in ("true", "yes", "1")
        if category not in VALID_CATEGORIES:
            print(f" ✗ {fp.name}: invalid category '{category}' "
                  f"(must be one of {sorted(VALID_CATEGORIES)}), skipped")
            skipped += 1
            continue

        # Date: from frontmatter or filename prefix YYYY-MM-DD
        if not date:
            m = re.match(r"^(\d{4}-\d{2}-\d{2})", fp.stem)
            if m:
                date = m.group(1)
            else:
                print(f" ✗ {fp.name}: missing date in frontmatter and filename, skipped")
                skipped += 1
                continue
        date_short = date[:10]
        date_iso = f"{date_short}T10:00:00.000Z"

        # Slug: explicit frontmatter slug = canonical filename stem.
        # If no slug, derive from filename (with date prefix logic for new posts).
        explicit_slug = meta.get("slug")
        if explicit_slug:
            # Treat as canonical full stem — no date prefix munging
            slug = explicit_slug
            path = f"{category}/{slug}.html"
        else:
            # Filename-based: if filename already has date prefix
            # (any digit count for m/d), use as-is
            stem = fp.stem
            if re.match(r"^\d{4}-\d{1,2}-\d{1,2}([-_].+)?$", stem):
                slug = stem
                path = f"{category}/{stem}.html"
            else:
                slug = slugify_title(stem)
                path = f"{category}/{date_short}-{slug}.html"

        if is_draft:
            print(f" · {fp.name}: draft, skipped")
            skipped += 1
            # Remove from content.json if previously published, and clean up
            # any stale rendered html so the listing pages don't link to a 404.
            if path in existing_paths:
                idx = existing_paths.pop(path)
                del data["posts"][idx]
                # Indices after the deleted entry have shifted; rebuild the map.
                existing_paths = {p["path"]: i for i, p in enumerate(data["posts"])}
            stale = ROOT / path
            if stale.exists():
                stale.unlink()
            continue

        # Tags: split on the ASCII comma or the full-width comma (U+FF0C).
        tags = []
        if meta.get("tags"):
            tags = [t.strip() for t in re.split(r"[,\uFF0C]", meta["tags"]) if t.strip()]

        body_html = converter.convert(body_md)
        # Clear converter state so one post cannot leak into the next.
        converter.reset()
        plain = md_to_plain_text(body_md)

        # Description (SEO): from frontmatter, or auto-generate from body
        description = meta.get("description") or ""
        if not description:
            description = plain[:150].replace("\n", " ").strip()
            if len(plain) > 150:
                description = description.rstrip() + "…"

        tags_html = "\n".join(
            f' <a href="/tags/{escape(t)}/">{escape(t)}</a>' for t in tags
        )
        html = HEXO_TEMPLATE.format(
            title=escape(title),
            date_short=escape(date_short),
            word_count=len(plain),
            body_html=body_html,
            tags_html=tags_html,
        )
        out_path = ROOT / path
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(html, encoding="utf-8")

        # content.json entry
        post_entry = {
            "title": title,
            "slug": slug,
            "date": date_iso,
            "updated": date_iso,
            "comments": True,
            "path": path,
            "link": "",
            "permalink": f"https://rui.juzi.bot/{path}",
            "excerpt": "",
            "description": description,
            "text": plain,
            "categories": [{
                "name": category,
                "slug": category,
                "permalink": f"https://rui.juzi.bot/categories/{category}/",
            }],
            "tags": [{
                "name": t,
                "slug": t,
                "permalink": f"https://rui.juzi.bot/tags/{t}/",
            } for t in tags],
            "keywords": [{
                "name": category,
                "slug": category,
                "permalink": f"https://rui.juzi.bot/categories/{category}/",
            }],
        }
        if path in existing_paths:
            data["posts"][existing_paths[path]] = post_entry
        else:
            data["posts"].append(post_entry)
            existing_paths[path] = len(data["posts"]) - 1
        published += 1

    cj_path.write_text(
        json.dumps(data, ensure_ascii=False, separators=(",", ":")),
        encoding="utf-8",
    )
    print(f"\nprocessed {published} posts ({skipped} skipped)")
    if published:
        print("now run: python3 _rewrite_posts.py && python3 _build_pages.py")
        print("(or just: python3 build.py)")
# Run the publisher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()