-
Notifications
You must be signed in to change notification settings - Fork 45
Expand file tree
/
Copy pathmanage_tags.py
More file actions
333 lines (276 loc) · 12.4 KB
/
manage_tags.py
File metadata and controls
333 lines (276 loc) · 12.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# ==============================================================================
# Universal Tag Manager for cs4ncu
# Version: 3.0 (Intelligent Edition)
# Author: GitHub Copilot (guided by ywh555hhh)
# Date: 2025-08-27
#
# Features:
# - Strict format and prefix validation.
# - Canonical case validation for all tags.
# - Fuzzy matching to detect and suggest corrections for typos.
# - Interactive 'sync' mode for management.
# - Non-interactive 'check' mode for CI/CD.
# - Automatic regeneration of the tags index file.
#
# Dependencies: PyYAML, python-frontmatter, rapidfuzz
# ==============================================================================
import sys
import yaml
import frontmatter
from pathlib import Path
from collections import defaultdict
from rapidfuzz import fuzz, process
# --- Configuration ---
# Repository root: two directory levels above this script.
ROOT_DIR = Path(__file__).parent.parent
DOCS_DIR = ROOT_DIR / "docs"
TAG_DICTIONARY_FILE = ROOT_DIR / "tag_dictionary.yml"
TAGS_INDEX_FILE = DOCS_DIR / "tags.md"
FUZZY_MATCH_THRESHOLD = 80 # Similarity threshold (0-100); 80 is a reasonable starting point
# ---
def load_tag_dictionary() -> tuple[dict, set, set, dict]:
    """Load the tag dictionary from ``TAG_DICTIONARY_FILE``.

    The file is read as YAML and is expected to be a mapping of
    prefix -> list of tags. An empty file is treated as an empty dictionary,
    and a category with no tags contributes a prefix but no tags.

    Returns:
        A 4-tuple of:
        - the raw dictionary as parsed from the file,
        - the set of all valid tags (case-sensitive),
        - the set of all valid prefixes (the dictionary's keys),
        - a mapping from each tag's lowercase form to its canonical form.

    Raises:
        SystemExit: with status 1, after printing an error, when the file is
            missing or its top level is not a mapping.
    """
    if not TAG_DICTIONARY_FILE.exists():
        print(f"❌ 错误:标签词典文件未找到:{TAG_DICTIONARY_FILE}")
        sys.exit(1)

    with open(TAG_DICTIONARY_FILE, "r", encoding="utf-8") as f:
        # safe_load returns None for an empty document; normalise that to an
        # empty mapping so the code below does not fail on ``None.keys()``.
        dictionary = yaml.safe_load(f) or {}

    if not isinstance(dictionary, dict):
        print(f"❌ 错误:标签词典文件格式无效 (顶层必须是映射):{TAG_DICTIONARY_FILE}")
        sys.exit(1)

    valid_prefixes = set(dictionary.keys())
    all_valid_tags = set()
    lowercase_to_canonical_map = {}
    for tags in dictionary.values():
        # A category may be declared without any tags (None or empty list).
        for tag in tags or ():
            all_valid_tags.add(tag)
            lowercase_to_canonical_map[tag.lower()] = tag

    print(
        f"📖 从 {TAG_DICTIONARY_FILE} 加载了 {len(all_valid_tags)} 个合法标签和 {len(valid_prefixes)} 个前缀。"
    )
    return dictionary, all_valid_tags, valid_prefixes, lowercase_to_canonical_map
def find_best_fuzzy_match(tag: str, all_valid_tags: list) -> tuple[str | None, int]:
"""使用模糊匹配为未知标签寻找最佳建议。"""
if not all_valid_tags:
return None, 0
# WRatio 对于处理不同长度的字符串非常有效
best_match, score, _ = process.extractOne(tag, all_valid_tags, scorer=fuzz.WRatio)
if score >= FUZZY_MATCH_THRESHOLD:
return best_match, int(score)
return None, 0
def validate_and_find_issues(
    all_valid_tags: set, valid_prefixes: set, lowercase_map: dict
) -> tuple[dict, dict, dict]:
    """Scan every Markdown file under ``DOCS_DIR`` and classify tag problems.

    Each tag in a file's front-matter ``tags`` list is checked in three stages:
    1. Format: must look like ``<prefix>-<value>`` with a known prefix,
       otherwise it is recorded as malformed.
    2. Exact / case match: an exact match is fine; a match that differs only
       in case is recorded as a case error.
    3. Anything else is recorded as unknown, with a fuzzy-match suggestion.

    Files without a ``tags`` list are skipped, as is the generated tags index
    file itself.

    Args:
        all_valid_tags: All canonical tags (case-sensitive).
        valid_prefixes: All allowed tag prefixes.
        lowercase_map: Lowercase tag -> canonical tag.

    Returns:
        ``(malformed_tags, case_error_tags, unknown_tags)`` where
        - ``malformed_tags`` maps tag -> list of file paths,
        - ``case_error_tags`` maps tag -> ``{"files": [...], "canonical": str}``,
        - ``unknown_tags`` maps tag ->
          ``{"files": [...], "suggestion": str | None, "score": int}``.
        File paths are strings relative to ``ROOT_DIR``.
    """
    print("\n🔍 开始扫描 Markdown 文件以寻找标签问题...")
    malformed_tags = defaultdict(list)
    case_error_tags = {}
    unknown_tags = {}
    valid_tags_list = list(all_valid_tags)

    # Compare resolved paths rather than using samefile(): samefile() raises
    # FileNotFoundError when the index file has not been generated yet.
    index_path = TAGS_INDEX_FILE.resolve()

    # Sorted so the report order is stable across runs and platforms.
    for md_file in sorted(DOCS_DIR.rglob("*.md")):
        if md_file.resolve() == index_path:
            continue

        # Best-effort: an unreadable or unparsable file is reported and
        # skipped so that one bad file does not abort the whole scan.
        try:
            post = frontmatter.load(md_file)
        except Exception as e:
            print(f"⚠️ 在处理文件 {md_file} 时发生错误:{e}")
            continue

        tags_in_file = post.get("tags")
        if not tags_in_file or not isinstance(tags_in_file, list):
            continue

        relative_path = str(md_file.relative_to(ROOT_DIR))
        for tag in tags_in_file:
            # Non-string entries (e.g. a bare number in YAML) cannot follow
            # the '<prefix>-<value>' convention; report them as malformed.
            if not isinstance(tag, str):
                malformed_tags[str(tag)].append(relative_path)
                continue

            # Stage 1: format validation.
            prefix, separator, value = tag.partition("-")
            if (
                not separator
                or not prefix
                or not value
                or prefix not in valid_prefixes
            ):
                malformed_tags[tag].append(relative_path)
                continue

            # Stage 2: exact match, then case-insensitive match.
            if tag in all_valid_tags:
                continue
            lowered = tag.lower()
            if lowered in lowercase_map:
                entry = case_error_tags.setdefault(
                    tag, {"files": [], "canonical": lowercase_map[lowered]}
                )
                entry["files"].append(relative_path)
                continue

            # Stage 3: unknown tag; compute the suggestion once per tag.
            if tag not in unknown_tags:
                suggestion, score = find_best_fuzzy_match(tag, valid_tags_list)
                unknown_tags[tag] = {
                    "files": [],
                    "suggestion": suggestion,
                    "score": score,
                }
            unknown_tags[tag]["files"].append(relative_path)

    print(
        f"🏁 扫描完成。发现 {len(malformed_tags)} 个格式错误,{len(case_error_tags)} 个大小写错误,以及 {len(unknown_tags)} 个未知标签。"
    )
    return malformed_tags, case_error_tags, unknown_tags
def report_critical_errors(malformed_tags: dict, case_error_tags: dict) -> bool:
    """Print the errors that must be fixed by hand (format and case errors).

    Args:
        malformed_tags: Tag -> list of file paths where it appears.
        case_error_tags: Tag -> ``{"files": [...], "canonical": str}``.

    Returns:
        True when at least one of the two mappings is non-empty.
    """
    divider = "\n----------------------------------------"

    if malformed_tags:
        print("\n--- 🚨 严重错误:发现格式不规范的标签 ---")
        print("这些标签不符合 '前缀 - 值' 规范,或者前缀无效。请务必手动修正它们!")
        for bad_tag, locations in malformed_tags.items():
            print(f"\n- 格式错误的标签:[ {bad_tag} ]")
            print(f" 出现在:{', '.join(locations)}")
        print(divider)

    if case_error_tags:
        print("\n--- 🚨 严重错误:发现大小写不正确的标签 ---")
        print("这些标签已在词典中定义,但大小写不匹配。请修正为官方形式。")
        for wrong_form, info in case_error_tags.items():
            expected = info["canonical"]
            joined_files = ", ".join(info["files"])
            print(f"\n- 错误形式:[ {wrong_form} ] (正确形式应为:[ {expected} ])")
            print(f" 出现在:{joined_files}")
        print(divider)

    return bool(malformed_tags) or bool(case_error_tags)
def interactive_unknown_tag_management(unknown_tags: dict, dictionary: dict) -> bool:
    """Interactively walk through unknown tags, offering fuzzy suggestions.

    For each unknown tag the user may:
    - ``a``: add it to the dictionary under its prefix,
    - ``i``: ignore it (any unrecognised answer also leaves it untouched),
    - ``r``: (only when a suggestion exists) be told to replace it by hand;
      the script never edits the Markdown files itself.

    When at least one tag was added, every non-empty category is
    de-duplicated and sorted, and the dictionary is written back to
    ``TAG_DICTIONARY_FILE``.

    Args:
        unknown_tags: Tag -> ``{"files": [...], "suggestion": ..., "score": ...}``.
        dictionary: The tag dictionary (prefix -> list of tags); mutated in
            place when tags are added.

    Returns:
        True when the dictionary was changed and saved, otherwise False.
    """
    if not unknown_tags:
        return False

    print("\n--- 交互式未知标签管理 ---")
    dictionary_changed = False
    for tag, details in unknown_tags.items():
        suggestion = details["suggestion"]
        print(f"\n❓ 发现未知标签:[ {tag} ]")
        print(f" 出现在:{', '.join(sorted(set(details['files'])))}")

        options = "[a]dd as new, [i]gnore"
        if suggestion:
            print(f" 🤔 你是不是想输入 '{suggestion}'? (相似度:{details['score']}%)")
            options = f"[r]eplace, {options}"

        # Tolerate surrounding whitespace in the typed answer.
        choice = input(f" 请选择:({options}): ").strip().lower()

        if choice == "a":
            prefix = tag.split("-", 1)[0]
            if prefix in dictionary:
                # A category declared without tags parses as None; give it a
                # real list before appending instead of failing on None.
                if dictionary[prefix] is None:
                    dictionary[prefix] = []
                dictionary[prefix].append(tag)
                print(f" ✅ 已将新标签 '{tag}' 添加到 '{prefix}' 分类。")
                dictionary_changed = True
            else:
                print(f" ❌ 无法添加,前缀 '{prefix}' 无效。")
        elif choice == "r" and suggestion:
            print(f" 👉 请在以上文件中,手动将 '{tag}' 修改为 '{suggestion}'。")
            print(" 这是一个安全措施,防止脚本意外修改你的文件内容。")
        # "i" and any other answer: leave the tag as it is.

    if dictionary_changed:
        print(f"\n💾 正在保存新添加的标签到 {TAG_DICTIONARY_FILE}...")
        for category, tags in dictionary.items():
            if tags:
                dictionary[category] = sorted(set(tags))
        with open(TAG_DICTIONARY_FILE, "w", encoding="utf-8") as f:
            yaml.dump(dictionary, f, allow_unicode=True, sort_keys=False, indent=2)
    return dictionary_changed
def generate_index_file(dictionary: dict):
    """Regenerate ``TAGS_INDEX_FILE`` from the tag dictionary.

    The file starts with a title, an intro paragraph and a horizontal rule.
    Then, for each known category that has at least one tag, it gets a
    section header followed by a ``material/tags`` HTML comment listing that
    category's tags in sorted order, five per line.

    Only the categories listed in ``category_map`` are emitted; any other
    key in ``dictionary`` is ignored.

    Args:
        dictionary: Prefix -> list of tags.
    """
    print(f"\n🚀 正在生成新的标签索引文件:{TAGS_INDEX_FILE}...")

    category_map = {
        "Topic": "📚 按主题 (Topic)",
        "Type": "📄 按类型 (Type)",
        "Level": "📈 按级别 (Level)",
        "Action": "⚡️ 按行为 (Action)",
        "Context": "🎯 按场景 (Context)",
    }
    tags_per_line = 5

    # Blank lines between the title, the intro paragraph and the rule are
    # required: a paragraph immediately followed by '---' is a setext heading
    # in Markdown, not a paragraph plus a horizontal rule.
    parts = [
        "# 标签索引\n\n"
        "这里是本站点的所有内容标签,按不同维度进行分类展示。您可以点击任意标签,查看所有关联的文章。\n\n"
        "---\n"
    ]

    for category, header in category_map.items():
        tags = dictionary.get(category)
        if not tags:
            continue
        quoted = [f'"{tag}"' for tag in sorted(tags)]
        # Wrap every `tags_per_line` tags; joining rows avoids both trailing
        # whitespace and a stray blank line when the count is an exact
        # multiple of the row size.
        rows = [
            ", ".join(quoted[start:start + tags_per_line])
            for start in range(0, len(quoted), tags_per_line)
        ]
        tags_list_str = ",\n".join(rows)
        parts.append(f"\n## {header}\n\n")
        parts.append(
            f"<!-- material/tags {{ include: [\n{tags_list_str}\n] }} -->\n\n---\n"
        )

    with open(TAGS_INDEX_FILE, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print("✅ 标签索引文件生成完毕!")
def main_sync():
    """Run 'sync' mode: scan, report, manage unknown tags, regenerate the index.

    Exits with status 1 when format or case errors are found, since those
    must be fixed by hand first. With no unknown tags, the user is asked
    whether to regenerate the index anyway. Otherwise the unknown tags are
    handled interactively and the index is regenerated only when the
    dictionary changed.
    """
    dictionary, all_valid, prefixes, lower_map = load_tag_dictionary()
    malformed, case_errors, unknown = validate_and_find_issues(
        all_valid, prefixes, lower_map
    )

    if report_critical_errors(malformed, case_errors):
        print("\n❗️ 请先修正以上格式和大小写错误,然后再重新运行脚本。")
        sys.exit(1)

    if unknown:
        if interactive_unknown_tag_management(unknown, dictionary):
            # Reload from disk so the index is built from the saved dictionary.
            final_dictionary = load_tag_dictionary()[0]
            generate_index_file(final_dictionary)
        else:
            print("\nℹ️ 你未添加任何新标签,因此词典和索引文件未被更新。")
        return

    print("\n✨ 未发现新的未知标签。所有标签均符合规范。")
    answer = input(" 是否需要强制重新生成 tags.md 文件?([y]es/[n]o): ").lower()
    if answer == "y":
        generate_index_file(dictionary)
def main_check():
    """Run 'check' mode: report problems only, for use in CI.

    Exits with status 1 when any format error, case error or unknown tag is
    found, and with status 0 otherwise.
    """
    print("--- 正在以 'check' 模式运行 ---")
    _, all_valid, prefixes, lower_map = load_tag_dictionary()
    malformed, case_errors, unknown = validate_and_find_issues(
        all_valid, prefixes, lower_map
    )
    has_critical_errors = report_critical_errors(malformed, case_errors)

    if unknown:
        print("\n--- 🟡 警告:发现未在词典中定义的标签 ---")
        for tag, details in unknown.items():
            hint = (
                f" (你是不是想写:{details['suggestion']}?)"
                if details["suggestion"]
                else ""
            )
            print(f"- 未知标签:[ {tag} ]{hint}")
            print(f" 出现在:{', '.join(details['files'])}")
        print("\n----------------------------------------")

    if not (has_critical_errors or unknown):
        print("\n--- 标签健康检查报告 ---")
        print("🎉 恭喜!所有文件的标签都符合规范。")
        sys.exit(0)

    print(
        "\n💡 检查失败。请在本地运行 `uv run tools/manage_tags.py sync` 来处理这些问题。"
    )
    sys.exit(1)
def main():
    """Dispatch to 'check' or 'sync' mode based on the command-line arguments.

    'check' takes precedence when both are given. With neither, a notice is
    printed and 'sync' mode runs.
    """
    cli_args = sys.argv[1:]
    if "check" in cli_args:
        main_check()
        return

    if "sync" not in cli_args:
        print("ℹ️ 未指定模式,默认执行 'sync' 交互模式。")
        print(" (可使用 'check' 模式进行 CI 检查)")
    main_sync()
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()