-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_processor.py
More file actions
219 lines (170 loc) · 7.29 KB
/
Copy pathdata_processor.py
File metadata and controls
219 lines (170 loc) · 7.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
數據處理模組
負責處理來自 GitLab 的原始數據,進行標籤篩選、專案分類和日期過濾。
"""
from datetime import datetime
class DataProcessor:
    """Fetch issues from the configured GitLab projects and normalize them.

    The processor pulls raw issue dicts through ``gitlab_handler``, optionally
    restricts them to the configured labels, drops issues created before the
    configured start date and converts the survivors into a small, uniform
    dict layout (see ``_filter_and_format_issues``).
    """

    def __init__(self, gitlab_handler, config, logger):
        """Store collaborators and cache the settings read from ``config``.

        Args:
            gitlab_handler: Object used to query GitLab. This class calls its
                ``get_project_issues``, ``get_issues_by_label`` and
                ``get_all_open_issues`` methods.
            config: Configuration object providing ``get_start_date``,
                ``get_include_open_only``, ``get_label_list`` and
                ``get_project_id_list``.
            logger: Logger exposing ``log_debug``, ``log_info``,
                ``log_warning`` and ``log_error``.
        """
        self.gitlab_handler = gitlab_handler
        self.config = config
        self.logger = logger

        # Settings are read once here; later changes to ``config`` are not
        # picked up by an existing processor.
        self.start_date = config.get_start_date()
        self.include_open_only = config.get_include_open_only()
        self.label_list = config.get_label_list()
        self.project_id_list = config.get_project_id_list()

    def process_all_projects(self):
        """Process every configured project and return the combined issues.

        Each entry of ``project_id_list`` is either a bare project ID or a
        dict with the keys ``project_id`` (required), ``name`` (optional,
        used only for logging) and ``fetch_all_open_issues`` (optional flag
        that bypasses the label list for that project).

        Returns:
            list: Formatted issue dicts from all projects, sorted by their
            ``created_at`` value in ascending order.
        """
        all_issues = []
        self.logger.log_info("開始處理所有專案")

        for project_item in self.project_id_list:
            if isinstance(project_item, dict):
                # Dict form: may carry per-project options.
                project_id = project_item.get('project_id')
                project_name = project_item.get('name', f"專案 {project_id}")
                fetch_all_open_issues = project_item.get('fetch_all_open_issues', False)

                if not project_id:
                    self.logger.log_warning(f"跳過缺少 project_id 的專案配置:{project_item}")
                    continue

                if fetch_all_open_issues:
                    self.logger.log_info(f"從專案 {project_name} (ID: {project_id}) 擷取所有開放 issues")
                    project_issues = self._fetch_all_open_issues(project_id)
                else:
                    self.logger.log_info(f"從專案 {project_name} (ID: {project_id}) 根據標籤列表擷取 issues")
                    project_issues = self._fetch_issues_by_labels(project_id)
            else:
                # Bare project ID: always use the global label list.
                project_id = project_item
                self.logger.log_info(f"從專案 {project_id} 根據標籤列表擷取 issues")
                project_issues = self._fetch_issues_by_labels(project_id)

            # Apply the date filter and convert to the uniform layout.
            all_issues.extend(self._filter_and_format_issues(project_issues))

        # Sort by creation date. ``or ''`` keeps the sort from raising
        # TypeError when a payload carried ``created_at: None`` next to
        # regular string values.
        all_issues.sort(key=lambda x: x['created_at'] or '')

        self.logger.log_info(f"完成所有專案處理,共獲取 {len(all_issues)} 個符合條件的 issues")
        return all_issues

    def _fetch_issues_by_labels(self, project_id):
        """Fetch a project's issues using the configured label list.

        With an empty label list, issues are fetched without any label
        filter. Otherwise one request is made per label and the results are
        merged and de-duplicated.

        Args:
            project_id: ID of the project to query.

        Returns:
            list: Raw issue dicts as returned by the GitLab handler.
        """
        all_issues = []

        if not self.label_list:
            self.logger.log_info(f"標籤列表為空,擷取專案 {project_id} 的所有 issues")
            # NOTE(review): unlike the per-label branch below, this branch
            # does not pass ``include_open_only`` to the handler -- confirm
            # whether that is intended.
            params = {'per_page': 100}  # maximum page size
            issues = self.gitlab_handler.get_project_issues(project_id, params)
            all_issues.extend(issues)
            return all_issues

        for label in self.label_list:
            self.logger.log_debug(f"使用標籤 '{label}' 擷取專案 {project_id} 的 issues")
            issues = self.gitlab_handler.get_issues_by_label(
                project_id,
                label,
                self.include_open_only
            )
            all_issues.extend(issues)

        # An issue carrying several of the configured labels is returned
        # once per label, so collapse the duplicates.
        unique_issues = self._remove_duplicates(all_issues)
        self.logger.log_info(f"專案 {project_id} 根據標籤列表共擷取 {len(unique_issues)} 個 issues")
        return unique_issues

    def _fetch_all_open_issues(self, project_id):
        """Fetch all open issues of a project, ignoring the label list.

        Args:
            project_id: ID of the project to query.

        Returns:
            list: Raw issue dicts as returned by the GitLab handler.
        """
        self.logger.log_info(f"擷取專案 {project_id} 的所有開放 issues")

        params = {'per_page': 100}  # maximum page size
        issues = self.gitlab_handler.get_all_open_issues(project_id, params)

        self.logger.log_info(f"專案 {project_id} 共擷取 {len(issues)} 個開放 issues")
        return issues

    def _filter_and_format_issues(self, issues):
        """Apply the date filter and convert issues to the uniform layout.

        Args:
            issues: Raw issue dicts.

        Returns:
            list: One dict per remaining issue with the keys ``gitlab_link``,
            ``title``, ``created_at``, ``labels``, ``iid`` and ``project_id``.
            Missing source fields fall back to ``''`` (``[]`` for labels).
        """
        return [
            {
                'gitlab_link': issue.get('web_url', ''),
                'title': issue.get('title', ''),
                'created_at': issue.get('created_at', ''),
                'labels': issue.get('labels', []),
                'iid': issue.get('iid', ''),  # project-internal issue number
                'project_id': issue.get('project_id', ''),
            }
            for issue in self._filter_issues_by_date(issues)
        ]

    def _filter_issues_by_date(self, issues):
        """Keep only issues created on or after the configured start date.

        Only the calendar date (the part of ``created_at`` before ``'T'``) is
        compared, and the start date itself is included.

        Args:
            issues: Raw issue dicts.

        Returns:
            list: The input list unchanged when no start date is configured
            or the start date cannot be parsed as ``YYYY-MM-DD``; otherwise a
            new list with the matching issues. Issues whose ``created_at`` is
            missing or unparsable are left out of the filtered result.
        """
        if not self.start_date:
            return issues

        # Parse the start date on its own so that a bad configuration value
        # is reported once and handled separately from bad issue data.
        try:
            start_datetime = datetime.strptime(self.start_date, '%Y-%m-%d')
        except (TypeError, ValueError) as e:
            self.logger.log_error(f"日期過濾失敗:{e}")
            return issues

        filtered_issues = []
        for issue in issues:
            created_at = issue.get('created_at')
            if not created_at:
                continue

            # Handle each issue individually: one malformed timestamp must
            # not switch the filter off for every other issue.
            try:
                issue_datetime = datetime.strptime(created_at.split('T')[0], '%Y-%m-%d')
            except (AttributeError, ValueError) as e:
                self.logger.log_warning(f"日期過濾:略過建立日期無法解析的 issue({created_at!r}):{e}")
                continue

            if issue_datetime >= start_datetime:
                filtered_issues.append(issue)

        self.logger.log_info(f"日期過濾:從 {len(issues)} 個 issues 中篩選出 {len(filtered_issues)} 個在 {self.start_date} 之後的 issues")
        return filtered_issues

    def _remove_duplicates(self, issues):
        """Drop repeated issues, keeping the first occurrence of each ``id``.

        Args:
            issues: Raw issue dicts, possibly containing repeats.

        Returns:
            list: Issues in their original order with later repeats of the
            same ``id`` removed. Issues without an ``id`` cannot be compared
            and are all kept rather than silently discarded.
        """
        seen_ids = set()
        unique_issues = []
        for issue in issues:
            issue_id = issue.get('id')
            if issue_id is not None:
                if issue_id in seen_ids:
                    continue
                seen_ids.add(issue_id)
            unique_issues.append(issue)
        return unique_issues