Coverage for src / eclipse / care / copyright / copyright_headers.py: 86%

125 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-24 09:38 +0100

1# Copyright (c) 2026 The Eclipse Foundation 

2# 

3# This program and the accompanying materials are made available under the 

4# terms of the Eclipse Public License 2.0 which is available at 

5# http://www.eclipse.org/legal/epl-2.0. 

6# 

7# SPDX-License-Identifier: EPL-2.0 

8# 

9# Contributors: 

10# asgomes - Initial implementation 

11 

12import json 

13import os 

14from importlib import resources 

15from math import ceil 

16from pathlib import Path 

17from tempfile import TemporaryDirectory 

18from typing import Optional 

19 

20from scancode.api import get_copyrights 

21from typing_extensions import Any 

22 

23from care.utils.cli_utils import print_debug, print_error 

24from care.utils.eclipse import Eclipse 

25from care.utils.git_provider import GitProviderEMO 

26from care.utils.github import GitHubEmo 

27from care.utils.gitlab import GitLabEmo 

28 

29 

def _load_known_extensions() -> list[dict[str, Any]] | None:
    """
    Load known source code file extensions.

    The definitions are read from the ``templates/comment-templates.json``
    resource shipped inside the ``care`` package.

    Returns
    -------
    list[dict[str, Any]] | None
        The parsed JSON content, or None when the resource could not be
        located, read or parsed. An error message is printed in that case.
    """

    try:
        template_resource = resources.files('care').joinpath('templates/comment-templates.json')
        # JSON is UTF-8 by specification: do not depend on the platform's
        # locale encoding, which would mis-decode non-ASCII characters.
        with template_resource.open('r', encoding='utf-8') as fp:
            return json.load(fp)
    except FileNotFoundError:
        print_error("No comment-templates.json found to load known source code file extensions.")
    except json.JSONDecodeError as e:
        print_error(f"The comment-templates.json could not be parsed to load known source code file extensions. "
                    f"Details: {e}")
    except OSError as e:  # Catches other I/O related errors like permissions issues
        print_error(f"An IO error occurred while accessing comment-templates.json to load known source code file "
                    f"extensions. Details: {e}")
    except Exception as e:  # Fallback for any other unexpected errors
        print_error(f"An unexpected error occurred while attempting to load known source code file extensions: {e}")

    # Every failure path ends up here: make the "nothing loaded" result explicit.
    return None

48 

49 

def _detect_copyright(file_path: Path, max_start_line: int = 20) -> list[str] | None:
    """
    Detect copyright header in a single file and return copyright holders.

    Only holders whose statement starts within the first ``max_start_line``
    lines are taken into account, so that copyright mentions located deeper
    in the file are not reported as a file header.

    Parameters
    ----------
    file_path : Path
        The path to the file to process
    max_start_line : int
        Last line number at which a copyright statement may start to be
        considered part of the header (defaults to 20)

    Returns
    -------
    list[str] | None
        The sorted, de-duplicated holder names, or None when no holder was
        found in the header area or when the detection failed.
    """

    try:
        results = get_copyrights(str(file_path))
        holders = set()
        for entry in results.get("holders", []):
            name = (entry.get("holder") or "").strip()
            start_line = entry.get("start_line")
            # Skip entries without a usable name or position instead of letting
            # a None comparison raise and discard every holder of the file.
            if name and start_line is not None and start_line <= max_start_line:
                holders.add(name)
        return sorted(holders) if holders else None
    except Exception as e:
        # Best effort: a failure on one file must not abort the whole analysis.
        print_error(f"An unexpected error occurred during copyright detection: {e}")
        return None

71 

72 

def _get_copyright_info(provider: GitProviderEMO, repo: str, branch: Optional[str] = None,
                        verbose: Optional[bool] = False) -> tuple[int, list[str]]:
    """
    Get copyright headers percentage and copyright holders for all files in a repository.

    Only items of type ``blob`` whose extension (or whole name, for files
    without extension) is listed in the known extensions are counted.

    Parameters
    ----------
    provider : GitProviderEMO
        The Git provider to connect to fetch repo files
    repo : str
        The repository to retrieve info for, e.g. eclipse-dash/dash
    branch: Optional[str]
        A specific branch to check out
    verbose : Optional[bool]
        Should we print more information on stdout?

    Returns
    -------
    tuple[int, list[str]]
        The percentage of known source files with a copyright header (rounded
        up to the next integer) and the sorted list of copyright holders.
        The percentage is -1 when the repository content or the known
        extensions could not be retrieved, and 0 when no known source file
        was found.
    """

    # Get a list of files
    files = provider.get_content_recursive(repo, branch=branch)
    if files is None:
        return -1, []

    # Load known source code file extensions once: the list does not change
    # between files, so there is no need to re-read the JSON for each of them.
    templates = _load_known_extensions()
    if templates is None:
        # The loader already reported the error; without the list no file can be classified.
        return -1, []
    known_extensions = {ext.lower() for template in templates for ext in template['extensions']}

    count = 0
    with_header = 0
    copyright_holders = set()
    for position, file in enumerate(files, start=1):
        if verbose:
            print_debug(f"Checking repository item {position}/{len(files)}")
            print_debug(f"Item location: {file['path']}")
        if file['type'] != 'blob':
            continue
        # If the file has no extension, the name is the extension
        stem, extension = os.path.splitext(file['name'])
        if extension == "":
            extension = stem
        if extension.lower() not in known_extensions:
            continue
        count += 1
        # The downloaded copy only exists inside the temporary directory, so
        # the detection has to run before the directory is removed.
        with TemporaryDirectory() as tmpdir:
            local_file = provider.get_file(repo, tmpdir, file['path'], branch=branch)
            holders = _detect_copyright(local_file)
        if not holders:
            if verbose:
                print_debug(f"No header found for: {file['path']}")
            continue
        with_header += 1
        copyright_holders.update(holders)

    if count == 0:
        return 0, sorted(copyright_holders)

    # Calculate the percentage of files with headers
    if verbose:
        print_debug(f"Number of files with headers: {with_header}")
        print_debug(f"Number of files with known extension: {count}")
    # Round to the next integer (avoid 0% when something is found). Integer
    # arithmetic is used on purpose: with floats, 7 files out of 25 evaluates
    # to 28.000000000000004 and would be rounded up to 29.
    percentage_headers = (with_header * 100 + count - 1) // count

    return percentage_headers, sorted(copyright_holders)

140 

141 

def gh_analyze(organization: str, credentials: Optional[dict] = None, verbose: Optional[bool] = False) -> dict | None:
    """
    Look for copyright headers in every repository of a GitHub organization.

    Parameters
    ----------
    organization : str
        GitHub organization to analyze.
    credentials: Optional[dict]
        A mapping of service providers (GitHub, GitLab) to their respective authentication tokens.
    verbose : Optional[bool]
        Should we print more information on stdout?

    Returns
    -------
    dict | None
        A mapping of repository to its ``copyright_headers_percentage`` and
        ``copyright_holders``, or None when the GitHub client is falsy or no
        repository is returned for the organization.
    """

    github_client = GitHubEmo(credentials=credentials, verbose=verbose)
    if not github_client:
        return None

    # Get list of repositories from the organization
    repositories = github_client.get_repos(organization)
    if not repositories:
        return None

    analysis = {}
    for repository in repositories:
        if verbose:
            print(f"- Analysing GH repo {repository}.")
        percentage, holders = _get_copyright_info(github_client, repository, verbose=verbose)
        analysis[repository] = {
            'copyright_headers_percentage': percentage,
            'copyright_holders': sorted(holders),
        }
    return analysis

174 

175 

def gl_analyze(group: str, credentials: Optional[dict] = None, verbose: Optional[bool] = False) -> dict | None:
    """
    Look for copyright headers in every repository of a GitLab group.

    Parameters
    ----------
    group : str
        GitLab group to analyze.
    credentials: Optional[dict]
        A mapping of service providers (GitHub, GitLab) to their respective authentication tokens.
    verbose : Optional[bool]
        Should we print more information on stdout?

    Returns
    -------
    dict | None
        A mapping of repository to its ``copyright_headers_percentage`` and
        ``copyright_holders``, or None when the GitLab client is falsy or no
        repository is returned for the group.
    """

    gitlab_client = GitLabEmo(credentials=credentials, verbose=verbose)
    if not gitlab_client:
        return None

    # Get list of repositories from the group
    repositories = gitlab_client.get_repos(group)
    if not repositories:
        return None

    analysis = {}
    for repository in repositories:
        if verbose:
            print(f"- Analysing GL repo {repository}.")
        percentage, holders = _get_copyright_info(gitlab_client, repository, verbose=verbose)
        analysis[repository] = {
            'copyright_headers_percentage': percentage,
            'copyright_holders': sorted(holders),
        }
    return analysis

208 

209 

def analyse_project(project_id: str, credentials: Optional[dict] = None, verbose: bool = False) -> dict | None:
    """
    Generic entrypoint to analyze a project. This function will identify the project
    repositories and their type (GitHub/GitLab) and execute the corresponding functions.

    Parameters
    ----------
    project_id : str
        Project ID of the Eclipse project to analyze, e.g. `technology.dash`.
    credentials: Optional[dict]
        A mapping of service providers (GitHub, GitLab) to their respective authentication tokens.
    verbose : bool
        Should we print more information on stdout?

    Returns
    -------
    dict | None
        The merged per-repository results of the GitHub and GitLab analyses
        (possibly empty), or None when the project API data could not be
        retrieved.
    """

    eclipse = Eclipse()
    project_api = eclipse.get_project_api(project_id)

    if project_api is None:
        return None

    results = {}

    if 'github' in project_api and 'org' in project_api['github'] and len(project_api['github']['org']) > 0:
        gh_org = project_api['github']['org']
        if verbose:
            print(f" Looking for projects in GH org {gh_org}.")
        gh_results = gh_analyze(gh_org, credentials=credentials, verbose=verbose)
        # gh_analyze returns None when nothing could be analyzed;
        # dict.update(None) would raise a TypeError.
        if gh_results:
            results.update(gh_results)

    if 'gitlab' in project_api and 'project_group' in project_api['gitlab'] and len(project_api['gitlab']['project_group']) > 0:
        gl_org = project_api['gitlab']['project_group']
        if verbose:
            print(f" Looking for projects in GL group {gl_org}.")
        gl_results = gl_analyze(gl_org, credentials=credentials, verbose=verbose)
        # Same guard as above: gl_analyze may return None.
        if gl_results:
            results.update(gl_results)

    return results