Coverage for src / codeaudit / api_interfaces.py: 22%

178 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-09 09:33 +0200

1""" 

2License GPLv3 or higher. 

3 

4(C) 2025 Created by Maikel Mardjan - https://nocomplexity.com/ 

5 

6This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 

7 

8This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 

9 

10You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. 

11 

12 

13Public API functions for Python Code Audit aka codeaudit on pypi.org 

14""" 

15 

16import datetime 

17import json 

18import platform 

19from collections import Counter 

20from pathlib import Path 

21 

22import altair as alt 

23import pandas as pd 

24 

25from codeaudit import __version__ 

26from codeaudit.checkmodules import ( 

27 check_module_vulnerability, 

28 get_all_modules, 

29 get_imported_modules_by_file, 

30 get_standard_library_modules, 

31) 

32from codeaudit.filehelpfunctions import ( 

33 collect_python_source_files, 

34 get_filename_from_path, 

35 is_ast_parsable, 

36) 

37from codeaudit.privacy_lint import data_egress_scan 

38from codeaudit.pypi_package_scan import get_package_source, get_pypi_download_info 

39from codeaudit.security_checks import ast_security_checks, perform_validations 

40from codeaudit.suppression import filter_sast_results 

41from codeaudit.totals import ( 

42 get_statistics, 

43 overview_count, 

44 overview_per_file, 

45 total_modules, 

46) 

47 

48 

def version():
    """Report the tool name and the installed Python Code Audit version.

    Returns:
        dict: ``{"name": "Python_Code_Audit", "version": <codeaudit.__version__>}``.
    """
    return dict(name="Python_Code_Audit", version=__version__)

53 

54 

def filescan(input_path, nosec=False):
    """
    Scan a Python source file, a local directory, or a **PyPI package** from PyPI.org for
    security weaknesses and return the results as a JSON-serializable
    dictionary.

    This API function works on:

    - **Local directory**: Recursively scans all supported Python files in the
      directory.
    - **Single Python file**: Scans the file if it exists and can be parsed
      into an AST.
    - **PyPI package** on PyPI.org: Downloads the
      source distribution from PyPI, scans it, and cleans up temporary files.

    The returned output always includes Python Code Audit version information and a
    generation timestamp. For consistency, single-file scans are normalized
    to match the structure of directory/package scans.

    **Note:**
    The filescan command does NOT include all directories. This is done on purpose!
    The following directories are skipped by default:

    - `/docs`
    - `/docker`
    - `/dist`
    - `/tests`
    - all directories that start with . (dot) or _ (underscore)

    But you can easily change this if needed!

    Args:
        input_path (str): One of the following:
            - Path to a local directory containing Python code.
            - Path to a single ``.py`` file.
            - Name of a package available on PyPI.
        nosec (bool): When True, the findings of every scanned file are passed
            through ``filter_sast_results`` before being reported. Defaults to
            False (findings are reported unfiltered).

    Returns:
        dict: A JSON-serializable dictionary containing scan results and
        metadata. The structure varies slightly depending on the scan type,
        but always includes:
            - Version information from ``version()``.
            - ``generated_on`` timestamp (``YYYY-MM-DD HH:MM``).
            - Package or file-level security findings.

        If the input cannot be interpreted as a valid directory, Python file,
        or PyPI package, or if PyPI offers no download URL for the package,
        a dictionary with only an ``"Error"`` key is returned.

    Raises:
        None explicitly. Any unexpected exceptions are allowed to propagate
        unless handled by downstream callers.

    Example:
        >>> result = filescan("example_package")
        >>> result["package_name"]

    """
    file_path = Path(input_path)
    # Version info plus "generated_on" timestamp, shared with the other API calls.
    output = _generation_info()

    if file_path.is_dir():  # local directory scan
        package_name = get_filename_from_path(input_path)
        output |= {"package_name": package_name}
        output |= _codeaudit_directory_scan(input_path, nosec_flag=nosec)
        return output

    if (
        file_path.suffix.lower() == ".py"
        and file_path.is_file()
        and is_ast_parsable(input_path)
    ):  # parseable single Python file
        file_information = overview_per_file(input_path)
        module_information = get_modules(input_path)  # modules per file
        scan_output = _codeaudit_scan(input_path, nosec_flag=nosec)
        # There is only one file, so it is stored under index "0" to mirror the
        # per-file layout of a directory/package scan. The key is the string
        # "0" (directory scans use integer indexes); both serialize to the
        # same JSON key.
        file_output = {"0": file_information | module_information | scan_output}
        output |= {"file_security_info": file_output}
        return output

    if pypi_data := get_pypi_download_info(input_path):
        package_name = input_path  # input_path is the PyPI package name here
        url = pypi_data["download_url"]
        release = pypi_data["release"]
        if url is None:
            # Previously this path fell through and returned None, which broke
            # the documented "always a dict" contract.
            return {
                "Error": f"No downloadable source distribution found on PyPI.org for package {package_name}."
            }
        src_dir, tmp_handle = get_package_source(url)
        output |= {"package_name": package_name, "package_release": release}
        try:
            output |= _codeaudit_directory_scan(src_dir, nosec_flag=nosec)
        finally:
            # Always remove the temporary download directory, even if the scan fails.
            tmp_handle.cleanup()
        return output

    # Not a directory, not a valid Python file and not a known PyPI package.
    return {
        "Error": "File is not a *.py file, does not exist or is not a valid directory path towards a Python package."
    }

160 

161 

def _codeaudit_scan(filename, nosec_flag):
    """Run the SAST validations on a single file (internal helper).

    Use the `filescan` API call to scan a file or a Python package through
    the public API.

    Args:
        filename: Path of the Python file to scan.
        nosec_flag: When truthy, the raw validation output is passed through
            ``filter_sast_results`` before the findings are extracted.

    Returns:
        dict: ``{"file_name": <name>, "sast_result": <findings sorted by key>}``.
    """
    short_name = get_filename_from_path(filename)

    # The scan itself is identical in both modes; only the filtering differs.
    scan_data = perform_validations(filename)
    if nosec_flag:
        scan_data = filter_sast_results(scan_data)

    findings = scan_data["result"]
    ordered_findings = {key: findings[key] for key in sorted(findings)}
    return {"file_name": short_name, "sast_result": ordered_findings}

179 

180 

def _codeaudit_directory_scan(input_path, nosec_flag):
    """Scan every collected Python file below a directory.

    Also used for PyPI.org packages, which are scanned from a temporary
    directory.

    Args:
        input_path: Directory to scan.
        nosec_flag: Forwarded to ``_codeaudit_scan`` for every file.

    Returns:
        dict: ``statistics_overview``, ``module_overview`` and
        ``file_security_info`` (keyed by the integer position of each file),
        or ``{"Error": ...}`` when not enough files were collected.
    """
    sources = collect_python_source_files(input_path)

    # NOTE(review): the scan only proceeds with MORE than one collected file,
    # so a directory holding exactly one Python file is reported as
    # containing none. `get_overview` uses the same threshold - confirm
    # whether this is intended before changing either.
    if len(sources) <= 1:
        return {"Error": f"Directory path {input_path} contains no Python files."}

    modules_discovered = get_all_modules(input_path)  # all modules of the package
    package_overview = get_overview(input_path)

    per_file = {}
    for index, source in enumerate(sources):
        per_file[index] = (
            overview_per_file(source)
            | get_modules(source)  # modules per file
            | _codeaudit_scan(source, nosec_flag)
        )

    return {
        "statistics_overview": package_overview,
        "module_overview": modules_discovered,
        "file_security_info": per_file,
    }

207 

208 

def save_to_json(sast_result, filename="codeaudit_output.json"):
    """
    Save a SAST result (dict or serializable object) to a JSON file.

    The data is serialized before the target file is opened, so a
    serialization failure never creates or truncates the file.

    Args:
        sast_result (dict or list): The data to be saved as JSON.
        filename (str, optional): The file path to save the JSON data.
            Defaults to "codeaudit_output.json". ``~`` is expanded and missing
            parent directories are created.

    Returns:
        Path: The absolute path of the saved file, or None if saving failed
        (the reason is printed to stdout).
    """
    filepath = Path(filename).expanduser().resolve()

    try:
        # Serialize up front: if this raises, nothing on disk has been touched.
        payload = json.dumps(sast_result, indent=2, ensure_ascii=False)
        filepath.parent.mkdir(parents=True, exist_ok=True)  # ensure directory exists
        with filepath.open("w", encoding="utf-8") as f:
            f.write(payload)
    except (TypeError, ValueError) as e:
        print(f"[Error] Failed to serialize data to JSON: {e}")
        return None
    except OSError as e:
        print(f"[Error] Failed to write file '{filepath}': {e}")
        return None
    return filepath

232 

233 

def read_input_file(filename, safe_directory="data_folder"):
    """
    Securely read a Python CodeAudit JSON file and return its contents.

    Both paths are expanded (``~``) and resolved before the containment
    check, so ``..`` segments and symlinks cannot be used to step outside
    ``safe_directory``.

    Args:
        filename: Path to the JSON file (str or Path).
        safe_directory: Base directory considered "safe" for reading files.
            A relative value is resolved against the current working directory.

    Returns:
        dict: The decoded contents of the JSON file (whatever ``json.loads``
        produces for the document).

    Raises:
        FileNotFoundError: If the file does not exist or is not a regular file.
        PermissionError: If the file is outside the allowed safe directory.
        json.JSONDecodeError: If the file is not valid JSON. The original
            decoder error is attached as ``__cause__``.
    """
    file_path = Path(filename).expanduser().resolve()
    base_dir = Path(safe_directory).expanduser().resolve()

    # Security check: ensure the file is within the safe directory
    if not file_path.is_relative_to(base_dir):
        raise PermissionError(
            f"Access denied: {file_path} is outside the safe directory"
        )

    # Ensure the file exists and is a file
    if not file_path.is_file():
        raise FileNotFoundError(f"File not found or not a regular file: {file_path}")

    try:
        return json.loads(file_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        # Re-raise with the file path in the message, keeping the original
        # error as the explicit cause.
        raise json.JSONDecodeError(
            f"Invalid JSON in file: {file_path}", e.doc, e.pos
        ) from e

269 

270 

def get_weakness_counts(input_file, nosec=False):
    """
    Count how often each weakness construct occurs in a file or package.

    The input is scanned with the `filescan` API call; the per-file
    ``sast_result`` entries are then summed per construct name.

    Args:
        input_file (str): Path to the file or directory (package) to scan.
        nosec (bool): Forwarded to `filescan` to control suppression of
            findings marked in the code.

    Returns:
        dict: Construct name (str) mapped to its total number of occurrences
        (int). Empty when the scan result has no usable
        ``file_security_info`` section.

    Raises:
        ValueError: If `filescan` does not return a dict, or returns a
            result containing an ``"Error"`` key.
    """
    scan_result = filescan(input_file, nosec)

    # Fail loudly on a broken or failed scan.
    if not isinstance(scan_result, dict):
        raise ValueError("filescan() did not return a valid result dictionary")
    if "Error" in scan_result:
        raise ValueError(scan_result["Error"])

    per_file = scan_result.get("file_security_info")
    if not isinstance(per_file, dict):
        # Valid scan, but nothing to aggregate.
        return {}

    totals = {}
    for entry in per_file.values():
        findings = entry.get("sast_result", {}) if isinstance(entry, dict) else None
        if not isinstance(findings, dict):
            continue  # skip malformed per-file entries
        for construct, hits in findings.items():
            if isinstance(hits, (list, tuple)):
                totals[construct] = totals.get(construct, 0) + len(hits)

    return totals

319 

320 

321# def get_weakness_counts(input_file , nosec=False): 

322# """ 

323# Analyze a Python file or package(directory) and count occurrences of code weaknesses. 

324 

325# This function uses `filescan` API call to retrieve security-related information 

326# about the input file. This returns a dict. Then it counts how many times each code construct 

327# appears across all scanned files. 

328 

329# Args: 

330# input_file (str): Path to the file or directory(package) to scan. 

331 

332# Returns: 

333# dict: A dictionary mapping each construct name (str) to the total 

334# number of occurrences (int) across all scanned files. 

335 

336# Notes: 

337# - The `filescan` function is expected to return a dictionary with 

338# a 'file_security_info' key, containing per-file information. 

339# - Each file's 'sast_result' should be a dictionary mapping 

340# construct names to lists of occurrences. 

341# """ 

342# scan_result = filescan(input_file, nosec) 

343# counter = Counter() 

344 

345# for file_info in scan_result.get('file_security_info', {}).values(): 

346# sast_result = file_info.get('sast_result', {}) 

347# for construct, occurrence in sast_result.items(): #occurrence is times the construct appears in a single file 

348# counter[construct] += len(occurrence) 

349 

350# return dict(counter) 

351 

352 

def get_modules(filename):
    """Return the modules imported by a Python file.

    Thin wrapper: the result of ``get_imported_modules_by_file`` is passed
    through unchanged.
    """
    return get_imported_modules_by_file(filename)

357 

358 

def get_overview(input_path):
    """Retrieve the security relevant statistics of a Python package (directory) or of a single Python file.

    Args:
        input_path: Directory path of the package, or path of a single
            ``.py`` file.

    Returns:
        dict: The overview statistics. For a directory this is the single
        row of the overview DataFrame; for a file it is the result of
        ``overview_per_file``. A dict with an ``"Error"`` key is returned
        when the input is neither, or when the directory does not yield
        enough Python files.
    """
    target = Path(input_path)

    if target.is_dir():
        sources = collect_python_source_files(input_path)
        # NOTE(review): more than one collected file is required, so a
        # directory with exactly one Python file also ends up here.
        if len(sources) <= 1:
            return {"Error": f"Directory path {input_path} contains no Python files."}

        statistics = get_statistics(input_path)
        modules = total_modules(input_path)
        frame = pd.DataFrame(statistics)
        # Both module columns are needed for the correct overall count.
        for column in ("Std-Modules", "External-Modules"):
            frame[column] = modules[column]
        # The overview DataFrame has only one row.
        return overview_count(frame).to_dict(orient="records")[0]

    is_python_file = (
        target.suffix.lower() == ".py"
        and target.is_file()
        and is_ast_parsable(input_path)
    )
    if is_python_file:
        return overview_per_file(input_path)

    # Neither a directory nor a valid Python file.
    return {
        "Error": "File is not a *.py file, does not exist or is not a valid directory path to a Python package."
    }

404 

405 

def get_default_validations():
    """Retrieve the default implemented security validations.

    The validation definitions returned by ``ast_security_checks()`` (a
    pandas DataFrame, given the ``to_dict(orient="records")`` call) are
    converted to a list of dicts and combined with the generation metadata.

    Returns:
        dict: Generation metadata plus the validations, structured as:

            {
                "<metadata_key>": <metadata_value>,
                ...,
                "validations": [
                    {"<field>": <value>, ...},
                    ...
                ]
            }

    **Notes**:

    - Requires Python 3.9 or later due to use of the dictionary union operator (`|`).
    """
    records = ast_security_checks().to_dict(orient="records")
    return _generation_info() | {"validations": records}

444 

445 

def _generation_info():
    """Build the metadata shared by API outputs (internal helper).

    Returns:
        dict: The ``version()`` entries plus ``generated_on``, the current
        local time formatted as ``YYYY-MM-DD HH:MM``.
    """
    return {
        **version(),
        "generated_on": datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
    }

453 

454 

def platform_info():
    """Get Python platform information: the Python version and the interpreter implementation in use.

    Args:
        none

    Returns:
        dict: ``python_version`` (from ``platform.python_version()``) and
        ``python_implementation`` (from ``platform.python_implementation()``).
    """
    return {
        "python_version": platform.python_version(),
        "python_implementation": platform.python_implementation(),
    }

470 

471 

def get_psl_modules():
    """Retrieve the modules that are part of the Python standard installation (PSL).

    Returns:
        dict: Generation metadata, platform information and, under
        ``psl_modules``, the result of ``get_standard_library_modules()``.
    """
    stdlib_modules = get_standard_library_modules()
    report = {**_generation_info(), **platform_info()}
    report["psl_modules"] = stdlib_modules
    return report

482 

483 

def get_module_vulnerability_info(module):
    """
    Retrieve vulnerability information for an external module using the OSV Database.

    Args:
        module (str): Name of the module to query.

    Returns:
        dict: Generation metadata plus the result of
        ``check_module_vulnerability(module)`` stored under the key
        ``"<module>_vulnerability_info"``.
    """
    findings = check_module_vulnerability(module)
    return {**_generation_info(), f"{module}_vulnerability_info": findings}

498 

499 

def egress_check(input_path):
    """Scan Python code for potential data egress or privacy leaks.

    This function is a thin wrapper: it forwards ``input_path`` to
    ``data_egress_scan`` and returns its result unchanged. Everything
    described below is therefore the behavior of ``data_egress_scan`` as
    documented for this API; it is not implemented in this module.

    The analysis is a static inspection of the Abstract Syntax Tree (AST)
    of the provided source and looks for patterns that may indicate
    privacy or data-egress risks.

    The input can refer to:
        - A local directory containing a Python package
        - A single Python file
        - A PyPI package name (the package will be downloaded and scanned)

    Args:
        input_path (str): Location of the Python code to analyze. This can be:
            - Path to a local Python package directory.
            - Path to a single `.py` file.
            - Name of a package published on PyPI.

    Returns:
        dict: Dictionary containing scan metadata and analysis results.
        The dictionary always includes basic metadata such as the tool
        name, version, and generation timestamp. Additional fields
        depend on the input type:

        **Directory or PyPI package input**
            - ``package_name``: Name of the scanned package.
            - ``package_release`` (PyPI only): Package version.
            - Package-level privacy findings.

        **Single file input**
            - ``file_name``: Name of the scanned file.
            - ``file_privacy_check``: Results of the file-level analysis.

        **Invalid input**
            - ``{"Error": "<message>"}``

    Raises:
        Nothing is raised by this wrapper itself. Errors are expected to be
        reported in the returned dictionary by ``data_egress_scan``
        (NOTE(review): confirm against that function).

    **Notes:**

    - The scan uses static AST analysis and does **not execute code**.
    - PyPI packages are downloaded to a temporary directory before scanning.
    - Temporary directories are automatically removed after the scan.
    - Only syntactically valid Python files that can be parsed into an AST
      are analyzed.

    Examples for API use:

    1. Scan a local Python file:

        >>> egress_check("script.py")

    2. Scan a local package directory:

        >>> egress_check("./my_package")

    3. Scan a package from PyPI:

        >>> egress_check("requests")

    """
    return data_egress_scan(input_path)

571 

572 

def get_construct_counts(input_file):
    """
    Count occurrences of code constructs (aka weaknesses) in a Python file or package (directory).

    The input is scanned with the `filescan` API call (default settings) and
    the number of occurrences of every construct is summed over all scanned
    files.

    Args:
        input_file (str): Path to the file or directory (package) to scan.

    Returns:
        dict: A dictionary mapping each construct name (str) to the total
        number of occurrences (int) across all scanned files.

    Notes:
        - The `filescan` result is expected to carry a 'file_security_info'
          key with per-file information; when it is absent (for example on
          an ``"Error"`` result) an empty dict is returned.
        - Each file's 'sast_result' should be a dictionary mapping
          construct names to lists of occurrences.
        - NOTE(review): unlike `get_weakness_counts`, this function does not
          validate the scan result or raise on scan errors.
    """
    totals = Counter()
    per_file = filescan(input_file).get("file_security_info", {})

    for entry in per_file.values():
        # Each value lists the places where the construct occurs in one file.
        for construct, hits in entry.get("sast_result", {}).items():
            totals[construct] = totals[construct] + len(hits)

    return dict(totals)