Coverage for src / codeaudit / api_interfaces.py: 22%
178 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-09 09:33 +0200
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-09 09:33 +0200
1"""
2License GPLv3 or higher.
4(C) 2025 Created by Maikel Mardjan - https://nocomplexity.com/
6This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
8This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
10You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
13Public API functions for Python Code Audit aka codeaudit on pypi.org
14"""
16import datetime
17import json
18import platform
19from collections import Counter
20from pathlib import Path
22import altair as alt
23import pandas as pd
25from codeaudit import __version__
26from codeaudit.checkmodules import (
27 check_module_vulnerability,
28 get_all_modules,
29 get_imported_modules_by_file,
30 get_standard_library_modules,
31)
32from codeaudit.filehelpfunctions import (
33 collect_python_source_files,
34 get_filename_from_path,
35 is_ast_parsable,
36)
37from codeaudit.privacy_lint import data_egress_scan
38from codeaudit.pypi_package_scan import get_package_source, get_pypi_download_info
39from codeaudit.security_checks import ast_security_checks, perform_validations
40from codeaudit.suppression import filter_sast_results
41from codeaudit.totals import (
42 get_statistics,
43 overview_count,
44 overview_per_file,
45 total_modules,
46)
def version():
    """Return the tool name and the installed Python Code Audit version.

    Returns:
        dict: ``{"name": "Python_Code_Audit", "version": <__version__>}``.
    """
    return {"name": "Python_Code_Audit", "version": __version__}
def filescan(input_path, nosec=False):
    """
    Scan a Python source file, a local directory, or a **PyPI package** from PyPI.org for
    security weaknesses and return the results as a JSON-serializable
    dictionary.

    This API function works on:

    - **Local directory**: Recursively scans all supported Python files in the
      directory.
    - **Single Python file**: Scans the file if it exists and can be parsed
      into an AST.
    - **PyPI package** on PyPI.org: Downloads the
      source distribution from PyPI, scans it, and cleans up temporary files.

    The returned output always includes Python Code Audit version information and a
    generation timestamp. For consistency, single-file scans are normalized
    to match the structure of directory/package scans.

    **Note:**
    The filescan command does NOT include all directories. This is done on purpose!
    The following directories are skipped by default:

    - `/docs`
    - `/docker`
    - `/dist`
    - `/tests`
    - all directories that start with . (dot) or _ (underscore)

    But you can easily change this if needed!

    Args:
        input_path (str): One of the following:
            - Path to a local directory containing Python code.
            - Path to a single ``.py`` file.
            - Name of a package available on PyPI.
        nosec (bool): When True, the per-file findings are passed through
            ``filter_sast_results`` before being reported. Defaults to False
            (findings are reported unfiltered).

    Returns:
        dict: A JSON-serializable dictionary containing scan results and
        metadata. The structure varies slightly depending on the scan type,
        but always includes:
            - Version information from ``version()``.
            - ``generated_on`` timestamp (``YYYY-MM-DD HH:MM``).
            - Package or file-level security findings.

        If the input cannot be interpreted as a valid directory, Python file,
        or PyPI package, or if the PyPI package offers no download URL,
        a dictionary with an ``"Error"`` key is returned.

    Raises:
        None explicitly. Any unexpected exceptions are allowed to propagate
        unless handled by downstream callers.

    Example:
        >>> result = filescan("example_package")
        >>> result["package_name"]

    """
    # Shared metadata (tool name, version, generation timestamp).
    output = _generation_info()
    file_path = Path(input_path)

    if file_path.is_dir():  # local directory scan
        output |= {"package_name": get_filename_from_path(input_path)}
        output |= _codeaudit_directory_scan(input_path, nosec_flag=nosec)
        return output

    if (
        file_path.suffix.lower() == ".py"
        and file_path.is_file()
        and is_ast_parsable(input_path)
    ):  # parseable single Python file
        file_information = overview_per_file(input_path)
        module_information = get_modules(input_path)  # modules per file
        scan_output = _codeaudit_scan(input_path, nosec_flag=nosec)
        # A single file is stored under index "0" so that consumers can treat
        # the result of a file scan and a package scan the same way.
        file_output = {"0": file_information | module_information | scan_output}
        output |= {"file_security_info": file_output}
        return output

    if pypi_data := get_pypi_download_info(input_path):
        package_name = input_path  # input_path is the PyPI package name here
        url = pypi_data["download_url"]
        release = pypi_data["release"]
        if url is None:
            # Previously this path fell through and returned None; report it
            # as an error dictionary like every other failure case.
            return {
                "Error": f"No downloadable source distribution found on PyPI for package {package_name}."
            }
        src_dir, tmp_handle = get_package_source(url)
        output |= {"package_name": package_name, "package_release": release}
        try:
            output |= _codeaudit_directory_scan(src_dir, nosec_flag=nosec)
        finally:
            # Always remove the temporary download directory, even on failure.
            tmp_handle.cleanup()
        return output

    # Not a directory, not a valid Python file and not a known PyPI package.
    return {
        "Error": "File is not a *.py file, does not exist or is not a valid directory path towards a Python package."
    }
def _codeaudit_scan(filename, nosec_flag):
    """Internal helper: run the SAST validations on one file.

    Use the public `filescan` API call to scan a file or a Python package.

    Args:
        filename: Path of the Python file to scan.
        nosec_flag: When truthy, the raw findings are passed through
            ``filter_sast_results`` before being returned.

    Returns:
        dict: ``{"file_name": ..., "sast_result": ...}`` where ``sast_result``
        is the ``"result"`` mapping of the scan, sorted by key.
    """
    short_name = get_filename_from_path(filename)
    findings = perform_validations(filename)  # scans for weaknesses in the file
    if nosec_flag:
        findings = filter_sast_results(findings)
    ordered_findings = dict(sorted(findings["result"].items()))
    return {"file_name": short_name, "sast_result": ordered_findings}
def _codeaudit_directory_scan(input_path, nosec_flag):
    """Scan every collected Python file in a directory.

    Also used for PyPI packages, which are unpacked into a temporary
    directory before being scanned.

    Args:
        input_path: Directory to scan.
        nosec_flag: Forwarded to ``_codeaudit_scan`` for every file.

    Returns:
        dict: ``statistics_overview``, ``module_overview`` and
        ``file_security_info`` (keyed by the file's index), or a dictionary
        with an ``"Error"`` key when too few Python files are found.
    """
    python_files = collect_python_source_files(input_path)
    # NOTE(review): a directory holding exactly one Python file also ends up
    # here and is reported as containing no Python files - confirm intended.
    if len(python_files) <= 1:
        return {"Error": f"Directory path {input_path} contains no Python files."}

    all_modules = get_all_modules(input_path)  # modules for the whole package
    statistics = get_overview(input_path)
    per_file = {
        index: overview_per_file(path)
        | get_modules(path)
        | _codeaudit_scan(path, nosec_flag)
        for index, path in enumerate(python_files)
    }
    return {
        "statistics_overview": statistics,
        "module_overview": all_modules,
        "file_security_info": per_file,
    }
def save_to_json(sast_result, filename="codeaudit_output.json"):
    """
    Save a SAST result (dict or serializable object) to a JSON file.

    The data is serialized before the target file is opened, so a
    serialization failure does not leave a truncated or partial file behind.

    Args:
        sast_result (dict or list): The data to be saved as JSON.
        filename (str, optional): The file path to save the JSON data.
            Defaults to "codeaudit_output.json".

    Returns:
        Path: The absolute path of the saved file, or None if saving failed.
        Failures are reported on stdout instead of being raised.
    """
    filepath = Path(filename).expanduser().resolve()

    try:
        payload = json.dumps(sast_result, indent=2, ensure_ascii=False)
        filepath.parent.mkdir(parents=True, exist_ok=True)  # ensure directory exists
        filepath.write_text(payload, encoding="utf-8")
    except (TypeError, ValueError) as e:
        print(f"[Error] Failed to serialize data to JSON: {e}")
        return None
    except OSError as e:
        print(f"[Error] Failed to write file '{filepath}': {e}")
        return None
    return filepath
def read_input_file(filename, safe_directory="data_folder"):
    """
    Securely read a Python CodeAudit JSON file and return its contents as a dictionary.

    Both paths are expanded and resolved before they are compared, so
    ``..`` segments and symlinks cannot be used to step outside the safe
    directory.

    Args:
        filename: Path to the JSON file (str or Path).
        safe_directory: Base directory considered "safe" for reading files.
            A relative value is resolved against the current working directory.

    Returns:
        dict: The contents of the JSON file.

    Raises:
        FileNotFoundError: If the file does not exist.
        PermissionError: If the file is outside the allowed safe directory.
        json.JSONDecodeError: If the file is not valid JSON. The original
            decoding error is attached as ``__cause__``.
    """
    file_path = Path(filename).expanduser().resolve()
    base_dir = Path(safe_directory).expanduser().resolve()

    # Security check: ensure the file is within the safe directory
    if not file_path.is_relative_to(base_dir):
        raise PermissionError(
            f"Access denied: {file_path} is outside the safe directory"
        )

    # Ensure the file exists and is a file
    if not file_path.is_file():
        raise FileNotFoundError(f"File not found or not a regular file: {file_path}")

    try:
        return json.loads(file_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        # Re-raise with the file path in the message, keeping the original
        # error explicitly chained for debugging.
        raise json.JSONDecodeError(
            f"Invalid JSON in file: {file_path}", e.doc, e.pos
        ) from e
def get_weakness_counts(input_file, nosec=False):
    """
    Count how often each weakness construct occurs in a file or package.

    The input is scanned with the `filescan` API call; the per-file
    ``sast_result`` mappings are then summed into one total per construct.
    Entries that do not have the expected shape are skipped silently.

    Args:
        input_file (str): Path to the file or directory (package) to scan.
        nosec (bool): Forwarded to `filescan` as its ``nosec`` argument.

    Returns:
        dict: Construct name (str) mapped to its total number of
        occurrences (int). Empty when the scan result carries no
        ``file_security_info`` dictionary.

    Raises:
        ValueError: If `filescan` does not return a dictionary, or returns
            a dictionary with an ``"Error"`` key.
    """
    scan_result = filescan(input_file, nosec)

    # Explicitly handle scan failure or unexpected return
    if not isinstance(scan_result, dict):
        raise ValueError("filescan() did not return a valid result dictionary")
    if "Error" in scan_result:
        raise ValueError(scan_result["Error"])

    per_file = scan_result.get("file_security_info")
    if not isinstance(per_file, dict):
        # Valid scan, but nothing to count
        return {}

    totals = Counter()
    sast_results = (
        info.get("sast_result", {})
        for info in per_file.values()
        if isinstance(info, dict)
    )
    for sast_result in sast_results:
        if isinstance(sast_result, dict):
            for construct, hits in sast_result.items():
                if isinstance(hits, (list, tuple)):
                    totals[construct] += len(hits)

    return dict(totals)
321# def get_weakness_counts(input_file , nosec=False):
322# """
323# Analyze a Python file or package(directory) and count occurrences of code weaknesses.
325# This function uses `filescan` API call to retrieve security-related information
326# about the input file. This returns a dict. Then it counts how many times each code construct
327# appears across all scanned files.
329# Args:
330# input_file (str): Path to the file or directory(package) to scan.
332# Returns:
333# dict: A dictionary mapping each construct name (str) to the total
334# number of occurrences (int) across all scanned files.
336# Notes:
337# - The `filescan` function is expected to return a dictionary with
338# a 'file_security_info' key, containing per-file information.
339# - Each file's 'sast_result' should be a dictionary mapping
340# construct names to lists of occurrences.
341# """
342# scan_result = filescan(input_file, nosec)
343# counter = Counter()
345# for file_info in scan_result.get('file_security_info', {}).values():
346# sast_result = file_info.get('sast_result', {})
347# for construct, occurrence in sast_result.items(): #occurrence is times the construct appears in a single file
348# counter[construct] += len(occurrence)
350# return dict(counter)
def get_modules(filename):
    """Return the module information of one Python file.

    Thin wrapper around ``get_imported_modules_by_file``.
    """
    return get_imported_modules_by_file(filename)
def get_overview(input_path):
    """Retrieve the security relevant statistics of a Python package (directory) or of a single Python file.

    Args:
        input_path: Directory path of the package, or path of a single
            ``.py`` file.

    Returns:
        dict: The overview statistics. For a directory this is the single
        row of the overview DataFrame; for a file it is the output of
        ``overview_per_file``. A dictionary with an ``"Error"`` key is
        returned when the input is neither, or when the directory holds
        too few Python files.
    """
    target = Path(input_path)

    if target.is_dir():
        python_files = collect_python_source_files(input_path)
        if len(python_files) <= 1:
            return {"Error": f"Directory path {input_path} contains no Python files."}

        statistics = get_statistics(input_path)
        modules = total_modules(input_path)
        stats_df = pd.DataFrame(statistics)
        # Both module columns are needed for the correct overall count.
        for column in ("Std-Modules", "External-Modules"):
            stats_df[column] = modules[column]
        records = overview_count(stats_df).to_dict(orient="records")
        return records[0]  # the overview DataFrame has only one row

    is_python_file = (
        target.suffix.lower() == ".py"
        and target.is_file()
        and is_ast_parsable(input_path)
    )
    if is_python_file:
        return overview_per_file(input_path)

    # Neither a directory nor a valid Python file.
    return {
        "Error": "File is not a *.py file, does not exist or is not a valid directory path to a Python package."
    }
def get_default_validations():
    """Return the default implemented security validations.

    The validation definitions come from ``ast_security_checks()`` as a
    DataFrame, are converted with ``to_dict(orient="records")`` and are
    combined with the generation metadata from ``_generation_info()``.

    Returns:
        dict: The generation metadata plus a ``"validations"`` key holding
        a list with one dictionary per validation:

            {
                "<metadata_key>": <metadata_value>,
                ...,
                "validations": [
                    {"<field>": <value>, ...},
                    ...
                ]
            }

    **Notes**:

    - Requires Python 3.9 or later due to use of the dictionary union operator (`|`).
    """
    validation_records = ast_security_checks().to_dict(orient="records")
    return _generation_info() | {"validations": validation_records}
def _generation_info():
    """Internal helper: build the metadata shared by the API outputs.

    Returns:
        dict: The output of ``version()`` extended with a ``generated_on``
        timestamp formatted as ``YYYY-MM-DD HH:MM``.
    """
    metadata = version()
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    return metadata | {"generated_on": stamp}
def platform_info():
    """Get Python platform information: the Python version and the interpreter implementation in use.

    Returns:
        dict: ``python_version`` (from ``platform.python_version()``) and
        ``python_implementation`` (from ``platform.python_implementation()``).
    """
    return {
        "python_version": platform.python_version(),
        "python_implementation": platform.python_implementation(),
    }
def get_psl_modules():
    """Retrieve the Python Standard Library (PSL) modules of the Python version in use.

    Returns:
        dict: Generation metadata, platform information and the module
        collection under the ``psl_modules`` key.
    """
    psl_modules = get_standard_library_modules()
    return {**_generation_info(), **platform_info(), "psl_modules": psl_modules}
def get_module_vulnerability_info(module):
    """
    Retrieve vulnerability information for an external module using the OSV Database.

    Args:
        module (str): Name of the module to query.

    Returns:
        dict: Generation metadata plus the lookup result stored under the
        key ``"<module>_vulnerability_info"``.
    """
    vuln_info = check_module_vulnerability(module)
    report = _generation_info()
    report[f"{module}_vulnerability_info"] = vuln_info
    return report
def egress_check(input_path):
    """Scan Python code for potential data egress or privacy leaks.

    Thin public wrapper: the input is handed unchanged to
    ``data_egress_scan`` and its result is returned as is. The behaviour
    described below is that of the underlying scan.

    The scan is a static analysis based on Abstract Syntax Tree (AST)
    inspection and looks for patterns that may indicate privacy or
    data-egress risks.

    Args:
        input_path (str): Location of the Python code to analyze. This can be:
            - Path to a local Python package directory.
            - Path to a single `.py` file.
            - Name of a package published on PyPI (the package will be
              downloaded and scanned).

    Returns:
        dict: Dictionary containing scan metadata and analysis results.
        The dictionary always includes basic metadata such as the tool
        name, version, and generation timestamp. Additional fields
        depend on the input type:

        **Directory or PyPI package input**
            - ``package_name``: Name of the scanned package.
            - ``package_release`` (PyPI only): Package version.
            - Package-level privacy findings.

        **Single file input**
            - ``file_name``: Name of the scanned file.
            - ``file_privacy_check``: Results of the file-level analysis.

        **Invalid input**
            - ``{"Error": "<message>"}``

    Raises:
        None: Errors are reported in the returned dictionary instead of
        being raised.

    **Notes:**

    - The scan uses static AST analysis and does **not execute code**.
    - PyPI packages are downloaded to a temporary directory before scanning.
    - Temporary directories are automatically removed after the scan.
    - Only syntactically valid Python files that can be parsed into an AST
      are analyzed.

    Examples for API use:

    1. Scan a local Python file:

        >>> egress_check("script.py")

    2. Scan a local package directory:

        >>> egress_check("./my_package")

    3. Scan a package from PyPI:

        >>> egress_check("requests")

    """
    return data_egress_scan(input_path)
def get_construct_counts(input_file):
    """
    Count occurrences of code constructs (aka weaknesses) in a Python file or package (directory).

    The input is scanned with the `filescan` API call (default options) and
    the per-file findings are summed into one total per construct.

    Args:
        input_file (str): Path to the file or directory (package) to scan.

    Returns:
        dict: A dictionary mapping each construct name (str) to the total
        number of occurrences (int) across all scanned files.

    Notes:
        - The `filescan` result is expected to be a dictionary with a
          'file_security_info' key containing per-file information; when
          the key is absent the result is an empty dictionary.
        - Each file's 'sast_result' should be a dictionary mapping
          construct names to lists of occurrences.
    """
    scan_result = filescan(input_file)
    totals = Counter()

    for file_info in scan_result.get("file_security_info", {}).values():
        # One entry per construct: how often it appears in this file.
        totals.update(
            {
                construct: len(hits)
                for construct, hits in file_info.get("sast_result", {}).items()
            }
        )

    return dict(totals)