Source code for pain001.csv.load_csv_data
# Copyright (C) 2023-2026 Sebastien Rousseau.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import logging
import os
from collections.abc import Generator
from typing import Any
from pain001.exceptions import DataSourceError
from pain001.security import sanitize_for_log, validate_path # noqa: PYI100
# Configure the root logger when this module is imported: ERROR level and a
# "LEVELNAME: message" line format.
# NOTE(review): logging.basicConfig() does nothing if the root logger already
# has handlers, so an application that set up logging before importing this
# module keeps its own configuration.
logging.basicConfig(level=logging.ERROR, format="%(levelname)s: %(message)s")
[docs]
def load_csv_data(file_path: str) -> list[dict[str, Any]]:
    """Load every row of a CSV file into memory.

    The path is first checked with ``validate_path`` against the current
    working directory, then the file is decoded as UTF-8 and parsed with
    ``csv.DictReader`` (the first row supplies the dictionary keys).

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        list: One dictionary per data row, in file order.

    Raises:
        FileNotFoundError: If the validated path is not a regular file.
        OSError: If there is an issue opening or reading the file.
        UnicodeDecodeError: If the file's content is not valid UTF-8.
        DataSourceError: If the CSV file contains no data rows.
        Exception: Any error raised by ``validate_path`` is logged and
            re-raised unchanged.

    Note:
        For large files, consider using load_csv_data_streaming() to reduce
        memory footprint.
    """
    # Pre-validate the file path (CodeQL: prevent path traversal).
    # CSV file access is restricted to the current working directory.
    try:
        safe_path = validate_path(
            file_path,
            must_exist=True,
            base_dir=os.getcwd(),
        )  # nosec B108 - Returns sanitized string
    except Exception as exc:
        # Sanitize at sink (CWE-117: Log Injection prevention). The exception
        # text may echo the caller-supplied path, so it is sanitized as well.
        logging.error(
            "Path validation failed: "
            f"{sanitize_for_log(str(file_path))} - "
            f"{sanitize_for_log(str(exc))}"
        )
        raise

    # validate_path only guarantees existence; reject directories and other
    # non-regular files explicitly.
    if not os.path.isfile(safe_path):
        # Sanitize at sink (CWE-117: Log Injection prevention)
        logging.error(f"File not found: {sanitize_for_log(str(file_path))}")
        raise FileNotFoundError(
            f"File '{sanitize_for_log(str(file_path))}' not found."
        )

    try:
        # newline="" is what the csv module documentation requires: it lets
        # the parser handle line endings itself, so line breaks embedded in
        # quoted fields are preserved as written.
        with open(
            safe_path, encoding="utf-8", newline=""
        ) as file:  # nosec B108
            data: list[dict[str, Any]] = list(csv.DictReader(file))
    except OSError:
        # Sanitize at sink (CWE-117: Log Injection prevention)
        logging.error(
            f"IOError reading file: {sanitize_for_log(str(file_path))}"
        )
        raise
    except UnicodeDecodeError:
        # Sanitize at sink (CWE-117: Log Injection prevention)
        logging.error(
            f"UnicodeDecodeError decoding file: {sanitize_for_log(str(file_path))}"
        )
        raise

    if not data:
        # Sanitized like every other message built from file_path, matching
        # load_csv_data_streaming().
        raise DataSourceError(
            f"The CSV file '{sanitize_for_log(str(file_path))}' is empty."
        )
    return data
[docs]
def load_csv_data_streaming(
    file_path: str, chunk_size: int = 1000
) -> Generator[list[dict[str, Any]], None, None]:
    """Load CSV data from a file in chunks for memory-efficient processing.

    This function yields chunks of CSV data instead of loading the entire
    file into memory, making it suitable for large files.

    Because this is a generator, nothing runs until the first chunk is
    requested: path validation, opening the file, and every error listed
    below happen during iteration, not when the function is called.

    Args:
        file_path (str): The path to the CSV file.
        chunk_size (int): Number of rows to yield per chunk. Default is 1000.
            Values below 1 behave like 1 (each row is yielded on its own).

    Yields:
        list: A list of up to chunk_size dictionaries, one per CSV row. Only
        the final chunk may be shorter.

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If there is an issue opening or reading the file.
        UnicodeDecodeError: If the file's content is not valid UTF-8.
        DataSourceError: If the CSV file contains no data rows (raised once
            the file has been read to the end).
        Exception: Any error raised by ``validate_path`` is logged and
            re-raised unchanged.

    Example:
        >>> for chunk in load_csv_data_streaming('large_file.csv', chunk_size=500):
        ...     # Process chunk
        ...     process_payment_batch(chunk)

    Performance:
        - At most chunk_size rows are buffered by this function at a time
        - The whole file is never materialised, unlike load_csv_data()
    """
    try:
        # CodeQL: Prevent path traversal. Access is restricted to the
        # current working directory.
        safe_path = validate_path(
            file_path,
            must_exist=True,
            base_dir=os.getcwd(),
        )  # nosec B108
    except Exception as exc:
        # Sanitize at sink (CWE-117: Log Injection prevention). The exception
        # text may echo the caller-supplied path, so it is sanitized as well.
        logging.error(
            "Path validation failed: "
            f"{sanitize_for_log(str(file_path))} - "
            f"{sanitize_for_log(str(exc))}"
        )
        raise

    chunk: list[dict[str, Any]] = []
    row_count = 0
    try:
        # newline="" is what the csv module documentation requires: it lets
        # the parser handle line endings itself, so line breaks embedded in
        # quoted fields are preserved as written.
        with open(safe_path, encoding="utf-8", newline="") as file:
            for row in csv.DictReader(file):
                chunk.append(row)
                row_count += 1
                if len(chunk) >= chunk_size:
                    yield chunk
                    # Rebind rather than clear(): the consumer may still
                    # hold a reference to the list just yielded.
                    chunk = []
            # Yield remaining rows
            if chunk:
                yield chunk
    except FileNotFoundError:
        # Sanitize at sink (CWE-117: Log Injection prevention)
        logging.error(f"File '{sanitize_for_log(str(file_path))}' not found.")
        raise
    except OSError:
        # Sanitize at sink (CWE-117: Log Injection prevention)
        logging.error(
            f"An IOError occurred while reading the file '{sanitize_for_log(str(file_path))}'."
        )
        raise
    except UnicodeDecodeError:
        # Sanitize at sink (CWE-117: Log Injection prevention)
        logging.error(
            f"A UnicodeDecodeError occurred while decoding the file '{sanitize_for_log(str(file_path))}'."
        )
        raise

    if row_count == 0:
        # Sanitize at sink (CWE-117: Log Injection prevention)
        raise DataSourceError(
            f"The CSV file '{sanitize_for_log(str(file_path))}' is empty."
        )