Coverage for src / ezxl / io / _converters.py: 84%
45 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-29 22:41 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-29 22:41 +0000
1# ///////////////////////////////////////////////////////////////
2# _converters - Format conversion utilities
3# Project: EzXl
4# ///////////////////////////////////////////////////////////////
6"""Format conversion utilities for Excel and CSV files.
8Provides read and export paths backed by polars for high-throughput
9data processing on closed files (no running Excel process required):
11- ``read_excel``: read an ``.xlsx`` file into a polars DataFrame.
12- ``read_csv``: read a ``.csv`` file into a polars DataFrame.
13- ``xlsx_to_csv``: convert an Excel sheet to CSV via polars.
14- ``csv_to_xlsx``: convert a CSV file to an ``.xlsx`` file via polars.
15- ``read_sheet``: compatibility shim — returns ``list[list[Any]]`` for
16 callers that expect the legacy row-major format.
18All functions operate on **closed** files and require no running Excel
19process. polars delegates Excel I/O to ``fastexcel`` (a Rust-based
20engine bundled with polars extras) which provides performance comparable
21to the former ``python-calamine`` path.
22"""
24from __future__ import annotations
26# ///////////////////////////////////////////////////////////////
27# IMPORTS
28# ///////////////////////////////////////////////////////////////
29# Standard library imports
30from pathlib import Path
31from typing import Any
33# Third-party imports
34import polars as pl
35from ezplog.lib_mode import get_logger, get_printer
37# ///////////////////////////////////////////////////////////////
38# CONSTANTS
39# ///////////////////////////////////////////////////////////////
41logger = get_logger(__name__)
42printer = get_printer()
44# ///////////////////////////////////////////////////////////////
45# FUNCTIONS
46# ///////////////////////////////////////////////////////////////
49def read_excel(
50 source: str | Path,
51 sheet: str | None = None,
52) -> pl.DataFrame:
53 """Read an Excel workbook sheet into a polars DataFrame.
55 Delegates to ``polars.read_excel`` which uses ``fastexcel`` (Rust)
56 under the hood. No running Excel process is required.
58 Args:
59 source: Path to the source ``.xlsx`` / ``.xlsm`` file.
60 sheet: Worksheet name to read. Pass ``None`` to read the first
61 sheet (polars default when ``sheet_name`` is omitted).
63 Returns:
64 pl.DataFrame: Contents of the requested sheet as a polars
65 DataFrame, with the first row used as column headers.
67 Raises:
68 FileNotFoundError: If ``source`` does not exist.
69 ImportError: If polars (or its ``fastexcel`` extra) is not
70 installed.
72 Example:
73 >>> df = read_excel("report.xlsx", sheet="Data")
74 >>> print(df.head())
75 """
76 source_path = Path(source).resolve()
78 if not source_path.exists():
79 raise FileNotFoundError(f"Source file not found: {source_path}")
81 logger.debug("read_excel: %s (sheet=%r)", source_path, sheet)
83 df: pl.DataFrame = pl.read_excel(source_path, sheet_name=sheet)
85 logger.debug("read_excel: read %d rows from '%s'.", len(df), source_path)
86 return df
89def read_csv(
90 source: str | Path,
91 separator: str = ",",
92 encoding: str = "utf-8",
93) -> pl.DataFrame:
94 """Read a CSV file into a polars DataFrame.
96 Args:
97 source: Path to the source ``.csv`` file.
98 separator: Column delimiter character. Defaults to ``","``
99 (standard CSV). Use ``"\\t"`` for TSV files.
100 encoding: File encoding passed through to polars. Defaults to
101 ``"utf-8"``.
103 Returns:
104 pl.DataFrame: Parsed contents of the CSV file.
106 Raises:
107 FileNotFoundError: If ``source`` does not exist.
109 Example:
110 >>> df = read_csv("transactions.csv", separator=";")
111 >>> print(df.schema)
112 """
113 source_path = Path(source).resolve()
115 if not source_path.exists():
116 raise FileNotFoundError(f"Source file not found: {source_path}")
118 logger.debug("read_csv: %s (sep=%r, enc=%r)", source_path, separator, encoding)
120 df: pl.DataFrame = pl.read_csv(
121 source_path,
122 separator=separator,
123 encoding=encoding,
124 )
126 logger.debug("read_csv: read %d rows from '%s'.", len(df), source_path)
127 return df
130def xlsx_to_csv(
131 source: str | Path,
132 dest: str | Path,
133 sheet: str | None = None,
134 separator: str = ",",
135) -> None:
136 """Convert an Excel workbook sheet to a CSV file using polars.
138 Supersedes both the former ``xlsx_to_csv`` (openpyxl) and
139 ``xlsx_to_csv_fast`` (python-calamine) functions. polars uses
140 ``fastexcel`` (Rust) for the read step, providing the same
141 high-throughput characteristics as the former fast path.
143 Args:
144 source: Path to the source ``.xlsx`` / ``.xlsm`` file.
145 dest: Destination ``.csv`` file path. Parent directories must
146 exist.
147 sheet: Worksheet name to export. Pass ``None`` to use the
148 first sheet.
149 separator: Column delimiter for the CSV output. Defaults to
150 ``","`` (standard CSV).
152 Raises:
153 FileNotFoundError: If ``source`` does not exist.
155 Example:
156 >>> xlsx_to_csv("data.xlsx", "data.csv", sheet="Transactions")
157 >>> xlsx_to_csv("data.xlsx", "data.tsv", separator="\\t")
158 """
159 dest_path = Path(dest).resolve()
161 logger.debug(
162 "xlsx_to_csv: %s → %s (sheet=%r, sep=%r)",
163 Path(source).resolve(),
164 dest_path,
165 sheet,
166 separator,
167 )
169 df = read_excel(source, sheet=sheet)
170 df.write_csv(dest_path, separator=separator)
172 logger.debug("xlsx_to_csv: completed — wrote %s", dest_path)
173 printer.success(f"xlsx_to_csv: conversion complete — {dest_path}")
176def csv_to_xlsx(
177 source: str | Path,
178 dest: str | Path,
179 sheet_name: str = "Sheet1",
180) -> None:
181 """Convert a CSV file to an Excel workbook using polars.
183 Reads the CSV with polars and writes it as an ``.xlsx`` file.
184 polars delegates the Excel write step to ``xlsxwriter`` or
185 ``openpyxl`` depending on which is installed; no additional
186 configuration is required.
188 Args:
189 source: Path to the source ``.csv`` file.
190 dest: Destination ``.xlsx`` file path. Parent directories must
191 exist.
192 sheet_name: Name of the worksheet to create in the output
193 workbook. Defaults to ``"Sheet1"``.
195 Raises:
196 FileNotFoundError: If ``source`` does not exist.
198 Example:
199 >>> csv_to_xlsx("transactions.csv", "transactions.xlsx", sheet_name="Data")
200 """
201 dest_path = Path(dest).resolve()
203 logger.debug(
204 "csv_to_xlsx: %s → %s (sheet=%r)",
205 Path(source).resolve(),
206 dest_path,
207 sheet_name,
208 )
210 df = read_csv(source)
211 df.write_excel(dest_path, worksheet=sheet_name)
213 logger.debug("csv_to_xlsx: completed — wrote %s", dest_path)
214 printer.success(f"csv_to_xlsx: conversion complete — {dest_path}")
217def read_sheet(
218 source: str | Path,
219 sheet: str | None = None,
220) -> list[list[Any]]:
221 """Read a worksheet into a row-major list of lists (compatibility shim).
223 Wraps ``read_excel`` and converts the resulting polars DataFrame to
224 a ``list[list[Any]]`` via ``DataFrame.rows()``. The first row
225 contains the column headers as extracted by polars.
227 This function exists for backwards compatibility with callers that
228 pre-date the polars migration. New code should use ``read_excel``
229 directly to benefit from the full polars API.
231 Args:
232 source: Path to the source ``.xlsx`` / ``.xlsm`` file.
233 sheet: Worksheet name to read. Pass ``None`` to use the first
234 sheet.
236 Returns:
237 list[list[Any]]: Row-major 2D list of cell values. The first
238 row contains column headers; subsequent rows contain data
239 values. Empty cells are represented as ``None``.
241 Raises:
242 FileNotFoundError: If ``source`` does not exist.
244 Example:
245 >>> data = read_sheet("report.xlsx", sheet="Data")
246 >>> headers = data[0]
247 >>> rows = data[1:]
248 """
249 logger.debug("read_sheet: %s (sheet=%r) — delegating to read_excel", source, sheet)
251 df = read_excel(source, sheet=sheet)
253 # Prepend column names as the first row to preserve the legacy contract
254 # where callers expected headers in row 0.
255 header_row: list[Any] = list(df.columns)
256 data_rows: list[list[Any]] = [list(row) for row in df.rows()]
258 result: list[list[Any]] = [header_row, *data_rows]
260 logger.debug("read_sheet: returning %d rows (incl. header).", len(result))
261 return result