Coverage for src / ezxl / io / _converters.py: 87.72%
51 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-29 15:53 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-29 15:53 +0000
1# ///////////////////////////////////////////////////////////////
2# _converters - Format conversion utilities
3# Project: EzXl
4# ///////////////////////////////////////////////////////////////
6"""Format conversion utilities for Excel and CSV files.
8Provides read and export paths backed by polars for high-throughput
9data processing on closed files (no running Excel process required):
11- ``read_excel``: read an ``.xlsx`` file into a polars DataFrame.
12- ``read_csv``: read a ``.csv`` file into a polars DataFrame.
13- ``xlsx_to_csv``: convert an Excel sheet to CSV via polars.
14- ``csv_to_xlsx``: convert a CSV file to an ``.xlsx`` file via polars.
15- ``read_sheet``: compatibility shim — returns ``list[list[Any]]`` for
16 callers that expect the legacy row-major format.
18All functions operate on **closed** files and require no running Excel
19process. polars delegates Excel I/O to ``fastexcel`` (a Rust-based
20engine bundled with polars extras) which provides performance comparable
21to the former ``python-calamine`` path.
22"""
24from __future__ import annotations
26# ///////////////////////////////////////////////////////////////
27# IMPORTS
28# ///////////////////////////////////////////////////////////////
29# Standard library imports
30from pathlib import Path
31from typing import Any
33# Third-party imports
34import polars as pl
35from ezplog.lib_mode import get_logger, get_printer
37# ///////////////////////////////////////////////////////////////
38# CONSTANTS
39# ///////////////////////////////////////////////////////////////
# Module-level logger named after this module; the functions below use it
# for debug tracing of each read / conversion step.
logger = get_logger(__name__)
# Shared printer; the conversion functions call ``printer.success`` once the
# output file has been written.
printer = get_printer()
44# ///////////////////////////////////////////////////////////////
45# FUNCTIONS
46# ///////////////////////////////////////////////////////////////
def read_excel(
    source: str | Path,
    sheet: str | None = None,
    *pl_args: Any,
    **pl_kwargs: Any,
) -> pl.DataFrame:
    """Load a worksheet of an Excel workbook as a polars DataFrame.

    The file is read from disk through ``polars.read_excel``; the workbook
    does not need to be open in Excel.

    Args:
        source: Location of the workbook to read. The path is resolved to
            an absolute path before use.
        sheet: Name of the worksheet to load. When ``None`` no
            ``sheet_name`` is injected and polars applies its own default.
        *pl_args: Extra positional arguments passed straight through to
            ``polars.read_excel`` after the resolved path.
        **pl_kwargs: Extra keyword arguments passed straight through to
            ``polars.read_excel``. An explicit ``sheet_name`` given here
            takes precedence over ``sheet``.

    Returns:
        pl.DataFrame: Whatever ``polars.read_excel`` produced for the
        requested sheet.

    Raises:
        FileNotFoundError: If the resolved ``source`` path does not exist.

    Example:
        >>> df = read_excel("report.xlsx", sheet="Data")
        >>> print(df.head())
    """
    workbook_path = Path(source).resolve()
    if not workbook_path.exists():
        raise FileNotFoundError(f"Source file not found: {workbook_path}")

    logger.debug("read_excel: %s (sheet=%r)", workbook_path, sheet)

    # Only fill in ``sheet_name`` when the caller asked for a sheet and did
    # not already choose one through the pass-through keywords.
    if not (sheet is None or "sheet_name" in pl_kwargs):
        pl_kwargs["sheet_name"] = sheet

    frame: pl.DataFrame = pl.read_excel(workbook_path, *pl_args, **pl_kwargs)
    logger.debug("read_excel: read %d rows from '%s'.", len(frame), workbook_path)
    return frame
def read_csv(
    source: str | Path,
    separator: str = ",",
    encoding: str = "utf-8",
    *pl_args: Any,
    **pl_kwargs: Any,
) -> pl.DataFrame:
    """Load a CSV file as a polars DataFrame.

    Args:
        source: Location of the CSV file. The path is resolved to an
            absolute path before use.
        separator: Column delimiter handed to polars. Defaults to ``","``;
            use ``"\\t"`` for tab-separated files.
        encoding: Text encoding handed to polars. Defaults to ``"utf-8"``.
        *pl_args: Extra positional arguments passed straight through to
            ``polars.read_csv`` after the resolved path.
        **pl_kwargs: Extra keyword arguments passed straight through to
            ``polars.read_csv``. ``separator`` / ``encoding`` keys given
            here override the dedicated parameters of the same name.

    Returns:
        pl.DataFrame: Parsed contents of the CSV file.

    Raises:
        FileNotFoundError: If the resolved ``source`` path does not exist.

    Example:
        >>> df = read_csv("transactions.csv", separator=";")
        >>> print(df.schema)
    """
    csv_path = Path(source).resolve()
    if not csv_path.exists():
        raise FileNotFoundError(f"Source file not found: {csv_path}")

    logger.debug("read_csv: %s (sep=%r, enc=%r)", csv_path, separator, encoding)

    # Defaults first, caller keywords last: anything the caller passed in
    # ``pl_kwargs`` wins over the dedicated parameters.
    read_options = {"separator": separator, "encoding": encoding, **pl_kwargs}

    frame: pl.DataFrame = pl.read_csv(csv_path, *pl_args, **read_options)
    logger.debug("read_csv: read %d rows from '%s'.", len(frame), csv_path)
    return frame
def xlsx_to_csv(
    source: str | Path,
    dest: str | Path,
    sheet: str | None = None,
    separator: str = ",",
    *pl_write_args: Any,
    **pl_write_kwargs: Any,
) -> None:
    """Export one worksheet of an Excel workbook to a CSV file.

    The sheet is loaded with ``read_excel`` and written out with
    ``DataFrame.write_csv``. This single function replaces the earlier
    ``xlsx_to_csv`` (openpyxl) and ``xlsx_to_csv_fast`` (python-calamine)
    variants.

    Args:
        source: Location of the workbook to read.
        dest: Location of the CSV file to write. The path is resolved to
            an absolute path; parent directories are not created here.
        sheet: Name of the worksheet to export, or ``None`` to let
            ``read_excel`` fall back to its default sheet.
        separator: Column delimiter for the output. Defaults to ``","``.
        *pl_write_args: Extra positional arguments passed straight through
            to ``DataFrame.write_csv`` after the destination path.
        **pl_write_kwargs: Extra keyword arguments passed straight through
            to ``DataFrame.write_csv``. A ``separator`` key given here
            overrides the ``separator`` parameter.

    Raises:
        FileNotFoundError: If ``source`` does not exist (raised by
            ``read_excel``).

    Example:
        >>> xlsx_to_csv("data.xlsx", "data.csv", sheet="Transactions")
        >>> xlsx_to_csv("data.xlsx", "data.tsv", separator="\\t")
    """
    target_path = Path(dest).resolve()
    origin_path = Path(source).resolve()  # resolved for the log line only

    logger.debug(
        "xlsx_to_csv: %s → %s (sheet=%r, sep=%r)",
        origin_path,
        target_path,
        sheet,
        separator,
    )

    frame = read_excel(source, sheet=sheet)

    # Default separator first, caller keywords last so the caller wins.
    write_options = {"separator": separator, **pl_write_kwargs}
    frame.write_csv(target_path, *pl_write_args, **write_options)

    logger.debug("xlsx_to_csv: completed — wrote %s", target_path)
    printer.success(f"xlsx_to_csv: conversion complete — {target_path}")
def csv_to_xlsx(
    source: str | Path,
    dest: str | Path,
    sheet_name: str = "Sheet1",
    *pl_write_args: Any,
    **pl_write_kwargs: Any,
) -> None:
    """Convert a CSV file to an Excel workbook using polars.

    The CSV is loaded with ``read_csv`` (default separator and encoding)
    and written out with ``DataFrame.write_excel``, which relies on the
    ``xlsxwriter`` package for the write step.

    Args:
        source: Path to the source ``.csv`` file.
        dest: Destination ``.xlsx`` file path. The path is resolved to an
            absolute path; parent directories are not created here.
        sheet_name: Name of the worksheet to create in the output
            workbook. Defaults to ``"Sheet1"``. Ignored when the caller
            selects the worksheet through ``pl_write_args`` or
            ``pl_write_kwargs``.
        *pl_write_args: Positional arguments forwarded to
            ``DataFrame.write_excel`` after the destination path. The
            first of these lands in ``write_excel``'s ``worksheet``
            parameter.
        **pl_write_kwargs: Keyword arguments forwarded to
            ``DataFrame.write_excel``. ``worksheet`` is only injected from
            ``sheet_name`` when it is not already present.

    Raises:
        FileNotFoundError: If ``source`` does not exist (raised by
            ``read_csv``).

    Example:
        >>> csv_to_xlsx("transactions.csv", "transactions.xlsx", sheet_name="Data")
    """
    dest_path = Path(dest).resolve()

    logger.debug(
        "csv_to_xlsx: %s → %s (sheet=%r)",
        Path(source).resolve(),
        dest_path,
        sheet_name,
    )

    df = read_csv(source)

    # ``write_excel`` takes ``worksheet`` as its second positional parameter.
    # If the caller forwarded positional write arguments, the first one
    # already fills that slot; injecting ``worksheet=`` as well would raise
    # ``TypeError`` (multiple values for argument 'worksheet').
    if not pl_write_args:
        pl_write_kwargs.setdefault("worksheet", sheet_name)
    df.write_excel(dest_path, *pl_write_args, **pl_write_kwargs)

    logger.debug("csv_to_xlsx: completed — wrote %s", dest_path)
    printer.success(f"csv_to_xlsx: conversion complete — {dest_path}")
def read_sheet(
    source: str | Path,
    sheet: str | None = None,
    *pl_args: Any,
    **pl_kwargs: Any,
) -> list[list[Any]]:
    """Return a worksheet as a list of rows (legacy compatibility shim).

    Loads the sheet through ``read_excel`` and flattens the DataFrame into
    plain Python lists: the column names come first, followed by one list
    per entry of ``DataFrame.rows()``.

    Kept for callers written before the polars migration; new code should
    call ``read_excel`` and work with the DataFrame directly.

    Args:
        source: Location of the workbook to read.
        sheet: Name of the worksheet to load, or ``None`` to let
            ``read_excel`` fall back to its default sheet.
        *pl_args: Extra positional arguments passed on to ``read_excel``.
        **pl_kwargs: Extra keyword arguments passed on to ``read_excel``.

    Returns:
        list[list[Any]]: Row-major table. Element ``0`` holds the column
        headers; every following element holds the values of one data row.

    Raises:
        FileNotFoundError: If ``source`` does not exist (raised by
            ``read_excel``).

    Example:
        >>> data = read_sheet("report.xlsx", sheet="Data")
        >>> headers = data[0]
        >>> rows = data[1:]
    """
    logger.debug("read_sheet: %s (sheet=%r) — delegating to read_excel", source, sheet)

    frame = read_excel(source, sheet, *pl_args, **pl_kwargs)

    # Legacy contract: headers occupy row 0, so seed the table with the
    # column names before appending the data rows.
    table: list[list[Any]] = [list(frame.columns)]
    table.extend(list(record) for record in frame.rows())

    logger.debug("read_sheet: returning %d rows (incl. header).", len(table))
    return table