Coverage for src/ezxl/io/_converters.py: 87.72%

1# ///////////////////////////////////////////////////////////////

2# _converters - Format conversion utilities

3# Project: EzXl

4# ///////////////////////////////////////////////////////////////

6"""Format conversion utilities for Excel and CSV files.

8Provides read and export paths backed by polars for high-throughput

9data processing on closed files (no running Excel process required):

11- ``read_excel``: read an ``.xlsx`` file into a polars DataFrame.

12- ``read_csv``: read a ``.csv`` file into a polars DataFrame.

13- ``xlsx_to_csv``: convert an Excel sheet to CSV via polars.

14- ``csv_to_xlsx``: convert a CSV file to an ``.xlsx`` file via polars.

15- ``read_sheet``: compatibility shim — returns ``list[list[Any]]`` for

16 callers that expect the legacy row-major format.

18All functions operate on **closed** files and require no running Excel

19process. polars delegates Excel I/O to ``fastexcel`` (a Rust-based

20engine bundled with polars extras) which provides performance comparable

21to the former ``python-calamine`` path.

22"""

24from __future__ import annotations

26# ///////////////////////////////////////////////////////////////

27# IMPORTS

28# ///////////////////////////////////////////////////////////////

29# Standard library imports

30from pathlib import Path

31from typing import Any

33# Third-party imports

34import polars as pl

35from ezplog.lib_mode import get_logger, get_printer

37# ///////////////////////////////////////////////////////////////

38# CONSTANTS

39# ///////////////////////////////////////////////////////////////

# Module-level logger obtained from ezplog under this module's name; the
# functions in this file use it for debug-level progress messages.
logger = get_logger(__name__)

# Printer obtained from ezplog; ``xlsx_to_csv`` and ``csv_to_xlsx`` call
# ``printer.success`` once a conversion has finished.
printer = get_printer()

44# ///////////////////////////////////////////////////////////////

45# FUNCTIONS

46# ///////////////////////////////////////////////////////////////

def read_excel(
    source: str | Path,
    sheet: str | None = None,
    *pl_args: Any,
    **pl_kwargs: Any,
) -> pl.DataFrame:
    """Load one worksheet of an Excel workbook as a polars DataFrame.

    The path is resolved and checked for existence, then handed to
    ``polars.read_excel`` together with any extra arguments supplied by
    the caller.

    Args:
        source: Location of the workbook on disk.
        sheet: Name of the worksheet to load. When ``None``, no
            ``sheet_name`` is injected and polars applies its own default.
        *pl_args: Extra positional arguments passed on to
            ``polars.read_excel``.
        **pl_kwargs: Extra keyword arguments passed on to
            ``polars.read_excel``. An explicit ``sheet_name`` entry here
            takes precedence over ``sheet``.

    Returns:
        pl.DataFrame: The value produced by ``polars.read_excel``.

    Raises:
        FileNotFoundError: If the resolved ``source`` path does not exist.

    Example:
        >>> df = read_excel("report.xlsx", sheet="Data")
        >>> print(df.head())
    """
    workbook = Path(source).resolve()
    if not workbook.exists():
        raise FileNotFoundError(f"Source file not found: {workbook}")

    logger.debug("read_excel: %s (sheet=%r)", workbook, sheet)

    # A caller-supplied ``sheet_name`` keyword wins over ``sheet``.
    if sheet is not None:
        pl_kwargs.setdefault("sheet_name", sheet)

    frame: pl.DataFrame = pl.read_excel(workbook, *pl_args, **pl_kwargs)
    logger.debug("read_excel: read %d rows from '%s'.", len(frame), workbook)
    return frame

def read_csv(
    source: str | Path,
    separator: str = ",",
    encoding: str = "utf-8",
    *pl_args: Any,
    **pl_kwargs: Any,
) -> pl.DataFrame:
    """Read a CSV file into a polars DataFrame.

    Args:
        source: Path to the source ``.csv`` file.
        separator: Column delimiter character. Defaults to ``","``
            (standard CSV). Use ``"\\t"`` for TSV files.
        encoding: File encoding passed through to polars. Defaults to
            ``"utf-8"``. Any spelling of UTF-8 (``"utf-8"``, ``"UTF8"``,
            ``"utf_8"``) is normalised to ``"utf8"``, the name polars
            uses for its native decoder.
        *pl_args: Positional arguments forwarded to ``polars.read_csv``.
        **pl_kwargs: Keyword arguments forwarded to ``polars.read_csv``.
            ``separator`` and ``encoding`` are applied only when these
            keys are not already present in ``pl_kwargs``.

    Returns:
        pl.DataFrame: Parsed contents of the CSV file.

    Raises:
        FileNotFoundError: If ``source`` does not exist.

    Example:
        >>> df = read_csv("transactions.csv", separator=";")
        >>> print(df.schema)
    """
    source_path = Path(source).resolve()

    if not source_path.exists():
        raise FileNotFoundError(f"Source file not found: {source_path}")

    logger.debug("read_csv: %s (sep=%r, enc=%r)", source_path, separator, encoding)

    pl_kwargs.setdefault("separator", separator)
    pl_kwargs.setdefault("encoding", encoding)

    # polars documents that any encoding other than "utf8" / "utf8-lossy"
    # is first decoded in memory with Python. The hyphenated default
    # "utf-8" would therefore take that slower path for plain UTF-8 files,
    # so map every UTF-8 alias onto the spelling polars handles natively.
    effective_encoding = pl_kwargs["encoding"]
    if isinstance(effective_encoding, str):
        canonical = effective_encoding.lower().replace("-", "").replace("_", "")
        if canonical == "utf8":
            pl_kwargs["encoding"] = "utf8"

    df: pl.DataFrame = pl.read_csv(source_path, *pl_args, **pl_kwargs)

    logger.debug("read_csv: read %d rows from '%s'.", len(df), source_path)
    return df

142

143

def xlsx_to_csv(
    source: str | Path,
    dest: str | Path,
    sheet: str | None = None,
    separator: str = ",",
    *pl_write_args: Any,
    **pl_write_kwargs: Any,
) -> None:
    """Export one worksheet of an Excel workbook to a CSV file.

    The worksheet is loaded through ``read_excel`` and the resulting
    DataFrame is written with ``DataFrame.write_csv``. A success message
    is emitted through the module printer once the file is written.

    Args:
        source: Workbook to read from.
        dest: CSV file to write to.
        sheet: Name of the worksheet to export; ``None`` leaves the
            choice of sheet to ``read_excel``.
        separator: Delimiter used in the CSV output. Defaults to ``","``.
        *pl_write_args: Extra positional arguments passed on to
            ``DataFrame.write_csv``.
        **pl_write_kwargs: Extra keyword arguments passed on to
            ``DataFrame.write_csv``. A ``separator`` entry here takes
            precedence over the ``separator`` parameter.

    Raises:
        FileNotFoundError: Raised by ``read_excel`` when ``source`` does
            not exist.

    Example:
        >>> xlsx_to_csv("data.xlsx", "data.csv", sheet="Transactions")
        >>> xlsx_to_csv("data.xlsx", "data.tsv", separator="\\t")
    """
    target = Path(dest).resolve()
    origin = Path(source).resolve()
    logger.debug(
        "xlsx_to_csv: %s → %s (sheet=%r, sep=%r)", origin, target, sheet, separator
    )

    frame = read_excel(source, sheet=sheet)

    # Only fall back to the ``separator`` parameter when the caller did not
    # pass one explicitly among the write keyword arguments.
    if "separator" not in pl_write_kwargs:
        pl_write_kwargs["separator"] = separator
    frame.write_csv(target, *pl_write_args, **pl_write_kwargs)

    logger.debug("xlsx_to_csv: completed — wrote %s", target)
    printer.success(f"xlsx_to_csv: conversion complete — {target}")

197

198

def csv_to_xlsx(
    source: str | Path,
    dest: str | Path,
    sheet_name: str = "Sheet1",
    *pl_write_args: Any,
    **pl_write_kwargs: Any,
) -> None:
    """Convert a CSV file to an Excel workbook using polars.

    Reads the CSV with ``read_csv`` and writes the resulting DataFrame
    with ``DataFrame.write_excel``. polars performs the write through
    ``xlsxwriter``, which must be installed.

    Args:
        source: Path to the source ``.csv`` file.
        dest: Destination ``.xlsx`` file path. Parent directories must
            exist.
        sheet_name: Name of the worksheet to create in the output
            workbook. Defaults to ``"Sheet1"``.
        *pl_write_args: Positional arguments forwarded to
            ``DataFrame.write_excel`` after the destination path. The
            first of these is received by ``write_excel`` as its
            ``worksheet`` parameter, in which case ``sheet_name`` is not
            applied.
        **pl_write_kwargs: Keyword arguments forwarded to
            ``DataFrame.write_excel``. ``worksheet`` is only defaulted to
            ``sheet_name`` when it is not already present and no
            positional write arguments were given.

    Raises:
        FileNotFoundError: If ``source`` does not exist.

    Example:
        >>> csv_to_xlsx("transactions.csv", "transactions.xlsx", sheet_name="Data")
    """
    dest_path = Path(dest).resolve()

    logger.debug(
        "csv_to_xlsx: %s → %s (sheet=%r)",
        Path(source).resolve(),
        dest_path,
        sheet_name,
    )

    df = read_csv(source)

    # ``write_excel(workbook, worksheet, *, ...)`` takes the worksheet as its
    # second positional parameter. A forwarded positional argument already
    # fills that slot, so also passing ``worksheet=`` would raise
    # ``TypeError: got multiple values for argument 'worksheet'``.
    if not pl_write_args:
        pl_write_kwargs.setdefault("worksheet", sheet_name)
    df.write_excel(dest_path, *pl_write_args, **pl_write_kwargs)

    logger.debug("csv_to_xlsx: completed — wrote %s", dest_path)
    printer.success(f"csv_to_xlsx: conversion complete — {dest_path}")

247

248

def read_sheet(
    source: str | Path,
    sheet: str | None = None,
    *pl_args: Any,
    **pl_kwargs: Any,
) -> list[list[Any]]:
    """Return a worksheet as a row-major list of lists (legacy format).

    Thin compatibility wrapper around ``read_excel``: the DataFrame's
    column names become the first row and every tuple yielded by
    ``DataFrame.rows()`` is converted to a list and appended after it.

    Intended for callers written before the polars migration; new code
    should call ``read_excel`` and work with the DataFrame directly.

    Args:
        source: Workbook to read from.
        sheet: Name of the worksheet to read; ``None`` leaves the choice
            of sheet to ``read_excel``.
        *pl_args: Extra positional arguments passed on to ``read_excel``.
        **pl_kwargs: Extra keyword arguments passed on to ``read_excel``.

    Returns:
        list[list[Any]]: Row 0 holds the column names; the remaining rows
        hold the data values in the order polars returned them.

    Raises:
        FileNotFoundError: Raised by ``read_excel`` when ``source`` does
            not exist.

    Example:
        >>> data = read_sheet("report.xlsx", sheet="Data")
        >>> headers = data[0]
        >>> rows = data[1:]
    """
    logger.debug(
        "read_sheet: %s (sheet=%r) — delegating to read_excel", source, sheet
    )

    frame = read_excel(source, sheet, *pl_args, **pl_kwargs)

    # Legacy callers expect the headers in row 0, so seed the table with the
    # column names before appending the data rows.
    table: list[list[Any]] = [list(frame.columns)]
    table.extend(list(record) for record in frame.rows())

    logger.debug("read_sheet: returning %d rows (incl. header).", len(table))
    return table

Coverage for src/ezxl/io/_converters.py: 87.72%

51 statements