Coverage for src/ezxl/io/_converters.py: 84%

1# ///////////////////////////////////////////////////////////////

2# _converters - Format conversion utilities

3# Project: EzXl

4# ///////////////////////////////////////////////////////////////

6"""Format conversion utilities for Excel and CSV files.

8Provides read and export paths backed by polars for high-throughput

9data processing on closed files (no running Excel process required):

11- ``read_excel``: read an ``.xlsx`` file into a polars DataFrame.

12- ``read_csv``: read a ``.csv`` file into a polars DataFrame.

13- ``xlsx_to_csv``: convert an Excel sheet to CSV via polars.

14- ``csv_to_xlsx``: convert a CSV file to an ``.xlsx`` file via polars.

15- ``read_sheet``: compatibility shim — returns ``list[list[Any]]`` for

16 callers that expect the legacy row-major format.

18All functions operate on **closed** files and require no running Excel

19process. polars delegates Excel I/O to ``fastexcel`` (a Rust-based

20engine bundled with polars extras) which provides performance comparable

21to the former ``python-calamine`` path.

22"""

24from __future__ import annotations

26# ///////////////////////////////////////////////////////////////

27# IMPORTS

28# ///////////////////////////////////////////////////////////////

29# Standard library imports

30from pathlib import Path

31from typing import Any

33# Third-party imports

34import polars as pl

35from ezplog.lib_mode import get_logger, get_printer

37# ///////////////////////////////////////////////////////////////

38# CONSTANTS

39# ///////////////////////////////////////////////////////////////

41logger = get_logger(__name__)

42printer = get_printer()

44# ///////////////////////////////////////////////////////////////

45# FUNCTIONS

46# ///////////////////////////////////////////////////////////////

49def read_excel(

50 source: str | Path,

51 sheet: str | None = None,

52) -> pl.DataFrame:

53 """Read an Excel workbook sheet into a polars DataFrame.

55 Delegates to ``polars.read_excel`` which uses ``fastexcel`` (Rust)

56 under the hood. No running Excel process is required.

58 Args:

59 source: Path to the source ``.xlsx`` / ``.xlsm`` file.

60 sheet: Worksheet name to read. Pass ``None`` to read the first

61 sheet (polars default when ``sheet_name`` is omitted).

63 Returns:

64 pl.DataFrame: Contents of the requested sheet as a polars

65 DataFrame, with the first row used as column headers.

67 Raises:

68 FileNotFoundError: If ``source`` does not exist.

69 ImportError: If polars (or its ``fastexcel`` extra) is not

70 installed.

72 Example:

73 >>> df = read_excel("report.xlsx", sheet="Data")

74 >>> print(df.head())

75 """

76 source_path = Path(source).resolve()

78 if not source_path.exists():

79 raise FileNotFoundError(f"Source file not found: {source_path}")

81 logger.debug("read_excel: %s (sheet=%r)", source_path, sheet)

83 df: pl.DataFrame = pl.read_excel(source_path, sheet_name=sheet)

85 logger.debug("read_excel: read %d rows from '%s'.", len(df), source_path)

86 return df

89def read_csv(

90 source: str | Path,

91 separator: str = ",",

92 encoding: str = "utf-8",

93) -> pl.DataFrame:

94 """Read a CSV file into a polars DataFrame.

96 Args:

97 source: Path to the source ``.csv`` file.

98 separator: Column delimiter character. Defaults to ``","``

99 (standard CSV). Use ``"\\t"`` for TSV files.

100 encoding: File encoding passed through to polars. Defaults to

101 ``"utf-8"``.

102

103 Returns:

104 pl.DataFrame: Parsed contents of the CSV file.

105

106 Raises:

107 FileNotFoundError: If ``source`` does not exist.

108

109 Example:

110 >>> df = read_csv("transactions.csv", separator=";")

111 >>> print(df.schema)

112 """

113 source_path = Path(source).resolve()

114

115 if not source_path.exists():

116 raise FileNotFoundError(f"Source file not found: {source_path}")

117

118 logger.debug("read_csv: %s (sep=%r, enc=%r)", source_path, separator, encoding)

119

120 df: pl.DataFrame = pl.read_csv(

121 source_path,

122 separator=separator,

123 encoding=encoding,

124 )

125

126 logger.debug("read_csv: read %d rows from '%s'.", len(df), source_path)

127 return df

128

129

130def xlsx_to_csv(

131 source: str | Path,

132 dest: str | Path,

133 sheet: str | None = None,

134 separator: str = ",",

135) -> None:

136 """Convert an Excel workbook sheet to a CSV file using polars.

137

138 Supersedes both the former ``xlsx_to_csv`` (openpyxl) and

139 ``xlsx_to_csv_fast`` (python-calamine) functions. polars uses

140 ``fastexcel`` (Rust) for the read step, providing the same

141 high-throughput characteristics as the former fast path.

142

143 Args:

144 source: Path to the source ``.xlsx`` / ``.xlsm`` file.

145 dest: Destination ``.csv`` file path. Parent directories must

146 exist.

147 sheet: Worksheet name to export. Pass ``None`` to use the

148 first sheet.

149 separator: Column delimiter for the CSV output. Defaults to

150 ``","`` (standard CSV).

151

152 Raises:

153 FileNotFoundError: If ``source`` does not exist.

154

155 Example:

156 >>> xlsx_to_csv("data.xlsx", "data.csv", sheet="Transactions")

157 >>> xlsx_to_csv("data.xlsx", "data.tsv", separator="\\t")

158 """

159 dest_path = Path(dest).resolve()

160

161 logger.debug(

162 "xlsx_to_csv: %s → %s (sheet=%r, sep=%r)",

163 Path(source).resolve(),

164 dest_path,

165 sheet,

166 separator,

167 )

168

169 df = read_excel(source, sheet=sheet)

170 df.write_csv(dest_path, separator=separator)

171

172 logger.debug("xlsx_to_csv: completed — wrote %s", dest_path)

173 printer.success(f"xlsx_to_csv: conversion complete — {dest_path}")

174

175

176def csv_to_xlsx(

177 source: str | Path,

178 dest: str | Path,

179 sheet_name: str = "Sheet1",

180) -> None:

181 """Convert a CSV file to an Excel workbook using polars.

182

183 Reads the CSV with polars and writes it as an ``.xlsx`` file.

184 polars delegates the Excel write step to ``xlsxwriter`` or

185 ``openpyxl`` depending on which is installed; no additional

186 configuration is required.

187

188 Args:

189 source: Path to the source ``.csv`` file.

190 dest: Destination ``.xlsx`` file path. Parent directories must

191 exist.

192 sheet_name: Name of the worksheet to create in the output

193 workbook. Defaults to ``"Sheet1"``.

194

195 Raises:

196 FileNotFoundError: If ``source`` does not exist.

197

198 Example:

199 >>> csv_to_xlsx("transactions.csv", "transactions.xlsx", sheet_name="Data")

200 """

201 dest_path = Path(dest).resolve()

202

203 logger.debug(

204 "csv_to_xlsx: %s → %s (sheet=%r)",

205 Path(source).resolve(),

206 dest_path,

207 sheet_name,

208 )

209

210 df = read_csv(source)

211 df.write_excel(dest_path, worksheet=sheet_name)

212

213 logger.debug("csv_to_xlsx: completed — wrote %s", dest_path)

214 printer.success(f"csv_to_xlsx: conversion complete — {dest_path}")

215

216

217def read_sheet(

218 source: str | Path,

219 sheet: str | None = None,

220) -> list[list[Any]]:

221 """Read a worksheet into a row-major list of lists (compatibility shim).

222

223 Wraps ``read_excel`` and converts the resulting polars DataFrame to

224 a ``list[list[Any]]`` via ``DataFrame.rows()``. The first row

225 contains the column headers as extracted by polars.

226

227 This function exists for backwards compatibility with callers that

228 pre-date the polars migration. New code should use ``read_excel``

229 directly to benefit from the full polars API.

230

231 Args:

232 source: Path to the source ``.xlsx`` / ``.xlsm`` file.

233 sheet: Worksheet name to read. Pass ``None`` to use the first

234 sheet.

235

236 Returns:

237 list[list[Any]]: Row-major 2D list of cell values. The first

238 row contains column headers; subsequent rows contain data

239 values. Empty cells are represented as ``None``.

240

241 Raises:

242 FileNotFoundError: If ``source`` does not exist.

243

244 Example:

245 >>> data = read_sheet("report.xlsx", sheet="Data")

246 >>> headers = data[0]

247 >>> rows = data[1:]

248 """

249 logger.debug("read_sheet: %s (sheet=%r) — delegating to read_excel", source, sheet)

250

251 df = read_excel(source, sheet=sheet)

252

253 # Prepend column names as the first row to preserve the legacy contract

254 # where callers expected headers in row 0.

255 header_row: list[Any] = list(df.columns)

256 data_rows: list[list[Any]] = [list(row) for row in df.rows()]

257

258 result: list[list[Any]] = [header_row, *data_rows]

259

260 logger.debug("read_sheet: returning %d rows (incl. header).", len(result))

261 return result

Coverage for src / ezxl / io / _converters.py: 84%

45 statements