Coverage for src / ezxl / io / _converters.py: 87.72%

51 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-29 15:53 +0000

1# /////////////////////////////////////////////////////////////// 

2# _converters - Format conversion utilities 

3# Project: EzXl 

4# /////////////////////////////////////////////////////////////// 

5 

6"""Format conversion utilities for Excel and CSV files. 

7 

8Provides read and export paths backed by polars for high-throughput 

9data processing on closed files (no running Excel process required): 

10 

11- ``read_excel``: read an ``.xlsx`` file into a polars DataFrame. 

12- ``read_csv``: read a ``.csv`` file into a polars DataFrame. 

13- ``xlsx_to_csv``: convert an Excel sheet to CSV via polars. 

14- ``csv_to_xlsx``: convert a CSV file to an ``.xlsx`` file via polars. 

15- ``read_sheet``: compatibility shim — returns ``list[list[Any]]`` for 

16 callers that expect the legacy row-major format. 

17 

18All functions operate on **closed** files and require no running Excel 

19process. polars delegates Excel I/O to ``fastexcel`` (a Rust-based 

20engine bundled with polars extras) which provides performance comparable 

21to the former ``python-calamine`` path. 

22""" 

23 

24from __future__ import annotations 

25 

26# /////////////////////////////////////////////////////////////// 

27# IMPORTS 

28# /////////////////////////////////////////////////////////////// 

29# Standard library imports 

30from pathlib import Path 

31from typing import Any 

32 

33# Third-party imports 

34import polars as pl 

35from ezplog.lib_mode import get_logger, get_printer 

36 

37# /////////////////////////////////////////////////////////////// 

38# CONSTANTS 

39# /////////////////////////////////////////////////////////////// 

40 

# Module-level logger from ezplog's library-mode helper, named after this module.
logger = get_logger(__name__)

# Printer from ezplog; the converters below use it for their success messages.
printer = get_printer()

43 

44# /////////////////////////////////////////////////////////////// 

45# FUNCTIONS 

46# /////////////////////////////////////////////////////////////// 

47 

48 

def read_excel(
    source: str | Path,
    sheet: str | None = None,
    *pl_args: Any,
    **pl_kwargs: Any,
) -> pl.DataFrame:
    """Load a worksheet from an Excel workbook as a polars DataFrame.

    The path is resolved and checked for existence, then the actual
    parsing is handed over to ``polars.read_excel``.

    Args:
        source: Location of the workbook to read.
        sheet: Name of the worksheet to load. When ``None``, no sheet
            name is injected and polars applies its own default.
        *pl_args: Extra positional arguments passed straight through to
            ``polars.read_excel`` after the resolved path.
        **pl_kwargs: Extra keyword arguments passed straight through to
            ``polars.read_excel``. An explicit ``sheet_name`` entry takes
            precedence over ``sheet``.

    Returns:
        pl.DataFrame: Whatever ``polars.read_excel`` produced for the
        requested sheet.

    Raises:
        FileNotFoundError: If the resolved ``source`` path does not exist.

    Example:
        >>> df = read_excel("report.xlsx", sheet="Data")
        >>> print(df.head())
    """
    workbook = Path(source).resolve()

    if workbook.exists():
        logger.debug("read_excel: %s (sheet=%r)", workbook, sheet)
    else:
        raise FileNotFoundError(f"Source file not found: {workbook}")

    # Only inject the sheet when the caller did not already supply
    # ``sheet_name`` themselves; setdefault leaves an existing key alone.
    if sheet is not None:
        pl_kwargs.setdefault("sheet_name", sheet)

    frame: pl.DataFrame = pl.read_excel(workbook, *pl_args, **pl_kwargs)
    row_count = len(frame)

    logger.debug("read_excel: read %d rows from '%s'.", row_count, workbook)
    return frame

96 

97 

def read_csv(
    source: str | Path,
    separator: str = ",",
    encoding: str = "utf-8",
    *pl_args: Any,
    **pl_kwargs: Any,
) -> pl.DataFrame:
    """Load a CSV file as a polars DataFrame.

    The path is resolved and checked for existence, then parsing is
    handed over to ``polars.read_csv``.

    Args:
        source: Location of the CSV file to read.
        separator: Column delimiter. Defaults to ``","``; pass ``"\\t"``
            for tab-separated files.
        encoding: Text encoding handed to polars. Defaults to
            ``"utf-8"``.
        *pl_args: Extra positional arguments passed straight through to
            ``polars.read_csv`` after the resolved path.
        **pl_kwargs: Extra keyword arguments passed straight through to
            ``polars.read_csv``. ``separator`` and ``encoding`` are
            filled in only for keys not already present.

    Returns:
        pl.DataFrame: The parsed CSV contents.

    Raises:
        FileNotFoundError: If the resolved ``source`` path does not exist.

    Example:
        >>> df = read_csv("transactions.csv", separator=";")
        >>> print(df.schema)
    """
    csv_file = Path(source).resolve()

    if csv_file.exists():
        logger.debug("read_csv: %s (sep=%r, enc=%r)", csv_file, separator, encoding)
    else:
        raise FileNotFoundError(f"Source file not found: {csv_file}")

    # Fill in the wrapper-level options without clobbering existing keys.
    for option, value in (("separator", separator), ("encoding", encoding)):
        pl_kwargs.setdefault(option, value)

    frame: pl.DataFrame = pl.read_csv(csv_file, *pl_args, **pl_kwargs)
    row_count = len(frame)

    logger.debug("read_csv: read %d rows from '%s'.", row_count, csv_file)
    return frame

142 

143 

def xlsx_to_csv(
    source: str | Path,
    dest: str | Path,
    sheet: str | None = None,
    separator: str = ",",
    *pl_write_args: Any,
    **pl_write_kwargs: Any,
) -> None:
    """Export one worksheet of an Excel workbook to a CSV file.

    The sheet is loaded through ``read_excel`` and the resulting
    DataFrame is written out with ``DataFrame.write_csv``. A success
    message is emitted through the module printer once the file has
    been written.

    Args:
        source: Location of the workbook to read.
        dest: Location of the CSV file to write.
        sheet: Name of the worksheet to export; ``None`` leaves the
            choice to ``read_excel``.
        separator: Column delimiter for the output. Defaults to ``","``.
        *pl_write_args: Extra positional arguments passed straight
            through to ``DataFrame.write_csv`` after the destination.
        **pl_write_kwargs: Extra keyword arguments passed straight
            through to ``DataFrame.write_csv``. ``separator`` is filled
            in only when that key is not already present.

    Raises:
        FileNotFoundError: Propagated from ``read_excel`` when ``source``
            does not exist.

    Example:
        >>> xlsx_to_csv("data.xlsx", "data.csv", sheet="Transactions")
        >>> xlsx_to_csv("data.xlsx", "data.tsv", separator="\\t")
    """
    target = Path(dest).resolve()
    # Resolved only for the log line; read_excel resolves ``source`` itself.
    origin = Path(source).resolve()

    logger.debug(
        "xlsx_to_csv: %s → %s (sheet=%r, sep=%r)", origin, target, sheet, separator
    )

    frame = read_excel(source, sheet=sheet)

    if "separator" not in pl_write_kwargs:
        pl_write_kwargs["separator"] = separator
    frame.write_csv(target, *pl_write_args, **pl_write_kwargs)

    logger.debug("xlsx_to_csv: completed — wrote %s", target)
    printer.success(f"xlsx_to_csv: conversion complete — {target}")

197 

198 

def csv_to_xlsx(
    source: str | Path,
    dest: str | Path,
    sheet_name: str = "Sheet1",
    *pl_write_args: Any,
    **pl_write_kwargs: Any,
) -> None:
    """Export a CSV file to an Excel workbook.

    The CSV is loaded through ``read_csv`` (with that function's default
    separator and encoding) and the resulting DataFrame is written out
    with ``DataFrame.write_excel``. A success message is emitted through
    the module printer once the workbook has been written.

    Args:
        source: Location of the CSV file to read.
        dest: Location of the workbook to write.
        sheet_name: Worksheet name to use in the output workbook.
            Defaults to ``"Sheet1"``.
        *pl_write_args: Extra positional arguments passed straight
            through to ``DataFrame.write_excel`` after the destination.
        **pl_write_kwargs: Extra keyword arguments passed straight
            through to ``DataFrame.write_excel``. ``worksheet`` is set
            to ``sheet_name`` only when that key is not already present.

    Raises:
        FileNotFoundError: Propagated from ``read_csv`` when ``source``
            does not exist.

    Example:
        >>> csv_to_xlsx("transactions.csv", "transactions.xlsx", sheet_name="Data")
    """
    target = Path(dest).resolve()
    # Resolved only for the log line; read_csv resolves ``source`` itself.
    origin = Path(source).resolve()

    logger.debug("csv_to_xlsx: %s → %s (sheet=%r)", origin, target, sheet_name)

    frame = read_csv(source)

    # A caller-supplied ``worksheet`` keyword overrides ``sheet_name``.
    if "worksheet" not in pl_write_kwargs:
        pl_write_kwargs["worksheet"] = sheet_name
    frame.write_excel(target, *pl_write_args, **pl_write_kwargs)

    logger.debug("csv_to_xlsx: completed — wrote %s", target)
    printer.success(f"csv_to_xlsx: conversion complete — {target}")

247 

248 

def read_sheet(
    source: str | Path,
    sheet: str | None = None,
    *pl_args: Any,
    **pl_kwargs: Any,
) -> list[list[Any]]:
    """Return a worksheet as a row-major list of lists (legacy shim).

    Loads the sheet through ``read_excel`` and flattens the DataFrame
    into plain Python lists: the column names come first, followed by
    one list per row from ``DataFrame.rows()``.

    Kept for callers written against the older list-based interface;
    ``read_excel`` gives direct access to the DataFrame instead.

    Args:
        source: Location of the workbook to read.
        sheet: Name of the worksheet to load; ``None`` leaves the choice
            to ``read_excel``.
        *pl_args: Extra positional arguments passed on to ``read_excel``.
        **pl_kwargs: Extra keyword arguments passed on to ``read_excel``.

    Returns:
        list[list[Any]]: The header row (column names) at index 0,
        followed by the data rows in DataFrame order.

    Raises:
        FileNotFoundError: Propagated from ``read_excel`` when ``source``
            does not exist.

    Example:
        >>> data = read_sheet("report.xlsx", sheet="Data")
        >>> headers = data[0]
        >>> rows = data[1:]
    """
    logger.debug("read_sheet: %s (sheet=%r) — delegating to read_excel", source, sheet)

    frame = read_excel(source, sheet, *pl_args, **pl_kwargs)

    # Legacy contract: row 0 carries the headers, so seed the table with
    # the column names before appending the data rows.
    table: list[list[Any]] = [list(frame.columns)]
    table.extend(list(record) for record in frame.rows())

    logger.debug("read_sheet: returning %d rows (incl. header).", len(table))
    return table