Coverage for src / ezxl / io / _converters.py: 84%

45 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-29 22:41 +0000

1# /////////////////////////////////////////////////////////////// 

2# _converters - Format conversion utilities 

3# Project: EzXl 

4# /////////////////////////////////////////////////////////////// 

5 

6"""Format conversion utilities for Excel and CSV files. 

7 

8Provides read and export paths backed by polars for high-throughput 

9data processing on closed files (no running Excel process required): 

10 

11- ``read_excel``: read an ``.xlsx`` file into a polars DataFrame. 

12- ``read_csv``: read a ``.csv`` file into a polars DataFrame. 

13- ``xlsx_to_csv``: convert an Excel sheet to CSV via polars. 

14- ``csv_to_xlsx``: convert a CSV file to an ``.xlsx`` file via polars. 

15- ``read_sheet``: compatibility shim — returns ``list[list[Any]]`` for 

16 callers that expect the legacy row-major format. 

17 

18All functions operate on **closed** files and require no running Excel 

19process. polars delegates Excel I/O to ``fastexcel`` (a Rust-based 

20engine bundled with polars extras) which provides performance comparable 

21to the former ``python-calamine`` path. 

22""" 

23 

24from __future__ import annotations 

25 

26# /////////////////////////////////////////////////////////////// 

27# IMPORTS 

28# /////////////////////////////////////////////////////////////// 

29# Standard library imports 

30from pathlib import Path 

31from typing import Any 

32 

33# Third-party imports 

34import polars as pl 

35from ezplog.lib_mode import get_logger, get_printer 

36 

37# /////////////////////////////////////////////////////////////// 

38# CONSTANTS 

39# /////////////////////////////////////////////////////////////// 

40 

41logger = get_logger(__name__) 

42printer = get_printer() 

43 

44# /////////////////////////////////////////////////////////////// 

45# FUNCTIONS 

46# /////////////////////////////////////////////////////////////// 

47 

48 

49def read_excel( 

50 source: str | Path, 

51 sheet: str | None = None, 

52) -> pl.DataFrame: 

53 """Read an Excel workbook sheet into a polars DataFrame. 

54 

55 Delegates to ``polars.read_excel`` which uses ``fastexcel`` (Rust) 

56 under the hood. No running Excel process is required. 

57 

58 Args: 

59 source: Path to the source ``.xlsx`` / ``.xlsm`` file. 

60 sheet: Worksheet name to read. Pass ``None`` to read the first 

61 sheet (polars default when ``sheet_name`` is omitted). 

62 

63 Returns: 

64 pl.DataFrame: Contents of the requested sheet as a polars 

65 DataFrame, with the first row used as column headers. 

66 

67 Raises: 

68 FileNotFoundError: If ``source`` does not exist. 

69 ImportError: If polars (or its ``fastexcel`` extra) is not 

70 installed. 

71 

72 Example: 

73 >>> df = read_excel("report.xlsx", sheet="Data") 

74 >>> print(df.head()) 

75 """ 

76 source_path = Path(source).resolve() 

77 

78 if not source_path.exists(): 

79 raise FileNotFoundError(f"Source file not found: {source_path}") 

80 

81 logger.debug("read_excel: %s (sheet=%r)", source_path, sheet) 

82 

83 df: pl.DataFrame = pl.read_excel(source_path, sheet_name=sheet) 

84 

85 logger.debug("read_excel: read %d rows from '%s'.", len(df), source_path) 

86 return df 

87 

88 

89def read_csv( 

90 source: str | Path, 

91 separator: str = ",", 

92 encoding: str = "utf-8", 

93) -> pl.DataFrame: 

94 """Read a CSV file into a polars DataFrame. 

95 

96 Args: 

97 source: Path to the source ``.csv`` file. 

98 separator: Column delimiter character. Defaults to ``","`` 

99 (standard CSV). Use ``"\\t"`` for TSV files. 

100 encoding: File encoding passed through to polars. Defaults to 

101 ``"utf-8"``. 

102 

103 Returns: 

104 pl.DataFrame: Parsed contents of the CSV file. 

105 

106 Raises: 

107 FileNotFoundError: If ``source`` does not exist. 

108 

109 Example: 

110 >>> df = read_csv("transactions.csv", separator=";") 

111 >>> print(df.schema) 

112 """ 

113 source_path = Path(source).resolve() 

114 

115 if not source_path.exists(): 

116 raise FileNotFoundError(f"Source file not found: {source_path}") 

117 

118 logger.debug("read_csv: %s (sep=%r, enc=%r)", source_path, separator, encoding) 

119 

120 df: pl.DataFrame = pl.read_csv( 

121 source_path, 

122 separator=separator, 

123 encoding=encoding, 

124 ) 

125 

126 logger.debug("read_csv: read %d rows from '%s'.", len(df), source_path) 

127 return df 

128 

129 

130def xlsx_to_csv( 

131 source: str | Path, 

132 dest: str | Path, 

133 sheet: str | None = None, 

134 separator: str = ",", 

135) -> None: 

136 """Convert an Excel workbook sheet to a CSV file using polars. 

137 

138 Supersedes both the former ``xlsx_to_csv`` (openpyxl) and 

139 ``xlsx_to_csv_fast`` (python-calamine) functions. polars uses 

140 ``fastexcel`` (Rust) for the read step, providing the same 

141 high-throughput characteristics as the former fast path. 

142 

143 Args: 

144 source: Path to the source ``.xlsx`` / ``.xlsm`` file. 

145 dest: Destination ``.csv`` file path. Parent directories must 

146 exist. 

147 sheet: Worksheet name to export. Pass ``None`` to use the 

148 first sheet. 

149 separator: Column delimiter for the CSV output. Defaults to 

150 ``","`` (standard CSV). 

151 

152 Raises: 

153 FileNotFoundError: If ``source`` does not exist. 

154 

155 Example: 

156 >>> xlsx_to_csv("data.xlsx", "data.csv", sheet="Transactions") 

157 >>> xlsx_to_csv("data.xlsx", "data.tsv", separator="\\t") 

158 """ 

159 dest_path = Path(dest).resolve() 

160 

161 logger.debug( 

162 "xlsx_to_csv: %s → %s (sheet=%r, sep=%r)", 

163 Path(source).resolve(), 

164 dest_path, 

165 sheet, 

166 separator, 

167 ) 

168 

169 df = read_excel(source, sheet=sheet) 

170 df.write_csv(dest_path, separator=separator) 

171 

172 logger.debug("xlsx_to_csv: completed — wrote %s", dest_path) 

173 printer.success(f"xlsx_to_csv: conversion complete — {dest_path}") 

174 

175 

176def csv_to_xlsx( 

177 source: str | Path, 

178 dest: str | Path, 

179 sheet_name: str = "Sheet1", 

180) -> None: 

181 """Convert a CSV file to an Excel workbook using polars. 

182 

183 Reads the CSV with polars and writes it as an ``.xlsx`` file. 

184 polars delegates the Excel write step to ``xlsxwriter`` or 

185 ``openpyxl`` depending on which is installed; no additional 

186 configuration is required. 

187 

188 Args: 

189 source: Path to the source ``.csv`` file. 

190 dest: Destination ``.xlsx`` file path. Parent directories must 

191 exist. 

192 sheet_name: Name of the worksheet to create in the output 

193 workbook. Defaults to ``"Sheet1"``. 

194 

195 Raises: 

196 FileNotFoundError: If ``source`` does not exist. 

197 

198 Example: 

199 >>> csv_to_xlsx("transactions.csv", "transactions.xlsx", sheet_name="Data") 

200 """ 

201 dest_path = Path(dest).resolve() 

202 

203 logger.debug( 

204 "csv_to_xlsx: %s → %s (sheet=%r)", 

205 Path(source).resolve(), 

206 dest_path, 

207 sheet_name, 

208 ) 

209 

210 df = read_csv(source) 

211 df.write_excel(dest_path, worksheet=sheet_name) 

212 

213 logger.debug("csv_to_xlsx: completed — wrote %s", dest_path) 

214 printer.success(f"csv_to_xlsx: conversion complete — {dest_path}") 

215 

216 

217def read_sheet( 

218 source: str | Path, 

219 sheet: str | None = None, 

220) -> list[list[Any]]: 

221 """Read a worksheet into a row-major list of lists (compatibility shim). 

222 

223 Wraps ``read_excel`` and converts the resulting polars DataFrame to 

224 a ``list[list[Any]]`` via ``DataFrame.rows()``. The first row 

225 contains the column headers as extracted by polars. 

226 

227 This function exists for backwards compatibility with callers that 

228 pre-date the polars migration. New code should use ``read_excel`` 

229 directly to benefit from the full polars API. 

230 

231 Args: 

232 source: Path to the source ``.xlsx`` / ``.xlsm`` file. 

233 sheet: Worksheet name to read. Pass ``None`` to use the first 

234 sheet. 

235 

236 Returns: 

237 list[list[Any]]: Row-major 2D list of cell values. The first 

238 row contains column headers; subsequent rows contain data 

239 values. Empty cells are represented as ``None``. 

240 

241 Raises: 

242 FileNotFoundError: If ``source`` does not exist. 

243 

244 Example: 

245 >>> data = read_sheet("report.xlsx", sheet="Data") 

246 >>> headers = data[0] 

247 >>> rows = data[1:] 

248 """ 

249 logger.debug("read_sheet: %s (sheet=%r) — delegating to read_excel", source, sheet) 

250 

251 df = read_excel(source, sheet=sheet) 

252 

253 # Prepend column names as the first row to preserve the legacy contract 

254 # where callers expected headers in row 0. 

255 header_row: list[Any] = list(df.columns) 

256 data_rows: list[list[Any]] = [list(row) for row in df.rows()] 

257 

258 result: list[list[Any]] = [header_row, *data_rows] 

259 

260 logger.debug("read_sheet: returning %d rows (incl. header).", len(result)) 

261 return result