Coverage for src/rtflite/encode.py: 75%

130 statements  

« prev     ^ index     » next       coverage.py v7.10.5, created at 2025-08-25 22:35 +0000

1"""RTF Document class - main entry point for RTF generation. 

2 

3This module provides the RTFDocument class with a clean, service-oriented architecture. 

4All complex logic has been delegated to specialized services and strategies. 

5""" 

6 

7from collections.abc import Sequence 

8 

9import polars as pl 

10from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator 

11 

12from .input import ( 

13 RTFBody, 

14 RTFColumnHeader, 

15 RTFFigure, 

16 RTFFootnote, 

17 RTFPage, 

18 RTFPageFooter, 

19 RTFPageHeader, 

20 RTFSource, 

21 RTFSubline, 

22 RTFTitle, 

23) 

24from .row import Utils 

25 

26 

class RTFDocument(BaseModel):
    """Main class for creating RTF documents with tables, text, and figures.

    RTFDocument is the central class for generating Rich Text Format (RTF) files
    containing formatted tables, titles, footnotes, and other document elements.
    It provides a comprehensive API for creating professional documents commonly
    used in clinical trials, scientific research, and data reporting.

    Examples:
        Simple table with title:
        ```python
        import rtflite as rtf
        import polars as pl

        df = pl.DataFrame({
            "Subject": ["001", "002", "003"],
            "Age": [45, 52, 38],
            "Treatment": ["Drug A", "Drug B", "Placebo"]
        })

        doc = rtf.RTFDocument(
            df=df,
            rtf_title=rtf.RTFTitle(text="Patient Demographics"),
            rtf_body=rtf.RTFBody(col_rel_width=[2, 1, 2])
        )
        doc.write_rtf("demographics.rtf")
        ```

        Multi-page document with headers and footers:
        ```python
        doc = rtf.RTFDocument(
            df=large_df,
            rtf_page=rtf.RTFPage(nrow=40, orientation="landscape"),
            rtf_page_header=rtf.RTFPageHeader(),  # Default page numbering
            rtf_page_footer=rtf.RTFPageFooter(text="Confidential"),
            rtf_title=rtf.RTFTitle(text="Clinical Study Results"),
            rtf_column_header=rtf.RTFColumnHeader(
                text=["Subject ID", "Visit", "Result", "Units"]
            ),
            rtf_body=rtf.RTFBody(
                col_rel_width=[2, 1, 1, 1],
                text_justification=[["l", "c", "r", "c"]]
            ),
            rtf_footnote=rtf.RTFFootnote(
                text="Results are mean +/- SD"
            )
        )
        doc.write_rtf("results.rtf")
        ```

        Document with grouped data and sublines:
        ```python
        doc = rtf.RTFDocument(
            df=grouped_df,
            rtf_body=rtf.RTFBody(
                group_by=["SITE", "TREATMENT"],  # Suppress duplicate values
                subline_by=["STUDY_PHASE"],      # Create section headers
                col_rel_width=[2, 2, 1, 1]
            )
        )
        ```

    Attributes:
        df: Data to display in the table. Can be a single DataFrame or list of
            DataFrames for multi-section documents. Accepts pandas or polars
            DataFrames (automatically converted to polars internally).

        rtf_page: Page configuration including size, orientation, margins, and
            pagination settings.

        rtf_page_header: Optional header appearing at the top of every page.

        rtf_page_footer: Optional footer appearing at the bottom of every page.

        rtf_title: Document title(s) displayed at the top.

        rtf_column_header: Column headers for the table. Can be a single header
            or list of headers for multi-row headers.

        rtf_body: Table body configuration including column widths, formatting,
            borders, and special features like group_by and subline_by.

        rtf_footnote: Optional footnote text displayed after the table.

        rtf_source: Optional source citation displayed at the very bottom.

        rtf_figure: Optional figure/image to embed in the document.

    Methods:
        rtf_encode(): Generate the complete RTF document as a string.
        write_rtf(file_path): Write the RTF document to a file.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Core data
    df: pl.DataFrame | list[pl.DataFrame] | None = Field(
        default=None,
        description="The DataFrame(s) containing the data for the RTF document. Accepts single DataFrame or list of DataFrames for multi-section documents. Accepts pandas or polars DataFrame, internally converted to polars. Optional when using figure-only documents.",
    )

    # Document structure
    rtf_page: RTFPage = Field(
        default_factory=lambda: RTFPage(),
        description="Page settings including size, orientation and margins",
    )
    rtf_page_header: RTFPageHeader | None = Field(
        default=None, description="Text to appear in the header of each page"
    )
    rtf_title: RTFTitle | None = Field(
        default_factory=lambda: RTFTitle(),
        description="Title section settings including text and formatting",
    )
    rtf_subline: RTFSubline | None = Field(
        default=None, description="Subject line text to appear below the title"
    )
    rtf_column_header: (
        Sequence[RTFColumnHeader] | Sequence[Sequence[RTFColumnHeader | None]]
    ) = Field(
        default_factory=lambda: [RTFColumnHeader()],
        description="Column header settings. For multi-section documents, use nested list format: [[header1], [header2], [None]] where None means no header for that section.",
    )
    rtf_body: RTFBody | Sequence[RTFBody] | None = Field(
        default_factory=lambda: RTFBody(),
        description="Table body section settings including column widths and formatting. For multi-section documents, provide a list of RTFBody objects.",
    )
    rtf_footnote: RTFFootnote | None = Field(
        default=None, description="Footnote text to appear at bottom of document"
    )
    rtf_source: RTFSource | None = Field(
        default=None, description="Data source citation text"
    )
    rtf_page_footer: RTFPageFooter | None = Field(
        default=None, description="Text to appear in the footer of each page"
    )
    rtf_figure: RTFFigure | None = Field(
        default=None, description="Figure/image content to embed in the document"
    )

    @field_validator("rtf_column_header", mode="before")
    @classmethod
    def convert_column_header_to_list(cls, v):
        """Wrap a bare ``RTFColumnHeader`` in a list.

        Any other value (flat list, nested list, ...) is returned unchanged so
        that the regular field validation can handle it.
        """
        if isinstance(v, RTFColumnHeader):
            return [v]
        return v

    @model_validator(mode="before")
    @classmethod
    def validate_dataframe(cls, values):
        """Convert the ``df`` input (single frame or list of frames) to polars.

        Polars DataFrames are passed through untouched; every other object is
        handed to narwhals and converted with ``to_polars()``.

        Returns:
            The input unchanged when there is nothing to convert, otherwise a
            new dict in which ``df`` holds the polars DataFrame(s). The caller's
            mapping is never mutated.

        Raises:
            ValueError: If a frame cannot be interpreted as a DataFrame. The
                original exception is chained as the cause.
        """
        from collections.abc import Mapping

        # A before-mode validator sees the raw input, which need not be a dict.
        if not isinstance(values, Mapping) or values.get("df") is None:
            return values

        import narwhals as nw

        def to_polars(frame, label):
            if isinstance(frame, pl.DataFrame):
                return frame  # Already polars
            try:
                # narwhals' ``to_native()`` takes no arguments and would hand
                # back the original (non-polars) object; ``to_polars()`` is the
                # conversion call.
                return nw.from_native(frame).to_polars()
            except Exception as e:
                # Re-raised as ValueError so pydantic reports it as a
                # validation failure; ``from e`` keeps the real cause visible.
                raise ValueError(f"{label} must be a valid DataFrame: {e}") from e

        df = values["df"]
        if isinstance(df, list):
            converted = [
                to_polars(frame, f"DataFrame at index {i}")
                for i, frame in enumerate(df)
            ]
        else:
            converted = to_polars(df, "DataFrame")

        return {**values, "df": converted}

    @model_validator(mode="after")
    def validate_column_names(self):
        """Validate df/figure usage, multi-section consistency and column references.

        Raises:
            ValueError: If neither or both of ``df`` and ``rtf_figure`` are
                given, if a figure is combined with an ``as_table`` footnote or
                source, if the multi-section lists have mismatched lengths, if
                ``rtf_body`` does not match the shape of ``df``, or if a
                ``group_by``/``page_by``/``subline_by`` column is missing.
        """
        # Validate df and rtf_figure usage
        if self.df is None and self.rtf_figure is None:
            raise ValueError("Either 'df' or 'rtf_figure' must be provided")

        if self.df is not None and self.rtf_figure is not None:
            raise ValueError(
                "Cannot use both 'df' and 'rtf_figure' together. Use either tables or figures in a single document."
            )

        # When RTFFigure is used, enforce as_table=False for footnotes and sources
        if self.rtf_figure is not None:
            if self.rtf_footnote is not None and getattr(
                self.rtf_footnote, "as_table", True
            ):
                raise ValueError(
                    "When using RTFFigure, RTFFootnote must have as_table=False"
                )
            if self.rtf_source is not None and getattr(
                self.rtf_source, "as_table", False
            ):
                raise ValueError(
                    "When using RTFFigure, RTFSource must have as_table=False"
                )

        # Skip column validation if no DataFrame provided (figure-only documents)
        if self.df is None:
            return self

        if isinstance(self.df, list):
            # Multi-section: rtf_body must be a list of matching length
            if not isinstance(self.rtf_body, list):
                raise ValueError("When df is a list, rtf_body must also be a list")
            if len(self.df) != len(self.rtf_body):
                raise ValueError(
                    f"df list length ({len(self.df)}) must match rtf_body list length ({len(self.rtf_body)})"
                )

            # Nested header format must provide one entry per section. The
            # truthiness guard avoids an IndexError on an empty header list.
            if self.rtf_column_header and isinstance(self.rtf_column_header[0], list):
                if len(self.rtf_column_header) != len(self.df):
                    raise ValueError(
                        f"rtf_column_header nested list length ({len(self.rtf_column_header)}) must match df list length ({len(self.df)})"
                    )

            # Per-section column validation
            for i, (section_df, section_body) in enumerate(zip(self.df, self.rtf_body)):
                self._validate_section_columns(section_df, section_body, i)
        else:
            # A single DataFrame needs exactly one body; report None or a list
            # of bodies here instead of failing later with an AttributeError.
            if not isinstance(self.rtf_body, RTFBody):
                raise ValueError(
                    "When df is a single DataFrame, rtf_body must be a single RTFBody"
                )
            self._validate_section_columns(self.df, self.rtf_body, 0)

        return self

    def _validate_section_columns(self, df, body, section_index):
        """Check that every column referenced by ``body`` exists in ``df``.

        Args:
            df: DataFrame of the section (only ``df.columns`` is read).
            body: Body settings providing ``group_by``, ``page_by`` and
                ``subline_by`` (each ``None`` or an iterable of column names).
            section_index: Position of the section; index 0 is reported as
                ``df`` in the error message, later ones as ``section <n>``.

        Raises:
            ValueError: On the first referenced column that is not in ``df``.
        """
        columns = df.columns
        section_label = f"section {section_index}" if section_index > 0 else "df"

        for option in ("group_by", "page_by", "subline_by"):
            referenced = getattr(body, option)
            if referenced is None:
                continue
            for column in referenced:
                if column not in columns:
                    raise ValueError(
                        f"`{option}` column {column} not found in {section_label}"
                    )

    def _inherit_col_rel_width(self, headers, widths):
        """Give each header without ``col_rel_width`` its own copy of ``widths``."""
        for header in headers:
            if header and header.col_rel_width is None:
                header.col_rel_width = widths.copy()

    def __init__(self, **data):
        """Validate the inputs, then fill in default widths and table spacing.

        After pydantic validation, bodies without ``col_rel_width`` get equal
        widths (one unit per DataFrame column), column headers without
        ``col_rel_width`` inherit the widths of their body, and text components
        that indent relative to the table are shifted by the table offset.
        """
        super().__init__(**data)

        # Set default column widths based on DataFrame dimensions (if DataFrame provided)
        if self.df is not None:
            headers = self.rtf_column_header

            if isinstance(self.df, list):
                # Multi-section documents: one body per DataFrame
                for section_df, section_body in zip(self.df, self.rtf_body):
                    section_body.col_rel_width = (
                        section_body.col_rel_width or [1] * section_df.shape[1]
                    )

                if headers and isinstance(headers[0], list):
                    # Nested list format: [[header1], [header2], [None]]
                    for section_headers, section_body in zip(headers, self.rtf_body):
                        if section_headers:
                            self._inherit_col_rel_width(
                                section_headers, section_body.col_rel_width
                            )
                elif headers:
                    # Flat list format - widths come from the first section only
                    self._inherit_col_rel_width(
                        headers, self.rtf_body[0].col_rel_width
                    )
            else:
                # Single section documents
                self.rtf_body.col_rel_width = (
                    self.rtf_body.col_rel_width or [1] * self.df.shape[1]
                )

                # Inherit col_rel_width from rtf_body to rtf_column_header if not specified
                if headers:
                    self._inherit_col_rel_width(headers, self.rtf_body.col_rel_width)

        # Half of the width not occupied by the table, in twips
        self._table_space = int(
            Utils._inch_to_twip(self.rtf_page.width - self.rtf_page.col_width) / 2
        )

        # Apply table spacing to text components if needed
        self._apply_table_spacing()

    def _apply_table_spacing(self):
        """Add the table offset to components whose indent reference is "table".

        Applies to the subline, page header and page footer; both
        ``text_space_before`` and ``text_space_after`` are increased by
        ``self._table_space``.
        """
        for component in [self.rtf_subline, self.rtf_page_header, self.rtf_page_footer]:
            if component is not None and component.text_indent_reference == "table":
                component.text_space_before = (
                    self._table_space + component.text_space_before
                )
                component.text_space_after = (
                    self._table_space + component.text_space_after
                )

    def rtf_encode(self) -> str:
        """Generate the complete RTF document as a string.

        The work is delegated to ``RTFEncodingEngine.encode_document``; the
        resulting string can be written to a file or processed further.

        Returns:
            str: Complete RTF document string ready to be saved as an .rtf file.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            rtf_string = doc.rtf_encode()
            # Can write manually or process further
            with open("output.rtf", "w") as f:
                f.write(rtf_string)
            ```
        """
        # Imported here rather than at module level, as in the original code.
        from .encoding import RTFEncodingEngine

        engine = RTFEncodingEngine()
        return engine.encode_document(self)

    def write_rtf(self, file_path: str) -> None:
        """Write the RTF document to a file.

        Generates the complete RTF document and writes it to the specified file
        path using UTF-8 encoding. The path is used exactly as given; no
        ``.rtf`` extension is added or enforced.

        Args:
            file_path: Path where the RTF file should be saved. Can be absolute
                or relative path. Directory must exist.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            doc.write_rtf("output/report.rtf")
            ```

        Note:
            The method prints the file path to stdout before the document is
            encoded and written. Ensure the directory exists before calling
            this method.
        """
        print(file_path)
        rtf_code = self.rtf_encode()
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(rtf_code)