Coverage for src / rtflite / encode.py: 76%

135 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-28 05:09 +0000

1"""RTF Document class - main entry point for RTF generation. 

2 

3This module provides the RTFDocument class with a clean, service-oriented architecture. 

4All complex logic has been delegated to specialized services and strategies. 

5""" 

6 

7from collections.abc import Sequence 

8 

9import polars as pl 

10from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator 

11 

12from .input import ( 

13 RTFBody, 

14 RTFColumnHeader, 

15 RTFFigure, 

16 RTFFootnote, 

17 RTFPage, 

18 RTFPageFooter, 

19 RTFPageHeader, 

20 RTFSource, 

21 RTFSubline, 

22 RTFTitle, 

23) 

24from .row import Utils 

25 

26 

class RTFDocument(BaseModel):
    """Main class for creating RTF documents with tables, text, and figures.

    RTFDocument is the central class for generating Rich Text Format (RTF) files
    containing formatted tables, titles, footnotes, and other document elements.
    A document holds either tabular data (``df``) or a figure (``rtf_figure``),
    never both.

    Examples:
        Simple table with title:
        ```python
        import rtflite as rtf
        import polars as pl

        df = pl.DataFrame({
            "Subject": ["001", "002", "003"],
            "Age": [45, 52, 38],
            "Treatment": ["Drug A", "Drug B", "Placebo"]
        })

        doc = rtf.RTFDocument(
            df=df,
            rtf_title=rtf.RTFTitle(text="Patient Demographics"),
            rtf_body=rtf.RTFBody(col_rel_width=[2, 1, 2])
        )
        doc.write_rtf("demographics.rtf")
        ```

        Multi-page document with headers and footers:
        ```python
        doc = rtf.RTFDocument(
            df=large_df,
            rtf_page=rtf.RTFPage(nrow=40, orientation="landscape"),
            rtf_page_header=rtf.RTFPageHeader(),  # Default page numbering
            rtf_page_footer=rtf.RTFPageFooter(text="Confidential"),
            rtf_title=rtf.RTFTitle(text="Clinical Study Results"),
            rtf_column_header=rtf.RTFColumnHeader(
                text=["Subject ID", "Visit", "Result", "Units"]
            ),
            rtf_body=rtf.RTFBody(
                col_rel_width=[2, 1, 1, 1],
                text_justification=[["l", "c", "r", "c"]]
            ),
            rtf_footnote=rtf.RTFFootnote(
                text="Results are mean +/- SD"
            )
        )
        doc.write_rtf("results.rtf")
        ```

        Document with grouped data and sublines:
        ```python
        doc = rtf.RTFDocument(
            df=grouped_df,
            rtf_body=rtf.RTFBody(
                group_by=["SITE", "TREATMENT"],  # Suppress duplicate values
                subline_by=["STUDY_PHASE"],  # Create section headers
                col_rel_width=[2, 2, 1, 1]
            )
        )
        ```

    Attributes:
        df: Data to display in the table. Can be a single DataFrame or list of
            DataFrames for multi-section documents. Non-polars eager
            DataFrames are converted to polars through narwhals.

        rtf_page: Page configuration including size, orientation, margins, and
            pagination settings.

        rtf_page_header: Optional header appearing at the top of every page.

        rtf_page_footer: Optional footer appearing at the bottom of every page.

        rtf_title: Document title(s) displayed at the top.

        rtf_subline: Optional subject line displayed below the title.

        rtf_column_header: Column headers for the table. Can be a single header
            or list of headers for multi-row headers.

        rtf_body: Table body configuration including column widths, formatting,
            borders, and special features like group_by and subline_by.

        rtf_footnote: Optional footnote text displayed after the table.

        rtf_source: Optional source citation displayed at the very bottom.

        rtf_figure: Optional figure/image to embed in the document.

    Methods:
        rtf_encode(): Generate the complete RTF document as a string.
        write_rtf(file_path): Write the RTF document to a file.
    """

    # polars DataFrames are not pydantic-native types.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Core data
    df: pl.DataFrame | list[pl.DataFrame] | None = Field(
        default=None,
        description=(
            "The DataFrame(s) containing the data for the RTF document. "
            "Accepts single DataFrame or list of DataFrames for "
            "multi-section documents. Accepts pandas or polars DataFrame, "
            "internally converted to polars. Optional when using figure-only "
            "documents."
        ),
    )

    # Document structure
    rtf_page: RTFPage = Field(
        default_factory=lambda: RTFPage(),
        description="Page settings including size, orientation and margins",
    )
    rtf_page_header: RTFPageHeader | None = Field(
        default=None, description="Text to appear in the header of each page"
    )
    rtf_title: RTFTitle | None = Field(
        default_factory=lambda: RTFTitle(),
        description="Title section settings including text and formatting",
    )
    rtf_subline: RTFSubline | None = Field(
        default=None, description="Subject line text to appear below the title"
    )
    rtf_column_header: (
        Sequence[RTFColumnHeader] | Sequence[Sequence[RTFColumnHeader | None]]
    ) = Field(
        default_factory=lambda: [RTFColumnHeader()],
        description=(
            "Column header settings. For multi-section documents, use nested "
            "list format: [[header1], [header2], [None]] where None means no "
            "header for that section."
        ),
    )
    rtf_body: RTFBody | Sequence[RTFBody] | None = Field(
        default_factory=lambda: RTFBody(),
        description=(
            "Table body section settings including column widths and "
            "formatting. For multi-section documents, provide a list of "
            "RTFBody objects."
        ),
    )
    rtf_footnote: RTFFootnote | None = Field(
        default=None, description="Footnote text to appear at bottom of document"
    )
    rtf_source: RTFSource | None = Field(
        default=None, description="Data source citation text"
    )
    rtf_page_footer: RTFPageFooter | None = Field(
        default=None, description="Text to appear in the footer of each page"
    )
    rtf_figure: RTFFigure | None = Field(
        default=None, description="Figure/image content to embed in the document"
    )

    @field_validator("rtf_column_header", mode="before")
    @classmethod
    def convert_column_header_to_list(cls, v):
        """Wrap a lone RTFColumnHeader in a list; pass anything else through."""
        if isinstance(v, RTFColumnHeader):
            return [v]
        return v

    @staticmethod
    def _to_polars(frame, label):
        """Return ``frame`` as a polars DataFrame.

        polars DataFrames are returned unchanged. Anything else is handed to
        narwhals; any failure there is re-raised as a ``ValueError`` whose
        message starts with ``label``.
        """
        if isinstance(frame, pl.DataFrame):
            return frame

        # Imported lazily so polars-only callers never touch narwhals.
        import narwhals as nw

        try:
            # `DataFrame.to_native()` would hand back the original backend
            # object; `to_polars()` is the call that actually converts.
            return nw.from_native(frame, eager_only=True).to_polars()
        except Exception as exc:
            raise ValueError(f"{label} must be a valid DataFrame: {exc!s}") from exc

    @model_validator(mode="before")
    @classmethod
    def validate_dataframe(cls, values):
        """Convert DataFrame(s) under the ``df`` key to polars.

        Input that is not a dict, or that has no non-None ``df`` entry, is
        returned unchanged. Otherwise a new dict is returned so the caller's
        mapping is not modified.

        Raises:
            ValueError: If a supplied object cannot be converted to a polars
                DataFrame. For list input the message names the index.
        """
        if not isinstance(values, dict) or values.get("df") is None:
            return values

        raw = values["df"]
        if isinstance(raw, list):
            converted = [
                cls._to_polars(item, f"DataFrame at index {i}")
                for i, item in enumerate(raw)
            ]
        else:
            converted = cls._to_polars(raw, "DataFrame")

        return {**values, "df": converted}

    @model_validator(mode="after")
    def validate_column_names(self):
        """Validate df/figure exclusivity, section shapes, and column references.

        Raises:
            ValueError: If neither or both of ``df`` and ``rtf_figure`` are
                given, if a figure is combined with a table-style footnote or
                source, if ``df`` and ``rtf_body`` disagree in shape, or if a
                ``group_by``/``page_by``/``subline_by`` column is missing.
        """
        # Validate df and rtf_figure usage
        if self.df is None and self.rtf_figure is None:
            raise ValueError("Either 'df' or 'rtf_figure' must be provided")

        if self.df is not None and self.rtf_figure is not None:
            raise ValueError(
                "Cannot use both 'df' and 'rtf_figure' together. Use either "
                "tables or figures in a single document."
            )

        # When RTFFigure is used, enforce as_table=False for footnotes and sources
        if self.rtf_figure is not None:
            if self.rtf_footnote is not None and getattr(
                self.rtf_footnote, "as_table", True
            ):
                raise ValueError(
                    "When using RTFFigure, RTFFootnote must have as_table=False"
                )
            if self.rtf_source is not None and getattr(
                self.rtf_source, "as_table", False
            ):
                raise ValueError(
                    "When using RTFFigure, RTFSource must have as_table=False"
                )

        # Skip column validation if no DataFrame provided (figure-only documents)
        if self.df is None:
            return self

        if isinstance(self.df, list):
            # Multi-section: rtf_body must be a list of the same length.
            if not isinstance(self.rtf_body, list):
                raise ValueError("When df is a list, rtf_body must also be a list")
            if len(self.df) != len(self.rtf_body):
                raise ValueError(
                    "df list length "
                    f"({len(self.df)}) must match rtf_body list length "
                    f"({len(self.rtf_body)})"
                )

            # A nested header list must provide one entry per section.
            if (
                isinstance(self.rtf_column_header, list)
                and self.rtf_column_header
                and isinstance(self.rtf_column_header[0], list)
                and len(self.rtf_column_header) != len(self.df)
            ):
                raise ValueError(
                    "rtf_column_header nested list length "
                    f"({len(self.rtf_column_header)}) must match df list "
                    f"length ({len(self.df)})"
                )

            for i, (section_df, section_body) in enumerate(
                zip(self.df, self.rtf_body, strict=True)
            ):
                self._validate_section_columns(section_df, section_body, i)
        else:
            # Without this check a None or sequence body fails further down
            # with an AttributeError that does not name the real problem.
            if not isinstance(self.rtf_body, RTFBody):
                raise ValueError(
                    "When df is a single DataFrame, rtf_body must be a single "
                    "RTFBody"
                )
            self._validate_section_columns(self.df, self.rtf_body, 0)

        return self

    def _validate_section_columns(self, df, body, section_index):
        """Check that every column named by ``body`` exists in ``df``.

        Raises:
            ValueError: On the first ``group_by``, ``page_by`` or
                ``subline_by`` entry (checked in that order) that is not in
                ``df.columns``.
        """
        columns = df.columns
        # Index 0 is labelled "df" for both single- and multi-section input.
        section_label = f"section {section_index}" if section_index > 0 else "df"

        for option in ("group_by", "page_by", "subline_by"):
            referenced = getattr(body, option)
            if referenced is None:
                continue
            for column in referenced:
                if column not in columns:
                    raise ValueError(
                        f"`{option}` column {column} not found in {section_label}"
                    )

    @staticmethod
    def _fill_default_widths(body, n_cols):
        """Default or broadcast ``body.col_rel_width`` to ``n_cols`` entries."""
        if body.col_rel_width is None:
            body.col_rel_width = [1] * n_cols
        elif len(body.col_rel_width) == 1 and n_cols > 1:
            body.col_rel_width = body.col_rel_width * n_cols

    @staticmethod
    def _inherit_header_widths(headers, widths):
        """Give each header lacking ``col_rel_width`` its own copy of ``widths``."""
        for header in headers:
            if header is not None and header.col_rel_width is None:
                # list(...) copies like .copy() but does not require a list.
                header.col_rel_width = list(widths)

    def __init__(self, **data):
        """Validate the fields, then fill in width defaults and table spacing.

        NOTE(review): the body, header and text components passed in are
        modified in place here; whether reusing one component object across
        several documents is safe depends on those classes — confirm.
        """
        super().__init__(**data)

        # Set default column widths based on DataFrame dimensions when a
        # DataFrame is provided.
        if self.df is not None:
            if isinstance(self.df, list):
                for section_df, section_body in zip(
                    self.df, self.rtf_body, strict=True
                ):
                    self._fill_default_widths(section_body, section_df.shape[1])

                if self.rtf_column_header and isinstance(
                    self.rtf_column_header[0], list
                ):
                    # Nested list format: [[header1], [header2], [None]]
                    for section_headers, section_body in zip(
                        self.rtf_column_header, self.rtf_body, strict=True
                    ):
                        if section_headers:
                            self._inherit_header_widths(
                                section_headers, section_body.col_rel_width
                            )
                elif self.rtf_column_header and self.rtf_body:
                    # Flat list format: widths come from the first section.
                    # The rtf_body check avoids an IndexError on empty input.
                    self._inherit_header_widths(
                        self.rtf_column_header, self.rtf_body[0].col_rel_width
                    )
            else:
                self._fill_default_widths(self.rtf_body, self.df.shape[1])

                # Inherit col_rel_width from rtf_body to rtf_column_header if
                # not specified
                if self.rtf_column_header:
                    self._inherit_header_widths(
                        self.rtf_column_header, self.rtf_body.col_rel_width
                    )

        # Half of (page width - column width), passed through
        # Utils._inch_to_twip and truncated to an int.
        self._table_space = int(
            Utils._inch_to_twip(self.rtf_page.width - self.rtf_page.col_width) / 2
        )

        # Apply table spacing to text components if needed
        self._apply_table_spacing()

    def _apply_table_spacing(self):
        """Add ``_table_space`` to components whose indent reference is "table".

        Affects the subline, page header and page footer. The offset is added
        to the existing ``text_space_before``/``text_space_after`` values, so
        calling this more than once would add it again.
        """
        for component in [self.rtf_subline, self.rtf_page_header, self.rtf_page_footer]:
            if component is not None and component.text_indent_reference == "table":
                component.text_space_before = (
                    self._table_space + component.text_space_before
                )
                component.text_space_after = (
                    self._table_space + component.text_space_after
                )

    def rtf_encode(self) -> str:
        """Generate the complete RTF document as a string.

        Delegates to ``RTFEncodingEngine.encode_document``.

        Returns:
            str: The value returned by the encoding engine for this document.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            rtf_string = doc.rtf_encode()
            # Can write manually or process further
            with open("output.rtf", "w") as f:
                f.write(rtf_string)
            ```
        """
        # Imported here rather than at module level, as in the original.
        from .encoding import RTFEncodingEngine

        engine = RTFEncodingEngine()
        return engine.encode_document(self)

    def write_rtf(self, file_path: str) -> None:
        """Write the RTF document to a file.

        Encodes the document with ``rtf_encode`` and writes the result to
        ``file_path`` using UTF-8. The path is used exactly as given; no
        extension is added or checked.

        Args:
            file_path: Path where the RTF file should be saved. Can be absolute
                or relative path. Directory must exist.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            doc.write_rtf("output/report.rtf")
            ```

        Note:
            The method prints the file path to stdout before encoding, so the
            path is printed even if encoding or writing then fails.
        """
        print(file_path)
        rtf_code = self.rtf_encode()
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(rtf_code)