Coverage for src/rtflite/encode.py: 75%

129 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-14 16:35 +0000

1"""RTF Document class - main entry point for RTF generation. 

2 

3This module provides the RTFDocument class with a clean, service-oriented architecture. 

4All complex logic has been delegated to specialized services and strategies. 

5""" 

6 

7import polars as pl 

8from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator 

9 

10from .input import ( 

11 RTFBody, 

12 RTFColumnHeader, 

13 RTFFigure, 

14 RTFFootnote, 

15 RTFPage, 

16 RTFPageFooter, 

17 RTFPageHeader, 

18 RTFSource, 

19 RTFSubline, 

20 RTFTitle, 

21) 

22from .row import Utils 

23 

24 

class RTFDocument(BaseModel):
    """Main class for creating RTF documents with tables, text, and figures.

    RTFDocument is the central class for generating Rich Text Format (RTF) files
    containing formatted tables, titles, footnotes, and other document elements.
    It provides a comprehensive API for creating professional documents commonly
    used in clinical trials, scientific research, and data reporting.

    Examples:
        Simple table with title:
        ```python
        import rtflite as rtf
        import polars as pl

        df = pl.DataFrame({
            "Subject": ["001", "002", "003"],
            "Age": [45, 52, 38],
            "Treatment": ["Drug A", "Drug B", "Placebo"]
        })

        doc = rtf.RTFDocument(
            df=df,
            rtf_title=rtf.RTFTitle(text="Patient Demographics"),
            rtf_body=rtf.RTFBody(col_rel_width=[2, 1, 2])
        )
        doc.write_rtf("demographics.rtf")
        ```

        Multi-page document with headers and footers:
        ```python
        doc = rtf.RTFDocument(
            df=large_df,
            rtf_page=rtf.RTFPage(nrow=40, orientation="landscape"),
            rtf_page_header=rtf.RTFPageHeader(),  # Default page numbering
            rtf_page_footer=rtf.RTFPageFooter(text="Confidential"),
            rtf_title=rtf.RTFTitle(text="Clinical Study Results"),
            rtf_column_header=rtf.RTFColumnHeader(
                text=["Subject ID", "Visit", "Result", "Units"]
            ),
            rtf_body=rtf.RTFBody(
                col_rel_width=[2, 1, 1, 1],
                text_justification=[["l", "c", "r", "c"]]
            ),
            rtf_footnote=rtf.RTFFootnote(
                text="Results are mean +/- SD"
            )
        )
        doc.write_rtf("results.rtf")
        ```

        Document with grouped data and sublines:
        ```python
        doc = rtf.RTFDocument(
            df=grouped_df,
            rtf_body=rtf.RTFBody(
                group_by=["SITE", "TREATMENT"],  # Suppress duplicate values
                subline_by=["STUDY_PHASE"],      # Create section headers
                col_rel_width=[2, 2, 1, 1]
            )
        )
        ```

    Attributes:
        df: Data to display in the table. Can be a single DataFrame or list of
            DataFrames for multi-section documents. Accepts pandas or polars
            DataFrames (automatically converted to polars internally).

        rtf_page: Page configuration including size, orientation, margins, and
            pagination settings.

        rtf_page_header: Optional header appearing at the top of every page.

        rtf_page_footer: Optional footer appearing at the bottom of every page.

        rtf_title: Document title(s) displayed at the top.

        rtf_subline: Optional subject line text appearing below the title.

        rtf_column_header: Column headers for the table. Can be a single header
            or list of headers for multi-row headers.

        rtf_body: Table body configuration including column widths, formatting,
            borders, and special features like group_by and subline_by.

        rtf_footnote: Optional footnote text displayed after the table.

        rtf_source: Optional source citation displayed at the very bottom.

        rtf_figure: Optional figure/image to embed in the document.

    Methods:
        rtf_encode(): Generate the complete RTF document as a string.
        write_rtf(file_path): Write the RTF document to a file.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Core data
    df: pl.DataFrame | list[pl.DataFrame] | None = Field(
        default=None,
        description="The DataFrame(s) containing the data for the RTF document. Accepts single DataFrame or list of DataFrames for multi-section documents. Accepts pandas or polars DataFrame, internally converted to polars. Optional when using figure-only documents.",
    )

    # Document structure
    rtf_page: RTFPage = Field(
        default_factory=lambda: RTFPage(),
        description="Page settings including size, orientation and margins",
    )
    rtf_page_header: RTFPageHeader | None = Field(
        default=None, description="Text to appear in the header of each page"
    )
    rtf_title: RTFTitle | None = Field(
        default_factory=lambda: RTFTitle(),
        description="Title section settings including text and formatting",
    )
    rtf_subline: RTFSubline | None = Field(
        default=None, description="Subject line text to appear below the title"
    )
    rtf_column_header: list[RTFColumnHeader] | list[list[RTFColumnHeader | None]] = (
        Field(
            default_factory=lambda: [RTFColumnHeader()],
            description="Column header settings. For multi-section documents, use nested list format: [[header1], [header2], [None]] where None means no header for that section.",
        )
    )
    rtf_body: RTFBody | list[RTFBody] | None = Field(
        default_factory=lambda: RTFBody(),
        description="Table body section settings including column widths and formatting. For multi-section documents, provide a list of RTFBody objects.",
    )
    rtf_footnote: RTFFootnote | None = Field(
        default=None, description="Footnote text to appear at bottom of document"
    )
    rtf_source: RTFSource | None = Field(
        default=None, description="Data source citation text"
    )
    rtf_page_footer: RTFPageFooter | None = Field(
        default=None, description="Text to appear in the footer of each page"
    )
    rtf_figure: RTFFigure | None = Field(
        default=None, description="Figure/image content to embed in the document"
    )

    @field_validator("rtf_column_header", mode="before")
    @classmethod
    def convert_column_header_to_list(cls, v):
        """Wrap a single RTFColumnHeader in a list; pass any other value through.

        Flat and nested list inputs are returned unchanged and left to the
        field's own type validation.
        """
        # isinstance() is already False for None, so no separate None check
        if isinstance(v, RTFColumnHeader):
            return [v]
        return v

    @model_validator(mode="before")
    @classmethod
    def validate_dataframe(cls, values):
        """Convert the incoming DataFrame(s) to polars for internal processing.

        Args:
            values: Raw input data for the model. Only the "df" entry is
                inspected; it may be a single DataFrame or a list of them.

        Returns:
            The same ``values`` object with "df" replaced by polars
            DataFrame(s) when a non-None "df" was supplied.

        Raises:
            ValueError: If a supplied object cannot be converted to a polars
                DataFrame. The underlying error is chained as the cause.
        """
        if "df" not in values or values["df"] is None:
            return values

        # Imported lazily, as in the original, so narwhals is only needed
        # when a DataFrame is actually supplied
        import narwhals as nw

        def to_polars(frame, label):
            """Return ``frame`` as a polars DataFrame, converting if needed."""
            if isinstance(frame, pl.DataFrame):
                return frame  # Already polars
            try:
                # to_polars() performs the backend conversion; to_native()
                # would only hand back the original (non-polars) object
                return nw.from_native(frame, eager_only=True).to_polars()
            except Exception as e:
                raise ValueError(f"{label} must be a valid DataFrame: {e}") from e

        df = values["df"]
        if isinstance(df, list):
            # Multi-section input: convert each section, reporting its index
            values["df"] = [
                to_polars(single_df, f"DataFrame at index {i}")
                for i, single_df in enumerate(df)
            ]
        else:
            values["df"] = to_polars(df, "DataFrame")
        return values

    @model_validator(mode="after")
    def validate_column_names(self):
        """Validate df/figure usage, multi-section consistency and column references.

        Returns:
            The validated model instance.

        Raises:
            ValueError: If neither or both of ``df`` and ``rtf_figure`` are
                given, if table-style footnote/source is combined with a
                figure, if the shapes of ``df``, ``rtf_body`` and
                ``rtf_column_header`` disagree, or if a ``group_by`` /
                ``page_by`` / ``subline_by`` column is missing from its
                DataFrame.
        """
        # Validate df and rtf_figure usage
        if self.df is None and self.rtf_figure is None:
            raise ValueError("Either 'df' or 'rtf_figure' must be provided")

        if self.df is not None and self.rtf_figure is not None:
            raise ValueError(
                "Cannot use both 'df' and 'rtf_figure' together. Use either tables or figures in a single document."
            )

        # When RTFFigure is used, enforce as_table=False for footnotes and sources
        if self.rtf_figure is not None:
            if self.rtf_footnote is not None and getattr(
                self.rtf_footnote, "as_table", True
            ):
                raise ValueError(
                    "When using RTFFigure, RTFFootnote must have as_table=False"
                )
            if self.rtf_source is not None and getattr(
                self.rtf_source, "as_table", False
            ):
                raise ValueError(
                    "When using RTFFigure, RTFSource must have as_table=False"
                )

        # Skip column validation if no DataFrame provided (figure-only documents)
        if self.df is None:
            return self

        if isinstance(self.df, list):
            # Multi-section: rtf_body must be a list of the same length
            if not isinstance(self.rtf_body, list):
                raise ValueError("When df is a list, rtf_body must also be a list")
            if len(self.df) != len(self.rtf_body):
                raise ValueError(
                    f"df list length ({len(self.df)}) must match rtf_body list length ({len(self.rtf_body)})"
                )

            # Nested header lists must line up with the sections. The
            # emptiness guard avoids an IndexError on an empty header list.
            if (
                self.rtf_column_header
                and isinstance(self.rtf_column_header[0], list)
                and len(self.rtf_column_header) != len(self.df)
            ):
                raise ValueError(
                    f"rtf_column_header nested list length ({len(self.rtf_column_header)}) must match df list length ({len(self.df)})"
                )

            # Per-section column validation
            for i, (section_df, section_body) in enumerate(zip(self.df, self.rtf_body)):
                self._validate_section_columns(section_df, section_body, i)
        else:
            # A single DataFrame needs exactly one body; None or a list would
            # otherwise fail below with an opaque AttributeError
            if not isinstance(self.rtf_body, RTFBody):
                raise ValueError(
                    "When df is a single DataFrame, rtf_body must be a single RTFBody"
                )
            self._validate_section_columns(self.df, self.rtf_body, 0)

        return self

    def _validate_section_columns(self, df, body, section_index):
        """Validate column references for a single section.

        Args:
            df: DataFrame of the section; its ``columns`` are checked against.
            body: Body settings whose ``group_by``, ``page_by`` and
                ``subline_by`` entries are validated.
            section_index: Position of the section, used in error messages.

        Raises:
            ValueError: If a referenced column is not present in ``df``.
        """
        columns = df.columns
        # Index 0 is reported as "df" (covers the single-section case)
        section_label = f"section {section_index}" if section_index > 0 else "df"

        for option in ("group_by", "page_by", "subline_by"):
            referenced = getattr(body, option)
            if referenced is None:
                continue
            for column in referenced:
                if column not in columns:
                    raise ValueError(
                        f"`{option}` column {column} not found in {section_label}"
                    )

    def __init__(self, **data):
        """Validate the input, then fill in default widths and text spacing.

        After pydantic validation, ``col_rel_width`` defaults to equal widths
        (one per DataFrame column) on each body, column headers without their
        own ``col_rel_width`` receive a copy of the body's, and the
        table-relative spacing is computed and applied to text components.
        """
        super().__init__(**data)

        # Set default column widths based on DataFrame dimensions (if DataFrame provided)
        if self.df is not None:
            if isinstance(self.df, list):
                # Multi-section documents: one body per DataFrame
                for section_df, section_body in zip(self.df, self.rtf_body):
                    section_body.col_rel_width = (
                        section_body.col_rel_width or [1] * section_df.shape[1]
                    )

                if self.rtf_column_header and isinstance(
                    self.rtf_column_header[0], list
                ):
                    # Nested list format: [[header1], [header2], [None]]
                    for section_headers, section_body in zip(
                        self.rtf_column_header, self.rtf_body
                    ):
                        if not section_headers:
                            continue
                        for header in section_headers:
                            # None entries mean "no header for this section"
                            if header and header.col_rel_width is None:
                                header.col_rel_width = (
                                    section_body.col_rel_width.copy()
                                )
                elif self.rtf_column_header:
                    # Flat list format - widths come from the first section only
                    for header in self.rtf_column_header:
                        if header.col_rel_width is None:
                            header.col_rel_width = self.rtf_body[0].col_rel_width.copy()
            else:
                # Single section document
                self.rtf_body.col_rel_width = (
                    self.rtf_body.col_rel_width or [1] * self.df.shape[1]
                )

                # Inherit col_rel_width from rtf_body to rtf_column_header if not specified
                if self.rtf_column_header:
                    for header in self.rtf_column_header:
                        if header.col_rel_width is None:
                            header.col_rel_width = self.rtf_body.col_rel_width.copy()

        # Half of the width left over beside the table, converted to twips
        self._table_space = int(
            Utils._inch_to_twip(self.rtf_page.width - self.rtf_page.col_width) / 2
        )

        # Apply table spacing to text components if needed
        self._apply_table_spacing()

    def _apply_table_spacing(self):
        """Add the table spacing to components whose indent reference is "table".

        Only the subline, page header and page footer are considered; their
        ``text_space_before`` and ``text_space_after`` are each increased by
        ``self._table_space``.
        """
        for component in (self.rtf_subline, self.rtf_page_header, self.rtf_page_footer):
            if component is None or component.text_indent_reference != "table":
                continue
            component.text_space_before = (
                self._table_space + component.text_space_before
            )
            component.text_space_after = (
                self._table_space + component.text_space_after
            )

    def rtf_encode(self) -> str:
        """Generate the complete RTF document as a string.

        Encoding is delegated to ``RTFEncodingEngine.encode_document``. The
        resulting string can be written to a file or processed further.

        Returns:
            str: Complete RTF document string ready to be saved as an .rtf file.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            rtf_string = doc.rtf_encode()
            # Can write manually or process further
            with open("output.rtf", "w") as f:
                f.write(rtf_string)
            ```
        """
        # Imported here rather than at module level, as in the original
        from .encoding import RTFEncodingEngine

        engine = RTFEncodingEngine()
        return engine.encode_document(self)

    def write_rtf(self, file_path: str) -> None:
        """Write the RTF document to a file.

        Generates the complete RTF document and writes it to the specified file
        path using UTF-8 encoding. The path is used exactly as given; no
        extension is appended or checked.

        Args:
            file_path: Path where the RTF file should be saved. Can be absolute
                or relative path. Directory must exist.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            doc.write_rtf("output/report.rtf")
            ```

        Note:
            The method prints the file path to stdout for confirmation.
            Ensure the directory exists before calling this method.
        """
        print(file_path)
        rtf_code = self.rtf_encode()
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(rtf_code)