Coverage for src / rtflite / encode.py: 84%
203 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-19 02:02 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-19 02:02 +0000
"""RTF Document class - main entry point for RTF generation.

This module provides the RTFDocument class with a clean, service-oriented architecture.
All complex logic has been delegated to specialized services and strategies.
"""
7import shutil
8import tempfile
9from collections.abc import Sequence
10from pathlib import Path
12import polars as pl
13from pydantic import (
14 BaseModel,
15 ConfigDict,
16 Field,
17 PrivateAttr,
18 field_validator,
19 model_validator,
20)
22from .convert import LibreOfficeConverter
23from .input import (
24 RTFBody,
25 RTFColumnHeader,
26 RTFFigure,
27 RTFFootnote,
28 RTFPage,
29 RTFPageFooter,
30 RTFPageHeader,
31 RTFSource,
32 RTFSubline,
33 RTFTitle,
34)
35from .row import Utils
class RTFDocument(BaseModel):
    """Main class for creating RTF documents with tables, text, and figures.

    RTFDocument is the central class for generating Rich Text Format (RTF) files
    containing formatted tables, titles, footnotes, and other document elements.
    It provides a comprehensive API for creating professional documents commonly
    used in clinical trials, scientific research, and data reporting.

    Exactly one of ``df`` and ``rtf_figure`` must be supplied; providing neither
    or both fails validation.

    Examples:
        Simple table with title:
        ```python
        import rtflite as rtf
        import polars as pl

        df = pl.DataFrame({
            "Subject": ["001", "002", "003"],
            "Age": [45, 52, 38],
            "Treatment": ["Drug A", "Drug B", "Placebo"]
        })

        doc = rtf.RTFDocument(
            df=df,
            rtf_title=rtf.RTFTitle(text="Patient Demographics"),
            rtf_body=rtf.RTFBody(col_rel_width=[2, 1, 2])
        )
        doc.write_rtf("demographics.rtf")
        ```

        Multi-page document with headers and footers:
        ```python
        doc = rtf.RTFDocument(
            df=large_df,
            rtf_page=rtf.RTFPage(nrow=40, orientation="landscape"),
            rtf_page_header=rtf.RTFPageHeader(),  # Default page numbering
            rtf_page_footer=rtf.RTFPageFooter(text="Confidential"),
            rtf_title=rtf.RTFTitle(text="Clinical Study Results"),
            rtf_column_header=rtf.RTFColumnHeader(
                text=["Subject ID", "Visit", "Result", "Units"]
            ),
            rtf_body=rtf.RTFBody(
                col_rel_width=[2, 1, 1, 1],
                text_justification=[["l", "c", "r", "c"]]
            ),
            rtf_footnote=rtf.RTFFootnote(
                text="Results are mean +/- SD"
            )
        )
        doc.write_rtf("results.rtf")
        ```

        Document with grouped data and sublines:
        ```python
        doc = rtf.RTFDocument(
            df=grouped_df,
            rtf_body=rtf.RTFBody(
                group_by=["SITE", "TREATMENT"],  # Suppress duplicate values
                subline_by=["STUDY_PHASE"],  # Create section headers
                col_rel_width=[2, 2, 1, 1]
            )
        )
        ```

    Attributes:
        df: Data to display in the table. Can be a single DataFrame or list of
            DataFrames for multi-section documents. Accepts pandas or polars
            DataFrames (automatically converted to polars internally).

        rtf_page: Page configuration including size, orientation, margins, and
            pagination settings.

        rtf_page_header: Optional header appearing at the top of every page.

        rtf_page_footer: Optional footer appearing at the bottom of every page.

        rtf_title: Document title(s) displayed at the top.

        rtf_subline: Optional subject line displayed below the title.

        rtf_column_header: Column headers for the table. Can be a single header
            or list of headers for multi-row headers.

        rtf_body: Table body configuration including column widths, formatting,
            borders, and special features like group_by and subline_by.

        rtf_footnote: Optional footnote text displayed after the table.

        rtf_source: Optional source citation displayed at the very bottom.

        rtf_figure: Optional figure/image to embed in the document.

    Methods:
        rtf_encode(): Generate the complete RTF document as a string.
        write_rtf(file_path): Write the RTF document to a file.
        write_docx(file_path): Convert with LibreOffice and write a DOCX file.
        write_html(file_path): Convert with LibreOffice and write an HTML file.
        write_pdf(file_path): Convert with LibreOffice and write a PDF file.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    _table_space: int = PrivateAttr(default=0)

    # Core data
    df: pl.DataFrame | list[pl.DataFrame] | None = Field(
        default=None,
        description=(
            "The DataFrame(s) containing the data for the RTF document. "
            "Accepts single DataFrame or list of DataFrames for "
            "multi-section documents. Accepts pandas or polars DataFrame, "
            "internally converted to polars. Optional when using figure-only "
            "documents."
        ),
    )

    # Document structure
    rtf_page: RTFPage = Field(
        default_factory=lambda: RTFPage(),
        description="Page settings including size, orientation and margins",
    )
    rtf_page_header: RTFPageHeader | None = Field(
        default=None, description="Text to appear in the header of each page"
    )
    rtf_title: RTFTitle | None = Field(
        default_factory=lambda: RTFTitle(),
        description="Title section settings including text and formatting",
    )
    rtf_subline: RTFSubline | None = Field(
        default=None, description="Subject line text to appear below the title"
    )
    rtf_column_header: (
        Sequence[RTFColumnHeader] | Sequence[Sequence[RTFColumnHeader | None]]
    ) = Field(
        default_factory=lambda: [RTFColumnHeader()],
        description=(
            "Column header settings. For multi-section documents, use nested "
            "list format: [[header1], [header2], [None]] where None means no "
            "header for that section."
        ),
    )
    rtf_body: RTFBody | list[RTFBody] | None = Field(
        default_factory=lambda: RTFBody(),
        description=(
            "Table body section settings including column widths and "
            "formatting. For multi-section documents, provide a list of "
            "RTFBody objects."
        ),
    )
    rtf_footnote: RTFFootnote | None = Field(
        default=None, description="Footnote text to appear at bottom of document"
    )
    rtf_source: RTFSource | None = Field(
        default=None, description="Data source citation text"
    )
    rtf_page_footer: RTFPageFooter | None = Field(
        default=None, description="Text to appear in the footer of each page"
    )
    rtf_figure: RTFFigure | None = Field(
        default=None, description="Figure/image content to embed in the document"
    )

    @field_validator("rtf_column_header", mode="before")
    @classmethod
    def convert_column_header_to_list(cls, v):
        """Wrap a bare RTFColumnHeader in a list; pass anything else through.

        Nested-list input (one list of headers per section) is returned
        unchanged and left to regular field validation.
        """
        if isinstance(v, RTFColumnHeader):
            return [v]
        return v

    @field_validator("rtf_body", mode="before")
    @classmethod
    def normalize_rtf_body_sequence(cls, v):
        """Convert non-list body sequences (e.g. tuples) to lists before validation.

        A single RTFBody, an existing list, and string/bytes-like values are
        returned untouched; the latter are sequences too, but must not be
        exploded into lists of characters.
        """
        if isinstance(v, (RTFBody, list, str, bytes, bytearray)):
            return v
        if isinstance(v, Sequence):
            return list(v)
        return v

    @model_validator(mode="before")
    @classmethod
    def validate_dataframe(cls, values):
        """Convert DataFrame(s) passed as ``df`` to polars for internal processing.

        Polars DataFrames are kept as they are. Any other object is handed to
        narwhals and converted with ``to_polars()``.

        Raises:
            ValueError: If a non-polars object cannot be interpreted as a
                DataFrame. For list input the message names the failing index.
        """
        if "df" not in values or values["df"] is None:
            return values

        def to_polars(frame, label):
            # Fast path: nothing to convert, and narwhals is not needed at all.
            if isinstance(frame, pl.DataFrame):
                return frame

            # Imported outside the ``try`` so a missing dependency surfaces as
            # ImportError instead of being reported as an invalid DataFrame.
            import narwhals as nw

            try:
                # ``to_native()`` would hand back the original backend object;
                # ``to_polars()`` performs the actual conversion.
                return nw.from_native(frame).to_polars()
            except Exception as e:
                raise ValueError(f"{label} must be a valid DataFrame: {e}") from e

        df = values["df"]
        if isinstance(df, list):
            values["df"] = [
                to_polars(single_df, f"DataFrame at index {i}")
                for i, single_df in enumerate(df)
            ]
        else:
            values["df"] = to_polars(df, "DataFrame")
        return values

    @model_validator(mode="after")
    def validate_column_names(self):
        """Validate df/figure exclusivity, section consistency and column references.

        Raises:
            ValueError: If neither or both of ``df`` and ``rtf_figure`` are set,
                if a figure is combined with a table-style footnote/source, if
                the multi-section lists have mismatched lengths, if ``rtf_body``
                is missing or not a single body for a single DataFrame, or if a
                ``group_by``/``page_by``/``subline_by`` column does not exist.
        """
        # Validate df and rtf_figure usage
        if self.df is None and self.rtf_figure is None:
            raise ValueError("Either 'df' or 'rtf_figure' must be provided")

        if self.df is not None and self.rtf_figure is not None:
            raise ValueError(
                "Cannot use both 'df' and 'rtf_figure' together. Use either "
                "tables or figures in a single document."
            )

        # When RTFFigure is used, enforce as_table=False for footnotes and sources.
        # If the attribute is absent, a footnote is treated as as_table=True and
        # a source as as_table=False.
        if self.rtf_figure is not None:
            if self.rtf_footnote is not None and getattr(
                self.rtf_footnote, "as_table", True
            ):
                raise ValueError(
                    "When using RTFFigure, RTFFootnote must have as_table=False"
                )
            if self.rtf_source is not None and getattr(
                self.rtf_source, "as_table", False
            ):
                raise ValueError(
                    "When using RTFFigure, RTFSource must have as_table=False"
                )

        # Skip column validation if no DataFrame provided (figure-only documents)
        if self.df is None:
            return self

        if isinstance(self.df, list):
            # Multi-section: rtf_body must be a list of matching length
            if not isinstance(self.rtf_body, list):
                raise ValueError("When df is a list, rtf_body must also be a list")
            if len(self.df) != len(self.rtf_body):
                raise ValueError(
                    "df list length "
                    f"({len(self.df)}) must match rtf_body list length "
                    f"({len(self.rtf_body)})"
                )

            # Nested-list column headers must provide one entry per section
            if (
                isinstance(self.rtf_column_header, list)
                and self.rtf_column_header
                and isinstance(self.rtf_column_header[0], list)
                and len(self.rtf_column_header) != len(self.df)
            ):
                raise ValueError(
                    "rtf_column_header nested list length "
                    f"({len(self.rtf_column_header)}) must match df list "
                    f"length ({len(self.df)})"
                )

            # Per-section column validation
            for i, (section_df, section_body) in enumerate(
                zip(self.df, self.rtf_body, strict=True)
            ):
                self._validate_section_columns(section_df, section_body, i)
        else:
            if self.rtf_body is None:
                raise ValueError("When df is a single DataFrame, rtf_body is required")
            if isinstance(self.rtf_body, list):
                if len(self.rtf_body) != 1:
                    raise ValueError(
                        "When df is a single DataFrame, rtf_body must be a "
                        "single RTFBody"
                    )
                # Unwrap a one-element list so later code sees a plain RTFBody
                self.rtf_body = self.rtf_body[0]
            self._validate_section_columns(self.df, self.rtf_body, 0)

        return self

    def _validate_section_columns(self, df, body, section_index):
        """Check that every column a section body refers to exists in its data.

        Args:
            df: The section's DataFrame; only its ``columns`` are read.
            body: The section's body settings; ``group_by``, ``page_by`` and
                ``subline_by`` are checked in that order, ``None`` is skipped.
            section_index: Position of the section. Index 0 is reported as
                "df" in error messages, later ones as "section <index>".

        Raises:
            ValueError: On the first referenced column missing from ``df``.
        """
        columns = df.columns
        section_label = f"section {section_index}" if section_index > 0 else "df"

        for option in ("group_by", "page_by", "subline_by"):
            referenced = getattr(body, option)
            if referenced is None:
                continue
            for column in referenced:
                if column not in columns:
                    raise ValueError(
                        f"`{option}` column {column} not found in {section_label}"
                    )

    def __init__(self, **data):
        """Validate the inputs, then fill in default widths and table spacing.

        After validation, each body without ``col_rel_width`` gets equal widths
        for all columns of its DataFrame, and a single-element width list is
        repeated for every column. Column headers without ``col_rel_width``
        inherit a copy of the widths of their section's body.
        """
        super().__init__(**data)

        if self.df is not None:
            if isinstance(self.df, list):
                # Multi-section documents
                for section_df, section_body in zip(
                    self.df, self.rtf_body, strict=True
                ):
                    self._fill_body_widths(section_body, section_df.shape[1])

                if self.rtf_column_header and isinstance(
                    self.rtf_column_header[0], list
                ):
                    # Nested list format: [[header1], [header2], [None]]
                    for section_headers, section_body in zip(
                        self.rtf_column_header, self.rtf_body, strict=True
                    ):
                        if section_headers:
                            self._inherit_header_widths(section_headers, section_body)
                elif self.rtf_column_header:
                    # Flat list format - headers follow the first section only
                    self._inherit_header_widths(
                        self.rtf_column_header, self.rtf_body[0]
                    )
            else:
                # Single-section documents
                self._fill_body_widths(self.rtf_body, self.df.shape[1])
                if self.rtf_column_header:
                    self._inherit_header_widths(self.rtf_column_header, self.rtf_body)

        # Half of the page width not occupied by the table, converted to twips
        self._table_space = int(
            Utils._inch_to_twip(self.rtf_page.width - self.rtf_page.col_width) / 2
        )

        # Apply table spacing to text components if needed
        self._apply_table_spacing()

    def _fill_body_widths(self, body, n_columns):
        """Default or broadcast ``body.col_rel_width`` to ``n_columns`` entries."""
        if body.col_rel_width is None:
            body.col_rel_width = [1] * n_columns
        elif len(body.col_rel_width) == 1 and n_columns > 1:
            body.col_rel_width = body.col_rel_width * n_columns

    def _inherit_header_widths(self, headers, body):
        """Give headers lacking ``col_rel_width`` a copy of the body's widths."""
        for header in headers:
            # Entries may be None in the nested format ("no header here")
            if header and header.col_rel_width is None:
                header.col_rel_width = body.col_rel_width.copy()

    def _apply_table_spacing(self):
        """Add the table offset to text components that are indented by the table.

        Affects the subline, page header and page footer when present and when
        their ``text_indent_reference`` equals ``"table"``: ``_table_space`` is
        added to both ``text_space_before`` and ``text_space_after``.
        """
        for component in (self.rtf_subline, self.rtf_page_header, self.rtf_page_footer):
            if component is None or component.text_indent_reference != "table":
                continue
            component.text_space_before = (
                self._table_space + component.text_space_before
            )
            component.text_space_after = self._table_space + component.text_space_after

    def rtf_encode(self) -> str:
        """Generate the complete RTF document as a string.

        The work is delegated to ``RTFEncodingEngine.encode_document``. The
        resulting string can be written to a file or processed further.

        Returns:
            str: Complete RTF document string ready to be saved as an .rtf file.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            rtf_string = doc.rtf_encode()
            # Can write manually or process further
            with open("output.rtf", "w") as f:
                f.write(rtf_string)
            ```
        """
        # Imported here rather than at module level, as in the original code.
        from .encoding import RTFEncodingEngine

        engine = RTFEncodingEngine()
        return engine.encode_document(self)

    def write_rtf(self, file_path: str | Path) -> None:
        """Write the RTF document to a file.

        Generates the complete RTF document and writes it to the specified file
        path using UTF-8 encoding. The path is used as given (after ``~``
        expansion); no ``.rtf`` extension is added or enforced.

        Args:
            file_path: Path where the RTF file should be saved.
                Accepts string or Path input. Can be absolute or relative.
                Directories are created if they do not already exist.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            doc.write_rtf("output/report.rtf")
            ```

        Note:
            The method prints the file path to stdout for confirmation.
        """
        target_path = Path(file_path).expanduser()
        target_path.parent.mkdir(parents=True, exist_ok=True)
        print(target_path)
        rtf_code = self.rtf_encode()
        target_path.write_text(rtf_code, encoding="utf-8")

    def _write_converted(
        self,
        file_path: str | Path,
        output_format: str,
        converter: LibreOfficeConverter | None,
        *,
        with_resources: bool = False,
    ) -> None:
        """Encode to a temporary RTF file, convert it, and move the result.

        Shared implementation of ``write_docx``, ``write_html`` and
        ``write_pdf``.

        Args:
            file_path: Destination path; ``~`` is expanded and missing parent
                directories are created.
            output_format: Value passed as ``format`` to ``converter.convert``.
            converter: Converter to use; a default ``LibreOfficeConverter`` is
                created when ``None``.
            with_resources: When true, a companion ``<converted name>_files``
                directory produced next to the converted file is moved
                alongside the destination as well.

        Raises:
            TypeError: If the converter does not return a single ``Path``.
        """
        target_path = Path(file_path).expanduser()
        target_path.parent.mkdir(parents=True, exist_ok=True)

        if converter is None:
            converter = LibreOfficeConverter()
        with tempfile.TemporaryDirectory() as tmpdir:
            rtf_path = Path(tmpdir) / f"{target_path.stem}.rtf"
            rtf_path.write_text(self.rtf_encode(), encoding="utf-8")

            with tempfile.TemporaryDirectory() as convert_tmpdir:
                converted = converter.convert(
                    input_files=rtf_path,
                    output_dir=Path(convert_tmpdir),
                    format=output_format,
                    overwrite=True,
                )
                if not isinstance(converted, Path):
                    raise TypeError(
                        "LibreOffice conversion returned an unexpected output for a "
                        "single input file; expected `Path`, got object of type "
                        f"{type(converted)!r} with value {converted!r}."
                    )
                # Computed before the move, while the converted file's own
                # location still identifies its companion directory.
                resources_dir = converted.with_name(f"{converted.name}_files")
                shutil.move(str(converted), target_path)
                if with_resources and resources_dir.is_dir():
                    resources_target = target_path.parent / resources_dir.name
                    # shutil.move() into an existing directory would nest the
                    # new directory inside it and keep stale files from an
                    # earlier run, so replace the old directory instead.
                    if resources_target.is_dir() and not resources_target.is_symlink():
                        shutil.rmtree(resources_target)
                    shutil.move(str(resources_dir), resources_target)

        print(target_path)

    def write_docx(
        self,
        file_path: str | Path,
        *,
        converter: LibreOfficeConverter | None = None,
    ) -> None:
        """Write the document as a DOCX file.

        Writes the document to a temporary RTF file first, and then converts
        it to DOCX with LibreOffice. Temporary directories are used for
        all intermediate files to avoid placing artifacts alongside the
        requested output path.

        Args:
            file_path: Destination path for the DOCX file.
                Accepts string or Path input. Can be absolute or relative.
                Directories are created if they do not already exist.
            converter: Optional LibreOffice converter instance.
                Pass a configured instance (for example with a custom
                `executable_path`) to control how LibreOffice is invoked and to
                avoid re-initializing and re-verifying the executable path across
                multiple conversions. Note that each call to ``convert()`` still
                starts a new LibreOffice process in headless mode; the process is
                not kept alive between conversions.

        Raises:
            TypeError: If the converter does not return a single ``Path``.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            doc.write_docx("output/report.docx")
            ```

            Custom LibreOffice executable:
            ```python
            converter = LibreOfficeConverter(executable_path="/custom/path/to/soffice")
            doc.write_docx("output/report.docx", converter=converter)
            ```

        Note:
            The method prints the file path to stdout for confirmation.
        """
        self._write_converted(file_path, "docx", converter)

    def write_html(
        self,
        file_path: str | Path,
        *,
        converter: LibreOfficeConverter | None = None,
    ) -> None:
        """Write the document as an HTML file.

        Writes the document to a temporary RTF file first, and then converts
        it to HTML with LibreOffice. Temporary directories are used for
        all intermediate files to avoid placing artifacts alongside the
        requested output path.

        Args:
            file_path: Destination path for the HTML file.
                Accepts string or Path input. Can be absolute or relative.
                Directories are created if they do not already exist.
            converter: Optional LibreOffice converter instance.
                Pass a configured instance (for example with a custom
                `executable_path`) to control how LibreOffice is invoked and to
                avoid re-initializing and re-verifying the executable path across
                multiple conversions. Note that each call to ``convert()`` still
                starts a new LibreOffice process in headless mode; the process is
                not kept alive between conversions.

        Raises:
            TypeError: If the converter does not return a single ``Path``.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            doc.write_html("output/report.html")
            ```

        Note:
            LibreOffice may create a companion directory (for example
            `report.html_files`) for embedded resources. When present, it is moved
            alongside the requested output path, replacing a directory of the
            same name left there by an earlier export.

            The method prints the file path to stdout for confirmation.
        """
        self._write_converted(file_path, "html", converter, with_resources=True)

    def write_pdf(
        self,
        file_path: str | Path,
        *,
        converter: LibreOfficeConverter | None = None,
    ) -> None:
        """Write the document as a PDF file.

        Writes the document to a temporary RTF file first, and then converts
        it to PDF with LibreOffice. Temporary directories are used for
        all intermediate files to avoid placing artifacts alongside the
        requested output path.

        Args:
            file_path: Destination path for the PDF file.
                Accepts string or Path input. Can be absolute or relative.
                Directories are created if they do not already exist.
            converter: Optional LibreOffice converter instance.
                Pass a configured instance (for example with a custom
                `executable_path`) to control how LibreOffice is invoked and to
                avoid re-initializing and re-verifying the executable path across
                multiple conversions. Note that each call to ``convert()`` still
                starts a new LibreOffice process in headless mode; the process is
                not kept alive between conversions.

        Raises:
            TypeError: If the converter does not return a single ``Path``.

        Examples:
            ```python
            doc = RTFDocument(df=data, rtf_title=RTFTitle(text="Report"))
            doc.write_pdf("output/report.pdf")
            ```

        Note:
            The method prints the file path to stdout for confirmation.
        """
        self._write_converted(file_path, "pdf", converter)