Coverage for src/rtflite/services/document_service.py: 87%
199 statements
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-25 22:35 +0000
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-25 22:35 +0000
1"""RTF Document Service - handles all document-level operations."""
3from collections.abc import Mapping, Sequence
4from typing import Any, Tuple
7class RTFDocumentService:
8 """Service for handling RTF document operations including pagination and layout."""
10 def __init__(self):
11 from .encoding_service import RTFEncodingService
13 self.encoding_service = RTFEncodingService()
15 def calculate_additional_rows_per_page(self, document) -> int:
16 """Calculate additional rows needed per page for headers, footnotes, sources."""
17 additional_rows = 0
19 # Count subline_by header (appears on each page)
20 if document.rtf_body.subline_by:
21 additional_rows += 1 # Each subline_by header consumes 1 row
23 # Count column headers (repeat on each page)
24 if document.rtf_column_header:
25 # Handle nested column headers for multi-section documents
26 if isinstance(document.rtf_column_header[0], list):
27 # Nested format: count all non-None headers across all sections
28 for section_headers in document.rtf_column_header:
29 if section_headers: # Skip [None] sections
30 for header in section_headers:
31 if header and header.text is not None:
32 additional_rows += 1
33 else:
34 # Flat format: original logic
35 for header in document.rtf_column_header:
36 if header.text is not None:
37 additional_rows += 1
39 # Count footnote rows
40 if document.rtf_footnote and document.rtf_footnote.text:
41 additional_rows += 1
43 # Count source rows
44 if document.rtf_source and document.rtf_source.text:
45 additional_rows += 1
47 return additional_rows
49 def needs_pagination(self, document) -> bool:
50 """Check if document needs pagination based on content size and page limits."""
52 # Multiple figures always need pagination (each figure on separate page)
53 if document.rtf_figure and document.rtf_figure.figures:
54 # Check if multiple figures are provided
55 figures = document.rtf_figure.figures
56 if isinstance(figures, (list, tuple)) and len(figures) > 1:
57 return True
59 # Figure-only documents don't need pagination beyond multi-figure handling above
60 if document.df is None:
61 return False
63 # Handle multi-section documents
64 if isinstance(document.df, list):
65 # Check if any section needs pagination
66 for body in document.rtf_body:
67 if (body.page_by and body.new_page) or body.subline_by:
68 return True
69 # For now, multi-section documents use single page strategy
70 return False
71 else:
72 # Single section document
73 if (
74 document.rtf_body.page_by and document.rtf_body.new_page
75 ) or document.rtf_body.subline_by:
76 return True
78 # Create pagination instance to calculate rows needed
79 from ..pagination import PageBreakCalculator, RTFPagination
81 pagination = RTFPagination(
82 page_width=document.rtf_page.width,
83 page_height=document.rtf_page.height,
84 margin=document.rtf_page.margin,
85 nrow=document.rtf_page.nrow,
86 orientation=document.rtf_page.orientation,
87 )
89 calculator = PageBreakCalculator(pagination=pagination)
90 from ..row import Utils
92 col_total_width = document.rtf_page.col_width
94 # Handle multi-section vs single section for column widths
95 if isinstance(document.df, list):
96 # Use first section for pagination calculation
97 col_widths = Utils._col_widths(
98 document.rtf_body[0].col_rel_width, col_total_width
99 )
100 # Calculate rows needed for all sections combined
101 total_content_rows: list[Any] = []
102 for df, body in zip(document.df, document.rtf_body):
103 section_col_widths = Utils._col_widths(
104 body.col_rel_width, col_total_width
105 )
106 section_content_rows = calculator.calculate_content_rows(
107 df, section_col_widths, body
108 )
109 total_content_rows.extend(section_content_rows)
110 content_rows = total_content_rows
111 else:
112 col_widths = Utils._col_widths(
113 document.rtf_body.col_rel_width, col_total_width
114 )
115 # Calculate rows needed for data content only
116 content_rows = list(
117 calculator.calculate_content_rows(
118 document.df, col_widths, document.rtf_body
119 )
120 )
122 # Calculate additional rows per page
123 additional_rows_per_page = self.calculate_additional_rows_per_page(document)
125 # Calculate how many data rows can fit per page
126 data_rows = sum(content_rows)
127 available_data_rows_per_page = max(
128 1, document.rtf_page.nrow - additional_rows_per_page
129 )
131 # If we can't fit even the additional components, we definitely need pagination
132 if additional_rows_per_page >= document.rtf_page.nrow:
133 return True
135 # Check if data rows exceed what can fit on a single page
136 return data_rows > available_data_rows_per_page
138 def create_pagination_instance(self, document) -> Tuple:
139 """Create pagination and content distributor instances."""
140 from ..pagination import ContentDistributor, PageBreakCalculator, RTFPagination
142 pagination = RTFPagination(
143 page_width=document.rtf_page.width,
144 page_height=document.rtf_page.height,
145 margin=document.rtf_page.margin,
146 nrow=document.rtf_page.nrow,
147 orientation=document.rtf_page.orientation,
148 )
150 calculator = PageBreakCalculator(pagination=pagination)
151 distributor = ContentDistributor(pagination=pagination, calculator=calculator)
153 return pagination, distributor
155 def generate_page_break(self, document) -> str:
156 """Generate proper RTF page break sequence."""
157 return self.encoding_service.encode_page_break(
158 document.rtf_page,
159 lambda: self.encoding_service.encode_page_margin(document.rtf_page),
160 )
162 def should_show_element_on_page(
163 self, element_location: str, page_info: dict
164 ) -> bool:
165 """Determine if an element should be shown on a specific page."""
166 if element_location == "all":
167 return True
168 elif element_location == "first":
169 return page_info["is_first_page"]
170 elif element_location == "last":
171 return page_info["is_last_page"]
172 else:
173 return False
175 def process_page_by(
176 self, document
177 ) -> Sequence[Sequence[Tuple[int, int, int]]] | None:
178 """Create components for page_by format."""
179 # Obtain input data
180 data = document.df.to_dicts()
181 var = document.rtf_body.page_by
183 # Handle empty DataFrame
184 if len(data) == 0:
185 return None
187 # Obtain column names and dimensions
188 columns = list(data[0].keys())
190 if var is None:
191 return None
193 def get_column_index(column_name: str) -> int:
194 """Get the index of a column in the column list."""
195 return columns.index(column_name)
197 def get_matching_rows(group_values: Mapping) -> Sequence[int]:
198 """Get row indices that match the group values."""
199 return [
200 i
201 for i, row in enumerate(data)
202 if all(row[k] == v for k, v in group_values.items())
203 ]
205 def get_unique_combinations(variables: Sequence[str]) -> Sequence[Mapping]:
206 """Get unique combinations of values for the specified variables."""
207 seen = set()
208 unique = []
209 for row in data:
210 key = tuple(row[v] for v in variables)
211 if key not in seen:
212 seen.add(key)
213 unique.append({v: row[v] for v in variables})
214 return unique
216 output = []
217 prev_values = {v: None for v in var}
219 # Process each unique combination of grouping variables
220 for group in get_unique_combinations(var):
221 indices = get_matching_rows(group)
223 # Handle headers for each level
224 for level, var_name in enumerate(var):
225 current_val = group[var_name]
227 need_header = False
228 if level == len(var) - 1:
229 need_header = True
230 else:
231 for lvl in range(level + 1):
232 if group[var[lvl]] != prev_values[var[lvl]]:
233 need_header = True
234 break
236 if need_header:
237 col_idx = get_column_index(var_name)
238 # Add level information as third element in tuple
239 output.append([(indices[0], col_idx, level)])
241 prev_values[var_name] = current_val
243 # Handle data rows
244 for index in indices:
245 output.append(
246 [
247 (index, j, len(var))
248 for j in range(len(columns))
249 if columns[j] not in var
250 ]
251 )
253 return output
255 def apply_pagination_borders(
256 self, document, rtf_attrs, page_info: dict, total_pages: int
257 ):
258 """Apply proper borders for paginated context following r2rtf design:
260 rtf_page.border_first/last: Controls borders for the entire table
261 rtf_body.border_first/last: Controls borders for each page
262 rtf_body.border_top/bottom: Controls borders for individual cells
264 Logic:
265 - First page, first row: Apply rtf_page.border_first (overrides rtf_body.border_first)
266 - Last page, last row: Apply rtf_page.border_last (overrides rtf_body.border_last)
267 - Non-first pages, first row: Apply rtf_body.border_first
268 - Non-last pages, last row: Apply rtf_body.border_last
269 - All other rows: Use existing border_top/bottom from rtf_body
270 """
271 from copy import deepcopy
273 from ..attributes import BroadcastValue
274 from ..input import TableAttributes
276 # Create a deep copy of the attributes to avoid modifying the original
277 page_attrs = deepcopy(rtf_attrs)
278 page_df_height = page_info["data"].height
279 page_df_width = page_info["data"].width
280 page_shape = (page_df_height, page_df_width)
282 if page_df_height == 0:
283 return page_attrs
285 # Clear border_first and border_last from being broadcast to all rows
286 # These should only apply to specific rows based on pagination logic
287 if hasattr(page_attrs, "border_first") and page_attrs.border_first:
288 # Don't use border_first in pagination - it's handled separately
289 page_attrs.border_first = None
291 if hasattr(page_attrs, "border_last") and page_attrs.border_last:
292 # Don't use border_last in pagination - it's handled separately
293 page_attrs.border_last = None
295 # Ensure border_top and border_bottom are properly sized for this page
296 if not page_attrs.border_top:
297 page_attrs.border_top = [
298 [""] * page_df_width for _ in range(page_df_height)
299 ]
300 if not page_attrs.border_bottom:
301 page_attrs.border_bottom = [
302 [""] * page_df_width for _ in range(page_df_height)
303 ]
305 # Apply borders based on page position
306 # For first page: only apply rtf_page.border_first to table body if NO column headers
307 has_column_headers = (
308 document.rtf_column_header and len(document.rtf_column_header) > 0
309 )
310 if page_info["is_first_page"] and not has_column_headers:
311 if document.rtf_page.border_first:
312 # Apply border to all cells in the first row
313 for col_idx in range(page_df_width):
314 page_attrs = self._apply_border_to_cell(
315 page_attrs,
316 0,
317 col_idx,
318 "top",
319 document.rtf_page.border_first,
320 page_shape,
321 )
323 # For first page with column headers: ensure consistent border style
324 if page_info["is_first_page"] and has_column_headers:
325 # Apply same border style as non-first pages to maintain consistency
326 if document.rtf_body.border_first:
327 border_style = (
328 document.rtf_body.border_first[0][0]
329 if isinstance(document.rtf_body.border_first, list)
330 else document.rtf_body.border_first
331 )
332 # Apply single border style to first data row (same as other pages)
333 for col_idx in range(page_df_width):
334 page_attrs = self._apply_border_to_cell(
335 page_attrs, 0, col_idx, "top", border_style, page_shape
336 )
338 # Apply page-level borders for non-first/last pages
339 if not page_info["is_first_page"] and document.rtf_body.border_first:
340 # Apply border_first to first row of non-first pages
341 border_style = (
342 document.rtf_body.border_first[0][0]
343 if isinstance(document.rtf_body.border_first, list)
344 else document.rtf_body.border_first
345 )
346 for col_idx in range(page_df_width):
347 page_attrs = self._apply_border_to_cell(
348 page_attrs, 0, col_idx, "top", border_style, page_shape
349 )
351 # Check if footnotes or sources will appear on this page
352 has_footnote_on_page = (
353 document.rtf_footnote
354 and document.rtf_footnote.text
355 and self.should_show_element_on_page(
356 document.rtf_page.page_footnote, page_info
357 )
358 )
359 has_source_on_page = (
360 document.rtf_source
361 and document.rtf_source.text
362 and self.should_show_element_on_page(
363 document.rtf_page.page_source, page_info
364 )
365 )
367 # Determine if footnotes/sources appear as tables on the last page
368 # This is crucial for border placement when components are set to "first" only
369 footnote_as_table_on_last = (
370 document.rtf_footnote
371 and document.rtf_footnote.text
372 and getattr(document.rtf_footnote, "as_table", True)
373 and document.rtf_page.page_footnote in ("last", "all")
374 )
375 source_as_table_on_last = (
376 document.rtf_source
377 and document.rtf_source.text
378 and getattr(document.rtf_source, "as_table", False)
379 and document.rtf_page.page_source in ("last", "all")
380 )
382 # Apply border logic based on page position and footnote/source presence
383 if not page_info["is_last_page"]:
384 # Non-last pages: apply single border after footnote/source, or after data if no footnote/source
385 if document.rtf_body.border_last:
386 border_style = (
387 document.rtf_body.border_last[0][0]
388 if isinstance(document.rtf_body.border_last, list)
389 else document.rtf_body.border_last
390 )
392 if not (has_footnote_on_page or has_source_on_page):
393 # No footnote/source: apply border to last data row
394 for col_idx in range(page_df_width):
395 page_attrs = self._apply_border_to_cell(
396 page_attrs,
397 page_df_height - 1,
398 col_idx,
399 "bottom",
400 border_style,
401 page_shape,
402 )
403 else:
404 # Has footnote/source: apply border_last from RTFBody
405 self._apply_footnote_source_borders(
406 document, page_info, border_style, is_last_page=False
407 )
409 else: # is_last_page
410 # Last page: check if we should apply border to data or footnote/source
411 if document.rtf_page.border_last:
412 # Check if this page contains the absolute last row
413 total_rows = document.df.height
414 is_absolute_last_row = page_info["end_row"] == total_rows - 1
416 if is_absolute_last_row:
417 # If footnotes/sources are set to "first" only and appear as tables,
418 # they won't be on the last page, so apply border to last data row
419 if not (footnote_as_table_on_last or source_as_table_on_last):
420 # No footnote/source on last page: apply border to last data row
421 last_row_idx = page_df_height - 1
422 for col_idx in range(page_df_width):
423 page_attrs = self._apply_border_to_cell(
424 page_attrs,
425 last_row_idx,
426 col_idx,
427 "bottom",
428 document.rtf_page.border_last,
429 page_shape,
430 )
431 else:
432 # Has footnote/source on last page: set border for footnote/source
433 self._apply_footnote_source_borders(
434 document,
435 page_info,
436 document.rtf_page.border_last,
437 is_last_page=True,
438 )
440 return page_attrs
442 def _apply_footnote_source_borders(
443 self, document, page_info: dict, border_style: str, is_last_page: bool
444 ):
445 """Apply borders to footnote and source components based on page position."""
446 # Determine which component should get the border
447 has_footnote = (
448 document.rtf_footnote
449 and document.rtf_footnote.text
450 and self.should_show_element_on_page(
451 document.rtf_page.page_footnote, page_info
452 )
453 )
454 has_source = (
455 document.rtf_source
456 and document.rtf_source.text
457 and self.should_show_element_on_page(
458 document.rtf_page.page_source, page_info
459 )
460 )
462 # Apply border to components based on as_table setting
463 # Priority: Source with as_table=True > Footnote with as_table=True > any component
464 target_component = None
466 # Extract as_table values (now stored as booleans)
467 footnote_as_table = None
468 if has_footnote:
469 footnote_as_table = getattr(document.rtf_footnote, "as_table", True)
471 source_as_table = None
472 if has_source:
473 source_as_table = getattr(document.rtf_source, "as_table", False)
475 if has_source and source_as_table:
476 # Source is rendered as table: prioritize source for borders
477 target_component = ("source", document.rtf_source)
478 elif has_footnote and footnote_as_table:
479 # Footnote is rendered as table: use footnote for borders
480 target_component = ("footnote", document.rtf_footnote)
481 # Note: Removed fallback to plain text components - borders should only be applied
482 # to components that are rendered as tables (as_table=True)
484 if target_component:
485 component_name, component = target_component
486 if not hasattr(component, "_page_border_style"):
487 component._page_border_style = {}
488 component._page_border_style[page_info["page_number"]] = border_style
490 def _apply_border_to_cell(
491 self,
492 page_attrs,
493 row_idx: int,
494 col_idx: int,
495 border_side: str,
496 border_style: str,
497 page_shape: tuple,
498 ):
499 """Apply specified border style to a specific cell using BroadcastValue"""
500 from ..attributes import BroadcastValue
502 border_attr = f"border_{border_side}"
504 if not hasattr(page_attrs, border_attr):
505 return page_attrs
507 # Get current border values
508 current_borders = getattr(page_attrs, border_attr)
510 # Create BroadcastValue to expand borders to page shape
511 border_broadcast = BroadcastValue(value=current_borders, dimension=page_shape)
513 # Update the specific cell
514 border_broadcast.update_cell(row_idx, col_idx, border_style)
516 # Update the attribute with the expanded value
517 setattr(page_attrs, border_attr, border_broadcast.value)
518 return page_attrs