Coverage for src/rtflite/services/document_service.py: 87%
198 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-14 16:35 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-14 16:35 +0000
1"""RTF Document Service - handles all document-level operations."""
3from typing import List, Tuple
6class RTFDocumentService:
7 """Service for handling RTF document operations including pagination and layout."""
9 def __init__(self):
10 from .encoding_service import RTFEncodingService
12 self.encoding_service = RTFEncodingService()
14 def calculate_additional_rows_per_page(self, document) -> int:
15 """Calculate additional rows needed per page for headers, footnotes, sources."""
16 additional_rows = 0
18 # Count subline_by header (appears on each page)
19 if document.rtf_body.subline_by:
20 additional_rows += 1 # Each subline_by header consumes 1 row
22 # Count column headers (repeat on each page)
23 if document.rtf_column_header:
24 # Handle nested column headers for multi-section documents
25 if isinstance(document.rtf_column_header[0], list):
26 # Nested format: count all non-None headers across all sections
27 for section_headers in document.rtf_column_header:
28 if section_headers: # Skip [None] sections
29 for header in section_headers:
30 if header and header.text is not None:
31 additional_rows += 1
32 else:
33 # Flat format: original logic
34 for header in document.rtf_column_header:
35 if header.text is not None:
36 additional_rows += 1
38 # Count footnote rows
39 if document.rtf_footnote and document.rtf_footnote.text:
40 additional_rows += 1
42 # Count source rows
43 if document.rtf_source and document.rtf_source.text:
44 additional_rows += 1
46 return additional_rows
48 def needs_pagination(self, document) -> bool:
49 """Check if document needs pagination based on content size and page limits."""
51 # Multiple figures always need pagination (each figure on separate page)
52 if document.rtf_figure and document.rtf_figure.figures:
53 # Check if multiple figures are provided
54 figures = document.rtf_figure.figures
55 if isinstance(figures, (list, tuple)) and len(figures) > 1:
56 return True
58 # Figure-only documents don't need pagination beyond multi-figure handling above
59 if document.df is None:
60 return False
62 # Handle multi-section documents
63 if isinstance(document.df, list):
64 # Check if any section needs pagination
65 for body in document.rtf_body:
66 if (body.page_by and body.new_page) or body.subline_by:
67 return True
68 # For now, multi-section documents use single page strategy
69 return False
70 else:
71 # Single section document
72 if (
73 document.rtf_body.page_by and document.rtf_body.new_page
74 ) or document.rtf_body.subline_by:
75 return True
77 # Create pagination instance to calculate rows needed
78 from ..pagination import PageBreakCalculator, RTFPagination
80 pagination = RTFPagination(
81 page_width=document.rtf_page.width,
82 page_height=document.rtf_page.height,
83 margin=document.rtf_page.margin,
84 nrow=document.rtf_page.nrow,
85 orientation=document.rtf_page.orientation,
86 )
88 calculator = PageBreakCalculator(pagination=pagination)
89 from ..row import Utils
91 col_total_width = document.rtf_page.col_width
93 # Handle multi-section vs single section for column widths
94 if isinstance(document.df, list):
95 # Use first section for pagination calculation
96 col_widths = Utils._col_widths(
97 document.rtf_body[0].col_rel_width, col_total_width
98 )
99 # Calculate rows needed for all sections combined
100 total_content_rows = []
101 for df, body in zip(document.df, document.rtf_body):
102 section_col_widths = Utils._col_widths(
103 body.col_rel_width, col_total_width
104 )
105 section_content_rows = calculator.calculate_content_rows(
106 df, section_col_widths, body
107 )
108 total_content_rows.extend(section_content_rows)
109 content_rows = total_content_rows
110 else:
111 col_widths = Utils._col_widths(
112 document.rtf_body.col_rel_width, col_total_width
113 )
114 # Calculate rows needed for data content only
115 content_rows = calculator.calculate_content_rows(
116 document.df, col_widths, document.rtf_body
117 )
119 # Calculate additional rows per page
120 additional_rows_per_page = self.calculate_additional_rows_per_page(document)
122 # Calculate how many data rows can fit per page
123 data_rows = sum(content_rows)
124 available_data_rows_per_page = max(
125 1, document.rtf_page.nrow - additional_rows_per_page
126 )
128 # If we can't fit even the additional components, we definitely need pagination
129 if additional_rows_per_page >= document.rtf_page.nrow:
130 return True
132 # Check if data rows exceed what can fit on a single page
133 return data_rows > available_data_rows_per_page
135 def create_pagination_instance(self, document) -> Tuple:
136 """Create pagination and content distributor instances."""
137 from ..pagination import ContentDistributor, PageBreakCalculator, RTFPagination
139 pagination = RTFPagination(
140 page_width=document.rtf_page.width,
141 page_height=document.rtf_page.height,
142 margin=document.rtf_page.margin,
143 nrow=document.rtf_page.nrow,
144 orientation=document.rtf_page.orientation,
145 )
147 calculator = PageBreakCalculator(pagination=pagination)
148 distributor = ContentDistributor(pagination=pagination, calculator=calculator)
150 return pagination, distributor
152 def generate_page_break(self, document) -> str:
153 """Generate proper RTF page break sequence."""
154 return self.encoding_service.encode_page_break(
155 document.rtf_page,
156 lambda: self.encoding_service.encode_page_margin(document.rtf_page),
157 )
159 def should_show_element_on_page(
160 self, element_location: str, page_info: dict
161 ) -> bool:
162 """Determine if an element should be shown on a specific page."""
163 if element_location == "all":
164 return True
165 elif element_location == "first":
166 return page_info["is_first_page"]
167 elif element_location == "last":
168 return page_info["is_last_page"]
169 else:
170 return False
172 def process_page_by(self, document) -> List[List[Tuple[int, int, int]]] | None:
173 """Create components for page_by format."""
174 # Obtain input data
175 data = document.df.to_dicts()
176 var = document.rtf_body.page_by
178 # Handle empty DataFrame
179 if len(data) == 0:
180 return None
182 # Obtain column names and dimensions
183 columns = list(data[0].keys())
185 if var is None:
186 return None
188 def get_column_index(column_name: str) -> int:
189 """Get the index of a column in the column list."""
190 return columns.index(column_name)
192 def get_matching_rows(group_values: dict) -> List[int]:
193 """Get row indices that match the group values."""
194 return [
195 i
196 for i, row in enumerate(data)
197 if all(row[k] == v for k, v in group_values.items())
198 ]
200 def get_unique_combinations(variables: List[str]) -> List[dict]:
201 """Get unique combinations of values for the specified variables."""
202 seen = set()
203 unique = []
204 for row in data:
205 key = tuple(row[v] for v in variables)
206 if key not in seen:
207 seen.add(key)
208 unique.append({v: row[v] for v in variables})
209 return unique
211 output = []
212 prev_values = {v: None for v in var}
214 # Process each unique combination of grouping variables
215 for group in get_unique_combinations(var):
216 indices = get_matching_rows(group)
218 # Handle headers for each level
219 for level, var_name in enumerate(var):
220 current_val = group[var_name]
222 need_header = False
223 if level == len(var) - 1:
224 need_header = True
225 else:
226 for lvl in range(level + 1):
227 if group[var[lvl]] != prev_values[var[lvl]]:
228 need_header = True
229 break
231 if need_header:
232 col_idx = get_column_index(var_name)
233 # Add level information as third element in tuple
234 output.append([(indices[0], col_idx, level)])
236 prev_values[var_name] = current_val
238 # Handle data rows
239 for index in indices:
240 output.append(
241 [
242 (index, j, len(var))
243 for j in range(len(columns))
244 if columns[j] not in var
245 ]
246 )
248 return output
250 def apply_pagination_borders(
251 self, document, rtf_attrs, page_info: dict, total_pages: int
252 ):
253 """Apply proper borders for paginated context following r2rtf design:
255 rtf_page.border_first/last: Controls borders for the entire table
256 rtf_body.border_first/last: Controls borders for each page
257 rtf_body.border_top/bottom: Controls borders for individual cells
259 Logic:
260 - First page, first row: Apply rtf_page.border_first (overrides rtf_body.border_first)
261 - Last page, last row: Apply rtf_page.border_last (overrides rtf_body.border_last)
262 - Non-first pages, first row: Apply rtf_body.border_first
263 - Non-last pages, last row: Apply rtf_body.border_last
264 - All other rows: Use existing border_top/bottom from rtf_body
265 """
266 from copy import deepcopy
268 from ..attributes import BroadcastValue
269 from ..input import TableAttributes
271 # Create a deep copy of the attributes to avoid modifying the original
272 page_attrs = deepcopy(rtf_attrs)
273 page_df_height = page_info["data"].height
274 page_df_width = page_info["data"].width
275 page_shape = (page_df_height, page_df_width)
277 if page_df_height == 0:
278 return page_attrs
280 # Clear border_first and border_last from being broadcast to all rows
281 # These should only apply to specific rows based on pagination logic
282 if hasattr(page_attrs, "border_first") and page_attrs.border_first:
283 # Don't use border_first in pagination - it's handled separately
284 page_attrs.border_first = None
286 if hasattr(page_attrs, "border_last") and page_attrs.border_last:
287 # Don't use border_last in pagination - it's handled separately
288 page_attrs.border_last = None
290 # Ensure border_top and border_bottom are properly sized for this page
291 if not page_attrs.border_top:
292 page_attrs.border_top = [
293 [""] * page_df_width for _ in range(page_df_height)
294 ]
295 if not page_attrs.border_bottom:
296 page_attrs.border_bottom = [
297 [""] * page_df_width for _ in range(page_df_height)
298 ]
300 # Apply borders based on page position
301 # For first page: only apply rtf_page.border_first to table body if NO column headers
302 has_column_headers = (
303 document.rtf_column_header and len(document.rtf_column_header) > 0
304 )
305 if page_info["is_first_page"] and not has_column_headers:
306 if document.rtf_page.border_first:
307 # Apply border to all cells in the first row
308 for col_idx in range(page_df_width):
309 page_attrs = self._apply_border_to_cell(
310 page_attrs,
311 0,
312 col_idx,
313 "top",
314 document.rtf_page.border_first,
315 page_shape,
316 )
318 # For first page with column headers: ensure consistent border style
319 if page_info["is_first_page"] and has_column_headers:
320 # Apply same border style as non-first pages to maintain consistency
321 if document.rtf_body.border_first:
322 border_style = (
323 document.rtf_body.border_first[0][0]
324 if isinstance(document.rtf_body.border_first, list)
325 else document.rtf_body.border_first
326 )
327 # Apply single border style to first data row (same as other pages)
328 for col_idx in range(page_df_width):
329 page_attrs = self._apply_border_to_cell(
330 page_attrs, 0, col_idx, "top", border_style, page_shape
331 )
333 # Apply page-level borders for non-first/last pages
334 if not page_info["is_first_page"] and document.rtf_body.border_first:
335 # Apply border_first to first row of non-first pages
336 border_style = (
337 document.rtf_body.border_first[0][0]
338 if isinstance(document.rtf_body.border_first, list)
339 else document.rtf_body.border_first
340 )
341 for col_idx in range(page_df_width):
342 page_attrs = self._apply_border_to_cell(
343 page_attrs, 0, col_idx, "top", border_style, page_shape
344 )
346 # Check if footnotes or sources will appear on this page
347 has_footnote_on_page = (
348 document.rtf_footnote
349 and document.rtf_footnote.text
350 and self.should_show_element_on_page(
351 document.rtf_page.page_footnote, page_info
352 )
353 )
354 has_source_on_page = (
355 document.rtf_source
356 and document.rtf_source.text
357 and self.should_show_element_on_page(
358 document.rtf_page.page_source, page_info
359 )
360 )
362 # Determine if footnotes/sources appear as tables on the last page
363 # This is crucial for border placement when components are set to "first" only
364 footnote_as_table_on_last = (
365 document.rtf_footnote
366 and document.rtf_footnote.text
367 and getattr(document.rtf_footnote, "as_table", True)
368 and document.rtf_page.page_footnote in ("last", "all")
369 )
370 source_as_table_on_last = (
371 document.rtf_source
372 and document.rtf_source.text
373 and getattr(document.rtf_source, "as_table", False)
374 and document.rtf_page.page_source in ("last", "all")
375 )
377 # Apply border logic based on page position and footnote/source presence
378 if not page_info["is_last_page"]:
379 # Non-last pages: apply single border after footnote/source, or after data if no footnote/source
380 if document.rtf_body.border_last:
381 border_style = (
382 document.rtf_body.border_last[0][0]
383 if isinstance(document.rtf_body.border_last, list)
384 else document.rtf_body.border_last
385 )
387 if not (has_footnote_on_page or has_source_on_page):
388 # No footnote/source: apply border to last data row
389 for col_idx in range(page_df_width):
390 page_attrs = self._apply_border_to_cell(
391 page_attrs,
392 page_df_height - 1,
393 col_idx,
394 "bottom",
395 border_style,
396 page_shape,
397 )
398 else:
399 # Has footnote/source: apply border_last from RTFBody
400 self._apply_footnote_source_borders(
401 document, page_info, border_style, is_last_page=False
402 )
404 else: # is_last_page
405 # Last page: check if we should apply border to data or footnote/source
406 if document.rtf_page.border_last:
407 # Check if this page contains the absolute last row
408 total_rows = document.df.height
409 is_absolute_last_row = page_info["end_row"] == total_rows - 1
411 if is_absolute_last_row:
412 # If footnotes/sources are set to "first" only and appear as tables,
413 # they won't be on the last page, so apply border to last data row
414 if not (footnote_as_table_on_last or source_as_table_on_last):
415 # No footnote/source on last page: apply border to last data row
416 last_row_idx = page_df_height - 1
417 for col_idx in range(page_df_width):
418 page_attrs = self._apply_border_to_cell(
419 page_attrs,
420 last_row_idx,
421 col_idx,
422 "bottom",
423 document.rtf_page.border_last,
424 page_shape,
425 )
426 else:
427 # Has footnote/source on last page: set border for footnote/source
428 self._apply_footnote_source_borders(
429 document,
430 page_info,
431 document.rtf_page.border_last,
432 is_last_page=True,
433 )
435 return page_attrs
437 def _apply_footnote_source_borders(
438 self, document, page_info: dict, border_style: str, is_last_page: bool
439 ):
440 """Apply borders to footnote and source components based on page position."""
441 # Determine which component should get the border
442 has_footnote = (
443 document.rtf_footnote
444 and document.rtf_footnote.text
445 and self.should_show_element_on_page(
446 document.rtf_page.page_footnote, page_info
447 )
448 )
449 has_source = (
450 document.rtf_source
451 and document.rtf_source.text
452 and self.should_show_element_on_page(
453 document.rtf_page.page_source, page_info
454 )
455 )
457 # Apply border to components based on as_table setting
458 # Priority: Source with as_table=True > Footnote with as_table=True > any component
459 target_component = None
461 # Extract as_table values (now stored as booleans)
462 footnote_as_table = None
463 if has_footnote:
464 footnote_as_table = getattr(document.rtf_footnote, "as_table", True)
466 source_as_table = None
467 if has_source:
468 source_as_table = getattr(document.rtf_source, "as_table", False)
470 if has_source and source_as_table:
471 # Source is rendered as table: prioritize source for borders
472 target_component = ("source", document.rtf_source)
473 elif has_footnote and footnote_as_table:
474 # Footnote is rendered as table: use footnote for borders
475 target_component = ("footnote", document.rtf_footnote)
476 # Note: Removed fallback to plain text components - borders should only be applied
477 # to components that are rendered as tables (as_table=True)
479 if target_component:
480 component_name, component = target_component
481 if not hasattr(component, "_page_border_style"):
482 component._page_border_style = {}
483 component._page_border_style[page_info["page_number"]] = border_style
485 def _apply_border_to_cell(
486 self,
487 page_attrs,
488 row_idx: int,
489 col_idx: int,
490 border_side: str,
491 border_style: str,
492 page_shape: tuple,
493 ):
494 """Apply specified border style to a specific cell using BroadcastValue"""
495 from ..attributes import BroadcastValue
497 border_attr = f"border_{border_side}"
499 if not hasattr(page_attrs, border_attr):
500 return page_attrs
502 # Get current border values
503 current_borders = getattr(page_attrs, border_attr)
505 # Create BroadcastValue to expand borders to page shape
506 border_broadcast = BroadcastValue(value=current_borders, dimension=page_shape)
508 # Update the specific cell
509 border_broadcast.update_cell(row_idx, col_idx, border_style)
511 # Update the attribute with the expanded value
512 setattr(page_attrs, border_attr, border_broadcast.value)
513 return page_attrs