Coverage for src / rtflite / pagination / page_dict.py: 89%
152 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-28 05:09 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-28 05:09 +0000
1"""
2Advanced pagination control system for rtflite.
4This module implements a PageDict equivalent to r2rtf's advanced pagination
5features, providing page_index-like functionality while maintaining rtflite's
6existing architecture.
7"""
9from collections.abc import Mapping, MutableMapping, MutableSet, Sequence
10from dataclasses import dataclass, field
11from enum import Enum
12from typing import Any
14import polars as pl
15from pydantic import BaseModel, ConfigDict, Field
class PageBreakType(Enum):
    """Enumerates the kinds of page break the pagination code can record."""

    # Break produced by reaching the per-page row limit (nrow).
    AUTOMATIC = "automatic"
    # Break produced by a page_by column when new_page=True.
    FORCED = "forced"
    # Break produced by a change in the subline_by column.
    SUBLINE = "subline"
    # Break specified by hand through PageIndexManager.
    MANUAL = "manual"
@dataclass
class PageConfig:
    """Describes a single page: its row span, break type and attached headers."""

    page_number: int
    start_row: int
    end_row: int
    break_type: PageBreakType
    section_headers: list[str] = field(default_factory=list)
    subline_header: str | None = None
    group_context: dict[str, Any] = field(default_factory=dict)
    # IDs of content that has been forced onto this page
    forced_content: set[str] = field(default_factory=set)

    @property
    def row_count(self) -> int:
        """Count of rows from ``start_row`` through ``end_row``, both inclusive."""
        first, last = self.start_row, self.end_row
        return last - first + 1

    @property
    def is_section_start(self) -> bool:
        """Whether ``break_type`` is ``FORCED`` or ``SUBLINE``."""
        section_breaks = (PageBreakType.FORCED, PageBreakType.SUBLINE)
        return self.break_type in section_breaks
@dataclass
class PageBreakRule:
    """A column-based rule that decides whether a row triggers a page break."""

    column: str
    break_on_change: bool = True
    force_new_page: bool = False
    # Rules with a larger priority value are handled before smaller ones
    priority: int = 0

    def applies_to_row(
        self, df: pl.DataFrame, row_idx: int, prev_row_idx: int | None = None
    ) -> bool:
        """Report whether this rule fires for the row at ``row_idx``.

        Args:
            df: Frame holding the data being paginated.
            row_idx: Index of the row under inspection.
            prev_row_idx: Index of the row to compare against, or ``None``
                when there is no earlier row.

        Returns:
            ``False`` when there is no previous row or the rule's column is
            absent from ``df``; otherwise ``True`` only if ``break_on_change``
            is set and the column value differs between the two rows.
        """
        no_previous_row = prev_row_idx is None
        if no_previous_row or self.column not in df.columns:
            return False

        # Look the column up once and read both rows from it.
        values = df[self.column]
        current_value = values[row_idx]
        previous_value = values[prev_row_idx]

        return self.break_on_change and current_value != previous_value
class PageDict(BaseModel):
    """Advanced pagination control structure (r2rtf PageDict equivalent).

    Stores one ``PageConfig`` per page, the ``PageBreakRule`` objects used to
    derive forced breaks, and an index from content identifiers to the page
    numbers they were added to. It offers page_index-like lookups on top of
    row-based pagination.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_configs: MutableMapping[int, PageConfig] = Field(
        default_factory=dict, description="Configuration for each page"
    )
    break_rules: list[PageBreakRule] = Field(
        default_factory=list, description="Rules determining where page breaks occur"
    )
    content_index: MutableMapping[str, list[int]] = Field(
        default_factory=dict, description="Maps content identifiers to page numbers"
    )
    total_pages: int = Field(default=0, description="Total number of pages")
    nrow_per_page: int = Field(default=40, description="Base rows per page")

    def add_page_config(self, config: PageConfig) -> None:
        """Store ``config`` under its page number.

        ``total_pages`` is raised to ``config.page_number`` when that is larger
        than the current value; it is never lowered here.
        """
        self.page_configs[config.page_number] = config
        self.total_pages = max(self.total_pages, config.page_number)

    def get_page_config(self, page_num: int) -> PageConfig | None:
        """Return the configuration stored for ``page_num``, or ``None``."""
        return self.page_configs.get(page_num)

    def add_break_rule(self, rule: PageBreakRule) -> None:
        """Append ``rule`` and keep ``break_rules`` ordered by priority.

        The list is re-sorted with the highest priority first. The sort is
        stable, so rules of equal priority keep their insertion order.
        """
        self.break_rules.append(rule)
        self.break_rules.sort(key=lambda r: r.priority, reverse=True)

    def _ensure_break_rule(self, rule: PageBreakRule) -> None:
        """Register ``rule`` unless an equal rule is already present.

        ``PageBreakRule`` is a dataclass, so equality is field-wise. This keeps
        repeated pagination runs from piling up identical rules.
        """
        if rule not in self.break_rules:
            self.add_break_rule(rule)

    def get_page_for_content(self, content_id: str) -> int:
        """Return the first page indexed for ``content_id`` (page_index equivalent).

        Falls back to page 1 when the content has no indexed pages.
        """
        pages = self.content_index.get(content_id, [])
        return pages[0] if pages else 1

    def get_pages_for_content(self, content_id: str) -> list[int]:
        """Return every page indexed for ``content_id`` (empty list if none)."""
        return self.content_index.get(content_id, [])

    def add_content_to_page(self, content_id: str, page_num: int) -> None:
        """Record that ``content_id`` appears on ``page_num``.

        Each page is stored at most once per content id, and the page list is
        kept in ascending order.
        """
        pages = self.content_index.setdefault(content_id, [])
        if page_num not in pages:
            pages.append(page_num)
            pages.sort()

    def get_section_pages(self, section_header: str) -> Sequence[int]:
        """Return the sorted page numbers whose section headers include ``section_header``."""
        return sorted(
            page_num
            for page_num, config in self.page_configs.items()
            if section_header in config.section_headers
        )

    def get_page_break_summary(self) -> Mapping[str, int]:
        """Count the stored pages per break-type value (e.g. ``"forced"``)."""
        summary: MutableMapping[str, int] = {}
        for config in self.page_configs.values():
            break_type = config.break_type.value
            summary[break_type] = summary.get(break_type, 0) + 1
        return summary

    def calculate_pages_from_dataframe(
        self,
        df: pl.DataFrame,
        page_by: Sequence[str] | None = None,
        subline_by: str | None = None,
        new_page: bool = False,
        additional_rows_per_page: int = 0,
    ) -> None:
        """Calculate page configurations from a DataFrame.

        This method implements the core pagination algorithm inspired by
        r2rtf's approach.

        Args:
            df: Data to paginate. An empty frame returns immediately and
                leaves all existing state untouched.
            page_by: Columns whose first-row value on each page becomes a
                section header. They force page breaks only when ``new_page``
                is true.
            subline_by: Column whose value changes always force a page break
                and whose first-row value becomes the page's subline header.
            new_page: Whether changes in ``page_by`` columns force a new page.
            additional_rows_per_page: Rows subtracted from ``nrow_per_page``
                to get the effective row budget (never below 1).

        Note:
            ``page_configs`` and ``content_index`` are rebuilt on each call,
            but ``break_rules`` persist. A rule equal to an existing one is
            not added again.
        """
        if df.is_empty():
            return

        # Start from a clean slate for page-level state.
        self.page_configs.clear()
        self.content_index.clear()

        # Reserve space for headers, footers, etc., but keep at least one row.
        effective_nrow = max(1, self.nrow_per_page - additional_rows_per_page)

        # page_by columns only force breaks when new_page=True.
        if page_by and new_page:
            for col in page_by:
                self._ensure_break_rule(
                    PageBreakRule(
                        column=col,
                        break_on_change=True,
                        force_new_page=True,
                        priority=10,
                    )
                )

        if subline_by:
            self._ensure_break_rule(
                PageBreakRule(
                    column=subline_by,
                    break_on_change=True,
                    force_new_page=True,
                    priority=20,  # Higher priority than page_by
                )
            )

        # Pass subline_by so its breaks are labelled SUBLINE whatever the
        # column is called.
        page_boundaries = self._calculate_page_boundaries(
            df, effective_nrow, subline_by=subline_by
        )

        for page_num, (start_row, end_row, break_type) in enumerate(page_boundaries, 1):
            config = PageConfig(
                page_number=page_num,
                start_row=start_row,
                end_row=end_row,
                break_type=break_type,
            )

            # Section headers come from the page's first row.
            if page_by and start_row < df.height:
                for col in page_by:
                    if col in df.columns:
                        header_value = str(df[col][start_row])
                        config.section_headers.append(f"{col}: {header_value}")

            # Subline header also comes from the page's first row.
            if subline_by and subline_by in df.columns and start_row < df.height:
                subline_value = str(df[subline_by][start_row])
                config.subline_header = f"{subline_by}: {subline_value}"

            self.add_page_config(config)

        self.total_pages = len(page_boundaries)

    def _calculate_page_boundaries(
        self,
        df: pl.DataFrame,
        effective_nrow: int,
        subline_by: str | None = None,
    ) -> Sequence[tuple[int, int, PageBreakType]]:
        """Calculate where page boundaries should occur.

        Args:
            df: Data being paginated.
            effective_nrow: Maximum number of rows placed on one page.
            subline_by: Column whose forced breaks are reported as
                ``SUBLINE``. When omitted, only columns whose name contains
                "subline" (case-insensitive) are reported that way.

        Returns:
            ``(start_row, end_row, break_type)`` tuples with inclusive row
            indices. The break type stored with a page is the one detected at
            the row that ended it. The final page is always ``AUTOMATIC``.
        """
        boundaries: list[tuple[int, int, PageBreakType]] = []
        current_start = 0

        for row_idx in range(df.height):
            forced_break = False
            break_type = PageBreakType.AUTOMATIC

            if row_idx > 0:  # Don't break on first row
                # Rules are sorted by priority, so the first match wins.
                for rule in self.break_rules:
                    if rule.force_new_page and rule.applies_to_row(
                        df, row_idx, row_idx - 1
                    ):
                        forced_break = True
                        is_subline = bool(rule.column) and (
                            rule.column == subline_by
                            or "subline" in rule.column.lower()
                        )
                        break_type = (
                            PageBreakType.SUBLINE
                            if is_subline
                            else PageBreakType.FORCED
                        )
                        break

            # Close the current page on a forced break or a full row budget.
            rows_on_current_page = row_idx - current_start
            if forced_break or (rows_on_current_page >= effective_nrow and row_idx > 0):
                boundaries.append((current_start, row_idx - 1, break_type))
                current_start = row_idx

        # Whatever remains after the last break forms the final page.
        if current_start < df.height:
            boundaries.append((current_start, df.height - 1, PageBreakType.AUTOMATIC))

        return boundaries

    def to_legacy_page_info(self) -> Sequence[Mapping[str, Any]]:
        """Convert to legacy page info format for backward compatibility.

        Returns:
            One dict per configured page, in ascending page order.
        """
        page_info_list = []

        for page_num in sorted(self.page_configs.keys()):
            config = self.page_configs[page_num]
            page_info = {
                "page_number": page_num,
                "total_pages": self.total_pages,
                "start_row": config.start_row,
                "end_row": config.end_row,
                "is_first_page": page_num == 1,
                "is_last_page": page_num == self.total_pages,
                "break_type": config.break_type.value,
                "section_headers": config.section_headers,
                "subline_header": config.subline_header,
            }
            page_info_list.append(page_info)

        return page_info_list
280class PageIndexManager:
281 """Provides page_index-like functionality for advanced page control
283 This class enables explicit control over which content appears on which pages,
284 similar to how a page_index parameter would work in other pagination systems.
285 """
287 def __init__(self, page_dict: PageDict):
288 self.page_dict = page_dict
289 self._content_assignments: MutableMapping[str, int] = {}
290 self._page_content_map: MutableMapping[int, MutableSet[str]] = {}
292 def assign_content_to_page(self, content_id: str, page_num: int) -> None:
293 """Assign specific content to a specific page (explicit page_index control)"""
294 self._content_assignments[content_id] = page_num
296 if page_num not in self._page_content_map:
297 self._page_content_map[page_num] = set()
298 self._page_content_map[page_num].add(content_id)
300 # Update the PageDict
301 self.page_dict.add_content_to_page(content_id, page_num)
303 # Mark content as forced on the target page
304 if page_num in self.page_dict.page_configs:
305 self.page_dict.page_configs[page_num].forced_content.add(content_id)
307 def get_content_page(self, content_id: str) -> int | None:
308 """Get the assigned page for specific content"""
309 return self._content_assignments.get(content_id)
311 def get_page_content(self, page_num: int) -> MutableSet[str]:
312 """Get all content assigned to a specific page"""
313 return self._page_content_map.get(page_num, set())
315 def force_page_break_before_content(self, content_id: str) -> None:
316 """Force a page break before specific content appears"""
317 # This would require integration with the DataFrame processing
318 # to identify where the content appears and insert a break rule
319 pass
321 def get_content_summary(self) -> Mapping[str, Mapping[str, Any]]:
322 """Get summary of all content assignments"""
323 summary = {}
324 for content_id, page_num in self._content_assignments.items():
325 summary[content_id] = {
326 "assigned_page": page_num,
327 "is_forced": content_id
328 in self.page_dict.page_configs.get(
329 page_num, PageConfig(0, 0, 0, PageBreakType.AUTOMATIC)
330 ).forced_content,
331 }
332 return summary
334 def optimize_page_distribution(self) -> None:
335 """Optimize content distribution across pages to balance page lengths"""
336 # Advanced algorithm to redistribute content for better balance
337 # This could implement sophisticated optimization based on content weight,
338 # page capacity, and user constraints
339 pass