Coverage for src/rtflite/pagination/page_dict.py: 90%
158 statements
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-25 22:35 +0000
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-25 22:35 +0000
1"""
2Advanced pagination control system for rtflite.
4This module implements a PageDict equivalent to r2rtf's advanced pagination features,
5providing page_index-like functionality while maintaining rtflite's existing architecture.
6"""
8from collections.abc import Mapping, MutableMapping, MutableSet, Sequence
9from dataclasses import dataclass, field
10from enum import Enum
11from typing import Any
13import polars as pl
14from pydantic import BaseModel, ConfigDict, Field
class PageBreakType(Enum):
    """Enumerates the reasons a page break can occur."""

    # Break produced because the nrow limit was reached.
    AUTOMATIC = "automatic"
    # Break produced by page_by with new_page=True.
    FORCED = "forced"
    # Break produced by a change in the subline_by value.
    SUBLINE = "subline"
    # Break specified manually via PageIndexManager.
    MANUAL = "manual"
@dataclass
class PageConfig:
    """Describes a single page: its number, row span, and break metadata."""

    page_number: int
    # First and last data row of the page; row_count treats both as inclusive.
    start_row: int
    end_row: int
    break_type: PageBreakType
    section_headers: list[str] = field(default_factory=list)
    subline_header: str | None = None
    group_context: dict[str, Any] = field(default_factory=dict)
    # Content IDs forced to this page.
    forced_content: set[str] = field(default_factory=set)

    @property
    def row_count(self) -> int:
        """Return the number of data rows on this page (inclusive bounds)."""
        span = self.end_row - self.start_row
        return span + 1

    @property
    def is_section_start(self) -> bool:
        """Return True when the break type is FORCED or SUBLINE."""
        section_break_types = {PageBreakType.FORCED, PageBreakType.SUBLINE}
        return self.break_type in section_break_types
@dataclass
class PageBreakRule:
    """A rule that decides whether a row should start a new page."""

    column: str
    break_on_change: bool = True
    force_new_page: bool = False
    # Rules with a higher priority are processed first.
    priority: int = 0

    def applies_to_row(
        self, df: pl.DataFrame, row_idx: int, prev_row_idx: int | None = None
    ) -> bool:
        """Report whether this rule triggers a page break at ``row_idx``.

        Returns False when there is no previous row to compare against or
        when the rule's column is absent from ``df``. Otherwise the rule
        fires only if ``break_on_change`` is set and the column value at
        ``row_idx`` differs from the value at ``prev_row_idx``.
        """
        comparable = prev_row_idx is not None and self.column in df.columns
        if not comparable:
            return False

        values = df[self.column]
        after = values[row_idx]
        before = values[prev_row_idx]
        return self.break_on_change and after != before
class PageDict(BaseModel):
    """Advanced pagination control structure (r2rtf PageDict equivalent).

    Holds one ``PageConfig`` per page number, the break rules used to split a
    DataFrame into pages, and an index from content identifiers to the page
    numbers they appear on. This provides page_index-like functionality while
    staying compatible with rtflite's row-based pagination.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_configs: MutableMapping[int, PageConfig] = Field(
        default_factory=dict, description="Configuration for each page"
    )
    break_rules: list[PageBreakRule] = Field(
        default_factory=list, description="Rules determining where page breaks occur"
    )
    content_index: MutableMapping[str, list[int]] = Field(
        default_factory=dict, description="Maps content identifiers to page numbers"
    )
    total_pages: int = Field(default=0, description="Total number of pages")
    nrow_per_page: int = Field(default=40, description="Base rows per page")

    def add_page_config(self, config: PageConfig) -> None:
        """Store ``config`` under its page number and grow ``total_pages`` if needed."""
        self.page_configs[config.page_number] = config
        self.total_pages = max(self.total_pages, config.page_number)

    def get_page_config(self, page_num: int) -> PageConfig | None:
        """Return the configuration for ``page_num``, or None if there is none."""
        return self.page_configs.get(page_num)

    def add_break_rule(self, rule: PageBreakRule) -> None:
        """Register ``rule`` and keep the rule list ordered by descending priority."""
        self.break_rules.append(rule)
        # list.sort is stable, so equal-priority rules keep insertion order.
        self.break_rules.sort(key=lambda r: r.priority, reverse=True)

    def _add_break_rule_once(self, rule: PageBreakRule) -> None:
        """Register ``rule`` unless an equal rule is already present."""
        if rule not in self.break_rules:
            self.add_break_rule(rule)

    def get_page_for_content(self, content_id: str) -> int:
        """Return the first page where the content appears (page_index equivalent).

        Falls back to page 1 when the content is not indexed.
        """
        pages = self.content_index.get(content_id, [])
        return pages[0] if pages else 1

    def get_pages_for_content(self, content_id: str) -> list[int]:
        """Return all page numbers where the content appears.

        A copy is returned so callers cannot mutate the internal index.
        """
        return list(self.content_index.get(content_id, []))

    def add_content_to_page(self, content_id: str, page_num: int) -> None:
        """Record that ``content_id`` appears on ``page_num`` (kept sorted, no duplicates)."""
        pages = self.content_index.setdefault(content_id, [])
        if page_num not in pages:
            pages.append(page_num)
            pages.sort()

    def get_section_pages(self, section_header: str) -> Sequence[int]:
        """Return the sorted page numbers whose section headers include ``section_header``."""
        return sorted(
            page_num
            for page_num, config in self.page_configs.items()
            if section_header in config.section_headers
        )

    def get_page_break_summary(self) -> Mapping[str, int]:
        """Return how many pages carry each break type, keyed by the enum value."""
        summary: MutableMapping[str, int] = {}
        for config in self.page_configs.values():
            break_type = config.break_type.value
            summary[break_type] = summary.get(break_type, 0) + 1
        return summary

    def calculate_pages_from_dataframe(
        self,
        df: pl.DataFrame,
        page_by: Sequence[str] | None = None,
        subline_by: str | None = None,
        new_page: bool = False,
        additional_rows_per_page: int = 0,
    ) -> None:
        """Calculate page configurations from a DataFrame.

        Existing page configurations and the content index are discarded
        first, so an empty ``df`` leaves the instance with zero pages instead
        of the results of a previous run.

        Args:
            df: Data to paginate.
            page_by: Columns whose values become section headers; with
                ``new_page=True`` a change in any of them forces a new page.
            subline_by: Column whose value becomes the subline header; a
                change in it always forces a new page.
            new_page: Whether ``page_by`` changes force a page break.
            additional_rows_per_page: Rows to subtract from ``nrow_per_page``
                (headers, footers, etc.); at least one row per page is kept.

        Break rules derived from the arguments are added to ``break_rules``
        only when an equal rule is not already registered, so repeated calls
        do not pile up duplicates. Rules registered earlier are kept.
        """
        # Reset before the emptiness check so stale pages never survive.
        self.page_configs.clear()
        self.content_index.clear()
        self.total_pages = 0

        if df.is_empty():
            return

        # Effective rows per page (accounting for headers, footers, etc.).
        effective_nrow = max(1, self.nrow_per_page - additional_rows_per_page)

        if page_by and new_page:
            for col in page_by:
                self._add_break_rule_once(
                    PageBreakRule(
                        column=col,
                        break_on_change=True,
                        force_new_page=True,
                        priority=10,
                    )
                )

        if subline_by:
            self._add_break_rule_once(
                PageBreakRule(
                    column=subline_by,
                    break_on_change=True,
                    force_new_page=True,
                    priority=20,  # Higher priority than page_by
                )
            )

        page_boundaries = self._calculate_page_boundaries(
            df, effective_nrow, subline_by=subline_by
        )

        for page_num, (start_row, end_row, break_type) in enumerate(page_boundaries, 1):
            config = PageConfig(
                page_number=page_num,
                start_row=start_row,
                end_row=end_row,
                break_type=break_type,
            )

            # Section headers come from the page_by values of the page's first row.
            if page_by and start_row < df.height:
                for col in page_by:
                    if col in df.columns:
                        header_value = str(df[col][start_row])
                        config.section_headers.append(f"{col}: {header_value}")

            # Subline header comes from the subline_by value of the first row.
            if subline_by and subline_by in df.columns and start_row < df.height:
                subline_value = str(df[subline_by][start_row])
                config.subline_header = f"{subline_by}: {subline_value}"

            self.add_page_config(config)

        self.total_pages = len(page_boundaries)

    def _calculate_page_boundaries(
        self,
        df: pl.DataFrame,
        effective_nrow: int,
        subline_by: str | None = None,
    ) -> Sequence[tuple[int, int, PageBreakType]]:
        """Calculate where page boundaries should occur.

        Args:
            df: Data to paginate.
            effective_nrow: Maximum number of rows on one page.
            subline_by: Column that drives subline breaks. A forced break on
                this column is reported as SUBLINE regardless of its name;
                without it, only columns whose name contains "subline" are.

        Returns:
            ``(start_row, end_row, break_type)`` tuples with inclusive bounds,
            in page order.
        """
        boundaries: list[tuple[int, int, PageBreakType]] = []
        current_start = 0

        # Row 0 can never close a page, so the scan starts at row 1.
        for row_idx in range(1, df.height):
            forced_break = False
            break_type = PageBreakType.AUTOMATIC

            # Rules are sorted by priority; only the first matching rule counts.
            for rule in self.break_rules:
                if rule.applies_to_row(df, row_idx, row_idx - 1):
                    if rule.force_new_page:
                        forced_break = True
                        is_subline = (
                            subline_by is not None and rule.column == subline_by
                        ) or (bool(rule.column) and "subline" in rule.column.lower())
                        break_type = (
                            PageBreakType.SUBLINE if is_subline else PageBreakType.FORCED
                        )
                    break

            rows_on_current_page = row_idx - current_start
            if forced_break or rows_on_current_page >= effective_nrow:
                # NOTE(review): the break type is recorded on the page that
                # ENDS here, not on the page that starts at row_idx, while
                # PageConfig.is_section_start reads it as "this page starts a
                # section" - confirm which meaning callers rely on.
                boundaries.append((current_start, row_idx - 1, break_type))
                current_start = row_idx

        # The remaining rows form the final page, always typed AUTOMATIC.
        if current_start < df.height:
            boundaries.append((current_start, df.height - 1, PageBreakType.AUTOMATIC))

        return boundaries

    def to_legacy_page_info(self) -> Sequence[Mapping[str, Any]]:
        """Convert to the legacy page info format for backward compatibility.

        Returns one dict per configured page, ordered by page number.
        """
        return [
            {
                "page_number": page_num,
                "total_pages": self.total_pages,
                "start_row": config.start_row,
                "end_row": config.end_row,
                "is_first_page": page_num == 1,
                "is_last_page": page_num == self.total_pages,
                "break_type": config.break_type.value,
                "section_headers": config.section_headers,
                "subline_header": config.subline_header,
            }
            for page_num, config in sorted(
                self.page_configs.items(), key=lambda item: item[0]
            )
        ]
class PageIndexManager:
    """Provides page_index-like functionality for advanced page control.

    Tracks explicit assignments of content identifiers to page numbers and
    mirrors them into the wrapped ``PageDict``. Each content identifier has
    at most one assigned page at a time.
    """

    def __init__(self, page_dict: PageDict):
        self.page_dict = page_dict
        # content_id -> the single page it is currently assigned to
        self._content_assignments: MutableMapping[str, int] = {}
        # page number -> content ids currently assigned to that page
        self._page_content_map: MutableMapping[int, MutableSet[str]] = {}

    def assign_content_to_page(self, content_id: str, page_num: int) -> None:
        """Assign content to a specific page (explicit page_index control).

        If the content was previously assigned to a different page, that
        earlier assignment is released first so the per-page content sets and
        the ``forced_content`` markers stay consistent with
        ``get_content_page``. The PageDict content index is only ever added
        to, because the content may also be indexed on the old page for other
        reasons.
        """
        previous_page = self._content_assignments.get(content_id)
        if previous_page is not None and previous_page != page_num:
            self._release_assignment(content_id, previous_page)

        self._content_assignments[content_id] = page_num
        self._page_content_map.setdefault(page_num, set()).add(content_id)

        # Update the PageDict
        self.page_dict.add_content_to_page(content_id, page_num)

        # Mark content as forced on the target page, if that page is configured.
        target_config = self.page_dict.page_configs.get(page_num)
        if target_config is not None:
            target_config.forced_content.add(content_id)

    def _release_assignment(self, content_id: str, page_num: int) -> None:
        """Drop ``content_id`` from the bookkeeping kept for ``page_num``."""
        assigned = self._page_content_map.get(page_num)
        if assigned is not None:
            assigned.discard(content_id)
            if not assigned:
                del self._page_content_map[page_num]

        stale_config = self.page_dict.page_configs.get(page_num)
        if stale_config is not None:
            stale_config.forced_content.discard(content_id)

    def get_content_page(self, content_id: str) -> int | None:
        """Return the page assigned to ``content_id``, or None if unassigned."""
        return self._content_assignments.get(content_id)

    def get_page_content(self, page_num: int) -> MutableSet[str]:
        """Return the content ids assigned to ``page_num``.

        A new set is returned so callers cannot alter the manager's state.
        """
        return set(self._page_content_map.get(page_num, ()))

    def force_page_break_before_content(self, content_id: str) -> None:
        """Force a page break before specific content appears.

        Not implemented yet: currently a no-op. It would require integration
        with the DataFrame processing to identify where the content appears
        and insert a break rule.
        """

    def get_content_summary(self) -> Mapping[str, Mapping[str, Any]]:
        """Return the assigned page and forced flag for every assigned content id.

        ``is_forced`` is True only when the assigned page currently has a
        configuration in the PageDict and lists the content as forced.
        """
        page_configs = self.page_dict.page_configs
        summary = {}
        for content_id, page_num in self._content_assignments.items():
            config = page_configs.get(page_num)
            summary[content_id] = {
                "assigned_page": page_num,
                "is_forced": config is not None
                and content_id in config.forced_content,
            }
        return summary

    def optimize_page_distribution(self) -> None:
        """Optimize content distribution across pages to balance page lengths.

        Not implemented yet: currently a no-op. A future version could
        redistribute content based on content weight, page capacity, and user
        constraints.
        """