Coverage for src/rtflite/pagination/page_dict.py: 90%
157 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-14 16:35 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-14 16:35 +0000
1"""
Advanced pagination control system for rtflite.

This module implements a PageDict equivalent to r2rtf's advanced pagination features,
providing page_index-like functionality while maintaining rtflite's existing architecture.
6"""
8from dataclasses import dataclass, field
9from enum import Enum
10from typing import Any
12import polars as pl
13from pydantic import BaseModel, ConfigDict, Field
class PageBreakType(Enum):
    """Reason a page break was introduced."""

    # The per-page row limit (nrow) was reached.
    AUTOMATIC = "automatic"
    # A page_by column changed while new_page=True.
    FORCED = "forced"
    # The subline_by value changed.
    SUBLINE = "subline"
    # Explicitly specified via PageIndexManager.
    MANUAL = "manual"
@dataclass
class PageConfig:
    """Layout record for a single page of paginated output.

    ``start_row`` and ``end_row`` are used as an inclusive row range
    (see ``row_count``).
    """

    page_number: int
    start_row: int
    end_row: int
    break_type: PageBreakType
    section_headers: list[str] = field(default_factory=list)
    subline_header: str | None = None
    group_context: dict[str, Any] = field(default_factory=dict)
    # Content IDs forced onto this page.
    forced_content: set[str] = field(default_factory=set)

    @property
    def row_count(self) -> int:
        """Number of data rows on the page (both end points included)."""
        span = self.end_row - self.start_row
        return span + 1

    @property
    def is_section_start(self) -> bool:
        """True when the page's break type is FORCED or SUBLINE."""
        section_breaks = {PageBreakType.FORCED, PageBreakType.SUBLINE}
        return self.break_type in section_breaks
@dataclass
class PageBreakRule:
    """Describes when a change in one column should cause a page break."""

    column: str
    break_on_change: bool = True
    force_new_page: bool = False
    # Rules with a higher priority are processed first.
    priority: int = 0

    def applies_to_row(
        self, df: pl.DataFrame, row_idx: int, prev_row_idx: int | None = None
    ) -> bool:
        """Tell whether this rule fires for ``row_idx`` compared to ``prev_row_idx``.

        The rule never fires when there is no previous row to compare with or
        when ``column`` is absent from ``df``. Otherwise it fires when
        ``break_on_change`` is set and the column value differs between the
        two rows.
        """
        if prev_row_idx is None or self.column not in df.columns:
            return False

        # Both values are read before break_on_change is consulted, so an
        # invalid row index surfaces regardless of the flag.
        values = df[self.column]
        current, previous = values[row_idx], values[prev_row_idx]
        return self.break_on_change and current != previous
class PageDict(BaseModel):
    """Advanced pagination control structure (r2rtf PageDict equivalent)

    Holds the per-page configurations, the break rules used to derive them and
    an index from content identifiers to page numbers, providing
    page_index-like functionality on top of rtflite's row-based pagination.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_configs: dict[int, PageConfig] = Field(
        default_factory=dict, description="Configuration for each page"
    )
    break_rules: list[PageBreakRule] = Field(
        default_factory=list, description="Rules determining where page breaks occur"
    )
    content_index: dict[str, list[int]] = Field(
        default_factory=dict, description="Maps content identifiers to page numbers"
    )
    total_pages: int = Field(default=0, description="Total number of pages")
    nrow_per_page: int = Field(default=40, description="Base rows per page")

    def add_page_config(self, config: PageConfig) -> None:
        """Store ``config`` under its page number, replacing any existing entry.

        ``total_pages`` is raised to at least ``config.page_number``.
        """
        self.page_configs[config.page_number] = config
        self.total_pages = max(self.total_pages, config.page_number)

    def get_page_config(self, page_num: int) -> PageConfig | None:
        """Return the configuration for ``page_num`` or ``None`` if unknown."""
        return self.page_configs.get(page_num)

    def add_break_rule(self, rule: PageBreakRule) -> None:
        """Append ``rule`` and keep ``break_rules`` ordered by priority.

        The sort is stable, so rules of equal priority keep insertion order.
        """
        self.break_rules.append(rule)
        # Highest priority first.
        self.break_rules.sort(key=lambda r: r.priority, reverse=True)

    def _ensure_break_rule(self, rule: PageBreakRule) -> None:
        """Add ``rule`` unless an equal rule is already registered."""
        if rule not in self.break_rules:
            self.add_break_rule(rule)

    def get_page_for_content(self, content_id: str) -> int:
        """Return the first page listing ``content_id`` (page_index equivalent).

        Falls back to page 1 when the content is not indexed.
        """
        pages = self.content_index.get(content_id, [])
        return pages[0] if pages else 1

    def get_pages_for_content(self, content_id: str) -> list[int]:
        """Return every page number recorded for ``content_id``.

        An empty list is returned for unknown content.
        """
        return self.content_index.get(content_id, [])

    def add_content_to_page(self, content_id: str, page_num: int) -> None:
        """Record that ``content_id`` appears on ``page_num``.

        Page lists are kept sorted and free of duplicates.
        """
        pages = self.content_index.setdefault(content_id, [])
        if page_num not in pages:
            pages.append(page_num)
            pages.sort()

    def get_section_pages(self, section_header: str) -> list[int]:
        """Return the sorted page numbers whose headers include ``section_header``."""
        return sorted(
            page_num
            for page_num, config in self.page_configs.items()
            if section_header in config.section_headers
        )

    def get_page_break_summary(self) -> dict[str, int]:
        """Count the configured pages per break type value."""
        summary: dict[str, int] = {}
        for config in self.page_configs.values():
            break_type = config.break_type.value
            summary[break_type] = summary.get(break_type, 0) + 1
        return summary

    def calculate_pages_from_dataframe(
        self,
        df: pl.DataFrame,
        page_by: list[str] | None = None,
        subline_by: str | None = None,
        new_page: bool = False,
        additional_rows_per_page: int = 0,
    ) -> None:
        """Calculate page configurations from a DataFrame

        Existing ``page_configs``, ``content_index`` and ``total_pages`` are
        reset on every call, including for an empty ``df`` (which yields zero
        pages), so results from an earlier calculation never linger.

        Args:
            df: Data to paginate.
            page_by: Columns whose value at a page's first row becomes a
                section header; with ``new_page=True`` a change in any of
                them also forces a page break.
            subline_by: Column whose changes force a page break and whose
                value at a page's first row becomes the subline header.
            new_page: Whether ``page_by`` changes force a new page.
            additional_rows_per_page: Rows subtracted from ``nrow_per_page``
                to obtain the usable rows per page (never fewer than one).

        Break rules derived from the arguments are stored in ``break_rules``
        and stay there for later calls; a rule equal to one that is already
        registered is not added a second time.
        """
        # Reset derived state before the emptiness check so an empty frame
        # cannot leave pages from a previous run behind.
        self.page_configs.clear()
        self.content_index.clear()
        self.total_pages = 0
        if df.is_empty():
            return

        # Effective rows per page (accounting for headers, footers, etc.)
        effective_nrow = max(1, self.nrow_per_page - additional_rows_per_page)

        # Register parameter-derived rules once; repeated calls with the same
        # arguments must not pile up duplicate rules.
        if page_by and new_page:
            for col in page_by:
                self._ensure_break_rule(
                    PageBreakRule(
                        column=col,
                        break_on_change=True,
                        force_new_page=True,
                        priority=10,
                    )
                )

        if subline_by:
            self._ensure_break_rule(
                PageBreakRule(
                    column=subline_by,
                    break_on_change=True,
                    force_new_page=True,
                    priority=20,  # Higher priority than page_by
                )
            )

        page_boundaries = self._calculate_page_boundaries(df, effective_nrow)

        for page_num, (start_row, end_row, break_type) in enumerate(page_boundaries, 1):
            config = PageConfig(
                page_number=page_num,
                start_row=start_row,
                end_row=end_row,
                break_type=break_type,
            )

            # Section headers come from the page_by values at the first row.
            if page_by and start_row < df.height:
                for col in page_by:
                    if col in df.columns:
                        header_value = str(df[col][start_row])
                        config.section_headers.append(f"{col}: {header_value}")

            # Subline header comes from the subline_by value at the first row.
            if subline_by and subline_by in df.columns and start_row < df.height:
                subline_value = str(df[subline_by][start_row])
                config.subline_header = f"{subline_by}: {subline_value}"

            self.add_page_config(config)

        self.total_pages = len(page_boundaries)

    def _calculate_page_boundaries(
        self, df: pl.DataFrame, effective_nrow: int
    ) -> list[tuple[int, int, PageBreakType]]:
        """Calculate where page boundaries should occur

        Returns one ``(start_row, end_row, break_type)`` tuple per page with
        inclusive row indices. A page is closed before ``row_idx`` when a
        break rule forces it or when the page already holds
        ``effective_nrow`` rows.
        """
        boundaries: list[tuple[int, int, PageBreakType]] = []
        current_start = 0

        for row_idx in range(df.height):
            forced_break = False
            break_type = PageBreakType.AUTOMATIC

            if row_idx > 0:  # Don't break on first row
                for rule in self.break_rules:
                    if rule.applies_to_row(df, row_idx, row_idx - 1):
                        if rule.force_new_page:
                            forced_break = True
                            break_type = PageBreakType.FORCED
                            # SUBLINE is inferred from the column *name*
                            # containing "subline", not from subline_by.
                            if rule.column and "subline" in rule.column.lower():
                                break_type = PageBreakType.SUBLINE
                        # NOTE(review): the first rule that fires ends the
                        # scan whether or not it forces a break - confirm
                        # this is the intended precedence.
                        break

            rows_on_current_page = row_idx - current_start
            if forced_break or (rows_on_current_page >= effective_nrow and row_idx > 0):
                # NOTE(review): the break type is recorded on the page that
                # ENDS here, not on the page the break starts - confirm that
                # consumers such as PageConfig.is_section_start expect this.
                boundaries.append((current_start, row_idx - 1, break_type))
                current_start = row_idx

        # Remaining rows form the final page, always tagged AUTOMATIC.
        if current_start < df.height:
            boundaries.append((current_start, df.height - 1, PageBreakType.AUTOMATIC))

        return boundaries

    def to_legacy_page_info(self) -> list[dict[str, Any]]:
        """Convert to legacy page info format for backward compatibility

        Returns one dict per configured page, ordered by page number.
        """
        page_info_list: list[dict[str, Any]] = []

        for page_num in sorted(self.page_configs):
            config = self.page_configs[page_num]
            page_info_list.append(
                {
                    "page_number": page_num,
                    "total_pages": self.total_pages,
                    "start_row": config.start_row,
                    "end_row": config.end_row,
                    "is_first_page": page_num == 1,
                    "is_last_page": page_num == self.total_pages,
                    "break_type": config.break_type.value,
                    "section_headers": config.section_headers,
                    "subline_header": config.subline_header,
                }
            )

        return page_info_list
275class PageIndexManager:
276 """Provides page_index-like functionality for advanced page control
278 This class enables explicit control over which content appears on which pages,
279 similar to how a page_index parameter would work in other pagination systems.
280 """
282 def __init__(self, page_dict: PageDict):
283 self.page_dict = page_dict
284 self._content_assignments: dict[str, int] = {}
285 self._page_content_map: dict[int, set[str]] = {}
287 def assign_content_to_page(self, content_id: str, page_num: int) -> None:
288 """Assign specific content to a specific page (explicit page_index control)"""
289 self._content_assignments[content_id] = page_num
291 if page_num not in self._page_content_map:
292 self._page_content_map[page_num] = set()
293 self._page_content_map[page_num].add(content_id)
295 # Update the PageDict
296 self.page_dict.add_content_to_page(content_id, page_num)
298 # Mark content as forced on the target page
299 if page_num in self.page_dict.page_configs:
300 self.page_dict.page_configs[page_num].forced_content.add(content_id)
302 def get_content_page(self, content_id: str) -> int | None:
303 """Get the assigned page for specific content"""
304 return self._content_assignments.get(content_id)
306 def get_page_content(self, page_num: int) -> set[str]:
307 """Get all content assigned to a specific page"""
308 return self._page_content_map.get(page_num, set())
310 def force_page_break_before_content(self, content_id: str) -> None:
311 """Force a page break before specific content appears"""
312 # This would require integration with the DataFrame processing
313 # to identify where the content appears and insert a break rule
314 pass
316 def get_content_summary(self) -> dict[str, dict[str, Any]]:
317 """Get summary of all content assignments"""
318 summary = {}
319 for content_id, page_num in self._content_assignments.items():
320 summary[content_id] = {
321 "assigned_page": page_num,
322 "is_forced": content_id
323 in self.page_dict.page_configs.get(
324 page_num, PageConfig(0, 0, 0, PageBreakType.AUTOMATIC)
325 ).forced_content,
326 }
327 return summary
329 def optimize_page_distribution(self) -> None:
330 """Optimize content distribution across pages to balance page lengths"""
331 # Advanced algorithm to redistribute content for better balance
332 # This could implement sophisticated optimization based on content weight,
333 # page capacity, and user constraints
334 pass