Coverage for src / rtflite / pagination / page_dict.py: 89%

152 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-28 05:09 +0000

1""" 

2Advanced pagination control system for rtflite. 

3 

4This module implements a PageDict equivalent to r2rtf's advanced pagination 

5features, providing page_index-like functionality while maintaining rtflite's 

6existing architecture. 

7""" 

8 

9from collections.abc import Mapping, MutableMapping, MutableSet, Sequence 

10from dataclasses import dataclass, field 

11from enum import Enum 

12from typing import Any 

13 

14import polars as pl 

15from pydantic import BaseModel, ConfigDict, Field 

16 

17 

class PageBreakType(Enum):
    """Reason a page boundary was introduced during pagination."""

    # The per-page row limit (nrow) was reached.
    AUTOMATIC = "automatic"
    # A page_by column changed while new_page=True.
    FORCED = "forced"
    # The subline_by value changed.
    SUBLINE = "subline"
    # The break was specified manually via PageIndexManager.
    MANUAL = "manual"

25 

26 

@dataclass
class PageConfig:
    """Layout record describing a single page.

    ``start_row`` and ``end_row`` are treated as an inclusive row range by
    ``row_count``.
    """

    page_number: int
    start_row: int
    end_row: int
    break_type: PageBreakType
    section_headers: list[str] = field(default_factory=list)
    subline_header: str | None = None
    group_context: dict[str, Any] = field(default_factory=dict)
    # Content IDs forced to this page
    forced_content: set[str] = field(default_factory=set)

    @property
    def row_count(self) -> int:
        """Return how many data rows the inclusive start/end range spans."""
        first, last = self.start_row, self.end_row
        return last - first + 1

    @property
    def is_section_start(self) -> bool:
        """Return True when the break type is FORCED or SUBLINE."""
        section_breaks = {PageBreakType.FORCED, PageBreakType.SUBLINE}
        return self.break_type in section_breaks

51 

52 

@dataclass
class PageBreakRule:
    """Describes a column whose value changes may trigger a page break."""

    column: str
    break_on_change: bool = True
    force_new_page: bool = False
    priority: int = 0  # Higher priority rules are processed first

    def applies_to_row(
        self, df: pl.DataFrame, row_idx: int, prev_row_idx: int | None = None
    ) -> bool:
        """Report whether this rule fires between two rows of ``df``.

        Args:
            df: Frame holding the data being paginated.
            row_idx: Index of the row under consideration.
            prev_row_idx: Index of the row to compare against; ``None`` means
                there is nothing to compare with.

        Returns:
            False when there is no previous row or the rule's column is absent
            from ``df``. Otherwise the result of
            ``break_on_change and (current value != previous value)``.
        """
        no_previous_row = prev_row_idx is None
        if no_previous_row or self.column not in df.columns:
            return False

        # Fetch the column once and read both cells from it.
        values = df[self.column]
        current = values[row_idx]
        previous = values[prev_row_idx]
        return self.break_on_change and current != previous

76 

77 

class PageDict(BaseModel):
    """Advanced pagination control structure (r2rtf PageDict equivalent)

    This class provides sophisticated pagination control similar to r2rtf's page_dict,
    enabling page_index-like functionality while maintaining compatibility with
    rtflite's existing row-based pagination system.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_configs: MutableMapping[int, PageConfig] = Field(
        default_factory=dict, description="Configuration for each page"
    )
    break_rules: list[PageBreakRule] = Field(
        default_factory=list, description="Rules determining where page breaks occur"
    )
    content_index: MutableMapping[str, list[int]] = Field(
        default_factory=dict, description="Maps content identifiers to page numbers"
    )
    total_pages: int = Field(default=0, description="Total number of pages")
    nrow_per_page: int = Field(default=40, description="Base rows per page")

    def add_page_config(self, config: PageConfig) -> None:
        """Store ``config`` under its page number.

        ``total_pages`` is raised to ``config.page_number`` when that is larger
        than the current value; it is never lowered here.
        """
        self.page_configs[config.page_number] = config
        self.total_pages = max(self.total_pages, config.page_number)

    def get_page_config(self, page_num: int) -> PageConfig | None:
        """Return the configuration for ``page_num``, or None if absent."""
        return self.page_configs.get(page_num)

    def add_break_rule(self, rule: PageBreakRule) -> None:
        """Register ``rule`` and keep the rules ordered by descending priority."""
        self.break_rules.append(rule)
        # list.sort is stable, so rules of equal priority keep insertion order.
        self.break_rules.sort(key=lambda r: r.priority, reverse=True)

    def get_page_for_content(self, content_id: str) -> int:
        """Return the first indexed page for ``content_id`` (page_index equivalent).

        Falls back to page 1 when the content has no indexed pages.
        """
        pages = self.content_index.get(content_id, [])
        return pages[0] if pages else 1  # Default to page 1

    def get_pages_for_content(self, content_id: str) -> list[int]:
        """Return every page number indexed for ``content_id``.

        A new list is returned on each call, so mutating the result never
        changes the internal index. Unknown ids yield an empty list.
        """
        return list(self.content_index.get(content_id, []))

    def add_content_to_page(self, content_id: str, page_num: int) -> None:
        """Record that ``content_id`` appears on ``page_num``.

        Each page is stored at most once per content id, in ascending order.
        """
        pages = self.content_index.setdefault(content_id, [])
        if page_num not in pages:
            pages.append(page_num)
            pages.sort()

    def get_section_pages(self, section_header: str) -> Sequence[int]:
        """Return, in ascending order, the pages listing ``section_header``."""
        return sorted(
            page_num
            for page_num, config in self.page_configs.items()
            if section_header in config.section_headers
        )

    def get_page_break_summary(self) -> Mapping[str, int]:
        """Count the stored pages per break-type value."""
        summary: MutableMapping[str, int] = {}
        for config in self.page_configs.values():
            break_type = config.break_type.value
            summary[break_type] = summary.get(break_type, 0) + 1
        return summary

    def _register_break_rule(self, rule: PageBreakRule) -> None:
        """Add ``rule`` unless an equal rule is already registered."""
        # PageBreakRule is a dataclass, so membership compares field values.
        if rule not in self.break_rules:
            self.add_break_rule(rule)

    def calculate_pages_from_dataframe(
        self,
        df: pl.DataFrame,
        page_by: Sequence[str] | None = None,
        subline_by: str | None = None,
        new_page: bool = False,
        additional_rows_per_page: int = 0,
    ) -> None:
        """Calculate page configurations from a DataFrame.

        This method implements the core pagination algorithm inspired by
        r2rtf's approach.

        Args:
            df: Data to paginate. An empty frame returns immediately and
                leaves all existing state untouched.
            page_by: Columns whose first-row value on each page becomes a
                section header. When ``new_page`` is also true, a forcing
                break rule (priority 10) is registered for each column.
            subline_by: Column whose first-row value on each page becomes the
                subline header. A forcing break rule (priority 20) is
                registered for it.
            new_page: Whether ``page_by`` changes force a new page.
            additional_rows_per_page: Rows subtracted from ``nrow_per_page``
                to obtain the effective per-page row limit (minimum 1).

        Note:
            Rules derived from the arguments are stored in ``break_rules`` and
            stay there after the call. A rule equal to one already registered
            is skipped, so repeated calls with the same arguments do not
            accumulate duplicates.
        """
        if df.is_empty():
            return

        # Clear existing configurations
        self.page_configs.clear()
        self.content_index.clear()

        # Calculate effective rows per page (accounting for headers, footers, etc.)
        effective_nrow = max(1, self.nrow_per_page - additional_rows_per_page)

        # When page_by + new_page=True, force breaks at group boundaries
        if page_by and new_page:
            for col in page_by:
                self._register_break_rule(
                    PageBreakRule(
                        column=col,
                        break_on_change=True,
                        force_new_page=True,
                        priority=10,
                    )
                )

        if subline_by:
            self._register_break_rule(
                PageBreakRule(
                    column=subline_by,
                    break_on_change=True,
                    force_new_page=True,
                    priority=20,  # Higher priority than page_by
                )
            )

        page_boundaries = self._calculate_page_boundaries(df, effective_nrow)

        for page_num, (start_row, end_row, break_type) in enumerate(page_boundaries, 1):
            config = PageConfig(
                page_number=page_num,
                start_row=start_row,
                end_row=end_row,
                break_type=break_type,
            )

            # Section headers come from the first row of the page.
            if page_by and start_row < df.height:
                for col in page_by:
                    if col in df.columns:
                        header_value = str(df[col][start_row])
                        config.section_headers.append(f"{col}: {header_value}")

            # The subline header also comes from the first row of the page.
            if subline_by and subline_by in df.columns and start_row < df.height:
                subline_value = str(df[subline_by][start_row])
                config.subline_header = f"{subline_by}: {subline_value}"

            self.add_page_config(config)

        # add_page_config only ever raises total_pages; set the exact count here.
        self.total_pages = len(page_boundaries)

    def _calculate_page_boundaries(
        self, df: pl.DataFrame, effective_nrow: int
    ) -> Sequence[tuple[int, int, PageBreakType]]:
        """Split the rows of ``df`` into inclusive ``(start, end, break_type)`` spans.

        A span is closed before a row when a forcing rule fires between that
        row and its predecessor, or when the current span already holds
        ``effective_nrow`` rows. The break type is stored on the span that
        ends at the break; the final span is always AUTOMATIC. A forced break
        is labelled SUBLINE when the triggering rule's column name contains
        "subline" (case-insensitive), otherwise FORCED.
        """
        boundaries: list[tuple[int, int, PageBreakType]] = []
        current_start = 0

        # Only forcing rules can trigger a break; filter once, not per row.
        forcing_rules = [rule for rule in self.break_rules if rule.force_new_page]

        # Row 0 can never close a page, so start comparing from row 1.
        for row_idx in range(1, df.height):
            # break_rules is priority-ordered, so the first match wins.
            trigger = next(
                (
                    rule
                    for rule in forcing_rules
                    if rule.applies_to_row(df, row_idx, row_idx - 1)
                ),
                None,
            )

            if trigger is not None:
                is_subline = bool(trigger.column) and (
                    "subline" in trigger.column.lower()
                )
                break_type = (
                    PageBreakType.SUBLINE if is_subline else PageBreakType.FORCED
                )
            elif row_idx - current_start >= effective_nrow:
                break_type = PageBreakType.AUTOMATIC
            else:
                continue

            # End current page
            boundaries.append((current_start, row_idx - 1, break_type))
            current_start = row_idx

        # Add final page
        if current_start < df.height:
            boundaries.append((current_start, df.height - 1, PageBreakType.AUTOMATIC))

        return boundaries

    def to_legacy_page_info(self) -> Sequence[Mapping[str, Any]]:
        """Convert to legacy page info format for backward compatibility.

        Returns one dict per stored page, ordered by page number.
        """
        page_info_list = []

        for page_num in sorted(self.page_configs):
            config = self.page_configs[page_num]
            page_info_list.append(
                {
                    "page_number": page_num,
                    "total_pages": self.total_pages,
                    "start_row": config.start_row,
                    "end_row": config.end_row,
                    "is_first_page": page_num == 1,
                    "is_last_page": page_num == self.total_pages,
                    "break_type": config.break_type.value,
                    "section_headers": config.section_headers,
                    "subline_header": config.subline_header,
                }
            )

        return page_info_list

278 

279 

class PageIndexManager:
    """Provides page_index-like functionality for advanced page control

    This class enables explicit control over which content appears on which pages,
    similar to how a page_index parameter would work in other pagination systems.
    """

    def __init__(self, page_dict: PageDict):
        """Wrap ``page_dict`` and start with no explicit assignments."""
        self.page_dict = page_dict
        # content id -> the single page it is currently assigned to
        self._content_assignments: MutableMapping[str, int] = {}
        # page number -> content ids currently assigned to that page
        self._page_content_map: MutableMapping[int, MutableSet[str]] = {}

    def assign_content_to_page(self, content_id: str, page_num: int) -> None:
        """Assign specific content to a specific page (explicit page_index control).

        Assigning an id that is already assigned elsewhere moves it: the id is
        removed from the previous page's assignment set and from that page's
        ``forced_content`` before being recorded on ``page_num``.
        """
        previous_page = self._content_assignments.get(content_id)
        if previous_page is not None and previous_page != page_num:
            self._release_from_page(content_id, previous_page)

        self._content_assignments[content_id] = page_num
        self._page_content_map.setdefault(page_num, set()).add(content_id)

        # Update the PageDict
        self.page_dict.add_content_to_page(content_id, page_num)

        # Mark content as forced on the target page, if that page exists.
        target_config = self.page_dict.page_configs.get(page_num)
        if target_config is not None:
            target_config.forced_content.add(content_id)

    def _release_from_page(self, content_id: str, page_num: int) -> None:
        """Drop this manager's record of ``content_id`` on ``page_num``."""
        assigned = self._page_content_map.get(page_num)
        if assigned is not None:
            assigned.discard(content_id)
            if not assigned:
                del self._page_content_map[page_num]

        config = self.page_dict.page_configs.get(page_num)
        if config is not None:
            config.forced_content.discard(content_id)
        # The PageDict content index is left untouched: it can hold pages
        # registered directly through PageDict.add_content_to_page.

    def get_content_page(self, content_id: str) -> int | None:
        """Return the page ``content_id`` is assigned to, or None."""
        return self._content_assignments.get(content_id)

    def get_page_content(self, page_num: int) -> MutableSet[str]:
        """Return the content ids assigned to ``page_num``.

        The result is a fresh set; mutating it does not affect the manager.
        """
        return set(self._page_content_map.get(page_num, ()))

    def force_page_break_before_content(self, content_id: str) -> None:
        """Force a page break before specific content appears.

        Currently a no-op placeholder.
        """
        # This would require integration with the DataFrame processing
        # to identify where the content appears and insert a break rule
        pass

    def get_content_summary(self) -> Mapping[str, Mapping[str, Any]]:
        """Summarise every assignment as ``{"assigned_page", "is_forced"}``.

        ``is_forced`` is True only when the assigned page has a configuration
        whose ``forced_content`` contains the id.
        """
        summary = {}
        for content_id, page_num in self._content_assignments.items():
            config = self.page_dict.page_configs.get(page_num)
            summary[content_id] = {
                "assigned_page": page_num,
                "is_forced": config is not None
                and content_id in config.forced_content,
            }
        return summary

    def optimize_page_distribution(self) -> None:
        """Optimize content distribution across pages to balance page lengths.

        Currently a no-op placeholder.
        """
        # Advanced algorithm to redistribute content for better balance
        # This could implement sophisticated optimization based on content weight,
        # page capacity, and user constraints
        pass