Coverage for src/rtflite/pagination/page_dict.py: 90%

157 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-14 16:35 +0000

1""" 

2Advanced pagination control system for rtflite. 

3 

4This module implements a PageDict equivalent to r2rtf's advanced pagination features, 

5providing page_index-like functionality while maintaining rtflite's existing architecture. 

6""" 

7 

8from dataclasses import dataclass, field 

9from enum import Enum 

10from typing import Any 

11 

12import polars as pl 

13from pydantic import BaseModel, ConfigDict, Field 

14 

15 

class PageBreakType(Enum):
    """Enumerates the reasons a page break can be introduced."""

    # The per-page row limit (nrow) was reached
    AUTOMATIC = "automatic"
    # A page_by column changed while new_page=True
    FORCED = "forced"
    # The subline_by column changed
    SUBLINE = "subline"
    # Explicitly requested through PageIndexManager
    MANUAL = "manual"

23 

24 

@dataclass
class PageConfig:
    """Describes a single page: which rows it holds and why it was created.

    ``start_row`` and ``end_row`` are treated as an inclusive range by
    ``row_count``.
    """

    page_number: int
    start_row: int
    end_row: int
    break_type: PageBreakType
    section_headers: list[str] = field(default_factory=list)
    subline_header: str | None = None
    group_context: dict[str, Any] = field(default_factory=dict)
    # Content IDs forced to this page
    forced_content: set[str] = field(default_factory=set)

    @property
    def row_count(self) -> int:
        """Return how many data rows the page spans (both ends included)."""
        span = self.end_row - self.start_row
        return span + 1

    @property
    def is_section_start(self) -> bool:
        """Return True when the page's break type is FORCED or SUBLINE."""
        section_breaks = {PageBreakType.FORCED, PageBreakType.SUBLINE}
        return self.break_type in section_breaks

49 

50 

@dataclass
class PageBreakRule:
    """A column-based rule describing when a page break should be triggered."""

    column: str
    break_on_change: bool = True
    force_new_page: bool = False
    # Rules with a higher priority are processed first
    priority: int = 0

    def applies_to_row(
        self, df: pl.DataFrame, row_idx: int, prev_row_idx: int | None = None
    ) -> bool:
        """Tell whether this rule fires for ``row_idx``.

        The rule never fires when there is no previous row to compare with or
        when ``column`` is absent from ``df``. Otherwise it fires when
        ``break_on_change`` is set and the column value at ``row_idx`` differs
        from the value at ``prev_row_idx``.
        """
        has_previous = prev_row_idx is not None
        if not (has_previous and self.column in df.columns):
            return False

        values = df[self.column]
        value_now = values[row_idx]
        value_before = values[prev_row_idx]

        return self.break_on_change and value_now != value_before

74 

75 

class PageDict(BaseModel):
    """Advanced pagination control structure (r2rtf PageDict equivalent).

    This class provides sophisticated pagination control similar to r2rtf's
    page_dict, enabling page_index-like functionality while maintaining
    compatibility with rtflite's existing row-based pagination system.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_configs: dict[int, PageConfig] = Field(
        default_factory=dict, description="Configuration for each page"
    )
    break_rules: list[PageBreakRule] = Field(
        default_factory=list, description="Rules determining where page breaks occur"
    )
    content_index: dict[str, list[int]] = Field(
        default_factory=dict, description="Maps content identifiers to page numbers"
    )
    total_pages: int = Field(default=0, description="Total number of pages")
    nrow_per_page: int = Field(default=40, description="Base rows per page")

    def add_page_config(self, config: PageConfig) -> None:
        """Register ``config`` under its page number.

        An existing entry for the same page number is replaced, and
        ``total_pages`` is raised to the page number if it is larger.
        """
        self.page_configs[config.page_number] = config
        self.total_pages = max(self.total_pages, config.page_number)

    def get_page_config(self, page_num: int) -> PageConfig | None:
        """Return the configuration for ``page_num``, or None if unknown."""
        return self.page_configs.get(page_num)

    def add_break_rule(self, rule: PageBreakRule) -> None:
        """Add a page break rule and keep the rules ordered by priority."""
        self.break_rules.append(rule)
        # Highest priority first; the sort is stable, so rules with equal
        # priority keep their insertion order.
        self.break_rules.sort(key=lambda r: r.priority, reverse=True)

    def get_page_for_content(self, content_id: str) -> int:
        """Return the first page holding ``content_id`` (page_index equivalent).

        Falls back to page 1 when the content is not indexed.
        """
        pages = self.content_index.get(content_id)
        return pages[0] if pages else 1

    def get_pages_for_content(self, content_id: str) -> list[int]:
        """Return all page numbers where ``content_id`` appears.

        A new list is returned so callers cannot corrupt the internal index
        by mutating the result.
        """
        return list(self.content_index.get(content_id, ()))

    def add_content_to_page(self, content_id: str, page_num: int) -> None:
        """Record that ``content_id`` appears on ``page_num``.

        Each content's page list is kept sorted and free of duplicates.
        """
        pages = self.content_index.setdefault(content_id, [])
        if page_num not in pages:
            pages.append(page_num)
            pages.sort()

    def get_section_pages(self, section_header: str) -> list[int]:
        """Return the sorted page numbers whose headers include ``section_header``."""
        return sorted(
            page_num
            for page_num, config in self.page_configs.items()
            if section_header in config.section_headers
        )

    def get_page_break_summary(self) -> dict[str, int]:
        """Count pages per break type, keyed by the break type's string value."""
        summary: dict[str, int] = {}
        for config in self.page_configs.values():
            break_type = config.break_type.value
            summary[break_type] = summary.get(break_type, 0) + 1
        return summary

    def calculate_pages_from_dataframe(
        self,
        df: pl.DataFrame,
        page_by: list[str] | None = None,
        subline_by: str | None = None,
        new_page: bool = False,
        additional_rows_per_page: int = 0,
    ) -> None:
        """Calculate page configurations from a DataFrame.

        This method implements the core pagination algorithm inspired by
        r2rtf's approach.

        Args:
            df: Data to paginate.
            page_by: Columns whose values become section headers; they also
                force a page break on change when ``new_page`` is True.
            subline_by: Column whose value becomes the subline header and
                forces a page break on change.
            new_page: Whether changes in ``page_by`` columns force a new page.
            additional_rows_per_page: Rows subtracted from ``nrow_per_page``
                to leave room for headers, footers, etc.

        Previously computed page configurations and the content index are
        always discarded first, so an empty ``df`` yields zero pages instead
        of leaving results from an earlier call in place.
        """
        # Reset derived state up front so an empty input cannot leave stale
        # pages behind.
        self.page_configs.clear()
        self.content_index.clear()
        self.total_pages = 0

        if df.is_empty():
            return

        # Effective rows per page (accounting for headers, footers, etc.);
        # never below one so pagination always makes progress.
        effective_nrow = max(1, self.nrow_per_page - additional_rows_per_page)

        # Build the rules implied by the parameters.
        requested_rules: list[PageBreakRule] = []
        if page_by and new_page:
            requested_rules.extend(
                PageBreakRule(
                    column=col,
                    break_on_change=True,
                    force_new_page=True,
                    priority=10,
                )
                for col in page_by
            )
        if subline_by:
            requested_rules.append(
                PageBreakRule(
                    column=subline_by,
                    break_on_change=True,
                    force_new_page=True,
                    priority=20,  # Higher priority than page_by
                )
            )

        # Only register rules that are not already present; otherwise every
        # repeated call would append another identical rule.
        for rule in requested_rules:
            if rule not in self.break_rules:
                self.add_break_rule(rule)

        page_boundaries = self._calculate_page_boundaries(df, effective_nrow)

        for page_num, (start_row, end_row, break_type) in enumerate(page_boundaries, 1):
            config = PageConfig(
                page_number=page_num,
                start_row=start_row,
                end_row=end_row,
                break_type=break_type,
            )

            # Section headers come from the first row of the page.
            if page_by and start_row < df.height:
                for col in page_by:
                    if col in df.columns:
                        header_value = str(df[col][start_row])
                        config.section_headers.append(f"{col}: {header_value}")

            # Subline header also comes from the first row of the page.
            if subline_by and subline_by in df.columns and start_row < df.height:
                subline_value = str(df[subline_by][start_row])
                config.subline_header = f"{subline_by}: {subline_value}"

            self.add_page_config(config)

        self.total_pages = len(page_boundaries)

    def _calculate_page_boundaries(
        self, df: pl.DataFrame, effective_nrow: int
    ) -> list[tuple[int, int, PageBreakType]]:
        """Calculate where page boundaries should occur.

        Returns a list of ``(start_row, end_row, break_type)`` tuples with
        inclusive row indices, one tuple per page.

        NOTE(review): the break type detected at the first row of a new page
        is stored on the page that *ends* just before it, and the final page
        is always AUTOMATIC. Confirm this is the intended meaning of
        ``break_type`` for consumers such as ``PageConfig.is_section_start``.
        """
        boundaries: list[tuple[int, int, PageBreakType]] = []
        current_start = 0

        for row_idx in range(df.height):
            forced_break = False
            break_type = PageBreakType.AUTOMATIC

            if row_idx > 0:  # The first row can never start a new page
                for rule in self.break_rules:
                    if not rule.applies_to_row(df, row_idx, row_idx - 1):
                        continue
                    if rule.force_new_page:
                        forced_break = True
                        break_type = PageBreakType.FORCED
                        # NOTE(review): SUBLINE is inferred from the column
                        # name containing "subline", not from the subline_by
                        # argument.
                        if rule.column and "subline" in rule.column.lower():
                            break_type = PageBreakType.SUBLINE
                        # The first forcing rule (highest priority) wins.
                        break

            # Close the current page on a forced break or when it is full.
            rows_on_current_page = row_idx - current_start
            if forced_break or (rows_on_current_page >= effective_nrow and row_idx > 0):
                boundaries.append((current_start, row_idx - 1, break_type))
                current_start = row_idx

        # Whatever remains forms the final page.
        if current_start < df.height:
            boundaries.append((current_start, df.height - 1, PageBreakType.AUTOMATIC))

        return boundaries

    def to_legacy_page_info(self) -> list[dict[str, Any]]:
        """Convert to legacy page info format for backward compatibility.

        Pages are emitted in ascending page-number order.
        """
        return [
            {
                "page_number": page_num,
                "total_pages": self.total_pages,
                "start_row": config.start_row,
                "end_row": config.end_row,
                "is_first_page": page_num == 1,
                "is_last_page": page_num == self.total_pages,
                "break_type": config.break_type.value,
                "section_headers": config.section_headers,
                "subline_header": config.subline_header,
            }
            for page_num, config in sorted(self.page_configs.items())
        ]

273 

274 

class PageIndexManager:
    """Provides page_index-like functionality for advanced page control.

    This class enables explicit control over which content appears on which
    pages, similar to how a page_index parameter would work in other
    pagination systems. Each content ID has at most one assigned page; a
    later assignment replaces the earlier one.
    """

    def __init__(self, page_dict: PageDict):
        self.page_dict = page_dict
        # content_id -> the single page it is currently assigned to
        self._content_assignments: dict[str, int] = {}
        # page number -> content IDs currently assigned to that page
        self._page_content_map: dict[int, set[str]] = {}

    def assign_content_to_page(self, content_id: str, page_num: int) -> None:
        """Assign specific content to a specific page (explicit page_index control).

        If the content was previously assigned to a different page, it is
        removed from that page's bookkeeping (this manager's page map and the
        old page's ``forced_content``) so the two views stay consistent.
        The page list in ``page_dict.content_index`` is only ever extended.
        """
        previous_page = self._content_assignments.get(content_id)
        if previous_page is not None and previous_page != page_num:
            self._release_assignment(content_id, previous_page)

        self._content_assignments[content_id] = page_num
        self._page_content_map.setdefault(page_num, set()).add(content_id)

        # Update the PageDict
        self.page_dict.add_content_to_page(content_id, page_num)

        # Mark content as forced on the target page, if that page exists
        target_config = self.page_dict.page_configs.get(page_num)
        if target_config is not None:
            target_config.forced_content.add(content_id)

    def _release_assignment(self, content_id: str, page_num: int) -> None:
        """Drop ``content_id`` from the bookkeeping of a page it no longer targets."""
        assigned = self._page_content_map.get(page_num)
        if assigned is not None:
            assigned.discard(content_id)
            if not assigned:
                del self._page_content_map[page_num]

        old_config = self.page_dict.page_configs.get(page_num)
        if old_config is not None:
            old_config.forced_content.discard(content_id)

    def get_content_page(self, content_id: str) -> int | None:
        """Return the page assigned to ``content_id``, or None if unassigned."""
        return self._content_assignments.get(content_id)

    def get_page_content(self, page_num: int) -> set[str]:
        """Return the content IDs assigned to ``page_num``.

        The result is a copy, so mutating it does not affect the manager.
        """
        return set(self._page_content_map.get(page_num, ()))

    def force_page_break_before_content(self, content_id: str) -> None:
        """Force a page break before specific content appears.

        Not implemented yet: this currently does nothing.
        """
        # This would require integration with the DataFrame processing
        # to identify where the content appears and insert a break rule
        pass

    def get_content_summary(self) -> dict[str, dict[str, Any]]:
        """Return, per content ID, its assigned page and whether it is forced.

        ``is_forced`` is True when the assigned page has a configuration whose
        ``forced_content`` contains the content ID.
        """
        page_configs = self.page_dict.page_configs
        summary: dict[str, dict[str, Any]] = {}
        for content_id, page_num in self._content_assignments.items():
            config = page_configs.get(page_num)
            summary[content_id] = {
                "assigned_page": page_num,
                "is_forced": config is not None
                and content_id in config.forced_content,
            }
        return summary

    def optimize_page_distribution(self) -> None:
        """Optimize content distribution across pages to balance page lengths.

        Not implemented yet: this currently does nothing.
        """
        # Advanced algorithm to redistribute content for better balance
        # This could implement sophisticated optimization based on content weight,
        # page capacity, and user constraints
        pass

332 # This could implement sophisticated optimization based on content weight, 

333 # page capacity, and user constraints 

334 pass