Coverage for src/rtflite/pagination/page_dict.py: 90%

158 statements  

« prev     ^ index     » next       coverage.py v7.10.5, created at 2025-08-25 22:35 +0000

1""" 

2Advanced pagination control system for rtflite. 

3 

4This module implements a PageDict equivalent to r2rtf's advanced pagination features, 

5providing page_index-like functionality while maintaining rtflite's existing architecture. 

6""" 

7 

8from collections.abc import Mapping, MutableMapping, MutableSet, Sequence 

9from dataclasses import dataclass, field 

10from enum import Enum 

11from typing import Any 

12 

13import polars as pl 

14from pydantic import BaseModel, ConfigDict, Field 

15 

16 

class PageBreakType(Enum):
    """Enumerates the reasons a page break can be introduced."""

    # The per-page row limit (nrow) was reached.
    AUTOMATIC = "automatic"

    # Requested through page_by combined with new_page=True.
    FORCED = "forced"

    # Triggered by a change in the subline_by value.
    SUBLINE = "subline"

    # Specified by hand through PageIndexManager.
    MANUAL = "manual"

24 

25 

@dataclass
class PageConfig:
    """Describes a single page of paginated output.

    ``start_row`` and ``end_row`` are treated as an inclusive range by
    ``row_count``. ``break_type`` records the kind of page break associated
    with the page.
    """

    page_number: int
    start_row: int
    end_row: int
    break_type: PageBreakType
    section_headers: list[str] = field(default_factory=list)
    subline_header: str | None = None
    group_context: dict[str, Any] = field(default_factory=dict)
    # Content IDs forced to this page
    forced_content: set[str] = field(default_factory=set)

    @property
    def row_count(self) -> int:
        """Return how many data rows the inclusive start/end range spans."""
        first, last = self.start_row, self.end_row
        return last - first + 1

    @property
    def is_section_start(self) -> bool:
        """Return True when the page's break type is FORCED or SUBLINE."""
        section_breaks = {PageBreakType.FORCED, PageBreakType.SUBLINE}
        return self.break_type in section_breaks

50 

51 

@dataclass
class PageBreakRule:
    """Describes one condition under which a page break should happen.

    A rule watches a single column and reports a break when the value in
    that column differs between two rows.
    """

    column: str
    break_on_change: bool = True
    force_new_page: bool = False
    priority: int = 0  # Higher priority rules are processed first

    def applies_to_row(
        self, df: pl.DataFrame, row_idx: int, prev_row_idx: int | None = None
    ) -> bool:
        """Report whether this rule triggers a break at ``row_idx``.

        Args:
            df: Frame holding the rows being paginated.
            row_idx: Index of the row under inspection.
            prev_row_idx: Index of the row to compare against; ``None`` means
                there is nothing to compare with.

        Returns:
            False when there is no previous row or the watched column is
            missing from ``df``; otherwise whether ``break_on_change`` is set
            and the two rows hold different values in the watched column.
        """
        # Nothing to compare against, or the rule targets an unknown column.
        if prev_row_idx is None or self.column not in df.columns:
            return False

        watched = df[self.column]
        current, previous = watched[row_idx], watched[prev_row_idx]
        return self.break_on_change and current != previous

75 

76 

class PageDict(BaseModel):
    """Advanced pagination control structure (r2rtf PageDict equivalent)

    This class provides sophisticated pagination control similar to r2rtf's page_dict,
    enabling page_index-like functionality while maintaining compatibility with
    rtflite's existing row-based pagination system.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_configs: MutableMapping[int, PageConfig] = Field(
        default_factory=dict, description="Configuration for each page"
    )
    break_rules: list[PageBreakRule] = Field(
        default_factory=list, description="Rules determining where page breaks occur"
    )
    content_index: MutableMapping[str, list[int]] = Field(
        default_factory=dict, description="Maps content identifiers to page numbers"
    )
    total_pages: int = Field(default=0, description="Total number of pages")
    nrow_per_page: int = Field(default=40, description="Base rows per page")

    def add_page_config(self, config: PageConfig) -> None:
        """Store ``config`` under its page number and grow ``total_pages`` if needed."""
        self.page_configs[config.page_number] = config
        self.total_pages = max(self.total_pages, config.page_number)

    def get_page_config(self, page_num: int) -> PageConfig | None:
        """Return the configuration for ``page_num``, or ``None`` if unknown."""
        return self.page_configs.get(page_num)

    def add_break_rule(self, rule: PageBreakRule) -> None:
        """Register ``rule`` and keep ``break_rules`` ordered by priority.

        The list is re-sorted after every insertion so the highest priority
        rule comes first.
        """
        self.break_rules.append(rule)
        self.break_rules.sort(key=lambda r: r.priority, reverse=True)

    def get_page_for_content(self, content_id: str) -> int:
        """Return the first page holding ``content_id`` (page_index equivalent).

        Falls back to page 1 when the content has not been indexed.
        """
        pages = self.content_index.get(content_id, [])
        return pages[0] if pages else 1  # Default to page 1

    def get_pages_for_content(self, content_id: str) -> list[int]:
        """Return every page number on which ``content_id`` appears.

        A copy is returned so callers cannot corrupt the index by mutating
        the result; unknown content yields an empty list.
        """
        return list(self.content_index.get(content_id, ()))

    def add_content_to_page(self, content_id: str, page_num: int) -> None:
        """Record that ``content_id`` appears on ``page_num``.

        Page lists are kept sorted and free of duplicates.
        """
        pages = self.content_index.setdefault(content_id, [])
        if page_num not in pages:
            pages.append(page_num)
            pages.sort()

    def get_section_pages(self, section_header: str) -> Sequence[int]:
        """Return the sorted page numbers whose headers include ``section_header``."""
        return sorted(
            page_num
            for page_num, config in self.page_configs.items()
            if section_header in config.section_headers
        )

    def get_page_break_summary(self) -> Mapping[str, int]:
        """Count the configured pages per break type value."""
        summary: dict[str, int] = {}
        for config in self.page_configs.values():
            break_type = config.break_type.value
            summary[break_type] = summary.get(break_type, 0) + 1
        return summary

    def calculate_pages_from_dataframe(
        self,
        df: pl.DataFrame,
        page_by: Sequence[str] | None = None,
        subline_by: str | None = None,
        new_page: bool = False,
        additional_rows_per_page: int = 0,
    ) -> None:
        """Calculate page configurations from a DataFrame

        This method implements the core pagination algorithm inspired by r2rtf's
        approach. Results of any earlier calculation are discarded first, so an
        empty ``df`` leaves the instance with zero pages.

        Args:
            df: Data to paginate.
            page_by: Columns whose values label each page as section headers.
                They only force page breaks when ``new_page`` is True.
            subline_by: Column whose changes always force a new page and whose
                value becomes the page's subline header.
            new_page: When True, a change in any ``page_by`` column forces a
                page break.
            additional_rows_per_page: Rows to subtract from ``nrow_per_page``
                (the effective page size never drops below 1).
        """
        # Always start from a clean slate so an empty frame cannot leave the
        # pages of a previous run behind.
        self.page_configs.clear()
        self.content_index.clear()
        self.total_pages = 0

        if df.is_empty():
            return

        # Calculate effective rows per page (accounting for headers, footers, etc.)
        effective_nrow = max(1, self.nrow_per_page - additional_rows_per_page)

        # Add break rules based on parameters
        if page_by and new_page:
            for col in page_by:
                self._add_break_rule_once(
                    PageBreakRule(
                        column=col,
                        break_on_change=True,
                        force_new_page=True,
                        priority=10,
                    )
                )

        if subline_by:
            self._add_break_rule_once(
                PageBreakRule(
                    column=subline_by,
                    break_on_change=True,
                    force_new_page=True,
                    priority=20,  # Higher priority than page_by
                )
            )

        page_boundaries = self._calculate_page_boundaries(
            df, effective_nrow, subline_by=subline_by
        )

        # Resolve the header columns once instead of once per page.
        known_columns = set(df.columns)
        header_columns = [col for col in page_by or () if col in known_columns]
        has_subline = bool(subline_by) and subline_by in known_columns

        for page_num, (start_row, end_row, break_type) in enumerate(page_boundaries, 1):
            config = PageConfig(
                page_number=page_num,
                start_row=start_row,
                end_row=end_row,
                break_type=break_type,
            )

            # Headers are taken from the first row of the page.
            if start_row < df.height:
                for col in header_columns:
                    header_value = str(df[col][start_row])
                    config.section_headers.append(f"{col}: {header_value}")

                if has_subline:
                    subline_value = str(df[subline_by][start_row])
                    config.subline_header = f"{subline_by}: {subline_value}"

            self.add_page_config(config)

        self.total_pages = len(page_boundaries)

    def _add_break_rule_once(self, rule: PageBreakRule) -> None:
        """Register ``rule`` unless an equal rule is already present.

        Prevents ``break_rules`` from growing on every pagination run.
        """
        if rule not in self.break_rules:
            self.add_break_rule(rule)

    def _calculate_page_boundaries(
        self,
        df: pl.DataFrame,
        effective_nrow: int,
        subline_by: str | None = None,
    ) -> Sequence[tuple[int, int, PageBreakType]]:
        """Calculate where page boundaries should occur

        Args:
            df: Data to paginate.
            effective_nrow: Maximum number of rows placed on one page.
            subline_by: Column used for subline breaks, if any. Breaks caused
                by this column are reported as ``PageBreakType.SUBLINE``.

        Returns:
            One ``(start_row, end_row, break_type)`` tuple per page with
            inclusive row indices. ``break_type`` is the reason the page
            ended; the final page is always ``AUTOMATIC``.
        """
        # NOTE(review): the break type is attached to the page that ends at
        # the break, while PageConfig.is_section_start reads the same field as
        # "this page starts a section" - confirm which meaning is intended.
        boundaries: list[tuple[int, int, PageBreakType]] = []
        current_start = 0

        # Row 0 can never start a new page, so scanning begins at row 1.
        for row_idx in range(1, df.height):
            forced_break = False
            break_type = PageBreakType.AUTOMATIC

            # Rules are ordered by priority; the first forcing rule that
            # matches decides the break. Non-forcing rules have no effect
            # here and never mask a forcing rule of lower priority.
            for rule in self.break_rules:
                if not rule.force_new_page:
                    continue
                if rule.applies_to_row(df, row_idx, row_idx - 1):
                    forced_break = True
                    break_type = PageBreakType.FORCED
                    if self._is_subline_column(rule.column, subline_by):
                        break_type = PageBreakType.SUBLINE
                    break

            rows_on_current_page = row_idx - current_start
            if forced_break or rows_on_current_page >= effective_nrow:
                # End current page just before this row.
                boundaries.append((current_start, row_idx - 1, break_type))
                current_start = row_idx

        # Add final page
        if current_start < df.height:
            boundaries.append((current_start, df.height - 1, PageBreakType.AUTOMATIC))

        return boundaries

    @staticmethod
    def _is_subline_column(column: str, subline_by: str | None) -> bool:
        """Tell whether ``column`` drives subline breaks.

        The explicit ``subline_by`` column wins; a column whose name contains
        "subline" is still accepted for backward compatibility.
        """
        if not column:
            return False
        if subline_by is not None and column == subline_by:
            return True
        return "subline" in column.lower()

    def to_legacy_page_info(self) -> Sequence[Mapping[str, Any]]:
        """Convert to legacy page info format for backward compatibility

        Returns one dictionary per configured page, ordered by page number.
        """
        return [
            {
                "page_number": page_num,
                "total_pages": self.total_pages,
                "start_row": config.start_row,
                "end_row": config.end_row,
                "is_first_page": page_num == 1,
                "is_last_page": page_num == self.total_pages,
                "break_type": config.break_type.value,
                "section_headers": config.section_headers,
                "subline_header": config.subline_header,
            }
            for page_num, config in sorted(self.page_configs.items())
        ]

274 

275 

class PageIndexManager:
    """Provides page_index-like functionality for advanced page control

    This class enables explicit control over which content appears on which pages,
    similar to how a page_index parameter would work in other pagination systems.
    Each content ID has at most one assigned page at a time.
    """

    def __init__(self, page_dict: PageDict):
        """Bind the manager to ``page_dict`` and start with no assignments."""
        self.page_dict = page_dict
        # content_id -> assigned page
        self._content_assignments: MutableMapping[str, int] = {}
        # page -> content IDs assigned to it (inverse of the mapping above)
        self._page_content_map: MutableMapping[int, MutableSet[str]] = {}

    def assign_content_to_page(self, content_id: str, page_num: int) -> None:
        """Assign specific content to a specific page (explicit page_index control)

        Re-assigning content that already has a page moves it: the content is
        removed from the previous page's assignment set and forced-content
        set before being recorded on ``page_num``.
        """
        previous_page = self._content_assignments.get(content_id)
        if previous_page is not None and previous_page != page_num:
            self._release_assignment(content_id, previous_page)

        self._content_assignments[content_id] = page_num
        self._page_content_map.setdefault(page_num, set()).add(content_id)

        # Update the PageDict
        # NOTE(review): PageDict.content_index is additive, so a page recorded
        # by an earlier assignment stays listed there - confirm this is wanted.
        self.page_dict.add_content_to_page(content_id, page_num)

        # Mark content as forced on the target page (only if that page is
        # already configured).
        config = self.page_dict.page_configs.get(page_num)
        if config is not None:
            config.forced_content.add(content_id)

    def _release_assignment(self, content_id: str, page_num: int) -> None:
        """Remove the manager's bookkeeping for ``content_id`` on ``page_num``."""
        assigned = self._page_content_map.get(page_num)
        if assigned is not None:
            assigned.discard(content_id)
            if not assigned:
                del self._page_content_map[page_num]

        config = self.page_dict.page_configs.get(page_num)
        if config is not None:
            config.forced_content.discard(content_id)

    def get_content_page(self, content_id: str) -> int | None:
        """Return the page assigned to ``content_id``, or ``None`` if unassigned."""
        return self._content_assignments.get(content_id)

    def get_page_content(self, page_num: int) -> MutableSet[str]:
        """Return the content IDs assigned to ``page_num``.

        The result is a copy; mutating it does not change the manager.
        """
        return set(self._page_content_map.get(page_num, ()))

    def force_page_break_before_content(self, content_id: str) -> None:
        """Force a page break before specific content appears

        Not implemented yet: the call is currently a no-op.
        """
        # This would require integration with the DataFrame processing
        # to identify where the content appears and insert a break rule
        pass

    def get_content_summary(self) -> Mapping[str, Mapping[str, Any]]:
        """Get summary of all content assignments

        Returns:
            A mapping from content ID to a dictionary with the assigned page
            and whether the content is in that page's forced-content set.
            Content assigned to a page without a configuration is reported
            as not forced.
        """
        summary: dict[str, dict[str, Any]] = {}
        for content_id, page_num in self._content_assignments.items():
            config = self.page_dict.page_configs.get(page_num)
            summary[content_id] = {
                "assigned_page": page_num,
                "is_forced": config is not None
                and content_id in config.forced_content,
            }
        return summary

    def optimize_page_distribution(self) -> None:
        """Optimize content distribution across pages to balance page lengths

        Not implemented yet: the call is currently a no-op.
        """
        # Advanced algorithm to redistribute content for better balance
        # This could implement sophisticated optimization based on content weight,
        # page capacity, and user constraints
        pass