Coverage for src/rtflite/pagination/page_dict.py

1"""

2Advanced pagination control system for rtflite.

4This module implements a PageDict equivalent to r2rtf's advanced pagination features,

5providing page_index-like functionality while maintaining rtflite's existing architecture.

6"""

8from collections.abc import Mapping, MutableMapping, MutableSet, Sequence

9from dataclasses import dataclass, field

10from enum import Enum

11from typing import Any

13import polars as pl

14from pydantic import BaseModel, ConfigDict, Field

class PageBreakType(Enum):
    """Enumerates the reasons a page break can be recorded"""

    # Break produced by the rows-per-page (nrow) limit
    AUTOMATIC = "automatic"
    # Break produced by page_by together with new_page=True
    FORCED = "forced"
    # Break produced by a change in the subline_by value
    SUBLINE = "subline"
    # Break requested explicitly through PageIndexManager
    MANUAL = "manual"

@dataclass
class PageConfig:
    """Layout record for one page: its row span, break type and headers"""

    page_number: int
    start_row: int
    end_row: int
    break_type: PageBreakType
    section_headers: list[str] = field(default_factory=list)
    subline_header: str | None = None
    group_context: dict[str, Any] = field(default_factory=dict)
    # Content IDs forced to this page
    forced_content: set[str] = field(default_factory=set)

    @property
    def row_count(self) -> int:
        """Count of data rows on the page; start_row and end_row are both included"""
        span = self.end_row - self.start_row
        return span + 1

    @property
    def is_section_start(self) -> bool:
        """Whether the recorded break type is FORCED or SUBLINE"""
        section_breaks = {PageBreakType.FORCED, PageBreakType.SUBLINE}
        return self.break_type in section_breaks

52@dataclass

53class PageBreakRule:

54 """Rule for determining when page breaks should occur"""

56 column: str

57 break_on_change: bool = True

58 force_new_page: bool = False

59 priority: int = 0 # Higher priority rules are processed first

61 def applies_to_row(

62 self, df: pl.DataFrame, row_idx: int, prev_row_idx: int | None = None

63 ) -> bool:

64 """Check if this rule should trigger a page break for the given row"""

65 if prev_row_idx is None:

66 return False

68 if self.column not in df.columns:

69 return False

71 current_value = df[self.column][row_idx]

72 previous_value = df[self.column][prev_row_idx]

74 return self.break_on_change and current_value != previous_value

class PageDict(BaseModel):
    """Advanced pagination control structure (r2rtf PageDict equivalent)

    Stores one ``PageConfig`` per page number, the ``PageBreakRule`` objects
    used to derive page boundaries, and an index from content identifiers to
    the page numbers they appear on.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_configs: MutableMapping[int, PageConfig] = Field(
        default_factory=dict, description="Configuration for each page"
    )
    break_rules: list[PageBreakRule] = Field(
        default_factory=list, description="Rules determining where page breaks occur"
    )
    content_index: MutableMapping[str, list[int]] = Field(
        default_factory=dict, description="Maps content identifiers to page numbers"
    )
    total_pages: int = Field(default=0, description="Total number of pages")
    nrow_per_page: int = Field(default=40, description="Base rows per page")

    def add_page_config(self, config: PageConfig) -> None:
        """Store ``config`` under its page number and grow ``total_pages`` if needed"""
        self.page_configs[config.page_number] = config
        self.total_pages = max(self.total_pages, config.page_number)

    def get_page_config(self, page_num: int) -> PageConfig | None:
        """Return the configuration for ``page_num``, or ``None`` if unknown"""
        return self.page_configs.get(page_num)

    def add_break_rule(self, rule: PageBreakRule) -> None:
        """Register ``rule`` and keep the rule list ordered by priority"""
        self.break_rules.append(rule)
        # Sort rules by priority (highest first); the sort is stable, so rules
        # of equal priority stay in insertion order.
        self.break_rules.sort(key=lambda r: r.priority, reverse=True)

    def get_page_for_content(self, content_id: str) -> int:
        """Return the first page holding ``content_id`` (page_index equivalent)

        Falls back to page 1 when the content is not indexed.
        """
        pages = self.content_index.get(content_id, [])
        return pages[0] if pages else 1  # Default to page 1

    def get_pages_for_content(self, content_id: str) -> list[int]:
        """Return every page number on which ``content_id`` appears

        A new list is returned so callers cannot mutate the internal index
        by accident.
        """
        return list(self.content_index.get(content_id, ()))

    def add_content_to_page(self, content_id: str, page_num: int) -> None:
        """Index ``content_id`` on ``page_num``, keeping its page list sorted and unique"""
        pages = self.content_index.setdefault(content_id, [])
        if page_num not in pages:
            pages.append(page_num)
            pages.sort()

    def get_section_pages(self, section_header: str) -> Sequence[int]:
        """Return the sorted page numbers whose section headers include ``section_header``"""
        return sorted(
            page_num
            for page_num, config in self.page_configs.items()
            if section_header in config.section_headers
        )

    def get_page_break_summary(self) -> Mapping[str, int]:
        """Count the stored pages per break type value"""
        summary: MutableMapping[str, int] = {}
        for config in self.page_configs.values():
            break_type = config.break_type.value
            summary[break_type] = summary.get(break_type, 0) + 1
        return summary

    def calculate_pages_from_dataframe(
        self,
        df: pl.DataFrame,
        page_by: Sequence[str] | None = None,
        subline_by: str | None = None,
        new_page: bool = False,
        additional_rows_per_page: int = 0,
    ) -> None:
        """Calculate page configurations from a DataFrame

        This method implements the core pagination algorithm inspired by
        r2rtf's approach.

        Args:
            df: Data to paginate. An empty frame returns immediately and
                leaves any previously computed state untouched.
            page_by: Columns whose values become section headers; with
                ``new_page=True`` a change in any of them forces a new page.
            subline_by: Column whose change forces a new page and whose value
                becomes the subline header.
            new_page: Whether ``page_by`` changes force a page break.
            additional_rows_per_page: Rows subtracted from ``nrow_per_page``
                to obtain the usable rows per page (never below 1).
        """
        if df.is_empty():
            return

        # Clear existing configurations
        self.page_configs.clear()
        self.content_index.clear()

        # Calculate effective rows per page (accounting for headers, footers, etc.)
        effective_nrow = max(1, self.nrow_per_page - additional_rows_per_page)

        # Collect the break rules implied by the parameters
        derived_rules: list[PageBreakRule] = []
        if page_by and new_page:
            derived_rules.extend(
                PageBreakRule(
                    column=col,
                    break_on_change=True,
                    force_new_page=True,
                    priority=10,
                )
                for col in page_by
            )
        if subline_by:
            derived_rules.append(
                PageBreakRule(
                    column=subline_by,
                    break_on_change=True,
                    force_new_page=True,
                    priority=20,  # Higher priority than page_by
                )
            )

        # Register each derived rule once: recalculating with the same
        # arguments must not grow ``break_rules`` with identical copies.
        for rule in derived_rules:
            if rule not in self.break_rules:
                self.add_break_rule(rule)

        # Calculate page boundaries
        page_boundaries = self._calculate_page_boundaries(
            df, effective_nrow, subline_by=subline_by
        )

        # Create page configurations
        for page_num, (start_row, end_row, break_type) in enumerate(page_boundaries, 1):
            config = PageConfig(
                page_number=page_num,
                start_row=start_row,
                end_row=end_row,
                break_type=break_type,
            )

            # Add section headers for page_by columns
            if page_by and start_row < df.height:
                for col in page_by:
                    if col in df.columns:
                        header_value = str(df[col][start_row])
                        config.section_headers.append(f"{col}: {header_value}")

            # Add subline header
            if subline_by and subline_by in df.columns and start_row < df.height:
                subline_value = str(df[subline_by][start_row])
                config.subline_header = f"{subline_by}: {subline_value}"

            self.add_page_config(config)

        self.total_pages = len(page_boundaries)

    def _calculate_page_boundaries(
        self,
        df: pl.DataFrame,
        effective_nrow: int,
        subline_by: str | None = None,
    ) -> Sequence[tuple[int, int, PageBreakType]]:
        """Calculate where page boundaries should occur

        Args:
            df: Data being paginated.
            effective_nrow: Maximum number of rows placed on one page.
            subline_by: Column driving subline breaks. A forced break on this
                column is reported as ``SUBLINE``. Rules whose column name
                contains "subline" are also reported as ``SUBLINE``, as before.

        Returns:
            ``(start_row, end_row, break_type)`` tuples, one per page, with
            inclusive row indices. The break type is recorded on the page
            that ends at the break; the last page is always ``AUTOMATIC``.
        """
        boundaries: list[tuple[int, int, PageBreakType]] = []
        current_start = 0

        # Row 0 can never close a page, so scanning starts at the second row.
        for row_idx in range(1, df.height):
            forced_break = False
            break_type = PageBreakType.AUTOMATIC

            # Only the first (highest priority) rule that applies is honoured.
            for rule in self.break_rules:
                if not rule.applies_to_row(df, row_idx, row_idx - 1):
                    continue
                if rule.force_new_page:
                    forced_break = True
                    is_subline = bool(rule.column) and (
                        rule.column == subline_by
                        or "subline" in rule.column.lower()
                    )
                    break_type = (
                        PageBreakType.SUBLINE if is_subline else PageBreakType.FORCED
                    )
                break

            # Check if we need to break due to row limit or forced break
            rows_on_current_page = row_idx - current_start
            if forced_break or rows_on_current_page >= effective_nrow:
                # End current page
                boundaries.append((current_start, row_idx - 1, break_type))
                current_start = row_idx

        # Add final page
        if current_start < df.height:
            boundaries.append((current_start, df.height - 1, PageBreakType.AUTOMATIC))

        return boundaries

    def to_legacy_page_info(self) -> Sequence[Mapping[str, Any]]:
        """Convert to legacy page info format for backward compatibility

        Returns one dictionary per page, ordered by page number.
        """
        page_info_list = []

        for page_num in sorted(self.page_configs):
            config = self.page_configs[page_num]
            page_info = {
                "page_number": page_num,
                "total_pages": self.total_pages,
                "start_row": config.start_row,
                "end_row": config.end_row,
                "is_first_page": page_num == 1,
                "is_last_page": page_num == self.total_pages,
                "break_type": config.break_type.value,
                "section_headers": config.section_headers,
                "subline_header": config.subline_header,
            }
            page_info_list.append(page_info)

        return page_info_list

274

275

276class PageIndexManager:

277 """Provides page_index-like functionality for advanced page control

278

279 This class enables explicit control over which content appears on which pages,

280 similar to how a page_index parameter would work in other pagination systems.

281 """

282

283 def __init__(self, page_dict: PageDict):

284 self.page_dict = page_dict

285 self._content_assignments: MutableMapping[str, int] = {}

286 self._page_content_map: MutableMapping[int, MutableSet[str]] = {}

287

288 def assign_content_to_page(self, content_id: str, page_num: int) -> None:

289 """Assign specific content to a specific page (explicit page_index control)"""

290 self._content_assignments[content_id] = page_num

291

292 if page_num not in self._page_content_map:

293 self._page_content_map[page_num] = set()

294 self._page_content_map[page_num].add(content_id)

295

296 # Update the PageDict

297 self.page_dict.add_content_to_page(content_id, page_num)

298

299 # Mark content as forced on the target page

300 if page_num in self.page_dict.page_configs:

301 self.page_dict.page_configs[page_num].forced_content.add(content_id)

302

303 def get_content_page(self, content_id: str) -> int | None:

304 """Get the assigned page for specific content"""

305 return self._content_assignments.get(content_id)

306

307 def get_page_content(self, page_num: int) -> MutableSet[str]:

308 """Get all content assigned to a specific page"""

309 return self._page_content_map.get(page_num, set())

310

311 def force_page_break_before_content(self, content_id: str) -> None:

312 """Force a page break before specific content appears"""

313 # This would require integration with the DataFrame processing

314 # to identify where the content appears and insert a break rule

315 pass

316

317 def get_content_summary(self) -> Mapping[str, Mapping[str, Any]]:

318 """Get summary of all content assignments"""

319 summary = {}

320 for content_id, page_num in self._content_assignments.items():

321 summary[content_id] = {

322 "assigned_page": page_num,

323 "is_forced": content_id

324 in self.page_dict.page_configs.get(

325 page_num, PageConfig(0, 0, 0, PageBreakType.AUTOMATIC)

326 ).forced_content,

327 }

328 return summary

329

330 def optimize_page_distribution(self) -> None:

331 """Optimize content distribution across pages to balance page lengths"""

332 # Advanced algorithm to redistribute content for better balance

333 # This could implement sophisticated optimization based on content weight,

334 # page capacity, and user constraints

335 pass

Coverage for src/rtflite/pagination/page_dict.py: 90%

158 statements