Coverage for src/rtflite/services/document_service.py: 87%

199 statements  

« prev     ^ index     » next       coverage.py v7.10.5, created at 2025-08-25 22:35 +0000

1"""RTF Document Service - handles all document-level operations.""" 

2 

3from collections.abc import Mapping, Sequence 

4from typing import Any, Tuple 

5 

6 

7class RTFDocumentService: 

8 """Service for handling RTF document operations including pagination and layout.""" 

9 

10 def __init__(self): 

11 from .encoding_service import RTFEncodingService 

12 

13 self.encoding_service = RTFEncodingService() 

14 

15 def calculate_additional_rows_per_page(self, document) -> int: 

16 """Calculate additional rows needed per page for headers, footnotes, sources.""" 

17 additional_rows = 0 

18 

19 # Count subline_by header (appears on each page) 

20 if document.rtf_body.subline_by: 

21 additional_rows += 1 # Each subline_by header consumes 1 row 

22 

23 # Count column headers (repeat on each page) 

24 if document.rtf_column_header: 

25 # Handle nested column headers for multi-section documents 

26 if isinstance(document.rtf_column_header[0], list): 

27 # Nested format: count all non-None headers across all sections 

28 for section_headers in document.rtf_column_header: 

29 if section_headers: # Skip [None] sections 

30 for header in section_headers: 

31 if header and header.text is not None: 

32 additional_rows += 1 

33 else: 

34 # Flat format: original logic 

35 for header in document.rtf_column_header: 

36 if header.text is not None: 

37 additional_rows += 1 

38 

39 # Count footnote rows 

40 if document.rtf_footnote and document.rtf_footnote.text: 

41 additional_rows += 1 

42 

43 # Count source rows 

44 if document.rtf_source and document.rtf_source.text: 

45 additional_rows += 1 

46 

47 return additional_rows 

48 

49 def needs_pagination(self, document) -> bool: 

50 """Check if document needs pagination based on content size and page limits.""" 

51 

52 # Multiple figures always need pagination (each figure on separate page) 

53 if document.rtf_figure and document.rtf_figure.figures: 

54 # Check if multiple figures are provided 

55 figures = document.rtf_figure.figures 

56 if isinstance(figures, (list, tuple)) and len(figures) > 1: 

57 return True 

58 

59 # Figure-only documents don't need pagination beyond multi-figure handling above 

60 if document.df is None: 

61 return False 

62 

63 # Handle multi-section documents 

64 if isinstance(document.df, list): 

65 # Check if any section needs pagination 

66 for body in document.rtf_body: 

67 if (body.page_by and body.new_page) or body.subline_by: 

68 return True 

69 # For now, multi-section documents use single page strategy 

70 return False 

71 else: 

72 # Single section document 

73 if ( 

74 document.rtf_body.page_by and document.rtf_body.new_page 

75 ) or document.rtf_body.subline_by: 

76 return True 

77 

78 # Create pagination instance to calculate rows needed 

79 from ..pagination import PageBreakCalculator, RTFPagination 

80 

81 pagination = RTFPagination( 

82 page_width=document.rtf_page.width, 

83 page_height=document.rtf_page.height, 

84 margin=document.rtf_page.margin, 

85 nrow=document.rtf_page.nrow, 

86 orientation=document.rtf_page.orientation, 

87 ) 

88 

89 calculator = PageBreakCalculator(pagination=pagination) 

90 from ..row import Utils 

91 

92 col_total_width = document.rtf_page.col_width 

93 

94 # Handle multi-section vs single section for column widths 

95 if isinstance(document.df, list): 

96 # Use first section for pagination calculation 

97 col_widths = Utils._col_widths( 

98 document.rtf_body[0].col_rel_width, col_total_width 

99 ) 

100 # Calculate rows needed for all sections combined 

101 total_content_rows: list[Any] = [] 

102 for df, body in zip(document.df, document.rtf_body): 

103 section_col_widths = Utils._col_widths( 

104 body.col_rel_width, col_total_width 

105 ) 

106 section_content_rows = calculator.calculate_content_rows( 

107 df, section_col_widths, body 

108 ) 

109 total_content_rows.extend(section_content_rows) 

110 content_rows = total_content_rows 

111 else: 

112 col_widths = Utils._col_widths( 

113 document.rtf_body.col_rel_width, col_total_width 

114 ) 

115 # Calculate rows needed for data content only 

116 content_rows = list( 

117 calculator.calculate_content_rows( 

118 document.df, col_widths, document.rtf_body 

119 ) 

120 ) 

121 

122 # Calculate additional rows per page 

123 additional_rows_per_page = self.calculate_additional_rows_per_page(document) 

124 

125 # Calculate how many data rows can fit per page 

126 data_rows = sum(content_rows) 

127 available_data_rows_per_page = max( 

128 1, document.rtf_page.nrow - additional_rows_per_page 

129 ) 

130 

131 # If we can't fit even the additional components, we definitely need pagination 

132 if additional_rows_per_page >= document.rtf_page.nrow: 

133 return True 

134 

135 # Check if data rows exceed what can fit on a single page 

136 return data_rows > available_data_rows_per_page 

137 

138 def create_pagination_instance(self, document) -> Tuple: 

139 """Create pagination and content distributor instances.""" 

140 from ..pagination import ContentDistributor, PageBreakCalculator, RTFPagination 

141 

142 pagination = RTFPagination( 

143 page_width=document.rtf_page.width, 

144 page_height=document.rtf_page.height, 

145 margin=document.rtf_page.margin, 

146 nrow=document.rtf_page.nrow, 

147 orientation=document.rtf_page.orientation, 

148 ) 

149 

150 calculator = PageBreakCalculator(pagination=pagination) 

151 distributor = ContentDistributor(pagination=pagination, calculator=calculator) 

152 

153 return pagination, distributor 

154 

155 def generate_page_break(self, document) -> str: 

156 """Generate proper RTF page break sequence.""" 

157 return self.encoding_service.encode_page_break( 

158 document.rtf_page, 

159 lambda: self.encoding_service.encode_page_margin(document.rtf_page), 

160 ) 

161 

162 def should_show_element_on_page( 

163 self, element_location: str, page_info: dict 

164 ) -> bool: 

165 """Determine if an element should be shown on a specific page.""" 

166 if element_location == "all": 

167 return True 

168 elif element_location == "first": 

169 return page_info["is_first_page"] 

170 elif element_location == "last": 

171 return page_info["is_last_page"] 

172 else: 

173 return False 

174 

175 def process_page_by( 

176 self, document 

177 ) -> Sequence[Sequence[Tuple[int, int, int]]] | None: 

178 """Create components for page_by format.""" 

179 # Obtain input data 

180 data = document.df.to_dicts() 

181 var = document.rtf_body.page_by 

182 

183 # Handle empty DataFrame 

184 if len(data) == 0: 

185 return None 

186 

187 # Obtain column names and dimensions 

188 columns = list(data[0].keys()) 

189 

190 if var is None: 

191 return None 

192 

193 def get_column_index(column_name: str) -> int: 

194 """Get the index of a column in the column list.""" 

195 return columns.index(column_name) 

196 

197 def get_matching_rows(group_values: Mapping) -> Sequence[int]: 

198 """Get row indices that match the group values.""" 

199 return [ 

200 i 

201 for i, row in enumerate(data) 

202 if all(row[k] == v for k, v in group_values.items()) 

203 ] 

204 

205 def get_unique_combinations(variables: Sequence[str]) -> Sequence[Mapping]: 

206 """Get unique combinations of values for the specified variables.""" 

207 seen = set() 

208 unique = [] 

209 for row in data: 

210 key = tuple(row[v] for v in variables) 

211 if key not in seen: 

212 seen.add(key) 

213 unique.append({v: row[v] for v in variables}) 

214 return unique 

215 

216 output = [] 

217 prev_values = {v: None for v in var} 

218 

219 # Process each unique combination of grouping variables 

220 for group in get_unique_combinations(var): 

221 indices = get_matching_rows(group) 

222 

223 # Handle headers for each level 

224 for level, var_name in enumerate(var): 

225 current_val = group[var_name] 

226 

227 need_header = False 

228 if level == len(var) - 1: 

229 need_header = True 

230 else: 

231 for lvl in range(level + 1): 

232 if group[var[lvl]] != prev_values[var[lvl]]: 

233 need_header = True 

234 break 

235 

236 if need_header: 

237 col_idx = get_column_index(var_name) 

238 # Add level information as third element in tuple 

239 output.append([(indices[0], col_idx, level)]) 

240 

241 prev_values[var_name] = current_val 

242 

243 # Handle data rows 

244 for index in indices: 

245 output.append( 

246 [ 

247 (index, j, len(var)) 

248 for j in range(len(columns)) 

249 if columns[j] not in var 

250 ] 

251 ) 

252 

253 return output 

254 

255 def apply_pagination_borders( 

256 self, document, rtf_attrs, page_info: dict, total_pages: int 

257 ): 

258 """Apply proper borders for paginated context following r2rtf design: 

259 

260 rtf_page.border_first/last: Controls borders for the entire table 

261 rtf_body.border_first/last: Controls borders for each page 

262 rtf_body.border_top/bottom: Controls borders for individual cells 

263 

264 Logic: 

265 - First page, first row: Apply rtf_page.border_first (overrides rtf_body.border_first) 

266 - Last page, last row: Apply rtf_page.border_last (overrides rtf_body.border_last) 

267 - Non-first pages, first row: Apply rtf_body.border_first 

268 - Non-last pages, last row: Apply rtf_body.border_last 

269 - All other rows: Use existing border_top/bottom from rtf_body 

270 """ 

271 from copy import deepcopy 

272 

273 from ..attributes import BroadcastValue 

274 from ..input import TableAttributes 

275 

276 # Create a deep copy of the attributes to avoid modifying the original 

277 page_attrs = deepcopy(rtf_attrs) 

278 page_df_height = page_info["data"].height 

279 page_df_width = page_info["data"].width 

280 page_shape = (page_df_height, page_df_width) 

281 

282 if page_df_height == 0: 

283 return page_attrs 

284 

285 # Clear border_first and border_last from being broadcast to all rows 

286 # These should only apply to specific rows based on pagination logic 

287 if hasattr(page_attrs, "border_first") and page_attrs.border_first: 

288 # Don't use border_first in pagination - it's handled separately 

289 page_attrs.border_first = None 

290 

291 if hasattr(page_attrs, "border_last") and page_attrs.border_last: 

292 # Don't use border_last in pagination - it's handled separately 

293 page_attrs.border_last = None 

294 

295 # Ensure border_top and border_bottom are properly sized for this page 

296 if not page_attrs.border_top: 

297 page_attrs.border_top = [ 

298 [""] * page_df_width for _ in range(page_df_height) 

299 ] 

300 if not page_attrs.border_bottom: 

301 page_attrs.border_bottom = [ 

302 [""] * page_df_width for _ in range(page_df_height) 

303 ] 

304 

305 # Apply borders based on page position 

306 # For first page: only apply rtf_page.border_first to table body if NO column headers 

307 has_column_headers = ( 

308 document.rtf_column_header and len(document.rtf_column_header) > 0 

309 ) 

310 if page_info["is_first_page"] and not has_column_headers: 

311 if document.rtf_page.border_first: 

312 # Apply border to all cells in the first row 

313 for col_idx in range(page_df_width): 

314 page_attrs = self._apply_border_to_cell( 

315 page_attrs, 

316 0, 

317 col_idx, 

318 "top", 

319 document.rtf_page.border_first, 

320 page_shape, 

321 ) 

322 

323 # For first page with column headers: ensure consistent border style 

324 if page_info["is_first_page"] and has_column_headers: 

325 # Apply same border style as non-first pages to maintain consistency 

326 if document.rtf_body.border_first: 

327 border_style = ( 

328 document.rtf_body.border_first[0][0] 

329 if isinstance(document.rtf_body.border_first, list) 

330 else document.rtf_body.border_first 

331 ) 

332 # Apply single border style to first data row (same as other pages) 

333 for col_idx in range(page_df_width): 

334 page_attrs = self._apply_border_to_cell( 

335 page_attrs, 0, col_idx, "top", border_style, page_shape 

336 ) 

337 

338 # Apply page-level borders for non-first/last pages 

339 if not page_info["is_first_page"] and document.rtf_body.border_first: 

340 # Apply border_first to first row of non-first pages 

341 border_style = ( 

342 document.rtf_body.border_first[0][0] 

343 if isinstance(document.rtf_body.border_first, list) 

344 else document.rtf_body.border_first 

345 ) 

346 for col_idx in range(page_df_width): 

347 page_attrs = self._apply_border_to_cell( 

348 page_attrs, 0, col_idx, "top", border_style, page_shape 

349 ) 

350 

351 # Check if footnotes or sources will appear on this page 

352 has_footnote_on_page = ( 

353 document.rtf_footnote 

354 and document.rtf_footnote.text 

355 and self.should_show_element_on_page( 

356 document.rtf_page.page_footnote, page_info 

357 ) 

358 ) 

359 has_source_on_page = ( 

360 document.rtf_source 

361 and document.rtf_source.text 

362 and self.should_show_element_on_page( 

363 document.rtf_page.page_source, page_info 

364 ) 

365 ) 

366 

367 # Determine if footnotes/sources appear as tables on the last page 

368 # This is crucial for border placement when components are set to "first" only 

369 footnote_as_table_on_last = ( 

370 document.rtf_footnote 

371 and document.rtf_footnote.text 

372 and getattr(document.rtf_footnote, "as_table", True) 

373 and document.rtf_page.page_footnote in ("last", "all") 

374 ) 

375 source_as_table_on_last = ( 

376 document.rtf_source 

377 and document.rtf_source.text 

378 and getattr(document.rtf_source, "as_table", False) 

379 and document.rtf_page.page_source in ("last", "all") 

380 ) 

381 

382 # Apply border logic based on page position and footnote/source presence 

383 if not page_info["is_last_page"]: 

384 # Non-last pages: apply single border after footnote/source, or after data if no footnote/source 

385 if document.rtf_body.border_last: 

386 border_style = ( 

387 document.rtf_body.border_last[0][0] 

388 if isinstance(document.rtf_body.border_last, list) 

389 else document.rtf_body.border_last 

390 ) 

391 

392 if not (has_footnote_on_page or has_source_on_page): 

393 # No footnote/source: apply border to last data row 

394 for col_idx in range(page_df_width): 

395 page_attrs = self._apply_border_to_cell( 

396 page_attrs, 

397 page_df_height - 1, 

398 col_idx, 

399 "bottom", 

400 border_style, 

401 page_shape, 

402 ) 

403 else: 

404 # Has footnote/source: apply border_last from RTFBody 

405 self._apply_footnote_source_borders( 

406 document, page_info, border_style, is_last_page=False 

407 ) 

408 

409 else: # is_last_page 

410 # Last page: check if we should apply border to data or footnote/source 

411 if document.rtf_page.border_last: 

412 # Check if this page contains the absolute last row 

413 total_rows = document.df.height 

414 is_absolute_last_row = page_info["end_row"] == total_rows - 1 

415 

416 if is_absolute_last_row: 

417 # If footnotes/sources are set to "first" only and appear as tables, 

418 # they won't be on the last page, so apply border to last data row 

419 if not (footnote_as_table_on_last or source_as_table_on_last): 

420 # No footnote/source on last page: apply border to last data row 

421 last_row_idx = page_df_height - 1 

422 for col_idx in range(page_df_width): 

423 page_attrs = self._apply_border_to_cell( 

424 page_attrs, 

425 last_row_idx, 

426 col_idx, 

427 "bottom", 

428 document.rtf_page.border_last, 

429 page_shape, 

430 ) 

431 else: 

432 # Has footnote/source on last page: set border for footnote/source 

433 self._apply_footnote_source_borders( 

434 document, 

435 page_info, 

436 document.rtf_page.border_last, 

437 is_last_page=True, 

438 ) 

439 

440 return page_attrs 

441 

442 def _apply_footnote_source_borders( 

443 self, document, page_info: dict, border_style: str, is_last_page: bool 

444 ): 

445 """Apply borders to footnote and source components based on page position.""" 

446 # Determine which component should get the border 

447 has_footnote = ( 

448 document.rtf_footnote 

449 and document.rtf_footnote.text 

450 and self.should_show_element_on_page( 

451 document.rtf_page.page_footnote, page_info 

452 ) 

453 ) 

454 has_source = ( 

455 document.rtf_source 

456 and document.rtf_source.text 

457 and self.should_show_element_on_page( 

458 document.rtf_page.page_source, page_info 

459 ) 

460 ) 

461 

462 # Apply border to components based on as_table setting 

463 # Priority: Source with as_table=True > Footnote with as_table=True > any component 

464 target_component = None 

465 

466 # Extract as_table values (now stored as booleans) 

467 footnote_as_table = None 

468 if has_footnote: 

469 footnote_as_table = getattr(document.rtf_footnote, "as_table", True) 

470 

471 source_as_table = None 

472 if has_source: 

473 source_as_table = getattr(document.rtf_source, "as_table", False) 

474 

475 if has_source and source_as_table: 

476 # Source is rendered as table: prioritize source for borders 

477 target_component = ("source", document.rtf_source) 

478 elif has_footnote and footnote_as_table: 

479 # Footnote is rendered as table: use footnote for borders 

480 target_component = ("footnote", document.rtf_footnote) 

481 # Note: Removed fallback to plain text components - borders should only be applied 

482 # to components that are rendered as tables (as_table=True) 

483 

484 if target_component: 

485 component_name, component = target_component 

486 if not hasattr(component, "_page_border_style"): 

487 component._page_border_style = {} 

488 component._page_border_style[page_info["page_number"]] = border_style 

489 

490 def _apply_border_to_cell( 

491 self, 

492 page_attrs, 

493 row_idx: int, 

494 col_idx: int, 

495 border_side: str, 

496 border_style: str, 

497 page_shape: tuple, 

498 ): 

499 """Apply specified border style to a specific cell using BroadcastValue""" 

500 from ..attributes import BroadcastValue 

501 

502 border_attr = f"border_{border_side}" 

503 

504 if not hasattr(page_attrs, border_attr): 

505 return page_attrs 

506 

507 # Get current border values 

508 current_borders = getattr(page_attrs, border_attr) 

509 

510 # Create BroadcastValue to expand borders to page shape 

511 border_broadcast = BroadcastValue(value=current_borders, dimension=page_shape) 

512 

513 # Update the specific cell 

514 border_broadcast.update_cell(row_idx, col_idx, border_style) 

515 

516 # Update the attribute with the expanded value 

517 setattr(page_attrs, border_attr, border_broadcast.value) 

518 return page_attrs