Coverage for src/rtflite/services/document_service.py: 87%

198 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-14 16:35 +0000

1"""RTF Document Service - handles all document-level operations.""" 

2 

3from typing import List, Tuple 

4 

5 

6class RTFDocumentService: 

7 """Service for handling RTF document operations including pagination and layout.""" 

8 

def __init__(self):
    """Set up the service together with the encoding helper it delegates to."""
    # Function-scope import, kept local to the constructor as in the original.
    from .encoding_service import RTFEncodingService

    encoder = RTFEncodingService()
    self.encoding_service = encoder

13 

def calculate_additional_rows_per_page(self, document) -> int:
    """Count the rows every page spends on repeated, non-data components.

    One row is counted for each of the following that is present:

    - a ``subline_by`` header on the body (``document.rtf_body`` may be a
      single body or a list of section bodies; any section using
      ``subline_by`` counts once),
    - every column header whose ``text`` is not ``None`` (both the flat
      list and the nested per-section list layout are supported, and
      ``None`` placeholders are skipped in both),
    - the footnote, when it has text,
    - the source line, when it has text.

    Args:
        document: Object exposing ``rtf_body``, ``rtf_column_header``,
            ``rtf_footnote`` and ``rtf_source``.

    Returns:
        Number of rows that are not available for data on each page.
    """
    additional_rows = 0

    # Multi-section documents carry a list of bodies; normalise so the
    # subline check works for both layouts instead of failing on a list.
    bodies = document.rtf_body
    if not isinstance(bodies, (list, tuple)):
        bodies = [bodies]
    if any(body is not None and body.subline_by for body in bodies):
        additional_rows += 1  # A subline_by header consumes 1 row

    # Column headers repeat on each page.
    headers = document.rtf_column_header
    if headers:
        if isinstance(headers[0], list):
            # Nested format: one list per section, [None] sections are skipped.
            candidates = [
                header
                for section_headers in headers
                if section_headers
                for header in section_headers
                if header
            ]
        else:
            # Flat format: tolerate None placeholders like the nested branch.
            candidates = [header for header in headers if header is not None]
        additional_rows += sum(1 for header in candidates if header.text is not None)

    # Footnote and source each take one row when they have text.
    if document.rtf_footnote and document.rtf_footnote.text:
        additional_rows += 1
    if document.rtf_source and document.rtf_source.text:
        additional_rows += 1

    return additional_rows

47 

def needs_pagination(self, document) -> bool:
    """Decide whether the document has to be split across several pages.

    The checks run in this order:

    1. More than one figure: paginate (each figure goes on its own page).
    2. No data frame (figure-only document): do not paginate.
    3. ``document.df`` is a list (multi-section document): paginate only
       when some section combines ``page_by`` with ``new_page`` or uses
       ``subline_by``; otherwise a single page is used.
    4. Single section: paginate for ``page_by`` + ``new_page`` or
       ``subline_by``. Otherwise paginate when the per-page overhead alone
       fills the page, or when the content rows reported by
       ``PageBreakCalculator.calculate_content_rows`` exceed the rows left
       on one page after that overhead.

    Args:
        document: Object exposing ``rtf_figure``, ``df``, ``rtf_body`` and
            ``rtf_page``.

    Returns:
        ``True`` when the document needs more than one page.
    """
    # Multiple figures always need pagination (each figure on separate page).
    figure = document.rtf_figure
    if figure and figure.figures:
        figures = figure.figures
        if isinstance(figures, (list, tuple)) and len(figures) > 1:
            return True

    # Figure-only documents need nothing beyond the multi-figure rule above.
    if document.df is None:
        return False

    # Multi-section documents: explicit page/subline grouping is the only
    # trigger; for now they otherwise use the single page strategy.
    if isinstance(document.df, list):
        return any(
            bool((section.page_by and section.new_page) or section.subline_by)
            for section in document.rtf_body
        )

    # From here on the document has exactly one section.
    body = document.rtf_body
    if (body.page_by and body.new_page) or body.subline_by:
        return True

    page = document.rtf_page
    additional_rows_per_page = self.calculate_additional_rows_per_page(document)

    # If the repeated components alone fill the page, pagination is certain;
    # answer before building any pagination objects.
    if additional_rows_per_page >= page.nrow:
        return True

    from ..pagination import PageBreakCalculator, RTFPagination
    from ..row import Utils

    pagination = RTFPagination(
        page_width=page.width,
        page_height=page.height,
        margin=page.margin,
        nrow=page.nrow,
        orientation=page.orientation,
    )
    calculator = PageBreakCalculator(pagination=pagination)

    # Rows needed for the data content only.
    col_widths = Utils._col_widths(body.col_rel_width, page.col_width)
    content_rows = calculator.calculate_content_rows(document.df, col_widths, body)

    data_rows = sum(content_rows)
    available_data_rows_per_page = max(1, page.nrow - additional_rows_per_page)

    # Paginate when the data does not fit on a single page.
    return data_rows > available_data_rows_per_page

134 

def create_pagination_instance(self, document) -> Tuple:
    """Build the pagination settings and the content distributor for a document.

    The page geometry (width, height, margin, nrow, orientation) is read
    from ``document.rtf_page``.

    Returns:
        A ``(pagination, distributor)`` pair; the distributor shares the
        pagination object with the calculator it is given.
    """
    from ..pagination import ContentDistributor, PageBreakCalculator, RTFPagination

    page = document.rtf_page
    pagination = RTFPagination(
        page_width=page.width,
        page_height=page.height,
        margin=page.margin,
        nrow=page.nrow,
        orientation=page.orientation,
    )
    distributor = ContentDistributor(
        pagination=pagination,
        calculator=PageBreakCalculator(pagination=pagination),
    )
    return pagination, distributor

151 

def generate_page_break(self, document) -> str:
    """Return the RTF page break sequence for the document's page settings.

    The encoding service receives the page object plus a zero-argument
    callable that produces the page margin encoding when invoked.
    """

    def encode_margin():
        # Looked up at call time, exactly like the original inline lambda.
        return self.encoding_service.encode_page_margin(document.rtf_page)

    return self.encoding_service.encode_page_break(document.rtf_page, encode_margin)

158 

def should_show_element_on_page(
    self, element_location: str, page_info: dict
) -> bool:
    """Tell whether an element placed at ``element_location`` belongs on a page.

    ``"all"`` is always shown, ``"first"`` follows ``page_info["is_first_page"]``,
    ``"last"`` follows ``page_info["is_last_page"]``; any other value is hidden.
    """
    if element_location == "all":
        return True

    # Remaining locations map onto one flag of the page description.
    for location, flag in (("first", "is_first_page"), ("last", "is_last_page")):
        if element_location == location:
            return page_info[flag]

    return False

171 

172 def process_page_by(self, document) -> List[List[Tuple[int, int, int]]] | None: 

173 """Create components for page_by format.""" 

174 # Obtain input data 

175 data = document.df.to_dicts() 

176 var = document.rtf_body.page_by 

177 

178 # Handle empty DataFrame 

179 if len(data) == 0: 

180 return None 

181 

182 # Obtain column names and dimensions 

183 columns = list(data[0].keys()) 

184 

185 if var is None: 

186 return None 

187 

188 def get_column_index(column_name: str) -> int: 

189 """Get the index of a column in the column list.""" 

190 return columns.index(column_name) 

191 

192 def get_matching_rows(group_values: dict) -> List[int]: 

193 """Get row indices that match the group values.""" 

194 return [ 

195 i 

196 for i, row in enumerate(data) 

197 if all(row[k] == v for k, v in group_values.items()) 

198 ] 

199 

200 def get_unique_combinations(variables: List[str]) -> List[dict]: 

201 """Get unique combinations of values for the specified variables.""" 

202 seen = set() 

203 unique = [] 

204 for row in data: 

205 key = tuple(row[v] for v in variables) 

206 if key not in seen: 

207 seen.add(key) 

208 unique.append({v: row[v] for v in variables}) 

209 return unique 

210 

211 output = [] 

212 prev_values = {v: None for v in var} 

213 

214 # Process each unique combination of grouping variables 

215 for group in get_unique_combinations(var): 

216 indices = get_matching_rows(group) 

217 

218 # Handle headers for each level 

219 for level, var_name in enumerate(var): 

220 current_val = group[var_name] 

221 

222 need_header = False 

223 if level == len(var) - 1: 

224 need_header = True 

225 else: 

226 for lvl in range(level + 1): 

227 if group[var[lvl]] != prev_values[var[lvl]]: 

228 need_header = True 

229 break 

230 

231 if need_header: 

232 col_idx = get_column_index(var_name) 

233 # Add level information as third element in tuple 

234 output.append([(indices[0], col_idx, level)]) 

235 

236 prev_values[var_name] = current_val 

237 

238 # Handle data rows 

239 for index in indices: 

240 output.append( 

241 [ 

242 (index, j, len(var)) 

243 for j in range(len(columns)) 

244 if columns[j] not in var 

245 ] 

246 ) 

247 

248 return output 

249 

def apply_pagination_borders(
    self, document, rtf_attrs, page_info: dict, total_pages: int
):
    """Apply proper borders for paginated context following r2rtf design:

    rtf_page.border_first/last: Controls borders for the entire table
    rtf_body.border_first/last: Controls borders for each page
    rtf_body.border_top/bottom: Controls borders for individual cells

    Logic:
    - First page, first row: Apply rtf_page.border_first when there are no
      column headers; with column headers the first data row gets
      rtf_body.border_first, like every other page
    - Last page, last row: Apply rtf_page.border_last, provided the page
      ends on the last row of ``document.df``
    - Non-first pages, first row: Apply rtf_body.border_first
    - Non-last pages, last row: Apply rtf_body.border_last
    - All other rows: Use existing border_top/bottom from rtf_body

    When a footnote or source is present the bottom border is not put on
    the data row; it is handed to ``_apply_footnote_source_borders``.

    Args:
        document: Document exposing ``rtf_page``, ``rtf_body``,
            ``rtf_column_header``, ``rtf_footnote``, ``rtf_source``, ``df``.
        rtf_attrs: Table attributes; never modified, a deep copy is returned.
        page_info: Page description with the keys ``data`` (object with
            ``height``/``width``), ``is_first_page``, ``is_last_page`` and
            ``end_row``.
        total_pages: Not used by this method; kept for interface stability.

    Returns:
        The adjusted deep copy of ``rtf_attrs``.
    """
    from copy import deepcopy

    def scalar_style(border):
        """Reduce a list-valued border spec to its first entry.

        Nested lists give the same result as ``border[0][0]``; a flat list
        such as ``["single"]`` yields ``"single"`` rather than its first
        character, and an empty list yields ``""``.
        """
        while isinstance(border, list):
            if not border:
                return ""
            border = border[0]
        return border

    def paint_row(attrs, row_idx, side, style):
        """Apply ``style`` to ``side`` of every cell in one row of the page."""
        for col_idx in range(page_df_width):
            attrs = self._apply_border_to_cell(
                attrs, row_idx, col_idx, side, style, page_shape
            )
        return attrs

    def shown_on_this_page(component, location_attr):
        """True when the component has text and is displayed on this page."""
        if not (component and component.text):
            return False
        location = getattr(document.rtf_page, location_attr)
        return bool(self.should_show_element_on_page(location, page_info))

    def table_on_last_page(component, location_attr, default_as_table):
        """True when the component renders as a table on the last page."""
        if not (component and component.text):
            return False
        if not getattr(component, "as_table", default_as_table):
            return False
        return getattr(document.rtf_page, location_attr) in ("last", "all")

    # Work on a deep copy so the caller's attributes stay untouched.
    page_attrs = deepcopy(rtf_attrs)
    page_df_height = page_info["data"].height
    page_df_width = page_info["data"].width
    page_shape = (page_df_height, page_df_width)

    if page_df_height == 0:
        return page_attrs

    # border_first/border_last must not be broadcast to all rows; they are
    # applied to specific rows by the logic below.
    for name in ("border_first", "border_last"):
        if getattr(page_attrs, name, None):
            setattr(page_attrs, name, None)

    # Make sure border_top and border_bottom exist for this page.
    for name in ("border_top", "border_bottom"):
        if not getattr(page_attrs, name):
            blank = [[""] * page_df_width for _ in range(page_df_height)]
            setattr(page_attrs, name, blank)

    body = document.rtf_body
    page = document.rtf_page
    has_column_headers = (
        document.rtf_column_header and len(document.rtf_column_header) > 0
    )

    # --- top border of the first data row ---------------------------------
    if page_info["is_first_page"] and not has_column_headers:
        # Table starts directly with data: use the table-level border.
        if page.border_first:
            page_attrs = paint_row(page_attrs, 0, "top", page.border_first)
    elif body.border_first:
        # First page with column headers, or any later page: use the
        # per-page border so all pages look the same.
        page_attrs = paint_row(
            page_attrs, 0, "top", scalar_style(body.border_first)
        )

    # --- bottom border of the last data row -------------------------------
    last_row_idx = page_df_height - 1
    if not page_info["is_last_page"]:
        if body.border_last:
            border_style = scalar_style(body.border_last)
            footnote_here = shown_on_this_page(document.rtf_footnote, "page_footnote")
            source_here = shown_on_this_page(document.rtf_source, "page_source")
            if footnote_here or source_here:
                # The border goes after the footnote/source, not the data.
                self._apply_footnote_source_borders(
                    document, page_info, border_style, is_last_page=False
                )
            else:
                page_attrs = paint_row(
                    page_attrs, last_row_idx, "bottom", border_style
                )
    elif page.border_last:
        # Only close the table when this page holds the absolute last row.
        if page_info["end_row"] == document.df.height - 1:
            # Components limited to the "first" page are absent here, so only
            # table-rendered components placed on "last"/"all" take the border.
            footnote_table = table_on_last_page(
                document.rtf_footnote, "page_footnote", True
            )
            source_table = table_on_last_page(
                document.rtf_source, "page_source", False
            )
            if footnote_table or source_table:
                self._apply_footnote_source_borders(
                    document, page_info, page.border_last, is_last_page=True
                )
            else:
                page_attrs = paint_row(
                    page_attrs, last_row_idx, "bottom", page.border_last
                )

    return page_attrs

436 

def _apply_footnote_source_borders(
    self, document, page_info: dict, border_style: str, is_last_page: bool
):
    """Record a page-specific border style on the footnote or the source.

    Only a component that has text, is shown on this page and is rendered as
    a table (``as_table``; defaults to ``True`` for the footnote and
    ``False`` for the source) can receive the border. The source takes
    precedence over the footnote. The style is stored under
    ``page_info["page_number"]`` in the component's ``_page_border_style``
    dict, which is created on first use. Plain-text components never
    receive a border. ``is_last_page`` is accepted but not read here.
    """

    def visible(component, placement_attr):
        # A component counts only when it has text and its placement
        # setting selects the current page.
        if not component or not component.text:
            return False
        placement = getattr(document.rtf_page, placement_attr)
        return self.should_show_element_on_page(placement, page_info)

    footnote = document.rtf_footnote
    footnote_is_table = visible(footnote, "page_footnote") and getattr(
        footnote, "as_table", True
    )

    source = document.rtf_source
    source_is_table = visible(source, "page_source") and getattr(
        source, "as_table", False
    )

    # Priority: table-rendered source first, then table-rendered footnote.
    if source_is_table:
        recipient = source
    elif footnote_is_table:
        recipient = footnote
    else:
        return

    if not hasattr(recipient, "_page_border_style"):
        recipient._page_border_style = {}
    recipient._page_border_style[page_info["page_number"]] = border_style

484 

def _apply_border_to_cell(
    self,
    page_attrs,
    row_idx: int,
    col_idx: int,
    border_side: str,
    border_style: str,
    page_shape: tuple,
):
    """Set one cell's border on ``page_attrs`` and return ``page_attrs``.

    The attribute ``border_<border_side>`` is expanded to ``page_shape``
    through ``BroadcastValue``, the cell at (``row_idx``, ``col_idx``) is
    updated to ``border_style`` and the expanded value is stored back.
    When ``page_attrs`` has no such attribute it is returned unchanged.
    """
    from ..attributes import BroadcastValue

    attr_name = "border_{}".format(border_side)

    if hasattr(page_attrs, attr_name):
        # Expand the current setting to the page shape, then patch the cell.
        expanded = BroadcastValue(
            value=getattr(page_attrs, attr_name), dimension=page_shape
        )
        expanded.update_cell(row_idx, col_idx, border_style)
        setattr(page_attrs, attr_name, expanded.value)

    return page_attrs