Coverage for src / rtflite / assemble.py: 93%

152 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-08 04:50 +0000

1"""Assemble multiple RTF files into a single RTF or DOCX file.""" 

2 

3import os 

4from collections.abc import Sequence 

5from copy import deepcopy 

6from pathlib import Path 

7from typing import TYPE_CHECKING 

8 

9if TYPE_CHECKING: # pragma: no cover 

10 from docx.document import Document as DocxDocument 

11 from docx.section import Section 

12 

13# from .input import RTFPage # Unused 

14 

15 

def _find_body_start(lines):
    """Return the index of the first line to keep from an appended RTF file.

    The header is taken to end one line after the last line that contains
    ``fcharset``. When no line contains ``fcharset`` nothing is skipped.

    Args:
        lines: Lines of one RTF file, as returned by ``readlines()``.

    Returns:
        Index of the first line that belongs to the document body.
    """
    last_match = -1
    for index, line in enumerate(lines):
        if "fcharset" in line:
            last_match = index

    if last_match < 0:
        return 0
    # Skip the matching line plus the single line after it (presumably the
    # brace that closes the font table -- confirm against the RTF writer).
    return last_match + 2


def _strip_closing_brace(lines):
    """Return a copy of ``lines`` without the document's final closing brace.

    Trailing blank lines are ignored when looking for the brace. If the last
    non-blank line consists only of ``}`` that line (and any blank lines after
    it) is dropped; if the brace shares a line with other content only the
    brace character is removed. Lines that do not end in an unescaped ``}``
    are returned unchanged.

    Args:
        lines: Lines of one RTF file, as returned by ``readlines()``.

    Returns:
        A new list of lines; the input list is never modified.
    """
    end = len(lines)
    while end > 0 and not lines[end - 1].strip():
        end -= 1
    if end == 0:
        # Empty file or whitespace only: there is no brace to remove.
        return list(lines)

    last_line = lines[end - 1]
    content = last_line.rstrip()
    if not content.endswith("}"):
        return list(lines)

    before_brace = content[:-1]
    backslashes = len(before_brace) - len(before_brace.rstrip("\\"))
    if backslashes % 2 == 1:
        # An odd number of backslashes means "\}" -- a literal brace character
        # in the text, not the end of the document group.
        return list(lines)

    kept = list(lines[: end - 1])
    if before_brace.strip():
        # Keep the rest of the line together with its original line ending.
        kept.append(before_brace + last_line[len(content):])
    return kept


def assemble_rtf(
    input_files: list[str],
    output_file: str,
) -> None:
    """Combine multiple RTF files into a single RTF file.

    The first file is copied from its first line. For every later file the
    header is skipped (see ``_find_body_start``). The final closing brace is
    removed from every file except the last one, and a page-break command is
    written between consecutive files. All inputs are read before the output
    file is opened.

    Args:
        input_files: List of paths to RTF files to combine. If the list is
            empty the function returns without writing anything.
        output_file: Path to the output RTF file.

    Raises:
        FileNotFoundError: If any input file does not exist.
    """
    if not input_files:
        return

    # Check if files exist; str() keeps the message working for path-like
    # entries as well as plain strings.
    missing_files = [str(f) for f in input_files if not os.path.exists(f)]
    if missing_files:
        raise FileNotFoundError(f"Missing files: {', '.join(missing_files)}")

    new_page_cmd = r"\page" + "\n"
    last_position = len(input_files) - 1
    processed_parts = []

    for position, path in enumerate(input_files):
        with open(path, encoding="utf-8") as file:
            lines = file.readlines()

        # Only the first file contributes its header.
        start_idx = _find_body_start(lines) if position > 0 else 0

        if position < last_position:
            # The combined document must stay open until the last file.
            lines = _strip_closing_brace(lines)

        processed_parts.extend(lines[start_idx:])

        if position < last_position:
            processed_parts.append(new_page_cmd)

    # Write output
    with open(output_file, "w", encoding="utf-8") as outfile:
        outfile.writelines(processed_parts)

86 

87 

def assemble_docx(
    input_files: list[str],
    output_file: str,
    landscape: bool | list[bool] = False,
) -> None:
    """Combine multiple RTF files into a single DOCX file.

    The RTF content is not converted here. For each input file the document
    receives its own section containing one paragraph with a "Table N"
    caption (a ``SEQ`` field), a line break, and an ``INCLUDETEXT`` field
    that points at the absolute path of the RTF file.

    Args:
        input_files: List of paths to input RTF files.
        output_file: Path to the output DOCX file.
        landscape: Whether the output should be landscape. Can be a single bool
            (applies to all) or a list of bools (one per file). Defaults to False.

    Raises:
        ImportError: If ``python-docx`` is not installed.
        ValueError: If ``input_files`` is empty or the ``landscape`` list length
            does not match ``input_files``.
        FileNotFoundError: If any input file is missing.
    """
    try:
        import docx  # type: ignore
    except ImportError as e:
        raise ImportError(
            "python-docx is required for assemble_docx. "
            "Install it with: pip install 'rtflite[docx]'"
        ) from e

    if not input_files:
        raise ValueError("Input files list cannot be empty")

    # Check input files exist; str() keeps the message working for path-like
    # entries as well as plain strings.
    missing_files = [str(f) for f in input_files if not os.path.exists(f)]
    if missing_files:
        raise FileNotFoundError(f"Missing files: {', '.join(missing_files)}")

    # Same normalisation and validation that concatenate_docx uses.
    landscape_list = _coerce_landscape_flags(landscape, len(input_files))

    # Create new document
    doc = docx.Document()
    last_index = len(input_files) - 1

    for i, (input_file, is_landscape) in enumerate(
        zip(input_files, landscape_list, strict=True)
    ):
        # The most recently added section is the one this file is written to.
        _set_section_orientation(doc.sections[-1], is_landscape)

        # Absolute path needed for fields
        abs_path = os.path.abspath(input_file)

        # Escape backslashes for the field code
        path_str = abs_path.replace("\\", "\\\\")

        # Create INCLUDETEXT field
        field_code = f'INCLUDETEXT "{path_str}"'

        # Add "Table X" caption
        p = doc.add_paragraph()
        p.add_run("Table ")
        _add_field(p, r"SEQ Table \* ARABIC")
        p.add_run("\n")  # Linebreak

        # Add the INCLUDETEXT field
        _add_field(p, field_code)

        # Every file except the last is followed by a section break so the
        # next file can carry its own orientation.
        if i < last_index:
            doc.add_section()

    doc.save(output_file)

170 

171 

def _add_field(paragraph, field_code):
    """Add a complex field to a paragraph.

    Five runs are appended to ``paragraph``: a ``begin`` field character, the
    field instruction text, a ``separate`` field character, a placeholder
    result, and an ``end`` field character.

    Args:
        paragraph: python-docx paragraph that receives the runs.
        field_code: Field instruction text, e.g. ``SEQ Table \\* ARABIC``.
    """
    # This is low-level XML manipulation for python-docx to add fields
    from docx.oxml.ns import qn  # type: ignore
    from docx.oxml.shared import OxmlElement  # type: ignore

    def append_field_char(char_type):
        # Each field character lives in a run of its own.
        marker = OxmlElement("w:fldChar")
        marker.set(qn("w:fldCharType"), char_type)
        paragraph.add_run()._r.append(marker)

    append_field_char("begin")

    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    instruction.text = field_code
    paragraph.add_run()._r.append(instruction)

    append_field_char("separate")

    # Add placeholder text so the field is visible/clickable. Decide by the
    # leading field keyword only: a substring test over the whole instruction
    # would also match e.g. an INCLUDETEXT path that happens to contain "SEQ".
    tokens = field_code.split(None, 1)
    keyword = tokens[0].upper() if tokens else ""
    if keyword == "SEQ":
        paragraph.add_run("1")
    else:
        paragraph.add_run("Error! Reference source not found.")

    append_field_char("end")

208 

209 

def concatenate_docx(
    input_files: Sequence[str | os.PathLike[str]],
    output_file: str | os.PathLike[str],
    landscape: bool | Sequence[bool] = False,
) -> None:
    """Stitch existing DOCX files into one document, one section per file.

    Unlike a field-based assembly, the body content of every input is copied
    into the result, so the combined file can be distributed without
    refreshing fields in Microsoft Word. This is meant for DOCX files such as
    those produced by `RTFDocument.write_docx`.

    Args:
        input_files: DOCX file paths in the order they should appear. The
            first file is opened as the base document; each following file
            is added in a new section that starts on a new page.
        output_file: Destination path of the combined DOCX file. Missing
            parent directories are created.
        landscape: Orientation per section. Either one boolean used for every
            section or a list/tuple with one entry per item of
            ``input_files``.

    Raises:
        ImportError: If ``python-docx`` is not installed.
        ValueError: If ``input_files`` is empty or the ``landscape`` list length
            does not match ``input_files``.
        FileNotFoundError: If any input file is missing.
    """
    try:
        from docx import Document  # type: ignore
        from docx.enum.section import WD_SECTION  # type: ignore
    except ImportError as exc:
        raise ImportError(
            "python-docx is required for concatenate_docx. "
            "Install it with: pip install 'rtflite[docx]'"
        ) from exc

    # Expand "~" up front so existence checks and loading see the same path.
    resolved = [Path(entry).expanduser() for entry in input_files]
    if len(resolved) == 0:
        raise ValueError("Input files list cannot be empty")

    absent = [str(candidate) for candidate in resolved if not candidate.exists()]
    if absent:
        raise FileNotFoundError(f"Missing files: {', '.join(absent)}")

    flags = _coerce_landscape_flags(landscape, len(resolved))

    # The first input doubles as the base document of the result.
    base_path, *remaining_paths = resolved
    base_flag, *remaining_flags = flags
    merged = Document(str(base_path))
    _set_section_orientation(merged.sections[0], base_flag)

    for extra_path, extra_flag in zip(remaining_paths, remaining_flags, strict=True):
        # Open a fresh section first, orient it, then fill it with content.
        merged.add_section(WD_SECTION.NEW_PAGE)
        newest_section = merged.sections[-1]
        _set_section_orientation(newest_section, extra_flag)
        _append_document_body(merged, Document(str(extra_path)))

    destination = Path(output_file).expanduser()
    destination.parent.mkdir(parents=True, exist_ok=True)
    merged.save(str(destination))

266 

267 

268def _coerce_landscape_flags( 

269 landscape: bool | Sequence[bool], 

270 expected_length: int, 

271) -> list[bool]: 

272 """Normalize the ``landscape`` argument to a list and validate its length.""" 

273 if isinstance(landscape, bool): 

274 return [landscape] * expected_length 

275 

276 flags = list(landscape) 

277 if len(flags) != expected_length: 

278 raise ValueError("Length of landscape list must match input files") 

279 

280 return flags 

281 

282 

def _set_section_orientation(section: "Section", landscape: bool) -> None:
    """Apply an orientation to ``section`` and make the page size agree.

    Args:
        section: Section whose orientation and page dimensions are updated.
        landscape: ``True`` for landscape, ``False`` for portrait.
    """
    from docx.enum.section import WD_ORIENT  # type: ignore

    if landscape:
        section.orientation = WD_ORIENT.LANDSCAPE
    else:
        section.orientation = WD_ORIENT.PORTRAIT

    current_width = section.page_width
    current_height = section.page_height
    if current_width is not None and current_height is not None:
        # Landscape needs width >= height, portrait needs width <= height;
        # swap only when the current dimensions contradict the request.
        if landscape:
            mismatch = current_width < current_height
        else:
            mismatch = current_width > current_height
        if mismatch:
            section.page_width = current_height
            section.page_height = current_width

294 

295 

def _append_document_body(target: "DocxDocument", source: "DocxDocument") -> None:
    """Copy body content from ``source`` into ``target`` without section props.

    Every child of the source body except ``sectPr`` elements is deep-copied,
    so ``source`` is left untouched. The copies are inserted in order directly
    before the last ``sectPr`` child of the target body, so that element stays
    the final child; if the target body has no ``sectPr`` child the copies are
    added at the end.

    Args:
        target: Document that receives the copied content.
        source: Document whose body content is copied.
    """

    def is_section_properties(node) -> bool:
        # Non-element nodes (e.g. XML comments) may not have a string tag.
        tag = node.tag
        return isinstance(tag, str) and tag.endswith("}sectPr")

    target_body = target.element.body

    # Content belonging to the last section has to sit in front of the
    # body-level sectPr, which is where new section breaks are also inserted.
    insert_at = len(target_body)
    for index, child in enumerate(target_body):
        if is_section_properties(child):
            insert_at = index

    for element in list(source.element.body):
        if is_section_properties(element):
            continue
        target_body.insert(insert_at, deepcopy(element))
        insert_at += 1