Coverage for src/rtflite/text_conversion/converter.py: 97%
38 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-14 16:35 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-14 16:35 +0000
1"""
2Text conversion engine for LaTeX to Unicode conversion.
4This module implements the core conversion logic that processes text containing
5LaTeX commands and converts them to Unicode characters. It focuses on
6readability and maintainability rather than performance.
7"""
9import re
10from typing import Pattern
12from .symbols import LaTeXSymbolMapper
class TextConverter:
    r"""
    Convert LaTeX commands embedded in text to Unicode characters.

    Commands are located with a regular expression and each match is looked
    up through a ``LaTeXSymbolMapper``. The implementation is designed for
    clarity and ease of maintenance rather than maximum performance.

    Attributes:
        symbol_mapper: Mapper used to look up the Unicode replacement for a
            LaTeX command.
        _latex_pattern: Compiled regular expression matching a LaTeX command
            optionally followed by a single brace group.
    """

    def __init__(self):
        """Initialize the converter with its symbol mapper and command pattern."""
        self.symbol_mapper = LaTeXSymbolMapper()
        self._latex_pattern = self._create_latex_pattern()

    def _create_latex_pattern(self) -> Pattern[str]:
        r"""
        Create the regular expression pattern for matching LaTeX commands.

        The pattern matches:
        - Simple commands: \alpha, \beta, \pm
        - Commands with one brace group: \mathbb{R}, \mathcal{L}

        Commands with optional parameters are not matched yet (possible
        future extension). The brace group cannot contain nested braces,
        because its content stops at the first ``}``.

        Returns:
            Compiled regular expression pattern
        """
        # Pattern explanation:
        # \\          - Literal backslash (escaped)
        # [a-zA-Z]+   - One or more letters (command name)
        # (?:         - Non-capturing group for optional braces
        #   \{[^}]*\} - Opening brace, any content except }, closing brace
        # )?          - Make the brace group optional
        #
        # The group is non-capturing on purpose: ``findall`` then yields the
        # full matched command rather than only the brace group.
        return re.compile(r"\\[a-zA-Z]+(?:\{[^}]*\})?")

    def convert_latex_to_unicode(self, text: str) -> str:
        r"""
        Convert all LaTeX commands in text to Unicode characters.

        Every command matched by the internal pattern is replaced by its
        Unicode equivalent. Commands without mappings are left unchanged.

        Args:
            text: Input text potentially containing LaTeX commands

        Returns:
            Text with LaTeX commands converted to Unicode. Falsy input
            (empty string or ``None``) is returned as-is.

        Examples:
            The results below assume the symbol mapper defines the usual
            mappings for these commands.

            >>> converter = TextConverter()
            >>> converter.convert_latex_to_unicode(r"\alpha + \beta = \gamma")
            'α + β = γ'

            >>> converter.convert_latex_to_unicode(r"Mean \pm SD")
            'Mean ± SD'

            >>> converter.convert_latex_to_unicode(r"Set \mathbb{R}")
            'Set ℝ'
        """
        if not text:
            return text

        # ``match.group(0)`` is the full command text, including any brace
        # group; each occurrence is converted independently.
        return self._latex_pattern.sub(
            lambda match: self._convert_single_command(match.group(0)),
            text,
        )

    def _convert_single_command(self, latex_command: str) -> str:
        r"""
        Convert a single LaTeX command to Unicode.

        Dispatches commands containing a brace group to
        ``_handle_braced_command`` and looks up all other commands directly
        in the symbol mapper.

        Args:
            latex_command: The LaTeX command to convert

        Returns:
            Unicode character or original command if no mapping exists
        """
        # Commands with braces (e.g., \mathbb{R}) get dedicated handling.
        if "{" in latex_command and "}" in latex_command:
            return self._handle_braced_command(latex_command)

        # Simple commands (e.g., \alpha, \pm) are looked up directly.
        return self.symbol_mapper.get_unicode_char(latex_command)

    def _handle_braced_command(self, latex_command: str) -> str:
        r"""
        Handle LaTeX commands that contain braces.

        Commands like \mathbb{R} or \mathcal{L} are looked up by their full
        text, braces and argument included.

        Args:
            latex_command: LaTeX command with braces

        Returns:
            Unicode character or original command
        """
        # Try the full command as-is (exact match). A result different from
        # the input is treated as "mapping found"; this relies on the mapper
        # returning the command unchanged when it has no mapping.
        unicode_result = self.symbol_mapper.get_unicode_char(latex_command)
        if unicode_result != latex_command:
            return unicode_result

        # No exact match: more sophisticated parsing could be added here.
        # For now, keep the original command.
        return latex_command

    def get_conversion_statistics(self, text: str) -> dict:
        """
        Get statistics about LaTeX commands in the text.

        This is useful for debugging and understanding conversion coverage.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with the keys:
            - "total_commands": number of LaTeX commands found
            - "converted": number of commands the symbol mapper has a
              mapping for
            - "unconverted": list of commands without a mapping, in order
              of appearance (duplicates preserved)
            - "conversion_rate": converted / total_commands as a float,
              or 0.0 when no commands were found

            All four keys are present for every input, including empty
            text, so callers can read any of them unconditionally.
        """
        # Falsy text (empty string or None) simply yields no commands, which
        # keeps a single return path and a uniform result shape.
        commands = self._latex_pattern.findall(text) if text else []
        unconverted = [
            command
            for command in commands
            if not self.symbol_mapper.has_mapping(command)
        ]
        converted_count = len(commands) - len(unconverted)

        return {
            "total_commands": len(commands),
            "converted": converted_count,
            "unconverted": unconverted,
            # Guard against division by zero when nothing matched.
            "conversion_rate": converted_count / len(commands) if commands else 0.0,
        }