Coverage for src/rtflite/text_conversion/converter.py: 97%

38 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-14 16:35 +0000

1""" 

2Text conversion engine for LaTeX to Unicode conversion. 

3 

4This module implements the core conversion logic that processes text containing 

5LaTeX commands and converts them to Unicode characters. It focuses on 

6readability and maintainability rather than performance. 

7""" 

8 

9import re 

10from typing import Pattern 

11 

12from .symbols import LaTeXSymbolMapper 

13 

14 

class TextConverter:
    """
    Converts LaTeX commands in text to Unicode characters.

    Commands are located with a regular expression and each match is looked
    up through a ``LaTeXSymbolMapper``. The class is designed for clarity and
    ease of maintenance rather than maximum performance.
    """

    def __init__(self):
        """Initialize the converter with a symbol mapper and compiled pattern."""
        self.symbol_mapper = LaTeXSymbolMapper()
        self._latex_pattern = self._create_latex_pattern()

    def _create_latex_pattern(self) -> Pattern[str]:
        """
        Create the regular expression pattern for matching LaTeX commands.

        This pattern matches:
        - Simple commands: \\alpha, \\beta, \\pm
        - Commands with one brace group: \\mathbb{R}, \\mathcal{L}

        Nested braces are not supported: the brace group ends at the first
        closing brace.

        Returns:
            Compiled regular expression pattern
        """
        # Pattern explanation:
        #   \\          - Literal backslash (escaped)
        #   [a-zA-Z]+   - One or more ASCII letters (command name)
        #   (?:         - Non-capturing group for the optional braces
        #     \{[^}]*\} - Opening brace, any content except }, closing brace
        #   )?          - Make the brace group optional
        # The group is non-capturing on purpose so that ``findall`` yields the
        # full command text rather than a sub-group.
        return re.compile(r"\\[a-zA-Z]+(?:\{[^}]*\})?")

    def convert_latex_to_unicode(self, text: str) -> str:
        """
        Convert all LaTeX commands in text to Unicode characters.

        Every substring matching the command pattern is replaced with the
        result of ``_convert_single_command``. The concrete replacement for a
        command is decided by the symbol mapper, so the output depends on the
        mapper's table.

        Args:
            text: Input text potentially containing LaTeX commands

        Returns:
            Text with each matched command replaced by the mapper's result.
            Falsy input (e.g. an empty string) is returned unchanged.
        """
        if not text:
            return text

        # ``match.group(0)`` is the complete command, including any braces.
        return self._latex_pattern.sub(
            lambda match: self._convert_single_command(match.group(0)),
            text,
        )

    def _convert_single_command(self, latex_command: str) -> str:
        """
        Convert a single LaTeX command to Unicode.

        Commands containing both an opening and a closing brace are routed to
        ``_handle_braced_command``; everything else is looked up directly.

        Args:
            latex_command: The LaTeX command to convert

        Returns:
            The result of the symbol mapper lookup for the command
        """
        # Handle commands with braces (e.g., \\mathbb{R})
        if "{" in latex_command and "}" in latex_command:
            return self._handle_braced_command(latex_command)

        # Handle simple commands (e.g., \\alpha, \\pm)
        return self.symbol_mapper.get_unicode_char(latex_command)

    def _handle_braced_command(self, latex_command: str) -> str:
        """
        Handle LaTeX commands that contain braces.

        The full command text, braces included (e.g. \\mathbb{R}), is looked
        up as a single key. No argument parsing is performed.

        Args:
            latex_command: LaTeX command with braces

        Returns:
            The mapper's result when it differs from the input, otherwise the
            original command unchanged
        """
        # Try the full command as-is (exact match only).
        unicode_result = self.symbol_mapper.get_unicode_char(latex_command)
        if unicode_result != latex_command:  # Found a mapping
            return unicode_result

        # No exact match: more sophisticated parsing could be added here.
        # For now the command is left untouched.
        return latex_command

    def get_conversion_statistics(self, text: str) -> dict:
        """
        Get statistics about LaTeX commands in the text.

        This is useful for debugging and understanding conversion coverage.
        A command counts as converted when ``symbol_mapper.has_mapping``
        reports a mapping for the full matched text.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with the keys:
            - "total_commands": number of commands matched in the text
            - "converted": number of matched commands that have a mapping
            - "unconverted": list of matched commands without a mapping
            - "conversion_rate": converted / total as a float, or 0.0 when
              no commands were found

            All four keys are present for every input, including empty text.
        """
        if not text:
            # Same shape as the regular result so callers can read
            # "conversion_rate" without special-casing empty input.
            return {
                "total_commands": 0,
                "converted": 0,
                "unconverted": [],
                "conversion_rate": 0.0,
            }

        commands = self._latex_pattern.findall(text)
        unconverted = [
            command
            for command in commands
            if not self.symbol_mapper.has_mapping(command)
        ]
        converted_count = len(commands) - len(unconverted)

        return {
            "total_commands": len(commands),
            "converted": converted_count,
            "unconverted": unconverted,
            # Guard against division by zero when nothing matched.
            "conversion_rate": converted_count / len(commands) if commands else 0.0,
        }