def is_latin_char(char):
    """
    Return True if *char* is a Latin-script letter, including accented
    forms such as é, è, à, ç, ê used by French and other Latin-based
    languages.

    Args:
        char: a single-character string.

    Returns:
        bool: True when the character's Unicode category is a letter
        (Lu/Ll/Lt/Lm/Lo) AND its Unicode name starts with "LATIN";
        False otherwise (digits, punctuation, CJK, Greek, Cyrillic, ...).
    """
    # Only letters qualify; every letter category starts with "L"
    # (Lu = uppercase, Ll = lowercase, Lt = titlecase, Lm = modifier,
    # Lo = other).  Digits and punctuation are rejected here.
    if not unicodedata.category(char).startswith("L"):
        return False

    # All Latin-script letters have Unicode names beginning with "LATIN",
    # e.g. "LATIN SMALL LETTER E WITH ACUTE" for é, "LATIN SMALL LETTER C
    # WITH CEDILLA" for ç.  Passing a default ("") means name() never
    # raises for unassigned code points, so no try/except is needed —
    # the original `except ValueError` branch was unreachable.
    #
    # NOTE(review): NFD-decomposed text carries accents as separate
    # combining marks (category Mn), which this predicate rejects;
    # presumably inputs are NFC-composed — confirm against callers.
    return unicodedata.name(char, "").startswith("LATIN")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test script to verify French accented character handling in OCR text recognition.

This script tests that French words with accented characters (é, è, à, ç, etc.)
and contractions (n'êtes, l'été) are properly grouped as single words and not
split at each accented character.
"""

import sys
import os
import numpy as np

# This file lives under tests/, so the project root (the directory that
# contains the ``ppocr`` package) is one level up.  The original inserted
# the tests/ directory itself, which made the ``ppocr`` import below fail
# unless the script happened to be launched from the repository root.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode


def test_french_word_grouping():
    """Test that French words with accents are properly grouped.

    Runs ``BaseRecLabelDecode.get_word_info`` on a set of French strings
    and compares the produced word groups and per-word states against
    hand-written expectations.

    Returns:
        bool: True when every test case passes, False otherwise.
    """
    # character_dict_path=None falls back to the decoder's default
    # character set; use_space_char=True lets spaces act as separators.
    decoder = BaseRecLabelDecode(character_dict_path=None, use_space_char=True)

    # Each case: input text plus the word grouping / state classification
    # that get_word_info() is expected to produce.
    test_cases = [
        {
            "name": "Simple accented word: été (summer)",
            "text": "été",
            "expected_words": [["é", "t", "é"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Word with ç: français (French)",
            "text": "français",
            "expected_words": [["f", "r", "a", "n", "ç", "a", "i", "s"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Contraction: n'êtes (you are)",
            "text": "n'êtes",
            "expected_words": [["n", "'", "ê", "t", "e", "s"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Multiple accents: élève (student)",
            "text": "élève",
            "expected_words": [["é", "l", "è", "v", "e"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Word with à: à demain (see you tomorrow)",
            "text": "à demain",
            "expected_words": [["à"], ["d", "e", "m", "a", "i", "n"]],
            "expected_states": ["en&num", "en&num"],
        },
        {
            "name": "Complex: C'était très français (It was very French)",
            "text": "C'était très français",
            "expected_words": [
                ["C", "'", "é", "t", "a", "i", "t"],
                ["t", "r", "è", "s"],
                ["f", "r", "a", "n", "ç", "a", "i", "s"],
            ],
            "expected_states": ["en&num", "en&num", "en&num"],
        },
    ]

    print("=" * 70)
    print("Testing French Accented Character Word Grouping")
    print("=" * 70)

    all_passed = True

    for case in test_cases:
        label = case["name"]
        test_text = case["text"]

        # Mock selection array: every character is kept (no CTC filtering).
        selection = np.ones(len(test_text), dtype=bool)

        word_list, word_col_list, state_list = decoder.get_word_info(
            test_text, selection
        )

        passed = True

        if len(word_list) != len(case["expected_words"]):
            passed = False
            print(f"\nFAILED: {label}")
            print(
                f"  Expected {len(case['expected_words'])} words, got {len(word_list)}"
            )
        elif state_list != case["expected_states"]:
            passed = False
            print(f"\nFAILED: {label}")
            print(f"  Expected states: {case['expected_states']}")
            print(f"  Got states: {state_list}")
        else:
            # Counts and states match; compare each word character list.
            for i, (expected, actual) in enumerate(
                zip(case["expected_words"], word_list)
            ):
                if expected != actual:
                    passed = False
                    print(f"\nFAILED: {label}")
                    print(f"  Word {i}: Expected {expected}, got {actual}")
                    break

        if passed:
            print(f"\nPASSED: {label}")
            print(f"  Text: '{test_text}'")
            print(f"  Words: {[''.join(w) for w in word_list]}")
            print(f"  States: {state_list}")
        else:
            all_passed = False
            print(f"  Text: '{test_text}'")
            print(f"  Expected words: {[''.join(w) for w in case['expected_words']]}")
            print(f"  Got words: {[''.join(w) for w in word_list]}")

    print("\n" + "=" * 70)
    if all_passed:
        print("All tests PASSED! French accented words are properly grouped.")
    else:
        print("Some tests FAILED. Please review the output above.")
    print("=" * 70)

    return all_passed


if __name__ == "__main__":
    success = test_french_word_grouping()
    sys.exit(0 if success else 1)