36 changes: 35 additions & 1 deletion ppocr/postprocess/rec_postprocess.py
@@ -18,6 +18,35 @@
from paddle.nn import functional as F
import re
import json
import unicodedata


def is_latin_char(char):
"""
Check if a character is a Latin letter (including accented characters).
This will properly categorize accented characters like é, è, à, ç, etc.
"""
try:
# Get the Unicode category
category = unicodedata.category(char)
# Lu = Letter, uppercase
# Ll = Letter, lowercase
# Lt = Letter, titlecase
# Lm = Letter, modifier (includes some Latin extended characters)
if not category.startswith("L"):
return False

# Check if the character name starts with LATIN
# This covers all Latin-based characters including:
# - LATIN SMALL LETTER E WITH ACUTE (é)
# - LATIN SMALL LETTER A WITH GRAVE (à)
# - LATIN SMALL LETTER C WITH CEDILLA (ç)
# - LATIN SMALL LETTER E WITH CIRCUMFLEX (ê)
# - etc.
char_name = unicodedata.name(char, "")
return char_name.startswith("LATIN")
except ValueError:
return False


class BaseRecLabelDecode(object):
@@ -95,11 +124,16 @@ def get_word_info(self, text, selection):
for c_i, char in enumerate(text):
if "\u4e00" <= char <= "\u9fff":
c_state = "cn"
elif bool(re.search("[a-zA-Z0-9]", char)):
# Modified condition to include accented characters used in French and other Latin-based languages
elif bool(re.search("[a-zA-Z0-9]", char)) or is_latin_char(char):
c_state = "en&num"
else:
c_state = "splitter"

# Handle apostrophes in French words like "n'êtes"
if char == "'" and state == "en&num":
c_state = "en&num"

if (
char == "."
and state == "en&num"
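
A quick sanity sketch of the behaviour this helper is meant to provide (illustrative only, and assuming is_latin_char is importable from ppocr.postprocess.rec_postprocess as added above):

from ppocr.postprocess.rec_postprocess import is_latin_char

assert is_latin_char("é")        # LATIN SMALL LETTER E WITH ACUTE -> Latin letter
assert is_latin_char("ç")        # LATIN SMALL LETTER C WITH CEDILLA -> Latin letter
assert not is_latin_char("字")   # CJK ideograph -> not Latin (also caught by the \u4e00 range check)
assert not is_latin_char("3")    # digit (category Nd), still covered by the [a-zA-Z0-9] regex
assert not is_latin_char("'")    # punctuation, handled by the apostrophe special case
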
137 changes: 137 additions & 0 deletions tests/test_french_accents.py
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test script to verify French accented character handling in OCR text recognition.
This script tests that French words with accented characters (é, è, à, ç, etc.)
and contractions (n'êtes, l'été) are properly grouped as single words and not
split at each accented character.
"""

import sys
import os
import numpy as np

# Add the project root (the parent of tests/) to the path so ppocr can be imported
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode


def test_french_word_grouping():
"""Test that French words with accents are properly grouped."""

# Initialize the decoder
decoder = BaseRecLabelDecode(character_dict_path=None, use_space_char=True)

# Test cases with French accented words
test_cases = [
{
"name": "Simple accented word: été (summer)",
"text": "été",
"expected_words": [["é", "t", "é"]],
"expected_states": ["en&num"],
},
{
"name": "Word with ç: français (French)",
"text": "français",
"expected_words": [["f", "r", "a", "n", "ç", "a", "i", "s"]],
"expected_states": ["en&num"],
},
{
"name": "Contraction: n'êtes (you are)",
"text": "n'êtes",
"expected_words": [["n", "'", "ê", "t", "e", "s"]],
"expected_states": ["en&num"],
},
{
"name": "Multiple accents: élève (student)",
"text": "élève",
"expected_words": [["é", "l", "è", "v", "e"]],
"expected_states": ["en&num"],
},
{
"name": "Word with à: à demain (see you tomorrow)",
"text": "à demain",
"expected_words": [["à"], ["d", "e", "m", "a", "i", "n"]],
"expected_states": ["en&num", "en&num"],
},
{
"name": "Complex: C'était très français (It was very French)",
"text": "C'était très français",
"expected_words": [
["C", "'", "é", "t", "a", "i", "t"],
["t", "r", "è", "s"],
["f", "r", "a", "n", "ç", "a", "i", "s"],
],
"expected_states": ["en&num", "en&num", "en&num"],
},
]

print("=" * 70)
print("Testing French Accented Character Word Grouping")
print("=" * 70)

all_passed = True

for test in test_cases:
        name = test["name"]
test_text = test["text"]

# Create a mock selection array (all characters are valid)
selection = np.ones(len(test_text), dtype=bool)

# Call get_word_info
word_list, word_col_list, state_list = decoder.get_word_info(
test_text, selection
)

# Check results
passed = True

if len(word_list) != len(test["expected_words"]):
passed = False
print(f"\nFAILED: {text}")
print(
f" Expected {len(test['expected_words'])} words, got {len(word_list)}"
)
elif state_list != test["expected_states"]:
passed = False
print(f"\nFAILED: {text}")
print(f" Expected states: {test['expected_states']}")
print(f" Got states: {state_list}")
else:
# Check if words match
for i, (expected, actual) in enumerate(
zip(test["expected_words"], word_list)
):
if expected != actual:
passed = False
print(f"\nFAILED: {text}")
print(f" Word {i}: Expected {expected}, got {actual}")
break

if passed:
print(f"\nPASSED: {text}")
print(f" Text: '{test_text}'")
print(f" Words: {[''.join(w) for w in word_list]}")
print(f" States: {state_list}")
else:
all_passed = False
print(f" Text: '{test_text}'")
print(f" Expected words: {[''.join(w) for w in test['expected_words']]}")
print(f" Got words: {[''.join(w) for w in word_list]}")

print("\n" + "=" * 70)
if all_passed:
print("All tests PASSED! French accented words are properly grouped.")
else:
print("Some tests FAILED. Please review the output above.")
print("=" * 70)

return all_passed


if __name__ == "__main__":
success = test_french_word_grouping()
sys.exit(0 if success else 1)
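
The test is a standalone script; assuming the layout above, it can be run from the repository root with:

python tests/test_french_accents.py

It prints a PASSED/FAILED summary per case and exits with status 0 only if every grouping matches, so it can also be invoked from CI.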