Skip to content

Commit e383de1

Browse files
authored
fix: fix issue in map_to_canonical_id function (#332)
Solves #331
1 parent 3a1375e commit e383de1

File tree

4 files changed

+73
-4
lines changed

4 files changed

+73
-4
lines changed

openfoodfacts/taxonomy.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import requests
55

6-
from openfoodfacts.utils.text import get_tag
6+
from openfoodfacts.utils.text import get_tag, replace_lang_prefix
77

88
from .types import Environment, Flavor, JSONType, TaxonomyType
99
from .utils import (
@@ -495,5 +495,18 @@ def map_to_canonical_id(
495495
raise ValueError(
496496
f"Invalid value: '{value}', expected value to be in 'lang:tag' format"
497497
)
498-
tags = [get_tag(value) for value in values]
499-
return {value: taxonomy_mapping.get(tag, tag) for tag, value in zip(tags, values)}
498+
499+
output = {}
500+
for value in values:
501+
tag = get_tag(value)
502+
output[value] = (
503+
# Look for a direct match first
504+
taxonomy_mapping.get(tag)
505+
# Then look for a match with the xx prefix (language-independent
506+
# entry)
507+
or taxonomy_mapping.get(replace_lang_prefix(tag, "xx"))
508+
# If no match is found, return the original taggified value
509+
or tag
510+
)
511+
512+
return output

openfoodfacts/utils/text.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,17 @@ def get_tag(text: str) -> str:
9090
if lang_prefix:
9191
text = f"{lang_prefix}:{text}"
9292
return text
93+
94+
95+
def replace_lang_prefix(tag: str, new_lang_prefix: str) -> str:
    """Return `tag` with its two-character language prefix swapped out.

    The tag is expected to look like ``"<2-char prefix>:<rest>"``. Only the
    two characters before the colon are replaced; everything after the colon
    is kept verbatim.

    :param tag: the tag whose prefix is replaced, e.g. ``"fr:gesiers"``
    :param new_lang_prefix: the replacement prefix; it must be exactly two
        characters long (only the length is checked)
    :raises ValueError: if `new_lang_prefix` is not two characters long, or
        if the third character of `tag` is not a colon
    :return: the tag carrying the new prefix, e.g. ``"en:gesiers"``
    """
    prefix_length_ok = len(new_lang_prefix) == 2
    if not prefix_length_ok:
        message = f"new_lang_prefix '{new_lang_prefix}' must be a 2-letter code."
        raise ValueError(message)

    # The colon must sit right after a two-character prefix, i.e. at index 2.
    has_two_char_prefix = len(tag) >= 3 and tag[2] == ":"
    if not has_two_char_prefix:
        raise ValueError(f"tag '{tag}' has an invalid language prefix")

    tag_body = tag[3:]
    return f"{new_lang_prefix}:{tag_body}"

tests/unit/test_taxonomy.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,20 +21,28 @@ def test_map_to_canonical_id():
2121
"en:apples": "en:apples",
2222
"fr:pomme": "en:apples",
2323
"fr:noix-d-isere": "en:nuts-from-isere",
24+
"xx:provence-alpes-cote-d-azur": "en:provence-alpes-cote-d-azur",
25+
"xx:sashimi": "xx:sashimi",
2426
}
2527
values = [
2628
"en: Apple",
2729
"en: apples",
2830
"fr: Pomme",
2931
"fr: Bananes d'Isère",
3032
"fr: Noix d'Isère",
33+
"fr: Provence-Alpes-Côte d'Azur",
34+
"pt: Provence-Alpes-Côte d'Azur",
35+
"it: sashimi",
3136
]
3237
expected = {
3338
"en: Apple": "en:apples",
3439
"en: apples": "en:apples",
3540
"fr: Pomme": "en:apples",
3641
"fr: Bananes d'Isère": "fr:bananes-d-isere",
3742
"fr: Noix d'Isère": "en:nuts-from-isere",
43+
"fr: Provence-Alpes-Côte d'Azur": "en:provence-alpes-cote-d-azur",
44+
"pt: Provence-Alpes-Côte d'Azur": "en:provence-alpes-cote-d-azur",
45+
"it: sashimi": "xx:sashimi",
3846
}
3947
assert map_to_canonical_id(taxonomy_mapping, values) == expected
4048

@@ -70,15 +78,22 @@ def test_basic(self):
7078
names={"fr": "Noix d'Isère"},
7179
synonyms={"fr": ["Noix d'Isère"]},
7280
)
81+
node3 = TaxonomyNode(
82+
identifier="xx:sashimi",
83+
names={"xx": "Sashimi"},
84+
synonyms={"xx": ["Sashimi"]},
85+
)
7386
taxonomy.add(node1.id, node1)
7487
taxonomy.add(node2.id, node2)
88+
taxonomy.add(node3.id, node3)
7589

7690
expected_mapping = {
7791
"en:apple": "en:apples",
7892
"fr:pomme": "en:apples",
7993
"en:apples": "en:apples",
8094
"fr:pommes": "en:apples",
8195
"fr:noix-d-isere": "en:nuts-from-isere",
96+
"xx:sashimi": "xx:sashimi",
8297
}
8398

8499
assert create_taxonomy_mapping(taxonomy) == expected_mapping

tests/unit/utils/test_text.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import pytest
22

3-
from openfoodfacts.utils.text import get_tag
3+
from openfoodfacts.utils.text import get_tag, replace_lang_prefix
44

55

66
@pytest.mark.parametrize(
@@ -22,3 +22,30 @@
2222
)
2323
def test_get_tag(value: str, output: str):
2424
assert get_tag(value) == output
25+
26+
27+
@pytest.mark.parametrize(
    "tag,new_lang_prefix,output",
    [
        ("fr:gesiers", "en", "en:gesiers"),
        ("fr:gesiers", "fr", "fr:gesiers"),
        ("fr:gesiers", "ar", "ar:gesiers"),
        ("en:apple", "fr", "fr:apple"),
        ("xx:sashimi", "it", "it:sashimi"),
        ("xx:sashimi", "xx", "xx:sashimi"),
    ],
)
def test_replace_lang_prefix(tag, new_lang_prefix, output):
    """Check that a valid tag gets its prefix replaced and its body kept."""
    replaced = replace_lang_prefix(tag, new_lang_prefix)
    assert replaced == output
40+
41+
42+
def test_replace_lang_prefix_invalid_new_lang_prefix():
    """A replacement prefix that is not two characters long is rejected."""
    # `match` is applied as a regex search against the exception message.
    expected_message = "new_lang_prefix 'a' must be a 2-letter code"
    with pytest.raises(ValueError, match=expected_message):
        replace_lang_prefix("en:apples", "a")
45+
46+
47+
def test_replace_lang_prefix_invalid_tag():
    """A tag whose colon is not at index 2 is rejected."""
    # `match` is applied as a regex search against the exception message.
    expected_message = "tag 'e:apples' has an invalid language prefix"
    with pytest.raises(ValueError, match=expected_message):
        replace_lang_prefix("e:apples", "fr")

0 commit comments

Comments
 (0)