Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions spacy/lang/fr/lemmatizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ class FrenchLemmatizer(Lemmatizer):
the lookup table.
"""

def is_base_form(self, token: Token) -> bool:
    """Report whether suffix rules should be skipped for this token.

    A token is treated as already being in base form when its
    morphological analysis marks it as an infinitive (VerbForm=Inf).
    Applying suffix rules to such a form produces incorrect lemmas
    (e.g. "descendre" → "descendrer").

    token (Token): The token to inspect.
    RETURNS (bool): True if the token's VerbForm feature equals "Inf".
    """
    features = token.morph.to_dict()
    # A missing VerbForm feature yields None, which never equals "Inf".
    verb_form = features.get("VerbForm")
    return verb_form == "Inf"

@classmethod
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "rule":
Expand All @@ -30,6 +38,8 @@ def rule_lemmatize(self, token: Token) -> List[str]:
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
return [string.lower()]
if self.is_base_form(token):
return [string.lower()]
elif "lemma_rules" not in self.lookups or univ_pos not in (
"noun",
"verb",
Expand Down