fix: Payment inference returns only exact-name matches when present
match_members() now short-circuits on whole-word full-name hits and uses word-boundary regex everywhere else, so a nickname that is a substring of another member's surname (e.g. "tov" inside "ottova") no longer produces false positives.

Adds tests/test_match_members.py.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -48,6 +48,11 @@ def _build_name_variants(name: str) -> list[str]:
|
||||
return [v for v in variants if len(v) >= 3]
|
||||
|
||||
|
||||
def _word_in(needle: str, haystack: str) -> bool:
    """Return True if *needle* occurs in *haystack* as a whole word.

    "Whole word" means the occurrence is not directly preceded or followed
    by a word character (``\\w``), so ``"tov"`` is found in ``"pay tov"``
    but not inside ``"ottova"``.

    Args:
        needle: Literal text to look for; it is regex-escaped before use.
        haystack: Text to search in.

    Returns:
        True when a standalone occurrence exists. An empty needle never
        matches.
    """
    # An empty needle would reduce the pattern to bare boundary assertions,
    # which succeed on almost any text; treat it as "nothing to find".
    if not needle:
        return False
    # Lookarounds instead of \b: \b demands a word character on the needle
    # side of the boundary, so needles that begin or end with punctuation
    # (e.g. "c++", "j.") could never match as a standalone token. For
    # needles that start and end with word characters both forms agree.
    pattern = rf"(?<!\w){re.escape(needle)}(?!\w)"
    return re.search(pattern, haystack) is not None
|
||||
|
||||
|
||||
def match_members(
|
||||
text: str, member_names: list[str]
|
||||
) -> list[tuple[str, str]]:
|
||||
@@ -56,13 +61,26 @@ def match_members(
|
||||
Returns list of (member_name, confidence) where confidence is 'auto' or 'review'.
|
||||
"""
|
||||
normalized_text = normalize(text)
|
||||
|
||||
# Short-circuit: if any member's full canonical name appears verbatim (whole words),
|
||||
# return only those matches and skip all fuzzy/nickname checks. This prevents a
|
||||
# nickname that is a substring of another member's surname from producing false hits.
|
||||
exact_matches = []
|
||||
for name in member_names:
|
||||
variants = _build_name_variants(name)
|
||||
full_name = variants[0] if variants else ""
|
||||
if full_name and _word_in(full_name, normalized_text):
|
||||
exact_matches.append((name, "auto"))
|
||||
if exact_matches:
|
||||
return exact_matches
|
||||
|
||||
matches = []
|
||||
|
||||
for name in member_names:
|
||||
variants = _build_name_variants(name)
|
||||
full_name = variants[0] if variants else ""
|
||||
parts = full_name.split()
|
||||
|
||||
|
||||
# 1. Full name match (exact sequence) = high confidence
|
||||
if full_name and full_name in normalized_text:
|
||||
matches.append((name, "auto"))
|
||||
@@ -70,17 +88,16 @@ def match_members(
|
||||
|
||||
# 2. Both first and last name present (any order) = high confidence
|
||||
if len(parts) >= 2:
|
||||
if parts[0] in normalized_text and parts[-1] in normalized_text:
|
||||
if _word_in(parts[0], normalized_text) and _word_in(parts[-1], normalized_text):
|
||||
matches.append((name, "auto"))
|
||||
continue
|
||||
|
||||
# 3. Nickname + one part of the name = high confidence
|
||||
# 3. Nickname present = high confidence
|
||||
nickname = ""
|
||||
nickname_match = re.search(r"\(([^)]+)\)", name)
|
||||
if nickname_match:
|
||||
nickname = normalize(nickname_match.group(1))
|
||||
if nickname and nickname in normalized_text:
|
||||
# Nickname alone is often enough, but let's check if it's combined with a name part
|
||||
if nickname and _word_in(nickname, normalized_text):
|
||||
matches.append((name, "auto"))
|
||||
continue
|
||||
|
||||
@@ -89,19 +106,16 @@ def match_members(
|
||||
first_name = parts[0]
|
||||
last_name = parts[-1]
|
||||
_COMMON_SURNAMES = {"novak", "novakova", "prach"}
|
||||
|
||||
# Match last name
|
||||
if len(last_name) >= 4 and last_name not in _COMMON_SURNAMES and last_name in normalized_text:
|
||||
|
||||
if len(last_name) >= 4 and last_name not in _COMMON_SURNAMES and _word_in(last_name, normalized_text):
|
||||
matches.append((name, "review"))
|
||||
continue
|
||||
|
||||
# Match first name (if not too short)
|
||||
if len(first_name) >= 3 and first_name in normalized_text:
|
||||
|
||||
if len(first_name) >= 3 and _word_in(first_name, normalized_text):
|
||||
matches.append((name, "review"))
|
||||
continue
|
||||
elif len(parts) == 1:
|
||||
# Single name member
|
||||
if len(parts[0]) >= 4 and parts[0] in normalized_text:
|
||||
if len(parts[0]) >= 4 and _word_in(parts[0], normalized_text):
|
||||
matches.append((name, "review"))
|
||||
continue
|
||||
|
||||
@@ -109,7 +123,6 @@ def match_members(
|
||||
# If we have any "auto" matches, discard all "review" matches
|
||||
auto_matches = [m for m in matches if m[1] == "auto"]
|
||||
if auto_matches:
|
||||
# If multiple auto matches, keep them (ambiguous but high priority)
|
||||
return auto_matches
|
||||
|
||||
return matches
|
||||
|
||||
Reference in New Issue
Block a user