Skip to content

Commit 1fee0b1

Browse files
Merge pull request #619 from microsoft/psl-entity-score
fix: Psl entity score
2 parents 3f193ba + 8d7b592 commit 1fee0b1

14 files changed

Lines changed: 406 additions & 46 deletions

File tree

src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py

Lines changed: 92 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -112,20 +112,14 @@ def find_process_result(step_name: str):
112112
)
113113
)
114114

115-
total_evaluated_fields_count = evaluated_result.confidence.get(
116-
"total_evaluated_fields_count", 0
117-
)
118-
schema_score = (
119-
0
120-
if total_evaluated_fields_count == 0
121-
else round(
122-
(
123-
len(evaluated_result.comparison_result.items)
124-
- evaluated_result.confidence["zero_confidence_fields_count"]
125-
)
126-
/ len(evaluated_result.comparison_result.items),
127-
3,
128-
)
115+
# Compute the aggregate scores. Successful (Completed) processing
116+
# always yields numeric scores: when probabilistic confidence is
117+
# available (logprobs from non-reasoning models / Content Understanding
118+
# signal) we use it; otherwise we fall back to a structural
119+
# completeness score (fraction of expected fields actually filled).
120+
# Failed runs and genuinely empty extractions remain at ``0.0``.
121+
entity_score, schema_score, min_extracted_entity_score = (
122+
self._derive_aggregate_scores(evaluated_result)
129123
)
130124

131125
processed_result = ContentProcess(
@@ -143,11 +137,9 @@ def find_process_result(step_name: str):
143137
self._current_message_context.data_pipeline.pipeline_status.creation_time,
144138
"%Y-%m-%dT%H:%M:%S.%fZ",
145139
),
146-
entity_score=evaluated_result.confidence["overall_confidence"],
140+
entity_score=entity_score,
147141
schema_score=schema_score,
148-
min_extracted_entity_score=evaluated_result.confidence[
149-
"min_extracted_field_confidence"
150-
],
142+
min_extracted_entity_score=min_extracted_entity_score,
151143
prompt_tokens=evaluated_result.prompt_tokens,
152144
completion_tokens=evaluated_result.completion_tokens,
153145
target_schema=Schema.get_schema(
@@ -241,3 +233,85 @@ def _summarize_processed_time(self, step_results: list[StepResult]) -> str:
241233
# Format the total elapsed time as a string
242234
formatted_elapsed_time = f"{total_hours:02}:{total_minutes:02}:{total_seconds:02}.{total_milliseconds:03}"
243235
return formatted_elapsed_time
236+
237+
@staticmethod
def _is_filled_value(value: object) -> bool:
    """Report whether an extracted value carries any real content.

    ``None``, empty or whitespace-only strings, and dicts/lists/tuples/sets
    with no filled member all count as *not* filled. Containers are
    inspected recursively, so a nested structure made up only of nulls
    is still treated as empty. Every other value (numbers, booleans,
    arbitrary objects) counts as filled.
    """
    if value is None:
        return False
    # Booleans are answered before anything else so that ``False`` is
    # reported as a real, filled value.
    if isinstance(value, bool):
        return True
    if isinstance(value, str):
        return bool(value.strip())

    # Pick the members to inspect; anything that is not a recognised
    # container is considered filled as-is.
    if isinstance(value, dict):
        members = value.values()
    elif isinstance(value, (list, tuple, set)):
        members = value
    else:
        return True

    for member in members:
        if SaveHandler._is_filled_value(member):
            return True
    return False
256+
257+
@staticmethod
def _derive_aggregate_scores(
    evaluated_result: DataExtractionResult,
) -> tuple[float, float, float]:
    """Compute ``(entity_score, schema_score, min_extracted_entity_score)``.

    Score selection order:

    1. **Probabilistic confidence** — when the confidence dict reports
       ``total_evaluated_fields_count > 0`` and there are comparison
       items, ``entity_score`` and ``min_extracted_entity_score`` are
       taken from ``overall_confidence`` and
       ``min_extracted_field_confidence``. ``schema_score`` is the
       fraction of comparison items *not* counted in
       ``zero_confidence_fields_count``, clamped to ``[0.0, 1.0]``.

    2. **Structural completeness fallback** — when no fields were
       evaluated (e.g. the model returned no logprobs) but a comparison
       table exists, all three scores are the fraction of comparison
       items whose ``Extracted`` value is actually filled.

    3. **Zero** — when there are no comparison items at all, every
       score is ``0.0``.

    Args:
        evaluated_result: Result of the evaluate step. Only its
            ``confidence`` mapping and ``comparison_result.items`` are
            read; either may be ``None``.

    Returns:
        A ``(entity_score, schema_score, min_extracted_entity_score)``
        tuple of floats.
    """
    confidence = evaluated_result.confidence or {}
    comparison = evaluated_result.comparison_result
    comparison_items = (comparison.items if comparison is not None else None) or []
    item_count = len(comparison_items)

    # Path 3: nothing to score on
    if item_count == 0:
        return (0.0, 0.0, 0.0)

    # ``dict.get(key, 0)`` only applies the default when the key is
    # missing; ``or 0`` also covers a key that is present with ``None``,
    # matching how the score fields below are read.
    total_evaluated_fields_count = (
        confidence.get("total_evaluated_fields_count") or 0
    )

    # Path 1: probabilistic confidence
    if total_evaluated_fields_count > 0:
        zero_count = confidence.get("zero_confidence_fields_count") or 0
        # The zero count comes from the confidence dict while the
        # denominator comes from the comparison table; clamp so a
        # mismatch between the two can never yield a score outside 0..1.
        nonzero_ratio = (item_count - zero_count) / item_count
        schema_score = round(min(1.0, max(0.0, nonzero_ratio)), 3)
        entity_score = float(confidence.get("overall_confidence") or 0.0)
        min_extracted_entity_score = float(
            confidence.get("min_extracted_field_confidence") or 0.0
        )
        return (entity_score, schema_score, min_extracted_entity_score)

    # Path 2: structural completeness fallback
    filled = sum(
        1
        for item in comparison_items
        if SaveHandler._is_filled_value(item.Extracted)
    )
    ratio = round(filled / item_count, 3)
    return (ratio, ratio, ratio)

src/ContentProcessor/src/libs/utils/azure_credential_utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
from azure.identity import (
2020
AzureCliCredential,
2121
AzureDeveloperCliCredential,
22-
DefaultAzureCredential,
2322
ManagedIdentityCredential,
2423
)
2524
from azure.identity import (

src/ContentProcessor/src/libs/utils/credential_util.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
from azure.identity import (
2020
AzureCliCredential,
2121
AzureDeveloperCliCredential,
22-
DefaultAzureCredential,
2322
ManagedIdentityCredential,
2423
)
2524
from azure.identity import (
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""Tests for ``SaveHandler._derive_aggregate_scores``.
5+
6+
Covers the score-derivation contract:
7+
- probabilistic confidence flows through verbatim when available
8+
- structural completeness fallback fires for Completed runs without logprobs
9+
(e.g. reasoning models / image-only flow) instead of emitting a misleading 0%
10+
- a genuine zero is preserved as ``0.0``
11+
- failed/empty runs return ``0.0``
12+
"""
13+
14+
from __future__ import annotations
15+
16+
from libs.pipeline.handlers.logics.evaluate_handler.comparison import (
17+
ExtractionComparisonData,
18+
ExtractionComparisonItem,
19+
)
20+
from libs.pipeline.handlers.logics.evaluate_handler.model import DataExtractionResult
21+
from libs.pipeline.handlers.save_handler import SaveHandler
22+
23+
24+
def _make_result(
25+
*,
26+
items: list[ExtractionComparisonItem],
27+
confidence: dict,
28+
) -> DataExtractionResult:
29+
return DataExtractionResult(
30+
extracted_result={},
31+
confidence=confidence,
32+
comparison_result=ExtractionComparisonData(items=items),
33+
prompt_tokens=0,
34+
completion_tokens=0,
35+
execution_time=0,
36+
)
37+
38+
39+
class TestProbabilisticPath:
40+
def test_valid_scores_flow_through(self):
41+
"""A normal evaluate-step result must produce numeric scores."""
42+
items = [
43+
ExtractionComparisonItem(
44+
Field="a", Extracted="x", Confidence="90.00%", IsAboveThreshold="True"
45+
),
46+
ExtractionComparisonItem(
47+
Field="b", Extracted="y", Confidence="80.00%", IsAboveThreshold="True"
48+
),
49+
ExtractionComparisonItem(
50+
Field="c", Extracted="z", Confidence="0.00%", IsAboveThreshold="False"
51+
),
52+
]
53+
confidence = {
54+
"total_evaluated_fields_count": 3,
55+
"overall_confidence": 0.567,
56+
"min_extracted_field_confidence": 0.0,
57+
"zero_confidence_fields_count": 1,
58+
}
59+
entity, schema, min_score = SaveHandler._derive_aggregate_scores(
60+
_make_result(items=items, confidence=confidence)
61+
)
62+
assert entity == 0.567
63+
# 2 of 3 fields above threshold → 0.667
64+
assert schema == round(2 / 3, 3)
65+
assert min_score == 0.0
66+
67+
def test_all_fields_above_threshold(self):
68+
items = [
69+
ExtractionComparisonItem(
70+
Field="a", Extracted="x", Confidence="95.00%", IsAboveThreshold="True"
71+
),
72+
ExtractionComparisonItem(
73+
Field="b", Extracted="y", Confidence="90.00%", IsAboveThreshold="True"
74+
),
75+
]
76+
confidence = {
77+
"total_evaluated_fields_count": 2,
78+
"overall_confidence": 0.925,
79+
"min_extracted_field_confidence": 0.9,
80+
"zero_confidence_fields_count": 0,
81+
}
82+
entity, schema, min_score = SaveHandler._derive_aggregate_scores(
83+
_make_result(items=items, confidence=confidence)
84+
)
85+
assert entity == 0.925
86+
assert schema == 1.0
87+
assert min_score == 0.9
88+
89+
90+
class TestStructuralFallback:
91+
"""When logprobs are unavailable (reasoning model / image-only) but
92+
extraction succeeded, the Completed file must still get a meaningful
93+
numeric score based on schema completeness."""
94+
95+
def test_all_fields_filled_yields_one(self):
96+
items = [
97+
ExtractionComparisonItem(
98+
Field="a", Extracted="x", Confidence="0.00%", IsAboveThreshold="False"
99+
),
100+
ExtractionComparisonItem(
101+
Field="b", Extracted="y", Confidence="0.00%", IsAboveThreshold="False"
102+
),
103+
ExtractionComparisonItem(
104+
Field="c", Extracted=42, Confidence="0.00%", IsAboveThreshold="False"
105+
),
106+
]
107+
# No probabilistic signal: total_evaluated_fields_count == 0
108+
confidence = {
109+
"total_evaluated_fields_count": 0,
110+
"overall_confidence": 0.0,
111+
"min_extracted_field_confidence": 0.0,
112+
"zero_confidence_fields_count": 0,
113+
}
114+
entity, schema, min_score = SaveHandler._derive_aggregate_scores(
115+
_make_result(items=items, confidence=confidence)
116+
)
117+
assert entity == 1.0
118+
assert schema == 1.0
119+
assert min_score == 1.0
120+
121+
def test_partial_fill_yields_ratio(self):
122+
items = [
123+
ExtractionComparisonItem(
124+
Field="a", Extracted="x", Confidence="0.00%", IsAboveThreshold="False"
125+
),
126+
ExtractionComparisonItem(
127+
Field="b", Extracted=None, Confidence="0.00%", IsAboveThreshold="False"
128+
),
129+
ExtractionComparisonItem(
130+
Field="c", Extracted="", Confidence="0.00%", IsAboveThreshold="False"
131+
),
132+
ExtractionComparisonItem(
133+
Field="d", Extracted="z", Confidence="0.00%", IsAboveThreshold="False"
134+
),
135+
]
136+
confidence = {"total_evaluated_fields_count": 0}
137+
entity, schema, min_score = SaveHandler._derive_aggregate_scores(
138+
_make_result(items=items, confidence=confidence)
139+
)
140+
# 2 of 4 fields actually filled → 0.5
141+
assert entity == 0.5
142+
assert schema == 0.5
143+
assert min_score == 0.5
144+
145+
def test_all_fields_empty_yields_zero(self):
146+
"""Genuine-empty extraction: structural fallback collapses to ``0.0``."""
147+
items = [
148+
ExtractionComparisonItem(
149+
Field="a", Extracted=None, Confidence="0.00%", IsAboveThreshold="False"
150+
),
151+
ExtractionComparisonItem(
152+
Field="b", Extracted="", Confidence="0.00%", IsAboveThreshold="False"
153+
),
154+
ExtractionComparisonItem(
155+
Field="c", Extracted=" ", Confidence="0.00%", IsAboveThreshold="False"
156+
),
157+
]
158+
confidence = {"total_evaluated_fields_count": 0}
159+
entity, schema, min_score = SaveHandler._derive_aggregate_scores(
160+
_make_result(items=items, confidence=confidence)
161+
)
162+
assert entity == 0.0
163+
assert schema == 0.0
164+
assert min_score == 0.0
165+
166+
167+
class TestZeroPath:
168+
def test_no_comparison_items_returns_zero(self):
169+
"""No extraction data at all (failed pipeline) → ``0.0``."""
170+
confidence = {
171+
"total_evaluated_fields_count": 0,
172+
"overall_confidence": 0.0,
173+
"min_extracted_field_confidence": 0.0,
174+
"zero_confidence_fields_count": 0,
175+
}
176+
entity, schema, min_score = SaveHandler._derive_aggregate_scores(
177+
_make_result(items=[], confidence=confidence)
178+
)
179+
assert entity == 0.0
180+
assert schema == 0.0
181+
assert min_score == 0.0
182+
183+
def test_genuine_zero_probabilistic_score_preserved(self):
184+
"""A real ``0`` confidence (every field below threshold) must NOT be
185+
replaced by the structural fallback — it's genuinely 0%."""
186+
items = [
187+
ExtractionComparisonItem(
188+
Field="a", Extracted="x", Confidence="0.00%", IsAboveThreshold="False"
189+
),
190+
]
191+
confidence = {
192+
"total_evaluated_fields_count": 1,
193+
"overall_confidence": 0.0,
194+
"min_extracted_field_confidence": 0.0,
195+
"zero_confidence_fields_count": 1,
196+
}
197+
entity, schema, min_score = SaveHandler._derive_aggregate_scores(
198+
_make_result(items=items, confidence=confidence)
199+
)
200+
assert entity == 0.0
201+
assert schema == 0.0
202+
assert min_score == 0.0
203+
204+
205+
class TestIsFilledValue:
206+
"""Coverage for the ``_is_filled_value`` helper used by the structural fallback."""
207+
208+
def test_none_is_empty(self):
209+
assert SaveHandler._is_filled_value(None) is False
210+
211+
def test_empty_string_is_empty(self):
212+
assert SaveHandler._is_filled_value("") is False
213+
assert SaveHandler._is_filled_value(" ") is False
214+
215+
def test_non_empty_string_is_filled(self):
216+
assert SaveHandler._is_filled_value("x") is True
217+
218+
def test_zero_int_is_filled(self):
219+
# A literal ``0`` is a valid extracted value (e.g. count fields).
220+
assert SaveHandler._is_filled_value(0) is True
221+
222+
def test_bool_is_filled(self):
223+
assert SaveHandler._is_filled_value(False) is True
224+
assert SaveHandler._is_filled_value(True) is True
225+
226+
def test_empty_container_is_empty(self):
227+
assert SaveHandler._is_filled_value([]) is False
228+
assert SaveHandler._is_filled_value({}) is False
229+
230+
def test_nested_all_null_is_empty(self):
231+
assert SaveHandler._is_filled_value({"a": None, "b": ""}) is False
232+
assert SaveHandler._is_filled_value([None, "", {"c": None}]) is False
233+
234+
def test_nested_with_value_is_filled(self):
235+
assert SaveHandler._is_filled_value({"a": None, "b": "x"}) is True
236+
assert SaveHandler._is_filled_value([None, "x"]) is True

src/ContentProcessorAPI/app/routers/models/contentprocessor/claim_process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,11 @@ class Content_Process(EntityBase):
5454
description="MIME type of the processed content file", default=None
5555
)
5656
entity_score: float = Field(
57-
description="Score indicating the quality of entity extraction from the content",
57+
description="Score indicating the quality of entity extraction from the content. For Completed runs this is either the probabilistic confidence (when logprobs are available) or a structural completeness fallback (fraction of expected fields actually filled). Failed runs and genuinely empty extractions remain at ``0.0``.",
5858
default=0.0,
5959
)
6060
schema_score: float = Field(
61-
description="Score indicating the quality of schema matching for the content",
61+
description="Score indicating the quality of schema matching for the content. For Completed runs this is either the probabilistic above-threshold ratio or a structural completeness fallback. Failed runs remain at ``0.0``.",
6262
default=0.0,
6363
)
6464
status: Optional[str] = Field(

0 commit comments

Comments
 (0)