Skip to content

Commit 111d5d6

Browse files
mihowclaude
andcommitted
test(jobs): cover _fail_job reason append + clarify defensive comments
Add TestFailJob with two TDD-confirmed cases: - ``test_fail_job_appends_reason_to_progress_errors`` — verifies the reason string lands in ``job.progress.errors`` (both in-memory and after refresh_from_db) so UI surfaces the cause of the FAILURE. The existing ``_fail_job`` call-site tests in ``TestProcessNatsPipelineResultError`` mock ``_fail_job`` entirely, so a regression that stops appending to ``progress.errors`` would slip through undetected. - ``test_fail_job_is_noop_on_already_final_job`` — regression guard for the early-return branch when the job is already in a final state. Also: - Comment the bare ``except Exception: pass`` around the ``progress.errors.append`` in ``_fail_job`` to explain why we swallow diagnostic-write failures. - Extend the ``AsyncJobStateManager.diagnose_missing_state`` docstring with a one-paragraph note about the SCAN cost (failure-path only, per-job fanout of at most four keys) to head off the obvious review question. Co-Authored-By: Claude <noreply@anthropic.com>
1 parent d3b09ac commit 111d5d6

3 files changed

Lines changed: 95 additions & 0 deletions

File tree

ami/jobs/tasks.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@ def _fail_job(job_id: int, reason: str) -> None:
355355
try:
356356
job.progress.errors.append(reason)
357357
except Exception:
358+
# Don't let diagnostic-write failures mask the original FAILURE.
358359
pass
359360
job.update_status(JobState.FAILURE, save=False)
360361
job.finished_at = datetime.datetime.now()

ami/jobs/tests/test_tasks.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,95 @@ def test_task_failure_marks_sync_api_job_failure_and_cleans_up(self, mock_cleanu
629629
mock_cleanup.assert_called_once()
630630

631631

632+
class TestFailJob(TransactionTestCase):
633+
"""
634+
Regression tests for ``_fail_job`` — specifically for the reason-string
635+
mirroring into ``progress.errors`` that this PR adds.
636+
637+
The FAILURE log line alone is not enough for operators; the UI reads
638+
``progress.errors``, and prior to this PR that list stayed empty on the
639+
missing-Redis-state path. Any regression that stops appending the reason
640+
(e.g. silently dropping it via the defensive ``try/except``) would put
641+
operators back in the position of digging through Celery worker logs to
642+
find out why a job died.
643+
"""
644+
645+
def setUp(self):
646+
cache.clear()
647+
self.project = Project.objects.create(name="FailJob Test Project")
648+
self.pipeline = Pipeline.objects.create(name="FailJob Pipeline", slug="fail-job-pipeline")
649+
self.pipeline.projects.add(self.project)
650+
self.collection = SourceImageCollection.objects.create(name="FailJob Collection", project=self.project)
651+
652+
def tearDown(self):
653+
cache.clear()
654+
655+
def _make_job(self, dispatch_mode: JobDispatchMode = JobDispatchMode.ASYNC_API) -> Job:
656+
job = Job.objects.create(
657+
job_type_key=MLJob.key,
658+
project=self.project,
659+
name=f"{dispatch_mode} fail-job test",
660+
pipeline=self.pipeline,
661+
source_image_collection=self.collection,
662+
dispatch_mode=dispatch_mode,
663+
)
664+
job.update_status(JobState.STARTED, save=True)
665+
return job
666+
667+
@patch("ami.ml.orchestration.jobs.cleanup_async_job_resources")
668+
def test_fail_job_appends_reason_to_progress_errors(self, mock_cleanup):
669+
"""
670+
Reason string must end up in ``job.progress.errors`` (persisted) so the
671+
UI shows the cause of the FAILURE alongside the status change. Before
672+
this PR the reason lived only in ``job.logger`` and the UI showed
673+
``errors=[]``. A silent regression here would not be caught by the
674+
``_fail_job`` call-site tests in ``TestProcessNatsPipelineResultError``
675+
(they mock ``_fail_job`` entirely).
676+
"""
677+
from ami.jobs.tasks import _fail_job
678+
679+
job = self._make_job()
680+
reason = "Job state missing from Redis (stage=process): redis=host:6379/db1 keys_for_job=<none>"
681+
682+
_fail_job(job.pk, reason)
683+
684+
job.refresh_from_db()
685+
self.assertEqual(job.status, JobState.FAILURE)
686+
self.assertIn(
687+
reason,
688+
job.progress.errors,
689+
f"expected reason in progress.errors, got: {job.progress.errors!r}",
690+
)
691+
# Sanity: the fix also propagates to the DB-persisted copy (i.e. the
692+
# update_fields tuple on job.save includes 'progress'). Re-read from a
693+
# fresh Job instance to prove the append wasn't only visible on the
694+
# in-memory object returned by select_for_update.
695+
reloaded = Job.objects.get(pk=job.pk)
696+
self.assertIn(reason, reloaded.progress.errors)
697+
mock_cleanup.assert_called_once_with(job.pk)
698+
699+
@patch("ami.ml.orchestration.jobs.cleanup_async_job_resources")
700+
def test_fail_job_is_noop_on_already_final_job(self, mock_cleanup):
701+
"""
702+
If the job is already in a final state (e.g. concurrent cleanup
703+
beat us), ``_fail_job`` must return early without touching status
704+
or progress. This protects against double-failing a job that has
705+
already been reconciled to SUCCESS by the reconciler path.
706+
"""
707+
from ami.jobs.tasks import _fail_job
708+
709+
job = self._make_job()
710+
job.update_status(JobState.SUCCESS, save=True)
711+
errors_before = list(job.progress.errors)
712+
713+
_fail_job(job.pk, "should be ignored")
714+
715+
job.refresh_from_db()
716+
self.assertEqual(job.status, JobState.SUCCESS)
717+
self.assertEqual(job.progress.errors, errors_before)
718+
mock_cleanup.assert_not_called()
719+
720+
632721
class TestResultEndpointWithError(APITestCase):
633722
"""Integration test for the result API endpoint with error results."""
634723

ami/ml/orchestration/async_job_state.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,11 @@ def diagnose_missing_state(self) -> str:
204204
eviction, and never-initialized state — instead of a single hardcoded
205205
"likely cleaned up concurrently" guess that all three collapse to.
206206
207+
Cost: the internal ``SCAN`` runs only on the failure path (once per
208+
job-lifetime FAILURE), and the per-job key fanout is at most four
209+
(pending:process, pending:results, failed, total), so the cost is
210+
negligible compared to the FAILURE branch it only helps diagnose.
211+
207212
Intentionally defensive: any failure to collect diagnostics is
208213
swallowed, because the caller is already about to fail the job and
209214
an exception from diagnostics would mask the original cause.

0 commit comments

Comments
 (0)