Skip to content

Commit eb5ed41

Browse files
authored
[cuebot] Recover lost dependencies (#2219)
Add a scheduled jobs to run every 10 minutes to search for dependencies that should have been cleared and are still active. Cuebot's dependency system relies solely on the FrameCompleteReport being successfully processed, and with no recover logic, there's a great risk of dangling dependencies, that usually require manual intervention to clear dependencies and get frames rolling. **Attention** Three new properties have been added to opencue.properties, and one of them (`maintenance.stuck_dependency_recovery_interval_ms`) is required: ``` # Enable/disable the stuck dependency recovery maintenance task. maintenance.stuck_dependency_recovery_enabled=true # Interval in milliseconds for stuck dependency recovery task. # Default is 10 minutes (600000 ms). This task finds and fixes frames # stuck in DEPEND state due to transient failures during dependency satisfaction. maintenance.stuck_dependency_recovery_interval_ms=600000 # Maximum number of depends/frames to process per recovery run. maintenance.stuck_dependency_recovery_batch_size=1000 ``` ### LLM Disclosure Claude Opus model was used to brainstorm this PR, optimize queries and review the work. All content has been fully reviewed and tested by the author (me).
1 parent 7e268fb commit eb5ed41

10 files changed

Lines changed: 227 additions & 2 deletions

File tree

VERSION.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.17
1+
1.18

cuebot/src/main/java/com/imageworks/spcue/MaintenanceTask.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,5 +49,10 @@ public enum MaintenanceTask {
4949
/**
5050
* Lock the subscription recalculation task.
5151
*/
52-
LOCK_SUBSCRIPTION_RECALCULATION
52+
LOCK_SUBSCRIPTION_RECALCULATION,
53+
54+
/**
55+
* Lock the stuck dependency recovery task.
56+
*/
57+
LOCK_STUCK_DEPENDENCY_RECOVERY
5358
}

cuebot/src/main/java/com/imageworks/spcue/dao/MaintenanceDao.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
package com.imageworks.spcue.dao;
1717

18+
import java.util.List;
19+
1820
import com.imageworks.spcue.MaintenanceTask;
1921

2022
/**
@@ -63,4 +65,24 @@ public interface MaintenanceDao {
6365
*/
6466
void recalculateSubscriptions();
6567

68+
/**
69+
* Finds active depends whose depended-on entity (job, layer, or frame) is already complete.
70+
* These are stale depends that should have been satisfied but weren't due to transient
71+
* failures.
72+
*
73+
* @param limit maximum number of depend IDs to return
74+
* @return list of pk_depend values for stale active depends
75+
*/
76+
List<String> findStaleDependIds(int limit);
77+
78+
/**
79+
* Fixes frames stuck in DEPEND state by setting int_depend_count to 0 for frames that have no
80+
* active depends targeting them. The DB trigger update_frame_dep_to_wait handles the DEPEND to
81+
* WAITING state transition automatically.
82+
*
83+
* @param limit maximum number of frames to fix per invocation
84+
* @return number of frames fixed
85+
*/
86+
int fixStuckDependCounts(int limit);
87+
6688
}

cuebot/src/main/java/com/imageworks/spcue/dao/postgres/MaintenanceDaoJdbc.java

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
package com.imageworks.spcue.dao.postgres;
1717

18+
import java.util.List;
19+
1820
import org.springframework.jdbc.core.support.JdbcDaoSupport;
1921

2022
import com.imageworks.spcue.MaintenanceTask;
@@ -62,4 +64,95 @@ public void unlockTask(MaintenanceTask task) {
6264
public void recalculateSubscriptions() {
6365
getJdbcTemplate().execute(RECALCULATE_SUBS);
6466
}
67+
68+
// spotless:off
69+
/**
70+
* Finds active, non-composite depends whose depended-upon entity has already completed
71+
* successfully. Composite depends (b_composite = true) are parent depends that are
72+
* automatically satisfied when all their child depends are satisfied, so they should
73+
* not be resolved directly.
74+
*
75+
* Section 1: Job-level depends (JOB_ON_JOB, LAYER_ON_JOB, FRAME_ON_JOB) where
76+
* every frame in the depended-upon job has SUCCEEDED or been EATEN.
77+
* Section 2: Layer-level depends (JOB_ON_LAYER, LAYER_ON_LAYER, FRAME_ON_LAYER) where
78+
* every frame in the depended-upon layer has SUCCEEDED or been EATEN.
79+
* Section 3: Frame-level depends (JOB_ON_FRAME, LAYER_ON_FRAME, FRAME_ON_FRAME) where
80+
* the specific depended-upon frame has SUCCEEDED or been EATEN.
81+
*/
82+
private static final String FIND_STALE_DEPEND_IDS =
83+
// Section 1: Job-level depends where all frames in the job succeeded or were eaten.
84+
// Pre-filter on job.str_state = 'FINISHED' leverages the i_job_str_state index to
85+
// skip depends on active jobs. A FINISHED job may still have DEAD frames, so we
86+
// also verify frame states via NOT EXISTS.
87+
"SELECT pk_depend FROM depend d "
88+
+ "JOIN job j ON j.pk_job = d.pk_job_depend_on AND j.str_state = 'FINISHED' "
89+
+ "WHERE d.b_active = true "
90+
+ "AND d.b_composite = false "
91+
+ "AND d.pk_layer_depend_on IS NULL "
92+
+ "AND d.pk_frame_depend_on IS NULL "
93+
+ "AND d.str_type IN ('JOB_ON_JOB', 'LAYER_ON_JOB', 'FRAME_ON_JOB') "
94+
+ "AND NOT EXISTS ("
95+
+ "SELECT 1 FROM frame f "
96+
+ "WHERE f.pk_job = d.pk_job_depend_on "
97+
+ "AND f.str_state NOT IN ('SUCCEEDED', 'EATEN')) "
98+
99+
+ "UNION ALL "
100+
101+
// Section 2: Layer-level depends where all frames in the layer succeeded or were eaten.
102+
+ "SELECT pk_depend FROM depend d "
103+
+ "WHERE d.b_active = true "
104+
+ "AND d.b_composite = false "
105+
+ "AND d.pk_layer_depend_on IS NOT NULL "
106+
+ "AND d.str_type IN ('JOB_ON_LAYER', 'LAYER_ON_LAYER', 'FRAME_ON_LAYER') "
107+
+ "AND NOT EXISTS ("
108+
+ "SELECT 1 FROM frame f "
109+
+ "WHERE f.pk_layer = d.pk_layer_depend_on "
110+
+ "AND f.str_state NOT IN ('SUCCEEDED', 'EATEN')) "
111+
112+
+ "UNION ALL "
113+
114+
// Section 3: Frame-level depends where the specific frame succeeded or was eaten.
115+
+ "SELECT pk_depend FROM depend d "
116+
+ "WHERE d.b_active = true "
117+
+ "AND d.b_composite = false "
118+
+ "AND d.pk_frame_depend_on IS NOT NULL "
119+
+ "AND d.str_type IN ('JOB_ON_FRAME', 'LAYER_ON_FRAME', 'FRAME_ON_FRAME') "
120+
+ "AND EXISTS ("
121+
+ "SELECT 1 FROM frame f "
122+
+ "WHERE f.pk_frame = d.pk_frame_depend_on "
123+
+ "AND f.str_state IN ('SUCCEEDED', 'EATEN')) "
124+
125+
+ "LIMIT ?";
126+
// spotless:on
127+
128+
@Override
129+
public List<String> findStaleDependIds(int limit) {
130+
return getJdbcTemplate().queryForList(FIND_STALE_DEPEND_IDS, String.class, limit);
131+
}
132+
133+
// spotless:off
134+
private static final String FIX_STUCK_DEPEND_COUNTS =
135+
"UPDATE frame SET int_depend_count = 0 "
136+
+ "WHERE pk_frame IN ("
137+
+ "SELECT f.pk_frame FROM frame f "
138+
+ "WHERE f.str_state = 'DEPEND' "
139+
+ "AND f.int_depend_count > 0 "
140+
+ "AND NOT EXISTS ("
141+
+ "SELECT 1 FROM depend d "
142+
+ "WHERE d.b_active = true "
143+
+ "AND d.pk_job_depend_er = f.pk_job "
144+
+ "AND ("
145+
+ "d.pk_frame_depend_er = f.pk_frame "
146+
+ "OR (d.pk_layer_depend_er = f.pk_layer AND d.pk_frame_depend_er IS NULL) "
147+
+ "OR (d.pk_layer_depend_er IS NULL AND d.pk_frame_depend_er IS NULL)"
148+
+ ")"
149+
+ ") "
150+
+ "LIMIT ?"
151+
+ ")";
152+
// spotless:on
153+
154+
@Override
155+
public int fixStuckDependCounts(int limit) {
156+
return getJdbcTemplate().update(FIX_STUCK_DEPEND_COUNTS, limit);
157+
}
65158
}

cuebot/src/main/java/com/imageworks/spcue/service/MaintenanceManagerSupport.java

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
import com.imageworks.spcue.FrameDetail;
2929
import com.imageworks.spcue.FrameInterface;
30+
import com.imageworks.spcue.LightweightDependency;
3031
import com.imageworks.spcue.MaintenanceTask;
3132
import com.imageworks.spcue.PointDetail;
3233
import com.imageworks.spcue.VirtualProc;
@@ -63,6 +64,8 @@ public class MaintenanceManagerSupport {
6364

6465
private DepartmentManager departmentManager;
6566

67+
private DependManager dependManager;
68+
6669
private static final long WAIT_FOR_HOST_REPORTS_MS = 600000;
6770

6871
private static final int CHECKPOINT_MAX_WAIT_SEC = 300;
@@ -243,6 +246,61 @@ public void recalculateSubscriptions() {
243246
}
244247
}
245248

249+
/**
250+
* Recovers frames stuck in DEPEND state due to transient failures during dependency
251+
* satisfaction. Runs in two phases:
252+
*
253+
* Phase 1: Finds active depends whose depended-on entity is already complete and satisfies them
254+
* through the normal code path (handles composite depends and publishes events). Each
255+
* {@code satisfyDepend} call runs in its own transaction via {@code DependManagerService}'s
256+
* {@code @Transactional}, providing per-depend isolation.
257+
*
258+
* Phase 2: Fixes frames still stuck in DEPEND by resetting int_depend_count to 0 where no
259+
* active depends target them. The DB trigger {@code update_frame_dep_to_wait} handles the
260+
* DEPEND to WAITING transition. This UPDATE auto-commits as a single statement transaction.
261+
*/
262+
public void recoverStuckDependencies() {
263+
if (!env.getProperty("maintenance.stuck_dependency_recovery_enabled", Boolean.class,
264+
true)) {
265+
return;
266+
}
267+
268+
// Use a MaintenanceTask lock to prevent multiple instances from racing each other
269+
if (!maintenanceDao.lockTask(MaintenanceTask.LOCK_STUCK_DEPENDENCY_RECOVERY)) {
270+
return;
271+
}
272+
try {
273+
int batchSize = env.getProperty("maintenance.stuck_dependency_recovery_batch_size",
274+
Integer.class, 1000);
275+
276+
// Phase 1: Satisfy stale active depends through normal code path
277+
List<String> staleDependIds = maintenanceDao.findStaleDependIds(batchSize);
278+
int satisfiedCount = 0;
279+
for (String dependId : staleDependIds) {
280+
try {
281+
LightweightDependency depend = dependManager.getDepend(dependId);
282+
dependManager.satisfyDepend(depend);
283+
satisfiedCount++;
284+
} catch (Exception e) {
285+
logger.warn("failed to satisfy stale depend " + dependId + ": " + e);
286+
}
287+
}
288+
if (satisfiedCount > 0) {
289+
logger.info("recovered " + satisfiedCount + " stale active depends");
290+
}
291+
292+
// Phase 2: Fix count mismatches via SQL
293+
int fixedFrames = maintenanceDao.fixStuckDependCounts(batchSize);
294+
if (fixedFrames > 0) {
295+
logger.info("fixed depend counts for " + fixedFrames + " stuck frames");
296+
}
297+
} catch (Exception e) {
298+
logger.warn("failed to recover stuck dependencies: " + e);
299+
} finally {
300+
maintenanceDao.unlockTask(MaintenanceTask.LOCK_STUCK_DEPENDENCY_RECOVERY);
301+
}
302+
}
303+
246304
public FrameDao getFrameDao() {
247305
return frameDao;
248306
}
@@ -303,4 +361,12 @@ public void setJobManager(JobManager jobManager) {
303361
this.jobManager = jobManager;
304362
}
305363

364+
public DependManager getDependManager() {
365+
return dependManager;
366+
}
367+
368+
public void setDependManager(DependManager dependManager) {
369+
this.dependManager = dependManager;
370+
}
371+
306372
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
-- Add task_lock entry for stuck dependency recovery maintenance task.
2+
-- This task periodically detects and fixes frames stuck in DEPEND state
3+
-- due to transient failures during dependency satisfaction.
4+
5+
INSERT INTO task_lock (pk_task_lock, str_name, int_lock, int_timeout)
6+
VALUES ('00000000-0000-0000-0000-000000000007', 'LOCK_STUCK_DEPENDENCY_RECOVERY', 0, 600);

cuebot/src/main/resources/conf/ddl/postgres/seed_data.sql

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,3 +84,5 @@ Insert into TASK_LOCK (PK_TASK_LOCK,STR_NAME,INT_LOCK,INT_TIMEOUT) values ('0000
8484
Insert into TASK_LOCK (PK_TASK_LOCK,STR_NAME,INT_LOCK,INT_TIMEOUT) values ('00000000-0000-0000-0000-000000000005','LOCK_TASK_UPDATE',1240618998852,3600);
8585

8686
Insert into TASK_LOCK (PK_TASK_LOCK,STR_NAME,INT_LOCK,INT_TIMEOUT) values ('00000000-0000-0000-0000-000000000006','LOCK_SUBSCRIPTION_RECALCULATION',0,7200);
87+
88+
Insert into TASK_LOCK (PK_TASK_LOCK,STR_NAME,INT_LOCK,INT_TIMEOUT) values ('00000000-0000-0000-0000-000000000007','LOCK_STUCK_DEPENDENCY_RECOVERY',0,600);

cuebot/src/main/resources/conf/spring/applicationContext-service.xml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,7 @@
422422
<property name="historicalSupport" ref="historicalSupport" />
423423
<property name="departmentManager" ref="departmentManager" />
424424
<property name="jobManager" ref="jobManager" />
425+
<property name="dependManager" ref="dependManager" />
425426
</bean>
426427

427428
<bean id="hardwareCheck" class="org.springframework.scheduling.quartz.MethodInvokingJobDetailFactoryBean">
@@ -604,6 +605,19 @@
604605
<property name="repeatInterval" value="${maintenance.subscription_recalculation_interval_ms}" />
605606
</bean>
606607

608+
<bean id="stuckDependencyRecovery" class="org.springframework.scheduling.quartz.MethodInvokingJobDetailFactoryBean">
609+
<property name="targetObject" ref="maintenanceManager" />
610+
<property name="targetMethod" value="recoverStuckDependencies" />
611+
</bean>
612+
613+
<bean id="stuckDependencyRecoveryTrigger" class="org.springframework.scheduling.quartz.SimpleTriggerFactoryBean">
614+
<property name="jobDetail" ref="stuckDependencyRecovery" />
615+
<!-- 10 minute start delay to let the cue get up and running -->
616+
<property name="startDelay" value="600000" />
617+
<!-- repeat every 10 minutes by default (configurable via maintenance.stuck_dependency_recovery_interval_ms) -->
618+
<property name="repeatInterval" value="${maintenance.stuck_dependency_recovery_interval_ms}" />
619+
</bean>
620+
607621
<bean class="org.springframework.scheduling.quartz.SchedulerFactoryBean" destroy-method="destroy">
608622
<property name="waitForJobsToCompleteOnShutdown"><value>false</value></property>
609623
<property name="triggers">
@@ -621,6 +635,7 @@
621635
<ref bean="updateShowsStatusTrigger" />
622636
<ref bean="collectPrometheusMetricsTrigger" />
623637
<ref bean="subscriptionRecalculationTrigger" />
638+
<ref bean="stuckDependencyRecoveryTrigger" />
624639
</list>
625640
</property>
626641
</bean>

cuebot/src/main/resources/opencue.properties

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,17 @@ maintenance.auto_delete_down_hosts=false
187187
# accountability issues that can occur at large scale.
188188
maintenance.subscription_recalculation_interval_ms=7200000
189189

190+
# Enable/disable the stuck dependency recovery maintenance task.
191+
maintenance.stuck_dependency_recovery_enabled=true
192+
193+
# Interval in milliseconds for stuck dependency recovery task.
194+
# Default is 10 minutes (600000 ms). This task finds and fixes frames
195+
# stuck in DEPEND state due to transient failures during dependency satisfaction.
196+
maintenance.stuck_dependency_recovery_interval_ms=600000
197+
198+
# Maximum number of depends/frames to process per recovery run.
199+
maintenance.stuck_dependency_recovery_batch_size=1000
200+
190201
# Set hostname/IP of the smtp host. Will be used for mailing
191202
smtp_host=smtp
192203

cuebot/src/test/resources/opencue.properties

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,8 @@ log.loki.url = http://localhost/loki/api
7979

8080
# Interval in milliseconds for subscription recalculation task.
8181
maintenance.subscription_recalculation_interval_ms=7200000
82+
83+
# Interval in milliseconds for stuck dependency recovery task.
84+
# Default is 10 minutes (600000 ms). This task finds and fixes frames
85+
# stuck in DEPEND state due to transient failures during dependency satisfaction.
86+
maintenance.stuck_dependency_recovery_interval_ms=600000

0 commit comments

Comments
 (0)