fix(node-agent): resilient output upload — 60s HTTP timeout + 4× retry on upload-URL

After a CPU-heavy AE render+transcode the orchestrator/DB can be briefly slow;
the 15s client timeout made the post-render output-upload-url call fail and the
finished MP4 was dropped (the job completed without export). Bump the client timeout
to 60s and retry the upload-URL call up to 4× with backoff so that a finished render's
output is not lost to a brief transient stall.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-06 18:57:09 +03:30
parent e59f07df4e
commit 62807f5f41
2 changed files with 19 additions and 3 deletions
+16 -2
View File
@@ -472,9 +472,23 @@ func (a *Agent) runJob(ctx context.Context, job *client.ClaimedJob) {
uploadCtx, uploadCancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer uploadCancel()
uploadInfo, urlErr := a.orch.GetOutputUploadURL(uploadCtx, job.JobID)
// Retry the upload-URL call: right after a CPU-heavy render the orchestrator/DB
// can be briefly slow, and dropping a finished render's output is the worst outcome.
var uploadInfo *client.OutputUploadURLResponse
var urlErr error
for attempt := 1; attempt <= 4; attempt++ {
uploadInfo, urlErr = a.orch.GetOutputUploadURL(uploadCtx, job.JobID)
if urlErr == nil {
break
}
log.Printf("[job %s] get upload URL attempt %d failed: %v", job.JobID, attempt, urlErr)
select {
case <-uploadCtx.Done():
case <-time.After(time.Duration(attempt*3) * time.Second):
}
}
if urlErr != nil {
log.Printf("[job %s] get upload URL failed: %v — completing without export", job.JobID, urlErr)
log.Printf("[job %s] get upload URL failed after retries: %v — completing without export", job.JobID, urlErr)
} else {
log.Printf("[job %s] uploading output to %s", job.JobID, uploadInfo.ObjectKey)
if _, upErr := runner.UploadFile(uploadCtx, uploadInfo.UploadURL, outputPath); upErr != nil {