From 62807f5f41c5c7b07354fc3a3a5549259bfb7117 Mon Sep 17 00:00:00 2001 From: "soroush.asadi" Date: Sat, 6 Jun 2026 18:57:09 +0330 Subject: [PATCH] =?UTF-8?q?fix(node-agent):=20resilient=20output=20upload?= =?UTF-8?q?=20=E2=80=94=2060s=20HTTP=20timeout=20+=204=C3=97=20retry=20on?= =?UTF-8?q?=20upload-URL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a CPU-heavy AE render+transcode, the orchestrator/DB can be briefly slow; the 15s client timeout made the post-render output-upload-url call fail and the finished MP4 was dropped (the job completed without export). Bump the client timeout to 60s and retry the upload-URL call up to 4× with backoff so a finished render's output is not lost to a transient stall. Co-Authored-By: Claude Opus 4.8 --- services/node-agent/cmd/agent/main.go | 18 ++++++++++++++++-- services/node-agent/internal/client/client.go | 4 +++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/services/node-agent/cmd/agent/main.go b/services/node-agent/cmd/agent/main.go index f64dfc0..3f3b304 100644 --- a/services/node-agent/cmd/agent/main.go +++ b/services/node-agent/cmd/agent/main.go @@ -472,9 +472,23 @@ func (a *Agent) runJob(ctx context.Context, job *client.ClaimedJob) { uploadCtx, uploadCancel := context.WithTimeout(context.Background(), 5*time.Minute) defer uploadCancel() - uploadInfo, urlErr := a.orch.GetOutputUploadURL(uploadCtx, job.JobID) + // Retry the upload-URL call: right after a CPU-heavy render the orchestrator/DB + // can be briefly slow, and dropping a finished render's output is the worst outcome. 
+ var uploadInfo *client.OutputUploadURLResponse + var urlErr error + for attempt := 1; attempt <= 4; attempt++ { + uploadInfo, urlErr = a.orch.GetOutputUploadURL(uploadCtx, job.JobID) + if urlErr == nil { + break + } + log.Printf("[job %s] get upload URL attempt %d failed: %v", job.JobID, attempt, urlErr) + select { + case <-uploadCtx.Done(): + case <-time.After(time.Duration(attempt*3) * time.Second): + } + } if urlErr != nil { - log.Printf("[job %s] get upload URL failed: %v — completing without export", job.JobID, urlErr) + log.Printf("[job %s] get upload URL failed after retries: %v — completing without export", job.JobID, urlErr) } else { log.Printf("[job %s] uploading output to %s", job.JobID, uploadInfo.ObjectKey) if _, upErr := runner.UploadFile(uploadCtx, uploadInfo.UploadURL, outputPath); upErr != nil { diff --git a/services/node-agent/internal/client/client.go b/services/node-agent/internal/client/client.go index 6a421fc..e934c52 100644 --- a/services/node-agent/internal/client/client.go +++ b/services/node-agent/internal/client/client.go @@ -26,7 +26,9 @@ func New(baseURL, nodeHMACSecret string) *Client { return &Client{ base: strings.TrimRight(baseURL, "/"), secret: nodeHMACSecret, - http: &http.Client{Timeout: 15 * time.Second}, + // 60s: the post-render output-upload-url call (export insert + presign) can be + // slow when the DB is briefly busy right after a CPU-heavy render/transcode. + http: &http.Client{Timeout: 60 * time.Second}, } }