diff --git a/services/node-agent/cmd/agent/main.go b/services/node-agent/cmd/agent/main.go
index f64dfc0..3f3b304 100644
--- a/services/node-agent/cmd/agent/main.go
+++ b/services/node-agent/cmd/agent/main.go
@@ -472,9 +472,33 @@ func (a *Agent) runJob(ctx context.Context, job *client.ClaimedJob) {
 	uploadCtx, uploadCancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer uploadCancel()
 
-	uploadInfo, urlErr := a.orch.GetOutputUploadURL(uploadCtx, job.JobID)
+	// Retry the upload-URL call: right after a CPU-heavy render the orchestrator/DB
+	// can be briefly slow, and dropping a finished render's output is the worst outcome.
+	// Backoff is linear (3s, 6s, 9s) and skipped after the final attempt; the loop also
+	// stops once uploadCtx is done, as a retry on a finished context is not expected to succeed.
+	// NOTE(review): retries share uploadCtx with the upload below, so slow attempts
+	// shrink the time left for the upload itself; confirm the 5m budget still suffices.
+	const maxURLAttempts = 4
+	var uploadInfo *client.OutputUploadURLResponse
+	var urlErr error
+urlRetry:
+	for attempt := 1; attempt <= maxURLAttempts; attempt++ {
+		uploadInfo, urlErr = a.orch.GetOutputUploadURL(uploadCtx, job.JobID)
+		if urlErr == nil {
+			break
+		}
+		log.Printf("[job %s] get upload URL attempt %d failed: %v", job.JobID, attempt, urlErr)
+		if attempt == maxURLAttempts {
+			break
+		}
+		select {
+		case <-uploadCtx.Done():
+			break urlRetry
+		case <-time.After(time.Duration(attempt*3) * time.Second):
+		}
+	}
 	if urlErr != nil {
-		log.Printf("[job %s] get upload URL failed: %v — completing without export", job.JobID, urlErr)
+		log.Printf("[job %s] get upload URL failed after retries: %v — completing without export", job.JobID, urlErr)
 	} else {
 		log.Printf("[job %s] uploading output to %s", job.JobID, uploadInfo.ObjectKey)
 		if _, upErr := runner.UploadFile(uploadCtx, uploadInfo.UploadURL, outputPath); upErr != nil {
diff --git a/services/node-agent/internal/client/client.go b/services/node-agent/internal/client/client.go
index 6a421fc..e934c52 100644
--- a/services/node-agent/internal/client/client.go
+++ b/services/node-agent/internal/client/client.go
@@ -26,7 +26,9 @@ func New(baseURL, nodeHMACSecret string) *Client {
 	return &Client{
 		base:   strings.TrimRight(baseURL, "/"),
 		secret: nodeHMACSecret,
-		http:   &http.Client{Timeout: 15 * time.Second},
+		// 60s: the post-render output-upload-url call (export insert + presign) can be
+		// slow when the DB is briefly busy right after a CPU-heavy render/transcode.
+		http:   &http.Client{Timeout: 60 * time.Second},
 	}
 }
 