fix(node-agent): resilient output upload — 60s HTTP timeout + 4× retry on upload-URL
After a CPU-heavy AE render+transcode, the orchestrator/DB can be briefly slow; the 15s client timeout made the post-render output-upload-url call fail, and the finished MP4 was dropped (the job completed without an export). Bumped the client timeout to 60s and now retry the upload-URL call up to 4× with backoff, so that a finished render's output is not lost to a transient stall.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -472,9 +472,23 @@ func (a *Agent) runJob(ctx context.Context, job *client.ClaimedJob) {
|
||||
uploadCtx, uploadCancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
defer uploadCancel()
|
||||
|
||||
uploadInfo, urlErr := a.orch.GetOutputUploadURL(uploadCtx, job.JobID)
|
||||
// Retry the upload-URL call: right after a CPU-heavy render the orchestrator/DB
|
||||
// can be briefly slow, and dropping a finished render's output is the worst outcome.
|
||||
var uploadInfo *client.OutputUploadURLResponse
|
||||
var urlErr error
|
||||
for attempt := 1; attempt <= 4; attempt++ {
|
||||
uploadInfo, urlErr = a.orch.GetOutputUploadURL(uploadCtx, job.JobID)
|
||||
if urlErr == nil {
|
||||
break
|
||||
}
|
||||
log.Printf("[job %s] get upload URL attempt %d failed: %v", job.JobID, attempt, urlErr)
|
||||
select {
|
||||
case <-uploadCtx.Done():
|
||||
case <-time.After(time.Duration(attempt*3) * time.Second):
|
||||
}
|
||||
}
|
||||
if urlErr != nil {
|
||||
log.Printf("[job %s] get upload URL failed: %v — completing without export", job.JobID, urlErr)
|
||||
log.Printf("[job %s] get upload URL failed after retries: %v — completing without export", job.JobID, urlErr)
|
||||
} else {
|
||||
log.Printf("[job %s] uploading output to %s", job.JobID, uploadInfo.ObjectKey)
|
||||
if _, upErr := runner.UploadFile(uploadCtx, uploadInfo.UploadURL, outputPath); upErr != nil {
|
||||
|
||||
@@ -26,7 +26,9 @@ func New(baseURL, nodeHMACSecret string) *Client {
|
||||
return &Client{
|
||||
base: strings.TrimRight(baseURL, "/"),
|
||||
secret: nodeHMACSecret,
|
||||
http: &http.Client{Timeout: 15 * time.Second},
|
||||
// 60s: the post-render output-upload-url call (export insert + presign) can be
|
||||
// slow when the DB is briefly busy right after a CPU-heavy render/transcode.
|
||||
http: &http.Client{Timeout: 60 * time.Second},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user