fix(node-agent): resilient output upload — 60s HTTP timeout + 4× retry on upload-URL
After a CPU-heavy AE render+transcode, the orchestrator/DB can be briefly slow; the 15s client timeout made the post-render output-upload-url call fail, and the finished MP4 was dropped (the job completed without export). Bump the client timeout to 60s and retry the upload-URL call up to 4× with backoff, so that a finished render's output is not lost to a transient stall. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -472,9 +472,23 @@ func (a *Agent) runJob(ctx context.Context, job *client.ClaimedJob) {
|
|||||||
uploadCtx, uploadCancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
uploadCtx, uploadCancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||||
defer uploadCancel()
|
defer uploadCancel()
|
||||||
|
|
||||||
uploadInfo, urlErr := a.orch.GetOutputUploadURL(uploadCtx, job.JobID)
|
// Retry the upload-URL call: right after a CPU-heavy render the orchestrator/DB
|
||||||
|
// can be briefly slow, and dropping a finished render's output is the worst outcome.
|
||||||
|
var uploadInfo *client.OutputUploadURLResponse
|
||||||
|
var urlErr error
|
||||||
|
for attempt := 1; attempt <= 4; attempt++ {
|
||||||
|
uploadInfo, urlErr = a.orch.GetOutputUploadURL(uploadCtx, job.JobID)
|
||||||
|
if urlErr == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
log.Printf("[job %s] get upload URL attempt %d failed: %v", job.JobID, attempt, urlErr)
|
||||||
|
select {
|
||||||
|
case <-uploadCtx.Done():
|
||||||
|
case <-time.After(time.Duration(attempt*3) * time.Second):
|
||||||
|
}
|
||||||
|
}
|
||||||
if urlErr != nil {
|
if urlErr != nil {
|
||||||
log.Printf("[job %s] get upload URL failed: %v — completing without export", job.JobID, urlErr)
|
log.Printf("[job %s] get upload URL failed after retries: %v — completing without export", job.JobID, urlErr)
|
||||||
} else {
|
} else {
|
||||||
log.Printf("[job %s] uploading output to %s", job.JobID, uploadInfo.ObjectKey)
|
log.Printf("[job %s] uploading output to %s", job.JobID, uploadInfo.ObjectKey)
|
||||||
if _, upErr := runner.UploadFile(uploadCtx, uploadInfo.UploadURL, outputPath); upErr != nil {
|
if _, upErr := runner.UploadFile(uploadCtx, uploadInfo.UploadURL, outputPath); upErr != nil {
|
||||||
|
|||||||
@@ -26,7 +26,9 @@ func New(baseURL, nodeHMACSecret string) *Client {
|
|||||||
return &Client{
|
return &Client{
|
||||||
base: strings.TrimRight(baseURL, "/"),
|
base: strings.TrimRight(baseURL, "/"),
|
||||||
secret: nodeHMACSecret,
|
secret: nodeHMACSecret,
|
||||||
http: &http.Client{Timeout: 15 * time.Second},
|
// 60s: the post-render output-upload-url call (export insert + presign) can be
|
||||||
|
// slow when the DB is briefly busy right after a CPU-heavy render/transcode.
|
||||||
|
http: &http.Client{Timeout: 60 * time.Second},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user