feat(render-svc+node-agent): add job-claim endpoint and build node-agent skeleton

render-svc:
- db: ClaimJob() — atomic SELECT FOR UPDATE SKIP LOCKED; transitions job to
  Preparing, marks node Busy in a single transaction
- models: ClaimJobRequest + ClaimedJob types
- handlers/internal: POST /v1/internal/render/jobs/claim — 200 with job or 204 when queue empty
- main: register the claim route under /v1/internal (nodeAuth)

services/node-agent/ (new Go module github.com/flatrender/node-agent):
- internal/config: env-var based config (NODE_ID required, sensible defaults)
- internal/client: typed orchestrator HTTP client (Online, Heartbeat, ClaimJob,
  Complete, Fail, ReportCrash) — X-Node-Signature auth
- internal/runner: AE render via aerender.exe or mock (for dev without AE)
- cmd/agent/main: register online → heartbeat loop (5s) + poll loop (3s) →
  claim job → run render → report complete/fail; health endpoint on :7777
- Dockerfile: cross-compiles to Windows amd64 static binary

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-01 09:28:31 +03:30
parent 541e935418
commit ee421ccc68
10 changed files with 901 additions and 0 deletions
@@ -0,0 +1,233 @@
// Package client provides a typed HTTP client for the V2 render orchestrator's
// internal (node-agent) API. Every request carries the shared node secret,
// unmodified, in the X-Node-Signature header.
package client
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
)
// Client talks to the V2 render orchestrator.
// Build one with New: the zero value has a nil http field and cannot send
// requests.
type Client struct {
	// base is the orchestrator base URL that request paths are appended to.
	base string
	// secret is the value placed in the X-Node-Signature header.
	secret string
	// http is the underlying HTTP client used for every request.
	http *http.Client
}
// New builds a Client for the orchestrator at baseURL
// (e.g. "http://gateway:8080"). Trailing slashes on baseURL are dropped so
// that request paths can be appended directly. nodeHMACSecret is the value
// sent in the X-Node-Signature header, and every request made through the
// returned client times out after 15 seconds.
func New(baseURL, nodeHMACSecret string) *Client {
	httpClient := &http.Client{Timeout: 15 * time.Second}

	c := new(Client)
	c.base = strings.TrimRight(baseURL, "/")
	c.secret = nodeHMACSecret
	c.http = httpClient
	return c
}
// ── Request helpers ───────────────────────────────────────────────────────────
// do sends a single request to the orchestrator and returns the raw response.
//
// path is appended to the client's base URL. A non-nil body is JSON-encoded
// and sent with Content-Type: application/json; a nil body sends no payload.
// Every request carries the X-Node-Signature and Accept headers. The caller
// owns the returned response and must close its body.
func (c *Client) do(ctx context.Context, method, path string, body any) (*http.Response, error) {
	hasBody := body != nil

	var payload io.Reader
	if hasBody {
		encoded, err := json.Marshal(body)
		if err != nil {
			return nil, fmt.Errorf("marshal: %w", err)
		}
		payload = bytes.NewReader(encoded)
	}

	req, err := http.NewRequestWithContext(ctx, method, c.base+path, payload)
	if err != nil {
		return nil, err
	}

	headers := req.Header
	headers.Set("X-Node-Signature", c.secret)
	if hasBody {
		headers.Set("Content-Type", "application/json")
	}
	headers.Set("Accept", "application/json")

	return c.http.Do(req)
}
// decodeJSON always closes resp.Body. When out is non-nil the body is first
// decoded as JSON into out and any decode error is returned; when out is nil
// the body is closed unread and nil is returned.
func decodeJSON(resp *http.Response, out any) error {
	body := resp.Body
	defer body.Close()

	if out != nil {
		dec := json.NewDecoder(body)
		return dec.Decode(out)
	}
	return nil
}
// ── Domain types ──────────────────────────────────────────────────────────────
// OnlineRequest is sent once on startup to mark the node Ready.
// Pointer fields tagged omitempty are left out of the JSON when nil. The slice
// fields carry no omitempty, so a nil slice is encoded as JSON null.
type OnlineRequest struct {
	// NodeAgentVersion is the version string of the agent itself.
	NodeAgentVersion string `json:"node_agent_version"`
	// CurrentAEVersion is the After Effects version currently in use.
	CurrentAEVersion string `json:"current_ae_version"`
	// AvailableAEVersions lists the After Effects versions the node offers.
	AvailableAEVersions []string `json:"available_ae_versions"`
	// RamGB, CPUCores and CacheUsedGB are optional hardware/cache figures.
	RamGB *int `json:"ram_gb,omitempty"`
	CPUCores *int `json:"cpu_cores,omitempty"`
	CacheUsedGB *int `json:"cache_used_gb,omitempty"`
	// CachedTemplateMD5s is presumably the MD5 digests of templates cached on
	// the node — confirm against the orchestrator's schema.
	CachedTemplateMD5s []string `json:"cached_template_md5s"`
}
// HeartbeatRequest is sent every HeartbeatIntervalSec seconds.
// All pointer fields are optional and are omitted from the JSON when nil.
type HeartbeatRequest struct {
	// NodeID identifies the reporting node in the request body.
	NodeID string `json:"node_id"`
	Status string `json:"status"` // Ready | Busy
	// CPUPct and RAMAvailableMB are optional load figures.
	CPUPct *int `json:"cpu_pct,omitempty"`
	RAMAvailableMB *int `json:"ram_available_mb,omitempty"`
	// AERunning optionally reports whether After Effects is running.
	AERunning *bool `json:"ae_running,omitempty"`
	// CurrentJobID optionally names the job the node is working on.
	CurrentJobID *string `json:"current_job_id,omitempty"`
	// CacheUsedGB optionally reports cache usage.
	CacheUsedGB *int `json:"cache_used_gb,omitempty"`
}
// HeartbeatResponse carries optional commands from the orchestrator.
type HeartbeatResponse struct {
	// NextHeartbeatInSec is the orchestrator-supplied interval hint; it stays 0
	// when the response body does not include the field.
	NextHeartbeatInSec int `json:"next_heartbeat_in_sec"`
	// PendingCommands is decoded as untyped JSON values; no command schema is
	// defined in this package.
	PendingCommands []any `json:"pending_commands"`
}
// ClaimJobRequest asks the orchestrator for the next queued job.
type ClaimJobRequest struct {
	// NodeID identifies the node claiming the job.
	NodeID string `json:"node_id"`
	// Region is the node's region label; it is omitted from the JSON when empty.
	Region string `json:"region,omitempty"`
}
// ClaimedJob is the response when a job is successfully claimed.
// ClaimJob decodes it from the body of a successful claim response.
type ClaimedJob struct {
	// JobID identifies the claimed render job.
	JobID string `json:"job_id"`
	// SavedProjectID identifies the saved project the job renders.
	SavedProjectID string `json:"saved_project_id"`
	// Quality and Resolution are orchestrator-defined strings; their allowed
	// values are not specified in this package.
	Quality string `json:"quality"`
	Resolution string `json:"resolution"`
	// FrameRate is the requested frame rate (logged as "fps" by the runner).
	FrameRate int `json:"frame_rate"`
	// HasMusic and HasVoiceover flag the presence of audio tracks.
	HasMusic bool `json:"has_music"`
	HasVoiceover bool `json:"has_voiceover"`
}
// ProgressRequest reports render progress (frame-level) for a job.
// NOTE(review): no Client method visible in this file sends this type yet.
type ProgressRequest struct {
	// FrameJobID identifies the frame-level job being reported.
	FrameJobID string `json:"frame_job_id"`
	// FrameNumber is the frame the report refers to.
	FrameNumber int `json:"frame_number"`
	// CompletedAt is optional and omitted from the JSON when nil.
	CompletedAt *time.Time `json:"completed_at,omitempty"`
}
// CompleteRequest marks a job as Done.
type CompleteRequest struct {
	// ExportID is optional; a nil pointer is omitted, so the body becomes {}.
	ExportID *string `json:"export_id,omitempty"`
}
// FailRequest marks a job as Failed.
type FailRequest struct {
	// Reason is the failure description; it is always present in the JSON.
	Reason string `json:"reason"`
	// AtStep names the step at which the job failed; omitted when empty.
	AtStep string `json:"at_step,omitempty"`
}
// CrashRequest reports a node crash.
// All pointer fields are optional and are omitted from the JSON when nil.
type CrashRequest struct {
	// NodeID identifies the node that crashed.
	NodeID string `json:"node_id"`
	// LastKnownFrame is the last frame known before the crash, if any.
	LastKnownFrame *int `json:"last_known_frame,omitempty"`
	// CrashSignal optionally carries the signal associated with the crash.
	CrashSignal *string `json:"crash_signal,omitempty"`
	// ErrorLogTail optionally carries the tail of the error log.
	ErrorLogTail *string `json:"error_log_tail,omitempty"`
}
// ── API methods ───────────────────────────────────────────────────────────────
// Online marks the node as Ready on startup by POSTing req to
// /v1/internal/nodes/{nodeID}/online.
//
// It returns the transport error if the request could not be made, or an
// "online: HTTP <code>" error for any response status of 300 or above.
func (c *Client) Online(ctx context.Context, nodeID string, req OnlineRequest) error {
	endpoint := "/v1/internal/nodes/" + nodeID + "/online"

	resp, err := c.do(ctx, http.MethodPost, endpoint, req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if status := resp.StatusCode; status >= 300 {
		return fmt.Errorf("online: HTTP %d", status)
	}
	return nil
}
// Heartbeat sends a heartbeat and returns the orchestrator's response.
//
// req is POSTed to /v1/internal/nodes/{nodeID}/heartbeat. A transport failure
// is returned as-is and any status of 300 or above becomes a
// "heartbeat: HTTP <code>" error. A successful response with an empty body
// yields a zero-valued HeartbeatResponse; a body that is present but is not
// valid JSON is reported as a "heartbeat decode" error rather than being
// silently treated as an empty response.
func (c *Client) Heartbeat(ctx context.Context, nodeID string, req HeartbeatRequest) (*HeartbeatResponse, error) {
	resp, err := c.do(ctx, http.MethodPost,
		fmt.Sprintf("/v1/internal/nodes/%s/heartbeat", nodeID), req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		return nil, fmt.Errorf("heartbeat: HTTP %d", resp.StatusCode)
	}

	var out HeartbeatResponse
	// io.EOF means the body was empty, which is a valid acknowledgement with
	// no commands. Anything else means the payload was corrupt or came from
	// something other than the orchestrator, so surface it to the caller.
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil && err != io.EOF {
		return nil, fmt.Errorf("heartbeat decode: %w", err)
	}
	return &out, nil
}
// ClaimJob asks the orchestrator to claim the next queued render job for
// nodeID by POSTing to /v1/internal/render/jobs/claim.
//
// Results:
//   - (nil, nil) when the queue is empty (204 No Content);
//   - (job, nil) when a job was claimed and its body decoded;
//   - (nil, err) on a transport failure, a status of 300 or above
//     ("claim: HTTP <code>"), or an undecodable body ("claim decode: ...").
func (c *Client) ClaimJob(ctx context.Context, nodeID, region string) (*ClaimedJob, error) {
	payload := ClaimJobRequest{NodeID: nodeID, Region: region}

	resp, err := c.do(ctx, http.MethodPost, "/v1/internal/render/jobs/claim", payload)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	switch status := resp.StatusCode; {
	case status == http.StatusNoContent:
		// Nothing queued.
		return nil, nil
	case status >= 300:
		return nil, fmt.Errorf("claim: HTTP %d", status)
	}

	claimed := new(ClaimedJob)
	if err := json.NewDecoder(resp.Body).Decode(claimed); err != nil {
		return nil, fmt.Errorf("claim decode: %w", err)
	}
	return claimed, nil
}
// Complete marks the render job jobID as Done by POSTing to
// /v1/internal/render/jobs/{jobID}/complete. exportID is optional; when nil it
// is left out of the request body.
//
// It returns the transport error if the request could not be made, or a
// "complete: HTTP <code>" error for any response status of 300 or above.
func (c *Client) Complete(ctx context.Context, jobID string, exportID *string) error {
	var (
		endpoint = "/v1/internal/render/jobs/" + jobID + "/complete"
		payload  = CompleteRequest{ExportID: exportID}
	)

	resp, err := c.do(ctx, http.MethodPost, endpoint, payload)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 300 {
		return nil
	}
	return fmt.Errorf("complete: HTTP %d", resp.StatusCode)
}
// Fail marks the render job jobID as Failed by POSTing reason and atStep to
// /v1/internal/render/jobs/{jobID}/fail. An empty atStep is left out of the
// request body.
//
// It returns the transport error if the request could not be made, or a
// "fail: HTTP <code>" error for any response status of 300 or above.
func (c *Client) Fail(ctx context.Context, jobID, reason, atStep string) error {
	payload := FailRequest{Reason: reason, AtStep: atStep}
	endpoint := "/v1/internal/render/jobs/" + jobID + "/fail"

	resp, err := c.do(ctx, http.MethodPost, endpoint, payload)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	code := resp.StatusCode
	if code >= 300 {
		return fmt.Errorf("fail: HTTP %d", code)
	}
	return nil
}
// ReportCrash reports a node crash for the job jobID by POSTing req to
// /v1/internal/render/jobs/{jobID}/crash.
//
// It returns the transport error if the request could not be made, or a
// "crash: HTTP <code>" error for any response status of 300 or above.
func (c *Client) ReportCrash(ctx context.Context, jobID string, req CrashRequest) error {
	endpoint := "/v1/internal/render/jobs/" + jobID + "/crash"

	resp, err := c.do(ctx, http.MethodPost, endpoint, req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 300 {
		return nil
	}
	return fmt.Errorf("crash: HTTP %d", resp.StatusCode)
}
@@ -0,0 +1,89 @@
// Package config loads node-agent runtime configuration from environment variables.
package config
import (
"fmt"
"os"
"strconv"
)
// Config holds all runtime settings for the node agent.
// Load fills every field from an environment variable; the variable name and
// the fallback used when it is unset or empty are noted on each field.
type Config struct {
	// NodeID is the UUID of this render node, registered in the orchestrator.
	// Must match a row in render.render_nodes.
	// Env: NODE_ID (required — Load returns an error when it is empty).
	NodeID string
	// OrchestratorURL is the base URL of the V2 API gateway (internal network).
	// Example: http://gateway:8080 or http://172.30.0.5:8088
	// Env: ORCHESTRATOR_URL, fallback "http://localhost:8088".
	OrchestratorURL string
	// NodeHMACSecret is the shared secret sent as X-Node-Signature header.
	// Must match NODE_HMAC_SECRET in the render-svc environment.
	// Env: NODE_HMAC_SECRET, fallback "node-secret-change-me" (a placeholder;
	// set a real secret outside development).
	NodeHMACSecret string
	// Region is the datacenter/region label for this node (e.g. "iran-tehran-1").
	// The orchestrator uses it to route region-preferred jobs to this node.
	// Env: NODE_REGION, fallback "" (no region).
	Region string
	// AEPath is the full path to the aerender.exe binary.
	// Example: C:\Program Files\Adobe\Adobe After Effects 2024\Support Files\aerender.exe
	// Leave empty to use mock rendering (for development / testing without AE).
	// Env: AE_PATH, fallback "".
	AEPath string
	// WorkDir is the scratch directory for render temp files and AE project copies.
	// Env: WORK_DIR, fallback os.TempDir().
	WorkDir string
	// HeartbeatIntervalSec is how often the agent sends a heartbeat to the orchestrator.
	// Env: HEARTBEAT_INTERVAL_SEC, fallback 5. A non-integer value also falls
	// back to 5; the value is not range-checked.
	HeartbeatIntervalSec int
	// PollIntervalSec is how long the agent waits between job-claim attempts when idle.
	// Env: POLL_INTERVAL_SEC, fallback 3. A non-integer value also falls back
	// to 3; the value is not range-checked.
	PollIntervalSec int
	// AgentVersion is the semantic version string reported to the orchestrator.
	// Env: AGENT_VERSION, fallback "0.1.0".
	AgentVersion string
	// AEVersion is the After Effects version string reported to the orchestrator.
	// Example: "2024"
	// Env: AE_VERSION, fallback "2024".
	AEVersion string
	// ListenPort is the port for the agent's own HTTP health endpoint.
	// Env: LISTEN_PORT, fallback 7777.
	ListenPort int
}
// Load builds a Config from the process environment.
//
// NODE_ID is the only required variable; when it is unset or empty Load
// returns a nil Config and an error. Every other setting falls back to a
// built-in default when its variable is unset or empty (integer settings also
// fall back when the value does not parse as an integer).
func Load() (*Config, error) {
	nodeID := os.Getenv("NODE_ID")
	if nodeID == "" {
		return nil, fmt.Errorf("NODE_ID environment variable is required")
	}

	cfg := new(Config)
	cfg.NodeID = nodeID

	// Orchestrator connection.
	cfg.OrchestratorURL = getEnv("ORCHESTRATOR_URL", "http://localhost:8088")
	cfg.NodeHMACSecret = getEnv("NODE_HMAC_SECRET", "node-secret-change-me")
	cfg.Region = getEnv("NODE_REGION", "")

	// Rendering.
	cfg.AEPath = getEnv("AE_PATH", "")
	cfg.WorkDir = getEnv("WORK_DIR", os.TempDir())

	// Reported versions.
	cfg.AgentVersion = getEnv("AGENT_VERSION", "0.1.0")
	cfg.AEVersion = getEnv("AE_VERSION", "2024")

	// Timing and the health endpoint.
	cfg.HeartbeatIntervalSec = getInt("HEARTBEAT_INTERVAL_SEC", 5)
	cfg.PollIntervalSec = getInt("POLL_INTERVAL_SEC", 3)
	cfg.ListenPort = getInt("LISTEN_PORT", 7777)

	return cfg, nil
}
// getEnv returns the value of the environment variable key, or fallback when
// the variable is unset or set to the empty string.
func getEnv(key, fallback string) string {
	value := os.Getenv(key)
	if value == "" {
		return fallback
	}
	return value
}
// getInt returns the environment variable key parsed as a base-10 integer.
// fallback is returned when the variable is unset, empty, or not a valid
// integer; the parsed value is not range-checked.
func getInt(key string, fallback int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return fallback
	}

	parsed, err := strconv.Atoi(raw)
	if err != nil {
		return fallback
	}
	return parsed
}
@@ -0,0 +1,141 @@
// Package runner executes After Effects render jobs and streams progress back
// via the provided callback. When AE_PATH is empty, a mock render is used
// (useful for CI and dev environments without a licensed AE installation).
package runner
import (
"context"
"fmt"
"log"
"os"
"os/exec"
"path/filepath"
"time"
)
// ProgressFn is called periodically during rendering with (percent 0-100, message).
// It receives the context passed to the renderer. The renderers in this
// package do not abort a render when the callback returns an error.
type ProgressFn func(ctx context.Context, percent int, message string) error
// Job holds the parameters for a single render.
type Job struct {
	// JobID identifies the job. Run uses it as the name of the per-job output
	// directory under <workDir>/renders.
	JobID string
	// SavedProjectID identifies the saved project being rendered.
	SavedProjectID string
	// Quality, Resolution and FrameRate describe the requested output; the
	// mock renderer only logs them.
	Quality string
	Resolution string
	FrameRate int
	// HasMusic and HasVoiceover flag audio tracks. NOTE(review): neither is
	// read by the renderers visible in this file.
	HasMusic bool
	HasVoiceover bool
	// AEPFilePath is the local path to the downloaded .aep project file.
	// In a full implementation the agent downloads this from MinIO before calling Run.
	// It is required for a real After Effects render and unused by the mock.
	AEPFilePath string
}
// Run renders job and reports progress through onProgress.
//
// The output directory <workDir>/renders/<job.JobID> is created if needed and
// the render is written to output.mp4 inside it. An empty aePath selects the
// mock renderer; otherwise aePath is the aerender binary to execute. On
// success Run returns the path of the output file.
func Run(ctx context.Context, aePath, workDir string, job *Job, onProgress ProgressFn) (string, error) {
	jobDir := filepath.Join(workDir, "renders", job.JobID)
	err := os.MkdirAll(jobDir, 0o755)
	if err != nil {
		return "", fmt.Errorf("create output dir: %w", err)
	}

	target := filepath.Join(jobDir, "output.mp4")

	useMock := aePath == ""
	if !useMock {
		return aeRender(ctx, aePath, job, target, onProgress)
	}
	return mockRender(ctx, job, target, onProgress)
}
// ── Mock render (no AE installed) ────────────────────────────────────────────
// mockRender simulates a render without After Effects.
//
// It walks a fixed list of progress stages, waiting 800ms before each one,
// reports every stage through onProgress (callback errors are logged and
// otherwise ignored), and finally writes a small placeholder file to
// outputPath. It returns outputPath on success, or ctx.Err() as soon as ctx is
// cancelled while waiting.
func mockRender(ctx context.Context, job *Job, outputPath string, onProgress ProgressFn) (string, error) {
	log.Printf("[mock] starting render for job %s (%s %s %dfps)", job.JobID, job.Quality, job.Resolution, job.FrameRate)

	type stage struct {
		percent int
		label   string
	}
	stages := [...]stage{
		{percent: 5, label: "Preparing project…"},
		{percent: 15, label: "Loading template…"},
		{percent: 30, label: "Rendering frames…"},
		{percent: 50, label: "Rendering frames… (50%)"},
		{percent: 70, label: "Rendering frames… (70%)"},
		{percent: 85, label: "Encoding MP4…"},
		{percent: 95, label: "Uploading output…"},
	}
	const stageDelay = 800 * time.Millisecond

	for i := range stages {
		current := stages[i]

		// Simulate work, but give up promptly on cancellation.
		select {
		case <-time.After(stageDelay):
		case <-ctx.Done():
			return "", ctx.Err()
		}

		cbErr := onProgress(ctx, current.percent, current.label)
		if cbErr != nil {
			log.Printf("[mock] progress callback error: %v", cbErr)
		}
		log.Printf("[mock] %d%% — %s", current.percent, current.label)
	}

	// Leave a placeholder behind so the returned path points at a real file.
	placeholder := []byte("mock-render-output")
	if err := os.WriteFile(outputPath, placeholder, 0o644); err != nil {
		return "", fmt.Errorf("write mock output: %w", err)
	}

	log.Printf("[mock] render complete: %s", outputPath)
	return outputPath, nil
}
// ── Real AE render via aerender.exe ──────────────────────────────────────────
// aeRender runs a real After Effects render by executing the aerender binary
// at aePath with job.AEPFilePath as the project and outputPath as the output.
//
// aerender exposes no machine-readable progress, so progress is synthetic: 10%
// once the process has started, +5% every 10 seconds up to 90%, and 95% when
// the process exits successfully. Progress is best-effort: a nil onProgress is
// allowed, and callback errors are logged without aborting the render.
//
// It returns outputPath on success. Errors:
//   - job.AEPFilePath is empty;
//   - the process cannot be started ("start aerender: ...");
//   - the process exits unsuccessfully ("aerender exit: ...");
//   - ctx is cancelled, in which case the process is killed and ctx.Err() is
//     returned after the process has been reaped (or a short grace period has
//     elapsed).
func aeRender(ctx context.Context, aePath string, job *Job, outputPath string, onProgress ProgressFn) (string, error) {
	if job.AEPFilePath == "" {
		return "", fmt.Errorf("AEPFilePath is required for real AE render")
	}

	// reapTimeout bounds how long a cancelled render waits for the killed
	// process to go away before returning anyway.
	const reapTimeout = 5 * time.Second

	// report forwards one progress update. Failures are logged rather than
	// returned so that a flaky progress sink cannot fail a render.
	report := func(pct int, msg string) {
		if onProgress == nil {
			return
		}
		if err := onProgress(ctx, pct, msg); err != nil {
			log.Printf("[ae] progress callback error: %v", err)
		}
	}

	// aerender flags:
	// -project <path.aep>
	// -output <output.mp4>
	// -RStemplate "Multi-Machine Settings" (optional)
	// -OMtemplate "H.264 Match Render Settings 15 Mbps"
	// -s <start_frame> -e <end_frame>
	args := []string{
		"-project", job.AEPFilePath,
		"-output", outputPath,
	}
	log.Printf("[ae] running: %s %v", aePath, args)

	cmd := exec.CommandContext(ctx, aePath, args...)
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Start(); err != nil {
		return "", fmt.Errorf("start aerender: %w", err)
	}

	// Wait in a goroutine so the loop below can interleave progress ticks and
	// cancellation. The channel is buffered so the goroutine can always finish.
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	report(10, "After Effects starting…")
	pct := 10
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case err := <-done:
			if err != nil {
				// A process killed because ctx was cancelled can reach this
				// branch first; report the cancellation, not the kill.
				if ctxErr := ctx.Err(); ctxErr != nil {
					return "", ctxErr
				}
				return "", fmt.Errorf("aerender exit: %w", err)
			}
			report(95, "Encoding complete")
			return outputPath, nil
		case <-ticker.C:
			if pct < 90 {
				pct += 5
			}
			report(pct, fmt.Sprintf("Rendering… %d%%", pct))
		case <-ctx.Done():
			// The kill error is ignored on purpose: the process may already
			// have exited, or CommandContext may have killed it first.
			_ = cmd.Process.Kill()
			// Wait for the process to be reaped so the caller can safely clean
			// up the work directory once this function returns.
			select {
			case <-done:
			case <-time.After(reapTimeout):
				log.Printf("[ae] aerender still running %s after cancellation", reapTimeout)
			}
			return "", ctx.Err()
		}
	}
}