feat(render-svc+node-agent): add job-claim endpoint and build node-agent skeleton

render-svc:
- db: ClaimJob() — atomic SELECT FOR UPDATE SKIP LOCKED; transitions job to
  Preparing, marks node Busy in a single transaction
- models: ClaimJobRequest + ClaimedJob types
- handlers/internal: POST /v1/internal/render/jobs/claim — 200 with job or 204 when queue empty
- main: register the claim route under /v1/internal (nodeAuth)

services/node-agent/ (new Go module github.com/flatrender/node-agent):
- internal/config: env-var based config (NODE_ID required, sensible defaults)
- internal/client: typed orchestrator HTTP client (Online, Heartbeat, ClaimJob,
  Complete, Fail, ReportCrash) — X-Node-Signature auth
- internal/runner: AE render via aerender.exe or mock (for dev without AE)
- cmd/agent/main: register online → heartbeat loop (5s) + poll loop (3s) →
  claim job → run render → report complete/fail; health endpoint on :7777
- Dockerfile: cross-compiles to Windows amd64 static binary

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
soroush.asadi
2026-06-01 09:28:31 +03:30
parent 541e935418
commit ee421ccc68
10 changed files with 901 additions and 0 deletions
+56
View File
@@ -463,6 +463,62 @@ func (s *Store) getJobByIDInternal(ctx context.Context, id uuid.UUID) (*models.R
return jobs[0], nil
}
// ClaimJob atomically picks the highest-priority Queued job (optionally
// filtered by region) and assigns it to the node identified by nodeID.
//
// Everything happens inside one transaction:
//  1. One Queued job is selected, ordered by priority_score DESC then
//     queued_at ASC, with FOR UPDATE SKIP LOCKED so that concurrent claimers
//     skip rows another transaction has already locked.
//  2. The job is moved to Preparing; started_at is set only if it was NULL.
//  3. The node row is marked Busy and its current_job_id is set to the job.
//
// When region is non-empty, only jobs whose region is NULL or equal to region
// are considered. An empty region applies no region filter.
//
// Returns (nil, nil) when there is nothing to claim. Returns an error, and
// leaves the job Queued, when no render_nodes row matches nodeID.
func (s *Store) ClaimJob(ctx context.Context, nodeID uuid.UUID, region string) (*models.RenderJob, error) {
	tx, err := s.pool.Begin(ctx)
	if err != nil {
		return nil, fmt.Errorf("claim job: begin transaction: %w", err)
	}
	// Undo all changes on any early return. The error is ignored on purpose:
	// once Commit has succeeded there is nothing left to roll back.
	defer func() { _ = tx.Rollback(ctx) }()

	q := `SELECT id FROM render.render_jobs
		WHERE step = 'Queued'::render_step`
	var args []any
	if region != "" {
		// The region is the only bind parameter this query ever takes.
		q += " AND (region IS NULL OR region = $1)"
		args = append(args, region)
	}
	q += " ORDER BY priority_score DESC, queued_at ASC LIMIT 1 FOR UPDATE SKIP LOCKED"

	var jobID uuid.UUID
	if err := tx.QueryRow(ctx, q, args...).Scan(&jobID); err != nil {
		// NOTE(review): matching on the error text is fragile; prefer
		// errors.Is against the driver's no-rows sentinel once it is
		// confirmed that the driver package is imported by this file.
		if err.Error() == "no rows in result set" {
			return nil, nil // nothing to do
		}
		return nil, fmt.Errorf("claim job: select queued job: %w", err)
	}

	// Advance to Preparing. The row is locked by the SELECT above, so no other
	// claimer can have changed it in the meantime.
	_, err = tx.Exec(ctx, `
		UPDATE render.render_jobs SET
			step = 'Preparing'::render_step,
			started_at = COALESCE(started_at, NOW()),
			updated_at = NOW()
		WHERE id = $1`, jobID)
	if err != nil {
		return nil, fmt.Errorf("claim job: advance job %s to Preparing: %w", jobID, err)
	}

	// Assign the job to this node.
	tag, err := tx.Exec(ctx, `
		UPDATE render.render_nodes SET
			status = 'Busy'::node_status,
			current_job_id = $1,
			job_started_at = NOW(),
			updated_at = NOW()
		WHERE id = $2`, jobID, nodeID)
	if err != nil {
		return nil, fmt.Errorf("claim job: mark node %s busy: %w", nodeID, err)
	}
	if tag.RowsAffected() == 0 {
		// Without this check an unknown node would strand the job in
		// Preparing with nobody working on it. Returning here triggers the
		// deferred rollback, so the job stays Queued for a real node.
		return nil, fmt.Errorf("claim job: node %s not found", nodeID)
	}

	if err := tx.Commit(ctx); err != nil {
		return nil, fmt.Errorf("claim job: commit: %w", err)
	}

	// Read the full row back outside the transaction. The claim is already
	// committed at this point, so a failure here does not undo it.
	return s.getJobByIDInternal(ctx, jobID)
}
func (s *Store) CancelJob(ctx context.Context, id, userID uuid.UUID) (bool, error) {
tag, err := s.pool.Exec(ctx, `
UPDATE render.render_jobs
@@ -198,6 +198,37 @@ func (h *InternalHandler) ReplicaReady(c *gin.Context) {
c.Status(http.StatusNoContent)
}
// POST /v1/internal/render/jobs/claim
//
// Claim lets a node agent take the next queued job. The JSON body is bound
// into a ClaimJobRequest and handed to the store's ClaimJob.
//
// Responses:
//   - 400 with code "bad_request" when the body fails to bind
//   - 500 with code "internal_error" when the store returns an error
//   - 204 when the store reports nothing to claim
//   - 200 with a ClaimedJob built from the claimed job otherwise
func (h *InternalHandler) Claim(c *gin.Context) {
	var body models.ClaimJobRequest
	if bindErr := c.ShouldBindJSON(&body); bindErr != nil {
		c.JSON(http.StatusBadRequest, models.APIError{Code: "bad_request", Message: bindErr.Error()})
		return
	}

	claimed, claimErr := h.store.ClaimJob(c.Request.Context(), body.NodeID, body.Region)
	switch {
	case claimErr != nil:
		c.JSON(http.StatusInternalServerError, models.APIError{Code: "internal_error", Message: claimErr.Error()})
	case claimed == nil:
		// Nothing queued.
		c.Status(http.StatusNoContent)
	default:
		resp := models.ClaimedJob{
			JobID:          claimed.ID,
			SavedProjectID: claimed.SavedProjectID,
			Quality:        claimed.Quality,
			Resolution:     claimed.Resolution,
			FrameRate:      claimed.FrameRate,
			HasMusic:       claimed.HasMusic,
			HasVoiceover:   claimed.HasVoiceover,
		}
		c.JSON(http.StatusOK, resp)
	}
}
// POST /v1/internal/nodes/:node_id/cache-update
func (h *InternalHandler) CacheUpdate(c *gin.Context) {
nodeID, err := uuid.Parse(c.Param("node_id"))
+15
View File
@@ -402,6 +402,21 @@ type CrashReportRequest struct {
LogFileURL *string `json:"log_file_url"`
}
// ClaimJobRequest is the JSON body accepted by
// POST /v1/internal/render/jobs/claim.
type ClaimJobRequest struct {
// NodeID is the node the job should be assigned to. The binding tag makes
// it mandatory, so a request without it fails to bind.
NodeID uuid.UUID `json:"node_id" binding:"required"`
// Region is optional. It is passed through to Store.ClaimJob, where an
// empty value means no region filter is applied.
Region string `json:"region"`
}
// ClaimedJob is the JSON body returned with 200 by
// POST /v1/internal/render/jobs/claim. The claim handler fills every field
// from the corresponding field of the claimed RenderJob.
type ClaimedJob struct {
// JobID is the ID of the claimed render job.
JobID uuid.UUID `json:"job_id"`
SavedProjectID uuid.UUID `json:"saved_project_id"`
Quality string `json:"quality"`
Resolution string `json:"resolution"`
FrameRate int `json:"frame_rate"`
HasMusic bool `json:"has_music"`
HasVoiceover bool `json:"has_voiceover"`
}
type CacheUpdateRequest struct {
Action string `json:"action" binding:"required"`
ProjectID *uuid.UUID `json:"project_id"`