Skip to content

Commit 50f4324

Browse files
committed
Add /api/generate endpoint for model loading and unloading
So we can load and unload models. Signed-off-by: Eric Curtin <[email protected]>
1 parent 454ece7 commit 50f4324

File tree

13 files changed

+299
-10
lines changed

13 files changed

+299
-10
lines changed

cmd/cli/commands/root.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ func NewRootCmd(cli *command.DockerCli) *cobra.Command {
113113
newConfigureCmd(),
114114
newPSCmd(),
115115
newDFCmd(),
116+
newStopCmd(),
116117
newUnloadCmd(),
117118
newRequestsCmd(),
118119
)

cmd/cli/commands/run.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -691,8 +691,12 @@ func newRunCmd() *cobra.Command {
691691
}
692692
}
693693

694+
// Check if a prompt was explicitly provided (even if empty string)
695+
// If args length > 1, then a prompt argument was provided (even if it's "")
696+
explicitPromptProvided := len(args) > 1
697+
694698
// Handle --detach flag: just load the model without interaction
695-
if detach {
699+
if detach || (explicitPromptProvided && prompt == "") {
696700
// Make a minimal request to load the model into memory
697701
err := desktopClient.Chat(model, "", nil, func(content string) {
698702
// Silently discard output in detach mode
@@ -714,6 +718,14 @@ func newRunCmd() *cobra.Command {
714718
return nil
715719
}
716720

721+
// Preload the model in the background to optimize for the first user interaction
722+
// This makes sure the model is loaded when the user types their first prompt
723+
go func() {
724+
_ = desktopClient.Chat(model, "", nil, func(content string) {
725+
// Silently preload the model - discard output
726+
}, false)
727+
}()
728+
717729
// Use enhanced readline-based interactive mode when terminal is available
718730
if term.IsTerminal(int(os.Stdin.Fd())) {
719731
return generateInteractiveWithReadline(cmd, desktopClient, model)

cmd/cli/commands/stop.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package commands
2+
3+
import (
4+
"fmt"
5+
6+
"github.com/docker/model-runner/cmd/cli/commands/completion"
7+
"github.com/docker/model-runner/cmd/cli/desktop"
8+
"github.com/docker/model-runner/pkg/inference/models"
9+
"github.com/spf13/cobra"
10+
)
11+
12+
func newStopCmd() *cobra.Command {
13+
var backend string
14+
15+
const cmdArgs = "MODEL"
16+
c := &cobra.Command{
17+
Use: "stop " + cmdArgs,
18+
Short: "Stop a running model",
19+
RunE: func(cmd *cobra.Command, args []string) error {
20+
model := models.NormalizeModelName(args[0])
21+
unloadResp, err := desktopClient.Unload(desktop.UnloadRequest{Backend: backend, Models: []string{model}})
22+
if err != nil {
23+
err = handleClientError(err, "Failed to stop model")
24+
return handleNotRunningError(err)
25+
}
26+
unloaded := unloadResp.UnloadedRunners
27+
if unloaded == 0 {
28+
cmd.Println("No such model running.")
29+
} else {
30+
cmd.Printf("Stopped %d model(s).\n", unloaded)
31+
}
32+
return nil
33+
},
34+
ValidArgsFunction: completion.NoComplete,
35+
}
36+
c.Args = func(cmd *cobra.Command, args []string) error {
37+
if len(args) < 1 {
38+
return fmt.Errorf(
39+
"'docker model stop' requires MODEL.\\n\\n" +
40+
"Usage: docker model stop " + cmdArgs + "\\n\\n" +
41+
"See 'docker model stop --help' for more information.",
42+
)
43+
}
44+
return nil
45+
}
46+
c.Flags().StringVar(&backend, "backend", "", "Optional backend to target")
47+
return c
48+
}

cmd/cli/commands/utils.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@ func handleClientError(err error, message string) error {
3939
return errors.Join(err, errors.New(message))
4040
}
4141

42+
// handleNotRunningError translates errors that indicate the target model
// was not running into a user-friendly message. It is currently a plain
// pass-through, kept as an extension point so callers already route their
// stop/unload errors through a single place.
func handleNotRunningError(err error) error {
	// Pass-through for now; specialize "model not running" cases here later.
	return err
}
49+
4250
// stripDefaultsFromModelName removes the default "ai/" prefix and ":latest" tag for display.
4351
// Examples:
4452
// - "ai/gemma3:latest" -> "gemma3"

cmd/cli/docs/reference/docker_model.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ cname:
2222
- docker model run
2323
- docker model start-runner
2424
- docker model status
25+
- docker model stop
2526
- docker model stop-runner
2627
- docker model tag
2728
- docker model uninstall-runner
@@ -44,6 +45,7 @@ clink:
4445
- docker_model_run.yaml
4546
- docker_model_start-runner.yaml
4647
- docker_model_status.yaml
48+
- docker_model_stop.yaml
4749
- docker_model_stop-runner.yaml
4850
- docker_model_tag.yaml
4951
- docker_model_uninstall-runner.yaml
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
command: docker model stop
2+
short: Stop a running model
3+
long: Stop a running model
4+
usage: docker model stop MODEL
5+
pname: docker model
6+
plink: docker_model.yaml
7+
options:
8+
- option: backend
9+
value_type: string
10+
description: Optional backend to target
11+
deprecated: false
12+
hidden: false
13+
experimental: false
14+
experimentalcli: false
15+
kubernetes: false
16+
swarm: false
17+
deprecated: false
18+
hidden: false
19+
experimental: false
20+
experimentalcli: false
21+
kubernetes: false
22+
swarm: false
23+

cmd/cli/docs/reference/model.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Docker Model Runner
2323
| [`run`](model_run.md) | Run a model and interact with it using a submitted prompt or chat mode |
2424
| [`start-runner`](model_start-runner.md) | Start Docker Model Runner (Docker Engine only) |
2525
| [`status`](model_status.md) | Check if the Docker Model Runner is running |
26+
| [`stop`](model_stop.md) | Stop a running model |
2627
| [`stop-runner`](model_stop-runner.md) | Stop Docker Model Runner (Docker Engine only) |
2728
| [`tag`](model_tag.md) | Tag a model |
2829
| [`uninstall-runner`](model_uninstall-runner.md) | Uninstall Docker Model Runner (Docker Engine only) |
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# docker model stop
2+
3+
<!---MARKER_GEN_START-->
4+
Stop a running model
5+
6+
### Options
7+
8+
| Name | Type | Default | Description |
9+
|:------------|:---------|:--------|:---------------------------|
10+
| `--backend` | `string` | | Optional backend to target |
11+
12+
13+
<!---MARKER_GEN_END-->
14+

main.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,17 @@ func main() {
179179
// Add /v1 as an alias for /engines/v1
180180
router.Handle("/v1/", &V1AliasHandler{scheduler: scheduler})
181181

182+
// Add API endpoints by creating a custom handler
183+
apiHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
184+
switch r.URL.Path {
185+
case "/api/generate":
186+
scheduler.HandleGenerate(w, r)
187+
default:
188+
http.NotFound(w, r)
189+
}
190+
})
191+
router.Handle("/api/generate", apiHandler)
192+
182193
// Add metrics endpoint if enabled
183194
if os.Getenv("DISABLE_METRICS") != "1" {
184195
metricsHandler := metrics.NewAggregatedMetricsHandler(

pkg/inference/scheduling/api.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,33 @@ type ConfigureRequest struct {
9494
RawRuntimeFlags string `json:"raw-runtime-flags,omitempty"`
9595
Speculative *inference.SpeculativeDecodingConfig `json:"speculative,omitempty"`
9696
}
97+
98+
// GenerateRequest represents the request structure for the /api/generate
// endpoint. The field set mirrors the Ollama generate API — NOTE(review):
// per-field semantics below are presumed from that API; confirm against the
// scheduler's HandleGenerate implementation.
type GenerateRequest struct {
	// Model is the model to load/run (required).
	Model string `json:"model"`
	// Prompt is the text to complete; per the commit intent, an empty
	// prompt is used for load/unload requests.
	Prompt string `json:"prompt"`
	// System presumably overrides the model's system message — verify.
	System string `json:"system,omitempty"`
	// Template presumably overrides the prompt template — verify.
	Template string `json:"template,omitempty"`
	// Context carries opaque conversation state from a prior response.
	Context []int `json:"context,omitempty"`
	// Stream is a pointer so an explicit false can be distinguished from unset.
	Stream *bool `json:"stream,omitempty"`
	// Raw, when true, presumably bypasses template formatting — verify.
	Raw bool `json:"raw,omitempty"`
	// KeepAlive is a pointer so 0 (unload now) differs from unset.
	KeepAlive *int `json:"keep_alive,omitempty"`
	// Options carries model-specific runtime parameters.
	Options map[string]any `json:"options,omitempty"`
}
110+
111+
// GenerateResponse represents the response structure for the /api/generate
// endpoint. Field names and JSON tags mirror the Ollama generate API —
// NOTE(review): duration fields look like nanosecond counts per that API;
// confirm against the scheduler's HandleGenerate implementation.
type GenerateResponse struct {
	Model      string    `json:"model"`                 // model that produced this response
	CreatedAt  time.Time `json:"created_at"`            // timestamp of the response
	Response   string    `json:"response"`              // generated text
	Done       bool      `json:"done"`                  // true once generation has finished
	DoneReason string    `json:"done_reason,omitempty"` // why generation stopped
	Context    []int     `json:"context,omitempty"`     // opaque state for follow-up requests

	// Timing and token-count metadata, omitted when zero.
	TotalDuration      int64 `json:"total_duration,omitempty"`
	LoadDuration       int64 `json:"load_duration,omitempty"`
	PromptEvalCount    int   `json:"prompt_eval_count,omitempty"`
	PromptEvalDuration int64 `json:"prompt_eval_duration,omitempty"`
	EvalCount          int   `json:"eval_count,omitempty"`
	EvalDuration       int64 `json:"eval_duration,omitempty"`
}
126+

0 commit comments

Comments
 (0)