diff --git a/README.md b/README.md
index 4203278..fe64a21 100644
--- a/README.md
+++ b/README.md
@@ -123,6 +123,33 @@ supplied via the `input` parameter in YAML format. Additionally, you can
 provide file-based variables via `file_input`, where each key maps to a file
 path.
 
+### Prompt.yml with model parameters
+
+You can specify model parameters directly in your `.prompt.yml` files using the
+`modelParameters` key:
+
+```yaml
+messages:
+  - role: system
+    content: Be as concise as possible
+  - role: user
+    content: 'Compare {{a}} and {{b}}, please'
+model: openai/gpt-4o
+modelParameters:
+  maxCompletionTokens: 500
+  temperature: 0.7
+```
+
+| Key                   | Type   | Description                                                    |
+| --------------------- | ------ | -------------------------------------------------------------- |
+| `maxCompletionTokens` | number | The maximum number of tokens to generate                       |
+| `maxTokens`           | number | The maximum number of tokens to generate (deprecated)          |
+| `temperature`         | number | The sampling temperature to use (0-1)                          |
+| `topP`                | number | The nucleus sampling parameter to use (0-1)                    |
+
+> [!NOTE]
+> Parameters set in `modelParameters` take precedence over the corresponding action inputs.
+
 ### Using a system prompt file
 
 In addition to the regular prompt, you can provide a system prompt file instead
@@ -287,7 +314,8 @@ the action:
 | `system-prompt-file` | Path to a file containing the system prompt. If both `system-prompt` and `system-prompt-file` are provided, `system-prompt-file` takes precedence                                                                  | `""`                                 |
 | `model`              | The model to use for inference. Must be available in the [GitHub Models](https://github.com/marketplace?type=models) catalog                                                                                       | `openai/gpt-4o`                      |
 | `endpoint`           | The endpoint to use for inference. If you're running this as part of an org, you should probably use the org-specific Models endpoint                                                                              | `https://models.github.ai/inference` |
-| `max-tokens`         | The max number of tokens to generate                                                                                                                                                                               | 200                                  |
+| `max-tokens`         | The maximum number of tokens to generate (deprecated, use `max-completion-tokens` instead)                                                                                                                         | 200                                  |
+| `max-completion-tokens` | The maximum number of tokens to generate                                                                                                                                                                        | `""`                                 |
 | `temperature`        | The sampling temperature to use (0-1)                                                                                                                                                                              | `""`                                 |
 | `top-p`              | The nucleus sampling parameter to use (0-1)                                                                                                                                                                        | `""`                                 |
 | `enable-github-mcp`  | Enable Model Context Protocol integration with GitHub tools                                                                                                                                                        | `false`                              |
diff --git a/src/inference.ts b/src/inference.ts
index df9d8f7..4a1ce52 100644
--- a/src/inference.ts
+++ b/src/inference.ts
@@ -1,6 +1,6 @@
 import * as core from '@actions/core'
 import OpenAI from 'openai'
-import {GitHubMCPClient, executeToolCalls, ToolCall} from './mcp.js'
+import { GitHubMCPClient, executeToolCalls, ToolCall } from './mcp.js'
 
 interface ChatMessage {
   role: 'system' | 'user' | 'assistant' | 'tool'
@@ -10,7 +10,7 @@ interface ChatMessage {
 }
 
 export interface InferenceRequest {
-  messages: Array<{role: 'system' | 'user' | 'assistant' | 'tool'; content: string}>
+  messages: Array<{ role: 'system' | 'user' | 'assistant' | 'tool'; content: string }>
   modelName: string
   maxTokens?: number // Deprecated
   maxCompletionTokens?: number
@@ -18,7 +18,7 @@ export interface InferenceRequest {
   token: string
   temperature?: number
   topP?: number
-  responseFormat?: {type: 'json_schema'; json_schema: unknown} // Processed response format for the API
+  responseFormat?: { type: 'json_schema'; json_schema: unknown } // Processed response format for the API
   customHeaders?: Record<string, string> // Custom HTTP headers to include in API requests
 }
 
@@ -34,18 +34,17 @@ export interface InferenceResponse {
   }>
 }
 
-// Note: solution around models using different underlying max tokens properties
 
 /**
  * Build according to what input was passed, default to max_tokens.
  * Only one of max_tokens or max_completion_tokens will be set.
  */
-function buildMaxTokensParam(request: InferenceRequest): {max_tokens?: number; max_completion_tokens?: number} {
+function buildMaxTokensParam(request: InferenceRequest): { max_tokens?: number; max_completion_tokens?: number } {
   if (request.maxCompletionTokens != null) {
-    return {max_completion_tokens: request.maxCompletionTokens}
+    return { max_completion_tokens: request.maxCompletionTokens }
   }
   if (request.maxTokens != null) {
-    return {max_tokens: request.maxTokens}
+    return { max_tokens: request.maxTokens }
   }
   return {}
 }
@@ -115,7 +114,7 @@ export async function mcpInference(
       model: request.modelName,
       temperature: request.temperature,
       top_p: request.topP,
-      ...buildMaxTokensParam(request),
+      ...buildMaxTokensParam(request), // Note: workaround for models that accept different max-tokens properties (max_tokens vs. max_completion_tokens)
     }
 
     // Add response format if specified (only on final iteration to avoid conflicts with tool calls)
@@ -138,7 +137,7 @@ export async function mcpInference(
       messages.push({
         role: 'assistant',
         content: modelResponse || '',
-        ...(toolCalls && {tool_calls: toolCalls as ToolCall[]}),
+        ...(toolCalls && { tool_calls: toolCalls as ToolCall[] }),
       })
 
       if (!toolCalls || toolCalls.length === 0) {