Skip to content

OpenAI Compatible API

LayerScale is a drop-in replacement for the OpenAI API. Point your existing OpenAI SDK code at LayerScale and it works without changes.

Setup

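Install the OpenAI Python SDK from a terminal:

pip install openai

Then create a client whose base URL points at your LayerScale server:
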
from openai import OpenAI

# Build the SDK client against the OpenAI-compatible endpoint on localhost:8080.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")

Chat Completions

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")

# Build the conversation up front so the request call below stays short.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Explain the difference between TCP and UDP."},
]

response = client.chat.completions.create(
    model="meta-llama/Llama-4-Maverick-17B-128E-Original",
    messages=conversation,
    max_tokens=300,
    temperature=0.7,
)

# The generated text is carried on the first choice's message.
answer = response.choices[0].message.content
print(answer)

Streaming

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")

# With stream=True the call returns an iterable of incremental chunks
# instead of one complete response object.
stream = client.chat.completions.create(
    model="meta-llama/Llama-4-Maverick-17B-128E-Original",
    messages=[
        {"role": "user", "content": "Write a short poem about distributed systems."},
    ],
    max_tokens=200,
    stream=True,
)

for chunk in stream:
    # A chunk may arrive with an empty `choices` list (for example a
    # usage-only chunk at the end of the stream); skip it rather than
    # raising IndexError on choices[0].
    if not chunk.choices:
        continue
    content = chunk.choices[0].delta.content
    # Only print chunks that actually carry text; content may be None or "".
    if content:
        print(content, end="", flush=True)

# Terminate the streamed output with a newline.
print()

Tool Calling

from openai import OpenAI
import json

client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")

# JSON-schema description of the single tool the model is allowed to call.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City name, e.g. San Francisco",
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["location"],
            },
        },
    }
]

messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

# First call: model decides to use the tool
response = client.chat.completions.create(
    model="meta-llama/Llama-4-Maverick-17B-128E-Original",
    messages=messages,
    tools=tools,
    max_tokens=200,
)
assistant_message = response.choices[0].message

if assistant_message.tool_calls:
    # The assistant turn that requested the tools must precede the tool
    # results in the history, so append it once before answering the calls.
    messages.append(assistant_message)

    # Answer every requested call, not just the first: each tool_call id
    # needs its own matching "tool" message in the follow-up request.
    for tool_call in assistant_message.tool_calls:
        # The arguments arrive as a JSON-encoded string.
        args = json.loads(tool_call.function.arguments)
        print(f"Model called: {tool_call.function.name}({args})")

        # Simulate the tool result
        weather_result = {"temperature": 22, "condition": "partly cloudy", "unit": "celsius"}
        messages.append({
            "role": "tool",
            "tool_call_id": tool_call.id,
            "content": json.dumps(weather_result),
        })

    # Second call: send the tool result(s) back so the model can answer in prose
    final_response = client.chat.completions.create(
        model="meta-llama/Llama-4-Maverick-17B-128E-Original",
        messages=messages,
        tools=tools,
        max_tokens=200,
    )
    print(final_response.choices[0].message.content)
else:
    # The model answered directly without requesting a tool.
    print(assistant_message.content)

Multi-Turn Conversations

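The SDK does not store conversation history for you: keep the history in your own messages list, append each assistant reply and each new user message to it, and send the full list with every request: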

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")

# The running conversation; every request below sends this whole list.
messages = [
    {"role": "system", "content": "You are a Python tutor."},
    {"role": "user", "content": "What is a list comprehension?"},
]

# First turn.
response = client.chat.completions.create(
    model="meta-llama/Llama-4-Maverick-17B-128E-Original",
    messages=messages,
    max_tokens=200,
)

# Record the assistant's reply, then the follow-up question, so the next
# request carries the full history.
messages.extend([
    {"role": "assistant", "content": response.choices[0].message.content},
    {"role": "user", "content": "Show me a nested example."},
])

# Second turn, sent with the extended history.
response = client.chat.completions.create(
    model="meta-llama/Llama-4-Maverick-17B-128E-Original",
    messages=messages,
    max_tokens=200,
)
print(response.choices[0].message.content)

Migrating from OpenAI

To switch from the OpenAI API to LayerScale, change how the client is constructed by passing two arguments, base_url and api_key:

# Before (OpenAI): client constructed with no arguments
client = OpenAI()

# After (LayerScale): same client class, with an explicit endpoint and key
client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="your-api-key",
)

Everything else (messages format, streaming, tool calling, response parsing) stays the same.