Skip to content

OpenAI Compatible API

LayerScale is a drop-in replacement for the OpenAI API. Point your existing OpenAI SDK code at LayerScale and it works without changes.

Setup

Python SDK

Terminal window
pip install openai
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8080/v1",
api_key="your-api-key"
)

TypeScript SDK

Terminal window
npm install openai
import OpenAI from "openai";
const client = new OpenAI({
baseURL: "http://localhost:8080/v1",
apiKey: "your-api-key",
});

Chat Completions

Python

from openai import OpenAI
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain the difference between TCP and UDP."},
],
max_tokens=300,
temperature=0.7,
)
print(response.choices[0].message.content)

TypeScript

import OpenAI from "openai";
const client = new OpenAI({
baseURL: "http://localhost:8080/v1",
apiKey: "your-api-key",
});
const response = await client.chat.completions.create({
model: "meta-llama/Llama-4-Maverick-17B-128E-Original",
messages: [
{ role: "system", content: "You are a helpful assistant." },
{ role: "user", content: "Explain the difference between TCP and UDP." },
],
max_tokens: 300,
temperature: 0.7,
});
console.log(response.choices[0].message.content);

REST

Terminal window
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Llama-4-Maverick-17B-128E-Original",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain the difference between TCP and UDP."}
],
"max_tokens": 300,
"temperature": 0.7
}'

Streaming

Python

from openai import OpenAI
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
stream = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original",
messages=[
{"role": "user", "content": "Write a short poem about distributed systems."},
],
max_tokens=200,
stream=True,
)
for chunk in stream:
content = chunk.choices[0].delta.content
if content:
print(content, end="", flush=True)
print()

TypeScript

import OpenAI from "openai";
const client = new OpenAI({
baseURL: "http://localhost:8080/v1",
apiKey: "your-api-key",
});
const stream = await client.chat.completions.create({
model: "meta-llama/Llama-4-Maverick-17B-128E-Original",
messages: [
{ role: "user", content: "Write a short poem about distributed systems." },
],
max_tokens: 200,
stream: true,
});
for await (const chunk of stream) {
const content = chunk.choices[0]?.delta?.content;
if (content) {
process.stdout.write(content);
}
}
console.log();

Tool Calling

Python

from openai import OpenAI
import json
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get current weather for a city",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City name, e.g. San Francisco",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
# First call: model decides to use the tool
response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original",
messages=messages,
tools=tools,
max_tokens=200,
)
assistant_message = response.choices[0].message
if assistant_message.tool_calls:
tool_call = assistant_message.tool_calls[0]
args = json.loads(tool_call.function.arguments)
print(f"Model called: {tool_call.function.name}({args})")
# Simulate the tool result
weather_result = {"temperature": 22, "condition": "partly cloudy", "unit": "celsius"}
# Second call: send the tool result back
messages.append(assistant_message)
messages.append({
"role": "tool",
"tool_call_id": tool_call.id,
"content": json.dumps(weather_result),
})
final_response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original",
messages=messages,
tools=tools,
max_tokens=200,
)
print(final_response.choices[0].message.content)

TypeScript

import OpenAI from "openai";
const client = new OpenAI({
baseURL: "http://localhost:8080/v1",
apiKey: "your-api-key",
});
const tools: OpenAI.ChatCompletionTool[] = [
{
type: "function",
function: {
name: "get_weather",
description: "Get current weather for a city",
parameters: {
type: "object",
properties: {
location: { type: "string", description: "City name" },
unit: { type: "string", enum: ["celsius", "fahrenheit"] },
},
required: ["location"],
},
},
},
];
const messages: OpenAI.ChatCompletionMessageParam[] = [
{ role: "user", content: "What's the weather in Tokyo?" },
];
const response = await client.chat.completions.create({
model: "meta-llama/Llama-4-Maverick-17B-128E-Original",
messages,
tools,
max_tokens: 200,
});
const assistantMessage = response.choices[0].message;
if (assistantMessage.tool_calls) {
const toolCall = assistantMessage.tool_calls[0];
const args = JSON.parse(toolCall.function.arguments);
console.log(`Model called: ${toolCall.function.name}(${JSON.stringify(args)})`);
const weatherResult = { temperature: 22, condition: "partly cloudy", unit: "celsius" };
messages.push(assistantMessage);
messages.push({
role: "tool",
tool_call_id: toolCall.id,
content: JSON.stringify(weatherResult),
});
const finalResponse = await client.chat.completions.create({
model: "meta-llama/Llama-4-Maverick-17B-128E-Original",
messages,
tools,
max_tokens: 200,
});
console.log(finalResponse.choices[0].message.content);
}

Multi-Turn Conversations

The Chat Completions API is stateless — you maintain conversation history yourself by appending each assistant response (and each new user turn) to the messages list:

from openai import OpenAI
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
messages = [
{"role": "system", "content": "You are a Python tutor."},
{"role": "user", "content": "What is a list comprehension?"},
]
response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original", messages=messages, max_tokens=200
)
messages.append({"role": "assistant", "content": response.choices[0].message.content})
# Follow-up question
messages.append({"role": "user", "content": "Show me a nested example."})
response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original", messages=messages, max_tokens=200
)
print(response.choices[0].message.content)

Migrating from OpenAI

To switch from the OpenAI API to LayerScale, change only how the client is constructed — add a base_url pointing at LayerScale and use your LayerScale API key:

# Before (OpenAI)
client = OpenAI()
# After (LayerScale)
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")

Everything else (messages format, streaming, tool calling, response parsing) stays the same.