OpenAI Compatible API
LayerScale is a drop-in replacement for the OpenAI API. Point your existing OpenAI SDK code at LayerScale and it works without changes.
Setup
Python SDK
pip install openai

from openai import OpenAI
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")

TypeScript SDK
npm install openai

import OpenAI from "openai";
const client = new OpenAI({ baseURL: "http://localhost:8080/v1", apiKey: "your-api-key" });

Chat Completions
Python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
response = client.chat.completions.create( model="meta-llama/Llama-4-Maverick-17B-128E-Original", messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Explain the difference between TCP and UDP."}, ], max_tokens=300, temperature=0.7,)
print(response.choices[0].message.content)

TypeScript
import OpenAI from "openai";
const client = new OpenAI({ baseURL: "http://localhost:8080/v1", apiKey: "your-api-key",});
const response = await client.chat.completions.create({ model: "meta-llama/Llama-4-Maverick-17B-128E-Original", messages: [ { role: "system", content: "You are a helpful assistant." }, { role: "user", content: "Explain the difference between TCP and UDP." }, ], max_tokens: 300, temperature: 0.7,});
console.log(response.choices[0].message.content);

REST
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-4-Maverick-17B-128E-Original",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Explain the difference between TCP and UDP."}
    ],
    "max_tokens": 300,
    "temperature": 0.7
  }'

Streaming
Python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
stream = client.chat.completions.create( model="meta-llama/Llama-4-Maverick-17B-128E-Original", messages=[ {"role": "user", "content": "Write a short poem about distributed systems."}, ], max_tokens=200, stream=True,)
for chunk in stream:
    content = chunk.choices[0].delta.content
    if content:
        print(content, end="", flush=True)
print()

TypeScript
import OpenAI from "openai";
const client = new OpenAI({ baseURL: "http://localhost:8080/v1", apiKey: "your-api-key",});
const stream = await client.chat.completions.create({ model: "meta-llama/Llama-4-Maverick-17B-128E-Original", messages: [ { role: "user", content: "Write a short poem about distributed systems." }, ], max_tokens: 200, stream: true,});
for await (const chunk of stream) {
  const content = chunk.choices[0]?.delta?.content;
  if (content) {
    process.stdout.write(content);
  }
}
console.log();

Tool Calling
Python
from openai import OpenAI
import json
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
tools = [ { "type": "function", "function": { "name": "get_weather", "description": "Get current weather for a city", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "City name, e.g. San Francisco", }, "unit": { "type": "string", "enum": ["celsius", "fahrenheit"], }, }, "required": ["location"], }, }, }]
messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
# First call: model decides to use the tool
response = client.chat.completions.create(
    model="meta-llama/Llama-4-Maverick-17B-128E-Original",
    messages=messages,
    tools=tools,
    max_tokens=200,
)
assistant_message = response.choices[0].message
if assistant_message.tool_calls:
    tool_call = assistant_message.tool_calls[0]
    args = json.loads(tool_call.function.arguments)
    print(f"Model called: {tool_call.function.name}({args})")

    # Simulate the tool result
    weather_result = {"temperature": 22, "condition": "partly cloudy", "unit": "celsius"}

    # Second call: send the tool result back
    messages.append(assistant_message)
    messages.append({
        "role": "tool",
        "tool_call_id": tool_call.id,
        "content": json.dumps(weather_result),
    })

    final_response = client.chat.completions.create(
        model="meta-llama/Llama-4-Maverick-17B-128E-Original",
        messages=messages,
        tools=tools,
        max_tokens=200,
    )
    print(final_response.choices[0].message.content)

TypeScript
import OpenAI from "openai";
const client = new OpenAI({ baseURL: "http://localhost:8080/v1", apiKey: "your-api-key",});
const tools: OpenAI.ChatCompletionTool[] = [ { type: "function", function: { name: "get_weather", description: "Get current weather for a city", parameters: { type: "object", properties: { location: { type: "string", description: "City name" }, unit: { type: "string", enum: ["celsius", "fahrenheit"] }, }, required: ["location"], }, }, },];
const messages: OpenAI.ChatCompletionMessageParam[] = [ { role: "user", content: "What's the weather in Tokyo?" },];
const response = await client.chat.completions.create({ model: "meta-llama/Llama-4-Maverick-17B-128E-Original", messages, tools, max_tokens: 200,});
const assistantMessage = response.choices[0].message;
if (assistantMessage.tool_calls) {
  const toolCall = assistantMessage.tool_calls[0];
  const args = JSON.parse(toolCall.function.arguments);
  console.log(`Model called: ${toolCall.function.name}(${JSON.stringify(args)})`);

  const weatherResult = { temperature: 22, condition: "partly cloudy", unit: "celsius" };

  messages.push(assistantMessage);
  messages.push({
    role: "tool",
    tool_call_id: toolCall.id,
    content: JSON.stringify(weatherResult),
  });

  const finalResponse = await client.chat.completions.create({
    model: "meta-llama/Llama-4-Maverick-17B-128E-Original",
    messages,
    tools,
    max_tokens: 200,
  });
  console.log(finalResponse.choices[0].message.content);
}

Multi-Turn Conversations
The SDK handles conversation history automatically. Just keep appending messages:
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
messages = [ {"role": "system", "content": "You are a Python tutor."}, {"role": "user", "content": "What is a list comprehension?"},]
response = client.chat.completions.create(
    model="meta-llama/Llama-4-Maverick-17B-128E-Original",
    messages=messages,
    max_tokens=200,
)
messages.append({"role": "assistant", "content": response.choices[0].message.content})
# Follow-up question
messages.append({"role": "user", "content": "Show me a nested example."})
response = client.chat.completions.create(
    model="meta-llama/Llama-4-Maverick-17B-128E-Original",
    messages=messages,
    max_tokens=200,
)
print(response.choices[0].message.content)

Migrating from OpenAI
To switch from the OpenAI API to LayerScale, change two lines:
# Before (OpenAI)
client = OpenAI()
# After (LayerScale)
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")

Everything else (messages format, streaming, tool calling, response parsing) stays the same.