Skip to content

OpenAI Compatible API

LayerScale is a drop-in replacement for the OpenAI API. Point your existing OpenAI SDK code at LayerScale and it works without changes.

Setup

Python SDK

Terminal window
pip install openai
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8080/v1",
api_key="your-api-key"
)

TypeScript SDK

Terminal window
npm install openai
import OpenAI from "openai";
const client = new OpenAI({
baseURL: "http://localhost:8080/v1",
apiKey: "your-api-key",
});

Chat Completions

Python

from openai import OpenAI
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain the difference between TCP and UDP."},
],
max_tokens=300,
temperature=0.7,
)
print(response.choices[0].message.content)

TypeScript

import OpenAI from "openai";
const client = new OpenAI({
baseURL: "http://localhost:8080/v1",
apiKey: "your-api-key",
});
const response = await client.chat.completions.create({
model: "meta-llama/Llama-4-Maverick-17B-128E-Original",
messages: [
{ role: "system", content: "You are a helpful assistant." },
{ role: "user", content: "Explain the difference between TCP and UDP." },
],
max_tokens: 300,
temperature: 0.7,
});
console.log(response.choices[0].message.content);

REST

Terminal window
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Llama-4-Maverick-17B-128E-Original",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain the difference between TCP and UDP."}
],
"max_tokens": 300,
"temperature": 0.7
}'

Streaming

Python

from openai import OpenAI
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
stream = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original",
messages=[
{"role": "user", "content": "Write a short poem about distributed systems."},
],
max_tokens=200,
stream=True,
)
for chunk in stream:
content = chunk.choices[0].delta.content
if content:
print(content, end="", flush=True)
print()

TypeScript

import OpenAI from "openai";
const client = new OpenAI({
baseURL: "http://localhost:8080/v1",
apiKey: "your-api-key",
});
const stream = await client.chat.completions.create({
model: "meta-llama/Llama-4-Maverick-17B-128E-Original",
messages: [
{ role: "user", content: "Write a short poem about distributed systems." },
],
max_tokens: 200,
stream: true,
});
for await (const chunk of stream) {
const content = chunk.choices[0]?.delta?.content;
if (content) {
process.stdout.write(content);
}
}
console.log();

Tool Calling

Python

from openai import OpenAI
import json
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get current weather for a city",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City name, e.g. San Francisco",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
# First call: model decides to use the tool
response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original",
messages=messages,
tools=tools,
max_tokens=200,
)
assistant_message = response.choices[0].message
if assistant_message.tool_calls:
tool_call = assistant_message.tool_calls[0]
args = json.loads(tool_call.function.arguments)
print(f"Model called: {tool_call.function.name}({args})")
# Simulate the tool result
weather_result = {"temperature": 22, "condition": "partly cloudy", "unit": "celsius"}
# Second call: send the tool result back
messages.append(assistant_message)
messages.append({
"role": "tool",
"tool_call_id": tool_call.id,
"content": json.dumps(weather_result),
})
final_response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original",
messages=messages,
tools=tools,
max_tokens=200,
)
print(final_response.choices[0].message.content)

TypeScript

import OpenAI from "openai";
const client = new OpenAI({
baseURL: "http://localhost:8080/v1",
apiKey: "your-api-key",
});
const tools: OpenAI.ChatCompletionTool[] = [
{
type: "function",
function: {
name: "get_weather",
description: "Get current weather for a city",
parameters: {
type: "object",
properties: {
location: { type: "string", description: "City name" },
unit: { type: "string", enum: ["celsius", "fahrenheit"] },
},
required: ["location"],
},
},
},
];
const messages: OpenAI.ChatCompletionMessageParam[] = [
{ role: "user", content: "What's the weather in Tokyo?" },
];
const response = await client.chat.completions.create({
model: "meta-llama/Llama-4-Maverick-17B-128E-Original",
messages,
tools,
max_tokens: 200,
});
const assistantMessage = response.choices[0].message;
if (assistantMessage.tool_calls) {
const toolCall = assistantMessage.tool_calls[0];
const args = JSON.parse(toolCall.function.arguments);
console.log(`Model called: ${toolCall.function.name}(${JSON.stringify(args)})`);
const weatherResult = { temperature: 22, condition: "partly cloudy", unit: "celsius" };
messages.push(assistantMessage);
messages.push({
role: "tool",
tool_call_id: toolCall.id,
content: JSON.stringify(weatherResult),
});
const finalResponse = await client.chat.completions.create({
model: "meta-llama/Llama-4-Maverick-17B-128E-Original",
messages,
tools,
max_tokens: 200,
});
console.log(finalResponse.choices[0].message.content);
}

Multi-Turn Conversations

The Chat Completions API is stateless — you maintain conversation history yourself by appending each assistant response (and each new user turn) to the messages list:

from openai import OpenAI
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")
messages = [
{"role": "system", "content": "You are a Python tutor."},
{"role": "user", "content": "What is a list comprehension?"},
]
response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original", messages=messages, max_tokens=200
)
messages.append({"role": "assistant", "content": response.choices[0].message.content})
# Follow-up question
messages.append({"role": "user", "content": "Show me a nested example."})
response = client.chat.completions.create(
model="meta-llama/Llama-4-Maverick-17B-128E-Original", messages=messages, max_tokens=200
)
print(response.choices[0].message.content)

Migrating from OpenAI

To switch from the OpenAI API to LayerScale, change only how the client is constructed — add a base_url pointing at LayerScale and use your LayerScale API key:

# Before (OpenAI)
client = OpenAI()
# After (LayerScale)
client = OpenAI(base_url="http://localhost:8080/v1", api_key="your-api-key")

Everything else (messages format, streaming, tool calling, response parsing) stays the same.