Spracherkennung für: .ts vermutete Sprache: Unknown {[0] [0] [0]} [Methode: Schwerpunktbildung, einfache Gewichte, sechs Dimensionen]
import type { AssistantMessage } from "@mariozechner/pi-ai";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../../../src/config/config.js";
import { BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS } from "../../../src/plugins/contracts/inv entory/bundled-capability-metadata.js";
import { createEmptyPluginRegistry } from "../../../src/plugins/registry-empty.js";
import { setActivePluginRegistry } from "../../../src/plugins/runtime.js";
import type { SpeechProviderPlugin } from "../../../src/plugins/types.js";
import { resolveWorkspacePackagePublicModuleUrl } from "../../../src/test-utils/bundled-plugin-public-surface.js";
import { withEnv, withEnvAsync } from "../../../src/test-utils/env.js";
import type { ResolvedTtsConfig } from "../../../src/tts/tts-types.js";
type TtsRuntimeModule = typeof import("../../../src/tts/tts.js");
type TtsCoreModule = typeof import("../../../src/tts/tts-core.js");
const speechCoreRuntimeApiModuleId = resolveWorkspacePackagePublicModuleUrl({
packageName: "@openclaw/speech-core",
artifactBasename: "runtime-api.js",
});
let ttsRuntime: TtsRuntimeModule;
let ttsRuntimePromise: Promise<TtsRuntimeModule> | null = null;
let ttsRuntimeInitialized = false;
let ttsCorePromise: Promise<TtsCoreModule> | null = null;
let completeSimple: typeof import("@mariozechner/pi-ai").completeSimple;
let getApiKeyForModelMock: typeof import("../../../src/agents/model-auth.js").getApiKeyForModel;
let requireApiKeyMock: typeof import("../../../src/agents/model-auth.js").requireApiKey;
let resolveModelAsyncMock: typeof import("../../../src/agents/pi-embedded-runner/model.js").resolveModelAsync;
let ensureCustomApiRegisteredMock: typeof import("../../../src/agents/custom-api-registry.js").ensureCustomApiRegistered;
let prepareModelForSimpleCompletionMock: typeof import("../../../src/agents/simple-completion-transport.js").prepareModelForSimpleCompletion;
let summarizeTextCore: TtsCoreModule["summarizeText"];
let resolveTtsConfig: TtsRuntimeModule["resolveTtsConfig"];
let maybeApplyTtsToPayload: TtsRuntimeModule["maybeApplyTtsToPayload"];
let getTtsProvider: TtsRuntimeModule["getTtsProvider"];
let parseTtsDirectives: TtsRuntimeModule["_test"]["parseTtsDirectives"];
let resolveModelOverridePolicy: TtsRuntimeModule["_test"]["resolveModelOverridePolicy"];
let getResolvedSpeechProviderConfig: TtsRuntimeModule["_test"]["getResolvedSpeechProviderConfig"];
let formatTtsProviderError: TtsRuntimeModule["_test"]["formatTtsProviderError"];
let sanitizeTtsErrorForLog: TtsRuntimeModule["_test"]["sanitizeTtsErrorForLog"];
const SPEECH_PROVIDER_ENV_KEYS = [
...new Set(
BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS.flatMap((entry) =>
entry.speechProviderIds.flatMap((providerId) => entry.providerAuthEnvVars[providerId] ?? []),
),
),
].toSorted((left, right) => left.localeCompare(right));
function isolatedSpeechProviderEnv(
overrides: Record<string, string | undefined> = {},
): Record<string, string | undefined> {
return {
...Object.fromEntries(SPEECH_PROVIDER_ENV_KEYS.map((key) => [key, undefined])),
...overrides,
};
}
function withIsolatedSpeechProviderEnv<T>(
overrides: Record<string, string | undefined>,
fn: () => T,
): T {
return withEnv(isolatedSpeechProviderEnv(overrides), fn);
}
async function withIsolatedSpeechProviderEnvAsync<T>(
overrides: Record<string, string | undefined>,
fn: () => Promise<T>,
): Promise<T> {
return await withEnvAsync(isolatedSpeechProviderEnv(overrides), fn);
}
vi.mock("@mariozechner/pi-ai", () => {
const getApiProvider = vi.fn(() => undefined);
return {
completeSimple: vi.fn(),
createAssistantMessageEventStream: vi.fn(),
getApiProvider,
getModel: vi.fn(),
registerApiProvider: vi.fn(),
streamAnthropic: vi.fn(),
streamSimple: vi.fn(),
streamSimpleOpenAICompletions: vi.fn(),
};
});
vi.mock("@mariozechner/pi-ai/oauth", () => {
return {
getOAuthProviders: () => [],
getOAuthApiKey: vi.fn(async () => null),
loginOpenAICodex: vi.fn(),
};
});
function createResolvedModel(provider: string, modelId: string, api = "openai-completions") {
return {
model: {
provider,
id: modelId,
name: modelId,
api,
reasoning: false,
input: ["text"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 128000,
maxTokens: 8192,
},
authStorage: { profiles: {} },
modelRegistry: { find: vi.fn() },
};
}
vi.mock("../../../src/agents/pi-embedded-runner/model.js", () => ({
resolveModel: vi.fn((provider: string, modelId: string) =>
createResolvedModel(provider, modelId),
),
resolveModelAsync: vi.fn(async (provider: string, modelId: string) =>
createResolvedModel(provider, modelId),
),
}));
vi.mock("../../../src/agents/model-auth.js", () => ({
getApiKeyForModel: vi.fn(async () => ({
apiKey: "test-api-key",
source: "test",
mode: "api-key",
})),
requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""),
}));
vi.mock("../../../src/agents/custom-api-registry.js", () => ({
ensureCustomApiRegistered: vi.fn(),
}));
function asLegacyTtsConfig(value: unknown): OpenClawConfig {
return value as OpenClawConfig;
}
function asLegacyOpenClawConfig(value: Record<string, unknown>): OpenClawConfig {
return value as unknown as OpenClawConfig;
}
const mockAssistantMessage = (content: AssistantMessage["content"]): AssistantMessage => ({
role: "assistant",
content,
api: "openai-completions",
provider: "openai",
model: "gpt-4o-mini",
usage: {
input: 1,
output: 1,
cacheRead: 0,
cacheWrite: 0,
totalTokens: 2,
cost: {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
total: 0,
},
},
stopReason: "stop",
timestamp: Date.now(),
});
function createSummarizeTextDeps() {
return {
completeSimple,
getApiKeyForModel: getApiKeyForModelMock,
prepareModelForSimpleCompletion: prepareModelForSimpleCompletionMock,
requireApiKey: requireApiKeyMock,
resolveModelAsync: resolveModelAsyncMock,
};
}
function createOpenAiTelephonyCfg(model: "tts-1" | "gpt-4o-mini-tts"): OpenClawConfig {
return asLegacyTtsConfig({
messages: {
tts: {
provider: "openai",
providers: {
openai: {
apiKey: "test-key",
model,
voice: "alloy",
instructions: "Speak warmly",
},
},
},
},
});
}
function createAudioBuffer(length = 2): Buffer {
return Buffer.from(new Uint8Array(length).fill(1));
}
async function withMockedSpeechFetch(
run: (fetchMock: ReturnType<typeof vi.fn>) => Promise<void>,
audioLength: number,
) {
const originalFetch = globalThis.fetch;
const fetchMock = vi.fn(async () => ({
ok: true,
arrayBuffer: async () => new ArrayBuffer(audioLength),
}));
globalThis.fetch = fetchMock as unknown as typeof fetch;
try {
await run(fetchMock);
} finally {
globalThis.fetch = originalFetch;
}
}
function resolveBaseUrl(rawValue: unknown, fallback: string): string {
return typeof rawValue === "string" && rawValue.trim() ? rawValue.replace(/\/+$/u, "") : fallback;
}
function resolveTestProviderConfig(
rawConfig: Record<string, unknown>,
providerId: string,
...aliases: string[]
): Record<string, unknown> {
const providers =
typeof rawConfig.providers === "object" &&
rawConfig.providers !== null &&
!Array.isArray(rawConfig.providers)
? (rawConfig.providers as Record<string, unknown>)
: {};
for (const key of [providerId, ...aliases]) {
const direct = rawConfig[key];
if (typeof direct === "object" && direct !== null && !Array.isArray(direct)) {
return direct as Record<string, unknown>;
}
const nested = providers[key];
if (typeof nested === "object" && nested !== null && !Array.isArray(nested)) {
return nested as Record<string, unknown>;
}
}
return {};
}
function buildTestOpenAISpeechProvider(): SpeechProviderPlugin {
return {
id: "openai",
label: "OpenAI",
autoSelectOrder: 10,
resolveConfig: ({ rawConfig }) => {
const config = resolveTestProviderConfig(rawConfig, "openai");
return {
...config,
baseUrl: resolveBaseUrl(
config.baseUrl ?? process.env.OPENAI_TTS_BASE_URL,
"https://api.openai.com/v1",
),
};
},
parseDirectiveToken: ({ key, value, providerConfig }) => {
if (key === "voice") {
const baseUrl = resolveBaseUrl(
(providerConfig as Record<string, unknown> | undefined)?.baseUrl,
"https://api.openai.com/v1",
);
const isDefaultEndpoint = baseUrl === "https://api.openai.com/v1";
const allowedVoices = new Set([
"alloy",
"ash",
"ballad",
"coral",
"echo",
"sage",
"shimmer",
"verse",
]);
if (isDefaultEndpoint && !allowedVoices.has(value)) {
return { handled: true, warnings: [`invalid OpenAI voice "${value}"`] };
}
return { handled: true, overrides: { voice: value } };
}
if (key === "model") {
const baseUrl = resolveBaseUrl(
(providerConfig as Record<string, unknown> | undefined)?.baseUrl,
"https://api.openai.com/v1",
);
const isDefaultEndpoint = baseUrl === "https://api.openai.com/v1";
const allowedModels = new Set(["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]);
if (isDefaultEndpoint && !allowedModels.has(value)) {
return { handled: true, warnings: [`invalid OpenAI model "${value}"`] };
}
return { handled: true, overrides: { model: value } };
}
return { handled: false };
},
isConfigured: ({ providerConfig }) =>
typeof (providerConfig as Record<string, unknown> | undefined)?.apiKey === "string" ||
typeof process.env.OPENAI_API_KEY === "string",
synthesize: async ({ text, providerConfig, providerOverrides }) => {
const config = providerConfig as Record<string, unknown> | undefined;
await fetch(`${resolveBaseUrl(config?.baseUrl, "https://api.openai.com/v1")}/audio/speech`, {
method: "POST",
body: JSON.stringify({
input: text,
model: providerOverrides?.model ?? config?.model ?? "gpt-4o-mini-tts",
voice: providerOverrides?.voice ?? config?.voice ?? "alloy",
}),
});
return {
audioBuffer: createAudioBuffer(1),
outputFormat: "mp3",
fileExtension: ".mp3",
voiceCompatible: true,
};
},
synthesizeTelephony: async ({ text, providerConfig }) => {
const config = providerConfig as Record<string, unknown> | undefined;
const configuredModel = typeof config?.model === "string" ? config.model : undefined;
const model = configuredModel ?? "tts-1";
const configuredInstructions =
typeof config?.instructions === "string" ? config.instructions : undefined;
const instructions =
model === "gpt-4o-mini-tts" ? configuredInstructions || undefined : undefined;
await fetch(`${resolveBaseUrl(config?.baseUrl, "https://api.openai.com/v1")}/audio/speech`, {
method: "POST",
body: JSON.stringify({
input: text,
model,
voice: config?.voice ?? "alloy",
instructions,
}),
});
return {
audioBuffer: createAudioBuffer(2),
outputFormat: "mp3",
sampleRate: 24000,
};
},
listVoices: async () => [{ id: "alloy", label: "Alloy" }],
};
}
function buildTestMicrosoftSpeechProvider(): SpeechProviderPlugin {
return {
id: "microsoft",
label: "Microsoft",
aliases: ["edge"],
autoSelectOrder: 30,
resolveConfig: ({ rawConfig }) => {
const edgeConfig = resolveTestProviderConfig(rawConfig, "microsoft", "edge");
return {
...edgeConfig,
outputFormat: edgeConfig.outputFormat ?? "audio-24khz-48kbitrate-mono-mp3",
};
},
isConfigured: ({ providerConfig }) =>
(providerConfig as Record<string, unknown> | undefined)?.enabled !== false,
synthesize: async () => ({
audioBuffer: createAudioBuffer(),
outputFormat: "mp3",
fileExtension: ".mp3",
voiceCompatible: true,
}),
listVoices: async () => [{ id: "edge", label: "Edge" }],
};
}
function buildTestElevenLabsSpeechProvider(): SpeechProviderPlugin {
return {
id: "elevenlabs",
label: "ElevenLabs",
autoSelectOrder: 20,
resolveConfig: ({ rawConfig }) => resolveTestProviderConfig(rawConfig, "elevenlabs"),
parseDirectiveToken: ({ key, value, currentOverrides }) => {
if (key === "voiceid") {
return { handled: true, overrides: { voiceId: value } };
}
if (key === "stability") {
return {
handled: true,
overrides: {
voiceSettings: {
...(currentOverrides as { voiceSettings?: Record<string, unknown> } | undefined)
?.voiceSettings,
stability: Number(value),
},
},
};
}
if (key === "speed") {
return {
handled: true,
overrides: {
voiceSettings: {
...(currentOverrides as { voiceSettings?: Record<string, unknown> } | undefined)
?.voiceSettings,
speed: Number(value),
},
},
};
}
return { handled: false };
},
isConfigured: ({ providerConfig }) =>
typeof (providerConfig as Record<string, unknown> | undefined)?.apiKey === "string" ||
typeof process.env.ELEVENLABS_API_KEY === "string" ||
typeof process.env.XI_API_KEY === "string",
synthesize: async () => ({
audioBuffer: createAudioBuffer(),
outputFormat: "mp3",
fileExtension: ".mp3",
voiceCompatible: true,
}),
listVoices: async () => [{ id: "eleven", label: "Eleven" }],
};
}
function buildTestGoogleSpeechProvider(): SpeechProviderPlugin {
return {
id: "google",
label: "Google",
autoSelectOrder: 50,
resolveConfig: ({ rawConfig }) => resolveTestProviderConfig(rawConfig, "google"),
isConfigured: ({ cfg, providerConfig }) =>
typeof (providerConfig as Record<string, unknown> | undefined)?.apiKey === "string" ||
typeof cfg?.models?.providers?.google?.apiKey === "string" ||
typeof process.env.GEMINI_API_KEY === "string" ||
typeof process.env.GOOGLE_API_KEY === "string",
synthesize: async () => ({
audioBuffer: createAudioBuffer(),
outputFormat: "wav",
fileExtension: ".wav",
voiceCompatible: false,
}),
synthesizeTelephony: async () => ({
audioBuffer: createAudioBuffer(),
outputFormat: "pcm",
sampleRate: 24_000,
}),
listVoices: async () => [{ id: "Kore", label: "Kore" }],
};
}
async function loadTtsRuntime(): Promise<TtsRuntimeModule> {
ttsRuntimePromise ??= import(speechCoreRuntimeApiModuleId) as Promise<TtsRuntimeModule>;
return await ttsRuntimePromise;
}
async function loadTtsCore(): Promise<TtsCoreModule> {
ttsCorePromise ??= import("../../../src/tts/tts-core.js");
return await ttsCorePromise;
}
async function setupTtsRuntime() {
if (ttsRuntimeInitialized) {
return;
}
ttsRuntime = await loadTtsRuntime();
resolveTtsConfig = ttsRuntime.resolveTtsConfig;
maybeApplyTtsToPayload = ttsRuntime.maybeApplyTtsToPayload;
getTtsProvider = ttsRuntime.getTtsProvider;
({
parseTtsDirectives,
resolveModelOverridePolicy,
getResolvedSpeechProviderConfig,
formatTtsProviderError,
sanitizeTtsErrorForLog,
} = ttsRuntime._test);
ttsRuntimeInitialized = true;
}
function setupTestSpeechProviderRegistry() {
prepareModelForSimpleCompletionMock = vi.fn(({ model }) => model);
const registry = createEmptyPluginRegistry();
registry.speechProviders = [
{ pluginId: "openai", provider: buildTestOpenAISpeechProvider(), source: "test" },
{ pluginId: "microsoft", provider: buildTestMicrosoftSpeechProvider(), source: "test" },
{ pluginId: "elevenlabs", provider: buildTestElevenLabsSpeechProvider(), source: "test" },
{ pluginId: "google", provider: buildTestGoogleSpeechProvider(), source: "test" },
];
setActivePluginRegistry(registry);
}
function createResolvedSummarizationConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
const rawConfig =
typeof cfg.messages?.tts === "object" && cfg.messages?.tts !== null ? cfg.messages.tts : {};
return {
auto: "off",
mode: rawConfig.mode ?? "final",
provider: "",
providerSource:
typeof rawConfig.provider === "string" && rawConfig.provider ? "config" : "default",
summaryModel: typeof rawConfig.summaryModel === "string" ? rawConfig.summaryModel : undefined,
modelOverrides: {
enabled: true,
allowText: true,
allowProvider: false,
allowVoice: true,
allowModelId: true,
allowVoiceSettings: true,
allowNormalization: true,
allowSeed: true,
},
providerConfigs: {},
prefsPath: typeof rawConfig.prefsPath === "string" ? rawConfig.prefsPath : undefined,
maxTextLength: typeof rawConfig.maxTextLength === "number" ? rawConfig.maxTextLength : 4096,
timeoutMs: typeof rawConfig.timeoutMs === "number" ? rawConfig.timeoutMs : 30_000,
rawConfig,
sourceConfig: cfg,
};
}
async function setupSummarizationMocks() {
({ summarizeText: summarizeTextCore } = await loadTtsCore());
prepareModelForSimpleCompletionMock = vi.fn(({ model }) => model);
({ completeSimple } = await import("@mariozechner/pi-ai"));
({ getApiKeyForModel: getApiKeyForModelMock, requireApiKey: requireApiKeyMock } =
await import("../../../src/agents/model-auth.js"));
({ resolveModelAsync: resolveModelAsyncMock } =
await import("../../../src/agents/pi-embedded-runner/model.js"));
({ ensureCustomApiRegistered: ensureCustomApiRegisteredMock } =
await import("../../../src/agents/custom-api-registry.js"));
vi.mocked(completeSimple).mockResolvedValue(
mockAssistantMessage([{ type: "text", text: "Summary" }]),
);
vi.mocked(getApiKeyForModelMock).mockResolvedValue({
apiKey: "test-api-key",
source: "test",
mode: "api-key",
});
vi.mocked(requireApiKeyMock).mockImplementation((auth: { apiKey?: string }) => auth.apiKey ?? "");
vi.mocked(resolveModelAsyncMock).mockImplementation(
async (provider: string, modelId: string) =>
createResolvedModel(provider, modelId) as unknown as Awaited<
ReturnType<typeof resolveModelAsyncMock>
>,
);
vi.mocked(ensureCustomApiRegisteredMock).mockReset();
prepareModelForSimpleCompletionMock = vi.fn(({ model }) => model);
}
async function setupTtsContractTest() {
await setupTtsRuntime();
setupTestSpeechProviderRegistry();
vi.clearAllMocks();
}
async function setupTtsSummarizationTest() {
vi.clearAllMocks();
await setupSummarizationMocks();
}
export function describeTtsConfigContract() {
describe("tts config contract", () => {
beforeEach(setupTtsContractTest);
describe("resolveEdgeOutputFormat", () => {
const baseCfg: OpenClawConfig = {
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
messages: { tts: {} },
};
it.each([
{
name: "default",
cfg: baseCfg,
expected: "audio-24khz-48kbitrate-mono-mp3",
},
{
name: "override",
cfg: {
...baseCfg,
messages: {
tts: {
edge: { outputFormat: "audio-24khz-96kbitrate-mono-mp3" },
},
},
} as unknown as OpenClawConfig,
expected: "audio-24khz-96kbitrate-mono-mp3",
},
] as const)("$name", ({ cfg, expected, name }) => {
const config = resolveTtsConfig(cfg);
const providerConfig = getResolvedSpeechProviderConfig(config, "microsoft") as {
outputFormat?: string;
};
expect(providerConfig.outputFormat, name).toBe(expected);
});
});
describe("parseTtsDirectives", () => {
it("extracts overrides and strips directives when enabled", () => {
const policy = resolveModelOverridePolicy({ enabled: true, allowProvider: true });
const input =
"Hello [[tts:provider=elevenlabs voiceId=pMsXgVXv3BLzUgSXRplE stability=0.4 speed=1.1]] world\n\n" +
"[[tts:text]](laughs) Read the song once more.[[/tts:text]]";
const result = parseTtsDirectives(input, policy);
const elevenlabsOverrides = result.overrides.providerOverrides?.elevenlabs as
| {
voiceId?: string;
voiceSettings?: { stability?: number; speed?: number };
}
| undefined;
expect(result.cleanedText).not.toContain("[[tts:");
expect(result.ttsText).toBe("(laughs) Read the song once more.");
expect(result.overrides.provider).toBe("elevenlabs");
expect(elevenlabsOverrides?.voiceId).toBe("pMsXgVXv3BLzUgSXRplE");
expect(elevenlabsOverrides?.voiceSettings?.stability).toBe(0.4);
expect(elevenlabsOverrides?.voiceSettings?.speed).toBe(1.1);
});
it("accepts edge as a legacy microsoft provider override", () => {
const policy = resolveModelOverridePolicy({ enabled: true, allowProvider: true });
const input = "Hello [[tts:provider=edge]] world";
const result = parseTtsDirectives(input, policy);
expect(result.overrides.provider).toBe("edge");
});
it("rejects provider override by default while keeping voice overrides enabled", () => {
const policy = resolveModelOverridePolicy({ enabled: true });
const input = "Hello [[tts:provider=edge voice=alloy]] world";
const result = parseTtsDirectives(input, policy);
const openaiOverrides = result.overrides.providerOverrides?.openai as
| { voice?: string }
| undefined;
expect(result.overrides.provider).toBeUndefined();
expect(openaiOverrides?.voice).toBe("alloy");
});
it("keeps text intact when overrides are disabled", () => {
const policy = resolveModelOverridePolicy({ enabled: false });
const input = "Hello [[tts:voice=alloy]] world";
const result = parseTtsDirectives(input, policy);
expect(result.cleanedText).toBe(input);
expect(result.overrides.provider).toBeUndefined();
});
it("accepts custom voices and models when openaiBaseUrl is a non-default endpoint", () => {
const policy = resolveModelOverridePolicy({ enabled: true });
const input = "Hello [[tts:voice=kokoro-chinese model=kokoro-v1]] world";
const result = parseTtsDirectives(input, policy, {
providerConfigs: {
openai: { baseUrl: "http://localhost:8880/v1" },
},
});
const openaiOverrides = result.overrides.providerOverrides?.openai as
| { voice?: string; model?: string }
| undefined;
expect(openaiOverrides?.voice).toBe("kokoro-chinese");
expect(openaiOverrides?.model).toBe("kokoro-v1");
expect(result.warnings).toHaveLength(0);
});
it("rejects unknown voices and models when openaiBaseUrl is the default OpenAI endpoint", () => {
const policy = resolveModelOverridePolicy({ enabled: true });
const input = "Hello [[tts:voice=kokoro-chinese model=kokoro-v1]] world";
const result = parseTtsDirectives(input, policy, {
providerConfigs: {
openai: { baseUrl: "https://api.openai.com/v1" },
},
});
const openaiOverrides = result.overrides.providerOverrides?.openai as
| { voice?: string }
| undefined;
expect(openaiOverrides?.voice).toBeUndefined();
expect(result.warnings).toContain('invalid OpenAI voice "kokoro-chinese"');
});
});
describe("getTtsProvider", () => {
it.each([
{
name: "openai key available",
env: {
OPENAI_API_KEY: "test-openai-key",
ELEVENLABS_API_KEY: undefined,
XI_API_KEY: undefined,
},
prefsPath: "/tmp/tts-prefs-openai.json",
expected: "openai",
},
{
name: "elevenlabs key available",
env: {
OPENAI_API_KEY: undefined,
ELEVENLABS_API_KEY: "test-elevenlabs-key",
XI_API_KEY: undefined,
},
prefsPath: "/tmp/tts-prefs-elevenlabs.json",
expected: "elevenlabs",
},
{
name: "falls back to microsoft",
env: {
OPENAI_API_KEY: undefined,
ELEVENLABS_API_KEY: undefined,
XI_API_KEY: undefined,
},
prefsPath: "/tmp/tts-prefs-microsoft.json",
expected: "microsoft",
},
] as const)("selects provider based on available API keys: $name", (testCase) => {
withIsolatedSpeechProviderEnv(testCase.env, () => {
const config = {
auto: "off",
mode: "final",
provider: "openai",
providerSource: "default",
summaryModel: undefined,
modelOverrides: resolveModelOverridePolicy(undefined),
providerConfigs: {
openai: {},
microsoft: {},
elevenlabs: {},
},
prefsPath: undefined,
maxTextLength: 4000,
timeoutMs: 30_000,
} as ReturnType<typeof resolveTtsConfig>;
const provider = getTtsProvider(config, testCase.prefsPath);
expect(provider).toBe(testCase.expected);
});
});
it("passes cfg into auto-selection so model-provider Google keys can configure TTS", () => {
withIsolatedSpeechProviderEnv(
{
OPENAI_API_KEY: undefined,
ELEVENLABS_API_KEY: undefined,
XI_API_KEY: undefined,
MINIMAX_API_KEY: undefined,
GEMINI_API_KEY: undefined,
GOOGLE_API_KEY: undefined,
},
() => {
const cfg = asLegacyOpenClawConfig({
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
models: {
providers: {
google: {
apiKey: "model-provider-google-key",
},
},
},
messages: {
tts: {
providers: {
microsoft: {
enabled: false,
},
},
},
},
});
const config = resolveTtsConfig(cfg);
const prefsPath = `/tmp/tts-prefs-google-model-provider-${Date.now()}.json`;
expect(getTtsProvider(config, prefsPath)).toBe("google");
},
);
});
});
describe("resolveTtsConfig provider normalization", () => {
it("normalizes legacy edge provider ids to microsoft", () => {
const config = resolveTtsConfig(
asLegacyOpenClawConfig({
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
messages: {
tts: {
provider: "edge",
providers: {
edge: {
enabled: true,
},
},
},
},
}),
);
expect(config.provider).toBe("microsoft");
expect(getTtsProvider(config, "/tmp/tts-prefs-normalized.json")).toBe("microsoft");
});
});
describe("resolveTtsConfig – openai.baseUrl", () => {
const baseCfg: OpenClawConfig = {
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
messages: { tts: {} },
};
it.each([
{
name: "default endpoint",
cfg: baseCfg,
env: { OPENAI_TTS_BASE_URL: undefined },
expected: "https://api.openai.com/v1",
},
{
name: "env override",
cfg: baseCfg,
env: { OPENAI_TTS_BASE_URL: "http://localhost:8880/v1" },
expected: "http://localhost:8880/v1",
},
{
name: "config wins over env",
cfg: {
...baseCfg,
messages: {
tts: { ...baseCfg.messages!.tts, openai: { baseUrl: "http://my-server:9000/v1" } },
},
} as unknown as OpenClawConfig,
env: { OPENAI_TTS_BASE_URL: "http://localhost:8880/v1" },
expected: "http://my-server:9000/v1",
},
{
name: "config slash trimming",
cfg: {
...baseCfg,
messages: {
tts: {
...baseCfg.messages!.tts,
openai: { baseUrl: "http://my-server:9000/v1///" },
},
},
} as unknown as OpenClawConfig,
env: { OPENAI_TTS_BASE_URL: undefined },
expected: "http://my-server:9000/v1",
},
{
name: "env slash trimming",
cfg: baseCfg,
env: { OPENAI_TTS_BASE_URL: "http://localhost:8880/v1/" },
expected: "http://localhost:8880/v1",
},
] as const)(
"resolves openai.baseUrl from config/env with config precedence and slash trimming: $name",
(testCase) => {
withEnv(testCase.env, () => {
const config = resolveTtsConfig(testCase.cfg);
const openaiConfig = getResolvedSpeechProviderConfig(config, "openai") as {
baseUrl?: string;
};
expect(openaiConfig.baseUrl, testCase.name).toBe(testCase.expected);
});
},
);
it("hydrates provider config lazily when no explicit speech provider is configured", () => {
withEnv({ OPENAI_TTS_BASE_URL: "http://localhost:8880/v1" }, () => {
const config = resolveTtsConfig(baseCfg);
const openaiConfig = getResolvedSpeechProviderConfig(config, "openai", baseCfg) as {
baseUrl?: string;
};
expect(config.provider).toBe("");
expect(openaiConfig.baseUrl).toBe("http://localhost:8880/v1");
});
});
});
});
}
export function describeTtsSummarizationContract() {
describe("tts summarization contract", () => {
beforeEach(setupTtsSummarizationTest);
const baseCfg: OpenClawConfig = {
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
messages: { tts: {} },
};
async function runSummarizeText(params?: {
text?: string;
targetLength?: number;
cfg?: OpenClawConfig;
}) {
const cfg = params?.cfg ?? baseCfg;
const config = createResolvedSummarizationConfig(cfg);
return await summarizeTextCore(
{
text: params?.text ?? "Long text to summarize",
targetLength: params?.targetLength ?? 500,
cfg,
config,
timeoutMs: 30_000,
},
createSummarizeTextDeps(),
);
}
it("summarizes text and returns result with metrics", async () => {
const mockSummary = "This is a summarized version of the text.";
vi.mocked(completeSimple).mockResolvedValue(
mockAssistantMessage([{ type: "text", text: mockSummary }]),
);
const longText = "A".repeat(2000);
const result = await runSummarizeText({
text: longText,
targetLength: 1500,
});
expect(result.summary).toBe(mockSummary);
expect(result.inputLength).toBe(2000);
expect(result.outputLength).toBe(mockSummary.length);
expect(result.latencyMs).toBeGreaterThanOrEqual(0);
expect(completeSimple).toHaveBeenCalledTimes(1);
});
it("calls the summary model with the expected parameters", async () => {
await runSummarizeText();
const callArgs = vi.mocked(completeSimple).mock.calls[0];
expect(callArgs?.[1]?.messages?.[0]?.role).toBe("user");
expect(callArgs?.[2]?.maxTokens).toBe(250);
expect(callArgs?.[2]?.temperature).toBe(0.3);
expect(getApiKeyForModelMock).toHaveBeenCalledTimes(1);
});
it("uses summaryModel override when configured", async () => {
const cfg: OpenClawConfig = {
agents: { defaults: { model: { primary: "anthropic/claude-opus-4-5" } } },
messages: { tts: { summaryModel: "openai/gpt-4.1-mini" } },
};
await runSummarizeText({ cfg });
expect(resolveModelAsyncMock).toHaveBeenCalledWith("openai", "gpt-4.1-mini", undefined, cfg);
});
it("keeps native completion APIs for direct summarization", async () => {
vi.mocked(resolveModelAsyncMock).mockResolvedValue({
...createResolvedModel("local-summary", "demo-model", "openai-completions"),
model: {
...createResolvedModel("local-summary", "demo-model", "openai-completions").model,
baseUrl: "http://127.0.0.1:4000/v1",
},
} as never);
await runSummarizeText();
expect(vi.mocked(completeSimple).mock.calls[0]?.[0]?.api).toBe("openai-completions");
expect(ensureCustomApiRegisteredMock).not.toHaveBeenCalled();
});
it.each([
{ targetLength: 99, shouldThrow: true },
{ targetLength: 100, shouldThrow: false },
{ targetLength: 10000, shouldThrow: false },
{ targetLength: 10001, shouldThrow: true },
] as const)("validates targetLength bounds: $targetLength", async (testCase) => {
const call = runSummarizeText({ text: "text", targetLength: testCase.targetLength });
if (testCase.shouldThrow) {
await expect(call, String(testCase.targetLength)).rejects.toThrow(
`Invalid targetLength: ${testCase.targetLength}`,
);
} else {
await expect(call, String(testCase.targetLength)).resolves.toBeDefined();
}
});
it.each([
{ name: "no summary blocks", message: mockAssistantMessage([]) },
{
name: "empty summary content",
message: mockAssistantMessage([{ type: "text", text: " " }]),
},
] as const)("throws when summary output is missing or empty: $name", async (testCase) => {
vi.mocked(completeSimple).mockResolvedValue(testCase.message);
await expect(runSummarizeText({ text: "text" }), testCase.name).rejects.toThrow(
"No summary returned",
);
});
});
}
export function describeTtsProviderRuntimeContract() {
describe("tts provider runtime contract", () => {
beforeEach(setupTtsContractTest);
describe("provider error redaction", () => {
it("redacts sensitive tokens in provider errors", () => {
const result = formatTtsProviderError(
"openai",
new Error("Authorization: Bearer sk-super-secret-token-1234567890"),
);
expect(result).toContain("openai:");
expect(result).toContain("Authorization: Bearer");
expect(result).not.toContain("sk-super-secret-token-1234567890");
});
it("escapes control characters in verbose fallback error logs", () => {
const result = sanitizeTtsErrorForLog(
new Error("failed\nAuthorization: Bearer sk-super-secret-token-1234567890\tboom"),
);
expect(result).toContain("\\n");
expect(result).toContain("\\t");
expect(result).not.toContain("sk-super-secret-token-1234567890");
});
});
describe("fallback readiness errors", () => {
it("continues synthesize fallback when primary readiness checks throw", async () => {
await withIsolatedSpeechProviderEnvAsync({}, async () => {
const throwingPrimary: SpeechProviderPlugin = {
id: "openai",
label: "OpenAI",
autoSelectOrder: 10,
resolveConfig: () => ({}),
isConfigured: () => {
throw new Error("Authorization: Bearer sk-readiness-throw-token-1234567890\nboom");
},
synthesize: async () => {
throw new Error("unexpected synthesize call");
},
};
const fallback: SpeechProviderPlugin = {
id: "microsoft",
label: "Microsoft",
autoSelectOrder: 20,
resolveConfig: () => ({}),
isConfigured: () => true,
synthesize: async () => ({
audioBuffer: createAudioBuffer(2),
outputFormat: "mp3",
fileExtension: ".mp3",
voiceCompatible: true,
}),
};
const registry = createEmptyPluginRegistry();
registry.speechProviders = [
{ pluginId: "openai", provider: throwingPrimary, source: "test" },
{ pluginId: "microsoft", provider: fallback, source: "test" },
];
setActivePluginRegistry(registry);
const result = await ttsRuntime.synthesizeSpeech({
text: "hello fallback",
cfg: {
messages: {
tts: {
provider: "openai",
},
},
},
});
expect(result.success).toBe(true);
if (!result.success) {
throw new Error("expected fallback synthesis success");
}
expect(result.provider).toBe("microsoft");
expect(result.fallbackFrom).toBe("openai");
expect(result.attemptedProviders).toEqual(["openai", "microsoft"]);
expect(result.attempts?.[0]).toMatchObject({
provider: "openai",
outcome: "failed",
reasonCode: "provider_error",
});
expect(result.attempts?.[1]).toMatchObject({
provider: "microsoft",
outcome: "success",
reasonCode: "success",
});
});
});
it("continues telephony fallback when primary readiness checks throw", async () => {
await withIsolatedSpeechProviderEnvAsync({}, async () => {
const throwingPrimary: SpeechProviderPlugin = {
id: "primary-throws",
label: "PrimaryThrows",
autoSelectOrder: 10,
resolveConfig: () => ({}),
isConfigured: () => {
throw new Error("Authorization: Bearer sk-telephony-throw-token-1234567890\tboom");
},
synthesize: async () => {
throw new Error("unexpected synthesize call");
},
};
const fallback: SpeechProviderPlugin = {
id: "microsoft",
label: "Microsoft",
autoSelectOrder: 20,
resolveConfig: () => ({}),
isConfigured: () => true,
synthesize: async () => ({
audioBuffer: createAudioBuffer(2),
outputFormat: "mp3",
fileExtension: ".mp3",
voiceCompatible: true,
}),
synthesizeTelephony: async () => ({
audioBuffer: createAudioBuffer(2),
outputFormat: "mp3",
sampleRate: 24000,
}),
};
const registry = createEmptyPluginRegistry();
registry.speechProviders = [
{ pluginId: "primary-throws", provider: throwingPrimary, source: "test" },
{ pluginId: "microsoft", provider: fallback, source: "test" },
];
setActivePluginRegistry(registry);
const result = await ttsRuntime.textToSpeechTelephony({
text: "hello telephony fallback",
cfg: {
messages: {
tts: {
provider: "primary-throws",
},
},
},
});
expect(result.success).toBe(true);
if (!result.success) {
throw new Error("expected telephony fallback success");
}
expect(result.provider).toBe("microsoft");
expect(result.fallbackFrom).toBe("primary-throws");
expect(result.attemptedProviders).toEqual(["primary-throws", "microsoft"]);
expect(result.attempts?.[0]).toMatchObject({
provider: "primary-throws",
outcome: "failed",
reasonCode: "provider_error",
});
expect(result.attempts?.[1]).toMatchObject({
provider: "microsoft",
outcome: "success",
reasonCode: "success",
});
});
});
it("does not double-prefix textToSpeech failure messages", async () => {
const failingProvider: SpeechProviderPlugin = {
id: "openai",
label: "OpenAI",
autoSelectOrder: 10,
resolveConfig: () => ({}),
isConfigured: () => true,
synthesize: async () => {
throw new Error("provider failed");
},
};
const registry = createEmptyPluginRegistry();
registry.speechProviders = [
{ pluginId: "openai", provider: failingProvider, source: "test" },
];
setActivePluginRegistry(registry);
const result = await ttsRuntime.textToSpeech({
text: "hello",
cfg: {
messages: {
tts: {
provider: "openai",
},
},
},
disableFallback: true,
});
expect(result.success).toBe(false);
if (result.success) {
throw new Error("expected synthesis failure");
}
expect(result.error).toBeDefined();
const errorMessage = result.error ?? "";
expect(errorMessage).toBe("TTS conversion failed: openai: provider failed");
expect(errorMessage).not.toContain("TTS conversion failed: TTS conversion failed:");
expect(errorMessage.match(/TTS conversion failed:/g)).toHaveLength(1);
});
});
describe("textToSpeechTelephony – openai instructions", () => {
async function expectTelephonyInstructions(
model: "tts-1" | "gpt-4o-mini-tts",
expectedInstructions: string | undefined,
) {
await withMockedSpeechFetch(async (fetchMock) => {
const result = await ttsRuntime.textToSpeechTelephony({
text: "Hello there, friendly caller.",
cfg: createOpenAiTelephonyCfg(model),
});
expect(result.success).toBe(true);
expect(fetchMock).toHaveBeenCalledTimes(1);
const [, init] = fetchMock.mock.calls[0] as [string, RequestInit];
expect(typeof init.body).toBe("string");
const body = JSON.parse(init.body as string) as Record<string, unknown>;
expect(body.instructions).toBe(expectedInstructions);
}, 2);
}
it.each([
{ name: "tts-1 omits instructions", model: "tts-1", expectedInstructions: undefined },
{
name: "gpt-4o-mini-tts keeps instructions",
model: "gpt-4o-mini-tts",
expectedInstructions: "Speak warmly",
},
] as const)(
"only includes instructions for supported telephony models: $name",
async (testCase) => {
await expectTelephonyInstructions(testCase.model, testCase.expectedInstructions);
},
);
});
});
}
export function describeTtsAutoApplyContract() {
describe("tts auto-apply contract", () => {
beforeEach(setupTtsContractTest);
const baseCfg: OpenClawConfig = asLegacyOpenClawConfig({
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
messages: {
tts: {
auto: "inbound",
provider: "openai",
providers: {
openai: { apiKey: "test-key", model: "gpt-4o-mini-tts", voice: "alloy" },
},
},
},
});
const withMockedAutoTtsFetch = async (
run: (fetchMock: ReturnType<typeof vi.fn>) => Promise<void>,
) => {
const prevPrefs = process.env.OPENCLAW_TTS_PREFS;
process.env.OPENCLAW_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
try {
await withMockedSpeechFetch(run, 1);
} finally {
process.env.OPENCLAW_TTS_PREFS = prevPrefs;
}
};
const taggedCfg: OpenClawConfig = {
...baseCfg,
messages: {
...baseCfg.messages!,
tts: { ...baseCfg.messages!.tts, auto: "tagged" },
},
};
async function expectAutoTtsOutcome(params: {
cfg: OpenClawConfig;
payload: { text: string };
inboundAudio?: boolean;
expectedFetchCalls: number;
expectSamePayload: boolean;
}) {
await withMockedAutoTtsFetch(async (fetchMock) => {
const result = await maybeApplyTtsToPayload({
payload: params.payload,
cfg: params.cfg,
kind: "final",
...(params.inboundAudio !== undefined ? { inboundAudio: params.inboundAudio } : {}),
});
expect(fetchMock).toHaveBeenCalledTimes(params.expectedFetchCalls);
if (params.expectSamePayload) {
expect(result).toBe(params.payload);
} else {
expect(result.mediaUrl).toBeDefined();
}
});
}
it.each([
{
name: "inbound gating blocks non-audio",
payload: { text: "Hello world" },
inboundAudio: false,
expectedFetchCalls: 0,
expectSamePayload: true,
},
{
name: "inbound gating blocks too-short cleaned text",
payload: { text: "### **bold**" },
inboundAudio: true,
expectedFetchCalls: 0,
expectSamePayload: true,
},
{
name: "inbound gating allows audio with real text",
payload: { text: "Hello world" },
inboundAudio: true,
expectedFetchCalls: 1,
expectSamePayload: false,
},
] as const)(
"applies inbound auto-TTS gating by audio status and cleaned text length: $name",
async (testCase) => {
await expectAutoTtsOutcome({
cfg: baseCfg,
payload: testCase.payload,
inboundAudio: testCase.inboundAudio,
expectedFetchCalls: testCase.expectedFetchCalls,
expectSamePayload: testCase.expectSamePayload,
});
},
);
it.each([
{
name: "plain text is skipped",
payload: { text: "Hello world" },
expectedFetchCalls: 0,
expectSamePayload: true,
},
{
name: "tagged text is synthesized",
payload: { text: "[[tts:text]]Hello world[[/tts:text]]" },
expectedFetchCalls: 1,
expectSamePayload: false,
},
] as const)("respects tagged-mode auto-TTS gating: $name", async (testCase) => {
await expectAutoTtsOutcome({
cfg: taggedCfg,
payload: testCase.payload,
expectedFetchCalls: testCase.expectedFetchCalls,
expectSamePayload: testCase.expectSamePayload,
});
});
});
}
¤ Dauer der Verarbeitung: 0.29 Sekunden
(vorverarbeitet am 2026-04-27)
¤
*© Formatika GbR, Deutschland
|
|