Skip to content

Commit 64da2a4

Browse files
committed
cartesia util
1 parent fd4b8cc commit 64da2a4

File tree

5 files changed

+96
-27
lines changed

5 files changed

+96
-27
lines changed

next/README.md

+1-2
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22

33
## now
44

5-
- kaikki
6-
- redis caching
75
- real cloning
6+
- summarize kaikki definitions with an LLM
87

98
## future
109

next/app/api/speak/route.ts

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import { NextResponse } from "next/server";
2+
import { tts, CartesiaModelId, SUPPORTED_LANGUAGES } from "@/lib/cartesia";
3+
import { z } from "zod";
4+
5+
// Field validators for the /api/speak request body.
const transcriptField = z.string().min(1); // empty transcripts are rejected
const voiceIdField = z.string();
const languageField = z.enum(SUPPORTED_LANGUAGES).optional();
const modelIdField = z.nativeEnum(CartesiaModelId).optional();

/**
 * Shape of the JSON body accepted by the speak endpoint.
 * `language` and `modelId` may be omitted by the caller.
 */
const speakSchema = z.object({
  text: transcriptField,
  voiceId: voiceIdField,
  language: languageField,
  modelId: modelIdField,
});
11+
12+
/**
 * POST handler for the speak route: validates the JSON body against
 * `speakSchema`, synthesizes speech via `tts`, and streams back WAV bytes.
 *
 * Responses:
 * - 200 with `Content-Type: audio/wav` on success.
 * - 400 when the body is not valid JSON or fails schema validation
 *   (caller error).
 * - 500 when speech synthesis itself fails (server/upstream error).
 *
 * @param req - Incoming request whose body is expected to be JSON.
 * @returns The audio response, or a JSON `{ error }` payload on failure.
 */
export async function POST(req: Request) {
  // Step 1: parse the body. A malformed payload is the caller's fault.
  let body: unknown;
  try {
    body = await req.json();
  } catch (error) {
    console.error("Failed to parse speak request body:", error);
    return NextResponse.json(
      { error: "Request body must be valid JSON" },
      { status: 400 }
    );
  }

  // Step 2: validate without throwing so we can report the issues directly.
  const parsed = speakSchema.safeParse(body);
  if (!parsed.success) {
    return NextResponse.json(
      { error: "Invalid request", issues: parsed.error.issues },
      { status: 400 }
    );
  }

  // Step 3: synthesize. Anything that goes wrong here is not a client error,
  // so it must not be reported as 400.
  try {
    const audioBytes = await tts(parsed.data);

    return new NextResponse(audioBytes, {
      status: 200,
      headers: {
        "Content-Type": "audio/wav",
        "Content-Length": audioBytes.byteLength.toString(),
      },
    });
  } catch (error) {
    console.error("Failed to generate speech:", error);
    return NextResponse.json(
      { error: "Failed to generate speech" },
      { status: 500 }
    );
  }
}

next/components/fragment.tsx

+1-1
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ export const FragmentComponent = ({ fragment, showTranslation, outputLanguage }:
159159

160160
const desktopContent = selectedWord && (
161161
<div className="p-4">
162-
<div className="mt-4 overflow-auto max-h-[calc(100vh-12rem)]">
162+
<div className="overflow-auto max-h-[calc(100vh-12rem)]">
163163
{renderDefinitions()}
164164
</div>
165165
<div className="mt-4">

next/lib/cartesia.ts

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import { CartesiaClient } from "@cartesia/cartesia-js";
2+
import { getRequiredEnv } from "./env";
3+
4+
/** Model identifiers that can be passed as `modelId` when synthesizing speech. */
export enum CartesiaModelId {
  /** Stable alias, recommended for prod. */
  SONIC = "sonic",
  /** Latest features, may be unstable. */
  SONIC_PREVIEW = "sonic-preview",
}

/**
 * Language codes accepted for the `language` option.
 * Declared `as const` so the literal union can be derived from it.
 */
export const SUPPORTED_LANGUAGES = [
  "en", "fr", "de", "es", "pt", "zh", "ja",
  "hi", "it", "ko", "nl", "pl", "ru", "sv", "tr",
] as const;

/** Union of the literal codes listed in `SUPPORTED_LANGUAGES`. */
export type SupportedLanguage = (typeof SUPPORTED_LANGUAGES)[number];

/**
 * Options accepted by `tts`.
 *
 * Exported so callers of the exported `tts` function can name its parameter
 * type (and so declaration emit does not reference a private name).
 */
export interface TTSOptions {
  /** Text to be spoken. */
  text: string;
  /** Identifier of the voice to use. */
  voiceId: string;
  /** Language code; optional. */
  language?: SupportedLanguage;
  /** Model to use; optional. */
  modelId?: CartesiaModelId;
}
22+
23+
/**
 * Synthesize speech with Cartesia and return the raw audio bytes.
 *
 * The audio is requested as a WAV container at 44100 Hz with `pcm_f32le`
 * encoding. The API key is looked up through `getRequiredEnv` on every call
 * (presumably failing when `CARTESIA_API_KEY` is unset — confirm in ./env).
 *
 * @param options - Transcript and voice; `language` falls back to "en" and
 *   `modelId` to `CartesiaModelId.SONIC` when omitted.
 * @returns The synthesized audio as a `Uint8Array`.
 */
export async function tts(options: TTSOptions): Promise<Uint8Array> {
  const {
    text,
    voiceId,
    language = "en",
    modelId = CartesiaModelId.SONIC,
  } = options;

  const apiKey = getRequiredEnv("CARTESIA_API_KEY");
  const client = new CartesiaClient({ apiKey });

  const wav = await client.tts.bytes({
    modelId,
    transcript: text,
    voice: { mode: "id", id: voiceId },
    language,
    outputFormat: {
      container: "wav",
      sampleRate: 44100,
      encoding: "pcm_f32le",
    },
  });

  // Normalize whatever buffer type the client returns into a Uint8Array.
  return new Uint8Array(wav);
}

next/scripts/speak.ts

+11-24
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import { openai } from "@ai-sdk/openai";
22
import { generateObject } from "ai";
3-
import { CartesiaClient } from "@cartesia/cartesia-js";
43
import { z } from "zod";
54
import { writeFileSync } from "fs";
65
import { exec } from "child_process";
76
import { promisify } from "util";
87
import fetch from "node-fetch";
8+
import { tts, CartesiaModelId } from "@/lib/cartesia";
99

1010
const execAsync = promisify(exec);
1111
const wordsSchema = z.object({
@@ -62,6 +62,7 @@ async function main() {
6262
const delaySeconds = slowMode ? 10 : (delayArg ? parseFloat(delayArg.split("=")[1]) : 4);
6363
const taoMode = process.argv.includes("--tao");
6464
const spanishMode = process.argv.includes("--spanish");
65+
const previewMode = process.argv.includes("--preview");
6566

6667
if (taoMode && spanishMode) {
6768
throw new Error("Cannot use --tao and --spanish together");
@@ -77,16 +78,10 @@ async function main() {
7778
console.log(`Loaded ${taoVerses.length} verses`);
7879
}
7980

80-
if (!process.env.OPENAI_API_KEY) {
81-
throw new Error("OPENAI_API_KEY is required");
81+
const voiceId = process.env.CARTESIA_VOICE_ID;
82+
if (!voiceId) {
83+
throw new Error("CARTESIA_VOICE_ID is required");
8284
}
83-
if (!process.env.CARTESIA_API_KEY) {
84-
throw new Error("CARTESIA_API_KEY is required");
85-
}
86-
87-
const cartesia = new CartesiaClient({
88-
apiKey: process.env.CARTESIA_API_KEY,
89-
});
9085

9186
const recentUtterances: string[] = [];
9287

@@ -124,24 +119,16 @@ Return only valid JSON.`.trim(),
124119
}
125120

126121
try {
127-
const audioData = await cartesia.tts.bytes({
128-
modelId: "sonic-english",
129-
transcript: utterance,
130-
voice: {
131-
mode: "id",
132-
id: "694f9389-aac1-45b6-b726-9d9369183238",
133-
},
134-
language: "en",
135-
outputFormat: {
136-
container: "wav",
137-
sampleRate: 44100,
138-
encoding: "pcm_f32le",
139-
},
122+
const audioData = await tts({
123+
text: utterance,
124+
voiceId,
125+
language: spanishMode ? "es" : "en",
126+
modelId: previewMode ? CartesiaModelId.SONIC_PREVIEW : CartesiaModelId.SONIC,
140127
});
141128

142129
// Save audio to temp file
143130
const tempFile = `/tmp/speech-${Date.now()}.wav`;
144-
writeFileSync(tempFile, new Uint8Array(audioData));
131+
writeFileSync(tempFile, audioData);
145132

146133
// Play using sox to Loopback device
147134
await execAsync(`sox "${tempFile}" -t coreaudio "Loopback"`);

0 commit comments

Comments
 (0)