Skip to content

Commit 33b68a9

Browse files
committed
cartesia
1 parent 64da2a4 commit 33b68a9

File tree

7 files changed

+289
-13
lines changed

7 files changed

+289
-13
lines changed

.cursorrules

+1-1
Original file line number | Diff line number | Diff line change
@@ -43,4 +43,4 @@
4343
- You're blocked on making progress until you run the command.
4444
- You changed important business logic (suggest running unit tests)
4545
- You updated a unit test file, or created a new set of unit tests.
46-
- NEVER ASK TO RUN `bun test` (the entire test suite). Always run `bun test:unit`.
46+
- Use `bun test` to run a specific test file

next/app/api/speak/route.ts

+19-4
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,35 @@
11
import { NextResponse } from "next/server";
2-
import { tts, CartesiaModelId, SUPPORTED_LANGUAGES } from "@/lib/cartesia";
2+
import { tts, CartesiaModelId, SUPPORTED_LANGUAGES, VoiceId, GENDERS } from "@/lib/cartesia";
33
import { z } from "zod";
44

5-
const speakSchema = z.object({
5+
const baseSchema = z.object({
66
text: z.string().min(1),
7-
voiceId: z.string(),
87
language: z.enum(SUPPORTED_LANGUAGES).optional(),
98
modelId: z.nativeEnum(CartesiaModelId).optional(),
109
});
1110

11+
// Request shape that names an explicit Cartesia voice: `voiceId` must be a VoiceId value and `gender` may not be sent.
const voiceIdSchema = baseSchema.extend({
12+
voiceId: z.nativeEnum(VoiceId),
13+
gender: z.never().optional(),
14+
});
15+
16+
// Request shape that picks a voice by gender: `voiceId` may not be sent and `gender` falls back to 'male' when omitted.
const genderSchema = baseSchema.extend({
17+
voiceId: z.never().optional(),
18+
gender: z.enum(GENDERS).default('male'),
19+
});
20+
21+
// Accepted request body: the `type` field ('voiceId' or 'gender') selects which of the two shapes above is validated.
const speakSchema = z.discriminatedUnion('type', [
22+
voiceIdSchema.extend({ type: z.literal('voiceId') }),
23+
genderSchema.extend({ type: z.literal('gender') }),
24+
]);
25+
1226
export async function POST(req: Request) {
1327
try {
1428
const body = await req.json();
1529
const result = speakSchema.parse(body);
1630

17-
const audioBytes = await tts(result);
31+
const { type, ...options } = result;
32+
const audioBytes = await tts(options);
1833

1934
return new NextResponse(audioBytes, {
2035
status: 200,

next/bun.lock

+53-4
Large diffs are not rendered by default.

next/lib/cartesia.ts

+150-4
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,70 @@
11
import { CartesiaClient } from "@cartesia/cartesia-js";
22
import { getRequiredEnv } from "./env";
33

4+
// Cartesia voice IDs, grouped by language. Each value is the ID string sent as `voice.id`
// in the TTS request; the trailing comment on each member describes how the voice sounds.
5+
export enum VoiceId {
6+
// English
7+
NATHAN = "97f4b8fb-f2fe-444b-bb9a-c109783a857a", // Warm, natural male voice
8+
SARAH = "694f9389-aac1-45b6-b726-9d9369183238", // Natural, expressive American female
9+
JORDAN = "87bc56aa-ab01-4baa-9071-77d497064686", // Smooth, friendly male voice
10+
11+
// Japanese
12+
KENJI = "6b92f628-be90-497c-8f4c-3b035002df71", // Calm, clear Japanese male
13+
YUKI = "59d4fd2f-f5eb-4410-8105-58db7661144f", // Calm, clear Japanese female
14+
15+
// Italian
16+
ITALIAN_WOMAN = "0e21713a-5e9a-428a-bed4-90d410b87f13", // Graceful, melodic female
17+
ITALIAN_MAN = "029c3c7a-b6d9-44f0-814b-200d849830ff", // Deep, resonant male
18+
19+
// Spanish/Mexican
20+
MEXICAN_WOMAN = "5c5ad5e7-1020-476b-8b91-fdcbe9cc313c", // Even-toned Mexican female
21+
MEXICAN_MAN = "15d0c2e2-8d29-44c3-be23-d585d5f154a1", // Rich Mexican male
22+
23+
// French
24+
FRENCH_MAN = "5c3c89e5-535f-43ef-b14d-f8ffe148c1f0", // Even, rich male
25+
FRENCH_WOMAN = "8832a0b5-47b2-4751-bb22-6a8e2149303d", // Velvety, neutral female
26+
27+
// Chinese
28+
CHINESE_MAN = "eda5bbff-1ff1-4886-8ef1-4e69a77640a0", // Enthusiastic, deep male
29+
CHINESE_WOMAN = "e90c6678-f0d3-4767-9883-5d0ecf5894a8", // Friendly, inviting female
30+
31+
// German
32+
GERMAN_MAN = "b7187e84-fe22-4344-ba4a-bc013fcb533e", // Warm, expressive male
33+
GERMAN_WOMAN = "4ab1ff51-476d-42bb-8019-4d315f7c0c05", // Warm, expressive female
34+
35+
// Dutch
36+
DUTCH_MAN = "af482421-80f4-4379-b00c-a118def29cde", // Smooth, articulate male
37+
DUTCH_WOMAN = "0eb213fe-4658-45bc-9442-33a48b24b133", // Calm, clear female
38+
39+
// Portuguese
40+
PORTUGUESE_MAN = "6a360542-a117-4ed5-9e09-e8bf9b05eabb", // Calm, clear male
41+
PORTUGUESE_WOMAN = "d4b44b9a-82bc-4b65-b456-763fce4c52f9", // Friendly, natural female
42+
43+
// Russian
44+
RUSSIAN_MAN = "da05e96d-ca10-4220-9042-d8acef654fa9", // Deep, expressive male
45+
RUSSIAN_WOMAN = "642014de-c0e3-4133-adc0-36b5309c23e6", // Graceful, melodic female
46+
47+
// Hindi
48+
HINDI_MAN = "be79f378-47fe-4f9c-b92b-f02cefa62ccf", // Deep-toned, everyday male
49+
HINDI_WOMAN = "dc40fa13-d344-4c77-b579-1f0aec140e05", // Engaging, high energy female
50+
51+
// Korean
52+
KOREAN_MAN = "af6beeea-d732-40b6-8292-73af0035b740", // Warm, resonant male
53+
KOREAN_WOMAN = "663afeec-d082-4ab5-827e-2e41bf73a25b", // Graceful, melodic female
54+
55+
// Swedish
56+
SWEDISH_WOMAN = "6c6b05bf-ae5f-4013-82ab-7348e99ffdb2", // Warm, expressive female
57+
SWEDISH_MAN = "38a146c3-69d7-40ad-aada-76d5a2621758", // Deep, resonant male
58+
59+
// Turkish
60+
TURKISH_WOMAN = "fa7bfcdc-603c-4bf1-a600-a371400d2f8c", // Smooth, expressive female
61+
TURKISH_MAN = "39f753ef-b0eb-41cd-aa53-2f3c284f948f", // Soothing, reassuring male
62+
63+
// Polish
64+
POLISH_WOMAN = "575a5d29-1fdc-4d4e-9afa-5a9a71759864", // Graceful, melodic female
65+
POLISH_MAN = "4ef93bb3-682a-46e6-b881-8e157b6b4388", // Deep, resonant male
66+
}
67+
468
export enum CartesiaModelId {
569
SONIC = "sonic", // stable alias, recommended for prod
670
SONIC_PREVIEW = "sonic-preview", // latest features, may be unstable
@@ -13,30 +77,112 @@ export const SUPPORTED_LANGUAGES = [
1377

1478
export type SupportedLanguage = typeof SUPPORTED_LANGUAGES[number];
1579

16-
interface TTSOptions {
80+
/** Genders for which a voice can be selected; the single source of truth for the `Gender` type. */
export const GENDERS = ["male", "female"] as const;

/** One of the values in `GENDERS`. Derived from the tuple so the list and the type cannot drift apart. */
export type Gender = typeof GENDERS[number];
82+
83+
// Voice mapping by language and gender
84+
export const VOICE_MAP: Record<SupportedLanguage, Record<Gender, VoiceId>> = {
85+
en: {
86+
male: VoiceId.NATHAN,
87+
female: VoiceId.SARAH
88+
},
89+
ja: {
90+
male: VoiceId.KENJI,
91+
female: VoiceId.YUKI
92+
},
93+
it: {
94+
male: VoiceId.ITALIAN_MAN,
95+
female: VoiceId.ITALIAN_WOMAN
96+
},
97+
es: {
98+
male: VoiceId.MEXICAN_MAN,
99+
female: VoiceId.MEXICAN_WOMAN
100+
},
101+
fr: {
102+
male: VoiceId.FRENCH_MAN,
103+
female: VoiceId.FRENCH_WOMAN
104+
},
105+
zh: {
106+
male: VoiceId.CHINESE_MAN,
107+
female: VoiceId.CHINESE_WOMAN
108+
},
109+
de: {
110+
male: VoiceId.GERMAN_MAN,
111+
female: VoiceId.GERMAN_WOMAN
112+
},
113+
nl: {
114+
male: VoiceId.DUTCH_MAN,
115+
female: VoiceId.DUTCH_WOMAN
116+
},
117+
pt: {
118+
male: VoiceId.PORTUGUESE_MAN,
119+
female: VoiceId.PORTUGUESE_WOMAN
120+
},
121+
ru: {
122+
male: VoiceId.RUSSIAN_MAN,
123+
female: VoiceId.RUSSIAN_WOMAN
124+
},
125+
hi: {
126+
male: VoiceId.HINDI_MAN,
127+
female: VoiceId.HINDI_WOMAN
128+
},
129+
ko: {
130+
male: VoiceId.KOREAN_MAN,
131+
female: VoiceId.KOREAN_WOMAN
132+
},
133+
sv: {
134+
male: VoiceId.SWEDISH_MAN,
135+
female: VoiceId.SWEDISH_WOMAN
136+
},
137+
tr: {
138+
male: VoiceId.TURKISH_MAN,
139+
female: VoiceId.TURKISH_WOMAN
140+
},
141+
pl: {
142+
male: VoiceId.POLISH_MAN,
143+
female: VoiceId.POLISH_WOMAN
144+
}
145+
};
146+
147+
interface BaseTTSOptions {
17148
text: string;
18-
voiceId: string;
19149
language?: SupportedLanguage;
20150
modelId?: CartesiaModelId;
21151
}
22152

153+
/** TTS options that name an explicit voice; `gender` cannot be supplied together with it. */
interface VoiceIdTTSOptions extends BaseTTSOptions {
154+
voiceId: VoiceId;
155+
gender?: never;
156+
}
157+
158+
/** TTS options that choose a voice by gender; `voiceId` cannot be supplied together with it. */
interface GenderTTSOptions extends BaseTTSOptions {
159+
voiceId?: never;
160+
gender: Gender;
161+
}
162+
163+
/** Options accepted by tts(): either an explicit voice or a gender, but not both. */
type TTSOptions = VoiceIdTTSOptions | GenderTTSOptions;
164+
23165
// get raw audio bytes from cartesia
24166
export async function tts({
25167
text,
26-
voiceId,
27168
language = "en",
28169
modelId = CartesiaModelId.SONIC,
170+
voiceId,
171+
gender = "male"
29172
}: TTSOptions): Promise<Uint8Array> {
30173
const cartesia = new CartesiaClient({
31174
apiKey: getRequiredEnv("CARTESIA_API_KEY"),
32175
});
33176

177+
// Use provided voiceId or get from language/gender map
178+
const finalVoiceId = voiceId || VOICE_MAP[language][gender];
179+
34180
const audioData = await cartesia.tts.bytes({
35181
modelId,
36182
transcript: text,
37183
voice: {
38184
mode: "id",
39-
id: voiceId,
185+
id: finalVoiceId,
40186
},
41187
language,
42188
outputFormat: {

next/lib/cartesia.unit.test.ts

+16
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
import { describe, it, expect } from "bun:test";
2+
import { SUPPORTED_LANGUAGES, GENDERS, VOICE_MAP, type SupportedLanguage, type Gender } from "./cartesia";
3+
4+
// Guards VOICE_MAP: every supported language must map both genders to a non-empty string ID.
describe("cartesia voice coverage", () => {
  it("should have male and female voices for all supported languages", () => {
    const languages: readonly SupportedLanguage[] = SUPPORTED_LANGUAGES;
    const genders: readonly Gender[] = GENDERS;

    for (const lang of languages) {
      for (const gender of genders) {
        const mappedVoice = VOICE_MAP[lang][gender];
        expect(mappedVoice).toBeDefined();
        expect(typeof mappedVoice).toBe("string");
        expect(mappedVoice.length).toBeGreaterThan(0);
      }
    }
  });
});

next/package.json

+1
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@
5959
"@types/react-dom": "19.0.3",
6060
"autoprefixer": "^10.0.1",
6161
"bun": "^1.2.2",
62+
"cheerio": "^1.0.0",
6263
"eslint": "^8",
6364
"eslint-config-next": "15.1.7",
6465
"husky": "^9.0.11",

next/scripts/extract-voices.ts

+49
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,49 @@
1+
/**
2+
* 1. Star voices in Cartesia dashboard
3+
* 2. Inspect/copy list element
4+
* 3. Copy to scripts/content.txt
5+
* 4. Run script and tell Cursor to update lib/cartesia.ts with new voices
6+
*/
7+
import fs from "fs";
8+
import * as cheerio from "cheerio";
9+
10+
// One voice entry scraped from the saved dashboard HTML.
interface Voice {
11+
id: string; // taken from the last path segment of the anchor's /voices/... href
12+
name: string; // text of the first <p> under the anchor's div.grid
13+
description: string; // text of the second <p> under the anchor's div.grid
14+
}
15+
16+
// Extract voice details from provided HTML
17+
function extractVoicesFromHTML(html: string): Voice[] {
18+
const $ = cheerio.load(html);
19+
const voices: Voice[] = [];
20+
$('a.block[href^="/voices/"]').each((_i: number, elem: any) => {
21+
const href = $(elem).attr("href");
22+
if (!href) return; // skip if no href
23+
const id = href.split("/").pop() || ""; // extract id from URL
24+
const pTags = $(elem).find("div.grid p");
25+
if (pTags.length < 2) return; // ensure we got both name & description
26+
const name = $(pTags[0]).text().trim();
27+
const description = $(pTags[1]).text().trim();
28+
voices.push({
29+
id,
30+
name,
31+
description,
32+
});
33+
});
34+
return voices;
35+
}
36+
37+
/**
 * Reads the saved HTML file (path taken from argv[2], falling back to
 * scripts/content.txt) and prints the extracted voices to stdout as indented JSON.
 * Logs the error and exits with status 1 if the file cannot be read.
 */
function main(): void {
  const inputPath = process.argv[2] || "scripts/content.txt";

  const onRead = (err: NodeJS.ErrnoException | null, data: string): void => {
    if (err) {
      console.error("Error reading file:", err);
      process.exit(1);
    }
    console.log(JSON.stringify(extractVoicesFromHTML(data), null, 2));
  };

  fs.readFile(inputPath, "utf8", onRead);
}
48+
49+
main();

0 commit comments

Comments
 (0)