cartesia

0thernet · 0thernet · commit 33b68a91ea4e · 2025-02-18T17:24:36.000-04:00
diff --git a/.cursorrules b/.cursorrules
@@ -43,4 +43,4 @@
 - You're blocked on making progress until you run the command.
 - You changed important business logic (suggest running unit tests)
 - You updated a unit test file, or created a new set of unit tests.
-- NEVER ASK TO RUN `bun test` (the entire test suite). Always run `bun test:unit`.
+- Use `bun test` to run a specific test file
diff --git a/next/app/api/speak/route.ts b/next/app/api/speak/route.ts
@@ -1,20 +1,35 @@
 import { NextResponse } from "next/server";
-import { tts, CartesiaModelId, SUPPORTED_LANGUAGES } from "@/lib/cartesia";
+import { tts, CartesiaModelId, SUPPORTED_LANGUAGES, VoiceId, GENDERS } from "@/lib/cartesia";
 import { z } from "zod";
 
-const speakSchema = z.object({
+const baseSchema = z.object({
   text: z.string().min(1),
-  voiceId: z.string(),
   language: z.enum(SUPPORTED_LANGUAGES).optional(),
   modelId: z.nativeEnum(CartesiaModelId).optional(),
 });
 
+// Request variant that names an exact voice: `voiceId` must be one of the
+// VoiceId enum values, and `gender` must not be given a value alongside it.
+const voiceIdSchema = baseSchema.extend({
+  voiceId: z.nativeEnum(VoiceId),
+  gender: z.never().optional(),
+});
+
+// Request variant that selects a voice by gender: `voiceId` must not be given
+// a value, and `gender` falls back to 'male' when it is omitted.
+const genderSchema = baseSchema.extend({
+  voiceId: z.never().optional(),
+  gender: z.enum(GENDERS).default('male'),
+});
+
+// Accepted request body. The `type` field ('voiceId' or 'gender') is the
+// discriminator that decides which variant the payload is validated against.
+const speakSchema = z.discriminatedUnion('type', [
+  voiceIdSchema.extend({ type: z.literal('voiceId') }),
+  genderSchema.extend({ type: z.literal('gender') }),
+]);
+
 export async function POST(req: Request) {
   try {
     const body = await req.json();
     const result = speakSchema.parse(body);
 
-    const audioBytes = await tts(result);
+    const { type, ...options } = result;
+    const audioBytes = await tts(options);
 
     return new NextResponse(audioBytes, {
       status: 200,
diff --git a/next/bun.lock b/next/bun.lock
diff --git a/next/lib/cartesia.ts b/next/lib/cartesia.ts
@@ -1,6 +1,70 @@
 import { CartesiaClient } from "@cartesia/cartesia-js";
 import { getRequiredEnv } from "./env";
 
+/**
+ * Voice IDs, grouped by language.
+ *
+ * Each member's value is the voice identifier string; the trailing comment on
+ * each line is a short description of how that voice sounds.
+ */
+export enum VoiceId {
+  // English
+  NATHAN = "97f4b8fb-f2fe-444b-bb9a-c109783a857a", // Warm, natural male voice
+  SARAH = "694f9389-aac1-45b6-b726-9d9369183238",  // Natural, expressive American female
+  JORDAN = "87bc56aa-ab01-4baa-9071-77d497064686", // Smooth, friendly male voice
+
+  // Japanese
+  KENJI = "6b92f628-be90-497c-8f4c-3b035002df71",  // Calm, clear Japanese male
+  YUKI = "59d4fd2f-f5eb-4410-8105-58db7661144f",   // Calm, clear Japanese female
+
+  // Italian
+  ITALIAN_WOMAN = "0e21713a-5e9a-428a-bed4-90d410b87f13", // Graceful, melodic female
+  ITALIAN_MAN = "029c3c7a-b6d9-44f0-814b-200d849830ff",   // Deep, resonant male
+
+  // Spanish/Mexican
+  MEXICAN_WOMAN = "5c5ad5e7-1020-476b-8b91-fdcbe9cc313c", // Even-toned Mexican female
+  MEXICAN_MAN = "15d0c2e2-8d29-44c3-be23-d585d5f154a1",   // Rich Mexican male
+
+  // French
+  FRENCH_MAN = "5c3c89e5-535f-43ef-b14d-f8ffe148c1f0",    // Even, rich male
+  FRENCH_WOMAN = "8832a0b5-47b2-4751-bb22-6a8e2149303d",   // Velvety, neutral female
+
+  // Chinese
+  CHINESE_MAN = "eda5bbff-1ff1-4886-8ef1-4e69a77640a0",    // Enthusiastic, deep male
+  CHINESE_WOMAN = "e90c6678-f0d3-4767-9883-5d0ecf5894a8",  // Friendly, inviting female
+
+  // German
+  GERMAN_MAN = "b7187e84-fe22-4344-ba4a-bc013fcb533e",     // Warm, expressive male
+  GERMAN_WOMAN = "4ab1ff51-476d-42bb-8019-4d315f7c0c05",   // Warm, expressive female
+
+  // Dutch
+  DUTCH_MAN = "af482421-80f4-4379-b00c-a118def29cde",      // Smooth, articulate male
+  DUTCH_WOMAN = "0eb213fe-4658-45bc-9442-33a48b24b133",    // Calm, clear female
+
+  // Portuguese
+  PORTUGUESE_MAN = "6a360542-a117-4ed5-9e09-e8bf9b05eabb", // Calm, clear male
+  PORTUGUESE_WOMAN = "d4b44b9a-82bc-4b65-b456-763fce4c52f9", // Friendly, natural female
+
+  // Russian
+  RUSSIAN_MAN = "da05e96d-ca10-4220-9042-d8acef654fa9",    // Deep, expressive male
+  RUSSIAN_WOMAN = "642014de-c0e3-4133-adc0-36b5309c23e6",  // Graceful, melodic female
+
+  // Hindi
+  HINDI_MAN = "be79f378-47fe-4f9c-b92b-f02cefa62ccf",      // Deep-toned, everyday male
+  HINDI_WOMAN = "dc40fa13-d344-4c77-b579-1f0aec140e05",    // Engaging, high energy female
+
+  // Korean
+  KOREAN_MAN = "af6beeea-d732-40b6-8292-73af0035b740",     // Warm, resonant male
+  KOREAN_WOMAN = "663afeec-d082-4ab5-827e-2e41bf73a25b",   // Graceful, melodic female
+
+  // Swedish
+  SWEDISH_WOMAN = "6c6b05bf-ae5f-4013-82ab-7348e99ffdb2",  // Warm, expressive female
+  SWEDISH_MAN = "38a146c3-69d7-40ad-aada-76d5a2621758",     // Deep, resonant male
+
+  // Turkish
+  TURKISH_WOMAN = "fa7bfcdc-603c-4bf1-a600-a371400d2f8c",  // Smooth, expressive female
+  TURKISH_MAN = "39f753ef-b0eb-41cd-aa53-2f3c284f948f",    // Soothing, reassuring male
+
+  // Polish
+  POLISH_WOMAN = "575a5d29-1fdc-4d4e-9afa-5a9a71759864",   // Graceful, melodic female
+  POLISH_MAN = "4ef93bb3-682a-46e6-b881-8e157b6b4388",     // Deep, resonant male
+}
+
 export enum CartesiaModelId {
   SONIC = "sonic", // stable alias, recommended for prod
   SONIC_PREVIEW = "sonic-preview", // latest features, may be unstable
@@ -13,30 +77,112 @@ export const SUPPORTED_LANGUAGES = [
 
 export type SupportedLanguage = typeof SUPPORTED_LANGUAGES[number];
 
-interface TTSOptions {
+/** Genders a voice can be requested by; the runtime source of truth for `Gender`. */
+export const GENDERS = ["male", "female"] as const;
+/** Union of the entries in `GENDERS`, derived from it so the two cannot drift apart. */
+export type Gender = (typeof GENDERS)[number];
+
+/**
+ * Voice lookup table: for each supported language, the VoiceId to use for
+ * each gender. The `Record` annotation makes the compiler require an entry
+ * for every SupportedLanguage and, within it, for every Gender.
+ */
+export const VOICE_MAP: Record<SupportedLanguage, Record<Gender, VoiceId>> = {
+  en: {
+    male: VoiceId.NATHAN,
+    female: VoiceId.SARAH
+  },
+  ja: {
+    male: VoiceId.KENJI,
+    female: VoiceId.YUKI
+  },
+  it: {
+    male: VoiceId.ITALIAN_MAN,
+    female: VoiceId.ITALIAN_WOMAN
+  },
+  // Spanish is served by the Mexican voices.
+  es: {
+    male: VoiceId.MEXICAN_MAN,
+    female: VoiceId.MEXICAN_WOMAN
+  },
+  fr: {
+    male: VoiceId.FRENCH_MAN,
+    female: VoiceId.FRENCH_WOMAN
+  },
+  zh: {
+    male: VoiceId.CHINESE_MAN,
+    female: VoiceId.CHINESE_WOMAN
+  },
+  de: {
+    male: VoiceId.GERMAN_MAN,
+    female: VoiceId.GERMAN_WOMAN
+  },
+  nl: {
+    male: VoiceId.DUTCH_MAN,
+    female: VoiceId.DUTCH_WOMAN
+  },
+  pt: {
+    male: VoiceId.PORTUGUESE_MAN,
+    female: VoiceId.PORTUGUESE_WOMAN
+  },
+  ru: {
+    male: VoiceId.RUSSIAN_MAN,
+    female: VoiceId.RUSSIAN_WOMAN
+  },
+  hi: {
+    male: VoiceId.HINDI_MAN,
+    female: VoiceId.HINDI_WOMAN
+  },
+  ko: {
+    male: VoiceId.KOREAN_MAN,
+    female: VoiceId.KOREAN_WOMAN
+  },
+  sv: {
+    male: VoiceId.SWEDISH_MAN,
+    female: VoiceId.SWEDISH_WOMAN
+  },
+  tr: {
+    male: VoiceId.TURKISH_MAN,
+    female: VoiceId.TURKISH_WOMAN
+  },
+  pl: {
+    male: VoiceId.POLISH_MAN,
+    female: VoiceId.POLISH_WOMAN
+  }
+};
+
+/** Options common to every `tts` call, however the voice is chosen. */
+interface BaseTTSOptions {
   text: string;
-  voiceId: string;
   language?: SupportedLanguage;
   modelId?: CartesiaModelId;
 }
 
+/** Caller names an exact voice; `gender` is typed `never` so it cannot be given a value too. */
+interface VoiceIdTTSOptions extends BaseTTSOptions {
+  voiceId: VoiceId;
+  gender?: never;
+}
+
+/** Caller picks a voice by gender; `voiceId` is typed `never` so it cannot be given a value too. */
+interface GenderTTSOptions extends BaseTTSOptions {
+  voiceId?: never;
+  gender: Gender;
+}
+
+/** Either an explicit voice or a gender, but not both. */
+type TTSOptions = VoiceIdTTSOptions | GenderTTSOptions;
+
 // get raw audio bytes from cartesia
 export async function tts({
   text,
-  voiceId,
   language = "en",
   modelId = CartesiaModelId.SONIC,
+  voiceId,
+  gender = "male"
 }: TTSOptions): Promise<Uint8Array> {
   const cartesia = new CartesiaClient({
     apiKey: getRequiredEnv("CARTESIA_API_KEY"),
   });
 
+  // Use provided voiceId or get from language/gender map
+  const finalVoiceId = voiceId || VOICE_MAP[language][gender];
+
   const audioData = await cartesia.tts.bytes({
     modelId,
     transcript: text,
     voice: {
       mode: "id",
-      id: voiceId,
+      id: finalVoiceId,
     },
     language,
     outputFormat: {
diff --git a/next/lib/cartesia.unit.test.ts b/next/lib/cartesia.unit.test.ts
@@ -0,0 +1,16 @@
+import { describe, it, expect } from "bun:test";
+import { SUPPORTED_LANGUAGES, GENDERS, VOICE_MAP, type SupportedLanguage, type Gender } from "./cartesia";
+
+describe("cartesia voice coverage", () => {
+  it("should have male and female voices for all supported languages", () => {
+    // Flatten the language x gender grid so each combination is checked once.
+    const combinations: Array<[SupportedLanguage, Gender]> = SUPPORTED_LANGUAGES.flatMap(
+      (lang) => GENDERS.map((gender): [SupportedLanguage, Gender] => [lang, gender]),
+    );
+
+    for (const [lang, gender] of combinations) {
+      const mappedVoice = VOICE_MAP[lang][gender];
+      // The mapped voice must exist and be a non-empty string.
+      expect(mappedVoice).toBeDefined();
+      expect(typeof mappedVoice).toBe("string");
+      expect(mappedVoice.length).toBeGreaterThan(0);
+    }
+  });
+});
diff --git a/next/package.json b/next/package.json
@@ -59,6 +59,7 @@
     "@types/react-dom": "19.0.3",
     "autoprefixer": "^10.0.1",
     "bun": "^1.2.2",
+    "cheerio": "^1.0.0",
     "eslint": "^8",
     "eslint-config-next": "15.1.7",
     "husky": "^9.0.11",
diff --git a/next/scripts/extract-voices.ts b/next/scripts/extract-voices.ts
@@ -0,0 +1,49 @@
+/**
+ * 1. Star voices in Cartesia dashboard
+ * 2. Inspect/copy list element
+ * 3. Copy to scripts/content.txt
+ * 4. Run script and tell Cursor to update lib/cartesia.ts with new voices
+ */
+import fs from "fs";
+import * as cheerio from "cheerio";
+
+/** One voice entry scraped from the copied voice-list HTML. */
+interface Voice {
+  id: string; // last path segment of the voice link's href
+  name: string; // text of the first <p> under the link's div.grid
+  description: string; // text of the second <p> under the link's div.grid
+}
+
+/**
+ * Extract voice details from the provided HTML.
+ *
+ * Looks at every `<a class="block">` whose href starts with `/voices/`. The
+ * voice id is the last path segment of the href; the name and description are
+ * the text of the first two `<p>` tags under `div.grid` inside the link.
+ *
+ * @param html - markup copied from the voice list element
+ * @returns one entry per link that has a non-empty id and at least two `<p>`
+ *   tags; links missing either are skipped
+ */
+function extractVoicesFromHTML(html: string): Voice[] {
+  const $ = cheerio.load(html);
+  const voices: Voice[] = [];
+  // Callback parameter types are inferred from cheerio instead of being forced to `any`.
+  $('a.block[href^="/voices/"]').each((_i, elem) => {
+    const link = $(elem);
+    const href = link.attr("href");
+    if (!href) return; // skip if no href
+    // A bare "/voices/" href matches the selector but carries no id; skip it
+    // rather than emit a voice with an empty id.
+    const id = href.split("/").pop();
+    if (!id) return;
+    const pTags = link.find("div.grid p");
+    if (pTags.length < 2) return; // ensure we got both name & description
+    voices.push({
+      id,
+      name: pTags.eq(0).text().trim(),
+      description: pTags.eq(1).text().trim(),
+    });
+  });
+  return voices;
+}
+
+/**
+ * Script entry point: reads the HTML dump (path from the first CLI argument,
+ * otherwise "scripts/content.txt") and prints the extracted voices as
+ * pretty-printed JSON. Exits with status 1 if the file cannot be read.
+ */
+function main(): void {
+  const inputPath = process.argv[2] || "scripts/content.txt";
+
+  const handleRead = (readError: NodeJS.ErrnoException | null, html: string): void => {
+    if (readError) {
+      console.error("Error reading file:", readError);
+      process.exit(1);
+    }
+    console.log(JSON.stringify(extractVoicesFromHTML(html), null, 2));
+  };
+
+  fs.readFile(inputPath, "utf8", handleRead);
+}
+
+main();