cartesia util

0thernet · 0thernet · commit 64da2a4b46d0 · 2025-02-18T16:55:03.000-04:00
diff --git a/next/README.md b/next/README.md
@@ -2,9 +2,8 @@
 
 ## now
 
-- kaikki
-  - redis caching
 - real cloning
+- summarize kaikki definitions w llm
 
 ## future
 
diff --git a/next/app/api/speak/route.ts b/next/app/api/speak/route.ts
@@ -0,0 +1,54 @@
+import { NextResponse } from "next/server";
+import { tts, CartesiaModelId, SUPPORTED_LANGUAGES } from "@/lib/cartesia";
+import { z } from "zod";
+
+// Shape of the JSON body accepted by the POST handler below.
+const speakSchema = z.object({
+  text: z.string().min(1),
+  voiceId: z.string().min(1),
+  language: z.enum(SUPPORTED_LANGUAGES).optional(),
+  modelId: z.nativeEnum(CartesiaModelId).optional(),
+});
+
+/**
+ * Synthesizes speech for the posted text and returns it as WAV audio.
+ *
+ * Responds with 400 when the body is not valid JSON or fails `speakSchema`,
+ * and with 500 when speech generation itself fails.
+ */
+export async function POST(req: Request) {
+  // Malformed JSON is the caller's mistake, so report it as a 400.
+  let body: unknown;
+  try {
+    body = await req.json();
+  } catch {
+    return NextResponse.json({ error: "Invalid JSON body" }, { status: 400 });
+  }
+
+  const parsed = speakSchema.safeParse(body);
+  if (!parsed.success) {
+    return NextResponse.json(
+      { error: "Invalid request", issues: parsed.error.issues },
+      { status: 400 }
+    );
+  }
+
+  try {
+    const audioBytes = await tts(parsed.data);
+
+    return new NextResponse(audioBytes, {
+      status: 200,
+      headers: {
+        "Content-Type": "audio/wav",
+        "Content-Length": audioBytes.byteLength.toString(),
+      },
+    });
+  } catch (error) {
+    // Anything thrown here is a server-side or upstream failure, not bad input.
+    console.error("Failed to generate speech:", error);
+    return NextResponse.json(
+      { error: "Failed to generate speech" },
+      { status: 500 }
+    );
+  }
+}
diff --git a/next/components/fragment.tsx b/next/components/fragment.tsx
@@ -159,7 +159,7 @@ export const FragmentComponent = ({ fragment, showTranslation, outputLanguage }:
 
   const desktopContent = selectedWord && (
     <div className="p-4">
-      <div className="mt-4 overflow-auto max-h-[calc(100vh-12rem)]">
+      <div className="overflow-auto max-h-[calc(100vh-12rem)]">
         {renderDefinitions()}
       </div>
       <div className="mt-4">
diff --git a/next/lib/cartesia.ts b/next/lib/cartesia.ts
@@ -0,0 +1,60 @@
+import { CartesiaClient } from "@cartesia/cartesia-js";
+import { getRequiredEnv } from "./env";
+
+/** Model identifiers that can be passed to `tts`. */
+export enum CartesiaModelId {
+  SONIC = "sonic", // stable alias, recommended for prod
+  SONIC_PREVIEW = "sonic-preview", // latest features, may be unstable
+}
+
+/** Language codes accepted for the `language` option of `tts`. */
+export const SUPPORTED_LANGUAGES = [
+  "en", "fr", "de", "es", "pt", "zh", "ja",
+  "hi", "it", "ko", "nl", "pl", "ru", "sv", "tr"
+] as const;
+
+/** Union of the codes listed in `SUPPORTED_LANGUAGES`. */
+export type SupportedLanguage = typeof SUPPORTED_LANGUAGES[number];
+
+/** Arguments for `tts`; `language` and `modelId` fall back to defaults. */
+interface TTSOptions {
+  text: string;
+  voiceId: string;
+  language?: SupportedLanguage;
+  modelId?: CartesiaModelId;
+}
+
+/**
+ * Fetches synthesized speech for `text` from Cartesia as raw bytes.
+ *
+ * The request asks for a WAV container, sample rate 44100, `pcm_f32le` encoding.
+ * `language` defaults to "en" and `modelId` to `CartesiaModelId.SONIC`.
+ *
+ * @returns the audio payload as a `Uint8Array`
+ */
+export async function tts(options: TTSOptions): Promise<Uint8Array> {
+  const {
+    text,
+    voiceId,
+    language = "en",
+    modelId = CartesiaModelId.SONIC,
+  } = options;
+
+  // The API key is looked up on every call via getRequiredEnv.
+  const apiKey = getRequiredEnv("CARTESIA_API_KEY");
+  const client = new CartesiaClient({ apiKey });
+
+  const wavData = await client.tts.bytes({
+    modelId,
+    transcript: text,
+    voice: { mode: "id", id: voiceId },
+    language,
+    outputFormat: {
+      container: "wav",
+      sampleRate: 44100,
+      encoding: "pcm_f32le",
+    },
+  });
+
+  return new Uint8Array(wavData);
+}
diff --git a/next/scripts/speak.ts b/next/scripts/speak.ts
@@ -1,11 +1,11 @@
 import { openai } from "@ai-sdk/openai";
 import { generateObject } from "ai";
-import { CartesiaClient } from "@cartesia/cartesia-js";
 import { z } from "zod";
 import { writeFileSync } from "fs";
 import { exec } from "child_process";
 import { promisify } from "util";
 import fetch from "node-fetch";
+import { tts, CartesiaModelId } from "@/lib/cartesia";
 
 const execAsync = promisify(exec);
 const wordsSchema = z.object({
@@ -62,6 +62,7 @@ async function main() {
   const delaySeconds = slowMode ? 10 : (delayArg ? parseFloat(delayArg.split("=")[1]) : 4);
   const taoMode = process.argv.includes("--tao");
   const spanishMode = process.argv.includes("--spanish");
+  const previewMode = process.argv.includes("--preview");
 
   if (taoMode && spanishMode) {
     throw new Error("Cannot use --tao and --spanish together");
@@ -77,16 +78,10 @@ async function main() {
     console.log(`Loaded ${taoVerses.length} verses`);
   }
 
-  if (!process.env.OPENAI_API_KEY) {
-    throw new Error("OPENAI_API_KEY is required");
+  const voiceId = process.env.CARTESIA_VOICE_ID;
+  if (!voiceId) {
+    throw new Error("CARTESIA_VOICE_ID is required");
   }
-  if (!process.env.CARTESIA_API_KEY) {
-    throw new Error("CARTESIA_API_KEY is required");
-  }
-
-  const cartesia = new CartesiaClient({
-    apiKey: process.env.CARTESIA_API_KEY,
-  });
 
   const recentUtterances: string[] = [];
 
@@ -124,24 +119,16 @@ Return only valid JSON.`.trim(),
       }
 
       try {
-        const audioData = await cartesia.tts.bytes({
-          modelId: "sonic-english",
-          transcript: utterance,
-          voice: {
-            mode: "id",
-            id: "694f9389-aac1-45b6-b726-9d9369183238",
-          },
-          language: "en",
-          outputFormat: {
-            container: "wav",
-            sampleRate: 44100,
-            encoding: "pcm_f32le",
-          },
+        const audioData = await tts({
+          text: utterance,
+          voiceId,
+          language: spanishMode ? "es" : "en",
+          modelId: previewMode ? CartesiaModelId.SONIC_PREVIEW : CartesiaModelId.SONIC,
         });
 
         // Save audio to temp file
         const tempFile = `/tmp/speech-${Date.now()}.wav`;
-        writeFileSync(tempFile, new Uint8Array(audioData));
+        writeFileSync(tempFile, audioData);
 
         // Play using sox to Loopback device
         await execAsync(`sox "${tempFile}" -t coreaudio "Loopback"`);