1
1
import { CartesiaClient } from "@cartesia/cartesia-js" ;
2
2
import { getRequiredEnv } from "./env" ;
3
3
4
/**
 * Preset voice identifiers, grouped by language.
 *
 * Each member's value is the voice id string that is sent as the voice `id`
 * of a TTS request; the per-member notes describe how the voice sounds.
 */
export enum VoiceId {
  // ---- English ----
  /** Warm, natural male voice */
  NATHAN = "97f4b8fb-f2fe-444b-bb9a-c109783a857a",
  /** Natural, expressive American female */
  SARAH = "694f9389-aac1-45b6-b726-9d9369183238",
  /** Smooth, friendly male voice */
  JORDAN = "87bc56aa-ab01-4baa-9071-77d497064686",

  // ---- Japanese ----
  /** Calm, clear Japanese male */
  KENJI = "6b92f628-be90-497c-8f4c-3b035002df71",
  /** Calm, clear Japanese female */
  YUKI = "59d4fd2f-f5eb-4410-8105-58db7661144f",

  // ---- Italian ----
  /** Graceful, melodic female */
  ITALIAN_WOMAN = "0e21713a-5e9a-428a-bed4-90d410b87f13",
  /** Deep, resonant male */
  ITALIAN_MAN = "029c3c7a-b6d9-44f0-814b-200d849830ff",

  // ---- Spanish/Mexican ----
  /** Even-toned Mexican female */
  MEXICAN_WOMAN = "5c5ad5e7-1020-476b-8b91-fdcbe9cc313c",
  /** Rich Mexican male */
  MEXICAN_MAN = "15d0c2e2-8d29-44c3-be23-d585d5f154a1",

  // ---- French ----
  /** Even, rich male */
  FRENCH_MAN = "5c3c89e5-535f-43ef-b14d-f8ffe148c1f0",
  /** Velvety, neutral female */
  FRENCH_WOMAN = "8832a0b5-47b2-4751-bb22-6a8e2149303d",

  // ---- Chinese ----
  /** Enthusiastic, deep male */
  CHINESE_MAN = "eda5bbff-1ff1-4886-8ef1-4e69a77640a0",
  /** Friendly, inviting female */
  CHINESE_WOMAN = "e90c6678-f0d3-4767-9883-5d0ecf5894a8",

  // ---- German ----
  /** Warm, expressive male */
  GERMAN_MAN = "b7187e84-fe22-4344-ba4a-bc013fcb533e",
  /** Warm, expressive female */
  GERMAN_WOMAN = "4ab1ff51-476d-42bb-8019-4d315f7c0c05",

  // ---- Dutch ----
  /** Smooth, articulate male */
  DUTCH_MAN = "af482421-80f4-4379-b00c-a118def29cde",
  /** Calm, clear female */
  DUTCH_WOMAN = "0eb213fe-4658-45bc-9442-33a48b24b133",

  // ---- Portuguese ----
  /** Calm, clear male */
  PORTUGUESE_MAN = "6a360542-a117-4ed5-9e09-e8bf9b05eabb",
  /** Friendly, natural female */
  PORTUGUESE_WOMAN = "d4b44b9a-82bc-4b65-b456-763fce4c52f9",

  // ---- Russian ----
  /** Deep, expressive male */
  RUSSIAN_MAN = "da05e96d-ca10-4220-9042-d8acef654fa9",
  /** Graceful, melodic female */
  RUSSIAN_WOMAN = "642014de-c0e3-4133-adc0-36b5309c23e6",

  // ---- Hindi ----
  /** Deep-toned, everyday male */
  HINDI_MAN = "be79f378-47fe-4f9c-b92b-f02cefa62ccf",
  /** Engaging, high energy female */
  HINDI_WOMAN = "dc40fa13-d344-4c77-b579-1f0aec140e05",

  // ---- Korean ----
  /** Warm, resonant male */
  KOREAN_MAN = "af6beeea-d732-40b6-8292-73af0035b740",
  /** Graceful, melodic female */
  KOREAN_WOMAN = "663afeec-d082-4ab5-827e-2e41bf73a25b",

  // ---- Swedish ----
  /** Warm, expressive female */
  SWEDISH_WOMAN = "6c6b05bf-ae5f-4013-82ab-7348e99ffdb2",
  /** Deep, resonant male */
  SWEDISH_MAN = "38a146c3-69d7-40ad-aada-76d5a2621758",

  // ---- Turkish ----
  /** Smooth, expressive female */
  TURKISH_WOMAN = "fa7bfcdc-603c-4bf1-a600-a371400d2f8c",
  /** Soothing, reassuring male */
  TURKISH_MAN = "39f753ef-b0eb-41cd-aa53-2f3c284f948f",

  // ---- Polish ----
  /** Graceful, melodic female */
  POLISH_WOMAN = "575a5d29-1fdc-4d4e-9afa-5a9a71759864",
  /** Deep, resonant male */
  POLISH_MAN = "4ef93bb3-682a-46e6-b881-8e157b6b4388",
}
67
+
4
68
export enum CartesiaModelId {
5
69
SONIC = "sonic" , // stable alias, recommended for prod
6
70
SONIC_PREVIEW = "sonic-preview" , // latest features, may be unstable
@@ -13,30 +77,112 @@ export const SUPPORTED_LANGUAGES = [
13
77
14
78
/** Union of the element types of `SUPPORTED_LANGUAGES`, i.e. one entry of that list. */
export type SupportedLanguage = typeof SUPPORTED_LANGUAGES [ number ] ;
15
79
16
- interface TTSOptions {
80
/**
 * Runtime list of the genders a voice can be selected by.
 *
 * Declared `as const` so the element types stay the literals
 * `"male"` and `"female"` rather than widening to `string`.
 */
export const GENDERS = ["male", "female"] as const;

/**
 * A single entry of `GENDERS`.
 *
 * Derived from the list (the same way `SupportedLanguage` is derived from
 * `SUPPORTED_LANGUAGES`) so the type and the runtime values cannot drift apart.
 */
export type Gender = (typeof GENDERS)[number];
82
+
83
/**
 * Lookup table: language -> gender -> preset voice.
 *
 * The `Record` annotation makes the compiler require an entry for every
 * `SupportedLanguage` and, within it, for every `Gender`.
 */
export const VOICE_MAP: Record<SupportedLanguage, Record<Gender, VoiceId>> = {
  en: { male: VoiceId.NATHAN, female: VoiceId.SARAH },
  ja: { male: VoiceId.KENJI, female: VoiceId.YUKI },
  it: { male: VoiceId.ITALIAN_MAN, female: VoiceId.ITALIAN_WOMAN },
  // Spanish is served by the Mexican voices.
  es: { male: VoiceId.MEXICAN_MAN, female: VoiceId.MEXICAN_WOMAN },
  fr: { male: VoiceId.FRENCH_MAN, female: VoiceId.FRENCH_WOMAN },
  zh: { male: VoiceId.CHINESE_MAN, female: VoiceId.CHINESE_WOMAN },
  de: { male: VoiceId.GERMAN_MAN, female: VoiceId.GERMAN_WOMAN },
  nl: { male: VoiceId.DUTCH_MAN, female: VoiceId.DUTCH_WOMAN },
  pt: { male: VoiceId.PORTUGUESE_MAN, female: VoiceId.PORTUGUESE_WOMAN },
  ru: { male: VoiceId.RUSSIAN_MAN, female: VoiceId.RUSSIAN_WOMAN },
  hi: { male: VoiceId.HINDI_MAN, female: VoiceId.HINDI_WOMAN },
  ko: { male: VoiceId.KOREAN_MAN, female: VoiceId.KOREAN_WOMAN },
  sv: { male: VoiceId.SWEDISH_MAN, female: VoiceId.SWEDISH_WOMAN },
  tr: { male: VoiceId.TURKISH_MAN, female: VoiceId.TURKISH_WOMAN },
  pl: { male: VoiceId.POLISH_MAN, female: VoiceId.POLISH_WOMAN },
};
146
+
147
/** Fields shared by every way of calling `tts`. */
interface BaseTTSOptions {
  /** Text to synthesize; sent as the request transcript. */
  text: string;
  /** Language of the request; `tts` falls back to `"en"` when omitted. */
  language?: SupportedLanguage;
  /** Model to use; `tts` falls back to `CartesiaModelId.SONIC` when omitted. */
  modelId?: CartesiaModelId;
}

/** Variant where the caller names the exact voice. */
interface VoiceIdTTSOptions extends BaseTTSOptions {
  /** Explicit preset voice to use. */
  voiceId: VoiceId;
  /** Typed as `never` so `gender` cannot be supplied together with `voiceId`. */
  gender?: never;
}

/** Variant where the voice is picked from `VOICE_MAP` by language and gender. */
interface GenderTTSOptions extends BaseTTSOptions {
  /** Typed as `never` so `voiceId` cannot be supplied together with `gender`. */
  voiceId?: never;
  /** Gender used to look up the voice for the request language. */
  gender: Gender;
}

/** Options for `tts`: select a voice either by id or by gender, not both. */
type TTSOptions = VoiceIdTTSOptions | GenderTTSOptions;
164
+
23
165
// get raw audio bytes from cartesia
24
166
export async function tts ( {
25
167
text,
26
- voiceId,
27
168
language = "en" ,
28
169
modelId = CartesiaModelId . SONIC ,
170
+ voiceId,
171
+ gender = "male"
29
172
} : TTSOptions ) : Promise < Uint8Array > {
30
173
const cartesia = new CartesiaClient ( {
31
174
apiKey : getRequiredEnv ( "CARTESIA_API_KEY" ) ,
32
175
} ) ;
33
176
177
+ // Use provided voiceId or get from language/gender map
178
+ const finalVoiceId = voiceId || VOICE_MAP [ language ] [ gender ] ;
179
+
34
180
const audioData = await cartesia . tts . bytes ( {
35
181
modelId,
36
182
transcript : text ,
37
183
voice : {
38
184
mode : "id" ,
39
- id : voiceId ,
185
+ id : finalVoiceId ,
40
186
} ,
41
187
language,
42
188
outputFormat : {
0 commit comments