@@ -6,7 +6,11 @@ export interface KaikkiEntry {
6
6
senses : Array < {
7
7
raw_glosses ?: string [ ] ;
8
8
glosses ?: string [ ] ;
9
+ form_of ?: Array < {
10
+ word : string ;
11
+ } > ;
9
12
} > ;
13
+ base_word_definition ?: KaikkiEntry ; // for form_of references
10
14
}
11
15
12
16
interface KaikkiCache {
@@ -21,18 +25,38 @@ export function cleanWord(word: string): string {
21
25
// Multiple cleaning passes to handle nested cases
22
26
for ( let i = 0 ; i < 3 ; i ++ ) {
23
27
result = result
24
- . replace ( / [ . , ! ? ; : » ) } \] ] + $ / g, '' ) // trailing
25
- . replace ( / ^ [ « ( { [ \] ] + / g, '' ) // leading
26
- . replace ( / ^ [ " ' ` ] ( .+ ) [ " ' ` ] $ / g, '$1' ) // quotes
28
+ // Universal punctuation (including inverted marks)
29
+ . replace ( / [ . , ! ? ¿ ¡ ; : … ‥ ] + $ / gu, "" ) // trailing periods, ellipsis (including CJK)
30
+ . replace ( / ^ [ ¡ ¿ ] + / gu, "" ) // leading spanish marks
31
+
32
+ // Brackets/parens/braces across scripts (escape special chars)
33
+ . replace ( / [ » › 〉 》 \) \] } } > ] + $ / gu, "" ) // trailing (including CJK/JP)
34
+ . replace ( / ^ [ « ‹ 〈 《 \( \[ { { < ] + / gu, "" ) // leading (including CJK/JP)
35
+
36
+ // Quotes across scripts
37
+ . replace ( / ^ [ " ' ` " " ' ' ‹ › « » 「 」 『 』 ] + ( .+ ) [ " ' ` " " ' ' ‹ › « » 「 」 『 』 ] + $ / gu, "$1" )
38
+
39
+ // Arabic/Persian/Hebrew punctuation
40
+ . replace ( / [ ؟ ، ؛ ] + $ / gu, "" ) // Arabic question mark, comma, semicolon
41
+ . replace ( / ^ [ ؟ ، ؛ ] + / gu, "" ) // Leading Arabic punctuation
42
+
43
+ // CJK specific
44
+ . replace ( / [ ! ? 。 ] + $ / gu, "" ) // CJK exclamation, question, full stop
45
+ . replace ( / ^ [ ! ? 。 ] + / gu, "" ) // Leading CJK marks (rare but possible)
46
+
47
+ // Devanagari/Indic
48
+ . replace ( / [ । ॥ ] + $ / gu, "" ) // Devanagari danda, double danda
49
+ . replace ( / ^ [ । ॥ ] + / gu, "" ) // Leading dandas (rare but possible)
50
+
27
51
. trim ( ) ;
28
52
}
29
53
return result . toLowerCase ( ) ;
30
54
}
31
55
32
56
// from https://kaikki.org
33
- export type KaikkiLanguage =
57
+ export type KaikkiLanguage =
34
58
// Primary dictionaries
35
- | "English"
59
+ | "English"
36
60
| "Spanish"
37
61
| "Italian"
38
62
| "German"
@@ -82,27 +106,36 @@ function toTitleCase(str: string): string {
82
106
83
107
/**
 * Builds the extension-less Kaikki dictionary path for a word:
 * `<Language>/meaning/<first char>/<first two chars>/<word>`.
 *
 * Every word-derived segment is lower-cased; the language segment is passed
 * through `toTitleCase`.
 *
 * @param word - The word to look up. Must not be empty.
 * @param language - The dictionary language.
 * @returns The path to append to the Kaikki dictionary base URL.
 * @throws TypeError if `word` is empty.
 */
function buildKaikkiPath(word: string, language: KaikkiLanguage): string {
  // Split by code point rather than UTF-16 code unit so that a leading
  // character outside the BMP is not cut into a lone surrogate.
  const chars = Array.from(word);
  const [first] = chars;
  if (first === undefined) {
    throw new TypeError("Cannot build a Kaikki path for an empty word");
  }

  const firstLetter = first.toLowerCase();
  const firstTwo = chars.slice(0, 2).join("").toLowerCase();
  const wordLower = word.toLowerCase();
  const languageFormatted = toTitleCase(language);
  return `${languageFormatted}/meaning/${firstLetter}/${firstTwo}/${wordLower}`;
}
90
114
91
/**
 * Returns the URL of the Kaikki HTML page for a word.
 *
 * @param word - The word to look up.
 * @param language - The dictionary language.
 * @returns The Kaikki dictionary base URL followed by the word's path and `.html`.
 */
export function buildKaikkiHtmlUrl(
  word: string,
  language: KaikkiLanguage
): string {
  const pagePath = buildKaikkiPath(word, language);
  return "https://kaikki.org/dictionary/" + pagePath + ".html";
}
94
121
95
/**
 * Returns the URL of the Kaikki JSON Lines file for a word.
 *
 * @param word - The word to look up.
 * @param language - The dictionary language.
 * @returns The Kaikki dictionary base URL followed by the word's path and `.jsonl`.
 */
export function buildKaikkiJsonlUrl(
  word: string,
  language: KaikkiLanguage
): string {
  const dataPath = buildKaikkiPath(word, language);
  return "https://kaikki.org/dictionary/" + dataPath + ".jsonl";
}
98
128
99
- async function fetchFromKaikki ( word : string , language : KaikkiLanguage ) : Promise < KaikkiEntry [ ] > {
129
+ async function fetchFromKaikki (
130
+ word : string ,
131
+ language : KaikkiLanguage
132
+ ) : Promise < KaikkiEntry [ ] > {
100
133
const url = buildKaikkiJsonlUrl ( word , language ) ;
101
134
console . log ( `🔍 Fetching from URL: ${ url } ` ) ;
102
-
135
+
103
136
const r = await fetch ( url ) ;
104
137
console . log ( `📡 Response status: ${ r . status } ${ r . statusText } ` ) ;
105
-
138
+
106
139
if ( ! r . ok ) {
107
140
const text = await r . text ( ) . catch ( ( ) => "No response body" ) ;
108
141
console . error ( `❌ Error details:
@@ -118,24 +151,57 @@ async function fetchFromKaikki(word: string, language: KaikkiLanguage): Promise<
118
151
return lines . map ( ( line ) => JSON . parse ( line ) ) ;
119
152
}
120
153
121
/**
 * Fetches the Kaikki entries for a word, using Redis as a read-through cache.
 *
 * For every sense that carries a `form_of` reference, the first referenced
 * base word is looked up as well and its first entry is attached to the
 * owning entry as `base_word_definition`. When several senses of one entry
 * reference base words, the last successful lookup wins.
 *
 * @param word - The word to look up.
 * @param language - The dictionary language (defaults to "English").
 * @param noCache - When true, the cache is neither read nor written, for this
 *   word and for every base word resolved on its behalf.
 * @returns The entries for `word`, enriched with base word definitions.
 * @throws Whatever `getRedis`, the cache client or `fetchFromKaikki` throws
 *   for `word` itself; failures while resolving a base word are only logged.
 */
export async function fetchKaikkiDefinitions(
  word: string,
  language: KaikkiLanguage = "English",
  noCache: boolean = false
): Promise<KaikkiEntry[]> {
  return fetchKaikkiDefinitionsGuarded(word, language, noCache, new Set());
}

/**
 * Implementation of `fetchKaikkiDefinitions` that also tracks the cache keys
 * of the words currently being resolved further up the call chain.
 *
 * A word's result is cached only after its base words have been resolved, so
 * without this tracking a sense whose `form_of` names the word itself (or a
 * longer A -> B -> A chain) would recurse without bound.
 */
async function fetchKaikkiDefinitionsGuarded(
  word: string,
  language: KaikkiLanguage,
  noCache: boolean,
  ancestors: ReadonlySet<string>
): Promise<KaikkiEntry[]> {
  const redis = await getRedis();
  const cacheKey = buildKaikkiKey(word, language);

  // Skip cache if noCache is true
  if (!noCache) {
    // Try cache first
    const cached = await redis.get<KaikkiCache>(cacheKey);
    if (cached) {
      console.log(`🎯 Cache hit for ${cacheKey}`);
      return cached.definitions;
    }
  }

  // Fetch from Kaikki
  console.log(
    `💫 ${noCache ? "Skipping cache" : "Cache miss"} for ${cacheKey}`
  );
  const definitions = await fetchFromKaikki(word, language);

  // Words in flight from here down: everything above us plus this word.
  const inFlight = new Set(ancestors).add(cacheKey);
  // Base words already resolved during this call, so several senses pointing
  // at the same base word trigger a single lookup. Failed lookups are not
  // recorded and are therefore retried by the next sense that needs them.
  const resolved = new Map<string, KaikkiEntry | undefined>();

  // Fetch base word definitions for form_of references
  for (const def of definitions) {
    for (const sense of def.senses) {
      const baseWord = sense.form_of?.[0]?.word;
      if (!baseWord) {
        continue;
      }

      const baseKey = buildKaikkiKey(baseWord, language);
      if (inFlight.has(baseKey)) {
        // Self-reference or cycle: the base word is this word or one of its
        // callers, so resolving it again would never terminate.
        // NOTE(review): an entry nested inside a cycle is cached without its
        // own base_word_definition; confirm that is acceptable for callers.
        console.log(
          `↩️ Skipping base word ${baseWord}: already being resolved`
        );
        continue;
      }

      if (!resolved.has(baseKey)) {
        console.log(`🔄 Fetching base word definition for ${baseWord}`);
        try {
          const baseDefs = await fetchKaikkiDefinitionsGuarded(
            baseWord,
            language,
            noCache,
            inFlight
          );
          resolved.set(baseKey, baseDefs[0]);
        } catch (e) {
          console.warn(`⚠️ Failed to fetch base word ${baseWord}:`, e);
        }
      }

      const baseDefinition = resolved.get(baseKey);
      if (baseDefinition) {
        def.base_word_definition = baseDefinition;
      }
    }
  }

  // Cache the result (CACHE_TTL expiry) unless noCache is true
  if (!noCache) {
    await redis.set(cacheKey, { definitions }, { ex: CACHE_TTL });
    console.log(`💾 Cached definitions for ${cacheKey} (expires in 1 year)`);
  }

  return definitions;
}
0 commit comments