@@ -55,6 +55,7 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
55
55
56
56
mStart = PR_TRUE;
57
57
mDetectedCharset = nsnull;
58
+ mDetectedConfidence = 0.0 ;
58
59
mGotData = PR_FALSE;
59
60
mInputState = ePureAscii;
60
61
mLastChar = ' \0 ' ;
@@ -83,6 +84,7 @@ nsUniversalDetector::Reset()
83
84
84
85
mStart = PR_TRUE;
85
86
mDetectedCharset = nsnull;
87
+ mDetectedConfidence = 0.0 ;
86
88
mGotData = PR_FALSE;
87
89
mInputState = ePureAscii;
88
90
mLastChar = ' \0 ' ;
@@ -120,11 +122,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
120
122
/* The charset and its confidence must be set together: without braces only
 * the first assignment is guarded by the `if`, and the confidence would be
 * set to 0.99 for any buffer starting with EF, BOM or not. */
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
{
  /* EF BB BF: UTF-8 encoded BOM. */
  mDetectedCharset = "UTF-8";
  mDetectedConfidence = 0.99;
}
123
126
break ;
124
127
case ' \xFE ' :
125
128
/* Keep both assignments inside the conditional: without braces the
 * confidence would be set to 0.99 for any buffer starting with FE, even
 * when the second byte does not complete the BOM. */
if ('\xFF' == aBuf[1])
{
  /* FE FF: UTF-16, big endian BOM. */
  mDetectedCharset = "UTF-16";
  mDetectedConfidence = 0.99;
}
128
132
break ;
129
133
case ' \xFF ' :
130
134
if (' \xFE ' == aBuf[1 ])
@@ -135,11 +139,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
135
139
{
136
140
/* FF FE 00 00: UTF-32 (LE). */
137
141
mDetectedCharset = " UTF-32" ;
142
+ mDetectedConfidence = 0.99 ;
138
143
}
139
144
else
140
145
{
141
146
/* FF FE: UTF-16, little endian BOM. */
142
147
mDetectedCharset = " UTF-16" ;
148
+ mDetectedConfidence = 0.99 ;
143
149
}
144
150
}
145
151
break ;
@@ -151,6 +157,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
151
157
{
152
158
/* 00 00 FE FF: UTF-32 (BE). */
153
159
mDetectedCharset = " UTF-32" ;
160
+ mDetectedConfidence = 0.99 ;
154
161
}
155
162
break ;
156
163
}
@@ -241,16 +248,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
241
248
{
242
249
mDone = PR_TRUE;
243
250
mDetectedCharset = mEscCharSetProber ->GetCharSetName ();
251
+ mDetectedConfidence = mEscCharSetProber ->GetConfidence ();
244
252
}
245
253
else if (mNbspFound )
246
254
{
247
255
mDetectedCharset = " ISO-8859-1" ;
256
+ mDetectedConfidence = 1.0 ;
248
257
}
249
258
else
250
259
{
251
260
/* ASCII with the ESC character (or the sequence "~{") is still
252
261
* ASCII until proven otherwise. */
253
262
mDetectedCharset = " ASCII" ;
263
+ mDetectedConfidence = 1.0 ;
254
264
}
255
265
break ;
256
266
case eHighbyte:
@@ -263,6 +273,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
263
273
{
264
274
mDone = PR_TRUE;
265
275
mDetectedCharset = mCharSetProbers [i]->GetCharSetName ();
276
+ mDetectedConfidence = mCharSetProbers [i]->GetConfidence ();
266
277
return NS_OK;
267
278
}
268
279
}
@@ -275,11 +286,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
275
286
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
276
287
* (though it could have been any ISO-8859 encoding). */
277
288
mDetectedCharset = " ISO-8859-1" ;
289
+ mDetectedConfidence = 1.0 ;
278
290
}
279
291
else
280
292
{
281
293
/* Pure ASCII */
282
294
mDetectedCharset = " ASCII" ;
295
+ mDetectedConfidence = 1.0 ;
283
296
}
284
297
break ;
285
298
}
@@ -300,7 +313,7 @@ void nsUniversalDetector::DataEnd()
300
313
if (mDetectedCharset )
301
314
{
302
315
mDone = PR_TRUE;
303
- Report (mDetectedCharset );
316
+ Report (mDetectedCharset , mDetectedConfidence );
304
317
return ;
305
318
}
306
319
@@ -326,7 +339,7 @@ void nsUniversalDetector::DataEnd()
326
339
}
327
340
// do not report anything because we are not confident of it, that's in fact a negative answer
328
341
if (maxProberConfidence > MINIMUM_THRESHOLD)
329
- Report (mCharSetProbers [maxProber]->GetCharSetName ());
342
+ Report (mCharSetProbers [maxProber]->GetCharSetName (), mCharSetProbers [maxProber]-> GetConfidence () );
330
343
}
331
344
break ;
332
345
case eEscAscii:
0 commit comments