Skip to content

Commit f1e11d6

Browse files
committed
Add uchardet_get_confidence() function
1 parent b7707d4 commit f1e11d6

7 files changed

+47
-16
lines changed

src/nsSBCSGroupProber.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -128,19 +128,19 @@ nsSBCSGroupProber::nsSBCSGroupProber()
128128

129129
mProbers[46] = new nsSingleByteCharSetProber(&Windows_1250CzechModel);
130130
mProbers[47] = new nsSingleByteCharSetProber(&Iso_8859_2CzechModel);
131-
mProbers[48] = new nsSingleByteCharSetProber(&Mac_CentraleuropeCzechModel);
131+
mProbers[48] = new nsSingleByteCharSetProber(&MaccentraleuropeCzechModel);
132132
mProbers[49] = new nsSingleByteCharSetProber(&Ibm852CzechModel);
133133

134134
mProbers[50] = new nsSingleByteCharSetProber(&Windows_1250SlovakModel);
135135
mProbers[51] = new nsSingleByteCharSetProber(&Iso_8859_2SlovakModel);
136-
mProbers[52] = new nsSingleByteCharSetProber(&Mac_CentraleuropeSlovakModel);
136+
mProbers[52] = new nsSingleByteCharSetProber(&MaccentraleuropeSlovakModel);
137137
mProbers[53] = new nsSingleByteCharSetProber(&Ibm852SlovakModel);
138138

139139
mProbers[54] = new nsSingleByteCharSetProber(&Windows_1250PolishModel);
140140
mProbers[55] = new nsSingleByteCharSetProber(&Iso_8859_2PolishModel);
141141
mProbers[56] = new nsSingleByteCharSetProber(&Iso_8859_13PolishModel);
142142
mProbers[57] = new nsSingleByteCharSetProber(&Iso_8859_16PolishModel);
143-
mProbers[58] = new nsSingleByteCharSetProber(&Mac_CentraleuropePolishModel);
143+
mProbers[58] = new nsSingleByteCharSetProber(&MaccentraleuropePolishModel);
144144
mProbers[59] = new nsSingleByteCharSetProber(&Ibm852PolishModel);
145145

146146
mProbers[60] = new nsSingleByteCharSetProber(&Iso_8859_1FinnishModel);
@@ -160,7 +160,7 @@ nsSBCSGroupProber::nsSBCSGroupProber()
160160
mProbers[71] = new nsSingleByteCharSetProber(&Iso_8859_2CroatianModel);
161161
mProbers[72] = new nsSingleByteCharSetProber(&Iso_8859_13CroatianModel);
162162
mProbers[73] = new nsSingleByteCharSetProber(&Iso_8859_16CroatianModel);
163-
mProbers[74] = new nsSingleByteCharSetProber(&Mac_CentraleuropeCroatianModel);
163+
mProbers[74] = new nsSingleByteCharSetProber(&MaccentraleuropeCroatianModel);
164164
mProbers[75] = new nsSingleByteCharSetProber(&Ibm852CroatianModel);
165165

166166
mProbers[76] = new nsSingleByteCharSetProber(&Windows_1252EstonianModel);
@@ -182,7 +182,7 @@ nsSBCSGroupProber::nsSBCSGroupProber()
182182
mProbers[89] = new nsSingleByteCharSetProber(&Windows_1250SloveneModel);
183183
mProbers[90] = new nsSingleByteCharSetProber(&Iso_8859_2SloveneModel);
184184
mProbers[91] = new nsSingleByteCharSetProber(&Iso_8859_16SloveneModel);
185-
mProbers[92] = new nsSingleByteCharSetProber(&Mac_CentraleuropeSloveneModel);
185+
mProbers[92] = new nsSingleByteCharSetProber(&MaccentraleuropeSloveneModel);
186186
mProbers[93] = new nsSingleByteCharSetProber(&Ibm852SloveneModel);
187187

188188
mProbers[94] = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel);

src/nsSBCharSetProber.h

+5-5
Original file line numberDiff line numberDiff line change
@@ -190,19 +190,19 @@ extern const SequenceModel Iso_8859_3MalteseModel;
190190
extern const SequenceModel Windows_1250CzechModel;
191191
extern const SequenceModel Iso_8859_2CzechModel;
192192
extern const SequenceModel Ibm852CzechModel;
193-
extern const SequenceModel Mac_CentraleuropeCzechModel;
193+
extern const SequenceModel MaccentraleuropeCzechModel;
194194

195195
extern const SequenceModel Windows_1250SlovakModel;
196196
extern const SequenceModel Iso_8859_2SlovakModel;
197197
extern const SequenceModel Ibm852SlovakModel;
198-
extern const SequenceModel Mac_CentraleuropeSlovakModel;
198+
extern const SequenceModel MaccentraleuropeSlovakModel;
199199

200200
extern const SequenceModel Windows_1250PolishModel;
201201
extern const SequenceModel Iso_8859_2PolishModel;
202202
extern const SequenceModel Iso_8859_13PolishModel;
203203
extern const SequenceModel Iso_8859_16PolishModel;
204204
extern const SequenceModel Ibm852PolishModel;
205-
extern const SequenceModel Mac_CentraleuropePolishModel;
205+
extern const SequenceModel MaccentraleuropePolishModel;
206206

207207
extern const SequenceModel Iso_8859_1FinnishModel;
208208
extern const SequenceModel Iso_8859_4FinnishModel;
@@ -222,7 +222,7 @@ extern const SequenceModel Iso_8859_2CroatianModel;
222222
extern const SequenceModel Iso_8859_13CroatianModel;
223223
extern const SequenceModel Iso_8859_16CroatianModel;
224224
extern const SequenceModel Ibm852CroatianModel;
225-
extern const SequenceModel Mac_CentraleuropeCroatianModel;
225+
extern const SequenceModel MaccentraleuropeCroatianModel;
226226

227227
extern const SequenceModel Windows_1252EstonianModel;
228228
extern const SequenceModel Windows_1257EstonianModel;
@@ -244,7 +244,7 @@ extern const SequenceModel Windows_1250SloveneModel;
244244
extern const SequenceModel Iso_8859_2SloveneModel;
245245
extern const SequenceModel Iso_8859_16SloveneModel;
246246
extern const SequenceModel Ibm852SloveneModel;
247-
extern const SequenceModel Mac_CentraleuropeSloveneModel;
247+
extern const SequenceModel MaccentraleuropeSloveneModel;
248248

249249
extern const SequenceModel Iso_8859_1SwedishModel;
250250
extern const SequenceModel Iso_8859_4SwedishModel;

src/nsUniversalDetector.cpp

+15-2
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
5555

5656
mStart = PR_TRUE;
5757
mDetectedCharset = nsnull;
58+
mDetectedConfidence = 0.0;
5859
mGotData = PR_FALSE;
5960
mInputState = ePureAscii;
6061
mLastChar = '\0';
@@ -83,6 +84,7 @@ nsUniversalDetector::Reset()
8384

8485
mStart = PR_TRUE;
8586
mDetectedCharset = nsnull;
87+
mDetectedConfidence = 0.0;
8688
mGotData = PR_FALSE;
8789
mInputState = ePureAscii;
8890
mLastChar = '\0';
@@ -120,11 +122,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
120122
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
121123
/* EF BB BF: UTF-8 encoded BOM. */
122124
mDetectedCharset = "UTF-8";
125+
mDetectedConfidence = 0.99;
123126
break;
124127
case '\xFE':
125128
if ('\xFF' == aBuf[1])
126129
/* FE FF: UTF-16, big endian BOM. */
127130
mDetectedCharset = "UTF-16";
131+
mDetectedConfidence = 0.99;
128132
break;
129133
case '\xFF':
130134
if ('\xFE' == aBuf[1])
@@ -135,11 +139,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
135139
{
136140
/* FF FE 00 00: UTF-32 (LE). */
137141
mDetectedCharset = "UTF-32";
142+
mDetectedConfidence = 0.99;
138143
}
139144
else
140145
{
141146
/* FF FE: UTF-16, little endian BOM. */
142147
mDetectedCharset = "UTF-16";
148+
mDetectedConfidence = 0.99;
143149
}
144150
}
145151
break;
@@ -151,6 +157,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
151157
{
152158
/* 00 00 FE FF: UTF-32 (BE). */
153159
mDetectedCharset = "UTF-32";
160+
mDetectedConfidence = 0.99;
154161
}
155162
break;
156163
}
@@ -241,16 +248,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
241248
{
242249
mDone = PR_TRUE;
243250
mDetectedCharset = mEscCharSetProber->GetCharSetName();
251+
mDetectedConfidence = mEscCharSetProber->GetConfidence();
244252
}
245253
else if (mNbspFound)
246254
{
247255
mDetectedCharset = "ISO-8859-1";
256+
mDetectedConfidence = 1.0;
248257
}
249258
else
250259
{
251260
/* ASCII with the ESC character (or the sequence "~{") is still
252261
* ASCII until proven otherwise. */
253262
mDetectedCharset = "ASCII";
263+
mDetectedConfidence = 1.0;
254264
}
255265
break;
256266
case eHighbyte:
@@ -263,6 +273,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
263273
{
264274
mDone = PR_TRUE;
265275
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
276+
mDetectedConfidence = mCharSetProbers[i]->GetConfidence();
266277
return NS_OK;
267278
}
268279
}
@@ -275,11 +286,13 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
275286
/* ISO-8859-1 is a good result candidate for ASCII + NBSP.
276287
* (though it could have been any ISO-8859 encoding). */
277288
mDetectedCharset = "ISO-8859-1";
289+
mDetectedConfidence = 1.0;
278290
}
279291
else
280292
{
281293
/* Pure ASCII */
282294
mDetectedCharset = "ASCII";
295+
mDetectedConfidence = 1.0;
283296
}
284297
break;
285298
}
@@ -300,7 +313,7 @@ void nsUniversalDetector::DataEnd()
300313
if (mDetectedCharset)
301314
{
302315
mDone = PR_TRUE;
303-
Report(mDetectedCharset);
316+
Report(mDetectedCharset, mDetectedConfidence);
304317
return;
305318
}
306319

@@ -326,7 +339,7 @@ void nsUniversalDetector::DataEnd()
326339
}
327340
//do not report anything because we are not confident of it, that's in fact a negative answer
328341
if (maxProberConfidence > MINIMUM_THRESHOLD)
329-
Report(mCharSetProbers[maxProber]->GetCharSetName());
342+
Report(mCharSetProbers[maxProber]->GetCharSetName(), mCharSetProbers[maxProber]->GetConfidence());
330343
}
331344
break;
332345
case eEscAscii:

src/nsUniversalDetector.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ class nsUniversalDetector {
6969
virtual void DataEnd(void);
7070

7171
protected:
72-
virtual void Report(const char* aCharset) = 0;
72+
virtual void Report(const char* aCharset, float aConfidence) = 0;
7373
virtual void Reset();
7474
nsInputState mInputState;
7575
PRBool mNbspFound;
@@ -79,6 +79,7 @@ class nsUniversalDetector {
7979
PRBool mGotData;
8080
char mLastChar;
8181
const char * mDetectedCharset;
82+
float mDetectedConfidence;
8283
PRInt32 mBestGuess;
8384
PRUint32 mLanguageFilter;
8485

src/symbols.cmake

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ set(
66
uchardet_data_end
77
uchardet_reset
88
uchardet_get_charset
9+
uchardet_get_confidence
910
)
1011

1112
set (LINK_FLAGS "")

src/uchardet.cpp

+17-3
Original file line numberDiff line numberDiff line change
@@ -44,25 +44,29 @@ class HandleUniversalDetector : public nsUniversalDetector
4444
{
4545
protected:
4646
char *m_charset;
47-
47+
float m_confidence;
4848
public:
4949
HandleUniversalDetector()
5050
: nsUniversalDetector(NS_FILTER_ALL)
5151
, m_charset(0)
5252
{
53+
m_confidence = 0.0;
5354
}
5455

5556
virtual ~HandleUniversalDetector()
5657
{
57-
if (m_charset)
58+
if (m_charset) {
5859
free(m_charset);
60+
m_confidence = 0.0;
61+
}
5962
}
6063

61-
virtual void Report(const char* charset)
64+
virtual void Report(const char* charset, float confidence)
6265
{
6366
if (m_charset)
6467
free(m_charset);
6568
m_charset = strdup(charset);
69+
m_confidence = confidence;
6670
}
6771

6872
virtual void Reset()
@@ -71,12 +75,17 @@ class HandleUniversalDetector : public nsUniversalDetector
7175
if (m_charset)
7276
free(m_charset);
7377
m_charset = strdup("");
78+
m_confidence = 0.0;
7479
}
7580

7681
const char* GetCharset() const
7782
{
7883
return m_charset? m_charset : "";
7984
}
85+
86+
float GetConfidence() {
87+
return m_confidence;
88+
}
8089
};
8190

8291
uchardet_t uchardet_new(void)
@@ -109,3 +118,8 @@ const char* uchardet_get_charset(uchardet_t ud)
109118
{
110119
return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCharset();
111120
}
121+
122+
float uchardet_get_confidence(uchardet_t ud)
123+
{
124+
return reinterpret_cast<HandleUniversalDetector*>(ud)->GetConfidence();
125+
}

src/uchardet.h

+2
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ void uchardet_reset(uchardet_t ud);
9393
*/
9494
const char * uchardet_get_charset(uchardet_t ud);
9595

96+
float uchardet_get_confidence(uchardet_t ud);
97+
9698
#ifdef __cplusplus
9799
}
98100
#endif

0 commit comments

Comments
 (0)