
Commit 0636dc6

Revise documents for Ancient Chinese models
1 parent f717be1 commit 0636dc6

6 files changed (+35 -21 lines)


docs/references.bib (+11 -14)
@@ -1,13 +1,23 @@
 %% This BibTeX bibliography file was created using BibDesk.
 %% https://bibdesk.sourceforge.io/
 
-%% Created for hankcs at 2025-01-11 17:43:12 -0800
+%% Created for hankcs at 2025-01-12 16:22:17 -0800
 
 
 %% Saved with string encoding Unicode (UTF-8)
 
 
 
+@inproceedings{yasuoka2019universal,
+    author = {Yasuoka, Koichi},
+    booktitle = {DADH2019: 10th International Conference of Digital Archives and Digital Humanities},
+    date-added = {2025-01-12 16:22:09 -0800},
+    date-modified = {2025-01-12 16:22:09 -0800},
+    organization = {Digital Archives and Digital Humanities},
+    pages = {20--28},
+    title = {Universal dependencies treebank of the four books in Classical Chinese},
+    year = {2019}}
+
 @inproceedings{li-etal-2022-first,
     abstract = {This paper presents the results of the First Ancient Chinese Word Segmentation and POS Tagging Bakeoff (EvaHan), which was held at the Second Workshop on Language Technologies for Historical and Ancient Languages (LT4HALA) 2022, in the context of the 13th Edition of the Language Resources and Evaluation Conference (LREC 2022). We give the motivation for having an international shared contest, as well as the data and tracks. The contest is consisted of two modalities, closed and open. In the closed modality, the participants are only allowed to use the training data, obtained the highest F1 score of 96.03{\%} and 92.05{\%} in word segmentation and POS tagging. In the open modality, the participants can use whatever resource they have, with the highest F1 score of 96.34{\%} and 92.56{\%} in word segmentation and POS tagging. The scores on the blind test dataset decrease around 3 points, which shows that the out-of-vocabulary words still are the bottleneck for lexical analyzers.},
     address = {Marseille, France},
@@ -24,19 +34,6 @@ @inproceedings{li-etal-2022-first
     year = {2022},
     bdsk-url-1 = {https://aclanthology.org/2022.lt4hala-1.19/}}
 
-@inproceedings{YASK:2019,
-    abstract = {Classical Chinese is an isolating language without notational inflection, and its texts are continuous strings of Chinese characters without spaces or punctuations between words or sentences. In order to apply Universal Dependencies for classical Chinese, we need several ``not-universal'' treatments and enhancements. In this paper such treatments and enhancements are revealed.},
-    author = {YASUOKA, Koichi},
-    date-added = {2025-01-11 17:39:18 -0800},
-    date-modified = {2025-01-11 17:39:18 -0800},
-    journal = {DADH2019: 10th International Conference of Digital Archives and Digital Humanities},
-    month = {12},
-    publisher = {Digital Archives and Digital Humanities},
-    title = {Universal Dependencies Treebank of the Four Books in Classical Chinese},
-    url = {http://hdl.handle.net/2433/245217},
-    year = {2019},
-    bdsk-url-1 = {http://hdl.handle.net/2433/245217}}
-
 @inproceedings{wang2022uncertainty,
     author = {Wang, Pengyu and Ren, Zhichen},
     booktitle = {Proceedings of the Second Workshop on Language Technologies for Historical and Ancient Languages},
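
The net effect of this hunk: the `YASK:2019` entry is renamed to `yasuoka2019universal` and normalized from a journal-style record (journal, month, publisher, url fields) to a proceedings-style one (booktitle, organization, pages), matching the key already cited in hanlp/pretrained/mtl.py; the stale `:cite:` role in hanlp/pretrained/tok.py is switched to the new key below.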

hanlp/pretrained/mtl.py (+5 -3)
@@ -19,9 +19,11 @@
 CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip'
 "ERNIE (:cite:`xiao-etal-2021-ernie`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on close-source Chinese corpus."
 KYOTO_EVAHAN_TOK_LEM_POS_UDEP_LZH = HANLP_URL + 'mtl/kyoto_evahan_tok_lem_pos_udep_bert-ancient-chinese_lr_1_aug_dict_20250112_154422.zip'
-'Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese ' \
-'Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and EvaHan corpus (:cite:`li-etal-2022-first`). ' \
-'Performance: {tok/fine P: 98.91% R: 99.11% F1: 99.01%}{tok/coarse P: 94.71% R: 92.51% F1: 93.60%}{lem Accuracy:98.86%}{pos/upos Accuracy:94.91%}{pos/xpos Accuracy:93.79%}{pos/pku Accuracy:91.91%}{dep UAS: 88.70% LAS: 83.89%}'
+'''
+Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese
+Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and EvaHan corpus (:cite:`li-etal-2022-first`).
+Performance: ``{tok/fine P: 98.91% R: 99.11% F1: 99.01%}{tok/coarse P: 94.71% R: 92.51% F1: 93.60%}{lem Accuracy:98.86%}{pos/upos Accuracy:94.91%}{pos/xpos Accuracy:93.79%}{pos/pku Accuracy:91.91%}{dep UAS: 88.70% LAS: 83.89%}``
+'''
 
 UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6 = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L6_no_space_20220731_161526.zip'
 '''
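
For readers unfamiliar with this file's convention: the bare string literal after each URL constant is a no-op at runtime, but Sphinx autodoc reads it from the source as the attribute's docstring, so replacing backslash-continued string concatenation with a triple-quoted literal (plus ``literal`` markup around the metrics) only changes how the docs render, not behavior. A minimal sketch of the pattern, with illustrative names not taken from HanLP:

# Attribute-docstring pattern: the string expression after the assignment is
# discarded by the interpreter, but Sphinx autodoc attaches it to MY_MODEL.
HANLP_URL = 'https://example.com/hanlp/'  # hypothetical base URL

MY_MODEL = HANLP_URL + 'mtl/my_model.zip'
'''
Docstring for ``MY_MODEL``: rendered in the API docs, invisible at runtime.
'''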

hanlp/pretrained/tok.py (+6 -4)
@@ -35,10 +35,12 @@
 'which is much higher than that of MTL model '
 
 KYOTO_EVAHAN_TOK_LZH = 'http://download.hanlp.com/tok/extra/kyoto_evahan_tok_bert-ancient-chinese_tau_0.5_20250111_234146.zip'
-'Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese ' \
-'Universal Dependencies Treebank (:cite:`YASK:2019`) and EvaHan corpus (:cite:`li-etal-2022-first`). ' \
-'Performance: {UD P: 98.85% R: 99.00% F1: 98.92%} on UD Kyoto, ' \
-'and {TestA P: 95.62% R: 96.56% F1: 96.09%} {TestB P: 94.93% R: 93.05% F1: 93.98%} on EvaHan.'
+'''
+Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese
+Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and EvaHan corpus (:cite:`li-etal-2022-first`).
+Performance: ``{UD P: 98.85% R: 99.00% F1: 98.92%}`` on UD Kyoto,
+and ``{TestA P: 95.62% R: 96.56% F1: 96.09%} {TestB P: 94.93% R: 93.05% F1: 93.98%}`` on EvaHan.
+'''
 
 UD_TOK_MMINILMV2L6 = HANLP_URL + 'tok/ud_tok_mMiniLMv2L6_no_space_mul_20220619_091824.zip'
 '''

plugins/hanlp_demo/hanlp_demo/lzh/__init__.py

Whitespace-only changes.

@@ -0,0 +1,8 @@
+import hanlp
+
+HanLP = hanlp.load(hanlp.pretrained.mtl.KYOTO_EVAHAN_TOK_LEM_POS_UDEP_LZH)
+doc = HanLP(['晋太元中,武陵人捕鱼为业。', '司馬牛問君子'])
+print(doc)
+doc.pretty_print()
+
+HanLP('司馬牛問君子', skip_tasks='tok/fine').pretty_print()
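
A hedged note on the demo's last line, based on the general HanLP multi-task API rather than anything new in this commit: `skip_tasks='tok/fine'` drops the fine-grained tokenizer from the run, leaving the coarse-grained one (scored as `tok/coarse` in the docstring above) to feed the downstream tasks. The returned `Document` is dict-like, so the surviving tokenization can be read back directly:

# Sketch under the assumption that the returned Document keeps a 'tok/coarse'
# key when 'tok/fine' is skipped (consistent with the model's docstring).
doc = HanLP('司馬牛問君子', skip_tasks='tok/fine')
print(doc['tok/coarse'])  # coarse-grained tokens used by the remaining tasks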

@@ -0,0 +1,5 @@
+import hanlp
+
+HanLP = hanlp.load(hanlp.pretrained.tok.KYOTO_EVAHAN_TOK_LZH)
+doc = HanLP('司馬牛問君子')
+print(doc)
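
Unlike the MTL demo, this is a single-task tokenizer. Assuming the standard HanLP tok component API (not shown in this commit), calling it on one string returns a plain list of tokens rather than a `Document`, and a list of strings is decoded as a batch:

import hanlp

# Hedged sketch: str in -> List[str] out; List[str] in -> List[List[str]] out.
tok = hanlp.load(hanlp.pretrained.tok.KYOTO_EVAHAN_TOK_LZH)
print(tok('司馬牛問君子'))
print(tok(['晋太元中,武陵人捕鱼为业。', '司馬牛問君子']))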
