@inproceedings{tang-etal-2021-ji,
title = "基于预训练语言模型的繁体古文自动句读研究(Automatic Traditional {A}ncient {C}hinese Texts Segmentation and Punctuation Based on Pre-training Language Model)",
author = "Tang, Xuemei and
Su, Qi and
Wang, Jun and
Chen, Yuhang and
Yang, Hao",
editor = "Li, Sheng and
Sun, Maosong and
Liu, Yang and
Wu, Hua and
Liu, Kang and
Che, Wanxiang and
He, Shizhu and
Rao, Gaoqi",
booktitle = "Proceedings of the 20th Chinese National Conference on Computational Linguistics",
month = aug,
year = "2021",
address = "Huhhot, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2021.ccl-1.61",
pages = "678--688",
abstract = "未经整理的古代典籍不含任何标点,不符合当代人的阅读习惯,古籍断句标点之后有助于阅读、研究和出版。本文提出了一种基于预训练语言模型的繁体古文自动句读框架。本文整理了约10亿字的繁体古文语料,对于训练语言模型进行增量训练,在此基础上上实现古文自动句读和标点。实验表明经过大规模繁体古文语料增量训练后的语言模型具备更好的古文语义表示能力,能够有助提升繁体古文自动句读和自动标点的效果。融合了增量训练模型之后,古文断句F1值达到95.03{\%},古文标点F1值达到了80.18{\%},分别比使用未增量训练的语言模型提升1.83{\%}和2.21{\%}。为解决现有篇章级句读方案效率低的问题,本文改进了前人的串行滑动窗口方式,在一定程度上提高了句读效率,并提出一种新的并行滑动窗口方式,能够高效准确地进行长文本自动句读。",
language = "Chinese",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tang-etal-2021-ji">
<titleInfo>
<title>基于预训练语言模型的繁体古文自动句读研究(Automatic Traditional Ancient Chinese Texts Segmentation and Punctuation Based on Pre-training Language Model)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xuemei</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qi</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuhang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">Chinese</languageTerm>
<languageTerm type="code" authority="iso639-2b">chi</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th Chinese National Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sheng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maosong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hua</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shizhu</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gaoqi</namePart>
<namePart type="family">Rao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Huhhot, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>未经整理的古代典籍不含任何标点,不符合当代人的阅读习惯,古籍断句标点之后有助于阅读、研究和出版。本文提出了一种基于预训练语言模型的繁体古文自动句读框架。本文整理了约10亿字的繁体古文语料,对于训练语言模型进行增量训练,在此基础上上实现古文自动句读和标点。实验表明经过大规模繁体古文语料增量训练后的语言模型具备更好的古文语义表示能力,能够有助提升繁体古文自动句读和自动标点的效果。融合了增量训练模型之后,古文断句F1值达到95.03%,古文标点F1值达到了80.18%,分别比使用未增量训练的语言模型提升1.83%和2.21%。为解决现有篇章级句读方案效率低的问题,本文改进了前人的串行滑动窗口方式,在一定程度上提高了句读效率,并提出一种新的并行滑动窗口方式,能够高效准确地进行长文本自动句读。</abstract>
<identifier type="citekey">tang-etal-2021-ji</identifier>
<location>
<url>https://aclanthology.org/2021.ccl-1.61</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>678</start>
<end>688</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T 基于预训练语言模型的繁体古文自动句读研究(Automatic Traditional Ancient Chinese Texts Segmentation and Punctuation Based on Pre-training Language Model)
%A Tang, Xuemei
%A Su, Qi
%A Wang, Jun
%A Chen, Yuhang
%A Yang, Hao
%Y Li, Sheng
%Y Sun, Maosong
%Y Liu, Yang
%Y Wu, Hua
%Y Liu, Kang
%Y Che, Wanxiang
%Y He, Shizhu
%Y Rao, Gaoqi
%S Proceedings of the 20th Chinese National Conference on Computational Linguistics
%D 2021
%8 August
%I Chinese Information Processing Society of China
%C Huhhot, China
%G Chinese
%F tang-etal-2021-ji
%X 未经整理的古代典籍不含任何标点,不符合当代人的阅读习惯,古籍断句标点之后有助于阅读、研究和出版。本文提出了一种基于预训练语言模型的繁体古文自动句读框架。本文整理了约10亿字的繁体古文语料,对于训练语言模型进行增量训练,在此基础上上实现古文自动句读和标点。实验表明经过大规模繁体古文语料增量训练后的语言模型具备更好的古文语义表示能力,能够有助提升繁体古文自动句读和自动标点的效果。融合了增量训练模型之后,古文断句F1值达到95.03%,古文标点F1值达到了80.18%,分别比使用未增量训练的语言模型提升1.83%和2.21%。为解决现有篇章级句读方案效率低的问题,本文改进了前人的串行滑动窗口方式,在一定程度上提高了句读效率,并提出一种新的并行滑动窗口方式,能够高效准确地进行长文本自动句读。
%U https://aclanthology.org/2021.ccl-1.61
%P 678-688
Markdown (Informal)
[基于预训练语言模型的繁体古文自动句读研究(Automatic Traditional Ancient Chinese Texts Segmentation and Punctuation Based on Pre-training Language Model)](https://aclanthology.org/2021.ccl-1.61) (Tang et al., CCL 2021)
ACL