https://github.com/bab2min/kiwipiepy
Python3 API documentation: https://bab2min.github.io/kiwipiepy
Starting with Kiwi 0.5, a Python 3 API is provided. You can build this project yourself and import the module in Python, or install the prebuilt kiwipiepy module using pip.
$ pip install --upgrade pip
$ pip install kiwipiepy
or
$ pip3 install --upgrade pip
$ pip3 install kiwipiepy
Currently, the kiwipiepy package supports Windows (Vista or later), Linux, and macOS 10.12 or later.
In environments where no binary distribution is provided (such as macOS on Apple M1), cmake 3.12 or later is required to compile the source code during installation.
$ pip install cmake
$ pip install --upgrade pip
$ pip install kiwipiepy
Since Kiwi 0.6.3, an interactive interface is provided so you can test the analyzer immediately after installation. Once installation via pip is complete, run it as follows to try the morphological analyzer.
$ python -m kiwipiepy
or
$ python3 -m kiwipiepy
When the interactive interface starts, enter any sentence to see its morphological analysis.
>> 안녕?
[Token(form='안녕', tag='IC', start=0, len=2), Token(form='?', tag='SF', start=2, len=1)]
To exit the interface, press Ctrl + C.
The part-of-speech tags used in Kiwi are based on the Sejong corpus tagset, with some tags modified and improved. See the tag table below for the detailed tag system.
>>> from kiwipiepy import Kiwi
>>> kiwi = Kiwi()
# You can get morphological analysis results with the tokenize method.
>>> kiwi.tokenize("안녕하세요 형태소 분석기 키위입니다.")
[Token(form='안녕', tag='NNG', start=0, len=2),
 Token(form='하', tag='XSA', start=2, len=1),
 Token(form='시', tag='EP', start=3, len=1),
 Token(form='어요', tag='EC', start=3, len=2),
 Token(form='형태소', tag='NNG', start=6, len=3),
 Token(form='분석', tag='NNG', start=10, len=2),
 Token(form='기', tag='NNG', start=12, len=1),
 Token(form='키위', tag='NNG', start=14, len=2),
 Token(form='이', tag='VCP', start=16, len=1),
 Token(form='ᆸ니다', tag='EF', start=17, len=2),
 Token(form='.', tag='SF', start=19, len=1)]
# The normalize_coda option prevents the analysis from breaking
# when an extra final consonant is tacked onto a word.
>>> kiwi.tokenize("ㅋㅋㅋ 이런 것도 분석이 될까욬ㅋㅋ?", normalize_coda=True)
[Token(form='ㅋㅋㅋ', tag='SW', start=0, len=3),
 Token(form='이런', tag='MM', start=4, len=2),
 Token(form='것', tag='NNB', start=7, len=1),
 Token(form='도', tag='JX', start=8, len=1),
 Token(form='분석', tag='NNG', start=10, len=2),
 Token(form='이', tag='JKS', start=12, len=1),
 Token(form='되', tag='VV', start=14, len=1),
 Token(form='ᆯ까요', tag='EC', start=15, len=2),
 Token(form='ㅋㅋㅋ', tag='SW', start=17, len=2),
 Token(form='?', tag='SF', start=19, len=1)]
# A Stopwords class is also provided for managing stopwords.
>>> from kiwipiepy.utils import Stopwords
>>> stopwords = Stopwords()
>>> kiwi.tokenize("분석 결과에서 불용어만 제외하고 출력할 수도 있다.", stopwords=stopwords)
[Token(form='분석', tag='NNG', start=0, len=2),
 Token(form='결과', tag='NNG', start=3, len=2),
 Token(form='불', tag='XPN', start=8, len=1),
 Token(form='용어', tag='NNG', start=9, len=2),
 Token(form='제외', tag='NNG', start=13, len=2),
 Token(form='출력', tag='NNG', start=18, len=2)]
# You can add or remove words from the stopword list with the add and remove methods.
>>> stopwords.add(('결과', 'NNG'))
>>> kiwi.tokenize("분석 결과에서 불용어만 제외하고 출력할 수도 있다.", stopwords=stopwords)
[Token(form='분석', tag='NNG', start=0, len=2),
 Token(form='불', tag='XPN', start=8, len=1),
 Token(form='용어', tag='NNG', start=9, len=2),
 Token(form='제외', tag='NNG', start=13, len=2),
 Token(form='출력', tag='NNG', start=18, len=2)]
>>> tokens = kiwi.tokenize("각 토큰은 여러 정보를 담고 있습니다.")
>>> tokens[0]
Token(form='각', tag='MM', start=0, len=1)
>>> tokens[0].form  # the surface form of the morpheme
'각'
>>> tokens[0].tag  # the part-of-speech tag of the morpheme
'MM'
>>> tokens[0].start  # start and end positions (in characters)
0
>>> tokens[0].end
1
>>> tokens[0].word_position  # index of the word (eojeol) within the current sentence
0
>>> tokens[0].sent_position  # index of the sentence the morpheme belongs to
0
>>> tokens[0].line_number  # index of the line the morpheme belongs to
0
# Sentence splitting is also supported.
>>> kiwi.split_into_sents("여러 문장으로 구성된 텍스트네 이걸 분리해줘")
[Sentence(text='여러 문장으로 구성된 텍스트네', start=0, end=16, tokens=None),
 Sentence(text='이걸 분리해줘', start=17, end=24, tokens=None)]
# Sentence splitting and morphological analysis can also be performed together.
>>> kiwi.split_into_sents("여러 문장으로 구성된 텍스트네 이걸 분리해줘", return_tokens=True)
[Sentence(text='여러 문장으로 구성된 텍스트네', start=0, end=16, tokens=[
  Token(form='여러', tag='MM', start=0, len=2),
  Token(form='문장', tag='NNG', start=3, len=2),
  Token(form='으로', tag='JKB', start=5, len=2),
  Token(form='구성', tag='NNG', start=8, len=2),
  Token(form='되', tag='XSV', start=10, len=1),
  Token(form='ᆫ', tag='ETM', start=11, len=0),
  Token(form='텍스트', tag='NNG', start=12, len=3),
  Token(form='이', tag='VCP', start=15, len=1),
  Token(form='네', tag='EF', start=15, len=1)]),
 Sentence(text='이걸 분리해줘', start=17, end=24, tokens=[
  Token(form='이거', tag='NP', start=17, len=2),
  Token(form='ᆯ', tag='JKO', start=19, len=0),
  Token(form='분리', tag='NNG', start=20, len=2),
  Token(form='하', tag='XSV', start=22, len=1),
  Token(form='어', tag='EC', start=22, len=1),
  Token(form='주', tag='VX', start=23, len=1),
  Token(form='어', tag='EF', start=23, len=1)])]
# You can add new words to the dictionary.
>>> kiwi.add_user_word("김갑갑", "NNP")
True
>>> kiwi.tokenize("김갑갑이 누구야")
[Token(form='김갑갑', tag='NNP', start=0, len=3),
 Token(form='이', tag='JKS', start=3, len=1),
 Token(form='누구', tag='NP', start=5, len=2),
 Token(form='야', tag='JKV', start=7, len=1)]
# New in v0.11.0
# Since version 0.11.0, when a verb/adjective is added to the user dictionary,
# its conjugated forms are registered along with it.
# Analyzing the verb `팅기다`, which is not in the dictionary, gives a nonsensical result.
>>> kiwi.tokenize('팅겼다')
[Token(form='팅기', tag='NNG', start=0, len=2),
 Token(form='하', tag='XSA', start=2, len=0),
 Token(form='다', tag='EF', start=2, len=1)]
# Registering the morpheme `팅기/VV` automatically adds all of its conjugated forms,
# so forms such as `팅겼다` and `팅길` can all be analyzed.
>>> kiwi.add_user_word('팅기', 'VV')
True
>>> kiwi.tokenize('팅겼다')
[Token(form='팅기', tag='VV', start=0, len=2),
 Token(form='었', tag='EP', start=1, len=1),
 Token(form='다', tag='EF', start=2, len=1)]
# You can also add transformed morphemes in bulk to improve accuracy on a target text.
>>> kiwi.tokenize("안녕하세영, 제 이름은 이세영이에영. 님도 학생이세영?")
[Token(form='안녕', tag='NNG', start=0, len=2),
 Token(form='하', tag='XSA', start=2, len=1),
 Token(form='시', tag='EP', start=3, len=1),
 Token(form='어', tag='EC', start=3, len=1),
 Token(form='영', tag='MAG', start=4, len=1),  # misanalysis
 Token(form=',', tag='SP', start=5, len=1),
 Token(form='저', tag='NP', start=7, len=1),
 Token(form='의', tag='JKG', start=7, len=1),
 Token(form='이름', tag='NNG', start=9, len=2),
 Token(form='은', tag='JX', start=11, len=1),
 Token(form='이세영', tag='NNP', start=13, len=3),
 Token(form='이', tag='JKS', start=16, len=1),
 Token(form='에', tag='IC', start=17, len=1),
 Token(form='영', tag='NR', start=18, len=1),
 Token(form='.', tag='SF', start=19, len=1),
 Token(form='님', tag='NNG', start=21, len=1),
 Token(form='도', tag='JX', start=22, len=1),
 Token(form='학생', tag='NNG', start=24, len=2),
 Token(form='이세영', tag='NNP', start=26, len=3),  # misanalysis
 Token(form='?', tag='SF', start=29, len=1)]
# Replace final endings (EF) ending in '요' with '영' and insert them in bulk.
# A penalty of -3 is given to the transformed endings so they do not take
# priority over the original morphemes.
# The newly inserted morphemes are returned.
>>> kiwi.add_re_rule('EF', '요$', '영', -3)
['어영', '에영', '지영', '잖아영', '거든영', 'ᆯ까영', '네영', '구영', '나영', '군영', ..., '으니깐영']
# Re-analyzing the same sentence shows that the result has improved.
>>> kiwi.tokenize("안녕하세영, 제 이름은 이세영이에영. 님도 학생이세영?")
[Token(form='안녕', tag='NNG', start=0, len=2),
 Token(form='하', tag='XSA', start=2, len=1),
 Token(form='시', tag='EP', start=3, len=1),
 Token(form='어영', tag='EF', start=3, len=2),  # improved result
 Token(form=',', tag='SP', start=5, len=1),
 Token(form='저', tag='NP', start=7, len=1),
 Token(form='의', tag='JKG', start=7, len=1),
 Token(form='이름', tag='NNG', start=9, len=2),
 Token(form='은', tag='JX', start=11, len=1),
 Token(form='이세영', tag='NNP', start=13, len=3),
 Token(form='이', tag='VCP', start=16, len=1),
 Token(form='에영', tag='EF', start=17, len=2),
 Token(form='.', tag='SF', start=19, len=1),
 Token(form='님', tag='NNG', start=21, len=1),
 Token(form='도', tag='JX', start=22, len=1),
 Token(form='학생', tag='NNG', start=24, len=2),
 Token(form='이', tag='VCP', start=26, len=1),
 Token(form='시', tag='EP', start=27, len=1),
 Token(form='어영', tag='EF', start=27, len=2),  # improved result
 Token(form='?', tag='SF', start=29, len=1)]
# You can also register pre-analyzed forms to correct strings that are not analyzed as desired.
# `사겼대` in the following sentence contains a typo, so it is not analyzed correctly.
>>> kiwi.tokenize('걔네 둘이 사겼대')
[Token(form='걔', tag='NP', start=0, len=1),
 Token(form='네', tag='XSN', start=1, len=1),
 Token(form='둘', tag='NR', start=3, len=1),
 Token(form='이', tag='JKS', start=4, len=1),
 Token(form='사', tag='NR', start=6, len=1),
 Token(form='기', tag='VV', start=7, len=1),
 Token(form='었', tag='EP', start=7, len=1),
 Token(form='대', tag='EF', start=8, len=1)]
# This can be corrected with the add_pre_analyzed_word method, as follows.
>>> kiwi.add_pre_analyzed_word('사겼대', ['사귀/VV', '었/EP', '대/EF'], -3)
True
# Re-analyzing the same sentence shows that the result has changed.
>>> kiwi.tokenize('걔네 둘이 사겼대')
[Token(form='걔', tag='NP', start=0, len=1),
 Token(form='네', tag='XSN', start=1, len=1),
 Token(form='둘', tag='NR', start=3, len=1),
 Token(form='이', tag='JKS', start=4, len=1),
 Token(form='사귀', tag='VV', start=6, len=3),
 Token(form='었', tag='EP', start=6, len=3),
 Token(form='대', tag='EF', start=6, len=3)]
# However, the start position of 사귀/VV, 었/EP, and 대/EF is wrongly set to 6
# and the length to 3 for all three of them.
# To fix this, the position of each morpheme must also be passed to add_pre_analyzed_word.
>>> kiwi = Kiwi()
>>> kiwi.add_pre_analyzed_word('사겼대', [('사귀', 'VV', 0, 2), ('었', 'EP', 1, 2), ('대', 'EF', 2, 3)], -3)
True
>>> kiwi.tokenize('걔네 둘이 사겼대')
[Token(form='걔', tag='NP', start=0, len=1),
 Token(form='네', tag='XSN', start=1, len=1),
 Token(form='둘', tag='NR', start=3, len=1),
 Token(form='이', tag='JKS', start=4, len=1),
 Token(form='사귀', tag='VV', start=6, len=2),
 Token(form='었', tag='EP', start=7, len=1),
 Token(form='대', tag='EF', start=8, len=1)]
# New in v0.12.0
# Since version 0.12.0, morphemes can be joined back into a sentence.
>>> kiwi.join([('길', 'NNG'), ('을', 'JKO'), ('묻', 'VV'), ('어요', 'EF')])
'길을 물어요'
>>> kiwi.join([('흙', 'NNG'), ('이', 'JKS'), ('묻', 'VV'), ('어요', 'EF')])
'흙이 묻어요'
# New in v0.13.0
# A more powerful language model, SkipBigram (sbg), has been added.
# Unlike the existing knlm, it can take distant morphemes into account.
>>> kiwi = Kiwi(model_type='knlm')
>>> kiwi.tokenize('이 번호로 전화를 이따가 꼭 반드시 걸어.')
[Token(form='이', tag='MM', start=0, len=1),
 Token(form='번호', tag='NNG', start=2, len=2),
 Token(form='로', tag='JKB', start=4, len=1),
 Token(form='전화', tag='NNG', start=6, len=2),
 Token(form='를', tag='JKO', start=8, len=1),
 Token(form='이따가', tag='MAG', start=10, len=3),
 Token(form='꼭', tag='MAG', start=14, len=1),
 Token(form='반드시', tag='MAG', start=16, len=3),
 Token(form='걷', tag='VV-I', start=20, len=1),  # wrongly chose 걷다 between 걷다/걸다
 Token(form='어', tag='EF', start=21, len=1),
 Token(form='.', tag='SF', start=22, len=1)]
>>> kiwi = Kiwi(model_type='sbg')
>>> kiwi.tokenize('이 번호로 전화를 이따가 꼭 반드시 걸어.')
[Token(form='이', tag='MM', start=0, len=1),
 Token(form='번호', tag='NNG', start=2, len=2),
 Token(form='로', tag='JKB', start=4, len=1),
 Token(form='전화', tag='NNG', start=6, len=2),
 Token(form='를', tag='JKO', start=8, len=1),
 Token(form='이따가', tag='MAG', start=10, len=3),
 Token(form='꼭', tag='MAG', start=14, len=1),
 Token(form='반드시', tag='MAG', start=16, len=3),
 Token(form='걸', tag='VV', start=20, len=1),  # correctly chose 걸다 between 걷다/걸다
 Token(form='어', tag='EC', start=21, len=1),
 Token(form='.', tag='SF', start=22, len=1)]
# A typo correction feature has also been added.
# It corrects simple typos, preventing small mistakes from derailing the whole analysis.
>>> kiwi = Kiwi(model_type='sbg', typos='basic')
>>> kiwi.tokenize('외않됀대?')  # with typo correction enabled, loading takes an extra 5-10 seconds
[Token(form='왜', tag='MAG', start=0, len=1),
 Token(form='안', tag='MAG', start=1, len=1),
 Token(form='되', tag='VV', start=2, len=1),
 Token(form='ᆫ대', tag='EF', start=2, len=2),
 Token(form='?', tag='SF', start=4, len=1)]
>>> kiwi.tokenize('장례희망이 뭐냐는 선섕님의 질문에 벙어리가 됫따')
[Token(form='장래', tag='NNG', start=0, len=2),
 Token(form='희망', tag='NNG', start=2, len=2),
 Token(form='이', tag='JKS', start=4, len=1),
 Token(form='뭐', tag='NP', start=6, len=1),
 Token(form='이', tag='VCP', start=7, len=0),
 Token(form='냐는', tag='ETM', start=7, len=2),
 Token(form='선생', tag='NNG', start=10, len=2),
 Token(form='님', tag='XSN', start=12, len=1),
 Token(form='의', tag='JKG', start=13, len=1),
 Token(form='질문', tag='NNG', start=15, len=2),
 Token(form='에', tag='JKB', start=17, len=1),
 Token(form='벙어리', tag='NNG', start=19, len=3),
 Token(form='가', tag='JKC', start=22, len=1),
 Token(form='되', tag='VV', start=24, len=1),
 Token(form='엇', tag='EP', start=24, len=1),
 Token(form='다', tag='EF', start=25, len=1)]
# Version 0.17.1 added typo correction for liaison (연철) errors,
# i.e., a final consonant wrongly carried over onto a following ㅇ/ㅎ initial.
>>> kiwi = Kiwi(typos='continual')
>>> kiwi.tokenize('오늘사무시레서')
[Token(form='오늘', tag='NNG', start=0, len=2),
 Token(form='사무실', tag='NNG', start=2, len=4),
 Token(form='에서', tag='JKB', start=5, len=2)]
>>> kiwi.tokenize('지가캤어요')
[Token(form='지각', tag='NNG', start=0, len=3),
 Token(form='하', tag='XSV', start=2, len=1),
 Token(form='었', tag='EP', start=2, len=1),
 Token(form='어요', tag='EF', start=3, len=2)]
# Basic typo correction and liaison correction can be used together.
>>> kiwi = Kiwi(typos='basic_with_continual')
>>> kiwi.tokenize('웨 지가캤니?')
[Token(form='왜', tag='MAG', start=0, len=1),
 Token(form='지각', tag='NNG', start=2, len=3),
 Token(form='하', tag='XSV', start=4, len=1),
 Token(form='었', tag='EP', start=4, len=1),
 Token(form='니', tag='EC', start=5, len=1),
 Token(form='?', tag='SF', start=6, len=1)]
# Version 0.19.0 added correction of vowel-lengthening errors,
# where one syllable is stretched out over several syllables.
>>> kiwi = Kiwi(typos='lengthening')
>>> kiwi.tokenize('지이인짜 귀여워요')
[Token(form='진짜', tag='MAG', start=0, len=4),
 Token(form='귀엽', tag='VA-I', start=5, len=3),
 Token(form='어요', tag='EF', start=7, len=2)]
# Basic, liaison, and lengthening corrections can all be combined.
>>> kiwi = Kiwi(typos='basic_with_continual_and_lengthening')
>>> kiwi.tokenize('지이인짜 기여워요~ 마니 좋아해')
[Token(form='진짜', tag='MAG', start=0, len=4),
 Token(form='귀엽', tag='VA-I', start=5, len=3),
 Token(form='어요', tag='EF', start=7, len=2),
 Token(form='~', tag='SO', start=9, len=1),
 Token(form='많이', tag='MAG', start=11, len=2),
 Token(form='좋아하', tag='VV', start=14, len=3),
 Token(form='어', tag='EF', start=16, len=1)]
# Since version 0.17.0, words containing spaces can be added to the user dictionary.
>>> kiwi = Kiwi()
# Register the word '대학생 선교회'.
>>> kiwi.add_user_word('대학생 선교회', 'NNP')
True
# Text identical to the registered form naturally analyzes well.
>>> kiwi.tokenize('대학생 선교회에서')
[Token(form='대학생 선교회', tag='NNP', start=0, len=7),
 Token(form='에서', tag='JKB', start=7, len=2)]
# The form without the space can also be matched.
>>> kiwi.tokenize('대학생선교회에서')
[Token(form='대학생 선교회', tag='NNP', start=0, len=6),
 Token(form='에서', tag='JKB', start=6, len=2)]
# Tab or newline characters also match.
# Consecutive whitespace characters are treated the same as a single space.
>>> kiwi.tokenize('대학생 \t \n 선교회에서')
[Token(form='대학생 선교회', tag='NNP', start=0, len=11),
 Token(form='에서', tag='JKB', start=11, len=2)]
# However, no match occurs when a space appears at a position
# that had no space in the registered form.
>>> kiwi.tokenize('대학 생선 교회에서')
[Token(form='대학', tag='NNG', start=0, len=2),
 Token(form='생선', tag='NNG', start=3, len=2),
 Token(form='교회', tag='NNG', start=6, len=2),
 Token(form='에서', tag='JKB', start=8, len=2)]
# Setting space_tolerance to 2 allows up to two mismatched spaces,
# so '대학생 선교회' now matches '대학 생선 교회' as well.
>>> kiwi.space_tolerance = 2
>>> kiwi.tokenize('대학 생선 교회에서')
[Token(form='대학생 선교회', tag='NNP', start=0, len=8),
 Token(form='에서', tag='JKB', start=8, len=2)]
# Version 0.18.0 improved support for foreign scripts and emoji.
# Tokens are split by on-screen character units.
>>> kiwi.tokenize('?☝?☝?')
[Token(form='?', tag='W_EMOJI', start=0, len=1),
 Token(form='☝?', tag='W_EMOJI', start=1, len=2),
 Token(form='☝?', tag='W_EMOJI', start=3, len=2)]
# For reference, the result in v0.17:
# [Token(form='?☝?☝?', tag='SW', start=0, len=5)]
# A script field has been added so you can check which Unicode script
# a character belongs to.
# The script value is only assigned for the SW, SH, SL, and W_EMOJI tags.
>>> tokens = kiwi.tokenize('ひらがなカタカナ')
>>> tokens
[Token(form='ひらがなカタカナ', tag='SW', start=0, len=8)]
>>> tokens[0].script
'Kana'
>>> tokens = kiwi.tokenize('résumé')
>>> tokens
[Token(form='résumé', tag='SL', start=0, len=6)]
# For reference, the result up to v0.17:
# [Token(form='r', tag='SL', start=0, len=1),
#  Token(form='é', tag='SW', start=1, len=1),
#  Token(form='sum', tag='SL', start=2, len=3),
#  Token(form='é', tag='SW', start=5, len=1)]
>>> tokens[0].script
'Latin'
>>> tokens = kiwi.tokenize('ἥρως')
>>> tokens
[Token(form='ἥρως', tag='SW', start=0, len=4)]
>>> tokens[0].script
'Greek and Coptic'
>>> tokens = kiwi.tokenize('ฉันชอบกินข้าวผัด')
>>> tokens
[Token(form='ฉันชอบกินข้าวผัด', tag='SW', start=0, len=16)]
>>> tokens[0].script
'Thai'
# Since version 0.18.1, an option is provided to output morphemes
# consisting only of a final consonant using compatibility jamo.
>>> kiwi.tokenize('예쁜데')
[Token(form='예쁘', tag='VA', start=0, len=2),
 Token(form='ᆫ데', tag='EF', start=1, len=2)]
>>> kiwi.tokenize('예쁜데', compatible_jamo=True)
[Token(form='예쁘', tag='VA', start=0, len=2),
 Token(form='ㄴ데', tag='EF', start=1, len=2)]
# The final consonant ᆫ is output as the compatibility jamo ㄴ.
# Version 0.20.0 added an option for analyzing the sai-siot (사이시옷).
# Compound nouns containing a sai-siot that are not in the dictionary
# are often analyzed incorrectly, as follows.
>>> kiwi.tokenize('시곗바늘')
[Token(form='시곗', tag='NNG', start=0, len=2),
 Token(form='바늘', tag='NNG', start=2, len=2)]
# With saisiot=True, the sai-siot is treated as its own morpheme
# and split off, as follows.
>>> kiwi.tokenize('시곗바늘', saisiot=True)
[Token(form='시계', tag='NNG', start=0, len=2),
 Token(form='ᆺ', tag='Z_SIOT', start=1, len=1),
 Token(form='바늘', tag='NNG', start=2, len=2)]
# With saisiot=False, a compound noun containing a sai-siot
# is output merged into a single morpheme.
>>> kiwi.tokenize('시곗바늘', saisiot=False)
[Token(form='시곗바늘', tag='NNG', start=0, len=4)]
If the kiwipiepy package was installed successfully, importing the package and creating a Kiwi object should raise no errors.
from kiwipiepy import Kiwi, Match
kiwi = Kiwi()
The Kiwi constructor is as follows:
Kiwi(num_workers=0, model_path=None, load_default_dict=True, integrate_allomorph=True, model_type='knlm', typos=None, typo_cost_threshold=2.5)
num_workers: if 2 or more, word extraction and morphological analysis use multiple cores and run faster.
model_path: the path to the morphological analysis model. If omitted, the model is loaded from the kiwipiepy_model package.
load_default_dict: whether to load the additional default dictionary, which consists of Wikipedia article titles. Loading and analysis become slightly slower, but various proper nouns are recognized better. Set it to False to keep these entries out of the analysis results.
integrate_allomorph: automatically merges allomorphic endings that have the same meaning but differ by phonological environment, such as 아/어.
model_type: the language model to use for morphological analysis, one of 'knlm' or 'sbg'. 'sbg' is relatively slow but can capture relationships between distant morphemes.
typos: corrects simple typos during analysis. No correction is performed when set to None.
typo_cost_threshold: the maximum typo cost to allow when correcting typos.
A Kiwi object can perform three kinds of tasks.
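As a quick illustration of these constructor options, here is a minimal sketch; the chosen values are illustrative, not recommendations.

from kiwipiepy import Kiwi

# Use 4 worker threads, the SkipBigram language model, and basic typo correction.
# Defaults are num_workers=0, model_type='knlm', typos=None, typo_cost_threshold=2.5.
kiwi = Kiwi(num_workers=4, model_type='sbg', typos='basic', typo_cost_threshold=2.5)
kiwi.tokenize('테스트입니다.')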
This is a feature available since Kiwi 0.5. It learns the patterns of frequently appearing substrings and extracts those that appear to be words. The basic idea is based on the word extraction technique of https://github.com/lovit/soynlp, combined with a character-based noun probability so that words likely to be nouns are extracted.
Kiwi provides two methods for extracting unregistered words:
Kiwi.extract_words(texts, min_cnt, max_word_len, min_score)
Kiwi.extract_add_words(texts, min_cnt, max_word_len, min_score, pos_score)
extract_words(texts, min_cnt=10, max_word_len=10, min_score=0.25, pos_score=-3.0, lm_filter=True)
texts: the text to analyze, given as an Iterable[str]. See the example below for details.
min_cnt: the minimum number of times a word must appear in the input text to be extracted. The larger the input text, the higher this value should be.
max_word_len: the maximum length of words to extract. Setting this too high increases the time spent scanning for words, so keep it reasonable.
min_score: the minimum word score for a word to be extracted. The lower the value, the more words are likely to be extracted; the higher the value, the fewer. The default is 0.25.
pos_score: the minimum noun score for a word to be extracted. The lower the value, the more likely non-noun words are extracted. The default is -3.
lm_filter: determines whether candidates are filtered using part-of-speech and language-model scores.
# When passing a list of str as input
inputs = list(open('test.txt', encoding='utf-8'))
kiwi.extract_words(inputs, min_cnt=10, max_word_len=10, min_score=0.25)
'''
The code above stores the entire input in a list up front,
which can consume a lot of memory if test.txt is large.
To instead read only what is needed from the file (streaming),
use the following pattern.
'''
class IterableTextFile:
    def __init__(self, path):
        self.path = path
    def __iter__(self):
        yield from open(self.path, encoding='utf-8')

kiwi.extract_words(IterableTextFile('test.txt'), min_cnt=10, max_word_len=10, min_score=0.25)
extract_add_words(texts, min_cnt=10, max_word_len=10, min_score=0.25, pos_score=-3, lm_filter=True)
Extracts noun words just like extract_words, but additionally registers the extracted noun candidates in the dictionary as NNP so they can be used in morphological analysis right away. If you do not use this method, you must register the extracted unknown words yourself with the add_user_word method.
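A short sketch of extract_add_words under the same assumptions as the example above (test.txt is a placeholder corpus file):

# Extract noun candidates and register them in the dictionary as NNP in one step,
# reusing the streaming IterableTextFile helper defined above.
candidates = kiwi.extract_add_words(IterableTextFile('test.txt'), min_cnt=10, max_word_len=10, min_score=0.25, pos_score=-3.0)
# The candidates are now registered, so subsequent tokenize calls can recognize them.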
To correctly analyze words that are not registered in the base dictionary, they must be added to the user dictionary. This can be done automatically through extract_add_words, or manually. The following methods are used to manage the user dictionary.
Kiwi.add_user_word(word, tag, score, orig_word=None)
Kiwi.add_pre_analyzed_word(form, analyzed, score)
Kiwi.add_rule(tag, replacer, score)
Kiwi.add_re_rule(tag, pattern, repl, score)
Kiwi.load_user_dictionary(user_dict_path)
add_user_word(word, tag='NNP', score=0.0, orig_word=None)
Registers a new morpheme in the user dictionary.
word: the form of the morpheme to register, as a str.
tag: the part-of-speech tag of the morpheme to register. The default value is NNP.
score: the score of the morpheme to register. When the same form can be analyzed in several ways, a higher value gives this morpheme greater priority.
orig_word: if the form being added is a variant of an existing morpheme, the original morpheme can be passed through this argument; it can be omitted otherwise. If this value is given, a morpheme matching the orig_word/tag combination must exist in the current dictionary, or a ValueError is raised. When an original morpheme exists, specifying orig_word produces more accurate analysis results. Returns True if the morpheme was inserted successfully, or False if it failed because the same morpheme already exists.
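For instance, mirroring the variant entry shown in the dictionary file format later in this section (the score here is illustrative):

# Register '기위' as a variant of the existing morpheme '키위/NNG'.
# A ValueError would be raised if '키위/NNG' were not in the dictionary.
kiwi.add_user_word('기위', 'NNG', -3.0, orig_word='키위')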
add_pre_analyzed_word(form, analyzed, score=0.0)
Registers a pre-analyzed form in the user dictionary. This lets you force specific strings to be analyzed into a desired sequence of morphemes.
form: the form to register, as a str.
analyzed: the analysis result of form. This value must be an iterable of (form, tag) tuples or (form, tag, start, end) tuples. The morphemes specified by this value must already exist in the current dictionary, or a ValueError is raised.
score: the weight score of the pre-analyzed form to add. When several morpheme combinations match the form, higher-scored ones take priority. Returns True if the insertion succeeded, or False if it failed because the same form already exists.
This method makes it easy to teach the analyzer irregular analysis results. For example, the past form of the verb 사귀다 is 사귀었다, but it is often misspelled as 사겼다. You can use this method to ensure that 사겼다 is analyzed as 사귀/VV + 었/EP + 다/EF.
kiwi.add_pre_analyzed_word('사겼다', ['사귀/VV', '었/EP', '다/EF'], -3)
kiwi.add_pre_analyzed_word('사겼다', [('사귀', 'VV', 0, 2), ('었', 'EP', 1, 2), ('다', 'EF', 2, 3)], -3)
In the latter case, because the position of each morpheme within the analyzed form is specified exactly, the start, end, and len of those morphemes are reported accurately in Kiwi's analysis results.
add_rule(tag, replacer, score)
Adds morphemes transformed by a rule.
tag: the part-of-speech tag of the morphemes to transform.
replacer: the rule that transforms forms. This must be a callable that receives the original form as a str and returns the transformed form as a str. If it returns a value identical to its input, that transformation is ignored.
score: the weight score of the transformed morphemes to add. When several morpheme combinations match a form, higher-scored ones take priority. Returns the list of forms newly generated by replacer.
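A minimal sketch of the callable form, doing by hand what the add_re_rule example below does with a regular expression:

# Transform final endings (EF) ending in '요' into '염' variants.
# Returning the input unchanged means "no transformation" for that form.
def yo_to_yeom(form):
    return form[:-1] + '염' if form.endswith('요') else form

new_forms = kiwi.add_rule('EF', yo_to_yeom, -3.0)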
add_re_rule(tag, pattern, repl, score)
Plays the same role as the add_rule method, but uses a regular expression for the transformation rule.
tag: the part-of-speech tag of the morphemes to transform.
pattern: the rule selecting the forms to transform. This must be a regular expression that can be compiled with re.compile.
repl: what the text matched by pattern is replaced with. This works the same as the repl argument of the re.sub function in Python's regular expression module.
score: the weight score of the transformed morphemes to add. When several morpheme combinations match a form, higher-scored ones take priority. Returns the list of forms newly generated by pattern and repl.
This method is convenient for adding colloquial variants generated by a rule. For example, you can register final endings in which -요 is replaced by -염 (먹어염, 뛰었구염, 배불러염, etc.).
kiwi.add_re_rule('EF', r'요$', r'염', -3.0)
When registering such transformed forms in bulk, we recommend setting the score to -3 or lower so that the transformed forms do not take priority over the original forms in the analysis results.
load_user_dictionary(user_dict_path)
Reads a user dictionary from a file. User dictionary files must be encoded in UTF-8 and structured as shown below. Fields are separated by the tab character (\t), and the word score may be omitted.
# Lines starting with # are treated as comments.
# Fields are separated by the tab (\t) character.
#
# <adding a single morpheme>
# (form) \t (POS tag) \t (score)
# * (score) defaults to 0 when omitted.
키위	NNP	-5.0
#
# <adding a variant of an existing morpheme>
# (variant) \t (original morpheme/POS tag) \t (score)
# * (score) defaults to 0 when omitted.
기위	키위/NNG	-3.0
#
# <adding a pre-analyzed form>
# (form) \t (original morpheme/POS tag + original morpheme/POS tag + ...) \t (score)
# * (score) defaults to 0 when omitted.
사겼다	사귀/VV + 었/EP + 다/EF	-1.0
On success, this returns the number of morphemes newly added from the dictionary file.
For real examples, please refer to the default dictionary file built into Kiwi.
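A usage sketch, assuming a hypothetical file userDict.txt written in the format above:

# Load the user dictionary file; the return value is the number of newly added morphemes.
num_added = kiwi.load_user_dictionary('userDict.txt')
print(num_added)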
After creating a Kiwi object and adding words to the user dictionary, you can use the following methods to perform tasks such as morphological analysis, sentence splitting, spacing correction, and sentence restoration.
Kiwi.tokenize(text, match_option, normalize_coda=False, z_coda=True, split_complex=False, compatible_jamo=False, saisiot=None, blocklist=None)
Kiwi.analyze(text, top_n, match_option, normalize_coda=False, z_coda=True, split_complex=False, compatible_jamo=False, saisiot=None, blocklist=None)
Kiwi.split_into_sents(text, match_options=Match.ALL, normalize_coda=False, z_coda=True, split_complex=False, compatible_jamo=False, saisiot=None, blocklist=None, return_tokens=False)
Kiwi.glue(text_chunks, insert_new_lines=None, return_space_insertions=False)
Kiwi.space(text, reset_whitespace=False)
Kiwi.join(morphs, lm_search=True)
Kiwi.template(format_str, cache=True)
tokenize(text, match_option=Match.ALL, normalize_coda=False, z_coda=True, split_complex=False, compatible_jamo=False, saisiot=None, blocklist=None)
Analyzes the input text and simply returns the result, as a list of Token:
>>> kiwi.tokenize('테스트입니다.')
[Token(form='테스트', tag='NNG', start=0, len=3), Token(form='이', tag='VCP', start=3, len=1), Token(form='ᆸ니다', tag='EF', start=4, len=2)]
The normalize_coda option prevents the analysis from breaking when an extra final consonant such as ㅋㅋㅋ or ㅎㅎ is tacked onto a word.
>> kiwi . tokenize ( "안 먹었엌ㅋㅋ" , normalize_coda = False )
[ Token ( form = '안' , tag = 'NNP' , start = 0 , len = 1 ),
Token ( form = '먹었엌' , tag = 'NNP' , start = 2 , len = 3 ),
Token ( form = 'ㅋㅋ' , tag = 'SW' , start = 5 , len = 2 )]
>> kiwi . tokenize ( "안 먹었엌ㅋㅋ" , normalize_coda = True )
[ Token ( form = '안' , tag = 'MAG' , start = 0 , len = 1 ),
Token ( form = '먹' , tag = 'VV' , start = 2 , len = 1 ),
Token ( form = '었' , tag = 'EP' , start = 3 , len = 1 ),
Token ( form = '어' , tag = 'EF' , start = 4 , len = 1 ),
Token ( form = 'ㅋㅋㅋ' , tag = 'SW' , start = 5 , len = 2 )]analyze(text, top_n=1, match_option=Match.ALL, normalize_coda=False, z_coda=True, split_complex=False, compatible_jamo=False, saisiot=None, blocklist=None) Input the input text is analyzed to return the result. Print the total top_n results in detail. The return value is configured as follows.
[(analysis_result_1, score), (analysis_result_2, score), ...]
Each analysis result is a list of Token. An actual example follows:
>>> kiwi.analyze('테스트입니다.', top_n=5)
[([Token(form='테스트', tag='NNG', start=0, len=3), Token(form='이', tag='VCP', start=3, len=1), Token(form='ᆸ니다', tag='EF', start=4, len=2)], -25.217018127441406),
 ([Token(form='테스트입니', tag='NNG', start=0, len=5), Token(form='다', tag='EC', start=5, len=1)], -40.741905212402344),
 ([Token(form='테스트입니', tag='NNG', start=0, len=5), Token(form='다', tag='MAG', start=5, len=1)], -41.81024932861328),
 ([Token(form='테스트입니', tag='NNG', start=0, len=5), Token(form='다', tag='EF', start=5, len=1)], -42.300254821777344),
 ([Token(form='테스트', tag='NNG', start=0, len=3), Token(form='입', tag='NNG', start=3, len=1), Token(form='니다', tag='EF', start=4, len=2)], -45.86524200439453)]
If text is an iterable of strings, the inputs are processed in parallel, and the return value is an iterable over the per-text results. The work is distributed across multiple threads according to the num_workers argument given when constructing Kiwi(). The order of the results matches the order of the inputs.
>>> result_iter = kiwi.analyze(['테스트입니다.', '테스트가 아닙니다.', '사실 맞습니다.'])
>>> next(result_iter)
[([Token(form='테스트', tag='NNG', start=0, len=3), Token(form='이', tag='VCP', start=3, len=1), Token(form='ᆸ니다', tag='EF', start=4, len=2), Token(form='.', tag='SF', start=6, len=1)], -20.441545486450195)]
>>> next(result_iter)
[([Token(form='테스트', tag='NNG', start=0, len=3), Token(form='가', tag='JKC', start=3, len=1), Token(form='아니', tag='VCN', start=5, len=2), Token(form='ᆸ니다', tag='EF', start=7, len=2), Token(form='.', tag='SF', start=9, len=1)], -30.23870277404785)]
>>> next(result_iter)
[([Token(form='사실', tag='MAG', start=0, len=2), Token(form='맞', tag='VV', start=3, len=1), Token(form='습니다', tag='EF', start=4, len=3), Token(form='.', tag='SF', start=7, len=1)], -22.232769012451172)]
>>> next(result_iter)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
StopIteration
A for loop makes this parallel processing simpler and more convenient. This is useful when analyzing a large amount of text data.
>>> for result in kiwi.analyze(long_list_of_text):
...     tokens, score = result[0]
...     print(tokens)
If you pass text as an iterable of strings, the iterable may actually be read after the call to analyze returns. Therefore, if the iterable is tied to another I/O resource (such as file I/O), do not close that resource until the whole analysis is finished.
>>> file = open('long_text.txt', encoding='utf-8')
>>> result_iter = kiwi.analyze(file)
>>> file.close()  # the file is closed here
>>> next(result_iter)  # tries to read the next text to analyze from the closed file
ValueError: I/O operation on closed file.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
SystemError: <built-in function next> returned a result with an error set
split_into_sents(text, match_options=Match.ALL, normalize_coda=False, z_coda=True, split_complex=False, compatible_jamo=False, saisiot=None, return_tokens=False)
>>> kiwi.split_into_sents("여러 문장으로 구성된 텍스트네 이걸 분리해줘")
[Sentence(text='여러 문장으로 구성된 텍스트네', start=0, end=16, tokens=None),
 Sentence(text='이걸 분리해줘', start=17, end=24, tokens=None)]
>>> kiwi.split_into_sents("여러 문장으로 구성된 텍스트네 이걸 분리해줘", return_tokens=True)
[Sentence(text='여러 문장으로 구성된 텍스트네', start=0, end=16, tokens=[
  Token(form='여러', tag='MM', start=0, len=2),
  Token(form='문장', tag='NNG', start=3, len=2),
  Token(form='으로', tag='JKB', start=5, len=2),
  Token(form='구성', tag='NNG', start=8, len=2),
  Token(form='되', tag='XSV', start=10, len=1),
  Token(form='ᆫ', tag='ETM', start=11, len=0),
  Token(form='텍스트', tag='NNG', start=12, len=3),
  Token(form='이', tag='VCP', start=15, len=1),
  Token(form='네', tag='EF', start=15, len=1)
 ]),
 Sentence(text='이걸 분리해줘', start=17, end=24, tokens=[
  Token(form='이거', tag='NP', start=17, len=2),
  Token(form='ᆯ', tag='JKO', start=19, len=0),
  Token(form='분리', tag='NNG', start=20, len=2),
  Token(form='하', tag='XSV', start=22, len=1),
  Token(form='어', tag='EC', start=22, len=1),
  Token(form='주', tag='VX', start=23, len=1),
  Token(form='어', tag='EF', start=23, len=1)
 ])]
glue(text_chunks, return_space_insertions=False)
text_chunks: a list of text fragments to merge.
return_space_insertions: if True, also returns a List[bool] indicating whether a space was inserted before each fragment.
>>> kiwi.glue([
"그러나 알고보니 그 봉" ,
"지 안에 있던 것은 바로" ,
"레몬이었던 것이다." ])
"그러나 알고보니 그 봉지 안에 있던 것은 바로 레몬이었던 것이다."
>> kiwi . glue ([
"그러나 알고보니 그 봉" ,
"지 안에 있던 것은 바로" ,
"레몬이었던 것이다." ], return_space_insertions = True )
( "그러나 알고보니 그 봉지 안에 있던 것은 바로 레몬이었던 것이다." , [ False , True ])space(text, reset_whitespace=False)text : The string to analyze. If this argument is given to a single STR, it is processed in a single thread and is handled by multisred if it is given to the ITERABLE of the Str.reset_whitespace True, it is also actively performed. The default value is false, and in this case, it is focused on corrections that use the words that are attached. The spacing correction function of this method is based on morphological analysis. Therefore, if the gap is inserted in the middle of the morphology, the correction result may be inaccurate. In this case, you can adjust Kiwi.space_tolerance to ignore the spaces in the morphology or to set reset_whitespace=True to ignore the existing gap and use it to improve the result.
>> kiwi . space ( "띄어쓰기없이작성된텍스트네이걸교정해줘" )
"띄어쓰기 없이 작성된 텍스트네 이걸 교정해 줘."
>> kiwi . space ( "띄 어 쓰 기 문 제 가 있 습 니 다" )
"띄어 쓰기 문 제 가 있 습 니 다"
>> kiwi . space_tolerance = 2 # 형태소 내 공백을 최대 2개까지 허용
>> kiwi . space ( "띄 어 쓰 기 문 제 가 있 습 니 다" )
"띄어 쓰기 문제가 있습니다"
>> kiwi . space ( "띄 어 쓰 기 문 제 가 있 습 니 다" , reset_whitespace = True ) # 기존 공백 전부 무시
"띄어쓰기 문제가 있습니다"join(morphs, lm_search=True)morphs : A list of morphemes to be combined. Each morphology should be a Token type obtained from Kiwi.tokenizer , or tuple type consisting of (form, part, speech).lm_search : If there is an ambiguous type of shame that can be restored in two or more forms, select the optimal morphology through the language model search if this value is true. If you are a false, you do not search for the search, but you can restore it faster. This method uses a rule similar to what is used in space when combining a form, and inserts the space properly. Since the form of the form itself does not include gap related information, the original text is not restored even if the specific text is analyzed with tokenize and then coupled to join again.
>>> kiwi.join([('덥', 'VA'), ('어', 'EC')])
'더워'
>>> tokens = kiwi.tokenize("분석된결과를 다시합칠수있다!")
# Restore the morphological analysis result.
# Spaces are inserted by rule during restoration, so the original text is not reproduced exactly.
>>> kiwi.join(tokens)
'분석된 결과를 다시 합칠 수 있다!'
>>> tokens[3]
Token(form='결과', tag='NNG', start=4, len=2)
>>> tokens[3] = ('내용', 'NNG')  # replace the 4th morpheme: 결과 -> 내용
>>> kiwi.join(tokens)  # joining again shows 결과를 replaced by 내용을
'분석된 내용을 다시 합칠 수 있다!'
# When it is ambiguous whether a form conjugates irregularly,
# lm_search=True picks the best candidate from context.
>>> kiwi.join([('길', 'NNG'), ('을', 'JKO'), ('묻', 'VV'), ('어요', 'EF')])
'길을 물어요'
>>> kiwi.join([('흙', 'NNG'), ('이', 'JKS'), ('묻', 'VV'), ('어요', 'EF')])
'흙이 묻어요'
# With lm_search=False, no search is performed.
>>> kiwi.join([('길', 'NNG'), ('을', 'JKO'), ('묻', 'VV'), ('어요', 'EF')], lm_search=False)
'길을 묻어요'
>>> kiwi.join([('흙', 'NNG'), ('이', 'JKS'), ('묻', 'VV'), ('어요', 'EF')], lm_search=False)
'흙이 묻어요'
# You can specify the conjugation type directly by appending -R (regular)
# or -I (irregular) to verb/adjective tags.
>>> kiwi.join([('묻', 'VV-R'), ('어요', 'EF')])
'묻어요'
>>> kiwi.join([('묻', 'VV-I'), ('어요', 'EF')])
'물어요'
# Since version 0.15.2, the third element of the tuple can specify spacing.
# True forces a space before the morpheme; False forces it to be attached.
>>> kiwi.join([('길', 'NNG'), ('을', 'JKO', True), ('묻', 'VV'), ('어요', 'EF')])
'길 을 물어요'
>>> kiwi.join([('길', 'NNG'), ('을', 'JKO'), ('묻', 'VV', False), ('어요', 'EF')])
'길을물어요'
# Example: removing the past-tense pre-final ending
>>> remove_past = lambda s: kiwi.join(t for t in kiwi.tokenize(s) if t.tagged_form != '었/EP')
>>> remove_past('먹었다')
'먹다'
>>> remove_past('먼 길을 걸었다')
'먼 길을 걷다'
>>> remove_past('전화를 걸었다.')
'전화를 걸다.'
template(format_str, cache=True)
format_str: the template string, which uses the same syntax as Python's str.format (https://docs.python.org/ko/3/library/string.html#formatstrings).
cache: whether to cache the compiled template.
This method makes the morpheme-joining capability of Kiwi.join convenient to use, as follows.
# Blanks are marked with {}.
# Morphemes or other Python objects fill these slots to complete the string.
>>> tpl = kiwi.template("{}가 {}을 {}었다.")
# The template object provides a format method,
# which fills in the blanks.
# A morpheme must be of type kiwipiepy.Token, or a tuple of
# (form, tag) or (form, tag, whether to add a space on the left).
>>> tpl.format(("나", "NP"), ("공부", "NNG"), ("하", "VV"))
'내가 공부를 했다.'
>>> tpl.format(("너", "NP"), ("밥", "NNG"), ("먹", "VV"))
'네가 밥을 먹었다.'
>>> tpl.format(("우리", "NP"), ("길", "NNG"), ("묻", "VV-I"))
'우리가 길을 물었다.'
# When a non-morpheme Python object is passed, it behaves the same as str.format.
>>> tpl.format(5, "str", {"dict": "dict"})
"5가 str를 {'dict': 'dict'}었다."
# Use the !s conversion flag if you want an input treated as a plain Python object
# rather than a morpheme.
>>> tpl = kiwi.template("{!s}가 {}을 {}었다.")
>>> tpl.format(("나", "NP"), ("공부", "NNG"), ("하", "VV"))
"('나', 'NP')가 공부를 했다."
# For Python objects, the same format specifiers as str.format are available.
>>> tpl = kiwi.template("{:.5f}가 {!r}을 {}었다.")
>>> tpl.format(5, "str", {"dict": "dict"})
"5.00000가 'str'를 {'dict': 'dict'}었다."
# Supplying a morpheme to a slot that has a format specifier raises a ValueError.
>>> tpl.format(("우리", "NP"), "str", ("묻", "VV-I"))
ValueError: cannot specify format specifier for Kiwi Token
# You can control substitution order by giving an index or name in the replacement field.
>>> tpl = kiwi.template("{0}가 {obj}를 {verb}\ㄴ다. {1}는 {obj}를 안 {verb}었다.")
>>> tpl.format(
...     [("우리", "NP"), ("들", "XSN")],
...     [("너희", "NP"), ("들", "XSN")],
...     obj=("길", "NNG"),
...     verb=("묻", "VV-I")
... )
'우리들이 길을 묻는다. 너희들은 길을 안 물었다.'
# As in the example above, a final consonant must be escaped with \ before the
# compatibility jamo code; otherwise it is recognized as an initial consonant
# rather than a final one.
>>> tpl = kiwi.template("{0}가 {obj}를 {verb}ㄴ다. {1}는 {obj}를 안 {verb}었다.")
>>> tpl.format(
...     [("우리", "NP"), ("들", "XSN")],
...     [("너희", "NP"), ("들", "XSN")],
...     obj=("길", "NNG"),
...     verb=("묻", "VV-I")
... )
'우리들이 길을 묻 ᄂ이다. 너희들은 길을 안 물었다.'
The part-of-speech tagset is based on the Sejong tagset, with some tags added or modified, as shown below.
| Category | Tag | Description |
|---|---|---|
| Noun (N) | NNG | General noun |
| | NNP | Proper noun |
| | NNB | Bound noun |
| | NR | Numeral |
| | NP | Pronoun |
| Predicate (V) | VV | Verb |
| | VA | Adjective |
| | VX | Auxiliary predicate |
| | VCP | Positive copula (이다) |
| | VCN | Negative copula (아니다) |
| Determiner | MM | Determiner |
| Adverb (MA) | MAG | General adverb |
| | MAJ | Conjunctive adverb |
| Interjection | IC | Interjection |
| Particle (J) | JKS | Subject particle |
| | JKC | Complement particle |
| | JKG | Adnominal particle |
| | JKO | Object particle |
| | JKB | Adverbial particle |
| | JKV | Vocative particle |
| | JKQ | Quotative particle |
| | JX | Auxiliary particle |
| | JC | Conjunctive particle |
| Ending (E) | EP | Pre-final ending |
| | EF | Final ending |
| | EC | Connective ending |
| | ETN | Nominalizing ending |
| | ETM | Adnominal ending |
| Prefix | XPN | Noun prefix |
| Suffix (XS) | XSN | Noun-derivational suffix |
| | XSV | Verb-derivational suffix |
| | XSA | Adjective-derivational suffix |
| | XSM | Adverb-derivational suffix * |
| Root | XR | Root |
| Symbols, foreign characters, special characters (S) | SF | Sentence-final punctuation (. ! ?) |
| | SP | Separating punctuation (, / : ;) |
| | SS | Quotation marks and brackets (' " ( ) [ ] < > { } ≪ ≫, etc.) |
| | SSO | Opening bracket among SS * |
| | SSC | Closing bracket among SS * |
| | SE | Ellipsis (…) |
| | SO | Hyphens and tildes (- ~) |
| | SW | Other special characters |
| | SL | Alphabet (a-z, A-Z) |
| | SH | Chinese characters |
| | SN | Numbers (0-9) |
| | SB | Ordered list markers (가. 나. 1. 2. etc.) |
| Unknown | UN | Unanalyzable * |
| Web (W) | W_URL | URL address * |
| | W_EMAIL | Email address * |
| | W_HASHTAG | Hashtag (#abcd) * |
| | W_MENTION | Mention (@abcd) * |
| | W_SERIAL | Serial number (phone numbers, account numbers, IP addresses, etc.) * |
| | W_EMOJI | Emoji * |
| Etc. | Z_CODA | Appended final consonant * |
| | Z_SIOT | Sai-siot (사이시옷) * |
| | USER0~4 | User-defined tags * |
* Tags unique to Kiwi that are not part of the Sejong tagset.
Since version 0.12.0, the suffixes -R and -I can be attached to the VV, VA, VX, and XSA tags. -R indicates regular conjugation and -I indicates irregular conjugation.
Sentence splitting has been supported experimentally since version 0.10.3, and its accuracy improved significantly in version 0.11.0. Please refer to this page for the performance of the sentence splitting feature.
A single word can often be analyzed in multiple ways, so high accuracy requires taking the context into account; Kiwi maintains high accuracy in such situations. Please refer to this page for its disambiguation performance.
Please refer to Kiwi#Citation for how to cite this project.