gse
v0.80.3
NLP multilingüe y segmentación de texto eficientes en Go; compatible con inglés, chino, japonés y otros idiomas. También se integra con Elasticsearch y Bleve.
简体中文
Gse es una implementación de Jieba en Golang, que además intenta añadir soporte de PLN y más funcionalidades.
Los bindings para JavaScript y otros lenguajes, enlazados a gse, permiten usarlo desde más lenguajes.
Con soporte de módulos de Go (Go 1.11+), basta con importar:
import "github.com/go-ego/gse"
De lo contrario, para instalar el paquete gse, ejecute el comando:
go get -u github.com/go-ego/gse
package main
import (
_ "embed"
"fmt"
"github.com/go-ego/gse"
)
// testDict holds the contents of testdata/test_en2.txt, embedded at build time.
//go:embed testdata/test_en2.txt
var testDict string
// testEn holds the contents of testdata/test_en.txt, embedded at build time.
//go:embed testdata/test_en.txt
var testEn string
// Sample inputs segmented by main.
var (
// text is a plain English sentence with punctuation.
text = "To be or not to be, that's the question!"
// test1 contains words written without separating spaces.
test1 = "Hiworld, Helloworld!"
)
func main () {
var seg1 gse. Segmenter
seg1 . DictSep = ","
err := seg1 . LoadDict ( "./testdata/test_en.txt" )
if err != nil {
fmt . Println ( "Load dictionary error: " , err )
}
s1 := seg1 . Cut ( text )
fmt . Println ( "seg1 Cut: " , s1 )
// seg1 Cut: [to be or not to be , that's the question!]
var seg2 gse. Segmenter
seg2 . AlphaNum = true
seg2 . LoadDict ( "./testdata/test_en_dict3.txt" )
s2 := seg2 . Cut ( test1 )
fmt . Println ( "seg2 Cut: " , s2 )
// seg2 Cut: [hi world , hello world !]
var seg3 gse. Segmenter
seg3 . AlphaNum = true
seg3 . DictSep = ","
err = seg3 . LoadDictEmbed ( testDict + " n " + testEn )
if err != nil {
fmt . Println ( "loadDictEmbed error: " , err )
}
s3 := seg3 . Cut ( text + test1 )
fmt . Println ( "seg3 Cut: " , s3 )
// seg3 Cut: [to be or not to be , that's the question! hi world , hello world !]
// example2()
}Ejemplo2:
package main
import (
"fmt"
"regexp"
"github.com/go-ego/gse"
"github.com/go-ego/gse/hmm/pos"
)
// Shared state used by the example functions below.
var (
// text is the mixed English / Japanese / Chinese sample sentence.
text = "Hello world, Helloworld. Winter is coming! こんにちは世界, 你好世界."
// new is the segmenter returned by gse.New for the given dictionary list.
// NOTE(review): this name shadows Go's built-in new, and the second return
// value (presumably an error) is discarded — consider renaming and checking it.
new , _ = gse . New ( "zh,testdata/test_en_dict3.txt" , "alpha" )
// seg is a zero-value segmenter; main loads its dictionary before use.
seg gse. Segmenter
// posSeg is the segmenter from the hmm/pos package used by cutPos.
posSeg pos. Segmenter
)
// main loads the default dictionary into the package-level seg and then runs
// the cut and segCut examples.
func main() {
	// Loading the default dictionary; report a failure instead of dropping it.
	if err := seg.LoadDict(); err != nil {
		fmt.Println("Load dictionary error: ", err)
	}
	// Loading the default dictionary with embed
	// seg.LoadDictEmbed()
	//
	// Loading the Simplified Chinese dictionary
	// seg.LoadDict("zh_s")
	// seg.LoadDictEmbed("zh_s")
	//
	// Loading the Traditional Chinese dictionary
	// seg.LoadDict("zh_t")
	//
	// Loading the Japanese dictionary
	// seg.LoadDict("jp")
	//
	// Load the dictionary
	// seg.LoadDict("your gopath"+"/src/github.com/go-ego/gse/data/dict/dictionary.txt")

	cut()
	segCut()
}
func cut () {
hmm := new . Cut ( text , true )
fmt . Println ( "cut use hmm: " , hmm )
hmm = new . CutSearch ( text , true )
fmt . Println ( "cut search use hmm: " , hmm )
fmt . Println ( "analyze: " , new . Analyze ( hmm , text ))
hmm = new . CutAll ( text )
fmt . Println ( "cut all: " , hmm )
reg := regexp . MustCompile ( `(d+年|d+月|d+日|[p{Latin}]+|[p{Hangul}]+|d+.d+|[a-zA-Z0-9]+)` )
text1 := `헬로월드 헬로 서울, 2021年09月10日, 3.14`
hmm = seg . CutDAG ( text1 , reg )
fmt . Println ( "Cut with hmm and regexp: " , hmm , hmm [ 0 ], hmm [ 6 ])
}
// analyzeAndTrim prints the seg.Analyze result for an already-cut token list,
// the list returned by seg.Trim, and then the seg.String and seg.Slice
// renderings of the package-level text.
func analyzeAndTrim(cut []string) {
	fmt.Println("analyze the segment: ", seg.Analyze(cut, ""))

	trimmed := seg.Trim(cut)
	fmt.Println("cut all: ", trimmed)

	fmt.Println(seg.String(text, true))
	fmt.Println(seg.Slice(text, true))
}
// cutPos prints part-of-speech results for text twice: first from seg.Pos and
// seg.TrimPos, then from posSeg.Cut and posSeg.TrimWithPos with the "zg" tag.
func cutPos() {
	tagged := seg.Pos(text, true)
	fmt.Println("pos: ", tagged)
	fmt.Println("trim pos: ", seg.TrimPos(tagged))

	// NOTE(review): WithGse is invoked on the pos package here rather than on
	// posSeg — confirm against the gse/hmm/pos API that this is intended.
	pos.WithGse(seg)
	hmmTagged := posSeg.Cut(text, true)
	fmt.Println("pos: ", hmmTagged)
	fmt.Println("trim pos: ", posSeg.TrimWithPos(hmmTagged, "zg"))
}
func segCut () {
// Text Segmentation
tb := [] byte ( text )
fmt . Println ( seg . String ( text , true ))
segments := seg . Segment ( tb )
// Handle word segmentation results, search mode
fmt . Println ( gse . ToString ( segments , true ))
}Mira un ejemplo de diccionario personalizado
package main
import (
"fmt"
_ "embed"
"github.com/go-ego/gse"
)
// testDict holds the contents of test_en_dict3.txt, embedded at build time.
//go:embed test_en_dict3.txt
var testDict string
func main () {
// var seg gse.Segmenter
// seg.LoadDict("zh, testdata/zh/test_dict.txt, testdata/zh/test_dict1.txt")
// seg.LoadStop()
seg , err := gse . NewEmbed ( "zh, word 20 n" + testDict , "en" )
// seg.LoadDictEmbed()
seg . LoadStopEmbed ()
text1 := "Hello world, こんにちは世界, 你好世界!"
s1 := seg . Cut ( text1 , true )
fmt . Println ( s1 )
fmt . Println ( "trim: " , seg . Trim ( s1 ))
fmt . Println ( "stop: " , seg . Stop ( s1 ))
fmt . Println ( seg . String ( text1 , true ))
segments := seg . Segment ([] byte ( text1 ))
fmt . Println ( gse . ToString ( segments ))
}Mira un ejemplo chino
Mira un ejemplo japonés
¿Cómo usarlo con Elasticsearch?
gse de Go-GSE
Gse se distribuye principalmente bajo los términos de «tanto la licencia MIT como la licencia Apache (versión 2.0)». Véanse LICENSE-APACHE y LICENSE-MIT.
Gracias a sego y jieba (jiebago).