NLP multilingue et segmentation de texte efficaces en Go ; prise en charge de l'anglais, du chinois, du japonais et d'autres langues. S'intègre également à Elasticsearch et à Bleve.
简体中文
GSE est une implémentation de Jieba en Go, qui vise à ajouter la prise en charge du NLP ainsi que d'autres fonctionnalités.
GSE-Bind : bindings pour JavaScript et d'autres langages, afin de prendre en charge davantage de langages.
Avec la prise en charge des modules Go (Go 1.11+), il suffit d'importer :
import "github.com/go-ego/gse"
Sinon, pour installer le package GSE, exécutez la commande :
go get -u github.com/go-ego/gse
package main
import (
_ "embed"
"fmt"
"github.com/go-ego/gse"
)
// testDict holds the contents of testdata/test_en2.txt, embedded into the
// binary at build time by the go:embed directive below.
//go:embed testdata/test_en2.txt
var testDict string
// testEn holds the contents of testdata/test_en.txt, embedded into the
// binary at build time by the go:embed directive below.
//go:embed testdata/test_en.txt
var testEn string
// Sample inputs segmented by main.
var (
// text is an English sentence containing spaces and punctuation.
text = "To be or not to be, that's the question!"
// test1 contains words written without separating spaces.
test1 = "Hiworld, Helloworld!"
)
func main () {
var seg1 gse. Segmenter
seg1 . DictSep = ","
err := seg1 . LoadDict ( "./testdata/test_en.txt" )
if err != nil {
fmt . Println ( "Load dictionary error: " , err )
}
s1 := seg1 . Cut ( text )
fmt . Println ( "seg1 Cut: " , s1 )
// seg1 Cut: [to be or not to be , that's the question!]
var seg2 gse. Segmenter
seg2 . AlphaNum = true
seg2 . LoadDict ( "./testdata/test_en_dict3.txt" )
s2 := seg2 . Cut ( test1 )
fmt . Println ( "seg2 Cut: " , s2 )
// seg2 Cut: [hi world , hello world !]
var seg3 gse. Segmenter
seg3 . AlphaNum = true
seg3 . DictSep = ","
err = seg3 . LoadDictEmbed ( testDict + " n " + testEn )
if err != nil {
fmt . Println ( "loadDictEmbed error: " , err )
}
s3 := seg3 . Cut ( text + test1 )
fmt . Println ( "seg3 Cut: " , s3 )
// seg3 Cut: [to be or not to be , that's the question! hi world , hello world !]
// example2()
}Exemple2:
package main
import (
"fmt"
"regexp"
"github.com/go-ego/gse"
"github.com/go-ego/gse/hmm/pos"
)
// Package-level state shared by the example functions below.
var (
// text is the sample input; it mixes English, Japanese and Chinese.
text = "Hello world, Helloworld. Winter is coming! こんにちは世界, 你好世界."
// new is a segmenter built by gse.New; the error result is discarded.
// NOTE(review): this identifier shadows Go's builtin new within the package.
new , _ = gse . New ( "zh,testdata/test_en_dict3.txt" , "alpha" )
// seg is declared with its zero value; main loads a dictionary into it.
seg gse. Segmenter
// posSeg is the pos.Segmenter used by cutPos.
posSeg pos. Segmenter
)
// main loads the default dictionary into the package-level seg and then runs
// the cut and segCut demonstrations. A load failure is printed rather than
// silently ignored; the demonstrations still run afterwards.
func main() {
	// Loading the default dictionary
	if err := seg.LoadDict(); err != nil {
		fmt.Println("Load dictionary error: ", err)
	}
	// Loading the default dictionary with embed
	// seg.LoadDictEmbed()
	//
	// Loading the Simplified Chinese dictionary
	// seg.LoadDict("zh_s")
	// seg.LoadDictEmbed("zh_s")
	//
	// Loading the Traditional Chinese dictionary
	// seg.LoadDict("zh_t")
	//
	// Loading the Japanese dictionary
	// seg.LoadDict("jp")
	//
	// Load the dictionary
	// seg.LoadDict("your gopath"+"/src/github.com/go-ego/gse/data/dict/dictionary.txt")

	cut()
	segCut()
}
func cut () {
hmm := new . Cut ( text , true )
fmt . Println ( "cut use hmm: " , hmm )
hmm = new . CutSearch ( text , true )
fmt . Println ( "cut search use hmm: " , hmm )
fmt . Println ( "analyze: " , new . Analyze ( hmm , text ))
hmm = new . CutAll ( text )
fmt . Println ( "cut all: " , hmm )
reg := regexp . MustCompile ( `(d+年|d+月|d+日|[p{Latin}]+|[p{Hangul}]+|d+.d+|[a-zA-Z0-9]+)` )
text1 := `헬로월드 헬로 서울, 2021年09月10日, 3.14`
hmm = seg . CutDAG ( text1 , reg )
fmt . Println ( "Cut with hmm and regexp: " , hmm , hmm [ 0 ], hmm [ 6 ])
}
// analyzeAndTrim prints the seg.Analyze result for the given segments, the
// segments after seg.Trim, and finally the seg.String and seg.Slice results
// for the package-level text.
func analyzeAndTrim(cut []string) {
	fmt.Println("analyze the segment: ", seg.Analyze(cut, ""))

	trimmed := seg.Trim(cut)
	fmt.Println("cut all: ", trimmed)

	asString := seg.String(text, true)
	fmt.Println(asString)

	asSlice := seg.Slice(text, true)
	fmt.Println(asSlice)
}
// cutPos prints the seg.Pos result for the package-level text before and
// after seg.TrimPos, then calls pos.WithGse(seg) and prints the posSeg.Cut
// result before and after posSeg.TrimWithPos with the "zg" argument.
func cutPos() {
	tagged := seg.Pos(text, true)
	fmt.Println("pos: ", tagged)
	fmt.Println("trim pos: ", seg.TrimPos(tagged))

	pos.WithGse(seg)
	retagged := posSeg.Cut(text, true)
	fmt.Println("pos: ", retagged)
	fmt.Println("trim pos: ", posSeg.TrimWithPos(retagged, "zg"))
}
func segCut () {
// Text Segmentation
tb := [] byte ( text )
fmt . Println ( seg . String ( text , true ))
segments := seg . Segment ( tb )
// Handle word segmentation results, search mode
fmt . Println ( gse . ToString ( segments , true ))
}Regardez un exemple de dictionnaire personnalisé
package main
import (
"fmt"
_ "embed"
"github.com/go-ego/gse"
)
// testDict holds the contents of test_en_dict3.txt, embedded into the binary
// at build time by the go:embed directive below.
//go:embed test_en_dict3.txt
var testDict string
func main () {
// var seg gse.Segmenter
// seg.LoadDict("zh, testdata/zh/test_dict.txt, testdata/zh/test_dict1.txt")
// seg.LoadStop()
seg , err := gse . NewEmbed ( "zh, word 20 n" + testDict , "en" )
// seg.LoadDictEmbed()
seg . LoadStopEmbed ()
text1 := "Hello world, こんにちは世界, 你好世界!"
s1 := seg . Cut ( text1 , true )
fmt . Println ( s1 )
fmt . Println ( "trim: " , seg . Trim ( s1 ))
fmt . Println ( "stop: " , seg . Stop ( s1 ))
fmt . Println ( seg . String ( text1 , true ))
segments := seg . Segment ([] byte ( text1 ))
fmt . Println ( gse . ToString ( segments ))
}Regardez un exemple chinois
Regardez un exemple japonais
Comment l'utiliser avec Elasticsearch?
go-gse-elastic
GSE est principalement distribué selon les termes de la licence MIT et de la licence Apache (version 2.0). Voir LICENSE-APACHE et LICENSE-MIT.
Merci à sego et à jieba (jiebago).