flashtext
1.0.0
该模块可用于替换句子中的关键字或从句子中提取关键字。它基于FlashText算法。
$ pip安装flashtext
可以在flashtext上找到文档。
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> # keyword_processor.add_keyword(<unclean name>, <standardised name>)
>>> keyword_processor.add_keyword( ' Big Apple ' , ' New York ' )
>>> keyword_processor.add_keyword( ' Bay Area ' )
>>> keywords_found = keyword_processor.extract_keywords( ' I love Big Apple and Bay Area. ' )
>>> keywords_found
>>> # ['New York', 'Bay Area']>>> keyword_processor.add_keyword( ' New Delhi ' , ' NCR region ' )
>>> new_sentence = keyword_processor.replace_keywords( ' I love Big Apple and new delhi. ' )
>>> new_sentence
>>> # 'I love New York and NCR region.'>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor( case_sensitive = True )
>>> keyword_processor.add_keyword( ' Big Apple ' , ' New York ' )
>>> keyword_processor.add_keyword( ' Bay Area ' )
>>> keywords_found = keyword_processor.extract_keywords( ' I love big Apple and Bay Area. ' )
>>> keywords_found
>>> # ['Bay Area']>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword( ' Big Apple ' , ' New York ' )
>>> keyword_processor.add_keyword( ' Bay Area ' )
>>> keywords_found = keyword_processor.extract_keywords( ' I love big Apple and Bay Area. ' , span_info = True )
>>> keywords_found
>>> # [('New York', 7, 16), ('Bay Area', 21, 29)]>>> from flashtext import KeywordProcessor
>>> kp = KeywordProcessor()
>>> kp.add_keyword( ' Taj Mahal ' , ( ' Monument ' , ' Taj Mahal ' ))
>>> kp.add_keyword( ' Delhi ' , ( ' Location ' , ' Delhi ' ))
>>> kp.extract_keywords( ' Taj Mahal is in Delhi. ' )
>>> # [('Monument', 'Taj Mahal'), ('Location', 'Delhi')]
>>> # NOTE : replace_keywords feature won't work with this.>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword( ' Big Apple ' )
>>> keyword_processor.add_keyword( ' Bay Area ' )
>>> keywords_found = keyword_processor.extract_keywords( ' I love big Apple and Bay Area. ' )
>>> keywords_found
>>> # ['Big Apple', 'Bay Area']>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_dict = {
>>> " java " : [ " java_2e " , " java programing " ],
>>> " product management " : [ " PM " , " product manager " ]
>>> }
>>> # {'clean_name': ['list of unclean names']}
>>> keyword_processor.add_keywords_from_dict(keyword_dict)
>>> # Or add keywords from a list:
>>> keyword_processor.add_keywords_from_list([ " java " , " python " ])
>>> keyword_processor.extract_keywords( ' I am a product manager for a java_2e platform ' )
>>> # output ['product management', 'java']>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_dict = {
>>> " java " : [ " java_2e " , " java programing " ],
>>> " product management " : [ " PM " , " product manager " ]
>>> }
>>> keyword_processor.add_keywords_from_dict(keyword_dict)
>>> print (keyword_processor.extract_keywords( ' I am a product manager for a java_2e platform ' ))
>>> # output ['product management', 'java']
>>> keyword_processor.remove_keyword( ' java_2e ' )
>>> # you can also remove keywords from a list/ dictionary
>>> keyword_processor.remove_keywords_from_dict({ " product management " : [ " PM " ]})
>>> keyword_processor.remove_keywords_from_list([ " java programing " ])
>>> keyword_processor.extract_keywords( ' I am a product manager for a java_2e platform ' )
>>> # output ['product management']>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_dict = {
>>> " java " : [ " java_2e " , " java programing " ],
>>> " product management " : [ " PM " , " product manager " ]
>>> }
>>> keyword_processor.add_keywords_from_dict(keyword_dict)
>>> print ( len (keyword_processor))
>>> # output 4>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword( ' j2ee ' , ' Java ' )
>>> ' j2ee ' in keyword_processor
>>> # output: True
>>> keyword_processor.get_keyword( ' j2ee ' )
>>> # output: Java
>>> keyword_processor[ ' colour ' ] = ' color '
>>> keyword_processor[ ' colour ' ]
>>> # output: color>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword( ' j2ee ' , ' Java ' )
>>> keyword_processor.add_keyword( ' colour ' , ' color ' )
>>> keyword_processor.get_all_keywords()
>>> # output: {'colour': 'color', 'j2ee': 'Java'}为了检测单词边界,当前除此 w [a-za-z0-9_]以外的任何字符都被视为单词边界。
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword( ' Big Apple ' )
>>> print (keyword_processor.extract_keywords( ' I love Big Apple/Bay Area. ' ))
>>> # ['Big Apple']
>>> keyword_processor.add_non_word_boundary( ' / ' )
>>> print (keyword_processor.extract_keywords( ' I love Big Apple/Bay Area. ' ))
>>> # [] $ git克隆https://github.com/vi3k6i5/flashtext $ cd flashtext $ pip安装pytest $ python setup.py测试
$ git克隆https://github.com/vi3k6i5/flashtext $ cd flashtext/docs $ pip安装狮身人面像 $制作html $#打开_build/html/index.html在浏览器中查看本地
这是一种基于Aho-Corasick算法和Trie词典的自定义算法。

FlashText花费的时间与REGEX相比找到术语。
FlashText与Regex相比,FlashText所花费的时间替换条款。
链接到代码进行基准测试查找功能并替换功能。
该库的想法来自以下stackoverflow问题。
原始论文发表在FlashText算法上。
@Article {2017arxiv171100046s,
作者= {{singh},V。},
title =“ {更换或检索文档中的关键字}”,
日记= {arxiv e-prints},
ArchivePrefix =“ arxiv”,
eprint = {1711.00046},
primaryclass =“ cs.ds”,
关键字= {计算机科学 - 数据结构和算法},
年= 2017年,
月=十月,
adsurl = {http://adsabs.harvard.edu/abs/2017arxiv171100046s},
adsnote = {由SAO/NASA天体物理数据系统提供
}
这篇文章发表在中等五自由状运动上。
该项目是根据MIT许可证获得许可的。