python 全文檢索引擎詳解
最近一直在探索如何用Python實現類似百度的關鍵詞檢索功能。說起關鍵詞檢索,我們會不由自主地聯想到正則表達式。正則表達式是所有檢索的基礎,python中有個re模塊,專門用于正則匹配。然而,光靠正則表達式並不能很好地實現檢索功能。
python有一個whoosh包,是專門用于全文搜索的引擎庫。
whoosh在國內使用得比較少,性能和成熟度也還不及sphinx/coreseek;不過與後者不同,它是一個純python庫,對python愛好者來說使用起來更為方便。具體的代碼如下:
安裝
在命令行輸入 pip install whoosh(下文的中文分詞解析器還用到jieba,可一並執行 pip install jieba)
需要導入的包有:
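import os
import jieba
from whoosh.index import create_in
from whoosh.index import open_dir
from whoosh.fields import *
from whoosh.analysis import RegexAnalyzer
from whoosh.analysis import Tokenizer,Token
from whoosh.compat import text_type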
中文分詞解析器
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with jieba (search mode).

    Calling an instance yields one ``Token`` object that is reused and
    updated in place for every segment, so consumers must read its
    attributes before advancing the generator.
    """

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=True, removestops=True, start_pos=0,
                 start_char=0, mode='', **kwargs):
        """Yield a token for each segment jieba finds in ``value``.

        Args:
            value: The unicode text to segment.
            positions: When true, set ``pos`` on each token.
            chars: When true, set ``startchar`` and ``endchar`` on each token.
            keeporiginal: Accepted for signature compatibility; not used here.
            removestops: Forwarded to the ``Token`` constructor.
            start_pos: Offset added to every ``pos``.
            start_char: Offset added to every ``startchar`` / ``endchar``.
            mode: Forwarded to the ``Token`` constructor.
            **kwargs: Forwarded to the ``Token`` constructor.

        Raises:
            AssertionError: If ``value`` is not a unicode string.
        """
        # Wrap value in a tuple so the message also formats when value is
        # itself a tuple.
        assert isinstance(value, text_type), "%r is not unicode " % (value,)
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        # jieba.tokenize reports each segment's real (start, end) offsets.
        # Looking the word up with value.find() would always return the first
        # occurrence, so a word that appears more than once would get the
        # wrong position and highlight span.
        for w, begin, end in jieba.tokenize(value, mode='search'):
            t.original = t.text = w
            t.boost = 0.5
            if positions:
                # NOTE(review): pos is a character offset here, not a token
                # ordinal -- confirm this is what phrase queries should see.
                t.pos = start_pos + begin
            if chars:
                t.startchar = start_char + begin
                t.endchar = start_char + end
            yield t


def chinese_analyzer():
    """Return a new ``ChineseTokenizer`` for use as a whoosh analyzer."""
    return ChineseTokenizer()
構建索引的函數
@staticmethod
def create_index(document_dir, index_dir="./"):
    """Build a whoosh index from every file found under ``document_dir``.

    The directory tree is walked recursively. Each file is read as UTF-8 and
    stored with three fields: ``titel`` (the file name with ".txt" removed),
    ``path`` and ``content``. Each title is printed as it is indexed.

    Args:
        document_dir: Root directory containing the documents to index.
        index_dir: Directory in which the index is created. Defaults to the
            current directory, which is where the index was always written
            before this parameter existed.
    """
    analyzer = chinese_analyzer()
    # The field name "titel" is kept as is: it is the key under which
    # documents are stored and later looked up.
    schema = Schema(titel=TEXT(stored=True, analyzer=analyzer),
                    path=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    ix = create_in(index_dir, schema)
    # As a context manager the writer commits on a clean exit and cancels
    # (releasing the index lock) if an exception escapes the loop.
    with ix.writer() as writer:
        for parents, dirnames, filenames in os.walk(document_dir):
            for filename in filenames:
                title = filename.replace(".txt", "")
                # On Python 2 os.walk may hand back byte strings; whoosh
                # needs unicode text.
                if isinstance(title, bytes):
                    title = title.decode('utf8')
                print(title)
                # Join with the directory currently being walked, not the
                # root, so files in sub-directories can be opened.
                file_path = os.path.join(parents, filename)
                with open(file_path, 'rb') as handle:
                    content = handle.read().decode('utf-8')
                # NOTE(review): every document is stored with the same
                # constant path -- confirm whether the real file path was
                # intended.
                path = u"/b"
                writer.add_document(titel=title, path=path, content=content)
檢索函數
@staticmethod
def search(search_str, index_dir="./"):
    """Search the ``content`` field and return the titles of matching documents.

    The query, and each hit's title, score and highlighted excerpt, are
    printed as they are found; the printed output is unchanged from the
    earlier version of this function.

    Args:
        search_str: Query string parsed against the ``content`` field.
        index_dir: Directory holding the index. Defaults to the current
            directory, which is where the index was always read from before
            this parameter existed.

    Returns:
        A list with the stored ``titel`` value of every hit, in result order.
    """
    title_list = []
    print('here')
    ix = open_dir(index_dir)
    # The searcher holds open index files. The context manager closes it
    # even if the query fails, and hits must be read while it is still open.
    with ix.searcher() as searcher:
        print("%s %s" % (search_str, type(search_str)))
        results = searcher.find("content", search_str)
        for hit in results:
            print(hit['titel'])
            print(hit.score)
            print(hit.highlights("content", top=10))
            title_list.append(hit['titel'])
    print("tt %s" % (title_list,))
    return title_list
感謝閱讀,希望能幫助到大家,謝謝大家對本站的支持!
新聞熱點
疑難解答