如下所示:
文本過濾
# Four alternative filters for `content`; each one overwrites `result`, so use
# only the line that matches the characters you want to keep.
# The patterns use the \uXXXX escapes understood by the `re` module and are
# double-quoted so the quotation marks inside the character class cannot end
# the literal.
# NOTE(review): the closing single quote is written as U+2019 to pair with the
# U+2018 that precedes it - confirm against the intended punctuation set.

# Keep only Chinese characters (U+4E00-U+9FA5) and the listed punctuation.
result = re.sub(r"[^\u4e00-\u9fa5,。?!,、;:“ ”‘ ’( )《 》〈 〉]", "", content)

# Keep only Chinese characters.
result = re.sub(r"[^\u4e00-\u9fa5]", "", content)

# Keep only Chinese characters, the listed punctuation, digits and '.'.
# The digit range is spelled "0-9": an escaped "\0-9" would be a NUL-to-'9'
# range and would let most ASCII punctuation through.
result = re.sub(r"[^0-9.\u4e00-\u9fa5,。?!,、;:“ ”‘ ’( )《 》〈 〉]", "", content)

# Keep only Chinese characters, ASCII letters, digits and ','.
result = re.sub(r"[^\u4e00-\u9fa5,A-Za-z0-9]", "", content)
文本去除兩個以上空格
# Delete every run of two or more whitespace characters (the run is removed
# entirely, not collapsed to a single space). The class must be "\s"; "/s"
# would only match a literal slash followed by the letter "s".
content = re.sub(r"\s{2,}", "", content)
base64編碼變成中文
def bas4_decode(bas4_content):
    """Decode a Base64 payload and keep only Chinese text, punctuation and digits.

    Args:
        bas4_content: Base64-encoded data (bytes, or an ASCII string) whose
            decoded bytes are UTF-8 text.

    Returns:
        The decoded text with every character removed except CJK ideographs
        in U+4E00-U+9FA5, ASCII digits, '.', spaces and the punctuation
        listed in the pattern below.

    Raises:
        binascii.Error: if the input is not valid Base64.
        UnicodeDecodeError: if the decoded bytes are not valid UTF-8.
    """
    decoded_text = base64.b64decode(bas4_content).decode("utf-8")
    # Double-quoted raw string so the quotation marks inside the class cannot
    # terminate the literal; "0-9" (not "\0-9") keeps digits only instead of
    # the whole NUL-to-'9' ASCII range.
    # NOTE(review): the closing single quote is written as U+2019 to pair with
    # the preceding U+2018 - confirm against the intended punctuation set.
    return re.sub(
        r"[^0-9.\u4e00-\u9fa5,。?!,、;:“ ”‘ ’( )《 》〈 〉]",
        "",
        decoded_text,
    )
文本去停用詞
def text_to_wordlist(text):
    """Segment the Chinese part of *text* with jieba and drop stop words.

    Args:
        text: Input string; every character outside U+4E00-U+9FA5 is removed
            before segmentation.

    Returns:
        A list of the tokens produced by ``jieba.cut`` that do not appear in
        ``./stopword.txt``, in their original order.

    Raises:
        OSError: if ``./stopword.txt`` cannot be opened.
    """
    chinese_only = re.sub(r"[^\u4e00-\u9fa5]", "", text)

    # The stop-word file is read as whitespace-separated entries; a set makes
    # the per-token membership test O(1).
    with open("./stopword.txt", "r", encoding="utf-8") as stop_file:
        stop_words = set(stop_file.read().split())

    # NOTE: a custom dictionary should be added to cover words that jieba's
    # default dictionary lacks, to improve segmentation accuracy.
    return [word for word in jieba.cut(chinese_only) if word not in stop_words]
文本特徵提取
import re

import jieba
import jieba.analyse
import numpy as np
# import json


def _segmented_chinese(content):
    """Strip non-Chinese characters from *content* and return the jieba tokens joined by '|'."""
    chinese_only = re.sub(r"[^\u4e00-\u9fa5]", "", content)
    return "|".join(jieba.cut(chinese_only))


def Textrank(content):
    """Extract up to 10 keywords from *content* with jieba's TextRank.

    Args:
        content: Input string; only characters in U+4E00-U+9FA5 are analysed.

    Returns:
        Whatever ``jieba.analyse.textrank`` returns for ``topK=10`` and
        ``withWeight=False`` (presumably a list of keyword strings - confirm
        against the installed jieba version).
    """
    joined = _segmented_chinese(content)
    # Stop words are loaded from 'stopword.txt' before every extraction.
    jieba.analyse.set_stop_words('stopword.txt')
    return jieba.analyse.textrank(joined, topK=10, withWeight=False)


def TF_IDF(content):
    """Extract up to 10 keywords from *content* with jieba's TF-IDF extractor.

    Args:
        content: Input string; only characters in U+4E00-U+9FA5 are analysed.

    Returns:
        Whatever ``jieba.analyse.extract_tags`` returns for ``topK=10``,
        ``withWeight=False`` and an empty ``allowPOS`` (presumably a list of
        keyword strings - confirm against the installed jieba version).
    """
    joined = _segmented_chinese(content)
    jieba.analyse.set_stop_words('stopword.txt')
    # NOTE(review): the original author states that jieba's tfidf.py was
    # modified locally for this call; results with stock jieba may differ.
    return jieba.analyse.extract_tags(
        joined, topK=10, withWeight=False, allowPOS=()
    )
以上這篇使用Python進行文本預處理和提取特徵的實例就是小編分享給大家的全部內容了,希望能給大家一個參考,也希望大家多多支持武林站長站。
新聞熱點
疑難解答