发布时间: 2019-09-08 13:28:19
7、使用第5步中训练好的模型,根据第6步提取的特征向量对邮件进行分类。
2.代码目录结构
3.编写mail_savemodel.py文件
3.1.导入需要用到的标准库和扩展库对象
from re import sub
from os import listdir
from collections import Counter
from itertools import chain
from numpy import array
from jieba import cut
from sklearn.externals import joblib
from sklearn.naive_bayes import MultinomialNB
3.2.读取全部训练集,删除干扰字符或长度为1的单词
# Holds the words of every file that has been read.
# Each element is a sub-list containing the words of one file.
allWords = []
def getWordsFromFile(txtFile):
    """Return the word tokens of a UTF-8 text file with noise removed.

    Every line is stripped of surrounding whitespace, noise characters are
    deleted, the remainder is segmented with ``cut`` (jieba) and tokens of
    length 1 are discarded.

    Args:
        txtFile: Path of the text file to read.

    Returns:
        A list of tokens, each longer than one character, in file order.
    """
    # The hyphen is escaped so it is removed as a literal character.
    # Left unescaped between "、" and "。" it defines a character range,
    # which matches only those two characters and leaves "-" in the text.
    noisePattern = r'[.【】0-9、\-。,!~\*]'
    words = []
    with open(txtFile, encoding="utf8") as fp:
        for line in fp:
            cleaned = sub(noisePattern, '', line.strip())
            # Drop single-character tokens (lone characters, whitespace and
            # leftover punctuation) so they cannot become feature words.
            words.extend(word for word in cut(cleaned) if len(word) > 1)
    return words
3.3.获取并返回出现次数最多的前topN个单词
def getTopNWords(topN):
    """Return the ``topN`` most frequent words across the training mails.

    Reads ``data/0.txt`` through ``data/150.txt`` in numeric order and
    stores one word list per file in the module-level ``allWords``.

    Args:
        topN: Number of most frequent words to return.

    Returns:
        A list of at most ``topN`` words, most frequent first.
    """
    # 151 mails in total: 0.txt-126.txt are spam, 127.txt-150.txt are
    # normal mails.
    txtFiles = ["data/" + str(i) + ".txt" for i in range(151)]
    # Slice assignment replaces the contents of the shared list in place,
    # so calling this function again does not append a second copy of
    # every mail to allWords.
    allWords[:] = [getWordsFromFile(txtFile) for txtFile in txtFiles]
    # Count occurrences over all mails without unpacking every sub-list
    # into a separate call argument.
    freq = Counter(chain.from_iterable(allWords))
    return [word for word, _ in freq.most_common(topN)]
# The 600 most frequent words across the whole training set
topWords = getTopNWords(600)
3.4.创建贝叶斯模型,使用已有数据进行训练
# Feature matrix: one row per mail, one column per top word. Each entry is
# the number of times that word occurs in that mail (a raw count).
vector = []
for words in allWords:
    # Tally the mail once; looking up each top word in the tally avoids
    # rescanning the whole word list for every one of the top words.
    counts = Counter(words)
    vector.append([counts[word] for word in topWords])
vector = array(vector)
# Mail labels: 1 marks spam, 0 marks a normal mail.
# The first 127 rows are labelled spam and the remaining 24 normal.
labels = array([1]*127 + [0]*24)
# Create the model and train it on the known training set.
model = MultinomialNB()
model.fit(vector, labels)
3.5.保存模型
# Persist the trained classifier to disk.
# NOTE(review): joblib comes from sklearn.externals in this file; newer
# scikit-learn releases no longer ship that module - confirm the installed
# version or import joblib directly.
joblib.dump(model, "垃圾邮件分类器.pkl")
# Store the feature words as a single comma-separated line.
# NOTE(review): a feature word that itself contains "," would not survive
# being split on "," again - verify the word list cannot contain one.
with open("topWords.txt", "w", encoding="utf8") as fp:
    print(*topWords, sep=",", end="", file=fp)
print("保存topWords成功.")
4.编写mail_loadmodel.py文件
4.1.定义分词函数并加载模型(本文件同样需要先导入3.1中的sub、cut、array和joblib)
def getWordsFromFile(txtFile):
    """Return the word tokens of a UTF-8 text file with noise removed.

    Every line is stripped of surrounding whitespace, noise characters are
    deleted and the remainder is segmented with ``cut`` (jieba).

    Args:
        txtFile: Path of the text file to read.

    Returns:
        A list of tokens in file order.
    """
    # The hyphen is escaped so it is removed as a literal character.
    # Left unescaped between "、" and "。" it defines a character range,
    # which matches only those two characters and leaves "-" in the text.
    # NOTE(review): the tokens are matched against the saved feature words,
    # so this cleaning should mirror what was used when those were built.
    noisePattern = r'[.【】0-9、\-。,!~\*]'
    words = []
    with open(txtFile, encoding="utf8") as fp:
        for line in fp:
            cleaned = sub(noisePattern, '', line.strip())
            words.extend(cut(cleaned))
    return words
# Restore the trained classifier from disk.
# NOTE(review): joblib files are pickle-based; only load files from a
# trusted source.
model = joblib.load("垃圾邮件分类器.pkl")
print('加载模型和训练结果成功。')
# Read the comma-separated feature words; their order defines the columns
# of the feature vector built in predict().
with open("topWords.txt", encoding="utf8") as fp:
    topWords = fp.read()
topWords = topWords.split(",")
4.2.使用训练好的模型对未知邮件内容进行分类。
def predict(txtFile):
    """Classify the mail stored in ``txtFile``.

    The mail is tokenized with ``getWordsFromFile``, turned into a vector
    holding the count of every word in ``topWords`` and passed to the
    loaded ``model``.

    Args:
        txtFile: Path of the mail text file to classify.

    Returns:
        "垃圾邮件" when the model predicts label 1, otherwise "正常邮件".
    """
    # Imported locally so the function does not depend on the surrounding
    # script having imported Counter.
    from collections import Counter

    # Tally the mail once instead of rescanning its word list for every
    # feature word.
    counts = Counter(getWordsFromFile(txtFile))
    currentVector = array([counts[word] for word in topWords])
    # The model expects a 2-D input: one row holding this single sample.
    result = model.predict(currentVector.reshape(1, -1))
    # result holds one label; index it rather than relying on the truth
    # value of a one-element array.
    return "垃圾邮件" if result[0] == 1 else "正常邮件"
# 151.txt through 155.txt hold the test mails; print "<path>:<label>" for
# each of them.
for mail in ['data/%d.txt' % i for i in range(151, 156)]:
    print(":".join((mail, predict(mail))))