核心内容摘要
垃圾邮件过滤系统完整项目:从模型训练到部署的全流程技术解析
项目概述
垃圾邮件过滤是机器学习在自然语言处理领域的经典应用之一。
本项目将从零开始构建一个完整的垃圾邮件过滤系统,涵盖数据处理、模型训练、后端API开发到前端界面的全栈实现。
技术栈
后端技术
Python 3.8: 主要开发语言
Flask: 轻量级Web框架
scikit-learn: 机器学习库
pandas / numpy: 数据处理
nltk: 自然语言处理
前端技术
HTML/CSS/JavaScript: 基础前端技术
Bootstrap: UI框架
Axios: HTTP客户端
数据库
SQLite: 轻量级数据库,存储邮件记录
项目架构
spam-filter/
├── backend/
│   ├── app.py               # Flask应用主文件
│   ├── model.py             # 机器学习模型
│   ├── preprocessor.py      # 数据预处理
│   └── database.py          # 数据库操作
├── frontend/
│   ├── index.html           # 主页面
│   ├── style.css            # 样式文件
│   └── script.js            # 前端逻辑
├── models/
│   └── spam_classifier.pkl  # 训练好的模型
├── data/
│   └── emails.csv           # 训练数据集
└── requirements.txt         # 依赖包
核心功能实现
数据预处理模块
文本预处理是提高模型性能的关键步骤,主要包括文本清洗、分词、去除停用词等操作。
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


class TextPreprocessor:
    """Clean, tokenize, stop-word-filter and stem raw English e-mail text."""

    # NOTE(review): the quantifiers and the "@" in these patterns were lost in
    # the garbled source and have been restored to the conventional forms --
    # confirm against the original article.
    _URL_RE = re.compile(r'http\S+|www\S+')
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self):
        # Fetch the stop-word corpus if it is not already present locally.
        nltk.download('stopwords', quiet=True)
        # A set gives O(1) membership tests in preprocess().
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def clean_text(self, text):
        """Return *text* lower-cased with URLs, e-mail addresses and non-letters removed.

        Args:
            text: Raw message text (a ``str``).

        Returns:
            The cleaned text with runs of whitespace collapsed to one space
            and leading/trailing whitespace stripped.
        """
        # Lower-case first so later matching is case-insensitive.
        text = text.lower()
        # Remove URLs.
        text = self._URL_RE.sub('', text)
        # Remove e-mail addresses.
        text = self._EMAIL_RE.sub('', text)
        # Keep only ASCII letters and whitespace.
        text = self._NON_LETTER_RE.sub('', text)
        # Collapse repeated whitespace.
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def preprocess(self, text):
        """Clean *text*, drop stop words, stem the rest and re-join with spaces.

        Args:
            text: Raw message text (a ``str``).

        Returns:
            A single space-separated string of stemmed tokens.
        """
        words = self.clean_text(text).split()
        # Drop stop words, then reduce each remaining word to its stem.
        stems = [
            self.stemmer.stem(word)
            for word in words
            if word not in self.stop_words
        ]
        return ' '.join(stems)
机器学习模型训练
我们使用朴素贝叶斯算法和TF-IDF特征提取来构建分类器。
朴素贝叶斯算法在文本分类任务中表现优异,且训练速度快。
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


class SpamClassifier:
    """TF-IDF + multinomial naive Bayes spam classifier."""

    def __init__(self, max_features=5000):
        """Create an untrained classifier.

        Args:
            max_features: Vocabulary cap passed to ``TfidfVectorizer``.
                NOTE(review): the literal was lost in the garbled source;
                5000 is a placeholder -- confirm the intended value.
        """
        self.vectorizer = TfidfVectorizer(max_features=max_features)
        self.model = MultinomialNB()
        self.preprocessor = TextPreprocessor()

    def train(self, data_path):
        """Train on the CSV at *data_path* and return the hold-out accuracy.

        The CSV must provide a ``text`` column and a ``label`` column.
        A classification report is printed as a side effect.

        Args:
            data_path: Path of the training CSV file.

        Returns:
            Accuracy on the 20% test split.
        """
        # Load the data.
        df = pd.read_csv(data_path)

        # Pre-process the raw text.
        df['processed_text'] = df['text'].apply(self.preprocessor.preprocess)

        # Split into train and test sets (fixed seed for reproducibility).
        X_train, X_test, y_train, y_test = train_test_split(
            df['processed_text'], df['label'], test_size=0.2, random_state=42
        )

        # TF-IDF feature extraction: fit on train only, reuse for test.
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        X_test_tfidf = self.vectorizer.transform(X_test)

        # Train the model.
        self.model.fit(X_train_tfidf, y_train)

        # Evaluate the model.
        y_pred = self.model.predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'模型准确率: {accuracy:.4f}')
        print('\n分类报告:')
        print(classification_report(y_test, y_pred))
        return accuracy

    def predict(self, text):
        """Classify one message.

        Args:
            text: Raw message text.

        Returns:
            ``{'is_spam': bool, 'confidence': float}`` where confidence is the
            highest class probability reported by the model.
        """
        processed_text = self.preprocessor.preprocess(text)
        text_tfidf = self.vectorizer.transform([processed_text])
        prediction = self.model.predict(text_tfidf)[0]
        probability = self.model.predict_proba(text_tfidf)[0]
        return {
            # NOTE(review): bool() is only meaningful if labels are numeric
            # 0/1; string labels such as 'ham' would always be truthy --
            # verify against the dataset.
            'is_spam': bool(prediction),
            'confidence': float(max(probability)),
        }

    def save_model(self, path):
        """Pickle the vectorizer, model and preprocessor to *path*."""
        with open(path, 'wb') as f:
            pickle.dump(
                {
                    'vectorizer': self.vectorizer,
                    'model': self.model,
                    'preprocessor': self.preprocessor,
                },
                f,
            )

    def load_model(self, path):
        """Restore the vectorizer, model and preprocessor pickled at *path*.

        WARNING: unpickling executes arbitrary code; only load files that
        this application produced itself.
        """
        with open(path, 'rb') as f:
            data = pickle.load(f)
        self.vectorizer = data['vectorizer']
        self.model = data['model']
        self.preprocessor = data['preprocessor']
Flask后端API
后端提供RESTful API接口,处理邮件分类请求和历史记录查询。
import sqlite3
from contextlib import closing
from datetime import datetime

from flask import Flask, jsonify, request
from flask_cors import CORS

# NOTE(review): SpamClassifier is used below but not imported in this snippet;
# per the project layout it presumably lives in model.py -- confirm and import
# it when this code is placed in its own module.

DB_PATH = 'emails.db'

app = Flask(__name__)
CORS(app)

# Load the trained model once at start-up.
classifier = SpamClassifier()
classifier.load_model('models/spam_classifier.pkl')


def init_db():
    """Create the ``emails`` table if it does not exist yet."""
    # closing() guarantees the connection is released even if a statement
    # raises; a bare sqlite3 connection used as a context manager only
    # manages the transaction, it does not close.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS emails (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                subject TEXT,
                content TEXT,
                is_spam INTEGER,
                confidence REAL,
                timestamp TEXT
            )
        ''')
        conn.commit()


init_db()


@app.route('/api/classify', methods=['POST'])
def classify_email():
    """Classify the posted e-mail and store the result.

    Expects a JSON object with optional ``subject`` and ``content`` keys.
    Returns ``{'success': True, 'result': ...}`` on success, HTTP 400 for a
    body that is not a JSON object, and HTTP 500 with the error text for any
    other failure.
    """
    try:
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({
                'success': False,
                'error': 'Request body must be a JSON object',
            }), 400

        subject = data.get('subject', '')
        content = data.get('content', '')

        # Classify subject and body together.
        full_text = f'{subject} {content}'
        result = classifier.predict(full_text)

        # Persist the outcome.
        with closing(sqlite3.connect(DB_PATH)) as conn:
            conn.execute(
                '''
                INSERT INTO emails (subject, content, is_spam, confidence, timestamp)
                VALUES (?, ?, ?, ?, ?)
                ''',
                (
                    subject,
                    content,
                    int(result['is_spam']),
                    result['confidence'],
                    datetime.now().isoformat(),
                ),
            )
            conn.commit()

        return jsonify({'success': True, 'result': result})
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500


@app.route('/api/history', methods=['GET'])
def get_history():
    """Return the most recent classifications, newest first.

    Query parameter ``limit`` (int, default 50) caps the number of rows.
    """
    try:
        limit = request.args.get('limit', 50, type=int)

        with closing(sqlite3.connect(DB_PATH)) as conn:
            rows = conn.execute(
                '''
                SELECT id, subject, content, is_spam, confidence, timestamp
                FROM emails
                ORDER BY timestamp DESC
                LIMIT ?
                ''',
                (limit,),
            ).fetchall()

        history = [
            {
                'id': row[0],
                'subject': row[1],
                'content': row[2],
                'is_spam': bool(row[3]),
                'confidence': row[4],
                'timestamp': row[5],
            }
            for row in rows
        ]
        return jsonify({'success': True, 'history': history})
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500


@app.route('/api/stats', methods=['GET'])
def get_stats():
    """Return total, spam and ham counts plus the spam rate."""
    try:
        with closing(sqlite3.connect(DB_PATH)) as conn:
            total = conn.execute('SELECT COUNT(*) FROM emails').fetchone()[0]
            spam_count = conn.execute(
                'SELECT COUNT(*) FROM emails WHERE is_spam = 1'
            ).fetchone()[0]

        return jsonify({
            'success': True,
            'stats': {
                'total': total,
                'spam': spam_count,
                'ham': total - spam_count,
                # Avoid dividing by zero on an empty table.
                'spam_rate': spam_count / total if total > 0 else 0,
            },
        })
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500


if __name__ == '__main__':
    # debug=True enables the interactive debugger; development use only.
    app.run(debug=True, port=5000)
前端界面实现
前端提供简洁友好的用户界面,支持邮件分类和历史记录查看。
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>垃圾邮件过滤系统</title>
    <!-- NOTE(review): version digits were partly lost in the source; 5.1.3 is the best reconstruction. -->
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
    <style>
        body {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            padding: 20px;
        }
        .main-container {
            max-width: 900px;
            margin: 0 auto;
        }
        .card {
            border-radius: 15px;
            /* NOTE(review): alpha value was lost in the source; 0.3 is a placeholder. */
            box-shadow: 0 10px 30px rgba(0,0,0,0.3);
        }
        .result-spam {
            background-color: #dc3545;
            color: white;
        }
        .result-ham {
            background-color: #28a745;
            color: white;
        }
    </style>
</head>
<body>
    <div class="main-container">
        <h1 class="text-center text-white mb-4">垃圾邮件过滤系统</h1>

        <!-- Statistics -->
        <div class="row mb-4">
            <div class="col-md-4">
                <div class="card text-center">
                    <div class="card-body">
                        <h5>总邮件数</h5>
                        <h2 id="totalCount">0</h2>
                    </div>
                </div>
            </div>
            <div class="col-md-4">
                <div class="card text-center">
                    <div class="card-body">
                        <h5>垃圾邮件</h5>
                        <h2 id="spamCount" class="text-danger">0</h2>
                    </div>
                </div>
            </div>
            <div class="col-md-4">
                <div class="card text-center">
                    <div class="card-body">
                        <h5>正常邮件</h5>
                        <h2 id="hamCount" class="text-success">0</h2>
                    </div>
                </div>
            </div>
        </div>

        <!-- E-mail classification form -->
        <div class="card mb-4">
            <div class="card-body">
                <h3 class="card-title">检测邮件</h3>
                <form id="emailForm">
                    <div class="mb-3">
                        <label class="form-label">邮件主题</label>
                        <input type="text" class="form-control" id="subject" required>
                    </div>
                    <div class="mb-3">
                        <label class="form-label">邮件内容</label>
                        <textarea class="form-control" id="content" rows="5" required></textarea>
                    </div>
                    <button type="submit" class="btn btn-primary w-100">分析邮件</button>
                </form>

                <!-- Result display -->
                <div id="result" class="mt-3" style="display:none;">
                    <div class="alert" id="resultAlert">
                        <h4 id="resultText"></h4>
                        <p id="confidenceText"></p>
                    </div>
                </div>
            </div>
        </div>

        <!-- History -->
        <div class="card">
            <div class="card-body">
                <h3 class="card-title">检测历史</h3>
                <div id="history" class="table-responsive">
                    <table class="table">
                        <thead>
                            <tr>
                                <th>时间</th>
                                <th>主题</th>
                                <th>结果</th>
                                <th>置信度</th>
                            </tr>
                        </thead>
                        <tbody id="historyBody"></tbody>
                    </table>
                </div>
            </div>
        </div>
    </div>

    <script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
    <script>
        const API_URL = 'http://localhost:5000/api';

        // Format a 0..1 confidence as a percentage string.
        // NOTE(review): the multiplier and precision were lost in the source;
        // 100 and 2 decimals are reconstructed.
        function formatConfidence(value) {
            return `${(value * 100).toFixed(2)}%`;
        }

        // Load the summary statistics.
        async function loadStats() {
            const response = await axios.get(`${API_URL}/stats`);
            const stats = response.data.stats;
            document.getElementById('totalCount').textContent = stats.total;
            document.getElementById('spamCount').textContent = stats.spam;
            document.getElementById('hamCount').textContent = stats.ham;
        }

        // Load the classification history.
        // NOTE(review): the limit value was lost in the source; 10 is a placeholder.
        async function loadHistory() {
            const response = await axios.get(`${API_URL}/history?limit=10`);
            const history = response.data.history;
            const tbody = document.getElementById('historyBody');
            tbody.innerHTML = '';
            history.forEach(item => {
                const row = tbody.insertRow();
                // Cells are filled via textContent, never innerHTML: the
                // subject is user-supplied and must not be parsed as markup.
                row.insertCell().textContent =
                    new Date(item.timestamp).toLocaleString('zh-CN');
                row.insertCell().textContent = item.subject;

                const badge = document.createElement('span');
                badge.className = `badge ${item.is_spam ? 'bg-danger' : 'bg-success'}`;
                badge.textContent = item.is_spam ? '垃圾邮件' : '正常邮件';
                row.insertCell().appendChild(badge);

                row.insertCell().textContent = formatConfidence(item.confidence);
            });
        }

        // Refresh both panels; failures are logged instead of becoming
        // unhandled promise rejections.
        function refresh() {
            loadStats().catch(console.error);
            loadHistory().catch(console.error);
        }

        // Submit the form.
        document.getElementById('emailForm').addEventListener('submit', async (e) => {
            e.preventDefault();

            const subject = document.getElementById('subject').value;
            const content = document.getElementById('content').value;

            try {
                const response = await axios.post(`${API_URL}/classify`, {
                    subject: subject,
                    content: content
                });
                const result = response.data.result;

                const resultDiv = document.getElementById('result');
                const resultAlert = document.getElementById('resultAlert');
                const resultText = document.getElementById('resultText');
                const confidenceText = document.getElementById('confidenceText');

                resultDiv.style.display = 'block';
                if (result.is_spam) {
                    resultAlert.className = 'alert result-spam';
                    resultText.textContent = '⚠️ 这是一封垃圾邮件!';
                } else {
                    resultAlert.className = 'alert result-ham';
                    resultText.textContent = '✅ 这是一封正常邮件';
                }
                confidenceText.textContent = `置信度: ${formatConfidence(result.confidence)}`;

                // Refresh statistics and history.
                refresh();
            } catch (error) {
                alert('分类失败: ' + error.message);
            }
        });

        // Initialise on page load.
        refresh();
    </script>
</body>
</html>
模型优化技巧
特征工程优化
可以添加更多特征来提升模型性能,包括邮件长度、特殊字符比例、大写字母比例、数字比例等。
def extract_features(text):
    """Return simple surface statistics of *text* as a feature dict.

    Args:
        text: The message text.

    Returns:
        A dict with ``length`` (character count) and three ratios relative to
        that length: ``capital_ratio`` (upper-case characters),
        ``digit_ratio`` (digits) and ``special_char_ratio`` (every
        non-alphanumeric character, whitespace included). For empty text all
        ratios are 0.0 instead of raising ``ZeroDivisionError``.
    """
    length = len(text)
    if length == 0:
        return {
            'length': 0,
            'capital_ratio': 0.0,
            'digit_ratio': 0.0,
            'special_char_ratio': 0.0,
        }

    return {
        'length': length,
        'capital_ratio': sum(1 for c in text if c.isupper()) / length,
        'digit_ratio': sum(1 for c in text if c.isdigit()) / length,
        'special_char_ratio': sum(1 for c in text if not c.isalnum()) / length,
    }
集成学习
可以使用多个分类器进行投票,提高预测准确性。
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Soft voting averages the predicted class probabilities of the members,
# so every estimator must expose predict_proba (hence probability=True on SVC).
ensemble = VotingClassifier(
    estimators=[
        ('nb', MultinomialNB()),
        ('lr', LogisticRegression()),
        ('svc', SVC(probability=True)),
    ],
    voting='soft',
)
超参数调优
使用网格搜索找到最佳参数组合。
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0],
    'fit_prior': [True, False]
}

grid_search = GridSearchCV(
    MultinomialNB(),
    param_grid,
    cv=5,
    scoring='accuracy'
)
部署方案
本地部署
直接运行Flask应用即可:
python backend/app.py
Docker部署
创建Dockerfile实现容器化部署:
FROM python:3.9-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
EXPOSE 5000
CMD ["python", "app.py"]
云平台部署
可以部署到Heroku、AWS、阿里云等平台,需要注意配置环境变量和数据库连接。
项目扩展方向
多语言支持: 扩展到中文垃圾邮件检测
实时监控: 添加邮件监控功能,自动过滤收件箱
深度学习: 使用LSTM或BERT等深度学习模型提升性能
用户反馈: 允许用户标注错误分类,持续优化模型
可视化分析: 添加词云、特征重要性等可视化功能
总结
本项目完整展示了从数据处理、模型训练到Web应用开发的全流程。
通过这个项目,你可以掌握机器学习在实际场景中的应用方法,以及前后端开发的基本技能。
项目代码简洁清晰,适合作为学习Python全栈开发的入门项目。
在实际应用中,还需要考虑模型的持续更新、系统的可扩展性和安全性等问题。
随着垃圾邮件技术的不断演进,模型也需要定期重新训练以保持良好的检测效果。
项目代码下载链接