Files
guba2vec/train_sentiment_model.py
T
zzy5111398 0098977172 完成股吧数据分析项目:
1. 修复词云断句问题 - 添加英文单词过滤
2. 创建 Word2Vec + CNN 情绪感知模型
3. 创建情绪时间序列分析脚本(基于大连理工大学情感词典)
4. 添加停用词文件(1427个中英文停用词)
5. 更新 analyze.py 保存时间字段 post_publish_time
6. 更新 requirements.txt 添加必要依赖
2026-05-28 15:30:16 +08:00

229 lines
7.4 KiB
Python

import os
import json
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.utils import to_categorical
import jieba
def load_stopwords(filepath='stopwords.txt'):
    """Load the stopword set used to filter tokens.

    When ``filepath`` exists it is read as one stopword per line; surrounding
    whitespace is stripped and blank lines are skipped. When it does not
    exist, a warning is printed and a small built-in default set is returned.

    Args:
        filepath: Path of the stopword file (one word per line).

    Returns:
        A ``set`` of stopword strings.
    """
    if os.path.exists(filepath):
        # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading
        # BOM. With plain 'utf-8' the BOM stays glued to the first word
        # (str.strip() does not remove U+FEFF), so that entry never matches.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            stopwords = {word for word in (line.strip() for line in f) if word}
        print(f"已加载 {len(stopwords)} 个停用词")
        return stopwords

    print(f"警告:停用词文件 {filepath} 不存在,使用默认停用词")
    # NOTE(review): the literal as received repeated '' dozens of times, which
    # suggests single-character stopwords were lost somewhere upstream --
    # confirm against the intended list. One '' keeps the resulting set
    # identical to what that literal evaluates to.
    return {
        '',
        '一个', '没有', '自己', '什么', '怎么', '为什么', '哪里', '多少',
        '今天', '明天', '昨天',
        '已经', '还是', '但是', '可是', '不过', '只是', '只有', '就是', '或者',
        '股吧', '东方财富', '帖子', '发表', '回复', '点击', '查看', '更多',
        '原文', '转发', '分享', '收藏', '评论', '点赞',
        'http', 'https', 'com', 'cn', 'www', 'net', 'org',
    }
# Load the stopword set once at import time; tokenize() filters against it.
STOPWORDS = load_stopwords()
def clean_text(text):
    """Strip noise from a raw post and return whitespace-normalised text.

    Falsy or NA input yields an empty string. Otherwise the value is
    converted to ``str`` and passed through an ordered list of regex
    substitutions: URLs, ``<...>`` tags, ``[...]`` segments, and
    letter+digit / digit+letter tokens are removed; every remaining
    character that is neither a word character nor whitespace becomes a
    space; runs of whitespace collapse to one space. The result is stripped.
    """
    if not text or pd.isna(text):
        return ""

    # Applied strictly in this order: later rules see the output of earlier ones.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),
        (r'<.*?>', ''),
        (r'\[.*?\]', ''),
        (r'\b[a-zA-Z]+\d+\b', ''),
        (r'\b\d+[a-zA-Z]+\b', ''),
        (r'[^\w\s]', ' '),
        (r'\s+', ' '),
    )
    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def tokenize(text):
    """Segment ``text`` with jieba and return the tokens worth keeping.

    A token is dropped when it is at most one character long, is present in
    ``STOPWORDS``, consists only of ASCII letters, or consists only of ASCII
    letters and whitespace. The surviving tokens keep their original order.
    """
    return [
        token
        for token in jieba.lcut(text)
        if len(token) > 1
        and token not in STOPWORDS
        and re.match(r'^[a-zA-Z]+$', token) is None
        and re.match(r'^[a-zA-Z\s]+$', token) is None
    ]
def load_and_preprocess_data(filepath='output/all_posts.csv'):
    """Load the posts CSV and return the rows usable for training.

    The CSV is read with ``utf-8-sig`` and must provide ``clean_text`` and
    ``label`` columns. Rows missing either value, rows whose text is blank,
    and rows that tokenize to nothing are dropped. Row counts and the label
    distribution are printed along the way.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A DataFrame with an added ``tokens`` column holding each row's
        token list as produced by ``tokenize``.
    """
    df = pd.read_csv(filepath, encoding='utf-8-sig')
    print(f"原始数据: {len(df)}")

    # Work on an explicit copy: the column assignments below would otherwise
    # target a filtered slice of the frame returned by read_csv, which pandas
    # reports as chained assignment.
    df = df.dropna(subset=['clean_text', 'label']).copy()
    # .str and tokenize() both need real strings; NA rows are already gone,
    # so this cannot introduce literal "nan" text.
    df['clean_text'] = df['clean_text'].astype(str)
    df = df[df['clean_text'].str.strip() != ''].copy()
    print(f"有效数据: {len(df)}")
    print("标签分布:")
    print(df['label'].value_counts())

    df['tokens'] = df['clean_text'].apply(tokenize)
    df = df[df['tokens'].apply(len) > 0]
    print(f"分词后有效数据: {len(df)}")
    return df
def train_word2vec_model(sentences, vector_size=100, window=5, min_count=5):
    """Fit a gensim Word2Vec model on tokenised sentences.

    Args:
        sentences: Token lists, passed to ``Word2Vec`` as ``sentences``.
        vector_size: Forwarded to ``Word2Vec`` as ``vector_size``.
        window: Forwarded to ``Word2Vec`` as ``window``.
        min_count: Forwarded to ``Word2Vec`` as ``min_count``.

    Returns:
        The trained ``Word2Vec`` model. Training always uses 4 workers and
        10 epochs; the size of ``model.wv`` is printed before returning.
    """
    print("\n训练 Word2Vec 模型...")
    hyperparams = {
        'vector_size': vector_size,
        'window': window,
        'min_count': min_count,
        'workers': 4,
        'epochs': 10,
    }
    w2v = Word2Vec(sentences=sentences, **hyperparams)
    print(f"Word2Vec 词汇表大小: {len(w2v.wv)}")
    return w2v
def build_cnn_model(vocab_size, embedding_dim, max_seq_len, embedding_matrix, num_classes=3):
    """Assemble and compile the 1-D CNN sentiment classifier.

    Layer stack, in order: a non-trainable Embedding initialised from
    ``embedding_matrix``, Conv1D(128 filters, width 5, ReLU),
    GlobalMaxPooling1D, Dense(64, ReLU), Dropout(0.5), and a softmax Dense
    layer with ``num_classes`` units. The model is compiled with the Adam
    optimizer, categorical cross-entropy loss and the accuracy metric.

    Args:
        vocab_size: ``input_dim`` of the Embedding layer.
        embedding_dim: ``output_dim`` of the Embedding layer.
        max_seq_len: ``input_length`` of the Embedding layer.
        embedding_matrix: Initial Embedding weights.
        num_classes: Number of units in the output layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    frozen_embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_seq_len,
        weights=[embedding_matrix],
        trainable=False
    )
    network = Sequential([
        frozen_embedding,
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return network
def main():
    """Train, evaluate and save the Word2Vec + CNN sentiment pipeline.

    Steps: load and tokenise the posts, train Word2Vec on the tokens, build
    a Keras vocabulary and padded index sequences, fill an embedding matrix
    from the Word2Vec vectors, train the CNN on an 80/20 stratified split,
    print a classification report, and save the Word2Vec model, the CNN and
    the tokenizer under ``models/``.

    Returns early (after printing a message) when fewer than 10 usable rows
    are available.

    Raises:
        ValueError: If the ``label`` column holds a value other than
            -1, 0 or 1.
    """
    banner = "=" * 60
    print(banner)
    print("Word2Vec + CNN 情绪感知模型训练")
    print(banner)

    # Load data
    print("\n[1/6] 加载数据...")
    df = load_and_preprocess_data()
    if len(df) < 10:
        print("数据不足,无法训练")
        return

    # Validate labels up front: an unmapped label becomes NaN and would
    # otherwise only surface later as an obscure failure in to_categorical.
    label_mapping = {-1: 0, 0: 1, 1: 2}
    mapped_labels = df['label'].map(label_mapping)
    unmapped = mapped_labels.isna()
    if unmapped.any():
        unknown = sorted(set(df.loc[unmapped, 'label'].tolist()), key=str)
        raise ValueError(f"发现无法识别的标签: {unknown},仅支持 -1/0/1")

    # Word2Vec training corpus
    sentences = df['tokens'].tolist()

    # Train Word2Vec
    print("\n[2/6] 训练 Word2Vec 词向量...")
    w2v_model = train_word2vec_model(sentences)

    # Build the vocabulary
    print("\n[3/6] 构建词汇表...")
    # lower=False: the Keras Tokenizer lowercases list-of-token input by
    # default, while Word2Vec was trained on the tokens as-is. Without this,
    # any token containing an uppercase letter misses the Word2Vec lookup
    # below and silently keeps an all-zero embedding row.
    tokenizer = Tokenizer(lower=False)
    tokenizer.fit_on_texts(sentences)
    vocab_size = len(tokenizer.word_index) + 1
    print(f"词汇表大小: {vocab_size}")

    # Convert token lists to padded index sequences. The length is floored
    # at the Conv1D kernel width used by build_cnn_model (5) so the
    # convolution never sees an input shorter than its kernel.
    conv_kernel_size = 5
    max_seq_len = max(max(len(s) for s in sentences), conv_kernel_size)
    print(f"最大序列长度: {max_seq_len}")
    sequences = tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(sequences, maxlen=max_seq_len)

    # One-hot encode the labels
    y = to_categorical(mapped_labels.astype(int).values, num_classes=3)

    # Build the embedding matrix; row 0 (padding) and words unknown to
    # Word2Vec stay all-zero.
    print("\n[4/6] 创建嵌入矩阵...")
    embedding_dim = w2v_model.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"训练集: {len(X_train)}")
    print(f"测试集: {len(X_test)}")

    # Build and train the CNN
    print("\n[5/6] 训练 CNN 模型...")
    model = build_cnn_model(vocab_size, embedding_dim, max_seq_len, embedding_matrix)
    # summary() prints by itself and returns None; wrapping it in print()
    # would emit a stray "None" line.
    model.summary()
    model.fit(
        X_train, y_train,
        batch_size=32,
        epochs=10,
        validation_split=0.1,
        verbose=1
    )

    # Evaluate
    print("\n[6/6] 评估模型...")
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)
    print("\n分类报告:")
    # Explicit labels keep the three target names aligned even when a class
    # is absent from the test split or from the predictions.
    print(classification_report(
        y_true_classes,
        y_pred_classes,
        labels=[0, 1, 2],
        target_names=['负面', '中性', '正面'],
    ))
    print(f"准确率: {accuracy_score(y_true_classes, y_pred_classes):.4f}")

    # Save artifacts
    print("\n保存模型...")
    os.makedirs('models', exist_ok=True)

    # Word2Vec model
    w2v_model.save('models/word2vec.model')
    print("Word2Vec 模型已保存到: models/word2vec.model")

    # CNN model
    model.save('models/cnn_sentiment.h5')
    print("CNN 模型已保存到: models/cnn_sentiment.h5")

    # Tokenizer
    with open('models/tokenizer.json', 'w', encoding='utf-8') as f:
        f.write(tokenizer.to_json())
    print("Tokenizer 已保存到: models/tokenizer.json")

    print("\n" + banner)
    print("训练完成!")
    print(banner)
# Run the training pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()