0098977172
1. 修复词云断句问题 - 添加英文单词过滤 2. 创建 Word2Vec + CNN 情绪感知模型 3. 创建情绪时间序列分析脚本(基于大连理工大学情感词典) 4. 添加停用词文件(1427个中英文停用词) 5. 更新 analyze.py 保存时间字段 post_publish_time 6. 更新 requirements.txt 添加必要依赖
229 lines
7.4 KiB
Python
229 lines
7.4 KiB
Python
import os
|
|
import json
|
|
import re
|
|
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import classification_report, accuracy_score
|
|
from gensim.models import Word2Vec
|
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
from tensorflow.keras.models import Sequential
|
|
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
|
|
from tensorflow.keras.utils import to_categorical
|
|
import jieba
|
|
|
|
def load_stopwords(filepath='stopwords.txt'):
    """Load the stopword set from a text file containing one word per line.

    Surrounding whitespace is stripped and blank lines are skipped. The file
    is decoded as ``utf-8-sig`` so that a leading byte-order mark (written by
    some editors) is not glued onto the first word, which would otherwise
    keep that word from ever matching a token.

    Args:
        filepath: Path of the stopword file.

    Returns:
        set: The words read from ``filepath``. If the file does not exist, a
        built-in fallback set of common Chinese function words, forum/site
        boilerplate terms and URL fragments is returned instead.
    """
    try:
        # EAFP: open directly rather than exists()+open(), which can race.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            stopwords = {word for word in (line.strip() for line in f) if word}
    except (FileNotFoundError, NotADirectoryError):
        print(f"警告:停用词文件 {filepath} 不存在,使用默认停用词")
        return {
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要',
            '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '有', '吗', '吧', '呢', '啊', '呀', '什么', '怎么',
            '为什么', '哪里', '谁', '多少', '几', '个', '只', '条', '把', '本', '篇', '次', '天', '今天', '明天', '昨天', '又',
            '再', '还', '已经', '还是', '但是', '可是', '不过', '只是', '只有', '就是', '或者', '跟', '和', '与', '及', '或',
            '股吧', '东方财富', '帖子', '发表', '回复', '点击', '查看', '更多', '原文', '转发', '分享', '收藏', '评论', '点赞',
            'http', 'https', 'com', 'cn', 'www', 'net', 'org',
        }

    print(f"已加载 {len(stopwords)} 个停用词")
    return stopwords
|
|
|
|
# Module-level stopword set consulted by tokenize(); built once at import time.
STOPWORDS = load_stopwords()
|
|
|
|
def clean_text(text):
    """Normalise a raw post into plain text.

    Falsy or missing (NaN) input yields an empty string. Otherwise the value
    is converted to ``str`` and passed through a fixed sequence of regex
    substitutions: URLs, ``<...>`` tags and ``[...]`` spans are removed, as
    are letter+digit / digit+letter codes; every remaining character that is
    neither a word character nor whitespace becomes a space; finally runs of
    whitespace are collapsed and the result is stripped.

    Args:
        text: Raw text value (any type accepted by ``str``).

    Returns:
        str: The cleaned text, possibly empty.
    """
    if not text or pd.isna(text):
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),   # URLs
        (r'<.*?>', ''),                   # HTML-like tags
        (r'\[.*?\]', ''),                 # bracketed spans
        (r'\b[a-zA-Z]+\d+\b', ''),        # letters followed by digits
        (r'\b\d+[a-zA-Z]+\b', ''),        # digits followed by letters
        (r'[^\w\s]', ' '),                # punctuation -> space
        (r'\s+', ' '),                    # collapse whitespace
    )

    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
|
|
|
|
def tokenize(text):
    """Segment Chinese text with jieba and discard uninformative tokens.

    A token is dropped when it is at most one character long, is listed in
    the module-level ``STOPWORDS`` set, or consists solely of ASCII letters
    (optionally mixed with whitespace).

    Args:
        text: The string to segment.

    Returns:
        list: The remaining tokens, in their original order.
    """
    def _is_informative(token):
        # Too short, or an explicit stopword.
        if len(token) <= 1 or token in STOPWORDS:
            return False
        # Pure English words.
        if re.match(r'^[a-zA-Z]+$', token):
            return False
        # English letters mixed only with whitespace.
        if re.match(r'^[a-zA-Z\s]+$', token):
            return False
        return True

    return [token for token in jieba.lcut(text) if _is_informative(token)]
|
|
|
|
def load_and_preprocess_data(filepath='output/all_posts.csv'):
    """Load the labelled posts CSV and attach a ``tokens`` column.

    Rows with a missing ``clean_text`` or ``label`` are dropped, as are rows
    whose text is blank or yields no tokens after segmentation. Progress and
    the label distribution are printed along the way.

    Args:
        filepath: Path of the CSV file (read as ``utf-8-sig``). It must
            contain the columns ``clean_text`` and ``label``.

    Returns:
        pandas.DataFrame: The surviving rows with an added ``tokens`` column
        holding the token list produced by ``tokenize``.
    """
    df = pd.read_csv(filepath, encoding='utf-8-sig')
    print(f"原始数据: {len(df)} 条")

    df = df.dropna(subset=['clean_text', 'label'])
    # Cast to str first: if every value looks numeric pandas infers a numeric
    # dtype, and the .str accessor below would raise.
    text_column = df['clean_text'].astype(str)
    # .copy() makes the filtered frame independent so the column assignments
    # below do not trigger SettingWithCopyWarning.
    df = df[text_column.str.strip() != ''].copy()
    df['clean_text'] = df['clean_text'].astype(str)

    print(f"有效数据: {len(df)} 条")
    print("标签分布:")
    print(df['label'].value_counts())

    df['tokens'] = df['clean_text'].apply(tokenize)
    # Keep only rows that still have at least one token after filtering.
    df = df[df['tokens'].apply(len) > 0].copy()

    print(f"分词后有效数据: {len(df)} 条")

    return df
|
|
|
|
def train_word2vec_model(sentences, vector_size=100, window=5, min_count=5):
    """Fit a gensim Word2Vec model on tokenised sentences.

    Training always uses 4 worker threads and 10 epochs; the remaining
    hyper-parameters come from the arguments.

    Args:
        sentences: Iterable of token lists.
        vector_size: Dimensionality of the word vectors.
        window: Context window size.
        min_count: Words occurring fewer times than this are ignored.

    Returns:
        The trained ``Word2Vec`` model.
    """
    print(f"\n训练 Word2Vec 模型...")

    w2v_params = {
        'sentences': sentences,
        'vector_size': vector_size,
        'window': window,
        'min_count': min_count,
        'workers': 4,
        'epochs': 10,
    }
    w2v = Word2Vec(**w2v_params)

    learned_vocab = len(w2v.wv)
    print(f"Word2Vec 词汇表大小: {learned_vocab}")
    return w2v
|
|
|
|
def build_cnn_model(vocab_size, embedding_dim, max_seq_len, embedding_matrix, num_classes=3):
    """Assemble and compile the 1-D CNN text classifier.

    Layer stack: a frozen Embedding initialised from ``embedding_matrix``,
    Conv1D (128 filters, kernel size 5, ReLU), global max pooling,
    Dense(64, ReLU), Dropout(0.5) and a softmax output layer.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Width of each embedding vector.
        max_seq_len: Length of the padded input sequences.
        embedding_matrix: Initial weights for the embedding layer.
        num_classes: Number of output classes.

    Returns:
        The model, compiled with Adam, categorical cross-entropy loss and
        accuracy as the metric.
    """
    # Embedding weights are supplied up front and never updated in training.
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_seq_len,
        weights=[embedding_matrix],
        trainable=False,
    )

    cnn = Sequential([
        embedding_layer,
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

    cnn.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return cnn
|
|
|
|
def main():
    """Train, evaluate and persist the Word2Vec + CNN sentiment classifier.

    Steps:
        1. Load and tokenise the posts; discard rows whose label is not one
           of -1 / 0 / 1.
        2. Train Word2Vec vectors on the token lists.
        3. Build the Keras tokenizer vocabulary and padded index sequences.
        4. Build the embedding matrix from the Word2Vec vectors.
        5. Train the CNN on an 80/20 stratified split.
        6. Print a classification report and accuracy for the test split.

    Artifacts are written to ``models/``: ``word2vec.model``,
    ``cnn_sentiment.h5`` and ``tokenizer.json``. Returns ``None``; the
    function exits early when fewer than 10 usable rows remain.
    """
    banner = "=" * 60
    print(banner)
    print("Word2Vec + CNN 情绪感知模型训练")
    print(banner)

    # Load data
    print("\n[1/6] 加载数据...")
    df = load_and_preprocess_data()

    # Map raw labels (-1 negative, 0 neutral, 1 positive) to class ids before
    # any features are built, so rows with unusable labels are removed from
    # the texts and the targets together instead of crashing to_categorical.
    label_mapping = {-1: 0, 0: 1, 1: 2}
    num_classes = len(label_mapping)
    class_ids = pd.to_numeric(df['label'], errors='coerce').map(label_mapping)
    unknown = class_ids.isna()
    if unknown.any():
        print(f"警告:丢弃 {int(unknown.sum())} 条标签不在 {sorted(label_mapping)} 中的数据")
        df = df[~unknown]
        class_ids = class_ids[~unknown]

    if len(df) < 10:
        print("数据不足,无法训练")
        return

    # Word2Vec training corpus: one token list per post.
    sentences = df['tokens'].tolist()

    # Train Word2Vec
    print("\n[2/6] 训练 Word2Vec 词向量...")
    w2v_model = train_word2vec_model(sentences)

    # Build the vocabulary
    print("\n[3/6] 构建词汇表...")
    # lower=False: the default lowercases tokens, which would make mixed-case
    # tokens miss their Word2Vec entry and silently get all-zero embeddings.
    tokenizer = Tokenizer(lower=False)
    tokenizer.fit_on_texts(sentences)
    vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is padding
    print(f"词汇表大小: {vocab_size}")

    # Convert token lists to padded index sequences
    max_seq_len = max(len(s) for s in sentences)
    print(f"最大序列长度: {max_seq_len}")
    sequences = tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(sequences, maxlen=max_seq_len)

    # One-hot targets
    y = to_categorical(class_ids.astype(int).values, num_classes=num_classes)

    # Build the embedding matrix
    print("\n[4/6] 创建嵌入矩阵...")
    embedding_dim = w2v_model.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    # Words unknown to Word2Vec keep their all-zero row.
    for word, i in tokenizer.word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]

    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"训练集: {len(X_train)} 条")
    print(f"测试集: {len(X_test)} 条")

    # Build and train the CNN
    print("\n[5/6] 训练 CNN 模型...")
    model = build_cnn_model(
        vocab_size, embedding_dim, max_seq_len, embedding_matrix,
        num_classes=num_classes,
    )
    # summary() prints by itself and returns None; wrapping it in print()
    # would emit a stray "None" line.
    model.summary()

    model.fit(
        X_train, y_train,
        batch_size=32,
        epochs=10,
        validation_split=0.1,
        verbose=1,
    )

    # Evaluate
    print("\n[6/6] 评估模型...")
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    print("\n分类报告:")
    # Explicit labels keep target_names aligned even if a class happens to
    # be absent from the test split.
    print(classification_report(
        y_true_classes,
        y_pred_classes,
        labels=list(range(num_classes)),
        target_names=['负面', '中性', '正面'],
    ))
    print(f"准确率: {accuracy_score(y_true_classes, y_pred_classes):.4f}")

    # Persist artifacts
    print("\n保存模型...")
    os.makedirs('models', exist_ok=True)

    # Word2Vec model
    w2v_model.save('models/word2vec.model')
    print("Word2Vec 模型已保存到: models/word2vec.model")

    # CNN model
    model.save('models/cnn_sentiment.h5')
    print("CNN 模型已保存到: models/cnn_sentiment.h5")

    # Tokenizer
    with open('models/tokenizer.json', 'w', encoding='utf-8') as f:
        f.write(tokenizer.to_json())
    print("Tokenizer 已保存到: models/tokenizer.json")

    print("\n" + banner)
    print("训练完成!")
    print(banner)
|
|
|
|
# Run the training pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()