"""Train a Word2Vec + CNN sentiment classifier on tokenised Chinese posts.

Pipeline: load a CSV of posts, tokenise with jieba, train Word2Vec
embeddings, build a frozen-embedding 1D-CNN classifier, evaluate it on a
held-out split, and save all artefacts under ``models/``.
"""
import os
import json
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.utils import to_categorical
import jieba

# Width of the Conv1D window; padded sequences must be at least this long.
CONV_KERNEL_SIZE = 5

# Raw sentiment label -> class index used by the network.
LABEL_MAPPING = {-1: 0, 0: 1, 1: 2}

# Display names, ordered by class index.
LABEL_NAMES = ['负面', '中性', '正面']

# Tokens made up solely of ASCII letters and/or whitespace are discarded.
_LATIN_TOKEN_RE = re.compile(r'^[a-zA-Z\s]+$')

# Fallback stop words used when no stop-word file is available.
DEFAULT_STOPWORDS = frozenset({
    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
    '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
    '你', '会', '着', '没有', '看', '好', '自己', '这', '那',
    '吗', '吧', '呢', '啊', '呀', '什么', '怎么', '为什么', '哪里',
    '谁', '多少', '几', '个', '只', '条', '把', '本', '篇', '次',
    '天', '今天', '明天', '昨天', '又', '再', '还', '已经', '还是',
    '但是', '可是', '不过', '只是', '只有', '就是', '或者', '跟',
    '与', '及', '或', '股吧', '东方财富', '帖子', '发表', '回复',
    '点击', '查看', '更多', '原文', '转发', '分享', '收藏', '评论',
    '点赞', 'http', 'https', 'com', 'cn', 'www', 'net', 'org',
})


def load_stopwords(filepath='stopwords.txt'):
    """Load stop words from a UTF-8 text file, one word per line.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        A set of stop words. If ``filepath`` does not exist, a warning is
        printed and a copy of ``DEFAULT_STOPWORDS`` is returned instead.
    """
    if not os.path.exists(filepath):
        print(f"警告:停用词文件 {filepath} 不存在,使用默认停用词")
        return set(DEFAULT_STOPWORDS)

    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = {line.strip() for line in f}
    stopwords.discard('')  # blank lines are not stop words
    print(f"已加载 {len(stopwords)} 个停用词")
    return stopwords


# Loaded once at import time; tokenize() reads this set.
STOPWORDS = load_stopwords()


def clean_text(text):
    """Strip URLs, markup, bracketed tags, alphanumeric codes and punctuation.

    Args:
        text: Raw text; non-string values are converted with ``str``.

    Returns:
        The cleaned text with whitespace collapsed, or ``""`` when ``text``
        is missing (None / NaN / NA) or otherwise falsy.
    """
    # The missing-value test has to come before any truthiness test:
    # bool(pd.NA) raises TypeError.
    is_missing = pd.api.types.is_scalar(text) and pd.isna(text)
    if is_missing or not text:
        return ""

    text = str(text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'<.*?>', '', text)                  # <...> tags
    text = re.sub(r'\[.*?\]', '', text)                # [...] spans
    text = re.sub(r'\b[a-zA-Z]+\d+\b', '', text)       # letters followed by digits
    text = re.sub(r'\b\d+[a-zA-Z]+\b', '', text)       # digits followed by letters
    text = re.sub(r'[^\w\s]', ' ', text)               # punctuation -> space
    return re.sub(r'\s+', ' ', text).strip()


def tokenize(text):
    """Segment ``text`` with jieba and filter out uninformative tokens.

    A token is dropped when it is in ``STOPWORDS``, is at most one character
    long, or consists only of ASCII letters and/or whitespace.

    Args:
        text: The string to segment.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    return [
        w for w in jieba.lcut(text)
        if len(w) > 1 and w not in STOPWORDS and not _LATIN_TOKEN_RE.match(w)
    ]


def load_and_preprocess_data(filepath='output/all_posts.csv'):
    """Read the posts CSV and attach a ``tokens`` column.

    Rows with a missing ``clean_text`` or ``label``, a blank ``clean_text``,
    or no tokens left after filtering are removed.

    Args:
        filepath: CSV path; the file must contain ``clean_text`` and
            ``label`` columns.

    Returns:
        The filtered DataFrame with an added ``tokens`` column (list of str).
    """
    df = pd.read_csv(filepath, encoding='utf-8-sig')
    print(f"原始数据: {len(df)} 条")

    df = df.dropna(subset=['clean_text', 'label'])
    # Cast to str so non-string cells (e.g. purely numeric posts) neither
    # slip through the blank filter as NaN nor break jieba afterwards.
    texts = df['clean_text'].astype(str)
    # .copy() so the column assignment below targets an independent frame
    # instead of a slice of the frame returned by read_csv.
    df = df[texts.str.strip() != ''].copy()
    print(f"有效数据: {len(df)} 条")
    print("标签分布:")
    print(df['label'].value_counts())

    df['tokens'] = df['clean_text'].astype(str).apply(tokenize)
    df = df[df['tokens'].apply(len) > 0]
    print(f"分词后有效数据: {len(df)} 条")
    return df


def train_word2vec_model(sentences, vector_size=100, window=5, min_count=5):
    """Train a Word2Vec model on tokenised sentences.

    Args:
        sentences: Iterable of token lists.
        vector_size: Dimensionality of the word vectors.
        window: Context window size passed to Word2Vec.
        min_count: Minimum frequency passed to Word2Vec.

    Returns:
        The trained ``gensim.models.Word2Vec`` instance.
    """
    print("\n训练 Word2Vec 模型...")
    model = Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=4,
        epochs=10,
    )
    print(f"Word2Vec 词汇表大小: {len(model.wv)}")
    return model


def build_cnn_model(vocab_size, embedding_dim, max_seq_len, embedding_matrix, num_classes=3):
    """Build and compile the CNN classifier.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` ->
    Conv1D(128 filters, ``CONV_KERNEL_SIZE``) -> global max pooling ->
    Dense(64) -> Dropout(0.5) -> softmax over ``num_classes``.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_seq_len: Length of the padded input sequences.
        embedding_matrix: Array of shape ``(vocab_size, embedding_dim)``
            used as the embedding weights.
        num_classes: Number of output classes.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimiser, categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential([
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=max_seq_len,
            weights=[embedding_matrix],
            trainable=False,
        ),
        Conv1D(128, CONV_KERNEL_SIZE, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model


def main():
    """Run the full pipeline: load, embed, train, evaluate and save."""
    print("=" * 60)
    print("Word2Vec + CNN 情绪感知模型训练")
    print("=" * 60)

    # Load data
    print("\n[1/6] 加载数据...")
    df = load_and_preprocess_data()

    # Map raw labels to class indices up front and drop rows whose label is
    # not one of -1/0/1; an unmapped label would become NaN and crash
    # to_categorical further down.
    label_ids = pd.to_numeric(df['label'], errors='coerce').map(LABEL_MAPPING)
    known = label_ids.notna()
    if not known.all():
        print(f"警告:丢弃 {int((~known).sum())} 条标签不在 -1/0/1 范围内的数据")
        df = df[known]
        label_ids = label_ids[known]

    if len(df) < 10:
        print("数据不足,无法训练")
        return

    sentences = df['tokens'].tolist()

    # Train Word2Vec
    print("\n[2/6] 训练 Word2Vec 词向量...")
    w2v_model = train_word2vec_model(sentences)

    # Build the vocabulary
    print("\n[3/6] 构建词汇表...")
    # lower=False: the Tokenizer default lowercases list tokens, while
    # Word2Vec was trained on the original casing, so mixed-case tokens
    # would never find their vector in the embedding lookup below.
    tokenizer = Tokenizer(lower=False)
    tokenizer.fit_on_texts(sentences)
    vocab_size = len(tokenizer.word_index) + 1  # index 0 is the padding id
    print(f"词汇表大小: {vocab_size}")

    # Never pad to less than the convolution window, otherwise the "valid"
    # Conv1D has no position to evaluate.
    max_seq_len = max(max(len(s) for s in sentences), CONV_KERNEL_SIZE)
    print(f"最大序列长度: {max_seq_len}")

    sequences = tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(sequences, maxlen=max_seq_len)
    y = to_categorical(label_ids.astype(int).to_numpy(), num_classes=len(LABEL_MAPPING))

    # Build the embedding matrix
    print("\n[4/6] 创建嵌入矩阵...")
    embedding_dim = w2v_model.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        # Words missing from Word2Vec (e.g. below min_count) keep a zero row.
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"训练集: {len(X_train)} 条")
    print(f"测试集: {len(X_test)} 条")

    # Build and train the CNN
    print("\n[5/6] 训练 CNN 模型...")
    model = build_cnn_model(vocab_size, embedding_dim, max_seq_len, embedding_matrix)
    model.summary()  # prints by itself; wrapping it in print() emits "None"
    model.fit(
        X_train, y_train,
        batch_size=32,
        epochs=10,
        validation_split=0.1,
        verbose=1,
    )

    # Evaluate
    print("\n[6/6] 评估模型...")
    y_pred_classes = np.argmax(model.predict(X_test), axis=1)
    y_true_classes = np.argmax(y_test, axis=1)
    print("\n分类报告:")
    # Explicit labels keep target_names aligned even when a class is absent
    # from the test split or from the predictions.
    print(classification_report(
        y_true_classes,
        y_pred_classes,
        labels=list(range(len(LABEL_NAMES))),
        target_names=LABEL_NAMES,
    ))
    print(f"准确率: {accuracy_score(y_true_classes, y_pred_classes):.4f}")

    # Save artefacts
    print("\n保存模型...")
    os.makedirs('models', exist_ok=True)

    w2v_model.save('models/word2vec.model')
    print("Word2Vec 模型已保存到: models/word2vec.model")

    model.save('models/cnn_sentiment.h5')
    print("CNN 模型已保存到: models/cnn_sentiment.h5")

    with open('models/tokenizer.json', 'w', encoding='utf-8') as f:
        f.write(tokenizer.to_json())
    print("Tokenizer 已保存到: models/tokenizer.json")

    print("\n" + "=" * 60)
    print("训练完成!")
    print("=" * 60)


if __name__ == '__main__':
    main()