"""Dictionary-based sentiment analysis for stock-forum (guba) posts.

The script loads a Chinese sentiment lexicon, tokenizes every post with
jieba, counts lexicon hits per emotion category, writes per-stock CSV files
plus a summary CSV, and renders three comparison charts.
"""

import json
import os
import re
import time
from collections import Counter, defaultdict

import jieba
import matplotlib

# Select the non-interactive backend before pyplot is imported.
matplotlib.use('Agg')

import matplotlib.pyplot as plt  # noqa: E402
import pandas as pd  # noqa: E402

# Fonts that can render Chinese labels.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False  # fixes minus-sign rendering

# Emotion bucket -> category codes found in the lexicon's '情感分类' column.
_CATEGORY_CODES = {
    'Happy': ('PA', 'PE'),
    'Good': ('PD', 'PH', 'PG', 'PB', 'PK'),
    'Surprise': ('PC',),
    'Anger': ('NA',),
    'Sad': ('NB', 'NJ', 'NH', 'PF'),
    'Fear': ('NI', 'NC', 'NG'),
    'Disgust': ('NE', 'ND', 'NN', 'NK', 'NL'),
}

# Stock-specific vocabulary appended to the lexicon ('Good' / 'Disgust').
_STOCK_POSITIVE = [
    '涨', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '赚', '利好',
    '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破', '创新', '优势',
    '超预期', '亮眼', '惊艳', '奇迹',
]
_STOCK_NEGATIVE = [
    '跌', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '赔', '损失', '套牢',
    '垃圾', '恶心', '坑爹', '骗局', '雷', '爆雷', '崩盘', '退市',
]

# (output column, sentiment_dict key) pairs in the order of the result Series.
_EMOTION_COLUMNS = (
    ('positive', 'Positive'),
    ('negative', 'Negative'),
    ('anger', 'Anger'),
    ('disgust', 'Disgust'),
    ('fear', 'Fear'),
    ('sadness', 'Sad'),
    ('surprise', 'Surprise'),
    ('good', 'Good'),
    ('happy', 'Happy'),
)

# Characters that are path separators or invalid in Windows file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


# ============================================================
# Part 1: building the sentiment dictionary
# ============================================================

def _finalize_dictionary(buckets):
    """Return the seven emotion buckets plus merged 'Positive'/'Negative' lists."""
    result = {name: buckets[name] for name in _CATEGORY_CODES}
    result['Positive'] = buckets['Happy'] + buckets['Good'] + buckets['Surprise']
    result['Negative'] = (
        buckets['Anger'] + buckets['Sad'] + buckets['Fear'] + buckets['Disgust']
    )
    return result


def build_sentiment_dictionary():
    """Build the sentiment dictionary from the DUTIR emotion lexicon workbook.

    Reads '大连理工大学中文情感词汇本体.xlsx' from the working directory, groups
    its words into seven emotion buckets by category code, and appends the
    stock-specific vocabulary. If anything goes wrong while loading (missing
    file, missing Excel engine, unexpected columns, ...) the error is printed
    and the built-in simplified dictionary is returned instead.

    Returns:
        dict mapping 'Happy', 'Good', 'Surprise', 'Anger', 'Sad', 'Fear',
        'Disgust', 'Positive' and 'Negative' to lists of words.
    """
    dict_path = '大连理工大学中文情感词汇本体.xlsx'

    try:
        # keep_default_na=False is essential: with pandas' default NA handling
        # the literal category code 'NA' (anger) is parsed as NaN and the
        # whole Anger bucket silently ends up empty.
        df = pd.read_excel(dict_path, keep_default_na=False)

        # Only the word and its category code are needed.
        words = df['词语']
        # Defensive: tolerate stray whitespace around the category codes.
        categories = df['情感分类'].astype(str).str.strip()

        buckets = {
            name: words[categories.isin(codes)].tolist()
            for name, codes in _CATEGORY_CODES.items()
        }

        # Supplementary stock-related vocabulary.
        buckets['Good'].extend(_STOCK_POSITIVE)
        buckets['Disgust'].extend(_STOCK_NEGATIVE)

        sentiment_dict = _finalize_dictionary(buckets)

        print('大连理工大学情感词典加载完成')
        print(f'正面情感词: {len(sentiment_dict["Positive"])}个')
        print(f'负面情感词: {len(sentiment_dict["Negative"])}个')

        return sentiment_dict

    except Exception as e:
        # Deliberate best-effort: any loading problem falls back to the
        # built-in dictionary so the analysis can still run.
        print(f'加载大连理工大学情感词典失败: {e}')
        print('使用简化版情感词典')
        return build_simplified_dictionary()


def build_simplified_dictionary():
    """Build the small built-in Chinese sentiment dictionary (fallback).

    Returns:
        dict with the same nine keys as ``build_sentiment_dictionary``.
    """
    buckets = {
        # Positive emotion words
        'Happy': [
            '开心', '快乐', '高兴', '喜悦', '愉快', '欣喜', '欢乐', '欢喜', '幸福', '满意',
            '满足', '欣慰', '愉悦', '畅快', '乐观', '积极', '美好', '成功',
        ],
        'Good': [
            '好', '优秀', '出色', '精彩', '卓越', '杰出', '优良', '良好', '完美', '不错',
            '涨', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '赚', '利好',
            '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破', '创新', '优势',
        ],
        'Surprise': [
            '惊喜', '意外', '震惊', '惊讶', '震撼', '神奇', '奇迹', '惊艳', '亮眼', '超预期',
        ],
        # Negative emotion words
        'Anger': [
            '愤怒', '生气', '恼火', '气愤', '暴怒', '愤慨', '愤恨', '震怒', '发怒', '骂',
            '垃圾', '恶心', '坑爹', '骗局', '欺骗', '欺诈', '造假', '腐败', '黑暗',
        ],
        'Sad': [
            '伤心', '难过', '悲伤', '痛苦', '悲哀', '沮丧', '失望', '绝望', '低落', '悲观',
            '跌', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '赔', '损失', '套牢',
        ],
        'Fear': [
            '害怕', '恐惧', '担心', '担忧', '恐慌', '不安', '焦虑', '忧虑', '紧张', '恐怖',
            '风险', '危机', '危险', '下跌', '暴跌', '崩盘', '退市', '爆雷', '雷', '怕',
        ],
        'Disgust': [
            '厌恶', '恶心', '反感', '讨厌', '鄙视', '唾弃', '不屑', '蔑视', '嫌弃', '垃圾',
            '废物', '不行', '差劲', '差', '烂', '渣', '骗局',
        ],
    }

    sentiment_dict = _finalize_dictionary(buckets)

    print('简化版情感词典构建完成')
    print(f'正面情感词: {len(sentiment_dict["Positive"])}个')
    print(f'负面情感词: {len(sentiment_dict["Negative"])}个')

    return sentiment_dict


# ============================================================
# Part 2: emotion scoring
# ============================================================

def emotion_caculate(text, sentiment_dict):
    """Score the emotions of a single piece of text.

    Args:
        text: The text to analyze. None, NaN and other falsy or non-scalar
            values are treated as an empty string; other non-string scalars
            are stringified.
        sentiment_dict: Mapping with the keys produced by
            ``build_sentiment_dictionary``. The values may be any container
            supporting ``in``; sets give much faster lookups than lists.

    Returns:
        pandas.Series indexed by 'length' (token count), 'positive',
        'negative', 'anger', 'disgust', 'fear', 'sadness', 'surprise',
        'good', 'happy' (lexicon hits, counting repeated tokens) and
        'sentiment_score' (positive minus negative).
    """
    if not isinstance(text, str):
        usable = pd.api.types.is_scalar(text) and not pd.isna(text) and text
        text = str(text) if usable else ''

    tokens = jieba.lcut(text)
    # Count each token once instead of calling list.count() per unique token.
    token_counts = Counter(tokens)

    emotion_info = {'length': len(tokens)}
    for column, key in _EMOTION_COLUMNS:
        lexicon = sentiment_dict[key]
        emotion_info[column] = sum(
            count for token, count in token_counts.items() if token in lexicon
        )
    emotion_info['sentiment_score'] = emotion_info['positive'] - emotion_info['negative']

    indexs = ['length', 'positive', 'negative', 'anger', 'disgust', 'fear',
              'sadness', 'surprise', 'good', 'happy', 'sentiment_score']
    return pd.Series(emotion_info, index=indexs)


# ============================================================
# Part 3: data loading and analysis
# ============================================================

def _text_column(df, column):
    """Return ``df[column]`` as strings with missing values (or a missing column) as ''."""
    if column not in df.columns:
        return pd.Series('', index=df.index)
    return df[column].fillna('').astype(str)


def _safe_filename_part(value):
    """Replace path separators and characters invalid in file names with '_'."""
    return _UNSAFE_FILENAME_CHARS.sub('_', str(value))


def _analyze_stock_file(filepath, sentiment_dict, output_dir):
    """Analyze one guba JSON file and write its per-post CSV.

    Returns:
        ``(result_df, stock_stats)``, or ``None`` when the file has no posts.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    stock_name = data.get('stock_name', '未知')
    stock_code = data.get('stock_code', '未知')
    posts = data.get('posts', [])

    if not posts:
        print(f' 无数据,跳过')
        return None

    df = pd.DataFrame(posts)

    # Title + content; missing fields become '' rather than the text 'nan'/'None'.
    df['full_text'] = (
        _text_column(df, 'post_title') + ' ' + _text_column(df, 'post_content')
    )

    print(f' 开始分析 {len(df)} 条帖子...')
    start = time.perf_counter()
    emotion_df = df['full_text'].apply(
        lambda x: emotion_caculate(x, sentiment_dict)
    )
    end = time.perf_counter()
    print(f' 分析完成,耗时: {end - start:.2f}秒')

    result_df = pd.concat([df, emotion_df], axis=1)

    # Stock names such as '*ST...' contain characters that are not valid in
    # file names on every platform, so sanitize both parts.
    output_file = os.path.join(
        output_dir,
        f'sentiment_{_safe_filename_part(stock_name)}_{_safe_filename_part(stock_code)}.csv',
    )
    result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f' 结果已保存到: {output_file}')

    # Aggregate statistics for this stock.
    score = result_df['sentiment_score']
    stock_stats = {
        'stock_code': stock_code,
        'stock_name': stock_name,
        'total_posts': len(result_df),
        'avg_positive': result_df['positive'].mean(),
        'avg_negative': result_df['negative'].mean(),
        'avg_sentiment_score': score.mean(),
        'positive_posts': (score > 0).sum(),
        'negative_posts': (score < 0).sum(),
        'neutral_posts': (score == 0).sum(),
        'total_anger': result_df['anger'].sum(),
        'total_sadness': result_df['sadness'].sum(),
        'total_fear': result_df['fear'].sum(),
        'total_disgust': result_df['disgust'].sum(),
        'total_good': result_df['good'].sum(),
        'total_happy': result_df['happy'].sum(),
        'total_surprise': result_df['surprise'].sum(),
    }

    print(f'\n {stock_name} 情绪分析统计:')
    print(f' 平均情绪得分: {stock_stats["avg_sentiment_score"]:.2f}')
    print(f' 正面帖子: {stock_stats["positive_posts"]}')
    print(f' 负面帖子: {stock_stats["negative_posts"]}')
    print(f' 中性帖子: {stock_stats["neutral_posts"]}')

    # Most positive / most negative post by sentiment score.
    top_positive = result_df.nlargest(1, 'sentiment_score').iloc[0]
    print(f' 最正面帖子: {top_positive["full_text"][:50]}...')
    top_negative = result_df.nsmallest(1, 'sentiment_score').iloc[0]
    print(f' 最负面帖子: {top_negative["full_text"][:50]}...')

    return result_df, stock_stats


def load_and_analyze_data(data_dir='data', output_dir='sentiment_output'):
    """Run the sentiment analysis over every ``guba_*.json`` file in *data_dir*.

    For each file a ``sentiment_<name>_<code>.csv`` is written to
    *output_dir*; afterwards ``sentiment_summary.csv`` and the charts are
    generated when at least one stock was analyzed. A failure in one file is
    printed and does not stop the remaining files.

    Args:
        data_dir: Directory containing the input JSON files.
        output_dir: Directory for CSV and PNG output (created if missing).

    Returns:
        ``(all_results, stock_emotions)``: the list of per-stock result
        DataFrames and a dict mapping stock code to its statistics dict.
    """
    os.makedirs(output_dir, exist_ok=True)

    sentiment_dict = build_sentiment_dictionary()
    # Convert each word list to a frozenset once so that the per-token
    # membership tests in emotion_caculate are O(1).
    lookup = {name: frozenset(words) for name, words in sentiment_dict.items()}

    all_results = []
    stock_emotions = {}

    # Sorted for a reproducible processing order.
    for filename in sorted(os.listdir(data_dir)):
        if not (filename.endswith('.json') and filename.startswith('guba_')):
            continue

        filepath = os.path.join(data_dir, filename)
        print(f'\n正在分析: {filename}')

        try:
            analyzed = _analyze_stock_file(filepath, lookup, output_dir)
        except Exception as e:
            # Best-effort batch: report the failure and continue.
            print(f' 分析失败: {e}')
            continue

        if analyzed is None:
            continue

        result_df, stock_stats = analyzed
        stock_emotions[stock_stats['stock_code']] = stock_stats
        all_results.append(result_df)

    # Overall summary across stocks.
    if stock_emotions:
        summary_df = pd.DataFrame(list(stock_emotions.values()))
        summary_file = os.path.join(output_dir, 'sentiment_summary.csv')
        summary_df.to_csv(summary_file, index=False, encoding='utf-8-sig')
        print(f'\n总体统计已保存到: {summary_file}')

        generate_visualizations(summary_df, stock_emotions, output_dir)

    return all_results, stock_emotions


# ============================================================
# Part 4: visualization
# ============================================================

def generate_visualizations(summary_df, stock_emotions, output_dir):
    """Render the sentiment charts as PNG files in *output_dir*.

    Writes ``sentiment_score_comparison.png`` (average score per stock),
    ``sentiment_distribution.png`` (positive/negative/neutral pies for at
    most the first eight stocks) and ``emotion_types.png`` (share of each
    emotion type per stock).

    Args:
        summary_df: DataFrame with 'stock_name' and 'avg_sentiment_score'.
        stock_emotions: Mapping of stock code to the statistics dict built
            by ``load_and_analyze_data``.
        output_dir: Existing directory to write the images to.
    """
    # 1. Average sentiment score per stock.
    plt.figure(figsize=(12, 6))
    colors = ['green' if score >= 0 else 'red'
              for score in summary_df['avg_sentiment_score']]
    plt.bar(summary_df['stock_name'], summary_df['avg_sentiment_score'],
            color=colors, alpha=0.7)
    plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    plt.title('各股票平均情绪得分对比', fontsize=14)
    plt.xlabel('股票名称', fontsize=12)
    plt.ylabel('平均情绪得分', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_score_comparison.png'), dpi=300)
    plt.close()

    # 2. Positive / negative / neutral post distribution (2 x 4 grid, so
    #    only the first eight stocks are drawn).
    fig, axes = plt.subplots(2, 4, figsize=(16, 10))
    axes = axes.flatten()
    used_axes = 0
    for ax, stats in zip(axes, stock_emotions.values()):
        sizes = [stats['positive_posts'], stats['negative_posts'],
                 stats['neutral_posts']]
        # A pie cannot be normalized when every slice is zero.
        if sum(sizes) == 0:
            break
        ax.pie(sizes, labels=['正面', '负面', '中性'],
               colors=['green', 'red', 'gray'],
               autopct='%1.1f%%', startangle=90)
        ax.set_title(f'{stats["stock_name"]} 情绪分布')
        used_axes += 1
    # Hide the panels that received no pie instead of leaving empty frames.
    for ax in axes[used_axes:]:
        ax.axis('off')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_distribution.png'), dpi=300)
    plt.close(fig)

    # 3. Share of each emotion type per stock (grouped bars).
    plt.figure(figsize=(14, 7))
    emotions = ['total_good', 'total_happy', 'total_surprise',
                'total_anger', 'total_sadness', 'total_fear', 'total_disgust']
    emotion_names = ['好评', '快乐', '惊讶', '愤怒', '悲伤', '恐惧', '厌恶']

    x = range(len(emotion_names))
    n_stocks = len(stock_emotions)
    # Each group spans 0.8 x-units in total so bars never run into the next
    # group, whatever the number of stocks (0.1 per bar for eight stocks).
    width = 0.8 / max(n_stocks, 1)

    for idx, stats in enumerate(stock_emotions.values()):
        values = [stats[e] for e in emotions]
        total = sum(values)
        if total > 0:
            values = [v / total * 100 for v in values]
        plt.bar([xi + width * idx for xi in x], values, width,
                label=stats['stock_name'])

    plt.xlabel('情绪类型', fontsize=12)
    plt.ylabel('占比 (%)', fontsize=12)
    plt.title('各股票情绪类型分布', fontsize=14)
    # Center each tick under its group of bars.
    tick_offset = width * (n_stocks - 1) / 2 if n_stocks else 0
    plt.xticks([xi + tick_offset for xi in x], emotion_names)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'emotion_types.png'), dpi=300,
                bbox_inches='tight')
    plt.close()

    print(f'可视化图表已生成到 {output_dir}')


# ============================================================
# Script entry point
# ============================================================

if __name__ == '__main__':
    print('=' * 60)
    print('股吧数据情绪分析')
    print('=' * 60)

    # Run the analysis.
    all_results, stock_emotions = load_and_analyze_data()

    print('\n' + '=' * 60)
    print('情绪分析完成!')
    print('=' * 60)