This commit is contained in:
2026-05-28 04:54:42 +08:00
commit 5231e995dd
7 changed files with 1251 additions and 0 deletions
+409
View File
@@ -0,0 +1,409 @@
import pandas as pd
import jieba
import time
import json
import os
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
# Select the non-interactive Agg backend; this script only saves figures to files.
matplotlib.use('Agg')
# Font candidates used so that Chinese labels render correctly
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False # Fix the display of the minus sign
# ============================================================
# 第一部分:构建情感词典
# ============================================================
def build_sentiment_dictionary():
    """Build the sentiment lexicon from the DUT Chinese affective lexicon.

    Reads ``大连理工大学中文情感词汇本体.xlsx`` (path relative to the current
    working directory), groups its words into seven emotion categories by
    the class code in the ``情感分类`` column, appends a small stock-market
    vocabulary to the ``Good`` and ``Disgust`` categories and derives the
    aggregated ``Positive`` / ``Negative`` lists.

    Returns:
        dict: keys ``Happy``, ``Good``, ``Surprise``, ``Anger``, ``Sad``,
        ``Fear``, ``Disgust``, ``Positive`` and ``Negative``, each mapped to
        a list of words. If the spreadsheet cannot be loaded or parsed, the
        result of ``build_simplified_dictionary()`` is returned instead.
    """
    dict_path = '大连理工大学中文情感词汇本体.xlsx'

    # Emotion category -> class codes of the ``情感分类`` column.
    category_codes = {
        'Happy': ('PA', 'PE'),
        'Good': ('PD', 'PH', 'PG', 'PB', 'PK'),
        'Surprise': ('PC',),
        'Anger': ('NA',),
        'Sad': ('NB', 'NJ', 'NH', 'PF'),
        'Fear': ('NI', 'NC', 'NG'),
        'Disgust': ('NE', 'ND', 'NN', 'NK', 'NL'),
    }

    try:
        # keep_default_na=False is essential: with the default NA handling
        # pandas parses the class code 'NA' (anger) as a missing value, which
        # would leave the Anger category permanently empty.
        df = pd.read_excel(dict_path, keep_default_na=False)

        # Only these two columns are needed to build the lexicon.
        words = df['词语'].astype(str).str.strip()
        # Stripping is defensive: stray whitespace around a code would
        # otherwise make the word silently fall out of every category.
        codes = df['情感分类'].astype(str).str.strip()

        lexicon = {
            name: [word for word in words[codes.isin(class_codes)] if word]
            for name, class_codes in category_codes.items()
        }
    except Exception as e:
        # Deliberate best effort: any loading problem (missing file, missing
        # Excel engine, unexpected layout) falls back to the built-in lexicon.
        print(f'加载大连理工大学情感词典失败: {e}')
        print('使用简化版情感词典')
        return build_simplified_dictionary()

    # Supplementary stock-market vocabulary.
    # NOTE(review): the empty strings look like entries whose text was lost
    # (presumably single-character words) — restore them from the original
    # source if available. They are filtered out below because an empty
    # string can never match a token and only inflates the word counts.
    stock_positive = ['', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '',
                      '利好', '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破',
                      '创新', '优势', '超预期', '亮眼', '惊艳', '奇迹']
    stock_negative = ['', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '', '损失',
                      '套牢', '垃圾', '恶心', '坑爹', '骗局', '', '爆雷', '崩盘', '退市']
    lexicon['Good'].extend(word for word in stock_positive if word)
    lexicon['Disgust'].extend(word for word in stock_negative if word)

    # Aggregated polarity lists.
    lexicon['Positive'] = lexicon['Happy'] + lexicon['Good'] + lexicon['Surprise']
    lexicon['Negative'] = (lexicon['Anger'] + lexicon['Sad']
                           + lexicon['Fear'] + lexicon['Disgust'])

    print('大连理工大学情感词典加载完成')
    print(f'正面情感词: {len(lexicon["Positive"])}')
    print(f'负面情感词: {len(lexicon["Negative"])}')
    return lexicon
def build_simplified_dictionary():
    """Build the small built-in Chinese sentiment lexicon (fallback).

    Returns:
        dict: keys ``Happy``, ``Good``, ``Surprise``, ``Anger``, ``Sad``,
        ``Fear``, ``Disgust``, ``Positive`` and ``Negative``, each mapped to
        a list of non-empty words. ``Positive`` is the concatenation of the
        first three categories, ``Negative`` of the remaining four.
    """
    # NOTE(review): the empty strings in the literals below look like entries
    # whose text was lost (presumably single-character words) — restore them
    # from the original source if available. They are kept here as markers
    # and filtered out before the lexicon is returned.
    raw_categories = {
        # Positive emotions
        'Happy': [
            '开心', '快乐', '高兴', '喜悦', '愉快', '欣喜', '欢乐', '欢喜', '幸福',
            '满意', '满足', '欣慰', '愉悦', '畅快', '乐观', '积极', '美好', '成功',
        ],
        'Good': [
            '', '优秀', '出色', '精彩', '卓越', '杰出', '优良', '良好', '完美', '不错',
            '', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '', '利好',
            '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破', '创新', '优势',
        ],
        'Surprise': [
            '惊喜', '意外', '震惊', '惊讶', '震撼', '神奇', '奇迹', '惊艳', '亮眼', '超预期',
        ],
        # Negative emotions
        'Anger': [
            '愤怒', '生气', '恼火', '气愤', '暴怒', '愤慨', '愤恨', '震怒', '发怒',
            '', '垃圾', '恶心', '坑爹', '骗局', '欺骗', '欺诈', '造假', '腐败', '黑暗',
        ],
        'Sad': [
            '伤心', '难过', '悲伤', '痛苦', '悲哀', '沮丧', '失望', '绝望', '低落', '悲观',
            '', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '', '损失', '套牢',
        ],
        'Fear': [
            '害怕', '恐惧', '担心', '担忧', '恐慌', '不安', '焦虑', '忧虑', '紧张', '恐怖',
            '风险', '危机', '危险', '下跌', '暴跌', '崩盘', '退市', '爆雷', '', '',
        ],
        'Disgust': [
            '厌恶', '恶心', '反感', '讨厌', '鄙视', '唾弃', '不屑', '蔑视', '嫌弃',
            '垃圾', '废物', '不行', '差劲', '', '', '', '骗局',
        ],
    }

    # An empty string can never equal a token and would only inflate the
    # reported word counts, so blank placeholders are dropped.
    lexicon = {
        name: [word for word in words if word]
        for name, words in raw_categories.items()
    }

    # Aggregated polarity lists.
    lexicon['Positive'] = lexicon['Happy'] + lexicon['Good'] + lexicon['Surprise']
    lexicon['Negative'] = (lexicon['Anger'] + lexicon['Sad']
                           + lexicon['Fear'] + lexicon['Disgust'])

    print('简化版情感词典构建完成')
    print(f'正面情感词: {len(lexicon["Positive"])}')
    print(f'负面情感词: {len(lexicon["Negative"])}')
    return lexicon
# ============================================================
# 第二部分:情绪计算函数
# ============================================================
def emotion_caculate(text, sentiment_dict):
    """Compute lexicon-based emotion counts for a single text.

    The text is tokenized with ``jieba.lcut``; every token occurrence that
    appears in a lexicon category adds one to that category's count.

    Args:
        text: The text to analyze. Falsy values and missing values
            (``None``, NaN) are treated as an empty text.
        sentiment_dict: Mapping with the keys ``Positive``, ``Negative``,
            ``Anger``, ``Disgust``, ``Fear``, ``Sad``, ``Surprise``, ``Good``
            and ``Happy``, each holding a collection of words. Lists and
            sets are both accepted; sets avoid a per-call conversion.

    Returns:
        pandas.Series: indexed by ``length`` (number of tokens),
        ``positive``, ``negative``, ``anger``, ``disgust``, ``fear``,
        ``sadness``, ``surprise``, ``good``, ``happy`` and
        ``sentiment_score`` (``positive - negative``).
    """
    if not text or pd.isna(text):
        text = ''

    # Tokenizing an empty text yields no tokens, so the tokenizer is skipped.
    wordlist = jieba.lcut(text) if text else []

    # Count every token once instead of calling list.count() per distinct
    # word, which is quadratic in the text length.
    word_freq = defaultdict(int)
    for word in wordlist:
        word_freq[word] += 1

    # Output field -> key of the lexicon category it is counted against.
    fields = (
        ('positive', 'Positive'),
        ('negative', 'Negative'),
        ('anger', 'Anger'),
        ('disgust', 'Disgust'),
        ('fear', 'Fear'),
        ('sadness', 'Sad'),
        ('surprise', 'Surprise'),
        ('good', 'Good'),
        ('happy', 'Happy'),
    )

    emotion_info = {'length': len(wordlist)}
    for field, key in fields:
        hits = 0
        if word_freq:
            vocab = sentiment_dict[key]
            # Hash-based membership instead of a linear scan per token;
            # already hashed containers are used as they are.
            if not isinstance(vocab, (set, frozenset, dict)):
                vocab = set(vocab)
            hits = sum(freq for word, freq in word_freq.items() if word in vocab)
        emotion_info[field] = hits

    # With no hits at all the difference is already 0, so no special case
    # is needed.
    emotion_info['sentiment_score'] = emotion_info['positive'] - emotion_info['negative']

    indexs = ['length', 'positive', 'negative', 'anger', 'disgust', 'fear',
              'sadness', 'surprise', 'good', 'happy', 'sentiment_score']
    return pd.Series(emotion_info, index=indexs)
# ============================================================
# 第三部分:数据加载与分析
# ============================================================
def _compose_full_text(df):
    """Return title and body of every post joined by a single space.

    Missing columns and missing values count as empty text, so they are not
    rendered as the literal words ``nan`` / ``None`` and then tokenized.
    """
    def column_as_text(name):
        if name not in df.columns:
            return pd.Series('', index=df.index)
        column = df[name]
        return column.astype(object).where(column.notna(), '').astype(str)

    return column_as_text('post_title') + ' ' + column_as_text('post_content')


def _analyze_stock_file(filepath, sentiment_dict, output_dir):
    """Analyze one forum JSON file and write its per-post CSV.

    Returns ``(result_df, stock_stats)``, or ``None`` when the file holds
    no posts.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    stock_name = data.get('stock_name', '未知')
    stock_code = data.get('stock_code', '未知')
    posts = data.get('posts', [])
    if not posts:
        print(' 无数据,跳过')
        return None

    df = pd.DataFrame(posts)
    df['full_text'] = _compose_full_text(df)

    # Score every post.
    print(f' 开始分析 {len(df)} 条帖子...')
    start = time.perf_counter()
    emotion_df = df['full_text'].apply(
        lambda text: emotion_caculate(text, sentiment_dict)
    )
    print(f' 分析完成,耗时: {time.perf_counter() - start:.2f}')

    # Attach the emotion columns to the posts and persist them.
    result_df = pd.concat([df, emotion_df], axis=1)
    output_file = os.path.join(output_dir, f'sentiment_{stock_name}_{stock_code}.csv')
    result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f' 结果已保存到: {output_file}')

    # Aggregate statistics for this stock.
    score = result_df['sentiment_score']
    stock_stats = {
        'stock_code': stock_code,
        'stock_name': stock_name,
        'total_posts': len(result_df),
        'avg_positive': result_df['positive'].mean(),
        'avg_negative': result_df['negative'].mean(),
        'avg_sentiment_score': score.mean(),
        'positive_posts': (score > 0).sum(),
        'negative_posts': (score < 0).sum(),
        'neutral_posts': (score == 0).sum(),
        'total_anger': result_df['anger'].sum(),
        'total_sadness': result_df['sadness'].sum(),
        'total_fear': result_df['fear'].sum(),
        'total_disgust': result_df['disgust'].sum(),
        'total_good': result_df['good'].sum(),
        'total_happy': result_df['happy'].sum(),
        'total_surprise': result_df['surprise'].sum()
    }

    print(f'\n {stock_name} 情绪分析统计:')
    print(f' 平均情绪得分: {stock_stats["avg_sentiment_score"]:.2f}')
    print(f' 正面帖子: {stock_stats["positive_posts"]}')
    print(f' 负面帖子: {stock_stats["negative_posts"]}')
    print(f' 中性帖子: {stock_stats["neutral_posts"]}')

    # Posts with the highest and the lowest sentiment score.
    top_positive = result_df.nlargest(1, 'sentiment_score').iloc[0]
    print(f' 最正面帖子: {top_positive["full_text"][:50]}...')
    top_negative = result_df.nsmallest(1, 'sentiment_score').iloc[0]
    print(f' 最负面帖子: {top_negative["full_text"][:50]}...')

    return result_df, stock_stats


def load_and_analyze_data(data_dir='data', output_dir='sentiment_output'):
    """Run the sentiment analysis over every ``guba_*.json`` file in a directory.

    For each matching file a per-post CSV (``sentiment_<name>_<code>.csv``)
    is written to ``output_dir``. When at least one stock was analyzed, a
    ``sentiment_summary.csv`` is written and ``generate_visualizations`` is
    called.

    Args:
        data_dir: Directory containing the ``guba_*.json`` input files.
        output_dir: Directory for all outputs; created if it does not exist.

    Returns:
        tuple: ``(all_results, stock_emotions)`` — the list of per-stock
        result DataFrames and a dict mapping stock code to its statistics.
        Both are empty when ``data_dir`` does not exist or nothing could be
        analyzed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Checked before the (slow) lexicon load so that a wrong path is
    # reported immediately instead of crashing afterwards in os.listdir().
    if not os.path.isdir(data_dir):
        print(f'数据目录不存在: {data_dir}')
        return [], {}

    # Build the lexicon once and convert each category to a set so that the
    # per-token membership tests are hash lookups rather than list scans.
    sentiment_dict = {
        name: set(words) for name, words in build_sentiment_dictionary().items()
    }

    all_results = []
    stock_emotions = {}
    # Sorted so that processing order (and the chart order) is deterministic.
    for filename in sorted(os.listdir(data_dir)):
        if not (filename.startswith('guba_') and filename.endswith('.json')):
            continue
        filepath = os.path.join(data_dir, filename)
        print(f'\n正在分析: {filename}')
        try:
            analyzed = _analyze_stock_file(filepath, sentiment_dict, output_dir)
        except Exception as e:
            # Best effort: one broken file must not abort the whole run.
            print(f' 分析失败: {e}')
            continue
        if analyzed is None:
            continue
        result_df, stock_stats = analyzed
        stock_emotions[stock_stats['stock_code']] = stock_stats
        all_results.append(result_df)

    # Overall summary and charts.
    if stock_emotions:
        summary_df = pd.DataFrame(list(stock_emotions.values()))
        summary_file = os.path.join(output_dir, 'sentiment_summary.csv')
        summary_df.to_csv(summary_file, index=False, encoding='utf-8-sig')
        print(f'\n总体统计已保存到: {summary_file}')
        generate_visualizations(summary_df, stock_emotions, output_dir)

    return all_results, stock_emotions
# ============================================================
# 第四部分:可视化
# ============================================================
def generate_visualizations(summary_df, stock_emotions, output_dir):
    """Render the sentiment charts as PNG files in ``output_dir``.

    Three images are written: ``sentiment_score_comparison.png`` (average
    score per stock), ``sentiment_distribution.png`` (one pie chart per
    stock) and ``emotion_types.png`` (grouped bars of emotion shares).

    Args:
        summary_df: DataFrame with the columns ``stock_name`` and
            ``avg_sentiment_score``.
        stock_emotions: Mapping of stock code to a statistics dict with the
            keys ``stock_name``, ``positive_posts``, ``negative_posts``,
            ``neutral_posts`` and the seven ``total_*`` emotion counts.
        output_dir: Existing directory the images are saved to.
    """
    stock_count = len(stock_emotions)

    # 1. Average sentiment score per stock.
    plt.figure(figsize=(12, 6))
    bar_colors = ['green' if score >= 0 else 'red'
                  for score in summary_df['avg_sentiment_score']]
    plt.bar(summary_df['stock_name'], summary_df['avg_sentiment_score'],
            color=bar_colors, alpha=0.7)
    plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    plt.title('各股票平均情绪得分对比', fontsize=14)
    plt.xlabel('股票名称', fontsize=12)
    plt.ylabel('平均情绪得分', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_score_comparison.png'), dpi=300)
    plt.close()

    # 2. Positive / negative / neutral share per stock.
    # The grid keeps the 2x4 layout for up to eight stocks and grows by rows
    # beyond that, so no stock is silently dropped.
    cols = 4
    rows = max(2, (stock_count + cols - 1) // cols)
    fig, axes = plt.subplots(rows, cols, figsize=(16, 5 * rows))
    axes = axes.flatten()
    for ax, stats in zip(axes, stock_emotions.values()):
        sizes = [stats['positive_posts'], stats['negative_posts'], stats['neutral_posts']]
        ax.pie(sizes, labels=['正面', '负面', '中性'], colors=['green', 'red', 'gray'],
               autopct='%1.1f%%', startangle=90)
        ax.set_title(f'{stats["stock_name"]} 情绪分布')
    # Hide the cells that have no stock instead of drawing empty frames.
    for ax in axes[stock_count:]:
        ax.axis('off')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_distribution.png'), dpi=300)
    plt.close()

    # 3. Share of each emotion type per stock.
    plt.figure(figsize=(14, 7))
    emotions = ['total_good', 'total_happy', 'total_surprise',
                'total_anger', 'total_sadness', 'total_fear', 'total_disgust']
    emotion_names = ['好评', '快乐', '惊讶', '愤怒', '悲伤', '恐惧', '厌恶']
    x = range(len(emotion_names))
    # Each group of bars spans 0.8 of a slot regardless of the number of
    # stocks; for eight stocks this is the bar width 0.1 used before.
    width = 0.8 / max(stock_count, 1)
    for idx, stats in enumerate(stock_emotions.values()):
        values = [stats[e] for e in emotions]
        total = sum(values)
        if total > 0:
            values = [v / total * 100 for v in values]
        plt.bar([xi + width * idx for xi in x], values, width, label=stats['stock_name'])
    plt.xlabel('情绪类型', fontsize=12)
    plt.ylabel('占比 (%)', fontsize=12)
    plt.title('各股票情绪类型分布', fontsize=14)
    # Tick labels sit at the centre of each group of bars.
    plt.xticks([xi + width * (stock_count - 1) / 2 for xi in x], emotion_names)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'emotion_types.png'), dpi=300, bbox_inches='tight')
    plt.close()

    print(f'可视化图表已生成到 {output_dir}')
# ============================================================
# 主程序
# ============================================================
if __name__ == '__main__':
    # Separator line framing the console banners.
    banner = '=' * 60

    print(banner)
    print('股吧数据情绪分析')
    print(banner)

    # Run the complete analysis with the default input/output directories.
    all_results, stock_emotions = load_and_analyze_data()

    print('\n' + banner)
    print('情绪分析完成!')
    print(banner)