410 lines
15 KiB
Python
410 lines
15 KiB
Python
import pandas as pd
|
|
import jieba
|
|
import time
|
|
import json
|
|
import os
|
|
from collections import defaultdict
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib
|
|
# Select the Agg backend so figures are rendered straight to files.
matplotlib.use('Agg')

# Configure fonts for Chinese text and keep the minus sign displayable.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS'],
    'axes.unicode_minus': False,
})
|
|
|
|
# ============================================================
|
|
# 第一部分:构建情感词典
|
|
# ============================================================
|
|
|
|
def build_sentiment_dictionary():
    """Build the sentiment lexicon from the DUT Chinese affective ontology.

    The spreadsheet ``大连理工大学中文情感词汇本体.xlsx`` is read from a path
    relative to the current working directory. Each word is placed into one
    of seven emotion buckets according to its ``情感分类`` code, a fixed list
    of stock-market terms is appended to ``Good`` and ``Disgust``, and the
    aggregate ``Positive`` / ``Negative`` lists are derived from the buckets.

    Returns:
        dict: word lists keyed by 'Happy', 'Good', 'Surprise', 'Anger',
        'Sad', 'Fear', 'Disgust', 'Positive' and 'Negative'. If loading or
        processing the spreadsheet raises, the result of
        ``build_simplified_dictionary()`` is returned instead.
    """
    dict_path = '大连理工大学中文情感词汇本体.xlsx'

    # Bucket name -> emotion-category codes that belong to it.
    code_groups = (
        ('Happy', ('PA', 'PE')),
        ('Good', ('PD', 'PH', 'PG', 'PB', 'PK')),
        ('Surprise', ('PC',)),
        ('Anger', ('NA',)),
        ('Sad', ('NB', 'NJ', 'NH', 'PF')),
        ('Fear', ('NI', 'NC', 'NG')),
        ('Disgust', ('NE', 'ND', 'NN', 'NK', 'NL')),
    )

    try:
        # Load the ontology and keep only the columns of interest.
        table = pd.read_excel(dict_path)
        table = table[['词语', '词性种类', '词义数', '词义序号', '情感分类', '强度', '极性']]

        buckets = {name: [] for name, _ in code_groups}
        bucket_of = {code: name for name, codes in code_groups for code in codes}

        # Walk the rows in order and file each word under its category.
        for word, code in zip(table['词语'], table['情感分类']):
            target = bucket_of.get(code)
            if target is not None:
                buckets[target].append(word)

        # Supplementary stock-market vocabulary.
        stock_positive = ['涨', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '赚',
                          '利好', '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破',
                          '创新', '优势', '超预期', '亮眼', '惊艳', '奇迹']
        stock_negative = ['跌', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '赔', '损失',
                          '套牢', '垃圾', '恶心', '坑爹', '骗局', '雷', '爆雷', '崩盘', '退市']
        buckets['Good'].extend(stock_positive)
        buckets['Disgust'].extend(stock_negative)

        # Aggregate polarity lists.
        positive_words = buckets['Happy'] + buckets['Good'] + buckets['Surprise']
        negative_words = (buckets['Anger'] + buckets['Sad']
                          + buckets['Fear'] + buckets['Disgust'])

        print('大连理工大学情感词典加载完成')
        print(f'正面情感词: {len(positive_words)}个')
        print(f'负面情感词: {len(negative_words)}个')

        lexicon = dict(buckets)
        lexicon['Positive'] = positive_words
        lexicon['Negative'] = negative_words
        return lexicon

    except Exception as e:
        # Any failure (missing file, missing column, ...) falls back to the
        # small built-in lexicon.
        print(f'加载大连理工大学情感词典失败: {e}')
        print('使用简化版情感词典')
        return build_simplified_dictionary()
|
|
|
|
def build_simplified_dictionary():
    """Build the small built-in Chinese sentiment lexicon (fallback).

    Returns:
        dict: word lists keyed by 'Happy', 'Good', 'Surprise', 'Anger',
        'Sad', 'Fear', 'Disgust', plus 'Positive' (Happy + Good + Surprise)
        and 'Negative' (Anger + Sad + Fear + Disgust). Fresh lists are
        created on every call.
    """
    lexicon = {
        # Positive emotion words
        'Happy': [
            '开心', '快乐', '高兴', '喜悦', '愉快', '欣喜', '欢乐', '欢喜', '幸福',
            '满意', '满足', '欣慰', '愉悦', '畅快', '乐观', '积极', '美好', '成功',
        ],
        'Good': [
            '好', '优秀', '出色', '精彩', '卓越', '杰出', '优良', '良好', '完美', '不错',
            '涨', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '赚', '利好',
            '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破', '创新', '优势',
        ],
        'Surprise': [
            '惊喜', '意外', '震惊', '惊讶', '震撼', '神奇', '奇迹', '惊艳', '亮眼', '超预期',
        ],
        # Negative emotion words
        'Anger': [
            '愤怒', '生气', '恼火', '气愤', '暴怒', '愤慨', '愤恨', '震怒', '发怒',
            '骂', '垃圾', '恶心', '坑爹', '骗局', '欺骗', '欺诈', '造假', '腐败', '黑暗',
        ],
        'Sad': [
            '伤心', '难过', '悲伤', '痛苦', '悲哀', '沮丧', '失望', '绝望', '低落', '悲观',
            '跌', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '赔', '损失', '套牢',
        ],
        'Fear': [
            '害怕', '恐惧', '担心', '担忧', '恐慌', '不安', '焦虑', '忧虑', '紧张', '恐怖',
            '风险', '危机', '危险', '下跌', '暴跌', '崩盘', '退市', '爆雷', '雷', '怕',
        ],
        'Disgust': [
            '厌恶', '恶心', '反感', '讨厌', '鄙视', '唾弃', '不屑', '蔑视', '嫌弃',
            '垃圾', '废物', '不行', '差劲', '差', '烂', '渣', '骗局',
        ],
    }

    # Aggregate polarity lists, concatenated in bucket order.
    lexicon['Positive'] = [
        word
        for bucket in ('Happy', 'Good', 'Surprise')
        for word in lexicon[bucket]
    ]
    lexicon['Negative'] = [
        word
        for bucket in ('Anger', 'Sad', 'Fear', 'Disgust')
        for word in lexicon[bucket]
    ]

    print('简化版情感词典构建完成')
    print(f"正面情感词: {len(lexicon['Positive'])}个")
    print(f"负面情感词: {len(lexicon['Negative'])}个")

    return lexicon
|
|
|
|
# ============================================================
|
|
# 第二部分:情绪计算函数
|
|
# ============================================================
|
|
|
|
def emotion_caculate(text, sentiment_dict):
    """Count sentiment-lexicon hits in a single piece of text.

    The text is segmented with ``jieba.lcut``; every occurrence of a token
    that appears in a lexicon category adds one to that category's count.

    Args:
        text: Text to analyse. Falsy or NaN values are treated as empty
            text; other non-string values are converted with ``str``.
        sentiment_dict: Mapping with the keys 'Positive', 'Negative',
            'Anger', 'Disgust', 'Fear', 'Sad', 'Surprise', 'Good' and
            'Happy', each holding a collection of words (list, set, ...).

    Returns:
        pandas.Series: indexed by 'length' (number of tokens), 'positive',
        'negative', 'anger', 'disgust', 'fear', 'sadness', 'surprise',
        'good', 'happy' and 'sentiment_score' (positive minus negative).
    """
    from collections import Counter

    if not text or pd.isna(text):
        # Empty text has no tokens, so segmentation is skipped entirely.
        tokens = []
    else:
        tokens = jieba.lcut(text if isinstance(text, str) else str(text))

    # Output column -> lexicon key, in the order of the returned index.
    categories = (
        ('positive', 'Positive'),
        ('negative', 'Negative'),
        ('anger', 'Anger'),
        ('disgust', 'Disgust'),
        ('fear', 'Fear'),
        ('sadness', 'Sad'),
        ('surprise', 'Surprise'),
        ('good', 'Good'),
        ('happy', 'Happy'),
    )

    # One pass over the tokens instead of list.count() per unique word.
    token_counts = Counter(tokens)
    unique_tokens = set(token_counts)

    emotion_info = {'length': len(tokens)}
    for column, key in categories:
        if unique_tokens:
            # Set intersection does hashed lookups rather than a linear scan
            # of the lexicon for every token.
            hits = unique_tokens.intersection(sentiment_dict[key])
            emotion_info[column] = sum(token_counts[token] for token in hits)
        else:
            emotion_info[column] = 0

    # When both counts are zero the difference is already zero, so no
    # special case is needed.
    emotion_info['sentiment_score'] = (
        emotion_info['positive'] - emotion_info['negative']
    )

    indexs = ['length', 'positive', 'negative', 'anger', 'disgust', 'fear',
              'sadness', 'surprise', 'good', 'happy', 'sentiment_score']

    return pd.Series(emotion_info, index=indexs)
|
|
|
|
# ============================================================
|
|
# 第三部分:数据加载与分析
|
|
# ============================================================
|
|
|
|
def _safe_filename_part(value):
    """Return *value* as text with characters unsafe in file names replaced by '_'."""
    unsafe = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    return str(value).translate(unsafe)


def load_and_analyze_data(data_dir='data', output_dir='sentiment_output'):
    """Run the sentiment analysis over every ``guba_*.json`` file in a directory.

    For each file the posts are loaded, title and content are joined into a
    ``full_text`` column, ``emotion_caculate`` is applied to every post, the
    per-post result is written to ``sentiment_<name>_<code>.csv`` and
    per-stock statistics are collected. When at least one stock was
    analysed, ``sentiment_summary.csv`` is written and
    ``generate_visualizations`` is called.

    Args:
        data_dir: Directory containing the ``guba_*.json`` input files.
        output_dir: Directory for CSV and chart output; created if missing.

    Returns:
        tuple: ``(all_results, stock_emotions)`` where ``all_results`` is a
        list of per-stock result DataFrames and ``stock_emotions`` maps each
        stock code to its statistics dict. Both are empty when ``data_dir``
        does not exist or holds no usable files.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Report a missing input directory instead of crashing in os.listdir().
    if not os.path.isdir(data_dir):
        print(f'数据目录不存在: {data_dir}')
        return [], {}

    # Build the lexicon and convert each word list to a set once, so the
    # per-post membership tests are hashed lookups.
    sentiment_dict = {
        name: set(words)
        for name, words in build_sentiment_dictionary().items()
    }

    all_results = []
    stock_emotions = {}

    # Sorted so the processing order (and the summary row order) is stable.
    for filename in sorted(os.listdir(data_dir)):
        if not (filename.startswith('guba_') and filename.endswith('.json')):
            continue

        filepath = os.path.join(data_dir, filename)
        print(f'\n正在分析: {filename}')

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            stock_name = data.get('stock_name', '未知')
            stock_code = data.get('stock_code', '未知')
            posts = data.get('posts', [])

            if not posts:
                print(' 无数据,跳过')
                continue

            df = pd.DataFrame(posts)

            # Join title and content. Missing values become empty strings so
            # that the literal words "nan"/"None" never reach the tokenizer.
            text_parts = [
                df[column].fillna('').astype(str)
                if column in df.columns
                else pd.Series('', index=df.index)
                for column in ('post_title', 'post_content')
            ]
            df['full_text'] = text_parts[0] + ' ' + text_parts[1]

            # Per-post sentiment analysis.
            print(f' 开始分析 {len(df)} 条帖子...')
            start = time.time()
            emotion_df = df['full_text'].apply(
                lambda x: emotion_caculate(x, sentiment_dict)
            )
            end = time.time()
            print(f' 分析完成,耗时: {end - start:.2f}秒')

            result_df = pd.concat([df, emotion_df], axis=1)

            # Name and code come from the JSON file; sanitise them before
            # embedding them in a path.
            output_file = os.path.join(
                output_dir,
                f'sentiment_{_safe_filename_part(stock_name)}'
                f'_{_safe_filename_part(stock_code)}.csv',
            )
            result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
            print(f' 结果已保存到: {output_file}')

            # Aggregate statistics for this stock.
            score = result_df['sentiment_score']
            stock_stats = {
                'stock_code': stock_code,
                'stock_name': stock_name,
                'total_posts': len(result_df),
                'avg_positive': result_df['positive'].mean(),
                'avg_negative': result_df['negative'].mean(),
                'avg_sentiment_score': score.mean(),
                'positive_posts': (score > 0).sum(),
                'negative_posts': (score < 0).sum(),
                'neutral_posts': (score == 0).sum(),
                'total_anger': result_df['anger'].sum(),
                'total_sadness': result_df['sadness'].sum(),
                'total_fear': result_df['fear'].sum(),
                'total_disgust': result_df['disgust'].sum(),
                'total_good': result_df['good'].sum(),
                'total_happy': result_df['happy'].sum(),
                'total_surprise': result_df['surprise'].sum(),
            }

            stock_emotions[stock_code] = stock_stats
            all_results.append(result_df)

            print(f'\n {stock_name} 情绪分析统计:')
            print(f' 平均情绪得分: {stock_stats["avg_sentiment_score"]:.2f}')
            print(f' 正面帖子: {stock_stats["positive_posts"]}')
            print(f' 负面帖子: {stock_stats["negative_posts"]}')
            print(f' 中性帖子: {stock_stats["neutral_posts"]}')

            # Posts with the highest and lowest sentiment score.
            top_positive = result_df.nlargest(1, 'sentiment_score').iloc[0]
            print(f' 最正面帖子: {top_positive["full_text"][:50]}...')

            top_negative = result_df.nsmallest(1, 'sentiment_score').iloc[0]
            print(f' 最负面帖子: {top_negative["full_text"][:50]}...')

        except Exception as e:
            # Best effort: a broken file is reported and the run continues
            # with the next one.
            print(f' 分析失败: {e}')

    # Overall summary and charts.
    if stock_emotions:
        summary_df = pd.DataFrame(list(stock_emotions.values()))
        summary_file = os.path.join(output_dir, 'sentiment_summary.csv')
        summary_df.to_csv(summary_file, index=False, encoding='utf-8-sig')
        print(f'\n总体统计已保存到: {summary_file}')

        generate_visualizations(summary_df, stock_emotions, output_dir)

    return all_results, stock_emotions
|
|
|
|
# ============================================================
|
|
# 第四部分:可视化
|
|
# ============================================================
|
|
|
|
def generate_visualizations(summary_df, stock_emotions, output_dir):
    """Render the sentiment charts as PNG files in *output_dir*.

    Three files are written: ``sentiment_score_comparison.png`` (bar chart
    of the average score per stock), ``sentiment_distribution.png`` (one
    positive/negative/neutral pie per stock) and ``emotion_types.png``
    (grouped bars with each stock's share of the seven emotion types).

    Args:
        summary_df: DataFrame with the columns 'stock_name' and
            'avg_sentiment_score'.
        stock_emotions: Mapping of stock code to a statistics dict holding
            'stock_name', 'positive_posts', 'negative_posts',
            'neutral_posts' and the seven 'total_*' emotion counts.
        output_dir: Existing directory that receives the PNG files.
    """
    stock_stats = list(stock_emotions.values())
    n_stocks = len(stock_stats)

    # 1. Average sentiment score per stock.
    plt.figure(figsize=(12, 6))
    colors = ['green' if x >= 0 else 'red' for x in summary_df['avg_sentiment_score']]
    plt.bar(summary_df['stock_name'], summary_df['avg_sentiment_score'], color=colors, alpha=0.7)
    plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    plt.title('各股票平均情绪得分对比', fontsize=14)
    plt.xlabel('股票名称', fontsize=12)
    plt.ylabel('平均情绪得分', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_score_comparison.png'), dpi=300)
    plt.close()

    # 2. Positive / negative / neutral split, one pie per stock.
    # The grid grows with the number of stocks (up to 4 per row), so no stock
    # is dropped; 8 stocks still give a 2x4 grid on a 16x10 figure.
    n_cols = max(1, min(4, n_stocks))
    n_rows = max(1, (n_stocks + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(4 * n_cols, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, stats in zip(axes, stock_stats):
        labels = ['正面', '负面', '中性']
        sizes = [stats['positive_posts'], stats['negative_posts'], stats['neutral_posts']]
        pie_colors = ['green', 'red', 'gray']
        ax.pie(sizes, labels=labels, colors=pie_colors, autopct='%1.1f%%', startangle=90)
        ax.set_title(f'{stats["stock_name"]} 情绪分布')

    # Hide grid cells that have no stock to show.
    for ax in axes[n_stocks:]:
        ax.set_visible(False)

    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, 'sentiment_distribution.png'), dpi=300)
    plt.close(fig)

    # 3. Share of each emotion type per stock.
    plt.figure(figsize=(14, 7))
    emotions = ['total_good', 'total_happy', 'total_surprise',
                'total_anger', 'total_sadness', 'total_fear', 'total_disgust']
    emotion_names = ['好评', '快乐', '惊讶', '愤怒', '悲伤', '恐惧', '厌恶']

    x = range(len(emotion_names))
    # Each emotion group spans 0.8 of its slot regardless of how many stocks
    # are plotted (0.1 per bar for 8 stocks), so groups never overlap.
    width = 0.8 / max(n_stocks, 1)

    for idx, stats in enumerate(stock_stats):
        values = [stats[e] for e in emotions]
        total = sum(values)
        if total > 0:
            # Convert raw counts to percentages of this stock's total.
            values = [v / total * 100 for v in values]
        plt.bar([xi + width * idx for xi in x], values, width, label=stats['stock_name'])

    plt.xlabel('情绪类型', fontsize=12)
    plt.ylabel('占比 (%)', fontsize=12)
    plt.title('各股票情绪类型分布', fontsize=14)
    # Centre each tick under its group of bars.
    group_centre = width * (max(n_stocks, 1) - 1) / 2
    plt.xticks([xi + group_centre for xi in x], emotion_names)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'emotion_types.png'), dpi=300, bbox_inches='tight')
    plt.close()

    print(f'可视化图表已生成到 {output_dir}')
|
|
|
|
# ============================================================
|
|
# 主程序
|
|
# ============================================================
|
|
|
|
if __name__ == '__main__':
    # Horizontal rule framing the console output.
    banner = '=' * 60

    print(banner)
    print('股吧数据情绪分析')
    print(banner)

    # Run the analysis with the default input and output directories.
    all_results, stock_emotions = load_and_analyze_data()

    print('\n' + banner)
    print('情绪分析完成!')
    print(banner)
|