This commit is contained in:
2026-05-28 04:54:42 +08:00
commit 5231e995dd
7 changed files with 1251 additions and 0 deletions
+3
View File
@@ -0,0 +1,3 @@
data/*
output/*
sentiment_output/*
+347
View File
@@ -0,0 +1,347 @@
import json
import os
import re
import jieba
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg') # 使用非交互式后端
# Chinese stop-word set; tokenize() drops any token found here.
# NOTE(review): the many '' entries look like single-character stop words that
# were lost to an encoding/extraction problem — TODO confirm against the
# original word list. Duplicated entries are harmless because this is a set.
STOPWORDS = {
'', '', '', '', '', '', '', '', '', '', '', '', '一个', '', '', '', '', '', '',
'', '', '', '', '没有', '', '', '自己', '', '', '', '', '', '', '', '', '什么', '怎么',
'为什么', '哪里', '', '多少', '', '', '', '', '', '', '', '', '', '今天', '明天', '昨天', '',
'', '', '已经', '还是', '还是', '但是', '可是', '不过', '只是', '只有', '就是', '还是', '或者', '还是', '还是',
'这个', '那个', '这些', '那些', '那么', '这么', '怎么', '如何', '因为', '所以', '虽然', '但是', '如果', '', '那么',
'', '', '', '', '', '还是', '还是', '还是', '还是', '还是', '还是', '还是', '还是', '还是', '还是',
'股吧', '东方财富', '帖子', '发表', '回复', '点击', '查看', '更多', '原文', '转发', '分享', '收藏', '评论', '点赞',
'http', 'https', 'com', 'cn', 'www', 'net', 'org'
}
def clean_text(text):
    """Strip noise from a raw post and return plain text.

    The substitutions run in a fixed order: URLs, HTML tags, square-bracket
    spans (emoticon codes), remaining non-word/non-space characters, digits,
    and finally runs of whitespace (collapsed to one space). A falsy input
    yields an empty string.
    """
    if not text:
        return ""
    # Order matters: tags and bracket spans must be removed while their
    # delimiters still exist, i.e. before the punctuation pass.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),
        (r'<.*?>', ''),
        (r'\[.*?\]', ''),
        (r'[^\w\s]', ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def tokenize(text):
    """Segment *text* with jieba and return the tokens worth keeping.

    A token is kept only if it is longer than one character and is not in
    STOPWORDS.
    """
    kept = []
    for token in jieba.lcut(text):
        if len(token) <= 1 or token in STOPWORDS:
            continue
        kept.append(token)
    return kept
def load_data(data_dir='data'):
    """Load every ``*.json`` stock file in *data_dir*.

    Args:
        data_dir: Directory holding one JSON file per stock. Each file is
            read as a dict with ``stock_name``, ``stock_code`` and ``posts``.

    Returns:
        A tuple ``(all_data, stock_info)``. ``all_data`` is a list of dicts
        with keys ``stock_code``, ``stock_name``, ``post_id``, ``text`` and
        ``clean_text`` (one per non-empty post). ``stock_info`` maps stock
        code to ``{'name': ..., 'post_count': ...}``. Both are empty when
        *data_dir* is not a directory.
    """
    all_data = []
    stock_info = {}
    # isdir (not exists) so that a plain file passed by mistake is reported
    # instead of crashing os.listdir.
    if not os.path.isdir(data_dir):
        print(f'数据目录 {data_dir} 不存在')
        return all_data, stock_info
    # Sorted so the output order does not depend on the file system.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(data_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            stock_name = data.get('stock_name', '未知')
            stock_code = data.get('stock_code', '未知')
            # `or []` also covers an explicit JSON null.
            posts = data.get('posts') or []
            stock_info[stock_code] = {
                'name': stock_name,
                'post_count': len(posts)
            }
            for post in posts:
                # A key present with a null value must not be formatted as
                # the literal text "None".
                content = post.get('post_content') or ''
                title = post.get('post_title') or ''
                full_text = f"{title} {content}".strip()
                if full_text:
                    all_data.append({
                        'stock_code': stock_code,
                        'stock_name': stock_name,
                        'post_id': post.get('post_id'),
                        'text': full_text,
                        'clean_text': clean_text(full_text)
                    })
        except Exception as e:
            # Per-file boundary: report the failure and keep loading the
            # remaining files (deliberate best effort).
            print(f'加载文件 {filename} 失败: {e}')
    return all_data, stock_info
def calculate_tfidf(texts):
    """Fit a TF-IDF vectorizer on *texts*.

    The vectorizer uses tokenize() as its tokenizer, keeps at most 1000
    features, and considers unigrams and bigrams.

    Returns:
        ``(tfidf_matrix, feature_names, vectorizer)``.
    """
    settings = {
        'tokenizer': tokenize,
        'token_pattern': None,
        'max_features': 1000,
        'ngram_range': (1, 2),
    }
    vectorizer = TfidfVectorizer(**settings)
    tfidf_matrix = vectorizer.fit_transform(texts)
    return tfidf_matrix, vectorizer.get_feature_names_out(), vectorizer
def get_top_keywords(tfidf_matrix, feature_names, top_n=20):
    """Return up to *top_n* keywords ranked by mean TF-IDF.

    Args:
        tfidf_matrix: Document-by-feature matrix supporting ``.mean(axis=0)``.
        feature_names: Sequence of feature strings, index-aligned with the
            matrix columns; multi-word features are space-separated.
        top_n: Maximum number of keywords to return. Values <= 0 yield ``[]``.

    Returns:
        A list of ``{'word', 'tfidf'}`` dicts sorted by descending score.
        Multi-word features are preferred, and a feature is skipped when any
        of its words already appears in an accepted keyword.
    """
    # Without this guard `[-top_n * 4:]` becomes `[-0:]`, which selects every
    # feature, and the loop below would still emit one keyword.
    if top_n <= 0:
        return []
    avg_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
    # Over-sample (4x) so enough candidates survive de-duplication.
    top_indices = avg_tfidf.argsort()[-top_n * 4:][::-1]
    candidates = []
    for idx in top_indices:
        word = feature_names[idx]
        if word.strip():
            candidates.append({
                'word': word,
                'tfidf': avg_tfidf[idx],
                'length': len(word.split()),  # number of words in the feature
            })
    # Longer (combined) features first, then by score.
    candidates.sort(key=lambda c: (-c['length'], -c['tfidf']))
    keywords = []
    seen_parts = set()
    for candidate in candidates:
        parts = candidate['word'].split()
        # Tracking parts alone suffices: an accepted word always has all of
        # its parts recorded, so it can never be accepted twice.
        if seen_parts.intersection(parts):
            continue
        seen_parts.update(parts)
        keywords.append({
            'word': candidate['word'],
            'tfidf': candidate['tfidf']
        })
        if len(keywords) >= top_n:
            break
    # Present the final selection by score.
    keywords.sort(key=lambda k: -k['tfidf'])
    return keywords
def get_stock_specific_keywords(all_data, stock_code, top_n=20):
    """Return keywords that distinguish one stock's posts from the others.

    Args:
        all_data: List of post dicts carrying ``stock_code`` and
            ``clean_text`` keys.
        stock_code: Code of the stock to analyse.
        top_n: Maximum number of keywords to return. Values <= 0 yield ``[]``.

    Returns:
        A list of ``{'word', 'tfidf', 'diff'}`` dicts sorted by descending
        ``diff``, where ``diff`` is the stock's mean TF-IDF minus the mean
        over all other stocks' posts. Returns ``[]`` when the stock has
        fewer than 5 posts.
    """
    # Without this guard `[-top_n * 4:]` becomes `[-0:]`, which selects every
    # feature, and the loop below would still emit one keyword.
    if top_n <= 0:
        return []
    stock_texts = [d['clean_text'] for d in all_data if d['stock_code'] == stock_code]
    other_texts = [d['clean_text'] for d in all_data if d['stock_code'] != stock_code]
    if len(stock_texts) < 5:
        return []
    # The stock's posts come first so the matrix can be split by row index.
    tfidf_matrix, feature_names, _ = calculate_tfidf(stock_texts + other_texts)
    n_stock = len(stock_texts)
    avg_tfidf = np.array(tfidf_matrix[:n_stock].mean(axis=0)).flatten()
    if other_texts:
        other_avg = np.array(tfidf_matrix[n_stock:].mean(axis=0)).flatten()
        diff = avg_tfidf - other_avg
    else:
        diff = avg_tfidf
    # Over-sample (4x) so enough candidates survive de-duplication.
    top_indices = diff.argsort()[-top_n * 4:][::-1]
    candidates = []
    for idx in top_indices:
        word = feature_names[idx]
        if word.strip():
            candidates.append({
                'word': word,
                'tfidf': avg_tfidf[idx],
                'diff': diff[idx],
                'length': len(word.split()),  # number of words in the feature
            })
    # Longer (combined) features first, then by distinctiveness.
    candidates.sort(key=lambda c: (-c['length'], -c['diff']))
    keywords = []
    seen_parts = set()
    for candidate in candidates:
        parts = candidate['word'].split()
        # Tracking parts alone suffices: an accepted word always has all of
        # its parts recorded, so it can never be accepted twice.
        if seen_parts.intersection(parts):
            continue
        seen_parts.update(parts)
        keywords.append({
            'word': candidate['word'],
            'tfidf': candidate['tfidf'],
            'diff': candidate['diff']
        })
        if len(keywords) >= top_n:
            break
    keywords.sort(key=lambda k: -k['diff'])
    return keywords
def generate_wordcloud(keywords, stock_name, output_dir='output', font_path=None):
    """Render *keywords* as a word-cloud PNG.

    Args:
        keywords: List of dicts with ``word`` and ``tfidf`` keys; the score
            is used as the word's weight.
        stock_name: Used in the output file name ``wordcloud_<name>.png``.
        output_dir: Directory for the image; created if missing.
        font_path: Font file able to render Chinese. When ``None`` the first
            existing file among a few typical system locations is used.

    Returns:
        The path of the written image, or ``None`` when *keywords* is empty.
    """
    os.makedirs(output_dir, exist_ok=True)
    word_freq = {k['word']: k['tfidf'] for k in keywords}
    # WordCloud cannot draw from an empty frequency table; skip instead of
    # aborting the whole analysis run.
    if not word_freq:
        print(f'没有关键词,跳过词云生成: {stock_name}')
        return None
    if font_path is None:
        # The original Windows path stays first so behaviour on Windows is
        # unchanged; the others are typical macOS / Linux locations.
        candidate_fonts = (
            'C:/Windows/Fonts/simhei.ttf',
            '/System/Library/Fonts/PingFang.ttc',
            '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
            '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
        )
        font_path = next((p for p in candidate_fonts if os.path.exists(p)), None)
        if font_path is None:
            # Falls back to WordCloud's default font rather than crashing.
            print('未找到中文字体,词云中的中文可能无法正常显示')
    wc = WordCloud(
        font_path=font_path,
        width=800,
        height=600,
        background_color='white',
        max_words=100
    )
    wc.generate_from_frequencies(word_freq)
    output_path = os.path.join(output_dir, f'wordcloud_{stock_name}.png')
    wc.to_file(output_path)
    print(f'词云已保存到: {output_path}')
    return output_path
def analyze_all():
    """Run the complete TF-IDF analysis and write all results to ``output/``.

    Steps: load the posts, compute overall keywords, compute per-stock
    keywords (with word clouds and CSV files), write a summary report, and
    dump the pre-processed posts. Returns ``None``; stops early when no
    posts were loaded.
    """
    rule = '='*60
    print(rule)
    print('股吧数据 TF-IDF 分析')
    print(rule)
    os.makedirs('output', exist_ok=True)

    # Step 1: load data
    print('\n[1/5] 加载数据...')
    all_data, stock_info = load_data()
    if not all_data:
        print('没有找到数据,请先运行爬虫')
        return
    print(f' 共加载 {len(all_data)} 条帖子')
    print(f' 涉及 {len(stock_info)} 只股票:')
    for code, info in stock_info.items():
        print(f' - {info["name"]} ({code}): {info["post_count"]}')

    # Step 2: keywords over the whole corpus
    print('\n[2/5] 整体关键词分析...')
    corpus = [row['clean_text'] for row in all_data]
    tfidf_matrix, feature_names, _ = calculate_tfidf(corpus)
    overall_keywords = get_top_keywords(tfidf_matrix, feature_names, top_n=30)
    print('\n 整体Top 20关键词:')
    for i, kw in enumerate(overall_keywords[:20], 1):
        print(f' {i:2d}. {kw["word"]:10s} (TF-IDF: {kw["tfidf"]:.4f})')
    pd.DataFrame(overall_keywords).to_csv(
        'output/overall_keywords.csv', index=False, encoding='utf-8-sig'
    )
    generate_wordcloud(overall_keywords, 'overall')

    # Step 3: keywords per stock
    print('\n[3/5] 各股票关键词分析...')
    stock_keywords = {}
    for stock_code, info in stock_info.items():
        stock_name = info['name']
        print(f'\n 分析 {stock_name} ({stock_code})...')
        keywords = get_stock_specific_keywords(all_data, stock_code, top_n=20)
        stock_keywords[stock_code] = keywords
        if not keywords:
            continue
        print(f' Top 10关键词:')
        for i, kw in enumerate(keywords[:10], 1):
            print(f' {i:2d}. {kw["word"]:10s} (TF-IDF: {kw["tfidf"]:.4f}, 差值: {kw["diff"]:.4f})')
        generate_wordcloud(keywords, stock_name)
        pd.DataFrame(keywords).to_csv(
            f'output/keywords_{stock_name}.csv', index=False, encoding='utf-8-sig'
        )

    # Step 4: one summary row per stock
    print('\n[4/5] 生成汇总报告...')
    report_data = [
        {
            '股票代码': stock_code,
            '股票名称': stock_info[stock_code]['name'],
            '帖子数量': stock_info[stock_code]['post_count'],
            'Top5关键词': ', '.join(k['word'] for k in keywords[:5]),
        }
        for stock_code, keywords in stock_keywords.items()
    ]
    pd.DataFrame(report_data).to_csv(
        'output/summary_report.csv', index=False, encoding='utf-8-sig'
    )
    print(' 汇总报告已保存到: output/summary_report.csv')

    # Step 5: dump the pre-processed posts
    print('\n[5/5] 保存预处理数据...')
    pd.DataFrame(all_data).to_csv('output/all_posts.csv', index=False, encoding='utf-8-sig')
    print(' 所有帖子已保存到: output/all_posts.csv')
    print('\n' + rule)
    print('分析完成!结果保存在 output/ 目录中')
    print(rule)
# Script entry point: run the full analysis when executed directly.
if __name__ == '__main__':
    analyze_all()
+9
View File
@@ -0,0 +1,9 @@
requests>=2.28.0
pandas>=2.0.0
openpyxl>=3.1.0
jieba>=0.42.1
scikit-learn>=1.3.0
numpy>=1.24.0
matplotlib>=3.7.0
seaborn>=0.12.0
wordcloud>=1.9.0
+409
View File
@@ -0,0 +1,409 @@
import pandas as pd
import jieba
import time
import json
import os
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
# Set fonts for Chinese text
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False  # Work around the minus-sign display problem
# ============================================================
# 第一部分:构建情感词典
# ============================================================
def build_sentiment_dictionary():
    """Build the sentiment lexicon from the DLUT emotion ontology spreadsheet.

    Reads ``大连理工大学中文情感词汇本体.xlsx`` from the working directory, buckets
    each word by its ``情感分类`` code into seven emotion lists, appends
    stock-specific words to ``Good`` and ``Disgust``, and derives
    ``Positive`` / ``Negative`` by concatenation.

    Returns:
        A dict with keys ``Happy``, ``Good``, ``Surprise``, ``Anger``,
        ``Sad``, ``Fear``, ``Disgust``, ``Positive`` and ``Negative``, each a
        list of words. On any failure the result of
        build_simplified_dictionary() is returned instead.
    """
    dict_path = '大连理工大学中文情感词汇本体.xlsx'
    try:
        df = pd.read_excel(dict_path)
        # Selecting the columns also validates that the sheet has them.
        df = df[['词语', '词性种类', '词义数', '词义序号', '情感分类', '强度', '极性']]

        Happy, Good, Surprise = [], [], []
        Anger, Sad, Fear, Disgust = [], [], [], []

        # Category code -> target list.
        bucket_for_code = {}
        for codes, bucket in (
            (('PA', 'PE'), Happy),
            (('PD', 'PH', 'PG', 'PB', 'PK'), Good),
            (('PC',), Surprise),
            (('NA',), Anger),
            (('NB', 'NJ', 'NH', 'PF'), Sad),
            (('NI', 'NC', 'NG'), Fear),
            (('NE', 'ND', 'NN', 'NK', 'NL'), Disgust),
        ):
            for code in codes:
                bucket_for_code[code] = bucket

        for _, row in df.iterrows():
            bucket = bucket_for_code.get(row['情感分类'])
            if bucket is not None:
                bucket.append(row['词语'])

        # Stock-market vocabulary supplementing the general lexicon.
        stock_positive = ['', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '',
                          '利好', '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破',
                          '创新', '优势', '超预期', '亮眼', '惊艳', '奇迹']
        stock_negative = ['', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '', '损失',
                          '套牢', '垃圾', '恶心', '坑爹', '骗局', '', '爆雷', '崩盘', '退市']
        Good += stock_positive
        Disgust += stock_negative

        Positive = Happy + Good + Surprise
        Negative = Anger + Sad + Fear + Disgust
        print('大连理工大学情感词典加载完成')
        print(f'正面情感词: {len(Positive)}')
        print(f'负面情感词: {len(Negative)}')
        return {
            'Happy': Happy,
            'Good': Good,
            'Surprise': Surprise,
            'Anger': Anger,
            'Sad': Sad,
            'Fear': Fear,
            'Disgust': Disgust,
            'Positive': Positive,
            'Negative': Negative,
        }
    except Exception as e:
        print(f'加载大连理工大学情感词典失败: {e}')
        print('使用简化版情感词典')
        return build_simplified_dictionary()
def build_simplified_dictionary():
    """Build a small built-in Chinese sentiment lexicon (fallback).

    Returns:
        A dict with keys ``Happy``, ``Good``, ``Surprise``, ``Anger``,
        ``Sad``, ``Fear``, ``Disgust``, ``Positive`` and ``Negative``, each a
        list of words. ``Positive`` and ``Negative`` are concatenations of
        the positive and negative emotion lists respectively.
    """
    lexicon = {
        # Positive emotions
        'Happy': [
            '开心', '快乐', '高兴', '喜悦', '愉快', '欣喜', '欢乐', '欢喜', '幸福',
            '满意', '满足', '欣慰', '愉悦', '畅快', '乐观', '积极', '美好', '成功',
        ],
        'Good': [
            '', '优秀', '出色', '精彩', '卓越', '杰出', '优良', '良好', '完美', '不错',
            '', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '', '利好',
            '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破', '创新', '优势',
        ],
        'Surprise': [
            '惊喜', '意外', '震惊', '惊讶', '震撼', '神奇', '奇迹', '惊艳', '亮眼', '超预期',
        ],
        # Negative emotions
        'Anger': [
            '愤怒', '生气', '恼火', '气愤', '暴怒', '愤慨', '愤恨', '震怒', '发怒',
            '', '垃圾', '恶心', '坑爹', '骗局', '欺骗', '欺诈', '造假', '腐败', '黑暗',
        ],
        'Sad': [
            '伤心', '难过', '悲伤', '痛苦', '悲哀', '沮丧', '失望', '绝望', '低落', '悲观',
            '', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '', '损失', '套牢',
        ],
        'Fear': [
            '害怕', '恐惧', '担心', '担忧', '恐慌', '不安', '焦虑', '忧虑', '紧张', '恐怖',
            '风险', '危机', '危险', '下跌', '暴跌', '崩盘', '退市', '爆雷', '', '',
        ],
        'Disgust': [
            '厌恶', '恶心', '反感', '讨厌', '鄙视', '唾弃', '不屑', '蔑视', '嫌弃',
            '垃圾', '废物', '不行', '差劲', '', '', '', '骗局',
        ],
    }
    # Aggregate polarity lists, appended last to keep the key order.
    lexicon['Positive'] = lexicon['Happy'] + lexicon['Good'] + lexicon['Surprise']
    lexicon['Negative'] = (
        lexicon['Anger'] + lexicon['Sad'] + lexicon['Fear'] + lexicon['Disgust']
    )
    print('简化版情感词典构建完成')
    print(f"正面情感词: {len(lexicon['Positive'])}")
    print(f"负面情感词: {len(lexicon['Negative'])}")
    return lexicon
# ============================================================
# 第二部分:情绪计算函数
# ============================================================
def emotion_caculate(text, sentiment_dict):
    """Count sentiment-lexicon hits in one piece of text.

    Args:
        text: The text to score; falsy or NaN input is treated as ``''``.
        sentiment_dict: Mapping with the keys ``Positive``, ``Negative``,
            ``Anger``, ``Disgust``, ``Fear``, ``Sad``, ``Surprise``, ``Good``
            and ``Happy``, each a collection of words.

    Returns:
        A ``pd.Series`` indexed by ``length`` (token count), the nine
        per-category hit counts (``Sad`` is reported as ``sadness``) and
        ``sentiment_score`` (positive hits minus negative hits).
    """
    from collections import Counter  # local import: keeps the block self-contained

    if not text or pd.isna(text):
        text = ''
    wordlist = jieba.lcut(text)
    # One pass to count tokens instead of calling wordlist.count() for every
    # distinct token (which was quadratic in the text length).
    word_counts = Counter(wordlist)

    # (output key, lexicon key) pairs.
    categories = (
        ('positive', 'Positive'),
        ('negative', 'Negative'),
        ('anger', 'Anger'),
        ('disgust', 'Disgust'),
        ('fear', 'Fear'),
        ('sadness', 'Sad'),
        ('surprise', 'Surprise'),
        ('good', 'Good'),
        ('happy', 'Happy'),
    )
    totals = {}
    for out_key, dict_key in categories:
        words = sentiment_dict[dict_key]
        # Hash-based membership; a list lexicon would be rescanned for every
        # distinct token. Sets passed by the caller are used as-is.
        if not isinstance(words, (set, frozenset)):
            words = set(words)
        totals[out_key] = sum(
            freq for word, freq in word_counts.items() if word in words
        )

    emotion_info = {'length': len(wordlist)}
    emotion_info.update(totals)
    # With no hits both counts are 0, so the plain difference is already 0.
    emotion_info['sentiment_score'] = totals['positive'] - totals['negative']

    indexs = ['length', 'positive', 'negative', 'anger', 'disgust', 'fear',
              'sadness', 'surprise', 'good', 'happy', 'sentiment_score']
    return pd.Series(emotion_info, index=indexs)
# ============================================================
# 第三部分:数据加载与分析
# ============================================================
def _compose_full_text(row):
    """Join a post's title and content, treating None/NaN fields as empty."""
    parts = []
    for key in ('post_title', 'post_content'):
        value = row.get(key, '')
        # NaN is the only float that differs from itself.
        if value is None or (isinstance(value, float) and value != value):
            value = ''
        parts.append(str(value))
    return f"{parts[0]} {parts[1]}"


def load_and_analyze_data(data_dir='data', output_dir='sentiment_output'):
    """Run sentiment analysis on every ``guba_*.json`` file in *data_dir*.

    For each file a per-post CSV is written to *output_dir*; afterwards a
    summary CSV and the charts from generate_visualizations() are produced.

    Args:
        data_dir: Directory containing the crawled JSON files.
        output_dir: Directory for CSV files and charts; created if missing.

    Returns:
        ``(all_results, stock_emotions)``: a list of per-stock result
        DataFrames and a dict mapping stock code to its summary statistics.
        Both are empty when *data_dir* is not a directory.
    """
    os.makedirs(output_dir, exist_ok=True)
    all_results = []
    stock_emotions = {}
    # Checked before loading the lexicon so a missing directory is reported
    # instead of raising from os.listdir.
    if not os.path.isdir(data_dir):
        print(f'数据目录 {data_dir} 不存在')
        return all_results, stock_emotions

    sentiment_dict = build_sentiment_dictionary()

    # Sorted so the processing order does not depend on the file system.
    for filename in sorted(os.listdir(data_dir)):
        if not (filename.endswith('.json') and filename.startswith('guba_')):
            continue
        filepath = os.path.join(data_dir, filename)
        print(f'\n正在分析: {filename}')
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            stock_name = data.get('stock_name', '未知')
            stock_code = data.get('stock_code', '未知')
            posts = data.get('posts', [])
            if not posts:
                print(f' 无数据,跳过')
                continue

            df = pd.DataFrame(posts)
            # Title + content; missing values must not become "None"/"nan".
            df['full_text'] = df.apply(_compose_full_text, axis=1)

            print(f' 开始分析 {len(df)} 条帖子...')
            start = time.time()
            emotion_df = df['full_text'].apply(
                lambda x: emotion_caculate(x, sentiment_dict)
            )
            end = time.time()
            print(f' 分析完成,耗时: {end - start:.2f}')

            result_df = pd.concat([df, emotion_df], axis=1)
            output_file = os.path.join(output_dir, f'sentiment_{stock_name}_{stock_code}.csv')
            result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
            print(f' 结果已保存到: {output_file}')

            score = result_df['sentiment_score']
            stock_stats = {
                'stock_code': stock_code,
                'stock_name': stock_name,
                'total_posts': len(result_df),
                'avg_positive': result_df['positive'].mean(),
                'avg_negative': result_df['negative'].mean(),
                'avg_sentiment_score': score.mean(),
                'positive_posts': (score > 0).sum(),
                'negative_posts': (score < 0).sum(),
                'neutral_posts': (score == 0).sum(),
                'total_anger': result_df['anger'].sum(),
                'total_sadness': result_df['sadness'].sum(),
                'total_fear': result_df['fear'].sum(),
                'total_disgust': result_df['disgust'].sum(),
                'total_good': result_df['good'].sum(),
                'total_happy': result_df['happy'].sum(),
                'total_surprise': result_df['surprise'].sum()
            }
            stock_emotions[stock_code] = stock_stats
            all_results.append(result_df)

            print(f'\n {stock_name} 情绪分析统计:')
            print(f' 平均情绪得分: {stock_stats["avg_sentiment_score"]:.2f}')
            print(f' 正面帖子: {stock_stats["positive_posts"]}')
            print(f' 负面帖子: {stock_stats["negative_posts"]}')
            print(f' 中性帖子: {stock_stats["neutral_posts"]}')
            # Highest- and lowest-scoring post
            top_positive = result_df.nlargest(1, 'sentiment_score').iloc[0]
            print(f' 最正面帖子: {top_positive["full_text"][:50]}...')
            top_negative = result_df.nsmallest(1, 'sentiment_score').iloc[0]
            print(f' 最负面帖子: {top_negative["full_text"][:50]}...')
        except Exception as e:
            # Per-file boundary: report the failure and continue with the
            # next file (deliberate best effort).
            print(f' 分析失败: {e}')

    if stock_emotions:
        summary_df = pd.DataFrame(list(stock_emotions.values()))
        summary_file = os.path.join(output_dir, 'sentiment_summary.csv')
        summary_df.to_csv(summary_file, index=False, encoding='utf-8-sig')
        print(f'\n总体统计已保存到: {summary_file}')
        generate_visualizations(summary_df, stock_emotions, output_dir)
    return all_results, stock_emotions
# ============================================================
# 第四部分:可视化
# ============================================================
def generate_visualizations(summary_df, stock_emotions, output_dir):
    """Write three PNG charts summarising the sentiment results.

    Args:
        summary_df: DataFrame with ``stock_name`` and ``avg_sentiment_score``
            columns (one row per stock).
        stock_emotions: Dict mapping stock code to a stats dict containing
            ``stock_name``, the ``*_posts`` counts and the ``total_*``
            emotion totals.
        output_dir: Existing directory that receives the images.

    The layout adapts to the number of stocks; with exactly 8 stocks it
    matches the original fixed 2x4 grid and 0.1 bar width.
    """
    n_stocks = len(stock_emotions)

    # 1. Average sentiment score per stock
    plt.figure(figsize=(12, 6))
    colors = ['green' if x >= 0 else 'red' for x in summary_df['avg_sentiment_score']]
    plt.bar(summary_df['stock_name'], summary_df['avg_sentiment_score'], color=colors, alpha=0.7)
    plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    plt.title('各股票平均情绪得分对比', fontsize=14)
    plt.xlabel('股票名称', fontsize=12)
    plt.ylabel('平均情绪得分', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_score_comparison.png'), dpi=300)
    plt.close()

    # 2. Positive / negative / neutral share per stock
    n_cols = 4
    # Ceiling division; at least 2 rows keeps the original layout for <= 8
    # stocks, and extra rows avoid silently dropping stocks beyond the 8th.
    n_rows = max(2, -(-n_stocks // n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 5 * n_rows))
    axes = axes.flatten()
    for ax, stats in zip(axes, stock_emotions.values()):
        labels = ['正面', '负面', '中性']
        sizes = [stats['positive_posts'], stats['negative_posts'], stats['neutral_posts']]
        pie_colors = ['green', 'red', 'gray']
        ax.pie(sizes, labels=labels, colors=pie_colors, autopct='%1.1f%%', startangle=90)
        ax.set_title(f'{stats["stock_name"]} 情绪分布')
    # Hide the unused cells so no empty frames are drawn.
    for ax in axes[n_stocks:]:
        ax.axis('off')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_distribution.png'), dpi=300)
    plt.close(fig)

    # 3. Share of each emotion type per stock (grouped bars)
    plt.figure(figsize=(14, 7))
    emotions = ['total_good', 'total_happy', 'total_surprise',
                'total_anger', 'total_sadness', 'total_fear', 'total_disgust']
    emotion_names = ['好评', '快乐', '惊讶', '愤怒', '悲伤', '恐惧', '厌恶']
    x = range(len(emotion_names))
    # Shrink the bars when there are more than 8 stocks so a group never
    # spills into the next emotion's slot.
    width = 0.1 if n_stocks <= 8 else 0.8 / n_stocks
    for idx, stats in enumerate(stock_emotions.values()):
        values = [stats[e] for e in emotions]
        total = sum(values)
        if total > 0:
            values = [v / total * 100 for v in values]
        plt.bar([xi + width * idx for xi in x], values, width, label=stats['stock_name'])
    plt.xlabel('情绪类型', fontsize=12)
    plt.ylabel('占比 (%)', fontsize=12)
    plt.title('各股票情绪类型分布', fontsize=14)
    # Centre each tick under its group, whatever the number of stocks.
    center = width * max(n_stocks - 1, 0) / 2
    plt.xticks([xi + center for xi in x], emotion_names)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'emotion_types.png'), dpi=300, bbox_inches='tight')
    plt.close()
    print(f'可视化图表已生成到 {output_dir}')
# ============================================================
# 主程序
# ============================================================
if __name__ == '__main__':
    # Script entry point: run the sentiment analysis between two banners.
    banner = '=' * 60
    print(banner)
    print('股吧数据情绪分析')
    print(banner)
    all_results, stock_emotions = load_and_analyze_data()
    print('\n' + banner)
    print('情绪分析完成!')
    print(banner)
+187
View File
@@ -0,0 +1,187 @@
import requests
import pandas as pd
import json
import time
from datetime import datetime
import os
def fetch_guba_data(code='gssz', page=1, page_size=20, sort_type=1, timeout=10):
    """Fetch one page of the post list for a stock forum.

    Args:
        code: Forum / stock code sent in the ``code`` query field.
        page: 1-based page number (``p``).
        page_size: Posts per page (``ps``).
        sort_type: Value for the ``sorttype`` field.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON response, or ``None`` when the request fails or the
        body is not valid JSON.
    """
    url = 'https://mguba.eastmoney.com/mguba2020/interface/GetData.aspx'
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'DNT': '1',
        'Origin': 'https://mguba.eastmoney.com',
        'Pragma': 'no-cache',
        'Referer': f'https://mguba.eastmoney.com/mguba/list/{code}_{page}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Mobile Safari/537.36 Edg/148.0.0.0',
        'sec-ch-ua': '"Chromium";v="148", "Microsoft Edge";v="148", "Not/A)Brand";v="99"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"'
    }
    # NOTE(review): these look like cookies captured from a real browser
    # session and committed to the repository — consider loading them from
    # configuration instead. TODO confirm whether the endpoint needs them.
    cookies = {
        'qgqp_b_id': '30059d8839ad5c045fa8856e38013e9c',
        'st_nvi': 'XwpSfYXGjCxfCdbgapK5_cac4',
        'nid18': '0daec1df8064f04edd20b4e69250a8f5',
        'nid18_create_time': '1776263017375',
        'gviem': 'UrMH_tSu1UpW8B_TKmytl803f',
        'gviem_create_time': '1776263017375',
        'fullscreengg': '1',
        'fullscreengg2': '1',
        'st_si': '17952715731426',
        'show_app_box_time': '1779903756410',
        'st_pvi': '26838250597806',
        'st_sp': '2026-04-15 22:23:37',
        'st_inirUrl': 'https://cn.bing.com/',
        'st_sn': '30',
        'st_psi': '20260528025236177-117016304298-3040545697',
        'ad_tc_load_num': '3',
        'st_asi': '20260528025236177-117016304298-3040545697-ad.djxd-1'
    }
    param = f'code={code}&p={page}&ps={page_size}&sorttype={sort_type}'
    data = {
        'param': param,
        'plat': 'wap',
        'version': '200',
        'path': '/webarticlelist/api/Article/WebArticleList',
        'env': '1',
        'origin': '',
        'ctoken': '',
        'utoken': ''
    }
    try:
        response = requests.post(
            url, headers=headers, cookies=cookies, data=data, timeout=timeout
        )
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a JSON decode failure on requests versions where
        # that error is not a RequestException subclass.
        print(f'请求失败: {e}')
        return None
def fetch_stock_posts(code, name, pages=10, page_size=20):
    """Crawl several pages of posts for one stock.

    Args:
        code: Stock code passed to fetch_guba_data().
        name: Display name, used in progress messages and the result.
        pages: Number of pages to request, starting at page 1.
        page_size: Posts requested per page.

    Returns:
        A dict with ``stock_code``, ``stock_name``, ``total_pages``,
        ``total_posts``, ``crawl_time`` (ISO timestamp) and ``posts`` (the
        concatenated ``re`` lists of all successful pages).
    """
    all_posts = []
    for page in range(1, pages + 1):
        print(f'正在爬取 {name} ({code}) - 第 {page}/{pages}')
        result = fetch_guba_data(code=code, page=page, page_size=page_size)
        # Only a real list counts as data: a null or otherwise malformed
        # 're' field would crash or pollute extend().
        posts = result.get('re') if isinstance(result, dict) else None
        if isinstance(posts, list):
            all_posts.extend(posts)
            print(f' 成功获取 {len(posts)} 条帖子')
        else:
            print(f'{page} 页获取失败或无数据')
        # Pause between pages to avoid hammering the server.
        if page < pages:
            time.sleep(1)
    data = {
        'stock_code': code,
        'stock_name': name,
        'total_pages': pages,
        'total_posts': len(all_posts),
        'crawl_time': datetime.now().isoformat(),
        'posts': all_posts
    }
    return data
def save_to_json(data, name="", filename=None):
    """Write *data* to a UTF-8 JSON file and return the file name.

    When *filename* is not given, ``guba_<name>_<timestamp>.json`` in the
    working directory is used. Returns ``None`` (writing nothing) when
    *data* is falsy.
    """
    if not data:
        print('数据为空,无法保存')
        return None
    target = filename
    if not target:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        target = f'guba_{name}_{stamp}.json'
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(target, 'w', encoding='utf-8') as handle:
        handle.write(serialized)
    print(f'JSON数据已保存到: {target}')
    return target
def save_to_excel(data, name="", filename=None):
    """Flatten the posts in *data* into a table and write it as ``.xlsx``.

    Args:
        data: Dict containing a ``posts`` list of post dicts.
        name: Used in the generated file name when *filename* is omitted.
        filename: Target path; defaults to ``guba_<name>_<timestamp>.xlsx``.

    Returns:
        The file name written, or ``None`` when *data* is falsy or has no
        ``posts`` key.
    """
    if not data or 'posts' not in data:
        print('数据格式不正确,无法保存')
        return None
    records = []
    for post in data['posts'] or []:
        # `or {}` also covers keys present with a null value, which would
        # otherwise raise AttributeError on the chained .get().
        user = post.get('post_user') or {}
        guba = post.get('post_guba') or {}
        records.append({
            '帖子ID': post.get('post_id'),
            '标题': post.get('post_title'),
            '内容': post.get('post_content'),
            '作者': user.get('user_nickname'),
            '发布时间': post.get('post_publish_time'),
            '最后更新': post.get('post_last_time'),
            '阅读数': post.get('post_click_count'),
            '评论数': post.get('post_comment_count'),
            '点赞数': post.get('post_like_count'),
            '股吧': guba.get('stockbar_name'),
            '来源': post.get('post_from')
        })
    df = pd.DataFrame(records)
    if not filename:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'guba_{name}_{timestamp}.xlsx'
    df.to_excel(filename, index=False, engine='openpyxl')
    print(f'Excel数据已保存到: {filename}')
    return filename
if __name__ == '__main__':
    # Script entry point: crawl each game stock and store JSON + Excel
    # copies under data/.
    GAME_STOCKS = {
        '002624': '完美世界',
        '002555': '三七互娱',
        '002558': '巨人网络',
        '002602': '世纪华通',
        '300418': '昆仑万维',
        '002174': '游族网络',
        '300315': '掌趣科技',
        '603444': '吉比特',
    }
    os.makedirs('data', exist_ok=True)
    separator = "=" * 50
    for code, name in GAME_STOCKS.items():
        print(f'\n{separator}')
        print(f'开始爬取 {name} ({code})')
        print(f'{separator}')
        # Crawl 10 pages per stock
        data = fetch_stock_posts(code, name, pages=10)
        if not data or data['total_posts'] <= 0:
            print(f'{name} 爬取失败或无数据')
        else:
            print(f'\n共获取 {data["total_posts"]} 条帖子')
            json_filename = os.path.join('data', f'guba_{name}_{code}.json')
            save_to_json(data, name, json_filename)
            excel_filename = os.path.join('data', f'guba_{name}_{code}.xlsx')
            save_to_excel(data, name, excel_filename)
        # Pause between stocks
        time.sleep(2)
+296
View File
@@ -0,0 +1,296 @@
# 游戏股吧情感与话题分析报告
**报告日期**:2026-05-28
**分析范围**:完美世界、三七互娱、巨人网络、世纪华通、昆仑万维、游族网络、掌趣科技、吉比特
**数据来源**:东方财富网股吧
---
## 一、数据概述
本次分析共收集了8只游戏股票的股吧数据,每只股票200条帖子,总计1600条有效数据。
### 数据收集方法
- 使用网络爬虫从东方财富网股吧获取帖子
- 数据包括:帖子标题、内容、发布时间等
- 使用大连理工大学中文情感词汇本体进行情感分析
---
## 二、整体话题分析
### 整体词云
![整体词云](output/wordcloud_overall.png)
### 整体话题关键词
| 排名 | 关键词 | TF-IDF值 |
|------|--------|----------|
| 1 | 网络 sz | 0.0341 |
| 2 | 巨人 | 0.0235 |
| 3 | 世纪 华通 | 0.0215 |
| 4 | 昆仑 万维 | 0.0215 |
| 5 | 游族 | 0.0215 |
| 6 | 三七 互娱 | 0.0201 |
| 7 | 游戏 | 0.0199 |
| 8 | 掌趣 科技 | 0.0187 |
| 9 | 比特 sh | 0.0183 |
| 10 | 完美 世界 | 0.0174 |
### 整体热门话题
从整体词云可以看出,股吧讨论主要集中在:
1. **个股名称**:各股票名称是最热门的话题
2. **股票操作**:主力、涨停、下跌、出货、股价等
3. **市场情绪**:散户、大盘、投资等
---
## 三、各股票专题分析
### 1. 完美世界 (002624)
#### 词云分析
![完美世界词云](output/wordcloud_完美世界.png)
#### 关键词分析
- **异环**:指游戏《异环》相关讨论
- **流水**:游戏流水情况
- **版本**:游戏版本更新
- **安魂曲**:指游戏角色《安魂曲》
#### 情绪分析
- **平均情绪得分**:0.99(第二高)
- **正面帖子**:110条
- **负面帖子**:21条
- **中性帖子**:69条
**情绪倾向**:非常积极!完美世界的平均情绪得分在本次分析中排名第二,仅次于巨人网络。
---
### 2. 巨人网络 (002558)
#### 词云分析
![巨人网络词云](output/wordcloud_巨人网络.png)
#### 关键词分析
- **补仓**:投资者补仓操作
- **腰斩**:股价大幅下跌
- **跳水**:股价快速下跌
- **兄弟**:股吧常见称呼
#### 情绪分析
- **平均情绪得分**:1.11(最高)
- **正面帖子**:115条
- **负面帖子**:20条
- **中性帖子**:65条
**情绪倾向**:非常积极!虽然有"腰斩"、"跳水"等负面词汇,但整体情绪仍然很高。
---
### 3. 三七互娱 (002555)
#### 词云分析
![三七互娱词云](output/wordcloud_三七互娱.png)
#### 关键词分析
- **分红**:股票分红相关讨论
- **投资**:投资策略讨论
- **智谱**:可能指AI相关业务
- **AI**:人工智能话题
#### 情绪分析
- **平均情绪得分**:0.77
- **正面帖子**:72条
- **负面帖子**:39条
- **中性帖子**:89条
**情绪倾向**:积极!
---
### 4. 游族网络 (002174)
#### 词云分析
![游族网络词云](output/wordcloud_游族网络.png)
#### 关键词分析
- **三体**:《三体》IP相关讨论
- **死刑**、**执行**:与投毒案相关讨论
- **CEO**、**林奇**:公司高管相关
- **投毒**:历史事件回顾
#### 情绪分析
- **平均情绪得分**:0.68
- **正面帖子**:73条
- **负面帖子**:28条
- **中性帖子**:99条
**情绪倾向**:积极!虽然有历史负面事件,但当前情绪较好。
---
### 5. 世纪华通 (002602)
#### 词云分析
![世纪华通词云](output/wordcloud_世纪华通.png)
#### 关键词分析
- **调整**:股价调整
- **拉升**:股价拉升
- **索赔**:可能指投资者索赔
- **看好**:市场观点
#### 情绪分析
- **平均情绪得分**:0.48
- **正面帖子**:63条
- **负面帖子**:36条
- **中性帖子**:101条
**情绪倾向**:中性偏积极!
---
### 6. 昆仑万维 (300418)
#### 词云分析
![昆仑万维词云](output/wordcloud_昆仑万维.png)
#### 关键词分析
- **解禁**:股票解禁相关
- **员工**:员工持股等
- **短剧**:短剧业务
- **模型**:AI模型相关
#### 情绪分析
- **平均情绪得分**:0.30
- **正面帖子**:61条
- **负面帖子**:49条
- **中性帖子**:90条
**情绪倾向**:中性偏积极!
---
### 7. 掌趣科技 (300315)
#### 词云分析
![掌趣科技词云](output/wordcloud_掌趣科技.png)
#### 关键词分析
- **创业板**:创业板相关
- **退市**:退市风险讨论
- **垃圾**:负面评价
- **解套**:投资者解套需求
#### 情绪分析
- **平均情绪得分**:0.05
- **正面帖子**:44条
- **负面帖子**:47条
- **中性帖子**:109条
**情绪倾向**:中性!正负情绪基本持平。
---
### 8. 吉比特 (603444)
#### 词云分析
![吉比特词云](output/wordcloud_吉比特.png)
#### 关键词分析
- **分红**:分红讨论
- **业绩**:业绩讨论
- **价值投资**:投资理念
- **恶心**:负面情绪表达
#### 情绪分析
- **平均情绪得分**:0.05
- **正面帖子**:50条
- **负面帖子**:65条
- **中性帖子**:85条
**情绪倾向**:中性偏消极!负面帖子多于正面帖子。
---
## 四、情绪分析汇总
### 情绪得分对比
![情绪得分对比](sentiment_output/sentiment_score_comparison.png)
### 情绪分布
![情绪分布](sentiment_output/sentiment_distribution.png)
### 情绪类型分布
![情绪类型分布](sentiment_output/emotion_types.png)
### 各股票情绪得分排名
| 排名 | 股票名称 | 股票代码 | 平均情绪得分 | 情绪倾向 |
|------|----------|----------|--------------|----------|
| 1 | 巨人网络 | 002558 | 1.11 | 🔵 非常积极 |
| 2 | 完美世界 | 002624 | 0.99 | 🔵 非常积极 |
| 3 | 三七互娱 | 002555 | 0.77 | 🟢 积极 |
| 4 | 游族网络 | 002174 | 0.68 | 🟢 积极 |
| 5 | 世纪华通 | 002602 | 0.48 | 🟡 中性偏积极 |
| 6 | 昆仑万维 | 300418 | 0.30 | 🟡 中性偏积极 |
| 7 | 掌趣科技 | 300315 | 0.05 | 🟡 中性 |
| 8 | 吉比特 | 603444 | 0.05 | 🟡 中性偏消极 |
---
## 五、结论与建议
### 主要发现
1. **情绪分布**
- 整体来看,游戏股股吧情绪以中性和积极为主
- 巨人网络和完美世界情绪最积极
- 吉比特和掌趣科技情绪相对较低
2. **话题特点**
- 各股票的讨论主要围绕自身业务和股价
- 完美世界和巨人网络讨论中游戏内容较多
- 游族网络仍有较多历史事件相关讨论
3. **热门话题**
- 股价操作:涨停、下跌、出货、拉升
- 投资者行为:补仓、解套、分红
- 行业热点:AI、短剧、游戏流水
### 投资建议(仅供参考)
1. **情绪领先标的**
- 完美世界和巨人网络股吧情绪最为积极,可重点关注
- 关注其游戏业务进展和业绩情况
2. **风险提示**
- 吉比特和掌趣科技情绪相对较低,需注意风险
- 游族网络历史事件仍有一定影响
3. **持续关注**
- 昆仑万维的AI和短剧业务
- 三七互娱的分红和投资策略
---
## 附录
### 数据文件说明
- `data/`:原始爬取数据(JSON和Excel格式)
- `output/`:TF-IDF分析结果和词云图片
- `sentiment_output/`:情感分析结果和可视化图片
### 分析工具
- **爬虫**:Python + Requests
- **分词**:jieba
- **情感词典**:大连理工大学中文情感词汇本体
- **可视化**:Matplotlib + WordCloud
---
**报告生成时间**:2026-05-28
**分析工具**:自定义Python脚本
Binary file not shown.