ini
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
data/*
|
||||
output/*
|
||||
sentiment_output/*
|
||||
+347
@@ -0,0 +1,347 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import jieba
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from wordcloud import WordCloud
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
matplotlib.use('Agg') # 使用非交互式后端
|
||||
|
||||
# 中文停用词表
|
||||
STOPWORDS = {
    # Particles, pronouns and other high-frequency function words.
    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要',
    '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '吗', '吧', '呢', '啊', '呀',
    # Question words.
    '什么', '怎么', '为什么', '哪里', '谁', '多少', '如何',
    # Measure words and time words.
    '几', '个', '只', '条', '把', '本', '篇', '次', '天', '今天', '明天', '昨天',
    # Adverbs, conjunctions and prepositions.
    '又', '再', '还', '已经', '还是', '但是', '可是', '不过', '只是', '只有', '就是', '或者',
    '因为', '所以', '虽然', '如果', '那么', '这么', '跟', '与', '及', '或',
    # Demonstratives.
    '这个', '那个', '这些', '那些',
    # Forum boilerplate that appears on every page.
    '股吧', '东方财富', '帖子', '发表', '回复', '点击', '查看', '更多', '原文', '转发', '分享', '收藏', '评论', '点赞',
    # URL fragments.
    'http', 'https', 'com', 'cn', 'www', 'net', 'org',
}
|
||||
|
||||
def clean_text(text):
    """Normalize a raw post string before tokenization.

    Applies the substitutions below in order, then trims the result.
    Falsy input (``None``, ``""``) yields an empty string.
    """
    if not text:
        return ""

    # (pattern, replacement) pairs; order matters because later patterns
    # operate on the output of earlier ones.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),  # URLs
        (r'<.*?>', ''),                  # HTML tags
        (r'\[.*?\]', ''),                # bracketed emoticon codes
        (r'[^\w\s]', ''),                # anything that is not a word char or whitespace
        (r'\d+', ''),                    # digit runs
        (r'\s+', ' '),                   # collapse whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()
|
||||
|
||||
def tokenize(text):
    """Segment *text* with jieba and drop stop words and single-character tokens."""
    kept = []
    for token in jieba.lcut(text):
        # Skip one-character tokens and anything listed in STOPWORDS.
        if len(token) <= 1 or token in STOPWORDS:
            continue
        kept.append(token)
    return kept
|
||||
|
||||
def load_data(data_dir='data'):
    """Load every ``*.json`` stock file found directly under *data_dir*.

    Each file is read as a JSON object with ``stock_name``, ``stock_code``
    and ``posts`` keys; each post contributes its ``post_title`` and
    ``post_content``.

    Args:
        data_dir: Directory containing the crawled JSON files.

    Returns:
        A tuple ``(all_data, stock_info)``. ``all_data`` is a list of dicts
        with keys ``stock_code``, ``stock_name``, ``post_id``, ``text`` and
        ``clean_text``. ``stock_info`` maps stock code to
        ``{'name': ..., 'post_count': ...}``. Both are empty when the
        directory does not exist.
    """
    all_data = []
    stock_info = {}

    # isdir (not exists) so that a plain file with this name is rejected
    # here instead of failing inside os.listdir.
    if not os.path.isdir(data_dir):
        print(f'数据目录 {data_dir} 不存在')
        return all_data, stock_info

    # Sorted so that the row order of the result is reproducible.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(data_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            stock_name = data.get('stock_name', '未知')
            stock_code = data.get('stock_code', '未知')
            # `or []` also covers an explicit JSON null.
            posts = data.get('posts') or []

            stock_info[stock_code] = {
                'name': stock_name,
                'post_count': len(posts)
            }

            for post in posts:
                # A JSON null would otherwise be formatted as the literal
                # text "None" and pollute the corpus.
                content = post.get('post_content') or ''
                title = post.get('post_title') or ''
                full_text = f"{title} {content}".strip()
                if not full_text:
                    continue

                all_data.append({
                    'stock_code': stock_code,
                    'stock_name': stock_name,
                    'post_id': post.get('post_id'),
                    'text': full_text,
                    'clean_text': clean_text(full_text)
                })
        except (OSError, ValueError, AttributeError, TypeError) as e:
            # Best effort: an unreadable or malformed file is reported and
            # skipped so the remaining files are still loaded.
            print(f'加载文件 {filename} 失败: {e}')

    return all_data, stock_info
|
||||
|
||||
def calculate_tfidf(texts):
    """Fit a TF-IDF model on *texts*.

    Tokenization is delegated to ``tokenize``; unigrams and bigrams are
    kept, capped at 1000 features.

    Returns:
        ``(tfidf_matrix, feature_names, vectorizer)``.
    """
    options = {
        'tokenizer': tokenize,
        'token_pattern': None,
        'max_features': 1000,
        'ngram_range': (1, 2),
    }
    model = TfidfVectorizer(**options)
    matrix = model.fit_transform(texts)
    return matrix, model.get_feature_names_out(), model
|
||||
|
||||
def get_top_keywords(tfidf_matrix, feature_names, top_n=20):
    """Return up to *top_n* keywords ranked by mean TF-IDF weight.

    Multi-word features (n-grams joined by spaces) are preferred: once a
    feature is selected, any later feature sharing one of its
    space-separated parts is discarded.

    Args:
        tfidf_matrix: Document-term matrix; anything whose
            ``mean(axis=0)`` yields one value per feature.
        feature_names: Sequence of feature strings, indexed by column.
        top_n: Maximum number of keywords to return. Values ``<= 0``
            yield an empty list.

    Returns:
        List of ``{'word': ..., 'tfidf': ...}`` dicts sorted by
        descending ``tfidf``.
    """
    # Without this guard, top_n == 0 made the slice below `[-0:]`, which
    # selects every feature and then returned one keyword.
    if top_n <= 0:
        return []

    avg_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
    # Over-fetch (4x) because de-duplication below discards candidates.
    top_indices = avg_tfidf.argsort()[-top_n * 4:][::-1]

    candidates = []
    for idx in top_indices:
        word = feature_names[idx]
        if word.strip():
            candidates.append({
                'word': word,
                'tfidf': avg_tfidf[idx],
                'length': len(word.split()),  # number of tokens in the n-gram
            })

    # Longer n-grams first, then higher weight.
    candidates.sort(key=lambda c: (-c['length'], -c['tfidf']))

    keywords = []
    seen_parts = set()
    for candidate in candidates:
        parts = candidate['word'].split()
        # Skip any candidate overlapping an already selected keyword. A
        # separate "seen words" set is unnecessary: a repeated word would
        # overlap on all of its own parts.
        if seen_parts.intersection(parts):
            continue
        seen_parts.update(parts)
        keywords.append({'word': candidate['word'], 'tfidf': candidate['tfidf']})
        if len(keywords) >= top_n:
            break

    # Present the result by weight rather than by n-gram length.
    keywords.sort(key=lambda k: -k['tfidf'])
    return keywords
|
||||
|
||||
def get_stock_specific_keywords(all_data, stock_code, top_n=20):
    """Return keywords that distinguish one stock's posts from the rest.

    A TF-IDF model is fitted on all posts; features are ranked by the
    difference between their mean weight in this stock's posts and their
    mean weight in all other posts.

    Args:
        all_data: List of dicts with at least ``stock_code`` and
            ``clean_text`` keys.
        stock_code: Code of the stock to analyse.
        top_n: Maximum number of keywords to return. Values ``<= 0``
            yield an empty list.

    Returns:
        List of ``{'word', 'tfidf', 'diff'}`` dicts sorted by descending
        ``diff``; empty when the stock has fewer than 5 posts.
    """
    # Without this guard, top_n == 0 made the slice below `[-0:]`, which
    # selects every feature and then returned one keyword.
    if top_n <= 0:
        return []

    stock_texts = [d['clean_text'] for d in all_data if d['stock_code'] == stock_code]
    other_texts = [d['clean_text'] for d in all_data if d['stock_code'] != stock_code]

    if len(stock_texts) < 5:
        return []

    # This stock's rows come first so the matrix can be split by position.
    tfidf_matrix, feature_names, _ = calculate_tfidf(stock_texts + other_texts)
    split = len(stock_texts)

    avg_tfidf = np.asarray(tfidf_matrix[:split].mean(axis=0)).ravel()
    if other_texts:
        other_avg = np.asarray(tfidf_matrix[split:].mean(axis=0)).ravel()
        diff = avg_tfidf - other_avg
    else:
        diff = avg_tfidf

    # Over-fetch (4x) because de-duplication below discards candidates.
    top_indices = diff.argsort()[-top_n * 4:][::-1]

    candidates = []
    for idx in top_indices:
        word = feature_names[idx]
        if word.strip():
            candidates.append({
                'word': word,
                'tfidf': avg_tfidf[idx],
                'diff': diff[idx],
                'length': len(word.split()),  # number of tokens in the n-gram
            })

    # Longer n-grams first, then larger difference.
    candidates.sort(key=lambda c: (-c['length'], -c['diff']))

    keywords = []
    seen_parts = set()
    for candidate in candidates:
        parts = candidate['word'].split()
        # Skip any candidate overlapping an already selected keyword.
        if seen_parts.intersection(parts):
            continue
        seen_parts.update(parts)
        keywords.append({
            'word': candidate['word'],
            'tfidf': candidate['tfidf'],
            'diff': candidate['diff'],
        })
        if len(keywords) >= top_n:
            break

    keywords.sort(key=lambda k: -k['diff'])
    return keywords
|
||||
|
||||
def generate_wordcloud(keywords, stock_name, output_dir='output', font_path=None):
    """Render a word cloud PNG from keyword weights.

    Args:
        keywords: List of dicts with ``word`` and ``tfidf`` keys.
        stock_name: Used in the output file name
            (``wordcloud_<stock_name>.png``).
        output_dir: Directory to write into; created if missing.
        font_path: Font file able to render Chinese. When ``None``, the
            first existing entry of a short list of common font locations
            is used, falling back to the Windows SimHei path.

    Returns:
        Path of the written image, or ``None`` when *keywords* is empty.
    """
    # WordCloud cannot lay out zero words; skip instead of raising.
    if not keywords:
        print(f'没有关键词,跳过词云生成: {stock_name}')
        return None

    if font_path is None:
        # Probed in order; a path that does not exist is simply skipped.
        font_candidates = (
            'C:/Windows/Fonts/simhei.ttf',
            'C:/Windows/Fonts/msyh.ttc',
            '/System/Library/Fonts/PingFang.ttc',
            '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
            '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
        )
        font_path = next(
            (path for path in font_candidates if os.path.exists(path)),
            font_candidates[0],
        )

    os.makedirs(output_dir, exist_ok=True)

    word_freq = {k['word']: k['tfidf'] for k in keywords}

    wc = WordCloud(
        font_path=font_path,
        width=800,
        height=600,
        background_color='white',
        max_words=100
    )
    wc.generate_from_frequencies(word_freq)

    output_path = os.path.join(output_dir, f'wordcloud_{stock_name}.png')
    wc.to_file(output_path)
    print(f'词云已保存到: {output_path}')

    return output_path
|
||||
|
||||
def analyze_all():
    """Run the TF-IDF pipeline end to end and write results under ``output/``.

    Steps: load posts, compute overall keywords, compute per-stock
    keywords, write a summary report, and dump the pre-processed posts.
    Returns early, after printing a notice, when no posts are loaded.
    """
    banner = '=' * 60
    print(banner)
    print('股吧数据 TF-IDF 分析')
    print(banner)

    os.makedirs('output', exist_ok=True)

    # Step 1: load the crawled posts.
    print('\n[1/5] 加载数据...')
    all_data, stock_info = load_data()
    if not all_data:
        print('没有找到数据,请先运行爬虫')
        return

    print(f' 共加载 {len(all_data)} 条帖子')
    print(f' 涉及 {len(stock_info)} 只股票:')
    for code, info in stock_info.items():
        print(f' - {info["name"]} ({code}): {info["post_count"]} 条')

    # Step 2: keywords over the whole corpus.
    print('\n[2/5] 整体关键词分析...')
    corpus = [row['clean_text'] for row in all_data]
    tfidf_matrix, feature_names, vectorizer = calculate_tfidf(corpus)
    overall_keywords = get_top_keywords(tfidf_matrix, feature_names, top_n=30)

    print('\n 整体Top 20关键词:')
    for rank, kw in enumerate(overall_keywords[:20], 1):
        print(f' {rank:2d}. {kw["word"]:10s} (TF-IDF: {kw["tfidf"]:.4f})')

    pd.DataFrame(overall_keywords).to_csv(
        'output/overall_keywords.csv', index=False, encoding='utf-8-sig'
    )
    generate_wordcloud(overall_keywords, 'overall')

    # Step 3: keywords per stock.
    print('\n[3/5] 各股票关键词分析...')
    stock_keywords = {}
    for stock_code, info in stock_info.items():
        stock_name = info['name']
        print(f'\n 分析 {stock_name} ({stock_code})...')

        keywords = get_stock_specific_keywords(all_data, stock_code, top_n=20)
        stock_keywords[stock_code] = keywords
        if not keywords:
            continue

        print(' Top 10关键词:')
        for rank, kw in enumerate(keywords[:10], 1):
            print(f' {rank:2d}. {kw["word"]:10s} (TF-IDF: {kw["tfidf"]:.4f}, 差值: {kw["diff"]:.4f})')

        generate_wordcloud(keywords, stock_name)
        pd.DataFrame(keywords).to_csv(
            f'output/keywords_{stock_name}.csv', index=False, encoding='utf-8-sig'
        )

    # Step 4: one summary row per stock.
    print('\n[4/5] 生成汇总报告...')
    report_data = [
        {
            '股票代码': stock_code,
            '股票名称': stock_info[stock_code]['name'],
            '帖子数量': stock_info[stock_code]['post_count'],
            'Top5关键词': ', '.join(k['word'] for k in keywords[:5]),
        }
        for stock_code, keywords in stock_keywords.items()
    ]
    pd.DataFrame(report_data).to_csv(
        'output/summary_report.csv', index=False, encoding='utf-8-sig'
    )
    print(' 汇总报告已保存到: output/summary_report.csv')

    # Step 5: dump the pre-processed posts.
    print('\n[5/5] 保存预处理数据...')
    pd.DataFrame(all_data).to_csv(
        'output/all_posts.csv', index=False, encoding='utf-8-sig'
    )
    print(' 所有帖子已保存到: output/all_posts.csv')

    print('\n' + banner)
    print('分析完成!结果保存在 output/ 目录中')
    print(banner)
|
||||
|
||||
# Script entry point: run the full TF-IDF analysis when executed directly.
if __name__ == "__main__":
    analyze_all()
|
||||
@@ -0,0 +1,9 @@
|
||||
requests>=2.28.0
|
||||
pandas>=2.0.0
|
||||
openpyxl>=3.1.0
|
||||
jieba>=0.42.1
|
||||
scikit-learn>=1.3.0
|
||||
numpy>=1.24.0
|
||||
matplotlib>=3.7.0
|
||||
seaborn>=0.12.0
|
||||
wordcloud>=1.9.0
|
||||
@@ -0,0 +1,409 @@
|
||||
import pandas as pd
|
||||
import jieba
|
||||
import time
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
|
||||
# 设置中文字体
|
||||
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
|
||||
plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题
|
||||
|
||||
# ============================================================
|
||||
# 第一部分:构建情感词典
|
||||
# ============================================================
|
||||
|
||||
def build_sentiment_dictionary():
    """Build the sentiment lexicon from the DUTIR affective ontology workbook.

    Reads ``大连理工大学中文情感词汇本体.xlsx`` from the working directory,
    groups words into seven emotion lists by their ``情感分类`` code, and
    appends stock-market vocabulary to ``Good`` and ``Disgust``. On any
    failure it falls back to ``build_simplified_dictionary()``.

    Returns:
        Dict with list values under the keys ``Happy``, ``Good``,
        ``Surprise``, ``Anger``, ``Sad``, ``Fear``, ``Disgust``,
        ``Positive`` (Happy + Good + Surprise) and ``Negative``
        (Anger + Sad + Fear + Disgust).
    """
    dict_path = '大连理工大学中文情感词汇本体.xlsx'

    # Emotion list -> ontology category codes that feed it.
    category_codes = {
        'Happy': ['PA', 'PE'],
        'Good': ['PD', 'PH', 'PG', 'PB', 'PK'],
        'Surprise': ['PC'],
        'Anger': ['NA'],
        'Sad': ['NB', 'NJ', 'NH', 'PF'],
        'Fear': ['NI', 'NC', 'NG'],
        'Disgust': ['NE', 'ND', 'NN', 'NK', 'NL'],
    }

    try:
        # keep_default_na=False is essential: with pandas' default NA
        # parsing the category code "NA" (anger) is read as NaN, which
        # silently leaves the Anger list empty.
        df = pd.read_excel(dict_path, keep_default_na=False)
        df = df[['词语', '词性种类', '词义数', '词义序号', '情感分类', '强度', '极性']]

        # Tolerate stray whitespace around the category codes.
        codes = df['情感分类'].astype(str).str.strip()
        # Empty cells are '' under keep_default_na=False; drop them.
        has_word = df['词语'].astype(str).str.strip() != ''

        lexicon = {
            name: df.loc[codes.isin(wanted) & has_word, '词语'].tolist()
            for name, wanted in category_codes.items()
        }

        # Stock-specific vocabulary that the general ontology lacks.
        stock_positive = ['涨', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '赚',
                          '利好', '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破',
                          '创新', '优势', '超预期', '亮眼', '惊艳', '奇迹']
        stock_negative = ['跌', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '赔', '损失',
                          '套牢', '垃圾', '恶心', '坑爹', '骗局', '雷', '爆雷', '崩盘', '退市']
        lexicon['Good'].extend(stock_positive)
        lexicon['Disgust'].extend(stock_negative)

        lexicon['Positive'] = lexicon['Happy'] + lexicon['Good'] + lexicon['Surprise']
        lexicon['Negative'] = (lexicon['Anger'] + lexicon['Sad']
                               + lexicon['Fear'] + lexicon['Disgust'])

        print('大连理工大学情感词典加载完成')
        print(f'正面情感词: {len(lexicon["Positive"])}个')
        print(f'负面情感词: {len(lexicon["Negative"])}个')

        return lexicon

    except Exception as e:
        # Deliberate best effort: a missing file, missing column or missing
        # Excel engine falls back to the built-in lexicon.
        print(f'加载大连理工大学情感词典失败: {e}')
        print('使用简化版情感词典')
        return build_simplified_dictionary()
|
||||
|
||||
def build_simplified_dictionary():
    """Return the built-in fallback sentiment lexicon.

    The result has seven emotion word lists plus ``Positive``
    (Happy + Good + Surprise) and ``Negative``
    (Anger + Sad + Fear + Disgust).
    """
    lexicon = {
        # --- positive emotions ---
        'Happy': [
            '开心', '快乐', '高兴', '喜悦', '愉快', '欣喜', '欢乐', '欢喜', '幸福',
            '满意', '满足', '欣慰', '愉悦', '畅快', '乐观', '积极', '美好', '成功',
        ],
        'Good': [
            '好', '优秀', '出色', '精彩', '卓越', '杰出', '优良', '良好', '完美', '不错',
            '涨', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '赚', '利好',
            '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破', '创新', '优势',
        ],
        'Surprise': [
            '惊喜', '意外', '震惊', '惊讶', '震撼', '神奇', '奇迹', '惊艳', '亮眼', '超预期',
        ],
        # --- negative emotions ---
        'Anger': [
            '愤怒', '生气', '恼火', '气愤', '暴怒', '愤慨', '愤恨', '震怒', '发怒',
            '骂', '垃圾', '恶心', '坑爹', '骗局', '欺骗', '欺诈', '造假', '腐败', '黑暗',
        ],
        'Sad': [
            '伤心', '难过', '悲伤', '痛苦', '悲哀', '沮丧', '失望', '绝望', '低落', '悲观',
            '跌', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '赔', '损失', '套牢',
        ],
        'Fear': [
            '害怕', '恐惧', '担心', '担忧', '恐慌', '不安', '焦虑', '忧虑', '紧张', '恐怖',
            '风险', '危机', '危险', '下跌', '暴跌', '崩盘', '退市', '爆雷', '雷', '怕',
        ],
        'Disgust': [
            '厌恶', '恶心', '反感', '讨厌', '鄙视', '唾弃', '不屑', '蔑视', '嫌弃',
            '垃圾', '废物', '不行', '差劲', '差', '烂', '渣', '骗局',
        ],
    }

    # Polarity lists are concatenations of the emotion lists.
    lexicon['Positive'] = lexicon['Happy'] + lexicon['Good'] + lexicon['Surprise']
    lexicon['Negative'] = (lexicon['Anger'] + lexicon['Sad']
                           + lexicon['Fear'] + lexicon['Disgust'])

    print('简化版情感词典构建完成')
    print(f'正面情感词: {len(lexicon["Positive"])}个')
    print(f'负面情感词: {len(lexicon["Negative"])}个')

    return lexicon
|
||||
|
||||
# ============================================================
|
||||
# 第二部分:情绪计算函数
|
||||
# ============================================================
|
||||
|
||||
def emotion_caculate(text, sentiment_dict):
    """Count lexicon hits in one text.

    The text is segmented with jieba; every token occurrence that appears
    in a lexicon category adds one to that category's counter.

    Args:
        text: Post text. Falsy or NA values are treated as ``''``.
        sentiment_dict: Mapping with the keys ``Positive``, ``Negative``,
            ``Anger``, ``Disgust``, ``Fear``, ``Sad``, ``Surprise``,
            ``Good`` and ``Happy``, each a container of words.

    Returns:
        ``pandas.Series`` indexed by ``length`` (token count),
        ``positive``, ``negative``, ``anger``, ``disgust``, ``fear``,
        ``sadness``, ``surprise``, ``good``, ``happy`` and
        ``sentiment_score`` (``positive - negative``).
    """
    if not text or pd.isna(text):
        text = ''

    wordlist = jieba.lcut(text)

    # One pass to get per-token frequencies; calling wordlist.count() for
    # every distinct token was quadratic in the text length.
    token_counts = {}
    for word in wordlist:
        token_counts[word] = token_counts.get(word, 0) + 1

    # (result field, lexicon key) pairs.
    categories = (
        ('positive', 'Positive'),
        ('negative', 'Negative'),
        ('anger', 'Anger'),
        ('disgust', 'Disgust'),
        ('fear', 'Fear'),
        ('sadness', 'Sad'),
        ('surprise', 'Surprise'),
        ('good', 'Good'),
        ('happy', 'Happy'),
    )
    totals = {field: 0 for field, _ in categories}

    # NOTE(review): if the lexicon values are lists, each `in` below is a
    # linear scan; callers could pass sets for faster lookups.
    for word, freq in token_counts.items():
        for field, key in categories:
            if word in sentiment_dict[key]:
                totals[field] += freq

    emotion_info = {'length': len(wordlist)}
    emotion_info.update(totals)
    # When both counts are zero the difference is already 0, so no special
    # case is needed.
    emotion_info['sentiment_score'] = totals['positive'] - totals['negative']

    indexs = ['length', 'positive', 'negative', 'anger', 'disgust', 'fear',
              'sadness', 'surprise', 'good', 'happy', 'sentiment_score']

    return pd.Series(emotion_info, index=indexs)
|
||||
|
||||
# ============================================================
|
||||
# 第三部分:数据加载与分析
|
||||
# ============================================================
|
||||
|
||||
def load_and_analyze_data(data_dir='data', output_dir='sentiment_output'):
    """Score every ``guba_*.json`` file in *data_dir* and write the results.

    For each file a per-post CSV (``sentiment_<name>_<code>.csv``) is
    written to *output_dir*. When at least one stock was processed, a
    summary CSV is written and ``generate_visualizations`` is called.

    Args:
        data_dir: Directory containing the crawled JSON files.
        output_dir: Directory for CSV and chart output; created if missing.

    Returns:
        ``(all_results, stock_emotions)``: a list of per-stock result
        DataFrames and a dict mapping stock code to its statistics. Both
        are empty when *data_dir* does not exist.
    """
    # Fail soft on a missing input directory instead of raising from
    # os.listdir below.
    if not os.path.isdir(data_dir):
        print(f'数据目录 {data_dir} 不存在')
        return [], {}

    def _as_text(value):
        """Return *value* as text, mapping None/NaN to an empty string."""
        if isinstance(value, str):
            return value
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    os.makedirs(output_dir, exist_ok=True)

    sentiment_dict = build_sentiment_dictionary()

    all_results = []
    stock_emotions = {}

    # Sorted so that processing order is reproducible.
    for filename in sorted(os.listdir(data_dir)):
        if not (filename.endswith('.json') and filename.startswith('guba_')):
            continue

        filepath = os.path.join(data_dir, filename)
        print(f'\n正在分析: {filename}')

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            stock_name = data.get('stock_name', '未知')
            stock_code = data.get('stock_code', '未知')
            posts = data.get('posts', [])

            if not posts:
                print(f' 无数据,跳过')
                continue

            df = pd.DataFrame(posts)

            # Title and content joined by one space. Missing values must
            # not be formatted as "None"/"nan", which would be scored as
            # if they were real words.
            df['full_text'] = df.apply(
                lambda x: f"{_as_text(x.get('post_title', ''))} {_as_text(x.get('post_content', ''))}",
                axis=1
            )

            print(f' 开始分析 {len(df)} 条帖子...')
            start = time.time()

            emotion_df = df['full_text'].apply(
                lambda x: emotion_caculate(x, sentiment_dict)
            )

            end = time.time()
            print(f' 分析完成,耗时: {end - start:.2f}秒')

            # Side by side: original post columns + emotion counters.
            result_df = pd.concat([df, emotion_df], axis=1)

            output_file = os.path.join(output_dir, f'sentiment_{stock_name}_{stock_code}.csv')
            result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
            print(f' 结果已保存到: {output_file}')

            score = result_df['sentiment_score']
            stock_stats = {
                'stock_code': stock_code,
                'stock_name': stock_name,
                'total_posts': len(result_df),
                'avg_positive': result_df['positive'].mean(),
                'avg_negative': result_df['negative'].mean(),
                'avg_sentiment_score': score.mean(),
                'positive_posts': (score > 0).sum(),
                'negative_posts': (score < 0).sum(),
                'neutral_posts': (score == 0).sum(),
                'total_anger': result_df['anger'].sum(),
                'total_sadness': result_df['sadness'].sum(),
                'total_fear': result_df['fear'].sum(),
                'total_disgust': result_df['disgust'].sum(),
                'total_good': result_df['good'].sum(),
                'total_happy': result_df['happy'].sum(),
                'total_surprise': result_df['surprise'].sum()
            }

            stock_emotions[stock_code] = stock_stats
            all_results.append(result_df)

            print(f'\n {stock_name} 情绪分析统计:')
            print(f' 平均情绪得分: {stock_stats["avg_sentiment_score"]:.2f}')
            print(f' 正面帖子: {stock_stats["positive_posts"]}')
            print(f' 负面帖子: {stock_stats["negative_posts"]}')
            print(f' 中性帖子: {stock_stats["neutral_posts"]}')

            # Show the extremes as a quick sanity check of the scoring.
            top_positive = result_df.nlargest(1, 'sentiment_score').iloc[0]
            print(f' 最正面帖子: {top_positive["full_text"][:50]}...')

            top_negative = result_df.nsmallest(1, 'sentiment_score').iloc[0]
            print(f' 最负面帖子: {top_negative["full_text"][:50]}...')

        except Exception as e:
            # Deliberate best effort: one bad file is reported and skipped
            # so the remaining stocks are still analysed.
            print(f' 分析失败: {e}')

    if stock_emotions:
        summary_df = pd.DataFrame(list(stock_emotions.values()))
        summary_file = os.path.join(output_dir, 'sentiment_summary.csv')
        summary_df.to_csv(summary_file, index=False, encoding='utf-8-sig')
        print(f'\n总体统计已保存到: {summary_file}')

        generate_visualizations(summary_df, stock_emotions, output_dir)

    return all_results, stock_emotions
|
||||
|
||||
# ============================================================
|
||||
# 第四部分:可视化
|
||||
# ============================================================
|
||||
|
||||
def generate_visualizations(summary_df, stock_emotions, output_dir):
    """Write three comparison charts as PNG files into *output_dir*.

    * ``sentiment_score_comparison.png`` - bar chart of the average score.
    * ``sentiment_distribution.png`` - one pie per stock
      (positive / negative / neutral post counts).
    * ``emotion_types.png`` - grouped bars of each emotion's share.

    The layout adapts to the number of stocks; for eight stocks it is a
    2x4 pie grid with bar width 0.1.

    Args:
        summary_df: DataFrame with ``stock_name`` and
            ``avg_sentiment_score`` columns.
        stock_emotions: Mapping of stock code to a statistics dict with
            ``stock_name``, ``*_posts`` and ``total_*`` keys.
        output_dir: Existing directory to save the images into.
    """
    stock_count = len(stock_emotions)
    if stock_count == 0:
        # Nothing to plot; an empty grid would fail in plt.subplots.
        return

    # 1. Average sentiment score per stock.
    plt.figure(figsize=(12, 6))
    colors = ['green' if x >= 0 else 'red' for x in summary_df['avg_sentiment_score']]
    plt.bar(summary_df['stock_name'], summary_df['avg_sentiment_score'], color=colors, alpha=0.7)
    plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    plt.title('各股票平均情绪得分对比', fontsize=14)
    plt.xlabel('股票名称', fontsize=12)
    plt.ylabel('平均情绪得分', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_score_comparison.png'), dpi=300)
    plt.close()

    # 2. Positive / negative / neutral split, one pie per stock.
    # Grid sized from the stock count so no stock is dropped.
    cols = min(4, stock_count)
    rows = -(-stock_count // cols)  # ceiling division
    # squeeze=False keeps `axes` a 2-D array even for a single subplot.
    fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 5 * rows), squeeze=False)
    axes = axes.flatten()

    for idx, (stock_code, stats) in enumerate(stock_emotions.items()):
        labels = ['正面', '负面', '中性']
        sizes = [stats['positive_posts'], stats['negative_posts'], stats['neutral_posts']]
        colors = ['green', 'red', 'gray']

        axes[idx].pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
        axes[idx].set_title(f'{stats["stock_name"]} 情绪分布')

    # Hide grid cells that have no stock instead of drawing empty frames.
    for unused in axes[stock_count:]:
        unused.set_visible(False)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_distribution.png'), dpi=300)
    plt.close()

    # 3. Share of each emotion type per stock.
    plt.figure(figsize=(14, 7))
    emotions = ['total_good', 'total_happy', 'total_surprise',
                'total_anger', 'total_sadness', 'total_fear', 'total_disgust']
    emotion_names = ['好评', '快乐', '惊讶', '愤怒', '悲伤', '恐惧', '厌恶']

    x = range(len(emotion_names))
    # Each group of bars spans 0.8 of a unit on the x axis.
    width = 0.8 / stock_count

    for idx, (stock_code, stats) in enumerate(stock_emotions.items()):
        values = [stats[e] for e in emotions]
        total = sum(values)
        if total > 0:
            values = [v / total * 100 for v in values]
        plt.bar([xi + width * idx for xi in x], values, width, label=stats['stock_name'])

    plt.xlabel('情绪类型', fontsize=12)
    plt.ylabel('占比 (%)', fontsize=12)
    plt.title('各股票情绪类型分布', fontsize=14)
    # Centre each tick label under its group of bars.
    plt.xticks([xi + width * (stock_count - 1) / 2 for xi in x], emotion_names)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'emotion_types.png'), dpi=300, bbox_inches='tight')
    plt.close()

    print(f'可视化图表已生成到 {output_dir}')
|
||||
|
||||
# ============================================================
|
||||
# 主程序
|
||||
# ============================================================
|
||||
|
||||
# Script entry point: run the sentiment analysis with default directories.
if __name__ == "__main__":
    banner = '=' * 60

    print(banner)
    print('股吧数据情绪分析')
    print(banner)

    all_results, stock_emotions = load_and_analyze_data()

    print('\n' + banner)
    print('情绪分析完成!')
    print(banner)
|
||||
@@ -0,0 +1,187 @@
|
||||
import requests
|
||||
import pandas as pd
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime
|
||||
import os
|
||||
|
||||
def fetch_guba_data(code='gssz', page=1, page_size=20, sort_type=1, timeout=10):
    """Fetch one page of the post list for a stock board.

    Sends a form-encoded POST to the mobile Guba ``GetData.aspx``
    endpoint, proxying the ``WebArticleList`` API path.

    Args:
        code: Board / stock code placed in the ``code`` query field.
        page: Page number (``p``).
        page_size: Posts per page (``ps``).
        sort_type: Value of the ``sorttype`` field.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawler indefinitely.

    Returns:
        The decoded JSON response, or ``None`` when the request fails,
        the server returns an error status, or the body is not JSON.
    """
    url = 'https://mguba.eastmoney.com/mguba2020/interface/GetData.aspx'

    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'DNT': '1',
        'Origin': 'https://mguba.eastmoney.com',
        'Pragma': 'no-cache',
        'Referer': f'https://mguba.eastmoney.com/mguba/list/{code}_{page}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Mobile Safari/537.36 Edg/148.0.0.0',
        'sec-ch-ua': '"Chromium";v="148", "Microsoft Edge";v="148", "Not/A)Brand";v="99"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"'
    }

    # NOTE(review): these look like cookies captured from one browser
    # session; confirm whether they are required and whether they should
    # live in source control.
    cookies = {
        'qgqp_b_id': '30059d8839ad5c045fa8856e38013e9c',
        'st_nvi': 'XwpSfYXGjCxfCdbgapK5_cac4',
        'nid18': '0daec1df8064f04edd20b4e69250a8f5',
        'nid18_create_time': '1776263017375',
        'gviem': 'UrMH_tSu1UpW8B_TKmytl803f',
        'gviem_create_time': '1776263017375',
        'fullscreengg': '1',
        'fullscreengg2': '1',
        'st_si': '17952715731426',
        'show_app_box_time': '1779903756410',
        'st_pvi': '26838250597806',
        'st_sp': '2026-04-15 22:23:37',
        'st_inirUrl': 'https://cn.bing.com/',
        'st_sn': '30',
        'st_psi': '20260528025236177-117016304298-3040545697',
        'ad_tc_load_num': '3',
        'st_asi': '20260528025236177-117016304298-3040545697-ad.djxd-1'
    }

    # The API arguments travel as one query-string-like form field.
    param = f'code={code}&p={page}&ps={page_size}&sorttype={sort_type}'
    data = {
        'param': param,
        'plat': 'wap',
        'version': '200',
        'path': '/webarticlelist/api/Article/WebArticleList',
        'env': '1',
        'origin': '',
        'ctoken': '',
        'utoken': ''
    }

    try:
        response = requests.post(url, headers=headers, cookies=cookies,
                                 data=data, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f'请求失败: {e}')
        return None
|
||||
|
||||
def fetch_stock_posts(code, name, pages=10, page_size=20):
    """Crawl several pages of posts for one stock.

    Pages ``1..pages`` are requested through ``fetch_guba_data`` with a
    one-second pause between requests.

    Args:
        code: Stock code.
        name: Stock display name (used for logging and stored in the result).
        pages: Number of pages to request.
        page_size: Posts per page.

    Returns:
        Dict with ``stock_code``, ``stock_name``, ``total_pages``,
        ``total_posts``, ``crawl_time`` (ISO timestamp) and ``posts``.
    """
    all_posts = []

    for page in range(1, pages + 1):
        print(f'正在爬取 {name} ({code}) - 第 {page}/{pages} 页')
        result = fetch_guba_data(code=code, page=page, page_size=page_size)

        if result and 're' in result:
            # The 're' field can be present but null; extending with None
            # would raise TypeError and abort the whole crawl.
            posts = result['re'] or []
            all_posts.extend(posts)
            print(f' 成功获取 {len(posts)} 条帖子')
        else:
            print(f' 第 {page} 页获取失败或无数据')

        # Throttle between pages; no need to wait after the last one.
        if page < pages:
            time.sleep(1)

    data = {
        'stock_code': code,
        'stock_name': name,
        'total_pages': pages,
        'total_posts': len(all_posts),
        'crawl_time': datetime.now().isoformat(),
        'posts': all_posts
    }

    return data
|
||||
|
||||
def save_to_json(data, name="", filename=None):
    """Write *data* to a UTF-8 JSON file and return the file name.

    When *filename* is not given, ``guba_<name>_<timestamp>.json`` is
    used. Returns ``None`` without writing when *data* is falsy.
    """
    if not data:
        print('数据为空,无法保存')
        return None

    # Fall back to a timestamped name in the current directory.
    target = filename or f"guba_{name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(target, 'w', encoding='utf-8') as out:
        json.dump(data, out, ensure_ascii=False, indent=2)

    print(f'JSON数据已保存到: {target}')
    return target
|
||||
|
||||
def save_to_excel(data, name="", filename=None):
    """Flatten the crawled posts into a table and save it as ``.xlsx``.

    Args:
        data: Dict with a ``posts`` list, as produced by
            ``fetch_stock_posts``.
        name: Stock name used in the default file name.
        filename: Target path; defaults to
            ``guba_<name>_<timestamp>.xlsx``.

    Returns:
        The written file name, or ``None`` when *data* is falsy or has no
        ``posts`` key.
    """
    if not data or 'posts' not in data:
        print('数据格式不正确,无法保存')
        return None

    records = []
    for post in data['posts'] or []:
        # Nested objects may be present but null; `.get(key, {})` only
        # covers a missing key, so normalise explicitly.
        user = post.get('post_user') or {}
        guba = post.get('post_guba') or {}
        records.append({
            '帖子ID': post.get('post_id'),
            '标题': post.get('post_title'),
            '内容': post.get('post_content'),
            '作者': user.get('user_nickname'),
            '发布时间': post.get('post_publish_time'),
            '最后更新': post.get('post_last_time'),
            '阅读数': post.get('post_click_count'),
            '评论数': post.get('post_comment_count'),
            '点赞数': post.get('post_like_count'),
            '股吧': guba.get('stockbar_name'),
            '来源': post.get('post_from')
        })

    df = pd.DataFrame(records)

    if not filename:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'guba_{name}_{timestamp}.xlsx'

    df.to_excel(filename, index=False, engine='openpyxl')
    print(f'Excel数据已保存到: {filename}')
    return filename
|
||||
|
||||
# Script entry point: crawl every game stock and store JSON + Excel copies.
if __name__ == "__main__":
    # Stock code -> display name.
    GAME_STOCKS = {
        '002624': '完美世界',
        '002555': '三七互娱',
        '002558': '巨人网络',
        '002602': '世纪华通',
        '300418': '昆仑万维',
        '002174': '游族网络',
        '300315': '掌趣科技',
        '603444': '吉比特',
    }

    os.makedirs('data', exist_ok=True)

    separator = "=" * 50
    for code, name in GAME_STOCKS.items():
        print(f'\n{separator}')
        print(f'开始爬取 {name} ({code})')
        print(f'{separator}')

        # Ten pages per stock.
        data = fetch_stock_posts(code, name, pages=10)

        if not data or data['total_posts'] <= 0:
            print(f'{name} 爬取失败或无数据')
        else:
            print(f'\n共获取 {data["total_posts"]} 条帖子')

            # Both outputs share the same base name under data/.
            base_name = f'guba_{name}_{code}'
            save_to_json(data, name, os.path.join('data', base_name + '.json'))
            save_to_excel(data, name, os.path.join('data', base_name + '.xlsx'))

        # Pause between stocks to avoid hammering the server.
        time.sleep(2)
|
||||
@@ -0,0 +1,296 @@
|
||||
# 游戏股吧情感与话题分析报告
|
||||
|
||||
**报告日期**:2026-05-28
|
||||
**分析范围**:完美世界、三七互娱、巨人网络、世纪华通、昆仑万维、游族网络、掌趣科技、吉比特
|
||||
**数据来源**:东方财富网股吧
|
||||
|
||||
---
|
||||
|
||||
## 一、数据概述
|
||||
|
||||
本次分析共收集了8只游戏股票的股吧数据,每只股票200条帖子,总计1600条有效数据。
|
||||
|
||||
### 数据收集方法
|
||||
- 使用网络爬虫从东方财富网股吧获取帖子
|
||||
- 数据包括:帖子标题、内容、发布时间等
|
||||
- 使用大连理工大学中文情感词汇本体进行情感分析
|
||||
|
||||
---
|
||||
|
||||
## 二、整体话题分析
|
||||
|
||||
### 整体词云
|
||||

|
||||
|
||||
### 整体话题关键词
|
||||
| 排名 | 关键词 | TF-IDF值 |
|
||||
|------|--------|----------|
|
||||
| 1 | 网络 sz | 0.0341 |
|
||||
| 2 | 巨人 | 0.0235 |
|
||||
| 3 | 世纪 华通 | 0.0215 |
|
||||
| 4 | 昆仑 万维 | 0.0215 |
|
||||
| 5 | 游族 | 0.0215 |
|
||||
| 6 | 三七 互娱 | 0.0201 |
|
||||
| 7 | 游戏 | 0.0199 |
|
||||
| 8 | 掌趣 科技 | 0.0187 |
|
||||
| 9 | 比特 sh | 0.0183 |
|
||||
| 10 | 完美 世界 | 0.0174 |
|
||||
|
||||
### 整体热门话题
|
||||
从整体词云可以看出,股吧讨论主要集中在:
|
||||
1. **个股名称**:各股票名称是最热门的话题
|
||||
2. **股票操作**:主力、涨停、下跌、出货、股价等
|
||||
3. **市场情绪**:散户、大盘、投资等
|
||||
|
||||
---
|
||||
|
||||
## 三、各股票专题分析
|
||||
|
||||
### 1. 完美世界 (002624)
|
||||
|
||||
#### 词云分析
|
||||

|
||||
|
||||
#### 关键词分析
|
||||
- **异环**:指游戏《异环》相关讨论
|
||||
- **流水**:游戏流水情况
|
||||
- **版本**:游戏版本更新
|
||||
- **安魂曲**:指游戏角色《安魂曲》
|
||||
|
||||
#### 情绪分析
|
||||
- **平均情绪得分**:0.99(第二高,仅次于巨人网络的1.11)
|
||||
- **正面帖子**:110条
|
||||
- **负面帖子**:21条
|
||||
- **中性帖子**:69条
|
||||
|
||||
**情绪倾向**:非常积极!完美世界的平均情绪得分在本次分析中排名第二,仅次于巨人网络。
|
||||
|
||||
---
|
||||
|
||||
### 2. 巨人网络 (002558)
|
||||
|
||||
#### 词云分析
|
||||

|
||||
|
||||
#### 关键词分析
|
||||
- **补仓**:投资者补仓操作
|
||||
- **腰斩**:股价大幅下跌
|
||||
- **跳水**:股价快速下跌
|
||||
- **兄弟**:股吧常见称呼
|
||||
|
||||
#### 情绪分析
|
||||
- **平均情绪得分**:1.11(最高)
|
||||
- **正面帖子**:115条
|
||||
- **负面帖子**:20条
|
||||
- **中性帖子**:65条
|
||||
|
||||
**情绪倾向**:非常积极!虽然有"腰斩"、"跳水"等负面词汇,但整体情绪仍然很高。
|
||||
|
||||
---
|
||||
|
||||
### 3. 三七互娱 (002555)
|
||||
|
||||
#### 词云分析
|
||||

|
||||
|
||||
#### 关键词分析
|
||||
- **分红**:股票分红相关讨论
|
||||
- **投资**:投资策略讨论
|
||||
- **智谱**:可能指AI相关业务
|
||||
- **AI**:人工智能话题
|
||||
|
||||
#### 情绪分析
|
||||
- **平均情绪得分**:0.77
|
||||
- **正面帖子**:72条
|
||||
- **负面帖子**:39条
|
||||
- **中性帖子**:89条
|
||||
|
||||
**情绪倾向**:积极!
|
||||
|
||||
---
|
||||
|
||||
### 4. 游族网络 (002174)
|
||||
|
||||
#### 词云分析
|
||||

|
||||
|
||||
#### 关键词分析
|
||||
- **三体**:《三体》IP相关讨论
|
||||
- **死刑**、**执行**:与投毒案相关讨论
|
||||
- **CEO**、**林奇**:公司高管相关
|
||||
- **投毒**:历史事件回顾
|
||||
|
||||
#### 情绪分析
|
||||
- **平均情绪得分**:0.68
|
||||
- **正面帖子**:73条
|
||||
- **负面帖子**:28条
|
||||
- **中性帖子**:99条
|
||||
|
||||
**情绪倾向**:积极!虽然有历史负面事件,但当前情绪较好。
|
||||
|
||||
---
|
||||
|
||||
### 5. 世纪华通 (002602)
|
||||
|
||||
#### 词云分析
|
||||

|
||||
|
||||
#### 关键词分析
|
||||
- **调整**:股价调整
|
||||
- **拉升**:股价拉升
|
||||
- **索赔**:可能指投资者索赔
|
||||
- **看好**:市场观点
|
||||
|
||||
#### 情绪分析
|
||||
- **平均情绪得分**:0.48
|
||||
- **正面帖子**:63条
|
||||
- **负面帖子**:36条
|
||||
- **中性帖子**:101条
|
||||
|
||||
**情绪倾向**:中性偏积极!
|
||||
|
||||
---
|
||||
|
||||
### 6. 昆仑万维 (300418)
|
||||
|
||||
#### 词云分析
|
||||

|
||||
|
||||
#### 关键词分析
|
||||
- **解禁**:股票解禁相关
|
||||
- **员工**:员工持股等
|
||||
- **短剧**:短剧业务
|
||||
- **模型**:AI模型相关
|
||||
|
||||
#### 情绪分析
|
||||
- **平均情绪得分**:0.30
|
||||
- **正面帖子**:61条
|
||||
- **负面帖子**:49条
|
||||
- **中性帖子**:90条
|
||||
|
||||
**情绪倾向**:中性偏积极!
|
||||
|
||||
---
|
||||
|
||||
### 7. 掌趣科技 (300315)
|
||||
|
||||
#### 词云分析
|
||||

|
||||
|
||||
#### 关键词分析
|
||||
- **创业板**:创业板相关
|
||||
- **退市**:退市风险讨论
|
||||
- **垃圾**:负面评价
|
||||
- **解套**:投资者解套需求
|
||||
|
||||
#### 情绪分析
|
||||
- **平均情绪得分**:0.05
|
||||
- **正面帖子**:44条
|
||||
- **负面帖子**:47条
|
||||
- **中性帖子**:109条
|
||||
|
||||
**情绪倾向**:中性!正负情绪基本持平。
|
||||
|
||||
---
|
||||
|
||||
### 8. 吉比特 (603444)
|
||||
|
||||
#### 词云分析
|
||||

|
||||
|
||||
#### 关键词分析
|
||||
- **分红**:分红讨论
|
||||
- **业绩**:业绩讨论
|
||||
- **价值投资**:投资理念
|
||||
- **恶心**:负面情绪表达
|
||||
|
||||
#### 情绪分析
|
||||
- **平均情绪得分**:0.05
|
||||
- **正面帖子**:50条
|
||||
- **负面帖子**:65条
|
||||
- **中性帖子**:85条
|
||||
|
||||
**情绪倾向**:中性偏消极!负面帖子多于正面帖子。
|
||||
|
||||
---
|
||||
|
||||
## 四、情绪分析汇总
|
||||
|
||||
### 情绪得分对比
|
||||

|
||||
|
||||
### 情绪分布
|
||||

|
||||
|
||||
### 情绪类型分布
|
||||

|
||||
|
||||
### 各股票情绪得分排名
|
||||
|
||||
| 排名 | 股票名称 | 股票代码 | 平均情绪得分 | 情绪倾向 |
|
||||
|------|----------|----------|--------------|----------|
|
||||
| 1 | 巨人网络 | 002558 | 1.11 | 🔵 非常积极 |
|
||||
| 2 | 完美世界 | 002624 | 0.99 | 🔵 非常积极 |
|
||||
| 3 | 三七互娱 | 002555 | 0.77 | 🟢 积极 |
|
||||
| 4 | 游族网络 | 002174 | 0.68 | 🟢 积极 |
|
||||
| 5 | 世纪华通 | 002602 | 0.48 | 🟡 中性偏积极 |
|
||||
| 6 | 昆仑万维 | 300418 | 0.30 | 🟡 中性偏积极 |
|
||||
| 7 | 掌趣科技 | 300315 | 0.05 | 🟡 中性 |
|
||||
| 8 | 吉比特 | 603444 | 0.05 | 🟡 中性偏消极 |
|
||||
|
||||
---
|
||||
|
||||
## 五、结论与建议
|
||||
|
||||
### 主要发现
|
||||
|
||||
1. **情绪分布**:
|
||||
- 整体来看,游戏股股吧情绪以中性和积极为主
|
||||
- 巨人网络和完美世界情绪最积极
|
||||
- 吉比特和掌趣科技情绪相对较低
|
||||
|
||||
2. **话题特点**:
|
||||
- 各股票的讨论主要围绕自身业务和股价
|
||||
- 完美世界和巨人网络讨论中游戏内容较多
|
||||
- 游族网络仍有较多历史事件相关讨论
|
||||
|
||||
3. **热门话题**:
|
||||
- 股价操作:涨停、下跌、出货、拉升
|
||||
- 投资者行为:补仓、解套、分红
|
||||
- 行业热点:AI、短剧、游戏流水
|
||||
|
||||
### 投资建议(仅供参考)
|
||||
|
||||
1. **情绪领先标的**:
|
||||
- 完美世界和巨人网络股吧情绪最为积极,可重点关注
|
||||
- 关注其游戏业务进展和业绩情况
|
||||
|
||||
2. **风险提示**:
|
||||
- 吉比特和掌趣科技情绪相对较低,需注意风险
|
||||
- 游族网络历史事件仍有一定影响
|
||||
|
||||
3. **持续关注**:
|
||||
- 昆仑万维的AI和短剧业务
|
||||
- 三七互娱的分红和投资策略
|
||||
|
||||
---
|
||||
|
||||
## 附录
|
||||
|
||||
### 数据文件说明
|
||||
|
||||
- `data/`:原始爬取数据(JSON和Excel格式)
|
||||
- `output/`:TF-IDF分析结果和词云图片
|
||||
- `sentiment_output/`:情感分析结果和可视化图片
|
||||
|
||||
### 分析工具
|
||||
|
||||
- **爬虫**:Python + Requests
|
||||
- **分词**:jieba
|
||||
- **情感词典**:大连理工大学中文情感词汇本体
|
||||
- **可视化**:Matplotlib + WordCloud
|
||||
|
||||
---
|
||||
|
||||
**报告生成时间**:2026-05-28
|
||||
**分析工具**:自定义Python脚本
|
||||
Binary file not shown.
Reference in New Issue
Block a user