# Patterns used by clean_text, compiled once at import time.
# The URL body is restricted to ASCII URL characters: Chinese posts often have
# no whitespace, so a greedy ``\S+`` would swallow all text following a link.
_URL_RE = re.compile(r"(?:https?://|www\.)[A-Za-z0-9\-._~:/?#@!$&'()*+,;=%]+")
# Negated character classes (instead of ``.*?``) let tags and bracketed
# emoticon codes span line breaks.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_EMOTICON_RE = re.compile(r'\[[^\]]*\]')
# Anything that is neither a word character nor whitespace, plus digit runs.
_NOISE_RE = re.compile(r'[^\w\s]|\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Strip URLs, markup, emoticon codes, punctuation and digits from text.

    Every removed span is replaced by a single space rather than deleted, so
    that the words on either side of it are not glued into one token before
    segmentation. Runs of whitespace are then collapsed to one space.

    Args:
        text: Raw post text. Falsy values (``None``, ``""``) are accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    # Order matters: URLs and tags are removed while their punctuation is
    # still intact, then the generic punctuation/digit pass runs.
    for pattern in (_URL_RE, _HTML_TAG_RE, _EMOTICON_RE, _NOISE_RE):
        text = pattern.sub(' ', text)

    return _WHITESPACE_RE.sub(' ', text).strip()
def get_top_keywords(tfidf_matrix, feature_names, top_n=20):
    """Return up to ``top_n`` keywords ranked by mean TF-IDF, without overlaps.

    The mean TF-IDF of every feature is computed over all rows. The
    ``top_n * 4`` highest-scoring features form the candidate pool (extra
    candidates compensate for those dropped by de-duplication). Candidates
    made of more space-separated parts are considered first, and a candidate
    is skipped when any of its parts already appears in an accepted keyword.

    Args:
        tfidf_matrix: 2-D matrix (documents x features) exposing
            ``mean(axis=0)``.
        feature_names: Sequence mapping a feature index to its term.
        top_n: Maximum number of keywords to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of ``{'word': str, 'tfidf': float}`` dicts sorted by
        descending mean TF-IDF.
    """
    # Guard: with top_n == 0 the slice ``[-0:]`` would select *every* feature
    # and the loop below would still emit one keyword.
    if top_n <= 0:
        return []

    avg_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
    ranked_indices = avg_tfidf.argsort()[-top_n * 4:][::-1]

    candidates = [
        (feature_names[idx], avg_tfidf[idx])
        for idx in ranked_indices
        if feature_names[idx].strip()
    ]
    # Multi-part terms first, then by score; the sort is stable.
    candidates.sort(key=lambda item: (-len(item[0].split()), -item[1]))

    keywords = []
    used_parts = set()
    for word, score in candidates:
        parts = word.split()
        # Skip terms sharing any part with an already accepted keyword. This
        # also rules out exact duplicates, so no separate "seen words" set is
        # needed.
        if used_parts.intersection(parts):
            continue
        used_parts.update(parts)
        keywords.append({'word': word, 'tfidf': score})
        if len(keywords) >= top_n:
            break

    # Selection favoured multi-part terms; the final order is by score.
    keywords.sort(key=lambda kw: -kw['tfidf'])
    return keywords
# Font files able to render Chinese glyphs, tried in order when the caller
# does not pass ``font_path``. The first entry is the original hard-coded
# Windows font; the others are common install locations on macOS / Linux
# (NOTE(review): best-effort guesses, only used if the file actually exists).
_CJK_FONT_CANDIDATES = (
    'C:/Windows/Fonts/simhei.ttf',
    '/System/Library/Fonts/PingFang.ttc',
    '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
    '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
)


def generate_wordcloud(keywords, stock_name, output_dir='output', font_path=None):
    """Render a word cloud PNG from keyword weights.

    Args:
        keywords: Iterable of dicts with at least ``'word'`` and ``'tfidf'``
            keys; ``'tfidf'`` is used as the word weight.
        stock_name: Label used in the output file name
            (``wordcloud_<stock_name>.png``).
        output_dir: Directory the image is written to; created if missing.
        font_path: Optional path to a font that can render Chinese text.
            When omitted, the first existing entry of
            ``_CJK_FONT_CANDIDATES`` is used, falling back to the original
            Windows path if none exists.

    Returns:
        The path of the written image, or ``None`` when there is no keyword
        with a positive weight to draw.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Only positive weights can be drawn; an empty or all-zero mapping would
    # make WordCloud fail instead of producing an image.
    word_freq = {k['word']: k['tfidf'] for k in keywords if k['tfidf'] > 0}
    if not word_freq:
        print(f'没有可用的关键词,跳过词云生成: {stock_name}')
        return None

    if font_path is None:
        font_path = next(
            (path for path in _CJK_FONT_CANDIDATES if os.path.exists(path)),
            _CJK_FONT_CANDIDATES[0],
        )

    wc = WordCloud(
        font_path=font_path,
        width=800,
        height=600,
        background_color='white',
        max_words=100
    )
    wc.generate_from_frequencies(word_freq)

    output_path = os.path.join(output_dir, f'wordcloud_{stock_name}.png')
    wc.to_file(output_path)
    print(f'词云已保存到: {output_path}')

    return output_path
{kw["word"]:10s} (TF-IDF: {kw["tfidf"]:.4f})') + + # 保存整体关键词 + overall_df = pd.DataFrame(overall_keywords) + overall_df.to_csv('output/overall_keywords.csv', index=False, encoding='utf-8-sig') + + # 生成整体词云 + generate_wordcloud(overall_keywords, 'overall') + + # 各股票单独分析 + print('\n[3/5] 各股票关键词分析...') + stock_keywords = {} + + for stock_code in stock_info.keys(): + stock_name = stock_info[stock_code]['name'] + print(f'\n 分析 {stock_name} ({stock_code})...') + + keywords = get_stock_specific_keywords(all_data, stock_code, top_n=20) + stock_keywords[stock_code] = keywords + + if keywords: + print(f' Top 10关键词:') + for i, kw in enumerate(keywords[:10], 1): + print(f' {i:2d}. {kw["word"]:10s} (TF-IDF: {kw["tfidf"]:.4f}, 差值: {kw["diff"]:.4f})') + + # 生成词云 + generate_wordcloud(keywords, stock_name) + + # 保存关键词 + df = pd.DataFrame(keywords) + df.to_csv(f'output/keywords_{stock_name}.csv', index=False, encoding='utf-8-sig') + + # 生成汇总报告 + print('\n[4/5] 生成汇总报告...') + report_data = [] + for stock_code, keywords in stock_keywords.items(): + stock_name = stock_info[stock_code]['name'] + top_words = ', '.join([k['word'] for k in keywords[:5]]) + report_data.append({ + '股票代码': stock_code, + '股票名称': stock_name, + '帖子数量': stock_info[stock_code]['post_count'], + 'Top5关键词': top_words + }) + + report_df = pd.DataFrame(report_data) + report_df.to_csv('output/summary_report.csv', index=False, encoding='utf-8-sig') + print(' 汇总报告已保存到: output/summary_report.csv') + + # 保存所有文本数据 + print('\n[5/5] 保存预处理数据...') + all_df = pd.DataFrame(all_data) + all_df.to_csv('output/all_posts.csv', index=False, encoding='utf-8-sig') + print(' 所有帖子已保存到: output/all_posts.csv') + + print('\n' + '='*60) + print('分析完成!结果保存在 output/ 目录中') + print('='*60) + +if __name__ == '__main__': + analyze_all() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a6f11e9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +requests>=2.28.0 +pandas>=2.0.0 
+openpyxl>=3.1.0 +jieba>=0.42.1 +scikit-learn>=1.3.0 +numpy>=1.24.0 +matplotlib>=3.7.0 +seaborn>=0.12.0 +wordcloud>=1.9.0 diff --git a/sentiment_analysis.py b/sentiment_analysis.py new file mode 100644 index 0000000..128e55a --- /dev/null +++ b/sentiment_analysis.py @@ -0,0 +1,409 @@ +import pandas as pd +import jieba +import time +import json +import os +from collections import defaultdict +import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') + +# 设置中文字体 +plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS'] +plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题 + +# ============================================================ +# 第一部分:构建情感词典 +# ============================================================ + +def build_sentiment_dictionary(): + """使用大连理工大学中文情感词汇本体构建情感词典""" + + dict_path = '大连理工大学中文情感词汇本体.xlsx' + + try: + # 读取大连理工大学情感词汇 + df = pd.read_excel(dict_path) + + # 选择需要的列 + df = df[['词语', '词性种类', '词义数', '词义序号', '情感分类', '强度', '极性']] + + # 分类整理 + Happy = [] + Good = [] + Surprise = [] + Anger = [] + Sad = [] + Fear = [] + Disgust = [] + + for idx, row in df.iterrows(): + if row['情感分类'] in ['PA', 'PE']: + Happy.append(row['词语']) + if row['情感分类'] in ['PD', 'PH', 'PG', 'PB', 'PK']: + Good.append(row['词语']) + if row['情感分类'] in ['PC']: + Surprise.append(row['词语']) + if row['情感分类'] in ['NA']: + Anger.append(row['词语']) + if row['情感分类'] in ['NB', 'NJ', 'NH', 'PF']: + Sad.append(row['词语']) + if row['情感分类'] in ['NI', 'NC', 'NG']: + Fear.append(row['词语']) + if row['情感分类'] in ['NE', 'ND', 'NN', 'NK', 'NL']: + Disgust.append(row['词语']) + + # 添加股票相关的补充词汇 + stock_positive = ['涨', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '赚', + '利好', '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破', + '创新', '优势', '超预期', '亮眼', '惊艳', '奇迹'] + stock_negative = ['跌', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '赔', '损失', + '套牢', '垃圾', '恶心', '坑爹', '骗局', '雷', '爆雷', '崩盘', '退市'] + + Good.extend(stock_positive) + Disgust.extend(stock_negative) + + # 
def build_simplified_dictionary():
    """Build the small built-in Chinese sentiment lexicon (fallback).

    Returns:
        A dict with seven fine-grained category word lists (``Happy``,
        ``Good``, ``Surprise``, ``Anger``, ``Sad``, ``Fear``, ``Disgust``)
        plus two aggregates: ``Positive`` (Happy + Good + Surprise) and
        ``Negative`` (Anger + Sad + Fear + Disgust). Aggregates are plain
        concatenations, so a word listed in several categories appears
        more than once.
    """
    lexicon = {
        # Positive categories
        'Happy': [
            '开心', '快乐', '高兴', '喜悦', '愉快', '欣喜', '欢乐', '欢喜', '幸福',
            '满意', '满足', '欣慰', '愉悦', '畅快', '乐观', '积极', '美好', '成功',
        ],
        'Good': [
            '好', '优秀', '出色', '精彩', '卓越', '杰出', '优良', '良好', '完美', '不错',
            '涨', '上涨', '暴涨', '拉升', '涨停', '盈利', '收益', '赚钱', '赚', '利好',
            '增长', '上升', '增加', '发展', '进步', '提升', '改善', '突破', '创新', '优势',
        ],
        'Surprise': [
            '惊喜', '意外', '震惊', '惊讶', '震撼', '神奇', '奇迹', '惊艳', '亮眼', '超预期',
        ],
        # Negative categories
        'Anger': [
            '愤怒', '生气', '恼火', '气愤', '暴怒', '愤慨', '愤恨', '震怒', '发怒',
            '骂', '垃圾', '恶心', '坑爹', '骗局', '欺骗', '欺诈', '造假', '腐败', '黑暗',
        ],
        'Sad': [
            '伤心', '难过', '悲伤', '痛苦', '悲哀', '沮丧', '失望', '绝望', '低落', '悲观',
            '跌', '下跌', '暴跌', '跳水', '跌停', '亏损', '亏钱', '赔', '损失', '套牢',
        ],
        'Fear': [
            '害怕', '恐惧', '担心', '担忧', '恐慌', '不安', '焦虑', '忧虑', '紧张', '恐怖',
            '风险', '危机', '危险', '下跌', '暴跌', '崩盘', '退市', '爆雷', '雷', '怕',
        ],
        'Disgust': [
            '厌恶', '恶心', '反感', '讨厌', '鄙视', '唾弃', '不屑', '蔑视', '嫌弃',
            '垃圾', '废物', '不行', '差劲', '差', '烂', '渣', '骗局',
        ],
    }

    # Aggregate polarity lists, concatenated in the fixed category order.
    positive_words = [
        word for name in ('Happy', 'Good', 'Surprise') for word in lexicon[name]
    ]
    negative_words = [
        word
        for name in ('Anger', 'Sad', 'Fear', 'Disgust')
        for word in lexicon[name]
    ]
    lexicon['Positive'] = positive_words
    lexicon['Negative'] = negative_words

    print('简化版情感词典构建完成')
    print(f'正面情感词: {len(positive_words)}个')
    print(f'负面情感词: {len(negative_words)}个')

    return lexicon
# Category keys read from the sentiment dictionary by emotion_caculate.
_EMOTION_CATEGORIES = (
    'Positive', 'Negative', 'Anger', 'Disgust', 'Fear',
    'Sad', 'Surprise', 'Good', 'Happy',
)

# id(sentiment_dict) -> (sentiment_dict, category sizes, {category: frozenset}).
# The dictionary object is stored alongside its sets so a recycled id() can be
# detected; the sizes detect lists that grew or shrank after caching.
_LEXICON_SET_CACHE = {}


def _lexicon_sets(sentiment_dict):
    """Return per-category frozensets for ``sentiment_dict``, built once."""
    sizes = tuple(len(sentiment_dict[name]) for name in _EMOTION_CATEGORIES)
    entry = _LEXICON_SET_CACHE.get(id(sentiment_dict))
    if entry is None or entry[0] is not sentiment_dict or entry[1] != sizes:
        entry = (
            sentiment_dict,
            sizes,
            {name: frozenset(sentiment_dict[name]) for name in _EMOTION_CATEGORIES},
        )
        _LEXICON_SET_CACHE[id(sentiment_dict)] = entry
    return entry[2]


def emotion_caculate(text, sentiment_dict):
    """Count sentiment-lexicon hits in one piece of text.

    The text is segmented with ``jieba.lcut``; every token occurrence that
    belongs to a category of ``sentiment_dict`` adds one to that category's
    count.

    Args:
        text: Text to score. Falsy or NaN values are treated as ``''``.
        sentiment_dict: Mapping with the keys ``Positive``, ``Negative``,
            ``Anger``, ``Disgust``, ``Fear``, ``Sad``, ``Surprise``, ``Good``
            and ``Happy``, each a collection of words.

    Returns:
        A ``pandas.Series`` indexed by ``length`` (token count), ``positive``,
        ``negative``, ``anger``, ``disgust``, ``fear``, ``sadness``,
        ``surprise``, ``good``, ``happy`` and ``sentiment_score``
        (``positive - negative``).
    """
    # Local import: the enclosing module does not import Counter.
    from collections import Counter

    if not text or pd.isna(text):
        text = ''

    wordlist = jieba.lcut(text)
    # One counting pass replaces calling wordlist.count() per unique token.
    token_counts = Counter(wordlist)
    # Set membership replaces nine linear scans of the lexicon lists per token.
    lookup = _lexicon_sets(sentiment_dict)

    totals = dict.fromkeys(_EMOTION_CATEGORIES, 0)
    for word, freq in token_counts.items():
        for name in _EMOTION_CATEGORIES:
            if word in lookup[name]:
                totals[name] += freq

    positive = totals['Positive']
    negative = totals['Negative']

    emotion_info = {
        'length': len(wordlist),
        'positive': positive,
        'negative': negative,
        'anger': totals['Anger'],
        'disgust': totals['Disgust'],
        'fear': totals['Fear'],
        'sadness': totals['Sad'],
        'surprise': totals['Surprise'],
        'good': totals['Good'],
        'happy': totals['Happy'],
        # Counts are non-negative, so this is already 0 when nothing matched.
        'sentiment_score': positive - negative
    }

    indexs = ['length', 'positive', 'negative', 'anger', 'disgust', 'fear',
              'sadness', 'surprise', 'good', 'happy', 'sentiment_score']

    return pd.Series(emotion_info, index=indexs)
try: + with open(filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + + stock_name = data.get('stock_name', '未知') + stock_code = data.get('stock_code', '未知') + posts = data.get('posts', []) + + if not posts: + print(f' 无数据,跳过') + continue + + # 转换为DataFrame + df = pd.DataFrame(posts) + + # 合并标题和内容 + df['full_text'] = df.apply( + lambda x: f"{x.get('post_title', '')} {x.get('post_content', '')}", + axis=1 + ) + + # 进行情绪分析 + print(f' 开始分析 {len(df)} 条帖子...') + start = time.time() + + emotion_df = df['full_text'].apply( + lambda x: emotion_caculate(x, sentiment_dict) + ) + + end = time.time() + print(f' 分析完成,耗时: {end - start:.2f}秒') + + # 合并结果 + result_df = pd.concat([df, emotion_df], axis=1) + + # 保存结果 + output_file = os.path.join(output_dir, f'sentiment_{stock_name}_{stock_code}.csv') + result_df.to_csv(output_file, index=False, encoding='utf-8-sig') + print(f' 结果已保存到: {output_file}') + + # 统计整体情绪 + stock_stats = { + 'stock_code': stock_code, + 'stock_name': stock_name, + 'total_posts': len(result_df), + 'avg_positive': result_df['positive'].mean(), + 'avg_negative': result_df['negative'].mean(), + 'avg_sentiment_score': result_df['sentiment_score'].mean(), + 'positive_posts': (result_df['sentiment_score'] > 0).sum(), + 'negative_posts': (result_df['sentiment_score'] < 0).sum(), + 'neutral_posts': (result_df['sentiment_score'] == 0).sum(), + 'total_anger': result_df['anger'].sum(), + 'total_sadness': result_df['sadness'].sum(), + 'total_fear': result_df['fear'].sum(), + 'total_disgust': result_df['disgust'].sum(), + 'total_good': result_df['good'].sum(), + 'total_happy': result_df['happy'].sum(), + 'total_surprise': result_df['surprise'].sum() + } + + stock_emotions[stock_code] = stock_stats + all_results.append(result_df) + + # 打印该股票情绪最高/最低的帖子 + print(f'\n {stock_name} 情绪分析统计:') + print(f' 平均情绪得分: {stock_stats["avg_sentiment_score"]:.2f}') + print(f' 正面帖子: {stock_stats["positive_posts"]}') + print(f' 负面帖子: {stock_stats["negative_posts"]}') + print(f' 中性帖子: 
def generate_visualizations(summary_df, stock_emotions, output_dir):
    """Render the three sentiment charts as PNG files in ``output_dir``.

    Charts written:
        * ``sentiment_score_comparison.png`` - bar chart of the average
          sentiment score per stock (green when >= 0, red otherwise).
        * ``sentiment_distribution.png`` - one pie per stock with the
          positive / negative / neutral post counts.
        * ``emotion_types.png`` - grouped bars with each stock's share of
          the seven emotion categories.

    The pie grid and the grouped-bar geometry are derived from the number of
    stocks. With exactly eight stocks the layout equals the former fixed one
    (2x4 grid, bar width 0.1).

    Args:
        summary_df: DataFrame with ``stock_name`` and ``avg_sentiment_score``
            columns.
        stock_emotions: Mapping of stock code to a stats dict holding
            ``stock_name``, ``positive_posts``, ``negative_posts``,
            ``neutral_posts`` and the ``total_*`` emotion counts.
        output_dir: Existing directory the images are saved into.
    """
    stock_count = len(stock_emotions)

    # 1. Average sentiment score per stock
    plt.figure(figsize=(12, 6))
    colors = ['green' if x >= 0 else 'red' for x in summary_df['avg_sentiment_score']]
    plt.bar(summary_df['stock_name'], summary_df['avg_sentiment_score'], color=colors, alpha=0.7)
    plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    plt.title('各股票平均情绪得分对比', fontsize=14)
    plt.xlabel('股票名称', fontsize=12)
    plt.ylabel('平均情绪得分', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_score_comparison.png'), dpi=300)
    plt.close()

    # 2. Positive / negative / neutral distribution, one pie per stock.
    # At most four pies per row, with as many rows as needed, so no stock is
    # dropped.
    cols = min(4, max(stock_count, 1))
    rows = max(1, (stock_count + cols - 1) // cols)
    fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 5 * rows), squeeze=False)
    axes = axes.flatten()

    for ax, stats in zip(axes, stock_emotions.values()):
        labels = ['正面', '负面', '中性']
        sizes = [stats['positive_posts'], stats['negative_posts'], stats['neutral_posts']]
        colors = ['green', 'red', 'gray']

        ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
        ax.set_title(f'{stats["stock_name"]} 情绪分布')

    # Hide grid cells that have no stock instead of drawing empty frames.
    for ax in axes[stock_count:]:
        ax.axis('off')

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_distribution.png'), dpi=300)
    plt.close()

    # 3. Share of each emotion category per stock
    plt.figure(figsize=(14, 7))
    emotions = ['total_good', 'total_happy', 'total_surprise',
                'total_anger', 'total_sadness', 'total_fear', 'total_disgust']
    emotion_names = ['好评', '快乐', '惊讶', '愤怒', '悲伤', '恐惧', '厌恶']

    x = range(len(emotion_names))
    # Each group of bars occupies 0.8 of the slot between two ticks.
    width = 0.8 / max(stock_count, 1)

    for idx, stats in enumerate(stock_emotions.values()):
        values = [stats[e] for e in emotions]
        total = sum(values)
        if total > 0:
            values = [v / total * 100 for v in values]
        plt.bar([xi + width * idx for xi in x], values, width, label=stats['stock_name'])

    # Centre the tick label under its group of bars.
    group_center = width * max(stock_count - 1, 0) / 2

    plt.xlabel('情绪类型', fontsize=12)
    plt.ylabel('占比 (%)', fontsize=12)
    plt.title('各股票情绪类型分布', fontsize=14)
    plt.xticks([xi + group_center for xi in x], emotion_names)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'emotion_types.png'), dpi=300, bbox_inches='tight')
    plt.close()

    print(f'可视化图表已生成到 {output_dir}')
def fetch_guba_data(code='gssz', page=1, page_size=20, sort_type=1, timeout=10):
    """Fetch one page of the post list for a stock forum.

    Sends a form-encoded POST to the mobile Guba ``GetData.aspx`` endpoint,
    with browser-like headers and a fixed set of cookies.

    Args:
        code: Forum / stock code placed in the request parameters.
        page: Page number.
        page_size: Number of posts requested per page.
        sort_type: Value sent as ``sorttype``.
        timeout: Seconds to wait for the server before giving up. Without
            it a stalled connection would block the whole crawl.

    Returns:
        The decoded JSON response, or ``None`` when the request fails.
    """
    url = 'https://mguba.eastmoney.com/mguba2020/interface/GetData.aspx'

    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'DNT': '1',
        'Origin': 'https://mguba.eastmoney.com',
        'Pragma': 'no-cache',
        'Referer': f'https://mguba.eastmoney.com/mguba/list/{code}_{page}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Mobile Safari/537.36 Edg/148.0.0.0',
        'sec-ch-ua': '"Chromium";v="148", "Microsoft Edge";v="148", "Not/A)Brand";v="99"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"'
    }

    # NOTE(review): these cookies look like values captured from one browser
    # session; confirm whether the endpoint needs them and whether they expire.
    cookies = {
        'qgqp_b_id': '30059d8839ad5c045fa8856e38013e9c',
        'st_nvi': 'XwpSfYXGjCxfCdbgapK5_cac4',
        'nid18': '0daec1df8064f04edd20b4e69250a8f5',
        'nid18_create_time': '1776263017375',
        'gviem': 'UrMH_tSu1UpW8B_TKmytl803f',
        'gviem_create_time': '1776263017375',
        'fullscreengg': '1',
        'fullscreengg2': '1',
        'st_si': '17952715731426',
        'show_app_box_time': '1779903756410',
        'st_pvi': '26838250597806',
        'st_sp': '2026-04-15 22:23:37',
        'st_inirUrl': 'https://cn.bing.com/',
        'st_sn': '30',
        'st_psi': '20260528025236177-117016304298-3040545697',
        'ad_tc_load_num': '3',
        'st_asi': '20260528025236177-117016304298-3040545697-ad.djxd-1'
    }

    # The API expects its query as one pre-joined string in the 'param' field.
    param = f'code={code}&p={page}&ps={page_size}&sorttype={sort_type}'
    data = {
        'param': param,
        'plat': 'wap',
        'version': '200',
        'path': '/webarticlelist/api/Article/WebArticleList',
        'env': '1',
        'origin': '',
        'ctoken': '',
        'utoken': ''
    }

    try:
        response = requests.post(
            url,
            headers=headers,
            cookies=cookies,
            data=data,
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f'请求失败: {e}')
        return None
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + filename = f'guba_{name}_{timestamp}.xlsx' + + df.to_excel(filename, index=False, engine='openpyxl') + print(f'Excel数据已保存到: {filename}') + return filename + +if __name__ == '__main__': + GAME_STOCKS = { + '002624': '完美世界', + '002555': '三七互娱', + '002558': '巨人网络', + '002602': '世纪华通', + '300418': '昆仑万维', + '002174': '游族网络', + '300315': '掌趣科技', + '603444': '吉比特', + } + + # 创建数据目录 + os.makedirs('data', exist_ok=True) + + for code, name in GAME_STOCKS.items(): + print(f'\n{"="*50}') + print(f'开始爬取 {name} ({code})') + print(f'{"="*50}') + + # 爬取10页数据 + data = fetch_stock_posts(code, name, pages=10) + + if data and data['total_posts'] > 0: + print(f'\n共获取 {data["total_posts"]} 条帖子') + + # 保存JSON + json_filename = os.path.join('data', f'guba_{name}_{code}.json') + save_to_json(data, name, json_filename) + + # 保存Excel + excel_filename = os.path.join('data', f'guba_{name}_{code}.xlsx') + save_to_excel(data, name, excel_filename) + else: + print(f'{name} 爬取失败或无数据') + + # 股票之间的延迟 + time.sleep(2) diff --git a/分析报告.md b/分析报告.md new file mode 100644 index 0000000..eb599ff --- /dev/null +++ b/分析报告.md @@ -0,0 +1,296 @@ +# 游戏股吧情感与话题分析报告 + +**报告日期**:2026-05-28 +**分析范围**:完美世界、三七互娱、巨人网络、世纪华通、昆仑万维、游族网络、掌趣科技、吉比特 +**数据来源**:东方财富网股吧 + +--- + +## 一、数据概述 + +本次分析共收集了8只游戏股票的股吧数据,每只股票200条帖子,总计1600条有效数据。 + +### 数据收集方法 +- 使用网络爬虫从东方财富网股吧获取帖子 +- 数据包括:帖子标题、内容、发布时间等 +- 使用大连理工大学中文情感词汇本体进行情感分析 + +--- + +## 二、整体话题分析 + +### 整体词云 +![整体词云](output/wordcloud_overall.png) + +### 整体话题关键词 +| 排名 | 关键词 | TF-IDF值 | +|------|--------|----------| +| 1 | 网络 sz | 0.0341 | +| 2 | 巨人 | 0.0235 | +| 3 | 世纪 华通 | 0.0215 | +| 4 | 昆仑 万维 | 0.0215 | +| 5 | 游族 | 0.0215 | +| 6 | 三七 互娱 | 0.0201 | +| 7 | 游戏 | 0.0199 | +| 8 | 掌趣 科技 | 0.0187 | +| 9 | 比特 sh | 0.0183 | +| 10 | 完美 世界 | 0.0174 | + +### 整体热门话题 +从整体词云可以看出,股吧讨论主要集中在: +1. **个股名称**:各股票名称是最热门的话题 +2. **股票操作**:主力、涨停、下跌、出货、股价等 +3. **市场情绪**:散户、大盘、投资等 + +--- + +## 三、各股票专题分析 + +### 1. 
完美世界 (002624) + +#### 词云分析 +![完美世界词云](output/wordcloud_完美世界.png) + +#### 关键词分析 +- **异环**:指游戏《异环》相关讨论 +- **流水**:游戏流水情况 +- **版本**:游戏版本更新 +- **安魂曲**:指游戏角色《安魂曲》 + +#### 情绪分析 +- **平均情绪得分**:0.99(第二高) +- **正面帖子**:110条 +- **负面帖子**:21条 +- **中性帖子**:69条 + +**情绪倾向**:非常积极!完美世界的平均情绪得分在本次分析中排名第二,仅次于巨人网络。 + +--- + +### 2. 巨人网络 (002558) + +#### 词云分析 +![巨人网络词云](output/wordcloud_巨人网络.png) + +#### 关键词分析 +- **补仓**:投资者补仓操作 +- **腰斩**:股价大幅下跌 +- **跳水**:股价快速下跌 +- **兄弟**:股吧常见称呼 + +#### 情绪分析 +- **平均情绪得分**:1.11(最高) +- **正面帖子**:115条 +- **负面帖子**:20条 +- **中性帖子**:65条 + +**情绪倾向**:非常积极!虽然有"腰斩"、"跳水"等负面词汇,但整体情绪仍然很高。 + +--- + +### 3. 三七互娱 (002555) + +#### 词云分析 +![三七互娱词云](output/wordcloud_三七互娱.png) + +#### 关键词分析 +- **分红**:股票分红相关讨论 +- **投资**:投资策略讨论 +- **智谱**:可能指AI相关业务 +- **AI**:人工智能话题 + +#### 情绪分析 +- **平均情绪得分**:0.77 +- **正面帖子**:72条 +- **负面帖子**:39条 +- **中性帖子**:89条 + +**情绪倾向**:积极! + +--- + +### 4. 游族网络 (002174) + +#### 词云分析 +![游族网络词云](output/wordcloud_游族网络.png) + +#### 关键词分析 +- **三体**:《三体》IP相关讨论 +- **死刑**、**执行**:与投毒案相关讨论 +- **CEO**、**林奇**:公司高管相关 +- **投毒**:历史事件回顾 + +#### 情绪分析 +- **平均情绪得分**:0.68 +- **正面帖子**:73条 +- **负面帖子**:28条 +- **中性帖子**:99条 + +**情绪倾向**:积极!虽然有历史负面事件,但当前情绪较好。 + +--- + +### 5. 世纪华通 (002602) + +#### 词云分析 +![世纪华通词云](output/wordcloud_世纪华通.png) + +#### 关键词分析 +- **调整**:股价调整 +- **拉升**:股价拉升 +- **索赔**:可能指投资者索赔 +- **看好**:市场观点 + +#### 情绪分析 +- **平均情绪得分**:0.48 +- **正面帖子**:63条 +- **负面帖子**:36条 +- **中性帖子**:101条 + +**情绪倾向**:中性偏积极! + +--- + +### 6. 昆仑万维 (300418) + +#### 词云分析 +![昆仑万维词云](output/wordcloud_昆仑万维.png) + +#### 关键词分析 +- **解禁**:股票解禁相关 +- **员工**:员工持股等 +- **短剧**:短剧业务 +- **模型**:AI模型相关 + +#### 情绪分析 +- **平均情绪得分**:0.30 +- **正面帖子**:61条 +- **负面帖子**:49条 +- **中性帖子**:90条 + +**情绪倾向**:中性偏积极! + +--- + +### 7. 掌趣科技 (300315) + +#### 词云分析 +![掌趣科技词云](output/wordcloud_掌趣科技.png) + +#### 关键词分析 +- **创业板**:创业板相关 +- **退市**:退市风险讨论 +- **垃圾**:负面评价 +- **解套**:投资者解套需求 + +#### 情绪分析 +- **平均情绪得分**:0.05 +- **正面帖子**:44条 +- **负面帖子**:47条 +- **中性帖子**:109条 + +**情绪倾向**:中性!正负情绪基本持平。 + +--- + +### 8. 
吉比特 (603444) + +#### 词云分析 +![吉比特词云](output/wordcloud_吉比特.png) + +#### 关键词分析 +- **分红**:分红讨论 +- **业绩**:业绩讨论 +- **价值投资**:投资理念 +- **恶心**:负面情绪表达 + +#### 情绪分析 +- **平均情绪得分**:0.05 +- **正面帖子**:50条 +- **负面帖子**:65条 +- **中性帖子**:85条 + +**情绪倾向**:中性偏消极!负面帖子多于正面帖子。 + +--- + +## 四、情绪分析汇总 + +### 情绪得分对比 +![情绪得分对比](sentiment_output/sentiment_score_comparison.png) + +### 情绪分布 +![情绪分布](sentiment_output/sentiment_distribution.png) + +### 情绪类型分布 +![情绪类型分布](sentiment_output/emotion_types.png) + +### 各股票情绪得分排名 + +| 排名 | 股票名称 | 股票代码 | 平均情绪得分 | 情绪倾向 | +|------|----------|----------|--------------|----------| +| 1 | 巨人网络 | 002558 | 1.11 | 🔵 非常积极 | +| 2 | 完美世界 | 002624 | 0.99 | 🔵 非常积极 | +| 3 | 三七互娱 | 002555 | 0.77 | 🟢 积极 | +| 4 | 游族网络 | 002174 | 0.68 | 🟢 积极 | +| 5 | 世纪华通 | 002602 | 0.48 | 🟡 中性偏积极 | +| 6 | 昆仑万维 | 300418 | 0.30 | 🟡 中性偏积极 | +| 7 | 掌趣科技 | 300315 | 0.05 | 🟡 中性 | +| 8 | 吉比特 | 603444 | 0.05 | 🟡 中性偏消极 | + +--- + +## 五、结论与建议 + +### 主要发现 + +1. **情绪分布**: + - 整体来看,游戏股股吧情绪以中性和积极为主 + - 巨人网络和完美世界情绪最积极 + - 吉比特和掌趣科技情绪相对较低 + +2. **话题特点**: + - 各股票的讨论主要围绕自身业务和股价 + - 完美世界和巨人网络讨论中游戏内容较多 + - 游族网络仍有较多历史事件相关讨论 + +3. **热门话题**: + - 股价操作:涨停、下跌、出货、拉升 + - 投资者行为:补仓、解套、分红 + - 行业热点:AI、短剧、游戏流水 + +### 投资建议(仅供参考) + +1. **情绪领先标的**: + - 完美世界和巨人网络股吧情绪最为积极,可重点关注 + - 关注其游戏业务进展和业绩情况 + +2. **风险提示**: + - 吉比特和掌趣科技情绪相对较低,需注意风险 + - 游族网络历史事件仍有一定影响 + +3. **持续关注**: + - 昆仑万维的AI和短剧业务 + - 三七互娱的分红和投资策略 + +--- + +## 附录 + +### 数据文件说明 + +- `data/`:原始爬取数据(JSON和Excel格式) +- `output/`:TF-IDF分析结果和词云图片 +- `sentiment_output/`:情感分析结果和可视化图片 + +### 分析工具 + +- **爬虫**:Python + Requests +- **分词**:jieba +- **情感词典**:大连理工大学中文情感词汇本体 +- **可视化**:Matplotlib + WordCloud + +--- + +**报告生成时间**:2026-05-28 +**分析工具**:自定义Python脚本 diff --git a/大连理工大学中文情感词汇本体.xlsx b/大连理工大学中文情感词汇本体.xlsx new file mode 100644 index 0000000..34ef21b Binary files /dev/null and b/大连理工大学中文情感词汇本体.xlsx differ