"""TF-IDF keyword analysis and word-cloud generation for stock-forum posts.

The script reads one JSON file per stock from a data directory, cleans and
tokenizes the post text with jieba, ranks keywords by average TF-IDF (for the
whole corpus and per stock), renders word clouds and writes CSV reports.
"""
import json
import os
import re

import jieba
import matplotlib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

# The backend must be selected before pyplot is imported to reliably take
# effect on headless machines.
matplotlib.use('Agg')
import matplotlib.pyplot as plt  # noqa: E402,F401

# Chinese stop words plus forum boilerplate and URL fragments.
STOPWORDS = {
    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
    '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着',
    '没有', '看', '好', '自己', '这', '那', '吗', '吧', '呢', '啊', '呀',
    '什么', '怎么', '为什么', '哪里', '谁', '多少', '几', '个', '只', '条',
    '把', '本', '篇', '次', '天', '今天', '明天', '昨天', '又', '再', '还',
    '已经', '还是', '但是', '可是', '不过', '只是', '只有', '就是', '或者',
    '这个', '那个', '这些', '那些', '那么', '这么', '如何', '因为', '所以',
    '虽然', '如果', '跟', '与', '及', '或',
    '股吧', '东方财富', '帖子', '发表', '回复', '点击', '查看', '更多',
    '原文', '转发', '分享', '收藏', '评论', '点赞',
    'http', 'https', 'com', 'cn', 'www', 'net', 'org',
}

# A stock needs at least this many posts before per-stock keywords are computed.
MIN_POSTS_PER_STOCK = 5

# How many times more candidates than ``top_n`` are examined, so that enough
# keywords survive the overlap-based de-duplication.
_CANDIDATE_FACTOR = 4

# URLs are restricted to printable ASCII so that Chinese text written directly
# after a URL (with no separating space) is not deleted together with it.
_URL_RE = re.compile(r'https?://[\x21-\x7e]+|www\.[\x21-\x7e]+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_BRACKET_EMOTE_RE = re.compile(r'\[.*?\]')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')

# Characters that are not allowed in file names on common platforms.
_UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

# Well-known locations of fonts with CJK glyphs; the first existing one is used.
_CJK_FONT_CANDIDATES = (
    'C:/Windows/Fonts/simhei.ttf',
    'C:/Windows/Fonts/msyh.ttc',
    '/System/Library/Fonts/PingFang.ttc',
    '/System/Library/Fonts/STHeiti Medium.ttc',
    '/Library/Fonts/Arial Unicode.ttf',
    '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
    '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
    '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc',
)


def clean_text(text):
    """Return ``text`` with URLs, tags, emotes, punctuation and digits removed.

    Falsy input (``None`` or ``""``) yields an empty string. Runs of
    whitespace are collapsed to a single space and the result is stripped.
    """
    if not text:
        return ""
    text = _URL_RE.sub('', text)            # URLs
    text = _HTML_TAG_RE.sub('', text)       # HTML tags
    text = _BRACKET_EMOTE_RE.sub('', text)  # bracketed emotes such as [微笑]
    text = _NON_WORD_RE.sub('', text)       # punctuation / special characters
    text = _DIGITS_RE.sub('', text)         # digits
    return _WHITESPACE_RE.sub(' ', text).strip()


def tokenize(text):
    """Segment ``text`` with jieba, dropping stop words and 1-character tokens."""
    tokens = (w.strip() for w in jieba.lcut(text))
    return [t for t in tokens if len(t) > 1 and t not in STOPWORDS]


def _text_or_empty(value):
    """Return ``value`` as a string, mapping ``None`` (JSON null) to ``''``."""
    return '' if value is None else str(value)


def load_data(data_dir='data'):
    """Load every ``*.json`` stock file found in ``data_dir``.

    Each file is read for the keys ``stock_name``, ``stock_code`` and
    ``posts``; each post is read for ``post_id``, ``post_title`` and
    ``post_content``.

    Returns:
        ``(all_data, stock_info)`` where ``all_data`` is a list of per-post
        dicts (``stock_code``, ``stock_name``, ``post_id``, ``text``,
        ``clean_text``) and ``stock_info`` maps each stock code to
        ``{'name': ..., 'post_count': ...}``. Both are empty when the
        directory is missing.

    A file that cannot be read or parsed is reported and skipped as a whole,
    so it never contributes partial data.
    """
    all_data = []
    stock_info = {}

    if not os.path.isdir(data_dir):
        print(f'数据目录 {data_dir} 不存在')
        return all_data, stock_info

    # Sorted so that the result does not depend on file-system ordering.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(data_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            stock_name = data.get('stock_name', '未知')
            stock_code = data.get('stock_code', '未知')
            posts = data.get('posts') or []

            records = []
            for post in posts:
                # A JSON null must not be rendered as the literal text "None".
                title = _text_or_empty(post.get('post_title'))
                content = _text_or_empty(post.get('post_content'))
                full_text = f"{title} {content}".strip()
                if full_text:
                    records.append({
                        'stock_code': stock_code,
                        'stock_name': stock_name,
                        'post_id': post.get('post_id'),
                        'text': full_text,
                        'clean_text': clean_text(full_text),
                    })

            # Several files may belong to the same stock: add the counts up.
            previous_count = stock_info.get(stock_code, {}).get('post_count', 0)
            post_count = previous_count + len(posts)
        except (OSError, ValueError, TypeError, AttributeError) as e:
            print(f'加载文件 {filename} 失败: {e}')
            continue

        stock_info[stock_code] = {'name': stock_name, 'post_count': post_count}
        all_data.extend(records)

    return all_data, stock_info


def calculate_tfidf(texts):
    """Fit a TF-IDF model (unigrams and bigrams, at most 1000 features).

    Returns:
        ``(tfidf_matrix, feature_names, vectorizer)``.

    Raises:
        ValueError: propagated from scikit-learn when no token survives
            tokenization (empty vocabulary).
    """
    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        token_pattern=None,  # silences the "token_pattern ignored" warning
        max_features=1000,
        ngram_range=(1, 2),
    )
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    return tfidf_matrix, feature_names, vectorizer


def _dedupe_keywords(candidates, rank_key, top_n):
    """Pick up to ``top_n`` candidates that share no constituent word.

    ``candidates`` are dicts with a ``'word'`` entry (space-joined n-gram) and
    a numeric ``rank_key`` entry. Longer n-grams are considered first so that
    a phrase wins over the single words it is made of; ties are broken by the
    score. The selection is returned sorted by descending score.
    """
    if top_n <= 0:
        return []

    ordered = sorted(
        candidates,
        key=lambda c: (-len(c['word'].split()), -c[rank_key]),
    )

    selected = []
    used_parts = set()
    for candidate in ordered:
        parts = candidate['word'].split()
        # Skip blank features and anything overlapping an accepted keyword.
        if not parts or not used_parts.isdisjoint(parts):
            continue
        used_parts.update(parts)
        selected.append(candidate)
        if len(selected) >= top_n:
            break

    selected.sort(key=lambda k: -k[rank_key])
    return selected


def get_top_keywords(tfidf_matrix, feature_names, top_n=20):
    """Return the ``top_n`` features with the highest mean TF-IDF.

    Returns:
        A list of ``{'word': ..., 'tfidf': ...}`` dicts in descending order of
        ``tfidf``; empty when ``top_n`` is not positive.
    """
    if top_n <= 0:
        # Guard: ``[-0:]`` would otherwise select the whole vocabulary.
        return []

    avg_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
    top_indices = avg_tfidf.argsort()[-top_n * _CANDIDATE_FACTOR:][::-1]
    candidates = [
        {'word': feature_names[idx], 'tfidf': avg_tfidf[idx]}
        for idx in top_indices
    ]
    return _dedupe_keywords(candidates, 'tfidf', top_n)


def get_stock_specific_keywords(all_data, stock_code, top_n=20,
                                tfidf_matrix=None, feature_names=None):
    """Return keywords that are more prominent for one stock than for the rest.

    Features are ranked by the difference between the stock's mean TF-IDF and
    the mean TF-IDF of all other posts (or by the stock's mean alone when
    there are no other posts).

    Args:
        all_data: per-post dicts as produced by ``load_data``.
        stock_code: the stock to analyse.
        top_n: maximum number of keywords to return.
        tfidf_matrix: optional precomputed TF-IDF matrix whose rows are
            aligned with ``all_data``. Supplying it avoids re-tokenizing the
            whole corpus for every stock.
        feature_names: feature names matching ``tfidf_matrix``.

    Returns:
        A list of ``{'word', 'tfidf', 'diff'}`` dicts sorted by descending
        ``diff``; empty when the stock has fewer than ``MIN_POSTS_PER_STOCK``
        posts or ``top_n`` is not positive.

    Raises:
        ValueError: if a supplied ``tfidf_matrix`` does not have one row per
            entry of ``all_data``.
    """
    stock_rows = []
    other_rows = []
    for row, record in enumerate(all_data):
        if record['stock_code'] == stock_code:
            stock_rows.append(row)
        else:
            other_rows.append(row)

    if len(stock_rows) < MIN_POSTS_PER_STOCK or top_n <= 0:
        return []

    if tfidf_matrix is None or feature_names is None:
        tfidf_matrix, feature_names, _ = calculate_tfidf(
            [record['clean_text'] for record in all_data]
        )
    elif tfidf_matrix.shape[0] != len(all_data):
        raise ValueError('tfidf_matrix must have one row per entry of all_data')

    # Mean TF-IDF inside the stock versus everywhere else.
    avg_tfidf = np.asarray(tfidf_matrix[stock_rows].mean(axis=0)).ravel()
    if other_rows:
        other_avg = np.asarray(tfidf_matrix[other_rows].mean(axis=0)).ravel()
        diff = avg_tfidf - other_avg
    else:
        diff = avg_tfidf

    top_indices = diff.argsort()[-top_n * _CANDIDATE_FACTOR:][::-1]
    candidates = [
        {
            'word': feature_names[idx],
            'tfidf': avg_tfidf[idx],
            'diff': diff[idx],
        }
        for idx in top_indices
    ]
    return _dedupe_keywords(candidates, 'diff', top_n)


def _safe_filename(name):
    """Replace characters that are illegal in file names (e.g. ``*ST``) with ``_``."""
    cleaned = _UNSAFE_FILENAME_RE.sub('_', str(name)).strip()
    return cleaned or '_'


def _find_chinese_font():
    """Return the first existing font from ``_CJK_FONT_CANDIDATES``, else ``None``."""
    for path in _CJK_FONT_CANDIDATES:
        if os.path.isfile(path):
            return path
    return None


def generate_wordcloud(keywords, stock_name, output_dir='output', font_path=None):
    """Render ``keywords`` as a word-cloud PNG and return the file path.

    Args:
        keywords: dicts with ``'word'`` and ``'tfidf'`` entries; the TF-IDF
            value is used as the word weight.
        stock_name: label used in the output file name (sanitised first).
        output_dir: directory for the image; created if missing.
        font_path: font file containing Chinese glyphs. When omitted, a set
            of well-known system locations is searched.

    Returns:
        The path of the written image, or ``None`` when no keyword has a
        positive weight (nothing can be drawn).
    """
    # Zero weights cannot be scaled by the renderer, so they are left out.
    word_freq = {k['word']: float(k['tfidf']) for k in keywords if k['tfidf'] > 0}
    if not word_freq:
        print(f'没有可用于生成词云的关键词: {stock_name}')
        return None

    os.makedirs(output_dir, exist_ok=True)

    if font_path is None:
        font_path = _find_chinese_font()
        if font_path is None:
            # WordCloud falls back to its bundled font, which lacks CJK glyphs.
            print('未找到中文字体,词云中的中文可能无法正常显示')

    wc = WordCloud(
        font_path=font_path,
        width=800,
        height=600,
        background_color='white',
        max_words=100,
    )
    wc.generate_from_frequencies(word_freq)

    output_path = os.path.join(
        output_dir, f'wordcloud_{_safe_filename(stock_name)}.png'
    )
    wc.to_file(output_path)
    print(f'词云已保存到: {output_path}')
    return output_path


def analyze_all(data_dir='data', output_dir='output'):
    """Run the complete pipeline: load, analyse, render and export.

    Writes into ``output_dir``: ``overall_keywords.csv``, one word cloud and
    one ``keywords_<stock>.csv`` per stock with enough posts,
    ``summary_report.csv`` and ``all_posts.csv``.
    """
    print('=' * 60)
    print('股吧数据 TF-IDF 分析')
    print('=' * 60)

    os.makedirs(output_dir, exist_ok=True)

    # [1/5] Load the raw posts.
    print('\n[1/5] 加载数据...')
    all_data, stock_info = load_data(data_dir)

    if not all_data:
        print('没有找到数据,请先运行爬虫')
        return

    print(f' 共加载 {len(all_data)} 条帖子')
    print(f' 涉及 {len(stock_info)} 只股票:')
    for code, info in stock_info.items():
        print(f' - {info["name"]} ({code}): {info["post_count"]} 条')

    # [2/5] Corpus-wide keywords. The fitted matrix is reused for every stock.
    print('\n[2/5] 整体关键词分析...')
    all_texts = [d['clean_text'] for d in all_data]
    try:
        tfidf_matrix, feature_names, _ = calculate_tfidf(all_texts)
    except ValueError as e:
        # Raised by scikit-learn when cleaning left no usable tokens.
        print(f'TF-IDF 计算失败(清洗后可能没有可用词汇): {e}')
        return
    overall_keywords = get_top_keywords(tfidf_matrix, feature_names, top_n=30)

    print('\n 整体Top 20关键词:')
    for i, kw in enumerate(overall_keywords[:20], 1):
        print(f' {i:2d}. {kw["word"]:10s} (TF-IDF: {kw["tfidf"]:.4f})')

    overall_path = os.path.join(output_dir, 'overall_keywords.csv')
    pd.DataFrame(overall_keywords).to_csv(
        overall_path, index=False, encoding='utf-8-sig'
    )
    generate_wordcloud(overall_keywords, 'overall', output_dir)

    # [3/5] Per-stock keywords.
    print('\n[3/5] 各股票关键词分析...')
    stock_keywords = {}
    for stock_code, info in stock_info.items():
        stock_name = info['name']
        print(f'\n 分析 {stock_name} ({stock_code})...')
        keywords = get_stock_specific_keywords(
            all_data, stock_code, top_n=20,
            tfidf_matrix=tfidf_matrix, feature_names=feature_names,
        )
        stock_keywords[stock_code] = keywords

        if keywords:
            print(' Top 10关键词:')
            for i, kw in enumerate(keywords[:10], 1):
                print(f' {i:2d}. {kw["word"]:10s} (TF-IDF: {kw["tfidf"]:.4f}, 差值: {kw["diff"]:.4f})')

            generate_wordcloud(keywords, stock_name, output_dir)

            keywords_path = os.path.join(
                output_dir, f'keywords_{_safe_filename(stock_name)}.csv'
            )
            pd.DataFrame(keywords).to_csv(
                keywords_path, index=False, encoding='utf-8-sig'
            )

    # [4/5] Summary report.
    print('\n[4/5] 生成汇总报告...')
    report_data = [
        {
            '股票代码': stock_code,
            '股票名称': stock_info[stock_code]['name'],
            '帖子数量': stock_info[stock_code]['post_count'],
            'Top5关键词': ', '.join(k['word'] for k in keywords[:5]),
        }
        for stock_code, keywords in stock_keywords.items()
    ]
    report_path = os.path.join(output_dir, 'summary_report.csv')
    pd.DataFrame(report_data).to_csv(
        report_path, index=False, encoding='utf-8-sig'
    )
    print(f' 汇总报告已保存到: {report_path}')

    # [5/5] Pre-processed posts.
    print('\n[5/5] 保存预处理数据...')
    posts_path = os.path.join(output_dir, 'all_posts.csv')
    pd.DataFrame(all_data).to_csv(posts_path, index=False, encoding='utf-8-sig')
    print(f' 所有帖子已保存到: {posts_path}')

    print('\n' + '=' * 60)
    print(f'分析完成!结果保存在 {output_dir}/ 目录中')
    print('=' * 60)


if __name__ == '__main__':
    analyze_all()