# guba2vec/analyze.py

import json
import os
import re
import jieba
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')  # 使用非交互式后端

# Chinese stop-word list used by the tokenizer. Entries are grouped loosely by
# role; the grouping is for readability only and has no runtime meaning.
STOPWORDS = {
    # high-frequency particles, pronouns and other very common words
    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要',
    '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '吗', '吧', '呢', '啊', '呀',
    # question words
    '什么', '怎么', '为什么', '哪里', '谁', '多少', '几', '如何',
    # measure words and time words
    '个', '只', '条', '把', '本', '篇', '次', '天', '今天', '明天', '昨天',
    # adverbs and conjunctions
    '又', '再', '还', '已经', '还是', '但是', '可是', '不过', '只是', '只有', '就是', '或者',
    '因为', '所以', '虽然', '如果', '跟', '与', '及', '或',
    # demonstratives
    '这个', '那个', '这些', '那些', '那么', '这么',
    # forum boilerplate
    '股吧', '东方财富', '帖子', '发表', '回复', '点击', '查看', '更多', '原文', '转发', '分享', '收藏', '评论', '点赞',
    # URL fragments
    'http', 'https', 'com', 'cn', 'www', 'net', 'org',
}

def clean_text(text):
    """Strip noise from a raw post so only plain words remain.

    The substitutions run in a fixed order: URLs, HTML tags, bracketed
    emoticon codes such as ``[微笑]``, every character that is neither a
    word character nor whitespace, digits, and finally runs of whitespace
    (collapsed to one space). Leading and trailing whitespace is trimmed.

    Args:
        text: Raw post text; any falsy value (``None``, ``""``) is accepted.

    Returns:
        The cleaned string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""

    # (pattern, replacement) pairs; order matters because URLs and tags must
    # be removed before their punctuation is stripped.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),  # URLs
        (r'<.*?>', ''),                  # HTML tags
        (r'\[.*?\]', ''),                # bracketed emoticon codes
        (r'[^\w\s]', ''),                # punctuation / special characters
        (r'\d+', ''),                    # digits
        (r'\s+', ' '),                   # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def tokenize(text):
    """Segment *text* with jieba and keep only informative tokens.

    A token is dropped when it is a single character (or empty) or when it
    appears in ``STOPWORDS``.

    Args:
        text: The string to segment.

    Returns:
        The surviving tokens, in the order jieba produced them.
    """
    kept = []
    for token in jieba.lcut(text):
        if len(token) <= 1 or token in STOPWORDS:
            continue
        kept.append(token)
    return kept

def load_data(data_dir='data'):
    """Load every ``*.json`` stock file found directly inside *data_dir*.

    Each file is read as a JSON object with the optional keys ``stock_name``,
    ``stock_code`` and ``posts``; each post is read for ``post_id``,
    ``post_title`` and ``post_content``. Missing or ``null`` values are
    treated as empty, so a ``null`` title/content never ends up as the
    literal text "None" in the corpus.

    Loading is best-effort: a file that cannot be read or parsed is reported
    on stdout and skipped, and the remaining files are still processed.

    Args:
        data_dir: Directory containing the JSON files.

    Returns:
        A ``(all_data, stock_info)`` tuple. ``all_data`` is a list with one
        dict per non-empty post (keys ``stock_code``, ``stock_name``,
        ``post_id``, ``text``, ``clean_text``). ``stock_info`` maps a stock
        code to ``{'name': ..., 'post_count': ...}``, where ``post_count``
        counts all posts in the file, including empty ones. Both are empty
        when *data_dir* is not an existing directory.
    """
    all_data = []
    stock_info = {}

    # isdir (not exists) so that a plain file passed by mistake is reported
    # instead of making os.listdir raise.
    if not os.path.isdir(data_dir):
        print(f'数据目录 {data_dir} 不存在')
        return all_data, stock_info

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(data_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            stock_name = data.get('stock_name', '未知')
            stock_code = data.get('stock_code', '未知')
            # "posts": null must behave like a missing key.
            posts = data.get('posts') or []

            stock_info[stock_code] = {
                'name': stock_name,
                'post_count': len(posts)
            }

            for post in posts:
                # `or ''` maps JSON null to an empty string.
                title = post.get('post_title') or ''
                content = post.get('post_content') or ''
                full_text = f"{title} {content}".strip()
                if not full_text:
                    continue

                all_data.append({
                    'stock_code': stock_code,
                    'stock_name': stock_name,
                    'post_id': post.get('post_id'),
                    'text': full_text,
                    'clean_text': clean_text(full_text)
                })
        except Exception as e:
            # Deliberately broad: one malformed file must not abort the whole load.
            print(f'加载文件 {filename} 失败: {e}')

    return all_data, stock_info

def calculate_tfidf(texts):
    """Fit a TF-IDF model on *texts* and return the weighted matrix.

    The vectorizer uses the module's ``tokenize`` function, builds unigrams
    and bigrams, and caps the vocabulary at 1000 features.

    Args:
        texts: Iterable of documents (strings) to fit on.

    Returns:
        A ``(tfidf_matrix, feature_names, vectorizer)`` tuple: the matrix
        returned by ``fit_transform``, the names from
        ``get_feature_names_out()``, and the fitted vectorizer itself.
    """
    vectorizer_options = {
        'tokenizer': tokenize,
        # The built-in regex tokenizer is replaced by `tokenize`, so no
        # token pattern is supplied.
        'token_pattern': None,
        'max_features': 1000,
        'ngram_range': (1, 2),
    }
    model = TfidfVectorizer(**vectorizer_options)
    weights = model.fit_transform(texts)
    return weights, model.get_feature_names_out(), model

def get_top_keywords(tfidf_matrix, feature_names, top_n=20):
    """Pick the highest-scoring, non-overlapping keywords from a TF-IDF matrix.

    Features are scored by their mean TF-IDF over all rows. The best
    ``top_n * 4`` features form the candidate pool. Multi-word features
    (n-grams, whose parts are separated by whitespace) are considered before
    single words, and a candidate is rejected when any of its parts already
    belongs to an accepted keyword. For example, once "股票 上涨" is kept,
    "股票" and "上涨" are skipped.

    Args:
        tfidf_matrix: 2-D matrix supporting ``.mean(axis=0)``, one column per
            entry of *feature_names*.
        feature_names: Sequence of feature strings indexed by column.
        top_n: Maximum number of keywords to return. Values ``<= 0`` yield
            an empty list.

    Returns:
        A list of at most *top_n* dicts ``{'word': ..., 'tfidf': ...}``
        sorted by descending mean TF-IDF.
    """
    # Guard first: with top_n == 0 the slice below would become `[-0:]`,
    # i.e. the whole array, and one keyword would still be returned.
    if top_n <= 0:
        return []

    avg_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
    # Over-sample (4x) so enough candidates survive the overlap filter.
    top_indices = avg_tfidf.argsort()[-top_n * 4:][::-1]

    candidates = []
    for idx in top_indices:
        word = feature_names[idx]
        parts = word.split()
        if parts:  # skip blank features
            candidates.append((word, parts, avg_tfidf[idx]))

    # Longer n-grams first, then higher score.
    candidates.sort(key=lambda c: (-len(c[1]), -c[2]))

    keywords = []
    used_parts = set()
    for word, parts, score in candidates:
        if used_parts.intersection(parts):
            continue
        used_parts.update(parts)
        keywords.append({'word': word, 'tfidf': score})
        if len(keywords) >= top_n:
            break

    # Present the final selection by score rather than by n-gram length.
    keywords.sort(key=lambda kw: -kw['tfidf'])
    return keywords

def get_stock_specific_keywords(all_data, stock_code, top_n=20):
    """Find keywords that characterise one stock relative to all the others.

    A TF-IDF model is fitted on the posts of every stock. Each feature is
    scored by ``diff``: its mean TF-IDF over this stock's posts minus its
    mean over the other stocks' posts (or just the stock mean when there are
    no other posts). The best ``top_n * 4`` features by ``diff`` form the
    candidate pool. Multi-word features are considered before single words,
    and a candidate is rejected when any of its whitespace-separated parts
    already belongs to an accepted keyword.

    Args:
        all_data: List of post dicts with at least ``stock_code`` and
            ``clean_text`` keys.
        stock_code: Code of the stock to analyse.
        top_n: Maximum number of keywords to return. Values ``<= 0`` yield
            an empty list.

    Returns:
        A list of at most *top_n* dicts
        ``{'word': ..., 'tfidf': ..., 'diff': ...}`` sorted by descending
        ``diff``; ``tfidf`` is the mean over this stock's posts. Returns an
        empty list when the stock has fewer than 5 posts.
    """
    # Guard before the expensive fit: with top_n == 0 the slice `[-0:]`
    # below would select every feature and one keyword would still come back.
    if top_n <= 0:
        return []

    stock_texts = [d['clean_text'] for d in all_data if d['stock_code'] == stock_code]
    other_texts = [d['clean_text'] for d in all_data if d['stock_code'] != stock_code]

    # Too few posts to produce meaningful statistics.
    if len(stock_texts) < 5:
        return []

    # Stock posts come first so the matrix can be split by row index.
    tfidf_matrix, feature_names, _ = calculate_tfidf(stock_texts + other_texts)
    n_stock = len(stock_texts)

    avg_tfidf = np.array(tfidf_matrix[:n_stock].mean(axis=0)).flatten()
    if other_texts:
        other_avg = np.array(tfidf_matrix[n_stock:].mean(axis=0)).flatten()
        diff = avg_tfidf - other_avg
    else:
        diff = avg_tfidf

    # Over-sample (4x) so enough candidates survive the overlap filter.
    top_indices = diff.argsort()[-top_n * 4:][::-1]

    candidates = []
    for idx in top_indices:
        word = feature_names[idx]
        parts = word.split()
        if parts:  # skip blank features
            candidates.append((word, parts, avg_tfidf[idx], diff[idx]))

    # Longer n-grams first, then larger diff.
    candidates.sort(key=lambda c: (-len(c[1]), -c[3]))

    keywords = []
    used_parts = set()
    for word, parts, score, delta in candidates:
        if used_parts.intersection(parts):
            continue
        used_parts.update(parts)
        keywords.append({'word': word, 'tfidf': score, 'diff': delta})
        if len(keywords) >= top_n:
            break

    # Present the final selection by distinctiveness.
    keywords.sort(key=lambda kw: -kw['diff'])
    return keywords

# Common install locations of CJK-capable fonts, tried in order. The first
# entry is the path this module has always used, so Windows behaviour is
# unchanged. The other paths are typical defaults and may not exist on a
# given machine.
_CJK_FONT_CANDIDATES = (
    'C:/Windows/Fonts/simhei.ttf',
    'C:/Windows/Fonts/msyh.ttc',
    '/System/Library/Fonts/PingFang.ttc',
    '/System/Library/Fonts/STHeiti Medium.ttc',
    '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
    '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
)

def generate_wordcloud(keywords, stock_name, output_dir='output', font_path=None):
    """Render *keywords* as a word-cloud PNG and return the file path.

    Args:
        keywords: List of dicts with ``word`` and ``tfidf`` keys; ``tfidf``
            is used as the word's weight.
        stock_name: Label used in the output file name
            (``wordcloud_<stock_name>.png``).
        output_dir: Directory for the image; created when missing.
        font_path: Font file able to render Chinese glyphs. When ``None``,
            the first existing entry of ``_CJK_FONT_CANDIDATES`` is used. If
            none exists, the original Windows SimHei path is passed through
            unchanged, so the missing font is reported by WordCloud instead
            of being silently replaced.

    Returns:
        The path of the written PNG, or ``None`` when *keywords* is empty
        (nothing is rendered in that case).
    """
    os.makedirs(output_dir, exist_ok=True)

    word_freq = {k['word']: k['tfidf'] for k in keywords}
    if not word_freq:
        # WordCloud cannot render an empty frequency table.
        print(f'没有可用于生成词云的关键词: {stock_name}')
        return None

    if font_path is None:
        font_path = next(
            (path for path in _CJK_FONT_CANDIDATES if os.path.exists(path)),
            _CJK_FONT_CANDIDATES[0],
        )

    wc = WordCloud(
        font_path=font_path,
        width=800,
        height=600,
        background_color='white',
        max_words=100
    )

    wc.generate_from_frequencies(word_freq)

    output_path = os.path.join(output_dir, f'wordcloud_{stock_name}.png')
    wc.to_file(output_path)
    print(f'词云已保存到: {output_path}')

    return output_path

def analyze_all():
    """Run the complete analysis and write every artefact under ``output/``.

    Steps, each announced on stdout:
      1. Load the posts via ``load_data()``; stop early if none are found.
      2. Compute corpus-wide keywords, print the top 20, save them as CSV
         and render an overall word cloud.
      3. For each stock, compute stock-specific keywords; when any are
         found, print the top 10, render a word cloud and save a CSV.
      4. Write a summary CSV with each stock's post count and top 5 keywords.
      5. Dump all loaded posts (raw and cleaned text) to CSV.

    Returns:
        None.
    """
    banner = '=' * 60
    print(banner)
    print('股吧数据 TF-IDF 分析')
    print(banner)

    # Make sure the output directory exists before anything is written.
    os.makedirs('output', exist_ok=True)

    # Step 1: load the data.
    print('\n[1/5] 加载数据...')
    all_data, stock_info = load_data()
    if not all_data:
        print('没有找到数据，请先运行爬虫')
        return

    print(f'  共加载 {len(all_data)} 条帖子')
    print(f'  涉及 {len(stock_info)} 只股票:')
    for code, info in stock_info.items():
        print(f'    - {info["name"]} ({code}): {info["post_count"]} 条')

    # Step 2: corpus-wide keywords.
    print('\n[2/5] 整体关键词分析...')
    corpus = [record['clean_text'] for record in all_data]
    tfidf_matrix, feature_names, _ = calculate_tfidf(corpus)
    overall_keywords = get_top_keywords(tfidf_matrix, feature_names, top_n=30)

    print('\n  整体Top 20关键词:')
    for rank, kw in enumerate(overall_keywords[:20], 1):
        print(f'    {rank:2d}. {kw["word"]:10s} (TF-IDF: {kw["tfidf"]:.4f})')

    pd.DataFrame(overall_keywords).to_csv(
        'output/overall_keywords.csv', index=False, encoding='utf-8-sig'
    )
    generate_wordcloud(overall_keywords, 'overall')

    # Step 3: per-stock keywords.
    print('\n[3/5] 各股票关键词分析...')
    stock_keywords = {}
    for stock_code, info in stock_info.items():
        stock_name = info['name']
        print(f'\n  分析 {stock_name} ({stock_code})...')

        keywords = get_stock_specific_keywords(all_data, stock_code, top_n=20)
        stock_keywords[stock_code] = keywords
        if not keywords:
            continue

        print('  Top 10关键词:')
        for rank, kw in enumerate(keywords[:10], 1):
            print(f'    {rank:2d}. {kw["word"]:10s} (TF-IDF: {kw["tfidf"]:.4f}, 差值: {kw["diff"]:.4f})')

        generate_wordcloud(keywords, stock_name)
        pd.DataFrame(keywords).to_csv(
            f'output/keywords_{stock_name}.csv', index=False, encoding='utf-8-sig'
        )

    # Step 4: summary report, one row per stock.
    print('\n[4/5] 生成汇总报告...')
    report_rows = [
        {
            '股票代码': stock_code,
            '股票名称': stock_info[stock_code]['name'],
            '帖子数量': stock_info[stock_code]['post_count'],
            'Top5关键词': ', '.join([kw['word'] for kw in keywords[:5]]),
        }
        for stock_code, keywords in stock_keywords.items()
    ]
    pd.DataFrame(report_rows).to_csv(
        'output/summary_report.csv', index=False, encoding='utf-8-sig'
    )
    print('  汇总报告已保存到: output/summary_report.csv')

    # Step 5: persist the pre-processed posts.
    print('\n[5/5] 保存预处理数据...')
    pd.DataFrame(all_data).to_csv(
        'output/all_posts.csv', index=False, encoding='utf-8-sig'
    )
    print('  所有帖子已保存到: output/all_posts.csv')

    print('\n' + banner)
    print('分析完成！结果保存在 output/ 目录中')
    print(banner)

# Run the full analysis pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    analyze_all()