完成股吧数据分析项目：

1. 修复词云断句问题 - 添加英文单词过滤
2. 创建 Word2Vec + CNN 情绪感知模型
3. 创建情绪时间序列分析脚本（基于大连理工大学情感词典）
4. 添加停用词文件（1427个中英文停用词）
5. 更新 analyze.py 保存时间字段 post_publish_time
6. 更新 requirements.txt 添加必要依赖
2026-05-28 15:30:16 +08:00
parent 5231e995dd
commit 0098977172
7 changed files with 2165 additions and 19 deletions
@@ -10,17 +10,30 @@ import matplotlib.pyplot as plt
 import matplotlib
 matplotlib.use('Agg')  # 使用非交互式后端

-# 中文停用词表
-STOPWORDS = {
-    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要',
-    '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '有', '吗', '吧', '呢', '啊', '呀', '什么', '怎么',
-    '为什么', '哪里', '谁', '多少', '几', '个', '只', '条', '把', '本', '篇', '次', '天', '今天', '明天', '昨天', '又',
-    '再', '还', '已经', '还是', '还是', '但是', '可是', '不过', '只是', '只有', '就是', '还是', '或者', '还是', '还是',
-    '这个', '那个', '这些', '那些', '那么', '这么', '怎么', '如何', '因为', '所以', '虽然', '但是', '如果', '就', '那么',
-    '跟', '和', '与', '及', '或', '还是', '还是', '还是', '还是', '还是', '还是', '还是', '还是', '还是', '还是',
-    '股吧', '东方财富', '帖子', '发表', '回复', '点击', '查看', '更多', '原文', '转发', '分享', '收藏', '评论', '点赞',
-    'http', 'https', 'com', 'cn', 'www', 'net', 'org'
-}
+def load_stopwords(filepath='stopwords.txt'):
+    """从文件加载停用词"""
+    stopwords = set()
+    if os.path.exists(filepath):
+        with open(filepath, 'r', encoding='utf-8') as f:
+            for line in f:
+                word = line.strip()
+                if word:
+                    stopwords.add(word)
+        print(f"已加载 {len(stopwords)} 个停用词")
+    else:
+        print(f"警告：停用词文件 {filepath} 不存在，使用默认停用词")
+        stopwords = {
+            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要',
+            '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '有', '吗', '吧', '呢', '啊', '呀', '什么', '怎么',
+            '为什么', '哪里', '谁', '多少', '几', '个', '只', '条', '把', '本', '篇', '次', '天', '今天', '明天', '昨天', '又',
+            '再', '还', '已经', '还是', '但是', '可是', '不过', '只是', '只有', '就是', '或者', '跟', '和', '与', '及', '或',
+            '股吧', '东方财富', '帖子', '发表', '回复', '点击', '查看', '更多', '原文', '转发', '分享', '收藏', '评论', '点赞',
+            'http', 'https', 'com', 'cn', 'www', 'net', 'org'
+        }
+    return stopwords
+
+# 加载停用词
+STOPWORDS = load_stopwords()

 def clean_text(text):
    """清洗文本"""
@@ -32,10 +45,11 @@ def clean_text(text):
    text = re.sub(r'<.*?>', '', text)
    # 移除表情符号
    text = re.sub(r'\[.*?\]', '', text)
-    # 移除特殊字符
-    text = re.sub(r'[^\w\s]', '', text)
-    # 移除数字
-    text = re.sub(r'\d+', '', text)
+    # 移除纯英文和数字混合的无效标记（如 sh123、abc456等）
+    text = re.sub(r'\b[a-zA-Z]+\d+\b', '', text)
+    text = re.sub(r'\b\d+[a-zA-Z]+\b', '', text)
+    # 移除特殊字符（保留中文、英文、数字）
+    text = re.sub(r'[^\w\s]', ' ', text)
    # 移除多余空格
    text = re.sub(r'\s+', ' ', text).strip()
    return text
@@ -43,9 +57,21 @@ def clean_text(text):
 def tokenize(text):
    """中文分词"""
    words = jieba.lcut(text)
-    # 过滤停用词和短词
-    words = [w for w in words if w not in STOPWORDS and len(w) > 1]
-    return words
+    # 过滤停用词、短词、纯英文单词和无意义字符
+    filtered_words = []
+    for w in words:
+        # 跳过停用词和短词
+        if w in STOPWORDS or len(w) <= 1:
+            continue
+        # 检查是否是纯英文单词
+        if re.match(r'^[a-zA-Z]+$', w):
+            # 过滤掉纯英文单词（通常是论坛标记、无意义的缩写等）
+            continue
+        # 检查是否包含无意义的英文字符组合
+        if re.match(r'^[a-zA-Z\s]+$', w):
+            continue
+        filtered_words.append(w)
+    return filtered_words

 def load_data(data_dir='data'):
    """加载所有股票数据"""
@@ -74,6 +100,7 @@ def load_data(data_dir='data'):
                    for post in posts:
                        content = post.get('post_content', '')
                        title = post.get('post_title', '')
+                        publish_time = post.get('post_publish_time', '')
                        full_text = f"{title} {content}".strip()
                        
                        if full_text:
@@ -81,6 +108,7 @@ def load_data(data_dir='data'):
                                'stock_code': stock_code,
                                'stock_name': stock_name,
                                'post_id': post.get('post_id'),
+                                'post_publish_time': publish_time,
                                'text': full_text,
                                'clean_text': clean_text(full_text)
                            })