0098977172
1. 修复词云断句问题 - 添加英文单词过滤 2. 创建 Word2Vec + CNN 情绪感知模型 3. 创建情绪时间序列分析脚本(基于大连理工大学情感词典) 4. 添加停用词文件(1427个中英文停用词) 5. 更新 analyze.py 保存时间字段 post_publish_time 6. 更新 requirements.txt 添加必要依赖
188 lines
6.2 KiB
Python
188 lines
6.2 KiB
Python
import requests
|
|
import pandas as pd
|
|
import json
|
|
import time
|
|
from datetime import datetime
|
|
import os
|
|
|
|
def fetch_guba_data(code='gssz', page=1, page_size=20, sort_type=1, timeout=10):
    """Request one page of the article list from the Eastmoney mobile Guba endpoint.

    The request is a form-encoded POST to ``GetData.aspx``; the paging
    arguments are packed into a single ``param`` form field.

    Args:
        code: Board/stock code, sent as ``code=`` and used in the Referer URL.
        page: Page number, sent as ``p=``.
        page_size: Number of items per page, sent as ``ps=``.
        sort_type: Value sent as ``sorttype=``.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without it a stalled connection would block forever.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the server
        answers with an error status, or the body is not valid JSON.
    """
    url = 'https://mguba.eastmoney.com/mguba2020/interface/GetData.aspx'

    # Browser-like headers; the Referer mirrors the list page being requested.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'DNT': '1',
        'Origin': 'https://mguba.eastmoney.com',
        'Pragma': 'no-cache',
        'Referer': f'https://mguba.eastmoney.com/mguba/list/{code}_{page}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Mobile Safari/537.36 Edg/148.0.0.0',
        'sec-ch-ua': '"Chromium";v="148", "Microsoft Edge";v="148", "Not/A)Brand";v="99"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"'
    }

    # NOTE(review): these look like cookies captured from one browser session;
    # confirm whether the endpoint needs them and whether they expire.
    cookies = {
        'qgqp_b_id': '30059d8839ad5c045fa8856e38013e9c',
        'st_nvi': 'XwpSfYXGjCxfCdbgapK5_cac4',
        'nid18': '0daec1df8064f04edd20b4e69250a8f5',
        'nid18_create_time': '1776263017375',
        'gviem': 'UrMH_tSu1UpW8B_TKmytl803f',
        'gviem_create_time': '1776263017375',
        'fullscreengg': '1',
        'fullscreengg2': '1',
        'st_si': '17952715731426',
        'show_app_box_time': '1779903756410',
        'st_pvi': '26838250597806',
        'st_sp': '2026-04-15 22:23:37',
        'st_inirUrl': 'https://cn.bing.com/',
        'st_sn': '30',
        'st_psi': '20260528025236177-117016304298-3040545697',
        'ad_tc_load_num': '3',
        'st_asi': '20260528025236177-117016304298-3040545697-ad.djxd-1'
    }

    param = f'code={code}&p={page}&ps={page_size}&sorttype={sort_type}'
    data = {
        'param': param,
        'plat': 'wap',
        'version': '200',
        'path': '/webarticlelist/api/Article/WebArticleList',
        'env': '1',
        'origin': '',
        'ctoken': '',
        'utoken': ''
    }

    try:
        response = requests.post(
            url, headers=headers, cookies=cookies, data=data, timeout=timeout
        )
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(f'请求失败: {e}')
        return None
|
|
|
|
def fetch_stock_posts(code, name, pages=10, page_size=20):
    """Crawl several pages of posts for one stock and bundle them with metadata.

    Pages ``1..pages`` are requested one after another through
    ``fetch_guba_data``, pausing one second between requests. A page whose
    response is missing, or whose ``'re'`` entry is not a list, is reported
    and skipped instead of aborting the crawl.

    Args:
        code: Stock code passed to ``fetch_guba_data``.
        name: Display name used in progress messages and stored in the result.
        pages: Number of pages to request.
        page_size: Number of posts requested per page.

    Returns:
        A dict with keys ``stock_code``, ``stock_name``, ``total_pages`` (the
        number of pages requested, not the number that succeeded),
        ``total_posts``, ``crawl_time`` (ISO-8601 local time) and ``posts``.
    """
    all_posts = []

    for page in range(1, pages + 1):
        print(f'正在爬取 {name} ({code}) - 第 {page}/{pages} 页')
        result = fetch_guba_data(code=code, page=page, page_size=page_size)

        # 're' may be absent or null; extending with a non-list would raise
        # TypeError and lose every page collected so far.
        posts = result.get('re') if isinstance(result, dict) else None

        if isinstance(posts, list):
            all_posts.extend(posts)
            print(f' 成功获取 {len(posts)} 条帖子')
        else:
            print(f' 第 {page} 页获取失败或无数据')

        # Throttle between requests; no need to wait after the final page.
        if page < pages:
            time.sleep(1)

    # Assemble the result record.
    data = {
        'stock_code': code,
        'stock_name': name,
        'total_pages': pages,
        'total_posts': len(all_posts),
        'crawl_time': datetime.now().isoformat(),
        'posts': all_posts
    }

    return data
|
|
|
|
def save_to_json(data, name="", filename=None):
    """Write ``data`` to a UTF-8 JSON file and return the path that was written.

    Args:
        data: JSON-serializable object. Any falsy value is rejected.
        name: Label embedded in the generated file name when ``filename`` is
            not supplied.
        filename: Destination path. When falsy, ``guba_<name>_<timestamp>.json``
            is used, with the timestamp taken from the current local time.

    Returns:
        The file name written, or ``None`` when ``data`` is falsy.
    """
    if data:
        if not filename:
            timestamp = format(datetime.now(), '%Y%m%d_%H%M%S')
            filename = f'guba_{name}_{timestamp}.json'

        # Non-ASCII text is kept readable instead of being \u-escaped.
        with open(filename, 'w', encoding='utf-8') as out:
            json.dump(data, out, ensure_ascii=False, indent=2)

        print(f'JSON数据已保存到: {filename}')
        return filename

    print('数据为空,无法保存')
    return None
|
|
|
|
def save_to_excel(data, name="", filename=None):
    """Flatten the crawled posts into a table and write it to an .xlsx file.

    Args:
        data: Mapping with a ``'posts'`` entry holding post dicts.
        name: Label embedded in the generated file name when ``filename`` is
            not supplied.
        filename: Destination path. When falsy, ``guba_<name>_<timestamp>.xlsx``
            is used, with the timestamp taken from the current local time.

    Returns:
        The file name written, or ``None`` when ``data`` is falsy or has no
        ``'posts'`` entry.
    """
    if not data or 'posts' not in data:
        print('数据格式不正确,无法保存')
        return None

    records = []

    # A null 'posts' value is treated as an empty list.
    for post in data['posts'] or []:
        # ``.get(key, {})`` only covers a missing key; a key that is present
        # with a None value would raise AttributeError on the chained ``.get``.
        author = post.get('post_user') or {}
        bar = post.get('post_guba') or {}
        records.append({
            '帖子ID': post.get('post_id'),
            '标题': post.get('post_title'),
            '内容': post.get('post_content'),
            '作者': author.get('user_nickname'),
            '发布时间': post.get('post_publish_time'),
            '最后更新': post.get('post_last_time'),
            '阅读数': post.get('post_click_count'),
            '评论数': post.get('post_comment_count'),
            '点赞数': post.get('post_like_count'),
            '股吧': bar.get('stockbar_name'),
            '来源': post.get('post_from')
        })

    df = pd.DataFrame(records)

    if not filename:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'guba_{name}_{timestamp}.xlsx'

    df.to_excel(filename, index=False, engine='openpyxl')
    print(f'Excel数据已保存到: {filename}')
    return filename
|
|
|
|
if __name__ == '__main__':
    # Stocks to crawl: stock code -> display name.
    GAME_STOCKS = {
        '002624': '完美世界',
        '002555': '三七互娱',
        '002558': '巨人网络',
        '002602': '世纪华通',
        '300418': '昆仑万维',
        '002174': '游族网络',
        '300315': '掌趣科技',
        '603444': '吉比特',
    }

    # Make sure the output directory exists before anything is written.
    output_dir = 'data'
    os.makedirs(output_dir, exist_ok=True)

    for code, name in GAME_STOCKS.items():
        print(f'\n{"="*50}')
        print(f'开始爬取 {name} ({code})')
        print(f'{"="*50}')

        # Crawl 30 pages per stock.
        data = fetch_stock_posts(code, name, pages=30)

        if not data or data['total_posts'] <= 0:
            print(f'{name} 爬取失败或无数据')
        else:
            print(f'\n共获取 {data["total_posts"]} 条帖子')

            # Persist the raw crawl result as JSON.
            json_path = os.path.join(output_dir, f'guba_{name}_{code}.json')
            save_to_json(data, name, json_path)

            # Persist the flattened table as Excel.
            excel_path = os.path.join(output_dir, f'guba_{name}_{code}.xlsx')
            save_to_excel(data, name, excel_path)

        # Pause between stocks.
        time.sleep(2)
|