This commit is contained in:
2026-05-28 04:54:42 +08:00
commit 5231e995dd
7 changed files with 1251 additions and 0 deletions
+187
View File
@@ -0,0 +1,187 @@
import requests
import pandas as pd
import json
import time
from datetime import datetime
import os
def fetch_guba_data(code='gssz', page=1, page_size=20, sort_type=1, timeout=10):
    """Fetch one page of the article list from the Eastmoney mobile Guba API.

    Sends a form-encoded POST to ``GetData.aspx`` asking it to serve the
    ``/webarticlelist/api/Article/WebArticleList`` path with the paging
    arguments packed into the ``param`` form field.

    Args:
        code: Board/stock code; used in the ``code=`` argument and the Referer.
        page: 1-based page number, sent as ``p``.
        page_size: Number of posts per page, sent as ``ps``.
        sort_type: Sort order selector, sent as ``sorttype``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The decoded JSON response, or ``None`` when the request fails, the
        server answers with an HTTP error status, or the body is not JSON.
    """
    url = 'https://mguba.eastmoney.com/mguba2020/interface/GetData.aspx'

    # Browser-like headers copied from a mobile Edge session.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'DNT': '1',
        'Origin': 'https://mguba.eastmoney.com',
        'Pragma': 'no-cache',
        'Referer': f'https://mguba.eastmoney.com/mguba/list/{code}_{page}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Mobile Safari/537.36 Edg/148.0.0.0',
        'sec-ch-ua': '"Chromium";v="148", "Microsoft Edge";v="148", "Not/A)Brand";v="99"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"'
    }

    # NOTE(review): these look like cookies captured from one browser session;
    # presumably they expire and need refreshing - confirm.
    cookies = {
        'qgqp_b_id': '30059d8839ad5c045fa8856e38013e9c',
        'st_nvi': 'XwpSfYXGjCxfCdbgapK5_cac4',
        'nid18': '0daec1df8064f04edd20b4e69250a8f5',
        'nid18_create_time': '1776263017375',
        'gviem': 'UrMH_tSu1UpW8B_TKmytl803f',
        'gviem_create_time': '1776263017375',
        'fullscreengg': '1',
        'fullscreengg2': '1',
        'st_si': '17952715731426',
        'show_app_box_time': '1779903756410',
        'st_pvi': '26838250597806',
        'st_sp': '2026-04-15 22:23:37',
        'st_inirUrl': 'https://cn.bing.com/',
        'st_sn': '30',
        'st_psi': '20260528025236177-117016304298-3040545697',
        'ad_tc_load_num': '3',
        'st_asi': '20260528025236177-117016304298-3040545697-ad.djxd-1'
    }

    # The paging arguments travel as a query-string-shaped value inside the
    # single ``param`` form field (values are interpolated as-is).
    param = f'code={code}&p={page}&ps={page_size}&sorttype={sort_type}'
    data = {
        'param': param,
        'plat': 'wap',
        'version': '200',
        'path': '/webarticlelist/api/Article/WebArticleList',
        'env': '1',
        'origin': '',
        'ctoken': '',
        'utoken': ''
    }

    try:
        response = requests.post(
            url,
            headers=headers,
            cookies=cookies,
            data=data,
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f'请求失败: {e}')
        return None
    except ValueError as e:
        # Older requests releases raise a plain ValueError subclass for a
        # non-JSON body, which RequestException does not cover.
        print(f'响应解析失败: {e}')
        return None
def fetch_stock_posts(code, name, pages=10, page_size=20, delay=1):
    """Crawl several list pages for one stock and bundle the posts.

    Pages ``1..pages`` are requested one after another through
    ``fetch_guba_data``. A page that fails or carries no usable ``re`` list is
    reported and skipped; crawling continues with the next page.

    Args:
        code: Stock code passed to ``fetch_guba_data``.
        name: Display name, used for progress output and stored in the result.
        pages: Number of pages to request.
        page_size: Posts per page.
        delay: Seconds to sleep between two page requests; ``0`` disables
            the pause.

    Returns:
        A dict with ``stock_code``, ``stock_name``, ``total_pages`` (the number
        of pages requested), ``total_posts``, ``crawl_time`` (ISO-8601 local
        time) and ``posts`` (all collected post objects).
    """
    all_posts = []

    for page in range(1, pages + 1):
        print(f'正在爬取 {name} ({code}) - 第 {page}/{pages}')
        result = fetch_guba_data(code=code, page=page, page_size=page_size)

        # The 're' key can be present but null (or not a list at all);
        # extending with such a value would raise or add garbage entries.
        posts = result.get('re') if isinstance(result, dict) else None
        if isinstance(posts, list):
            all_posts.extend(posts)
            print(f' 成功获取 {len(posts)} 条帖子')
        else:
            print(f'{page} 页获取失败或无数据')

        # Throttle requests, but do not wait after the final page.
        if page < pages and delay:
            time.sleep(delay)

    # Assemble the crawl summary.
    data = {
        'stock_code': code,
        'stock_name': name,
        'total_pages': pages,
        'total_posts': len(all_posts),
        'crawl_time': datetime.now().isoformat(),
        'posts': all_posts
    }
    return data
def save_to_json(data, name="", filename=None):
    """Write ``data`` to a UTF-8 JSON file and return the path used.

    Args:
        data: JSON-serialisable object; a falsy value is rejected.
        name: Label embedded in the generated file name when ``filename``
            is not supplied.
        filename: Target path. When falsy, ``guba_<name>_<timestamp>.json``
            is created in the current working directory.

    Returns:
        The path that was written, or ``None`` when ``data`` is empty.
    """
    # Guard clause: nothing to write.
    if not data:
        print('数据为空,无法保存')
        return None

    # Only build the timestamped default when no explicit path was given.
    out_path = filename or f'guba_{name}_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'

    with open(out_path, mode='w', encoding='utf-8') as handle:
        # Keep non-ASCII text readable in the output file.
        json.dump(data, handle, ensure_ascii=False, indent=2)

    print(f'JSON数据已保存到: {out_path}')
    return out_path
def _nested_field(post, outer_key, inner_key):
    """Return ``post[outer_key][inner_key]``, or ``None`` if the outer value is not a dict."""
    outer = post.get(outer_key)
    if isinstance(outer, dict):
        return outer.get(inner_key)
    return None


def save_to_excel(data, name="", filename=None):
    """Flatten the crawled posts into a table and write it as an .xlsx file.

    Each post becomes one row holding its id, title, content, author
    nickname, publish/last-update times, read/comment/like counts, board name
    and source. Missing fields are written as empty cells.

    Args:
        data: Crawl result containing a ``posts`` list of post dicts.
        name: Label embedded in the generated file name when ``filename``
            is not supplied.
        filename: Target path. When falsy, ``guba_<name>_<timestamp>.xlsx``
            is created in the current working directory.

    Returns:
        The path that was written, or ``None`` when ``data`` is empty or has
        no ``posts`` key.
    """
    if not data or 'posts' not in data:
        print('数据格式不正确,无法保存')
        return None

    # A null 'posts' value is treated as "no posts" instead of crashing.
    posts = data['posts'] or []

    records = []
    for post in posts:
        record = {
            '帖子ID': post.get('post_id'),
            '标题': post.get('post_title'),
            '内容': post.get('post_content'),
            # 'post_user' / 'post_guba' may be present but null; a plain
            # .get(key, {}) default does not cover that case.
            '作者': _nested_field(post, 'post_user', 'user_nickname'),
            '发布时间': post.get('post_publish_time'),
            '最后更新': post.get('post_last_time'),
            '阅读数': post.get('post_click_count'),
            '评论数': post.get('post_comment_count'),
            '点赞数': post.get('post_like_count'),
            '股吧': _nested_field(post, 'post_guba', 'stockbar_name'),
            '来源': post.get('post_from')
        }
        records.append(record)

    df = pd.DataFrame(records)

    if not filename:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'guba_{name}_{timestamp}.xlsx'

    df.to_excel(filename, index=False, engine='openpyxl')
    print(f'Excel数据已保存到: {filename}')
    return filename
if __name__ == '__main__':
    # Stocks to crawl: stock code -> display name.
    GAME_STOCKS = {
        '002624': '完美世界',
        '002555': '三七互娱',
        '002558': '巨人网络',
        '002602': '世纪华通',
        '300418': '昆仑万维',
        '002174': '游族网络',
        '300315': '掌趣科技',
        '603444': '吉比特',
    }

    # Make sure the output directory exists.
    os.makedirs('data', exist_ok=True)

    last_index = len(GAME_STOCKS) - 1
    for index, (code, name) in enumerate(GAME_STOCKS.items()):
        print(f'\n{"="*50}')
        print(f'开始爬取 {name} ({code})')
        print(f'{"="*50}')

        # Crawl 10 pages for this stock.
        data = fetch_stock_posts(code, name, pages=10)

        if data and data['total_posts'] > 0:
            print(f'\n共获取 {data["total_posts"]} 条帖子')

            # Save the raw result as JSON.
            json_filename = os.path.join('data', f'guba_{name}_{code}.json')
            save_to_json(data, name, json_filename)

            # Save the flattened table as Excel.
            excel_filename = os.path.join('data', f'guba_{name}_{code}.xlsx')
            save_to_excel(data, name, excel_filename)
        else:
            print(f'{name} 爬取失败或无数据')

        # Pause between stocks; nothing follows the last one, so skip the
        # wait there (same rule the page loop uses between pages).
        if index < last_index:
            time.sleep(2)