"""Scrape post lists from mguba.eastmoney.com and save them as JSON and Excel."""

import json
import os
import time
from datetime import datetime

import pandas as pd
import requests

# Stock code -> display name for the stock boards crawled by ``main``.
GAME_STOCKS = {
    '002624': '完美世界',
    '002555': '三七互娱',
    '002558': '巨人网络',
    '002602': '世纪华通',
    '300418': '昆仑万维',
    '002174': '游族网络',
    '300315': '掌趣科技',
    '603444': '吉比特',
}

# Number of list pages requested per stock by ``main``.
PAGES_PER_STOCK = 30


def fetch_guba_data(code='gssz', page=1, page_size=20, sort_type=1, timeout=10):
    """Fetch one page of the article list for a stock board.

    Args:
        code: Board/stock code placed in the request and the Referer header.
        page: 1-based page number (sent as ``p``).
        page_size: Number of posts requested per page (sent as ``ps``).
        sort_type: Value sent as ``sorttype``.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON response, or ``None`` if the request failed or the
        body was not valid JSON.
    """
    url = 'https://mguba.eastmoney.com/mguba2020/interface/GetData.aspx'

    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'DNT': '1',
        'Origin': 'https://mguba.eastmoney.com',
        'Pragma': 'no-cache',
        'Referer': f'https://mguba.eastmoney.com/mguba/list/{code}_{page}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Mobile Safari/537.36 Edg/148.0.0.0',
        'sec-ch-ua': '"Chromium";v="148", "Microsoft Edge";v="148", "Not/A)Brand";v="99"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"',
    }

    # NOTE(review): these look like values captured from a browser session;
    # confirm whether the endpoint still accepts them once they go stale.
    cookies = {
        'qgqp_b_id': '30059d8839ad5c045fa8856e38013e9c',
        'st_nvi': 'XwpSfYXGjCxfCdbgapK5_cac4',
        'nid18': '0daec1df8064f04edd20b4e69250a8f5',
        'nid18_create_time': '1776263017375',
        'gviem': 'UrMH_tSu1UpW8B_TKmytl803f',
        'gviem_create_time': '1776263017375',
        'fullscreengg': '1',
        'fullscreengg2': '1',
        'st_si': '17952715731426',
        'show_app_box_time': '1779903756410',
        'st_pvi': '26838250597806',
        'st_sp': '2026-04-15 22:23:37',
        'st_inirUrl': 'https://cn.bing.com/',
        'st_sn': '30',
        'st_psi': '20260528025236177-117016304298-3040545697',
        'ad_tc_load_num': '3',
        'st_asi': '20260528025236177-117016304298-3040545697-ad.djxd-1',
    }

    # The query for the backend API travels as a single form field.
    param = f'code={code}&p={page}&ps={page_size}&sorttype={sort_type}'
    data = {
        'param': param,
        'plat': 'wap',
        'version': '200',
        'path': '/webarticlelist/api/Article/WebArticleList',
        'env': '1',
        'origin': '',
        'ctoken': '',
        'utoken': '',
    }

    try:
        response = requests.post(
            url, headers=headers, cookies=cookies, data=data, timeout=timeout
        )
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f'请求失败: {e}')
        return None


def fetch_stock_posts(code, name, pages=10, page_size=20, delay=1):
    """Crawl several list pages for one stock and bundle the posts.

    Args:
        code: Stock code passed to ``fetch_guba_data``.
        name: Display name used in progress output and stored in the result.
        pages: Number of pages to request, starting at page 1.
        page_size: Posts requested per page.
        delay: Seconds to sleep between consecutive page requests.

    Returns:
        A dict with keys ``stock_code``, ``stock_name``, ``total_pages``,
        ``total_posts``, ``crawl_time`` (ISO-8601 local time) and ``posts``.
    """
    all_posts = []

    for page in range(1, pages + 1):
        print(f'正在爬取 {name} ({code}) - 第 {page}/{pages} 页')

        result = fetch_guba_data(code=code, page=page, page_size=page_size)

        # Guard against a non-dict payload or a null 're' field; either one
        # would otherwise crash the membership test or ``extend``.
        posts = result.get('re') if isinstance(result, dict) else None
        if isinstance(posts, list):
            all_posts.extend(posts)
            print(f' 成功获取 {len(posts)} 条帖子')
        else:
            print(f' 第 {page} 页获取失败或无数据')

        # Pause between requests to avoid hitting the server too quickly;
        # no pause is needed after the final page.
        if page < pages:
            time.sleep(delay)

    return {
        'stock_code': code,
        'stock_name': name,
        'total_pages': pages,
        'total_posts': len(all_posts),
        'crawl_time': datetime.now().isoformat(),
        'posts': all_posts,
    }


def save_to_json(data, name="", filename=None):
    """Write ``data`` to a UTF-8 JSON file.

    Args:
        data: JSON-serializable object; nothing is written if it is falsy.
        name: Used only to build the default file name.
        filename: Target path; defaults to ``guba_<name>_<timestamp>.json``.

    Returns:
        The path written, or ``None`` when ``data`` is empty.
    """
    if not data:
        print('数据为空,无法保存')
        return None

    if not filename:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'guba_{name}_{timestamp}.json'

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f'JSON数据已保存到: {filename}')
    return filename


def save_to_excel(data, name="", filename=None):
    """Flatten the posts in ``data`` into a table and write an .xlsx file.

    Args:
        data: Dict containing a ``posts`` list of post dicts.
        name: Used only to build the default file name.
        filename: Target path; defaults to ``guba_<name>_<timestamp>.xlsx``.

    Returns:
        The path written, or ``None`` when ``data`` is empty or has no
        ``posts`` key.
    """
    if not data or 'posts' not in data:
        print('数据格式不正确,无法保存')
        return None

    records = []
    for post in data['posts']:
        # The nested objects may be missing or explicitly null; treat both
        # as empty so one odd post does not abort the whole export.
        user = post.get('post_user') or {}
        guba = post.get('post_guba') or {}
        records.append({
            '帖子ID': post.get('post_id'),
            '标题': post.get('post_title'),
            '内容': post.get('post_content'),
            '作者': user.get('user_nickname'),
            '发布时间': post.get('post_publish_time'),
            '最后更新': post.get('post_last_time'),
            '阅读数': post.get('post_click_count'),
            '评论数': post.get('post_comment_count'),
            '点赞数': post.get('post_like_count'),
            '股吧': guba.get('stockbar_name'),
            '来源': post.get('post_from'),
        })

    df = pd.DataFrame(records)

    if not filename:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'guba_{name}_{timestamp}.xlsx'

    df.to_excel(filename, index=False, engine='openpyxl')
    print(f'Excel数据已保存到: {filename}')
    return filename


def main():
    """Crawl every stock in ``GAME_STOCKS`` and save JSON and Excel output."""
    # Output files are written under ./data.
    os.makedirs('data', exist_ok=True)

    for code, name in GAME_STOCKS.items():
        print(f'\n{"="*50}')
        print(f'开始爬取 {name} ({code})')
        print(f'{"="*50}')

        data = fetch_stock_posts(code, name, pages=PAGES_PER_STOCK)

        if data and data['total_posts'] > 0:
            print(f'\n共获取 {data["total_posts"]} 条帖子')

            json_filename = os.path.join('data', f'guba_{name}_{code}.json')
            save_to_json(data, name, json_filename)

            excel_filename = os.path.join('data', f'guba_{name}_{code}.xlsx')
            save_to_excel(data, name, excel_filename)
        else:
            print(f'{name} 爬取失败或无数据')

        # Pause between stocks.
        time.sleep(2)


if __name__ == '__main__':
    main()