"""Scrape post lists and comments from an Eastmoney Guba stock board.

The script downloads the board's list pages, extracts the embedded
``article_list`` JSON, fetches the replies of every post through the
``getData`` endpoint and writes everything to a JSON file.
"""

import gzip
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
import zlib
from datetime import datetime

# NOTE(review): these are browser session cookies committed to source; they
# will expire and arguably belong in configuration rather than in the script.
COOKIE = (
    'qgqp_b_id=30059d8839ad5c045fa8856e38013e9c; '
    'st_nvi=XwpSfYXGjCxfCdbgapK5_cac4; '
    'nid18=0daec1df8064f04edd20b4e69250a8f5; '
    'nid18_create_time=1776263017375; '
    'gviem=UrMH_tSu1UpW8B_TKmytl803f; '
    'gviem_create_time=1776263017375; '
    'fullscreengg=1; '
    'fullscreengg2=1; '
    'st_si=63999118594852; '
    'wsc_checkuser_ok=1; '
    'st_asi=delete; '
    'st_pvi=26838250597806; '
    'st_sp=2026-04-15%2022%3A23%3A37; '
    'st_inirUrl=https%3A%2F%2Fcn.bing.com%2F; '
    'st_sn=30; '
    'st_psi=20260520214901287-117001354293-0422265952'
)

# Headers sent when requesting HTML list pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36',
    'Referer': 'https://guba.eastmoney.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Ch-Ua': '"Chromium";v="148", "Not;A=Brand";v="24", "Microsoft Edge";v="148"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Cookie': COOKIE,
}

# Headers sent with the XHR-style POST that returns a post's replies.
comment_headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://guba.eastmoney.com',
    'Pragma': 'no-cache',
    'Referer': 'https://guba.eastmoney.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': COOKIE,
}

MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 2.0
DELAY_BETWEEN_PAGES = 5.0
OUTPUT_FILE = 'guba_data.json'

# Seconds to wait before retrying a failed request / a rate-limited request.
RETRY_DELAY = 5
RATE_LIMIT_DELAY = 10
# Hard upper bound on reply pages per post so pagination always terminates.
MAX_COMMENT_PAGES = 500
# Content codings the standard library can undo; brotli ("br") is not one.
SUPPORTED_ENCODINGS = 'gzip, deflate'

# Locates the JSON object assigned to ``article_list`` in a list page.
ARTICLE_LIST_RE = re.compile(r'var article_list=\s*(?={"re":)')


def _to_int(value, default=0):
    """Return *value* converted to ``int``, or *default* if that fails."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _request_headers(base_headers):
    """Return a copy of *base_headers* that only advertises decodable codings.

    urllib never decompresses a response on its own, so asking the server for
    ``br`` could yield a body this script cannot read.
    """
    prepared = dict(base_headers)
    for name in prepared:
        if name.lower() == 'accept-encoding':
            prepared[name] = SUPPORTED_ENCODINGS
    return prepared


def decode_body(raw, content_encoding):
    """Decompress a response body if needed and decode it as UTF-8 text.

    Args:
        raw: The response body as bytes.
        content_encoding: Value of the ``Content-Encoding`` response header
            (may be ``None`` or empty when the body is not compressed).

    Returns:
        The decoded text; undecodable bytes are dropped.
    """
    encoding = (content_encoding or '').strip().lower()
    if encoding in ('gzip', 'x-gzip'):
        raw = gzip.decompress(raw)
    elif encoding == 'deflate':
        try:
            raw = zlib.decompress(raw)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw.decode('utf-8', errors='ignore')


def fetch(url, headers, method='GET', data=None, timeout=15):
    """Request *url* and return the response body as text.

    Makes up to ``MAX_RETRIES`` attempts. HTTP 429 and 403 responses, network
    errors and unexpected exceptions are retried after a pause; any other
    HTTP error status aborts immediately.

    Args:
        url: Absolute URL to request.
        headers: Mapping of request headers.
        method: HTTP method name.
        data: Optional request body as bytes.
        timeout: Socket timeout in seconds.

    Returns:
        The decoded body, or ``None`` when the request ultimately failed.
    """
    request_headers = _request_headers(headers)
    for attempt in range(MAX_RETRIES):
        wait = RETRY_DELAY
        try:
            req = urllib.request.Request(
                url, headers=request_headers, method=method, data=data
            )
            with urllib.request.urlopen(req, timeout=timeout) as response:
                if response.status != 200:
                    print(f' 请求失败,状态码: {response.status}')
                    return None
                raw = response.read()
                content_encoding = response.headers.get('Content-Encoding', '')
            return decode_body(raw, content_encoding)
        except urllib.error.HTTPError as e:
            # urlopen raises for 4xx/5xx, so error statuses are handled here
            # rather than on the response object. Must precede URLError,
            # which is its base class.
            if e.code == 429:
                print(' 请求过于频繁,等待10秒后重试...')
                wait = RATE_LIMIT_DELAY
            elif e.code == 403:
                print(f' 请求被拒绝,第{attempt+1}次重试...')
            else:
                print(f' 请求失败,状态码: {e.code}')
                return None
        except urllib.error.URLError as e:
            print(f' 请求失败({e.reason}),第{attempt+1}次重试...')
        except Exception as e:
            # Best-effort boundary: report and retry instead of crashing.
            print(f' 请求异常: {str(e)}')
        if attempt < MAX_RETRIES - 1:
            time.sleep(wait)
    return None


def initialize_session():
    """Request the Guba home page once before scraping starts."""
    print('正在初始化会话...')
    fetch('https://guba.eastmoney.com/', headers)
    time.sleep(2)
    print('会话初始化完成')


def parse_post_list(html, stock_code):
    """Extract post summaries from the HTML of a board list page.

    Args:
        html: Text of the list page.
        stock_code: Board code, used to build each post's URL.

    Returns:
        A list of post dicts; empty when the embedded JSON is missing or
        cannot be parsed.
    """
    match = ARTICLE_LIST_RE.search(html)
    if not match:
        return []
    try:
        # raw_decode stops at the end of the JSON value, so text such as
        # "};" inside a title cannot truncate the payload.
        data, _ = json.JSONDecoder().raw_decode(html, match.end())
    except json.JSONDecodeError as e:
        print(f' JSON解析失败: {str(e)}')
        return []
    if not isinstance(data, dict):
        return []

    posts = []
    for item in data.get('re') or []:
        if not isinstance(item, dict):
            continue
        post_id = item.get('post_id', '')
        title = (item.get('post_title') or '').strip()
        if not (post_id and title):
            continue
        posts.append({
            'post_id': post_id,
            'title': title,
            'author': (item.get('user_nickname') or '').strip(),
            'post_time': item.get('post_display_time', ''),
            # Compared numerically by callers, so normalise it to int.
            'comment_count': _to_int(item.get('post_comment_count', 0)),
            'click_count': item.get('post_click_count', 0),
            'forward_count': item.get('post_forward_count', 0),
            'like_count': item.get('post_like_count', 0),
            'url': f'https://guba.eastmoney.com/news,{stock_code},{post_id}.html',
        })
    return posts


def get_post_list(stock_code='002624', page=1):
    """Download one list page of a board and return its posts.

    Args:
        stock_code: Board code.
        page: 1-based list page number.

    Returns:
        A list of post dicts; empty when the page could not be fetched or
        parsed.
    """
    if page == 1:
        url = f'https://guba.eastmoney.com/list,{stock_code},f.html'
    else:
        url = f'https://guba.eastmoney.com/list,{stock_code},f{page}.html'
    html = fetch(url, headers)
    if not html:
        return []
    return parse_post_list(html, stock_code)


def parse_comments(response_text):
    """Parse the JSON body returned by the reply-list endpoint.

    Args:
        response_text: Response body as text.

    Returns:
        A ``(comments, raw_count)`` tuple: the replies that have non-empty
        text, and the number of replies the response contained before that
        filtering. ``raw_count`` is what pagination must look at.
    """
    try:
        data = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f' JSON解析失败: {str(e)}')
        return [], 0
    if not isinstance(data, dict):
        return [], 0

    nested = data.get('data')
    if 're' in data:
        reply_list = data.get('re', [])
    elif isinstance(nested, dict) and 'reply_list' in nested:
        reply_list = nested.get('reply_list', [])
    else:
        print(f' 未知的响应结构: {list(data.keys())}')
        return [], 0
    if not isinstance(reply_list, list) or not reply_list:
        return [], 0

    comments = []
    for item in reply_list:
        if not isinstance(item, dict):
            continue
        reply_user = item.get('reply_user')
        if not isinstance(reply_user, dict):
            reply_user = {}
        comment = {
            'reply_id': str(item.get('reply_id', '')),
            'user_nickname': (reply_user.get('user_nickname') or '').strip(),
            'reply_content': (item.get('reply_text') or '').strip(),
            'reply_time': item.get('reply_time', ''),
            'reply_like_count': item.get('reply_like_count', 0),
            'reply_against_count': item.get('reply_against_count', 0),
        }
        if comment['reply_content']:
            comments.append(comment)
    return comments, len(reply_list)


def get_comment_page(stock_code, post_id, page=1, page_size=30):
    """Fetch one page of replies for a post.

    Returns:
        The ``(comments, raw_count)`` tuple described in ``parse_comments``;
        ``([], 0)`` when the request failed.
    """
    url = f'https://guba.eastmoney.com/api/getData?code={stock_code}&path=reply/api/Reply/ArticleNewReplyList'
    payload = {
        'param': f'postid={post_id}&sort=1&sorttype=1&p={page}&ps={page_size}',
        'plat': 'Web',
        'path': 'reply/api/Reply/ArticleNewReplyList',
        'env': '2',
        'origin': '',
        'version': '2022',
        'product': 'Guba',
    }
    body = urllib.parse.urlencode(payload).encode('utf-8')
    response_text = fetch(url, comment_headers, method='POST', data=body)
    if not response_text:
        return [], 0
    return parse_comments(response_text)


def get_comments(stock_code, post_id, page=1, page_size=30):
    """Return the non-empty replies on one reply page of a post.

    Args:
        stock_code: Board code.
        post_id: Identifier of the post.
        page: 1-based reply page number.
        page_size: Number of replies requested per page.

    Returns:
        A list of comment dicts; empty on any failure.
    """
    comments, _ = get_comment_page(stock_code, post_id, page, page_size)
    return comments


def get_all_comments(stock_code, post_id, total_comments):
    """Collect the replies of a post across all reply pages.

    Paging stops when a page comes back empty or short, when a page contains
    only replies that were already collected, or after ``MAX_COMMENT_PAGES``
    pages.

    Args:
        stock_code: Board code.
        post_id: Identifier of the post.
        total_comments: Comment count reported by the list page. Kept for
            interface compatibility; it is not used to bound paging.

    Returns:
        A list of comment dicts without duplicate ``reply_id`` values.
    """
    all_comments = []
    seen_ids = set()
    page_size = 30
    for page in range(1, MAX_COMMENT_PAGES + 1):
        comments, raw_count = get_comment_page(
            stock_code, post_id, page, page_size
        )
        if raw_count == 0:
            break

        fresh = []
        for comment in comments:
            reply_id = comment['reply_id']
            if reply_id:
                if reply_id in seen_ids:
                    continue
                seen_ids.add(reply_id)
            fresh.append(comment)
        if comments and not fresh:
            # The server is repeating a page we already have; stop here
            # instead of looping on identical data.
            break

        all_comments.extend(fresh)
        print(f' 第{page}页评论获取完成,累计{len(all_comments)}条')
        # Use the unfiltered size: replies with empty text are dropped from
        # `comments` and must not make a full page look like the last one.
        if raw_count < page_size:
            break
        time.sleep(DELAY_BETWEEN_REQUESTS)
    else:
        print(f' 已达到最大评论页数{MAX_COMMENT_PAGES},停止翻页')
    return all_comments


def process_post(stock_code, post):
    """Build the output record for one post, including its comments.

    Args:
        stock_code: Board code.
        post: Post dict as produced by ``get_post_list``.

    Returns:
        A dict with the post's fields plus a ``comments`` list.
    """
    post_id = post['post_id']
    title = post['title']
    print(f' 获取帖子: {title[:40]}... (评论:{post["comment_count"]})')
    post_data = {
        'post_id': post_id,
        'title': title,
        'author': post.get('author', ''),
        'post_time': post.get('post_time', ''),
        'url': post['url'],
        'comment_count': post.get('comment_count', 0),
        'click_count': post.get('click_count', 0),
        'forward_count': post.get('forward_count', 0),
        'like_count': post.get('like_count', 0),
        'comments': [],
    }
    if post['comment_count'] > 0:
        print(' 正在获取评论...')
        comments = get_all_comments(stock_code, post_id, post['comment_count'])
        post_data['comments'] = comments
        print(f' 评论获取完成,共{len(comments)}条')
        # Pause only when requests were actually sent for this post.
        time.sleep(DELAY_BETWEEN_REQUESTS)
    return post_data


def scrape_guba(stock_code='002624', stock_name='完美世界', total_pages=3,
                min_comment_count=0):
    """Scrape the first *total_pages* list pages of a board.

    Args:
        stock_code: Board code.
        stock_name: Display name, used only in progress output.
        total_pages: Number of list pages to walk.
        min_comment_count: When greater than zero, posts with fewer comments
            are skipped.

    Returns:
        A list of post records as built by ``process_post``.
    """
    all_posts = []
    seen_post_ids = set()
    print(f'开始爬取{stock_name}({stock_code})股吧前{total_pages}页帖子...')
    print(f'爬取时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    if min_comment_count > 0:
        print(f'筛选条件: 评论数 >= {min_comment_count}')
    print('-' * 60)
    initialize_session()

    for page in range(1, total_pages + 1):
        print(f'\n正在爬取第{page}/{total_pages}页...')
        posts = get_post_list(stock_code, page)
        if not posts:
            print(f' 第{page}页未找到数据')
            continue
        print(f' 找到{len(posts)}个帖子')

        filtered_posts = []
        for post in posts:
            post_id = post['post_id']
            # The same post can show up on more than one list page.
            if post_id in seen_post_ids:
                continue
            seen_post_ids.add(post_id)
            if min_comment_count > 0 and post['comment_count'] < min_comment_count:
                continue
            filtered_posts.append(post)
        if not filtered_posts:
            print(f' 第{page}页没有符合条件的帖子')
            continue

        for post in filtered_posts:
            all_posts.append(process_post(stock_code, post))
        print(f' 第{page}页完成,已获取{len(all_posts)}个帖子')
        if page < total_pages:
            time.sleep(DELAY_BETWEEN_PAGES)
    return all_posts


def save_to_json(data, filename):
    """Write the scraped posts to *filename* as UTF-8 JSON.

    Args:
        data: List of post records.
        filename: Path of the file to create or overwrite.

    Returns:
        The dict that was written (scrape time, post count and posts).
    """
    output = {
        'scrape_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'total_posts': len(data),
        'posts': data,
    }
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    return output


def main():
    """Run the scraper with the default settings and print a summary."""
    stock_code = '002624'
    stock_name = '完美世界'
    total_pages = 3
    min_comment_count = 0
    print(f'使用 Python: {sys.version}')
    print(f'脚本路径: {__file__}')
    print(f'工作目录: {os.getcwd()}')

    start_time = datetime.now()
    all_posts = scrape_guba(stock_code, stock_name, total_pages, min_comment_count)
    end_time = datetime.now()
    elapsed = (end_time - start_time).total_seconds()

    print('\n' + '=' * 60)
    if all_posts:
        output = save_to_json(all_posts, OUTPUT_FILE)
        print('爬取完成!')
        print(f' - 帖子数量: {output["total_posts"]}')
        print(f' - 数据已保存到: {OUTPUT_FILE}')
        print(f' - 耗时: {elapsed:.2f} 秒')
        print('\n前3个帖子预览:')
        for i, post in enumerate(all_posts[:3], 1):
            print(f'\n--- 帖子{i} ---')
            print(f'标题: {post["title"]}')
            print(f'作者: {post["author"]}')
            print(f'时间: {post["post_time"]}')
            print(f'URL: {post["url"]}')
            print(f'评论数: {post["comment_count"]}')
            print(f'实际获取评论数: {len(post["comments"])}')
            if post.get('comments'):
                print(f'第一条评论: {post["comments"][0]["reply_content"][:30]}...')
    else:
        print('未获取到任何数据')
        print(f'耗时: {elapsed:.2f} 秒')


if __name__ == '__main__':
    main()