guba
This commit is contained in:
+343
@@ -0,0 +1,343 @@
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
from datetime import datetime
|
||||
|
||||
# Request headers sent with the HTML page requests (the home-page warm-up and
# the post-list pages). The values imitate a desktop Chromium-based browser
# navigating within guba.eastmoney.com.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36',
    'Referer': 'https://guba.eastmoney.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    # Client-hint and fetch-metadata headers describing a top-level navigation.
    'Sec-Ch-Ua': '"Chromium";v="148", "Not;A=Brand";v="24", "Microsoft Edge";v="148"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    # NOTE(review): hard-coded cookie string, presumably captured from a
    # browser session; it will go stale and is duplicated in comment_headers.
    # Consider loading it from configuration instead of committing it.
    'Cookie': 'qgqp_b_id=30059d8839ad5c045fa8856e38013e9c; st_nvi=XwpSfYXGjCxfCdbgapK5_cac4; nid18=0daec1df8064f04edd20b4e69250a8f5; nid18_create_time=1776263017375; gviem=UrMH_tSu1UpW8B_TKmytl803f; gviem_create_time=1776263017375; fullscreengg=1; fullscreengg2=1; st_si=63999118594852; wsc_checkuser_ok=1; st_asi=delete; st_pvi=26838250597806; st_sp=2026-04-15%2022%3A23%3A37; st_inirUrl=https%3A%2F%2Fcn.bing.com%2F; st_sn=30; st_psi=20260520214901287-117001354293-0422265952',
}
|
||||
|
||||
# Request headers sent with the comment-list POST requests. They describe an
# XMLHttpRequest carrying a form-urlencoded body, which matches how the
# comment payload is encoded before it is posted.
comment_headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://guba.eastmoney.com',
    'Pragma': 'no-cache',
    'Referer': 'https://guba.eastmoney.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    # NOTE(review): same hard-coded cookie string as in `headers`; keep the
    # two copies in sync or load both from one place.
    'Cookie': 'qgqp_b_id=30059d8839ad5c045fa8856e38013e9c; st_nvi=XwpSfYXGjCxfCdbgapK5_cac4; nid18=0daec1df8064f04edd20b4e69250a8f5; nid18_create_time=1776263017375; gviem=UrMH_tSu1UpW8B_TKmytl803f; gviem_create_time=1776263017375; fullscreengg=1; fullscreengg2=1; st_si=63999118594852; wsc_checkuser_ok=1; st_asi=delete; st_pvi=26838250597806; st_sp=2026-04-15%2022%3A23%3A37; st_inirUrl=https%3A%2F%2Fcn.bing.com%2F; st_sn=30; st_psi=20260520214901287-117001354293-0422265952',
}
|
||||
|
||||
# Maximum number of attempts fetch() makes for a single request.
MAX_RETRIES = 3
# Pause, in seconds, between consecutive comment pages and after each post.
DELAY_BETWEEN_REQUESTS = 2.0
# Pause, in seconds, between consecutive post-list pages.
DELAY_BETWEEN_PAGES = 5.0
# Path of the JSON file the script writes its results to.
OUTPUT_FILE = 'guba_data.json'
|
||||
|
||||
|
||||
def fetch(url, headers, method='GET', data=None, timeout=15):
    """Request ``url`` and return the response body as text, or ``None``.

    Up to ``MAX_RETRIES`` attempts are made. HTTP 429 and 403 responses,
    connection errors and unexpected exceptions are retried after a pause;
    any other non-200 status gives up immediately.

    Args:
        url: Absolute URL to request.
        headers: Mapping of request header names to values. It is copied,
            never modified.
        method: HTTP method name.
        data: Optional request body as bytes.
        timeout: Socket timeout in seconds for each attempt.

    Returns:
        The body decoded as UTF-8 with undecodable bytes dropped, or
        ``None`` when no attempt produced a 200 response.
    """
    import gzip
    import urllib.error
    import zlib

    # Only advertise content codings that are undone below. The standard
    # library has no Brotli decoder, so "br" must not be offered.
    decodable = ('gzip', 'deflate', 'identity')
    request_headers = {}
    for name, value in headers.items():
        if name.lower() == 'accept-encoding':
            kept = [
                coding.strip()
                for coding in str(value).split(',')
                if coding.split(';')[0].strip().lower() in decodable
            ]
            if not kept:
                continue
            value = ', '.join(kept)
        request_headers[name] = value

    for attempt in range(MAX_RETRIES):
        wait_seconds = 5
        try:
            req = urllib.request.Request(
                url, headers=request_headers, method=method, data=data
            )
            with urllib.request.urlopen(req, timeout=timeout) as response:
                if response.status != 200:
                    print(f' 请求失败,状态码: {response.status}')
                    return None
                body = response.read()
                coding = (response.headers.get('Content-Encoding') or '').strip().lower()

            if coding in ('gzip', 'x-gzip'):
                body = gzip.decompress(body)
            elif coding == 'deflate':
                try:
                    body = zlib.decompress(body)
                except zlib.error:
                    # Some servers send a raw deflate stream without the
                    # zlib wrapper.
                    body = zlib.decompress(body, -zlib.MAX_WBITS)

            return body.decode('utf-8', errors='ignore')

        except urllib.error.HTTPError as e:
            # urlopen raises for 4xx/5xx instead of returning a response, so
            # status-based retry decisions have to be made here.
            if e.code == 429:
                print(' 请求过于频繁,等待10秒后重试...')
                wait_seconds = 10
            elif e.code == 403:
                print(f' 请求被拒绝,第{attempt+1}次重试...')
            else:
                print(f' 请求失败,状态码: {e.code}')
                return None
        except urllib.error.URLError:
            print(f' 请求超时,第{attempt+1}次重试...')
        except Exception as e:
            print(f' 请求异常: {str(e)}')

        # No point in waiting when no further attempt will be made.
        if attempt < MAX_RETRIES - 1:
            time.sleep(wait_seconds)

    return None
|
||||
|
||||
|
||||
def initialize_session():
    """Request the Guba home page once as a warm-up before scraping.

    The response body is discarded and the function then pauses for two
    seconds. A progress message is printed before and after.
    """
    print('正在初始化会话...')

    landing_page = 'https://guba.eastmoney.com/'
    _ = fetch(landing_page, headers)  # result intentionally unused
    time.sleep(2)

    print('会话初始化完成')
|
||||
|
||||
|
||||
def get_post_list(stock_code='002624', page=1):
    """Return the posts listed on one page of a stock's Guba board.

    The list page embeds its data as ``var article_list={"re": [...]}``;
    that JSON object is located and decoded, and one dict is built per
    entry that has both a post id and a non-empty title.

    Args:
        stock_code: Stock code used in the board URL.
        page: 1-based list page number.

    Returns:
        A list of dicts with the keys ``post_id``, ``title``, ``author``,
        ``post_time``, ``comment_count``, ``click_count``,
        ``forward_count``, ``like_count`` and ``url``. The list is empty
        when the page could not be fetched, the embedded data is missing,
        or it cannot be parsed.
    """
    if page == 1:
        url = f'https://guba.eastmoney.com/list,{stock_code},f.html'
    else:
        url = f'https://guba.eastmoney.com/list,{stock_code},f{page}.html'

    html = fetch(url, headers)
    if not html:
        return []

    # Locate the start of the embedded object and let the JSON decoder find
    # its end. Cutting at the first "};" with a regex would truncate the
    # payload whenever a string value happens to contain that sequence.
    marker = re.search(r'var article_list=\s*(?={"re":)', html)
    if not marker:
        return []

    try:
        data, _ = json.JSONDecoder().raw_decode(html, marker.end())
    except json.JSONDecodeError as e:
        print(f' JSON解析失败: {str(e)}')
        return []

    entries = data.get('re') if isinstance(data, dict) else None
    if not isinstance(entries, list):
        return []

    posts = []
    for item in entries:
        if not isinstance(item, dict):
            continue

        post_id = item.get('post_id', '')
        # JSON null must be treated like a missing value, not crash strip().
        title = str(item.get('post_title') or '').strip()
        author = str(item.get('user_nickname') or '').strip()

        if not (post_id and title):
            continue

        posts.append({
            'post_id': post_id,
            'title': title,
            'author': author,
            'post_time': item.get('post_display_time', ''),
            # "or 0" keeps later numeric comparisons safe when a count is null.
            'comment_count': item.get('post_comment_count', 0) or 0,
            'click_count': item.get('post_click_count', 0) or 0,
            'forward_count': item.get('post_forward_count', 0) or 0,
            'like_count': item.get('post_like_count', 0) or 0,
            'url': f'https://guba.eastmoney.com/news,{stock_code},{post_id}.html',
        })

    return posts
|
||||
|
||||
|
||||
def get_comments(stock_code, post_id, page=1, page_size=30):
    """Fetch one page of replies for a post.

    Args:
        stock_code: Stock code passed as the ``code`` query parameter.
        post_id: Identifier of the post whose replies are requested.
        page: 1-based reply page number.
        page_size: Number of replies requested per page.

    Returns:
        A list of dicts with the keys ``reply_id``, ``user_nickname``,
        ``reply_content``, ``reply_time``, ``reply_like_count`` and
        ``reply_against_count``. Replies with empty text are dropped, so
        the list may be shorter than the page the server returned. The list
        is empty when the request fails or the response cannot be
        interpreted.
    """
    url = f'https://guba.eastmoney.com/api/getData?code={stock_code}&path=reply/api/Reply/ArticleNewReplyList'

    payload = {
        'param': f'postid={post_id}&sort=1&sorttype=1&p={page}&ps={page_size}',
        'plat': 'Web',
        'path': 'reply/api/Reply/ArticleNewReplyList',
        'env': '2',
        'origin': '',
        'version': '2022',
        'product': 'Guba'
    }

    body = urllib.parse.urlencode(payload).encode('utf-8')
    response_text = fetch(url, comment_headers, method='POST', data=body)
    if not response_text:
        return []

    # Only the decode itself can raise JSONDecodeError; keep the try narrow.
    try:
        data = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f' JSON解析失败: {str(e)}')
        return []

    if not isinstance(data, dict):
        print(f' 未知的响应结构: {type(data).__name__}')
        return []

    # Two response layouts are accepted: replies directly under "re", or
    # nested under data.reply_list. "data" may be JSON null.
    nested = data.get('data')
    if 're' in data:
        reply_list = data.get('re', [])
    elif isinstance(nested, dict) and 'reply_list' in nested:
        reply_list = nested.get('reply_list', [])
    else:
        print(f' 未知的响应结构: {list(data.keys())}')
        return []

    if not isinstance(reply_list, list) or not reply_list:
        return []

    comments = []
    for item in reply_list:
        if not isinstance(item, dict):
            continue

        # Null or malformed user/text fields are treated as absent rather
        # than raising and aborting the whole scrape.
        reply_user = item.get('reply_user')
        if not isinstance(reply_user, dict):
            reply_user = {}

        content = str(item.get('reply_text') or '').strip()
        if not content:
            continue

        comments.append({
            'reply_id': str(item.get('reply_id', '')),
            'user_nickname': str(reply_user.get('user_nickname') or '').strip(),
            'reply_content': content,
            'reply_time': item.get('reply_time', ''),
            'reply_like_count': item.get('reply_like_count', 0),
            'reply_against_count': item.get('reply_against_count', 0),
        })

    return comments
|
||||
|
||||
|
||||
def get_all_comments(stock_code, post_id, total_comments):
    """Collect the replies of a post across all reply pages.

    Pages are requested in order, pausing ``DELAY_BETWEEN_REQUESTS``
    seconds between them, until a page comes back empty or the end of the
    reply list is reached.

    Args:
        stock_code: Stock code the post belongs to.
        post_id: Identifier of the post.
        total_comments: Comment count reported for the post. It is used to
            estimate how many pages exist; values that are not positive
            integers disable that estimate.

    Returns:
        A list of all reply dicts gathered, in page order.
    """
    all_comments = []
    page_size = 30
    page = 1

    try:
        expected = int(total_comments)
    except (TypeError, ValueError):
        expected = 0
    # Ceiling division; 0 means "no usable estimate".
    expected_pages = -(-expected // page_size) if expected > 0 else 0

    while True:
        comments = get_comments(stock_code, post_id, page, page_size)
        if not comments:
            break

        all_comments.extend(comments)
        print(f' 第{page}页评论获取完成,累计{len(all_comments)}条')

        # get_comments drops replies with empty text, so a short page does
        # not by itself prove this was the last page. Stop on a short page
        # only once the expected number of pages has been requested; an
        # empty page still ends the loop above.
        if len(comments) < page_size and page >= expected_pages:
            break

        page += 1
        time.sleep(DELAY_BETWEEN_REQUESTS)

    return all_comments
|
||||
|
||||
|
||||
def process_post(stock_code, post):
    """Build the output record for one post, including its comments.

    Args:
        stock_code: Stock code the post belongs to.
        post: Post dict; ``post_id``, ``title``, ``url`` and
            ``comment_count`` are required, the remaining fields fall back
            to defaults.

    Returns:
        A dict with the post's fields plus a ``comments`` list, which is
        filled only when the post reports at least one comment.
    """
    post_id, title = post['post_id'], post['title']
    print(f' 获取帖子: {title[:40]}... (评论:{post["comment_count"]})')

    record = {
        'post_id': post_id,
        'title': title,
        'author': post.get('author', ''),
        'post_time': post.get('post_time', ''),
        'url': post['url'],
    }
    for counter in ('comment_count', 'click_count', 'forward_count', 'like_count'):
        record[counter] = post.get(counter, 0)
    record['comments'] = []

    has_comments = post['comment_count'] > 0
    if has_comments:
        print(f' 正在获取评论...')
        fetched = get_all_comments(stock_code, post_id, post['comment_count'])
        record['comments'] = fetched
        print(f' 评论获取完成,共{len(fetched)}条')

    # Pause after every post, whether or not comments were requested.
    time.sleep(DELAY_BETWEEN_REQUESTS)
    return record
|
||||
|
||||
|
||||
def scrape_guba(stock_code='002624', stock_name='完美世界', total_pages=3, min_comment_count=0):
    """Scrape the first ``total_pages`` list pages of a stock's Guba board.

    Args:
        stock_code: Stock code of the board to scrape.
        stock_name: Display name used only in progress messages.
        total_pages: Number of list pages to walk, starting at page 1.
        min_comment_count: When greater than zero, posts with fewer
            comments than this are skipped.

    Returns:
        A list of post records, each including its collected comments, in
        the order they were encountered. Posts already seen on an earlier
        page are not repeated.
    """
    collected = []
    seen_post_ids = set()
    filtering = min_comment_count > 0

    print(f'开始爬取{stock_name}({stock_code})股吧前{total_pages}页帖子...')
    print(f'爬取时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    if filtering:
        print(f'筛选条件: 评论数 >= {min_comment_count}')
    print('-' * 60)

    initialize_session()

    for page in range(1, total_pages + 1):
        print(f'\n正在爬取第{page}/{total_pages}页...')

        listed = get_post_list(stock_code, page)
        if not listed:
            print(f' 第{page}页未找到数据')
            continue

        print(f' 找到{len(listed)}个帖子')

        # Keep posts not seen before that also pass the comment threshold.
        # A post is marked as seen even when the threshold rejects it.
        selected = []
        for candidate in listed:
            candidate_id = candidate['post_id']
            if candidate_id in seen_post_ids:
                continue
            seen_post_ids.add(candidate_id)

            too_few = filtering and candidate['comment_count'] < min_comment_count
            if not too_few:
                selected.append(candidate)

        if not selected:
            print(f' 第{page}页没有符合条件的帖子')
            continue

        for chosen in selected:
            collected.append(process_post(stock_code, chosen))

        print(f' 第{page}页完成,已获取{len(collected)}个帖子')

        is_last_page = page >= total_pages
        if not is_last_page:
            time.sleep(DELAY_BETWEEN_PAGES)

    return collected
|
||||
|
||||
|
||||
def save_to_json(data, filename):
    """Write the scraped posts to ``filename`` as pretty-printed JSON.

    Args:
        data: List of post records.
        filename: Path of the file to create or overwrite.

    Returns:
        The dict that was written: ``scrape_time`` (local time formatted as
        ``YYYY-MM-DD HH:MM:SS``), ``total_posts`` and ``posts``.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    output = {}
    output['scrape_time'] = stamp
    output['total_posts'] = len(data)
    output['posts'] = data

    # ensure_ascii=False keeps non-ASCII text readable in the file.
    with open(filename, 'w', encoding='utf-8') as handle:
        json.dump(output, handle, ensure_ascii=False, indent=2)

    return output
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: scrape one board, save the result and print a
    # short preview of the first posts.
    import os
    import sys

    stock_code = '002624'
    stock_name = '完美世界'
    total_pages = 3
    min_comment_count = 0

    print(f'使用 Python: {sys.version}')
    print(f'脚本路径: {__file__}')
    print(f'工作目录: {os.getcwd()}')

    start_time = datetime.now()
    all_posts = scrape_guba(stock_code, stock_name, total_pages, min_comment_count)
    end_time = datetime.now()
    elapsed = (end_time - start_time).total_seconds()

    print('\n' + '=' * 60)

    if not all_posts:
        print('未获取到任何数据')
        print(f'耗时: {elapsed:.2f} 秒')
    else:
        output = save_to_json(all_posts, OUTPUT_FILE)

        print(f'爬取完成!')
        print(f' - 帖子数量: {output["total_posts"]}')
        print(f' - 数据已保存到: {OUTPUT_FILE}')
        print(f' - 耗时: {elapsed:.2f} 秒')

        print('\n前3个帖子预览:')
        for i, post in enumerate(all_posts[:3], 1):
            print(f'\n--- 帖子{i} ---')
            print(f'标题: {post["title"]}')
            print(f'作者: {post["author"]}')
            print(f'时间: {post["post_time"]}')
            print(f'URL: {post["url"]}')
            print(f'评论数: {post["comment_count"]}')
            print(f'实际获取评论数: {len(post["comments"])}')
            if post.get('comments'):
                first_reply = post["comments"][0]["reply_content"]
                print(f'第一条评论: {first_reply[:30]}...')
|
||||
Reference in New Issue
Block a user