ini
This commit is contained in:
@@ -0,0 +1,187 @@
|
||||
import requests
|
||||
import pandas as pd
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime
|
||||
import os
|
||||
|
||||
def fetch_guba_data(code='gssz', page=1, page_size=20, sort_type=1, timeout=10):
    """Fetch one page of the article list from the mguba.eastmoney.com interface.

    Sends a form-encoded POST to ``GetData.aspx`` asking for the
    ``/webarticlelist/api/Article/WebArticleList`` path, using browser-like
    headers and a fixed set of cookies.

    Args:
        code: Board code; sent as ``code=`` in the request and embedded in
            the ``Referer`` header.
        page: Page number; sent as ``p=``.
        page_size: Number of entries requested per page; sent as ``ps=``.
        sort_type: Forwarded unchanged as ``sorttype=``. Its meaning is
            defined by the server and is not visible in this file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON response, or ``None`` when the request fails, the
        server answers with an error status, or the body is not valid JSON.
        Failures are printed, not raised.
    """
    url = 'https://mguba.eastmoney.com/mguba2020/interface/GetData.aspx'

    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'DNT': '1',
        'Origin': 'https://mguba.eastmoney.com',
        'Pragma': 'no-cache',
        'Referer': f'https://mguba.eastmoney.com/mguba/list/{code}_{page}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Mobile Safari/537.36 Edg/148.0.0.0',
        'sec-ch-ua': '"Chromium";v="148", "Microsoft Edge";v="148", "Not/A)Brand";v="99"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"'
    }

    # NOTE(review): these look like values captured from one browser session;
    # they are hard-coded here and may stop being accepted -- confirm.
    cookies = {
        'qgqp_b_id': '30059d8839ad5c045fa8856e38013e9c',
        'st_nvi': 'XwpSfYXGjCxfCdbgapK5_cac4',
        'nid18': '0daec1df8064f04edd20b4e69250a8f5',
        'nid18_create_time': '1776263017375',
        'gviem': 'UrMH_tSu1UpW8B_TKmytl803f',
        'gviem_create_time': '1776263017375',
        'fullscreengg': '1',
        'fullscreengg2': '1',
        'st_si': '17952715731426',
        'show_app_box_time': '1779903756410',
        'st_pvi': '26838250597806',
        'st_sp': '2026-04-15 22:23:37',
        'st_inirUrl': 'https://cn.bing.com/',
        'st_sn': '30',
        'st_psi': '20260528025236177-117016304298-3040545697',
        'ad_tc_load_num': '3',
        'st_asi': '20260528025236177-117016304298-3040545697-ad.djxd-1'
    }

    # The query for the proxied API travels as a single form field.
    param = f'code={code}&p={page}&ps={page_size}&sorttype={sort_type}'
    data = {
        'param': param,
        'plat': 'wap',
        'version': '200',
        'path': '/webarticlelist/api/Article/WebArticleList',
        'env': '1',
        'origin': '',
        'ctoken': '',
        'utoken': ''
    }

    try:
        response = requests.post(
            url,
            headers=headers,
            cookies=cookies,
            data=data,
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error does not derive from RequestException.
        print(f'请求失败: {e}')
        return None
|
||||
|
||||
def fetch_stock_posts(code, name, pages=10, page_size=20):
    """Crawl pages 1..pages of posts for one stock and bundle the result.

    Each page is fetched with ``fetch_guba_data``; the list found under the
    response's ``'re'`` key is appended to the collected posts. A one-second
    pause separates consecutive page requests.

    Args:
        code: Stock code passed to ``fetch_guba_data``.
        name: Stock name; used for progress output and stored in the result.
        pages: Number of pages to request, starting at page 1.
        page_size: Entries requested per page.

    Returns:
        A dict with ``stock_code``, ``stock_name``, ``total_pages`` (the
        number of pages requested, not the number that succeeded),
        ``total_posts``, ``crawl_time`` (local time, ISO 8601) and ``posts``.
    """
    all_posts = []

    for page in range(1, pages + 1):
        print(f'正在爬取 {name} ({code}) - 第 {page}/{pages} 页')
        result = fetch_guba_data(code=code, page=page, page_size=page_size)

        # Only a list payload can be merged. A missing response, a non-dict
        # payload, or 're' being null/non-list is treated as "no data" so a
        # single bad page cannot crash the crawl and discard earlier pages.
        posts = result.get('re') if isinstance(result, dict) else None
        if isinstance(posts, list):
            all_posts.extend(posts)
            print(f' 成功获取 {len(posts)} 条帖子')
        else:
            print(f' 第 {page} 页获取失败或无数据')

        # Throttle between requests; no need to wait after the last page.
        if page < pages:
            time.sleep(1)

    # Assemble the summary record.
    data = {
        'stock_code': code,
        'stock_name': name,
        'total_pages': pages,
        'total_posts': len(all_posts),
        'crawl_time': datetime.now().isoformat(),
        'posts': all_posts
    }

    return data
|
||||
|
||||
def save_to_json(data, name="", filename=None):
    """Write *data* to a UTF-8 JSON file and return the path written.

    Args:
        data: JSON-serializable object. Falsy values (``None``, ``{}``,
            ``[]``) are rejected with a message and nothing is written.
        name: Label embedded in the generated file name when *filename* is
            not given.
        filename: Destination path. When omitted, a name of the form
            ``guba_<name>_<YYYYmmdd_HHMMSS>.json`` in the current directory
            is used.

    Returns:
        The path that was written, or ``None`` when *data* was empty.
    """
    if not data:
        print('数据为空,无法保存')
        return None

    if not filename:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'guba_{name}_{timestamp}.json'

    # Create the target directory on demand so a missing folder does not
    # throw away data that has already been collected.
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f'JSON数据已保存到: {filename}')
    return filename
|
||||
|
||||
def save_to_excel(data, name="", filename=None):
    """Flatten the crawled posts into a table and write it as an .xlsx file.

    Args:
        data: Mapping that contains a ``'posts'`` entry holding post dicts,
            as produced by ``fetch_stock_posts``.
        name: Label embedded in the generated file name when *filename* is
            not given.
        filename: Destination path. When omitted, a name of the form
            ``guba_<name>_<YYYYmmdd_HHMMSS>.xlsx`` in the current directory
            is used.

    Returns:
        The path that was written, or ``None`` when *data* is empty or has
        no ``'posts'`` key.
    """
    if not data or 'posts' not in data:
        print('数据格式不正确,无法保存')
        return None

    records = []
    # A null 'posts' value is treated as an empty list.
    for post in data['posts'] or []:
        # The nested objects may be absent or present with a null value;
        # `or {}` covers both, whereas a .get() default only covers absence.
        author = post.get('post_user') or {}
        board = post.get('post_guba') or {}
        records.append({
            '帖子ID': post.get('post_id'),
            '标题': post.get('post_title'),
            '内容': post.get('post_content'),
            '作者': author.get('user_nickname'),
            '发布时间': post.get('post_publish_time'),
            '最后更新': post.get('post_last_time'),
            '阅读数': post.get('post_click_count'),
            '评论数': post.get('post_comment_count'),
            '点赞数': post.get('post_like_count'),
            '股吧': board.get('stockbar_name'),
            '来源': post.get('post_from')
        })

    df = pd.DataFrame(records)

    if not filename:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'guba_{name}_{timestamp}.xlsx'

    # Create the target directory on demand, mirroring save_to_json.
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    df.to_excel(filename, index=False, engine='openpyxl')
    print(f'Excel数据已保存到: {filename}')
    return filename
|
||||
|
||||
if __name__ == '__main__':
    # Stocks to crawl: stock code -> company name.
    GAME_STOCKS = {
        '002624': '完美世界',
        '002555': '三七互娱',
        '002558': '巨人网络',
        '002602': '世纪华通',
        '300418': '昆仑万维',
        '002174': '游族网络',
        '300315': '掌趣科技',
        '603444': '吉比特',
    }

    # Make sure the output directory exists before anything is written.
    output_dir = 'data'
    os.makedirs(output_dir, exist_ok=True)

    divider = '=' * 50

    for code, name in GAME_STOCKS.items():
        print(f'\n{divider}')
        print(f'开始爬取 {name} ({code})')
        print(divider)

        # Ten pages per stock.
        crawl = fetch_stock_posts(code, name, pages=10)
        has_posts = bool(crawl) and crawl['total_posts'] > 0

        if not has_posts:
            print(f'{name} 爬取失败或无数据')
        else:
            print(f'\n共获取 {crawl["total_posts"]} 条帖子')

            # Both exports share one stem and differ only in extension.
            stem = os.path.join(output_dir, f'guba_{name}_{code}')
            save_to_json(crawl, name, stem + '.json')
            save_to_excel(crawl, name, stem + '.xlsx')

        # Pause before moving on to the next stock.
        time.sleep(2)
|
||||
Reference in New Issue
Block a user