# File: tophux_scrape/jusuan.py
# Last modified: 2025-11-14 21:03:48 +08:00
# Size: 155 lines, 6.6 KiB (Python)
#
# NOTE: The repository viewer flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters.
# If that is intentional the warning can be ignored; use the viewer's Escape
# button to reveal them.
# Scrapes scenic-spot data from the regional guide of 巨量算数 (trendinsight.oceanengine.com).
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from loguru import logger
import time
import json
import os
# Configure logging: add a file sink "jusuan_scraper.log" that records
# INFO-and-above messages and rotates once the file reaches 10 MB.
logger.add("jusuan_scraper.log", rotation="10 MB", level="INFO")
def _connect_to_chrome(chrome_options):
    """Attach a WebDriver to Chrome, trying several driver sources in order.

    Strategies, each tried only if the previous one raised:
      1. ``webdriver.Chrome`` with no explicit service.
      2. A driver installed by ``webdriver_manager`` (imported lazily so the
         package stays optional).
      3. A list of common ``chromedriver.exe`` locations on Windows plus the
         current working directory.

    Args:
        chrome_options: The ``Options`` instance passed to every attempt.

    Returns:
        The connected driver, or ``None`` when every strategy failed.
    """
    # Strategy 1: default driver resolution.
    try:
        driver = webdriver.Chrome(options=chrome_options)
        logger.info("使用默认Chrome驱动连接成功")
        return driver
    except Exception as e:
        logger.warning(f"使用默认Chrome驱动失败: {str(e)}")

    # Strategy 2: let webdriver-manager download/locate the driver.
    try:
        from webdriver_manager.chrome import ChromeDriverManager

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        logger.info("使用webdriver-manager连接成功")
        return driver
    except Exception as e:
        logger.warning(f"使用webdriver-manager失败: {str(e)}")

    # Strategy 3: probe well-known chromedriver locations.
    common_paths = [
        r"C:\chromedriver\chromedriver.exe",
        r"C:\Program Files\Google\Chrome\Application\chromedriver.exe",
        r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        os.path.join(os.getcwd(), "chromedriver.exe"),
    ]
    for path in common_paths:
        if not os.path.exists(path):
            continue
        try:
            driver = webdriver.Chrome(service=Service(path), options=chrome_options)
            logger.info(f"使用路径 {path} 连接成功")
            return driver
        except Exception as e:
            logger.warning(f"使用路径 {path} 失败: {str(e)}")

    return None


def _read_row_fields(row):
    """Return ``(name, category, heat index)`` text from one candidate row.

    Each field is located by a CSS "class contains" selector. Whatever
    exception ``find_element`` raises for a missing field propagates to the
    caller.
    """
    def text_of(class_fragment):
        selector = f'[class*="{class_fragment}"]'
        return row.find_element(By.CSS_SELECTOR, selector).text.strip()

    return text_of("poiTitle-"), text_of("categoryIconBox-"), text_of("numberValue-")


def scrape_jusuan_data():
    """Scrape scenic-spot rows from the 巨量算数 regional-guide table.

    Attaches to a Chrome instance listening on ``localhost:9222``, opens the
    hard-coded target URL, waits up to 15 seconds for the element with class
    ``byted-table-body``, and extracts name / category / heat index from
    every descendant ``<div>`` that contains all three fields. The records
    are written to ``jusuan_scenic_spots_data.json`` (UTF-8) and the first
    five are logged as a preview.

    Returns:
        A list of dicts with keys "序号", "景区名称", "景区分类" and
        "热度指数", or ``None`` if any step outside the per-row extraction
        raised (the error is logged, not re-raised).
    """
    # Bound before the try so the finally clause can always inspect it, even
    # when option setup itself raises.
    driver = None
    try:
        # Attach to an already-running Chrome via its remote debugging port
        # instead of launching a new browser.
        chrome_options = Options()
        chrome_options.add_experimental_option("debuggerAddress", "localhost:9222")

        logger.info("正在连接到Chrome浏览器...")
        driver = _connect_to_chrome(chrome_options)
        if driver is None:
            raise Exception("所有连接Chrome浏览器的方法都失败了")

        target_url = "https://trendinsight.oceanengine.com/area?dates=daily-20251112_weekly-20251109_monthly-202510&area=%5B%2211%22%5D&category_id=3&rankStyle=monthly"
        logger.info(f"正在访问网页: {target_url}")
        driver.get(target_url)

        logger.info("等待页面加载完成...")
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "byted-table-body"))
        )

        table_body = driver.find_element(By.CLASS_NAME, "byted-table-body")
        logger.info("找到表格主体,开始抓取数据...")

        # This matches every descendant <div>, not only row containers, so
        # many candidates will lack the expected fields and are skipped below.
        rows = table_body.find_elements(By.TAG_NAME, "div")
        logger.info(f"找到 {len(rows)} 行数据")

        scraped_data = []
        for i, row in enumerate(rows):
            try:
                poi_name, category, heat_index = _read_row_fields(row)
            except Exception as e:
                logger.error(f"处理第 {i+1} 行时出错: {str(e)}")
                continue

            # Number records by how many were actually scraped; the <div>
            # index would leave gaps for every skipped candidate.
            sequence = len(scraped_data) + 1
            scraped_data.append({
                "序号": sequence,
                "景区名称": poi_name,
                "景区分类": category,
                "热度指数": heat_index,
            })
            logger.info(f"抓取第 {sequence} 条数据: {poi_name} | {category} | {heat_index}")

        output_file = "jusuan_scenic_spots_data.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(scraped_data, f, ensure_ascii=False, indent=2)
        logger.info(f"数据抓取完成,共 {len(scraped_data)} 条记录,已保存到 {output_file}")

        logger.info("前5条数据预览:")
        for position, data in enumerate(scraped_data[:5], start=1):
            logger.info(f"{position}. {data['景区名称']} | {data['景区分类']} | {data['热度指数']}")

        return scraped_data
    except Exception as e:
        logger.error(f"抓取过程中发生错误: {str(e)}")
        return None
    finally:
        # NOTE(review): the original comment says this closes the WebDriver
        # connection without closing the attached browser itself - confirm.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # Cleanup is best-effort, but record the failure rather than
                # hiding it; KeyboardInterrupt/SystemExit still propagate.
                logger.debug(f"关闭WebDriver连接时出错: {str(e)}")
if __name__ == "__main__":
    # Announce the run and remind the operator how Chrome must be started so
    # that the scraper can attach to its remote debugging port.
    for _startup_message in (
        "开始执行巨量算数景区数据抓取...",
        "请确保Chrome浏览器已通过以下命令启动:",
        '"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\\tmp"',
    ):
        logger.info(_startup_message)

    result = scrape_jusuan_data()

    # A falsy result (None or an empty list) is reported as a failure,
    # followed by hints on how to obtain a working ChromeDriver.
    if not result:
        logger.error("抓取任务失败")
        logger.info("请尝试安装webdriver-manager: pip install webdriver-manager")
        logger.info("或者手动下载ChromeDriver并放在系统PATH中")
    else:
        logger.info("抓取任务完成")