155 lines
6.6 KiB
Python
155 lines
6.6 KiB
Python
# Ocean Engine Trend Insight (巨量算数): scenic-spot data from the regional guide page
|
||
from selenium import webdriver
|
||
from selenium.webdriver.chrome.options import Options
|
||
from selenium.webdriver.chrome.service import Service
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.support.ui import WebDriverWait
|
||
from selenium.webdriver.support import expected_conditions as EC
|
||
from loguru import logger
|
||
import time
|
||
import json
|
||
import os
|
||
|
||
# Configure logging
|
||
logger.add("jusuan_scraper.log", rotation="10 MB", level="INFO")  # extra file sink: rotation="10 MB", minimum level INFO
|
||
|
||
# Default page to scrape; identical to the URL that used to be hard-coded
# inside scrape_jusuan_data().
_DEFAULT_TARGET_URL = (
    "https://trendinsight.oceanengine.com/area"
    "?dates=daily-20251112_weekly-20251109_monthly-202510"
    "&area=%5B%2211%22%5D&category_id=3&rankStyle=monthly"
)


def scrape_jusuan_data(
    target_url=_DEFAULT_TARGET_URL,
    debugger_address="localhost:9222",
    output_file="jusuan_scenic_spots_data.json",
    wait_seconds=15,
):
    """Scrape scenic-spot rows from the Trend Insight area page into a JSON file.

    Attaches a WebDriver to a Chrome instance through its remote-debugging
    address, loads ``target_url``, waits for the element with class
    ``byted-table-body`` to be present, reads name / category / heat index
    from the ``div`` elements underneath it, and dumps the collected records
    to ``output_file`` as UTF-8 JSON.

    Args:
        target_url: Page to open. Defaults to the originally hard-coded URL.
        debugger_address: ``host:port`` passed as Chrome's ``debuggerAddress``
            experimental option; must match the port Chrome was started with.
        output_file: Path of the JSON file to write.
        wait_seconds: Timeout handed to ``WebDriverWait`` while waiting for
            the table body.

    Returns:
        A list of dicts with the keys ``序号``, ``景区名称``, ``景区分类`` and
        ``热度指数``, or ``None`` if any step other than per-row extraction
        raised (the error is logged, not re-raised).
    """
    # Bound before the try so the finally clause can always inspect it, even
    # when building the Chrome options itself fails.
    driver = None
    try:
        # Attach to an existing Chrome rather than launching a new one.
        chrome_options = Options()
        chrome_options.add_experimental_option("debuggerAddress", debugger_address)

        logger.info("正在连接到Chrome浏览器...")
        driver = _connect_to_chrome(chrome_options)

        logger.info(f"正在访问网页: {target_url}")
        driver.get(target_url)

        logger.info("等待页面加载完成...")
        WebDriverWait(driver, wait_seconds).until(
            EC.presence_of_element_located((By.CLASS_NAME, "byted-table-body"))
        )

        table_body = driver.find_element(By.CLASS_NAME, "byted-table-body")
        logger.info("找到表格主体,开始抓取数据...")

        # NOTE(review): this matches every descendant <div>, not only the row
        # containers, so wrapper/cell divs are visited too. Elements missing
        # one of the three fields are logged and skipped below; confirm
        # against the live DOM whether a narrower row selector is needed.
        rows = table_body.find_elements(By.TAG_NAME, "div")
        logger.info(f"找到 {len(rows)} 行数据")

        scraped_data = []
        for i, row in enumerate(rows):
            try:
                # CSS-module class names carry a generated suffix, hence the
                # substring match on the stable prefix.
                poi_name = _text_by_class_fragment(row, "poiTitle-")
                category = _text_by_class_fragment(row, "categoryIconBox-")
                heat_index = _text_by_class_fragment(row, "numberValue-")

                scraped_data.append(
                    {
                        # Position among all visited divs, not among the
                        # successfully scraped records.
                        "序号": i + 1,
                        "景区名称": poi_name,
                        "景区分类": category,
                        "热度指数": heat_index,
                    }
                )
                logger.info(f"抓取第 {i+1} 条数据: {poi_name} | {category} | {heat_index}")
            except Exception as e:
                # Best effort: one bad element must not abort the whole run.
                logger.error(f"处理第 {i+1} 行时出错: {str(e)}")
                continue

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(scraped_data, f, ensure_ascii=False, indent=2)

        logger.info(f"数据抓取完成,共 {len(scraped_data)} 条记录,已保存到 {output_file}")

        # Log the first five records as a quick preview.
        logger.info("前5条数据预览:")
        for i, data in enumerate(scraped_data[:5]):
            logger.info(f"{i+1}. {data['景区名称']} | {data['景区分类']} | {data['热度指数']}")

        return scraped_data

    except Exception as e:
        logger.error(f"抓取过程中发生错误: {str(e)}")
        return None
    finally:
        # NOTE(review): the original comment states this only drops the
        # WebDriver connection and leaves the attached browser running;
        # confirm for the chromedriver version in use.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                # Cleanup stays best effort, but is no longer silent.
                logger.warning(f"关闭WebDriver连接时出错: {quit_error}")


def _connect_to_chrome(chrome_options):
    """Return a Chrome WebDriver, trying three driver-resolution strategies.

    Order: the default driver lookup, then ``webdriver-manager`` (imported
    lazily because it is optional), then a list of common Windows
    ``chromedriver.exe`` locations.

    Raises:
        RuntimeError: If none of the strategies yields a driver.
    """
    # Strategy 1: let Selenium locate the driver on its own.
    try:
        driver = webdriver.Chrome(options=chrome_options)
        logger.info("使用默认Chrome驱动连接成功")
        return driver
    except Exception as e:
        logger.warning(f"使用默认Chrome驱动失败: {str(e)}")

    # Strategy 2: have webdriver-manager provide the driver binary.
    try:
        from webdriver_manager.chrome import ChromeDriverManager

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        logger.info("使用webdriver-manager连接成功")
        return driver
    except Exception as e2:
        logger.warning(f"使用webdriver-manager失败: {str(e2)}")

    # Strategy 3: probe well-known chromedriver locations.
    common_paths = [
        r"C:\chromedriver\chromedriver.exe",
        r"C:\Program Files\Google\Chrome\Application\chromedriver.exe",
        r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        os.path.join(os.getcwd(), "chromedriver.exe"),
    ]
    for path in common_paths:
        if not os.path.exists(path):
            continue
        try:
            service = Service(path)
            driver = webdriver.Chrome(service=service, options=chrome_options)
            logger.info(f"使用路径 {path} 连接成功")
            return driver
        except Exception as e3:
            logger.warning(f"使用路径 {path} 失败: {str(e3)}")

    raise RuntimeError("所有连接Chrome浏览器的方法都失败了")


def _text_by_class_fragment(row, fragment):
    """Return the stripped text of the first element under ``row`` whose class contains ``fragment``."""
    element = row.find_element(By.CSS_SELECTOR, f'[class*="{fragment}"]')
    return element.text.strip()
|
||
|
||
if __name__ == "__main__":
    # Startup banner: announce the run and remind the operator how Chrome
    # must be launched so the remote-debugging port is available.
    chrome_launch_hint = '"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\\tmp"'
    for banner_line in (
        "开始执行巨量算数景区数据抓取...",
        "请确保Chrome浏览器已通过以下命令启动:",
        chrome_launch_hint,
    ):
        logger.info(banner_line)

    result = scrape_jusuan_data()

    # A None or empty result is reported as failure, followed by setup hints.
    if not result:
        logger.error("抓取任务失败")
        logger.info("请尝试安装webdriver-manager: pip install webdriver-manager")
        logger.info("或者手动下载ChromeDriver并放在系统PATH中")
    else:
        logger.info("抓取任务完成")