更新了今天的数据
This commit is contained in:
155
jusuan.py
Normal file
155
jusuan.py
Normal file
@@ -0,0 +1,155 @@
|
||||
# Scrapes scenic-spot data from the "regional guide" page of Juliang Suanshu (Ocean Engine Trend Insight).
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from loguru import logger
|
||||
import time
|
||||
import json
|
||||
import os
|
||||
|
||||
# Logging setup: add a file sink for INFO-and-above records, with a 10 MB rotation setting.
logger.add(
    "jusuan_scraper.log",
    level="INFO",
    rotation="10 MB",
)
|
||||
|
||||
def _connect_to_chrome(chrome_options):
    """Create a Chrome WebDriver, trying several driver sources in turn.

    Attempts, in order:
      1. ``webdriver.Chrome`` with no explicit service (default driver lookup).
      2. A driver installed through ``webdriver_manager`` (imported lazily so
         the package stays optional).
      3. A fixed list of common Windows ``chromedriver.exe`` locations, plus
         the current working directory.

    Args:
        chrome_options: The ``Options`` instance passed to every attempt.

    Returns:
        The first driver that was created successfully, or ``None`` when
        every attempt failed. Each failure is logged as a warning.
    """
    # Attempt 1: default driver resolution.
    try:
        driver = webdriver.Chrome(options=chrome_options)
        logger.info("使用默认Chrome驱动连接成功")
        return driver
    except Exception as e:
        logger.warning(f"使用默认Chrome驱动失败: {str(e)}")

    # Attempt 2: let webdriver-manager download/locate a matching driver.
    try:
        from webdriver_manager.chrome import ChromeDriverManager

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        logger.info("使用webdriver-manager连接成功")
        return driver
    except Exception as e2:
        logger.warning(f"使用webdriver-manager失败: {str(e2)}")

    # Attempt 3: probe well-known chromedriver locations.
    common_paths = [
        r"C:\chromedriver\chromedriver.exe",
        r"C:\Program Files\Google\Chrome\Application\chromedriver.exe",
        r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        os.path.join(os.getcwd(), "chromedriver.exe"),
    ]
    for path in common_paths:
        if not os.path.exists(path):
            continue
        try:
            service = Service(path)
            driver = webdriver.Chrome(service=service, options=chrome_options)
            logger.info(f"使用路径 {path} 连接成功")
            return driver
        except Exception as e3:
            logger.warning(f"使用路径 {path} 失败: {str(e3)}")

    return None


def scrape_jusuan_data():
    """Scrape scenic-spot ranking rows from the Trend Insight area page.

    Attaches to a Chrome instance listening on ``localhost:9222``, opens the
    hard-coded target URL, waits up to 15 seconds for the table body to
    appear, extracts name / category / heat index from each candidate row,
    and writes the records to ``jusuan_scenic_spots_data.json`` (UTF-8).

    Returns:
        A list of dicts with the keys ``"序号"``, ``"景区名称"``,
        ``"景区分类"`` and ``"热度指数"`` on success (possibly empty), or
        ``None`` if any step outside the per-row extraction raised.
        Errors are logged rather than propagated.
    """
    # Bind before the try so the finally clause can always reference it,
    # even when option setup itself fails.
    driver = None
    try:
        # Attach to an already-running Chrome via its remote debugging port
        # (must match the port used on the Chrome command line).
        chrome_options = Options()
        chrome_options.add_experimental_option("debuggerAddress", "localhost:9222")

        logger.info("正在连接到Chrome浏览器...")
        driver = _connect_to_chrome(chrome_options)
        if driver is None:
            raise RuntimeError("所有连接Chrome浏览器的方法都失败了")

        # Open the target page.
        target_url = "https://trendinsight.oceanengine.com/area?dates=daily-20251112_weekly-20251109_monthly-202510&area=%5B%2211%22%5D&category_id=3&rankStyle=monthly"
        logger.info(f"正在访问网页: {target_url}")
        driver.get(target_url)

        # Wait for the table body; until() returns the located element.
        logger.info("等待页面加载完成...")
        table_body = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "byted-table-body"))
        )
        logger.info("找到表格主体,开始抓取数据...")

        # NOTE(review): this selects every descendant <div>, not only row
        # containers, so nested divs may yield errors or duplicate records
        # depending on the page's DOM - confirm against the live markup.
        rows = table_body.find_elements(By.TAG_NAME, "div")
        logger.info(f"找到 {len(rows)} 行数据")

        scraped_data = []
        for row_number, row in enumerate(rows, start=1):
            try:
                # The class names carry generated suffixes, hence the
                # substring attribute selectors.
                poi_name = row.find_element(
                    By.CSS_SELECTOR, '[class*="poiTitle-"]'
                ).text.strip()
                category = row.find_element(
                    By.CSS_SELECTOR, '[class*="categoryIconBox-"]'
                ).text.strip()
                heat_index = row.find_element(
                    By.CSS_SELECTOR, '[class*="numberValue-"]'
                ).text.strip()

                scraped_data.append(
                    {
                        "序号": row_number,
                        "景区名称": poi_name,
                        "景区分类": category,
                        "热度指数": heat_index,
                    }
                )
                logger.info(f"抓取第 {row_number} 条数据: {poi_name} | {category} | {heat_index}")
            except Exception as e:
                # A row missing any of the three fields is logged and skipped.
                logger.error(f"处理第 {row_number} 行时出错: {str(e)}")
                continue

        # Persist the records as human-readable JSON.
        output_file = "jusuan_scenic_spots_data.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(scraped_data, f, ensure_ascii=False, indent=2)

        logger.info(f"数据抓取完成,共 {len(scraped_data)} 条记录,已保存到 {output_file}")

        # Log the first five records as a preview.
        logger.info("前5条数据预览:")
        for preview_number, data in enumerate(scraped_data[:5], start=1):
            logger.info(f"{preview_number}. {data['景区名称']} | {data['景区分类']} | {data['热度指数']}")

        return scraped_data

    except Exception as e:
        logger.error(f"抓取过程中发生错误: {str(e)}")
        return None
    finally:
        # End the WebDriver session. The original author's note says this
        # does not close the externally started browser itself.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_err:
                # Best-effort cleanup: report but never mask the real result.
                logger.debug(f"关闭WebDriver会话时出错,已忽略: {quit_err}")
|
||||
|
||||
if __name__ == "__main__":
    # Announce the run and remind the operator how Chrome must be started
    # so that the remote debugging port is available.
    for startup_note in (
        "开始执行巨量算数景区数据抓取...",
        "请确保Chrome浏览器已通过以下命令启动:",
        '"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\\tmp"',
    ):
        logger.info(startup_note)

    result = scrape_jusuan_data()

    # A falsy result (None or an empty list) is reported as a failure,
    # followed by driver installation hints.
    if not result:
        logger.error("抓取任务失败")
        logger.info("请尝试安装webdriver-manager: pip install webdriver-manager")
        logger.info("或者手动下载ChromeDriver并放在系统PATH中")
    else:
        logger.info("抓取任务完成")
|
||||
Reference in New Issue
Block a user