更新了今天的数据

This commit is contained in:
2025-11-14 21:03:48 +08:00
parent 1507416806
commit d6ec1eadc9
9 changed files with 12610 additions and 3 deletions

5900
2025年11月14日185430.txt Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -1,2 +0,0 @@
2025-11-07 23:39:42.157 | INFO | __main__:<module>:42 - 开始GUI测试
2025-11-07 23:39:47.875 | INFO | __main__:close_app:30 - 测试完成,关闭应用程序

155
jusuan.py Normal file
View File

@@ -0,0 +1,155 @@
# 巨量算数,区域指南的,景区数据
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from loguru import logger
import time
import json
import os
# Configure logging: add a file sink at INFO level that rotates once it reaches 10 MB
logger.add("jusuan_scraper.log", rotation="10 MB", level="INFO")
def _connect_to_chrome(chrome_options):
    """Create a WebDriver for the given options, trying three strategies in order.

    1. Let Selenium locate a driver on its own.
    2. Download/locate a driver through ``webdriver-manager`` (optional dependency,
       imported lazily so its absence only disables this strategy).
    3. Probe a list of common Windows install paths for ``chromedriver.exe``.

    Args:
        chrome_options: Selenium ``Options`` instance to pass to every attempt.

    Returns:
        The first ``webdriver.Chrome`` instance that could be created.

    Raises:
        Exception: if none of the strategies produced a driver.
    """
    # Method 1: default Chrome driver resolution.
    try:
        driver = webdriver.Chrome(options=chrome_options)
        logger.info("使用默认Chrome驱动连接成功")
        return driver
    except Exception as e:
        logger.warning(f"使用默认Chrome驱动失败: {str(e)}")

    # Method 2: let webdriver-manager manage the driver binary.
    try:
        from webdriver_manager.chrome import ChromeDriverManager
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        logger.info("使用webdriver-manager连接成功")
        return driver
    except Exception as e2:
        logger.warning(f"使用webdriver-manager失败: {str(e2)}")

    # Method 3: try well-known ChromeDriver locations, first hit wins.
    common_paths = [
        r"C:\chromedriver\chromedriver.exe",
        r"C:\Program Files\Google\Chrome\Application\chromedriver.exe",
        r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        os.path.join(os.getcwd(), "chromedriver.exe"),
    ]
    for path in common_paths:
        if not os.path.exists(path):
            continue
        try:
            service = Service(path)
            driver = webdriver.Chrome(service=service, options=chrome_options)
            logger.info(f"使用路径 {path} 连接成功")
            return driver
        except Exception as e3:
            logger.warning(f"使用路径 {path} 失败: {str(e3)}")

    raise Exception("所有连接Chrome浏览器的方法都失败了")


def scrape_jusuan_data():
    """Scrape scenic-spot rows from the trendinsight.oceanengine.com area page.

    Attaches to a Chrome instance listening on ``localhost:9222``, loads the
    hard-coded target URL, waits up to 15 seconds for the table body to appear,
    extracts name / category / heat index from each candidate row, and writes
    the result to ``jusuan_scenic_spots_data.json`` in the working directory.

    Returns:
        list[dict] | None: the scraped entries (possibly empty) on success,
        or ``None`` if any step outside the per-row loop failed. Errors are
        logged, never raised to the caller.
    """
    # Bind before the try block so the `finally` clause can always inspect it,
    # even when option setup or the connection attempt itself raises.
    driver = None
    try:
        # Attach to an existing browser; the port must match the one Chrome
        # was started with on the command line.
        chrome_options = Options()
        chrome_options.add_experimental_option("debuggerAddress", "localhost:9222")

        logger.info("正在连接到Chrome浏览器...")
        driver = _connect_to_chrome(chrome_options)

        # Open the target page.
        target_url = "https://trendinsight.oceanengine.com/area?dates=daily-20251112_weekly-20251109_monthly-202510&area=%5B%2211%22%5D&category_id=3&rankStyle=monthly"
        logger.info(f"正在访问网页: {target_url}")
        driver.get(target_url)

        # Wait for the table body to be present in the DOM.
        logger.info("等待页面加载完成...")
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "byted-table-body"))
        )

        table_body = driver.find_element(By.CLASS_NAME, "byted-table-body")
        logger.info("找到表格主体,开始抓取数据...")

        # NOTE(review): this collects every descendant <div>, not only row
        # containers; presumably non-row divs fail the lookups below and are
        # skipped via the per-row except — confirm against the page's DOM.
        rows = table_body.find_elements(By.TAG_NAME, "div")
        logger.info(f"找到 {len(rows)} 行数据")

        scraped_data = []
        for i, row in enumerate(rows):
            try:
                # Scenic-spot name (class contains "poiTitle-").
                poi_title_element = row.find_element(By.CSS_SELECTOR, '[class*="poiTitle-"]')
                poi_name = poi_title_element.text.strip()

                # Category (class contains "categoryIconBox-").
                category_element = row.find_element(By.CSS_SELECTOR, '[class*="categoryIconBox-"]')
                category = category_element.text.strip()

                # Heat index value (class contains "numberValue-").
                heat_index_element = row.find_element(By.CSS_SELECTOR, '[class*="numberValue-"]')
                heat_index = heat_index_element.text.strip()

                # The sequence number is the element's position among all
                # candidate divs, so it has gaps wherever a div was skipped.
                data_entry = {
                    "序号": i + 1,
                    "景区名称": poi_name,
                    "景区分类": category,
                    "热度指数": heat_index,
                }
                scraped_data.append(data_entry)
                logger.info(f"抓取第 {i+1} 条数据: {poi_name} | {category} | {heat_index}")
            except Exception as e:
                # A failing element must not abort the whole scrape.
                logger.error(f"处理第 {i+1} 行时出错: {str(e)}")
                continue

        # Persist the result as UTF-8 JSON, keeping non-ASCII text readable.
        output_file = "jusuan_scenic_spots_data.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(scraped_data, f, ensure_ascii=False, indent=2)
        logger.info(f"数据抓取完成,共 {len(scraped_data)} 条记录,已保存到 {output_file}")

        # Log the first five entries as a preview.
        logger.info("前5条数据预览:")
        for i, data in enumerate(scraped_data[:5]):
            logger.info(f"{i+1}. {data['景区名称']} | {data['景区分类']} | {data['热度指数']}")

        return scraped_data
    except Exception as e:
        logger.error(f"抓取过程中发生错误: {str(e)}")
        return None
    finally:
        # End the WebDriver session. The original author's intent is that the
        # attached browser itself stays open — TODO confirm for this setup.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                # Best-effort cleanup: report instead of silently swallowing,
                # and do not catch KeyboardInterrupt/SystemExit.
                logger.warning(f"关闭WebDriver连接时出错: {quit_error}")
if __name__ == "__main__":
    # Script entry point: remind the operator how Chrome must be started
    # (remote debugging on port 9222), run the scrape, then report the outcome.
    logger.info("开始执行巨量算数景区数据抓取...")
    logger.info("请确保Chrome浏览器已通过以下命令启动:")
    logger.info('"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\\tmp"')

    result = scrape_jusuan_data()

    # A falsy result (None or an empty list) is reported as a failure,
    # followed by hints for fixing the ChromeDriver setup.
    if not result:
        logger.error("抓取任务失败")
        for hint in (
            "请尝试安装webdriver-manager: pip install webdriver-manager",
            "或者手动下载ChromeDriver并放在系统PATH中",
        ):
            logger.info(hint)
    else:
        logger.info("抓取任务完成")

View File

@@ -4,3 +4,4 @@ tqdm>=4.61.2
loguru>=0.5.3
zhipuai>=2.1.0
PySide6>=6.0.0
selenium>=4.15.0

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -248,6 +248,8 @@ class TopHubScraper:
return
# 调用tophub_add_data_to_db.py脚本
logger.info("正在调用tophub_add_data_to_db.py...")
# 使用Popen方式处理可能的编码问题