252 lines
9.5 KiB
Python
252 lines
9.5 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
Product Hunt scraping script.

Uses Selenium to start a new Chrome instance and scrape product
information from a Product Hunt product page.
"""
|
|||
|
|
|
|||
|
|
from selenium import webdriver
|
|||
|
|
from selenium.webdriver.common.by import By
|
|||
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|||
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|||
|
|
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
|||
|
|
from webdriver_manager.chrome import ChromeDriverManager
|
|||
|
|
from selenium.webdriver.chrome.service import Service
|
|||
|
|
import time
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
from datetime import datetime
|
|||
|
|
from loguru import logger
|
|||
|
|
|
|||
|
|
# Logging setup: send records of level INFO and above to a log file,
# rotating the file once it reaches 10 MB.
logger.add(
    "producthunt_scraper.log",
    level="INFO",
    rotation="10 MB",
)
|
|||
|
|
|
|||
|
|
class ProductHuntScraper:
    """Scrape product details from a Product Hunt product page via Selenium.

    The scraper either starts a new Chrome instance or attaches to an
    existing one through a Chrome debugger address, loads a product page and
    reads the product name, description and first comment from the DOM.

    Instances can be used as context managers; leaving the ``with`` block
    calls :meth:`close`. The browser connection itself is still opened
    lazily by :meth:`scrape_product` (or explicitly by
    :meth:`connect_to_chrome`).
    """

    # Placeholder values stored in the result dict when a field is unavailable.
    _EXTRACTION_FAILED = "提取失败"
    _UNKNOWN_URL = "未知"

    def __init__(self, use_debug_address=False, debug_address="127.0.0.1:5003",
                 wait_timeout=10, settle_seconds=5):
        """Store the configuration; no browser is started here.

        Args:
            use_debug_address (bool): Attach to an existing Chrome instance
                through ``debug_address`` instead of starting a new one.
            debug_address (str): Chrome debugger address, ``host:port``.
            wait_timeout (float): Seconds the explicit ``WebDriverWait``
                waits for the page ``<body>`` before timing out.
            settle_seconds (float): Extra fixed pause after the ``<body>`` is
                present, giving dynamically rendered content time to load.
        """
        self.use_debug_address = use_debug_address
        self.debug_address = debug_address
        self.wait_timeout = wait_timeout
        self.settle_seconds = settle_seconds
        self.driver = None
        self.wait = None

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser connection; exceptions are not suppressed."""
        self.close()

    def connect_to_chrome(self):
        """Connect to a Chrome instance (existing or newly started).

        Returns:
            bool: True when the driver was created successfully, False
            otherwise. On failure no driver is left assigned to the instance.
        """
        try:
            options = webdriver.ChromeOptions()

            if self.use_debug_address:
                # Attach to an already-running Chrome instance.
                logger.info(f"尝试连接到Chrome调试实例: {self.debug_address}")
                options.add_experimental_option("debuggerAddress", self.debug_address)
            else:
                # Start a fresh Chrome instance with a few common flags.
                logger.info("启动新的Chrome实例")
                options.add_argument("--no-sandbox")
                options.add_argument("--disable-dev-shm-usage")
                options.add_argument("--disable-gpu")
                options.add_argument("--window-size=1920,1080")
                # Present a regular desktop browser user agent.
                options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

            # Let webdriver-manager download/locate the Chrome driver binary.
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=options)
            self.wait = WebDriverWait(self.driver, self.wait_timeout)

            logger.info(f"成功连接到Chrome实例,当前页面标题: {self.driver.title}")
            return True
        except Exception as e:
            logger.error(f"连接Chrome实例失败: {e}")
            # A step after driver creation may have failed. Without cleanup
            # the instance would keep a driver while reporting failure, and
            # scrape_product() would then skip reconnecting.
            try:
                self.close()
            except Exception as cleanup_error:
                logger.warning(f"清理Chrome实例失败: {cleanup_error}")
            return False

    def navigate_to_product(self, product_url):
        """Load the given Product Hunt product page in the connected browser.

        Args:
            product_url (str): URL of the product page.

        Returns:
            bool: True when the page loaded, False on timeout or any other
            error (including when no browser is connected).
        """
        try:
            logger.info(f"正在导航到产品页面: {product_url}")
            self.driver.get(product_url)

            # Wait for the <body> element, then pause so that dynamically
            # rendered content has a chance to appear.
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(self.settle_seconds)

            logger.info(f"成功导航到产品页面,当前标题: {self.driver.title}")
            return True
        except TimeoutException:
            logger.error("页面加载超时")
            return False
        except Exception as e:
            logger.error(f"导航到产品页面失败: {e}")
            return False

    def _extract_name(self):
        """Return the stripped text of the first ``<h1>``, or a placeholder."""
        try:
            product_name = self.driver.find_element(By.TAG_NAME, "h1").text.strip()
            logger.info(f"找到产品名称: {product_name}")
            return product_name or "未找到产品名称"
        except NoSuchElementException:
            logger.error("未找到h1标签")
            return "未找到产品名称"
        except Exception as e:
            logger.error(f"提取产品名称失败: {e}")
            return self._EXTRACTION_FAILED

    def _extract_description(self):
        """Return the stripped text of the description div, or a placeholder."""
        try:
            description_element = self.driver.find_element(
                By.CSS_SELECTOR, "div.relative.text-16.font-normal.text-gray-700"
            )
            product_description = description_element.text.strip()
            logger.info(f"找到产品简介: {product_description[:50]}...")
            return product_description or "未找到产品简介"
        except NoSuchElementException:
            logger.error("未找到产品简介div")
            return "未找到产品简介"
        except Exception as e:
            logger.error(f"提取产品简介失败: {e}")
            return self._EXTRACTION_FAILED

    def _extract_first_comment(self):
        """Return the full text of the first comment div, or a placeholder."""
        try:
            comment_elements = self.driver.find_elements(
                By.CSS_SELECTOR, "div.flex.flex-1.flex-col.gap-2"
            )
            if not comment_elements:
                logger.warning("未找到任何评论")
                return "未找到评论"

            first_comment = comment_elements[0].text.strip()
            logger.info(f"找到第一个评论: {first_comment[:50]}...")
            # An empty comment gets the placeholder, matching how empty
            # names and descriptions are handled.
            return first_comment or "未找到评论"
        except Exception as e:
            logger.error(f"提取第一个评论失败: {e}")
            return self._EXTRACTION_FAILED

    def _safe_current_url(self):
        """Return the driver's current URL, or a placeholder if unavailable."""
        if not self.driver:
            return self._UNKNOWN_URL
        try:
            return self.driver.current_url
        except Exception:
            # Used only while building an error result; a dead browser
            # session must not turn that into a second exception.
            return self._UNKNOWN_URL

    def extract_product_info(self):
        """Extract product information from the currently loaded page.

        Returns:
            dict: Keys ``name``, ``description``, ``first_comment``, ``url``
            and ``scraped_at`` (ISO-8601 local timestamp). Fields that cannot
            be read hold placeholder strings. If assembling the result fails
            altogether, every field holds the failure placeholder and an
            additional ``error`` key carries the exception text.
        """
        try:
            return {
                'name': self._extract_name(),
                'description': self._extract_description(),
                'first_comment': self._extract_first_comment(),
                'url': self.driver.current_url,
                'scraped_at': datetime.now().isoformat(),
            }
        except Exception as e:
            logger.error(f"提取产品信息时出错: {e}")
            return {
                'name': self._EXTRACTION_FAILED,
                'description': self._EXTRACTION_FAILED,
                'first_comment': self._EXTRACTION_FAILED,
                # Reading current_url is the call most likely to have raised,
                # so it must not be repeated unguarded here.
                'url': self._safe_current_url(),
                'scraped_at': datetime.now().isoformat(),
                'error': str(e),
            }

    def scrape_product(self, product_url):
        """Scrape the product page at ``product_url``.

        Connects to Chrome first if no driver is assigned yet.

        Args:
            product_url (str): URL of the product page.

        Returns:
            dict | None: The product information dict, or None when the
            browser connection or the page navigation failed.
        """
        if not self.driver and not self.connect_to_chrome():
            logger.error("无法连接到Chrome实例")
            return None

        if not self.navigate_to_product(product_url):
            logger.error("无法导航到产品页面")
            return None

        return self.extract_product_info()

    def save_to_file(self, product_info, filename=None):
        """Write the product information to a JSON file.

        Args:
            product_info (dict): Product information to serialise.
            filename (str, optional): Target file name. When omitted, a name
                of the form ``product_YYYYmmdd_HHMMSS.json`` is generated.

        Returns:
            str: The name of the file that was written.

        Raises:
            Exception: Any error raised while opening or writing the file is
                logged and re-raised unchanged.
        """
        if not filename:
            now = datetime.now()
            filename = f"product_{now.strftime('%Y%m%d_%H%M%S')}.json"

        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(product_info, f, ensure_ascii=False, indent=2)

            logger.info(f"产品信息已保存到 {filename}")
            return filename
        except Exception as e:
            logger.error(f"保存文件失败: {e}")
            raise

    def close(self):
        """Quit the driver, if one is connected, and reset connection state.

        Safe to call repeatedly. State is cleared before quitting so that a
        failing ``quit()`` cannot leave a dead driver assigned; any error
        raised by ``quit()`` still propagates to the caller.
        """
        driver, self.driver, self.wait = self.driver, None, None
        if driver:
            driver.quit()
            logger.info("已关闭Chrome连接")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Scrape one example Product Hunt page, log the result and save it as JSON."""
    # Example usage. To attach to an existing Chrome debugging instance,
    # construct the scraper with use_debug_address=True instead.
    scraper = ProductHuntScraper(use_debug_address=False)

    try:
        # Product page to scrape.
        target_url = "https://www.producthunt.com/products/elsie-ai-beta"
        scraped = scraper.scrape_product(target_url)

        if not scraped:
            logger.error("未能获取产品信息")
            return

        # Log the scraped data, then persist it to a JSON file.
        logger.info("抓取到的产品信息:")
        logger.info(json.dumps(scraped, ensure_ascii=False, indent=2))

        saved_path = scraper.save_to_file(scraped)
        logger.info(f"产品信息已保存到: {saved_path}")
    except Exception as e:
        logger.error(f"程序执行出错: {e}")
    finally:
        # Always release the Chrome instance, even after an early return.
        scraper.close()


if __name__ == "__main__":
    main()
|