#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Product Hunt scraping script.

Uses Selenium to start a new Chrome instance (or attach to an existing one
through its remote-debugging address) and extracts product information from
a Product Hunt product page.
"""

import json
import re
import time
from datetime import datetime

from loguru import logger
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Configure logging: also write log records to a file rotated at 10 MB.
logger.add("producthunt_scraper.log", rotation="10 MB", level="INFO")


class ProductHuntScraper:
    """Scraper for Product Hunt product pages.

    The scraper can also be used as a context manager, in which case the
    browser connection is closed automatically on exit::

        with ProductHuntScraper() as scraper:
            info = scraper.scrape_product(url)
    """

    def __init__(
        self,
        use_debug_address=False,
        debug_address="127.0.0.1:5003",
        *,
        wait_timeout=10,
        settle_seconds=5,
    ):
        """Initialize the scraper without opening a browser yet.

        Args:
            use_debug_address (bool): Whether to attach to an existing Chrome
                instance through its debugger address instead of launching a
                new one.
            debug_address (str): Chrome debugger address, "127.0.0.1:5003" by
                default.
            wait_timeout (float): Timeout, in seconds, given to the
                ``WebDriverWait`` used for explicit waits.
            settle_seconds (float): Fixed extra delay, in seconds, applied
                after a page's ``<body>`` is present so that dynamic content
                has time to load.
        """
        self.use_debug_address = use_debug_address
        self.debug_address = debug_address
        self.wait_timeout = wait_timeout
        self.settle_seconds = settle_seconds
        self.driver = None
        self.wait = None

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser connection; exceptions are not suppressed."""
        self.close()
        return False

    def connect_to_chrome(self):
        """Connect to a Chrome instance (existing or newly launched).

        Returns:
            bool: True if the connection succeeded, False otherwise.
        """
        try:
            options = webdriver.ChromeOptions()

            if self.use_debug_address:
                # Attach to an already running Chrome instance.
                logger.info(f"尝试连接到Chrome调试实例: {self.debug_address}")
                options.add_experimental_option("debuggerAddress", self.debug_address)
            else:
                # Launch a new Chrome instance.
                logger.info("启动新的Chrome实例")
                # A few generally useful launch options.
                options.add_argument("--no-sandbox")
                options.add_argument("--disable-dev-shm-usage")
                options.add_argument("--disable-gpu")
                options.add_argument("--window-size=1920,1080")
                # Set a user agent to look like a regular browser.
                options.add_argument(
                    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/91.0.4472.124 Safari/537.36"
                )

            # Let webdriver-manager download/locate the Chrome driver binary.
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=options)
            self.wait = WebDriverWait(self.driver, self.wait_timeout)

            logger.info(f"成功连接到Chrome实例,当前页面标题: {self.driver.title}")
            return True

        except Exception as e:
            logger.error(f"连接Chrome实例失败: {e}")
            return False

    def navigate_to_product(self, product_url):
        """Navigate to the given Product Hunt product page.

        Args:
            product_url (str): URL of the product page.

        Returns:
            bool: True if navigation succeeded, False otherwise.
        """
        try:
            logger.info(f"正在导航到产品页面: {product_url}")
            self.driver.get(product_url)

            # Wait until the page body is present.
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            # Extra fixed delay so dynamically rendered content can load.
            time.sleep(self.settle_seconds)

            logger.info(f"成功导航到产品页面,当前标题: {self.driver.title}")
            return True

        except TimeoutException:
            logger.error("页面加载超时")
            return False
        except Exception as e:
            logger.error(f"导航到产品页面失败: {e}")
            return False

    def _safe_current_url(self):
        """Return the driver's current URL, or "未知" if it cannot be read."""
        if not self.driver:
            return "未知"
        try:
            return self.driver.current_url
        except Exception:
            # Best effort: this is only used while building an error result,
            # where a dead browser session must not raise a second time.
            return "未知"

    def extract_product_info(self):
        """Extract product information from the currently loaded page.

        Each field is extracted independently; a field that cannot be read is
        replaced by a placeholder string instead of aborting the extraction.

        Returns:
            dict: Product information with the keys ``name``,
            ``description``, ``first_comment``, ``url`` and ``scraped_at``.
            If the extraction fails as a whole, every field holds a failure
            placeholder and an additional ``error`` key carries the message.
        """
        product_info = {}

        try:
            # Product name: text of the first <h1> element.
            try:
                h1_element = self.driver.find_element(By.TAG_NAME, "h1")
                product_name = h1_element.text.strip()
                logger.info(f"找到产品名称: {product_name}")
                product_info['name'] = product_name or "未找到产品名称"
            except NoSuchElementException:
                logger.error("未找到h1标签")
                product_info['name'] = "未找到产品名称"
            except Exception as e:
                logger.error(f"提取产品名称失败: {e}")
                product_info['name'] = "提取失败"

            # Product description: text of the div with
            # class="relative text-16 font-normal text-gray-700".
            try:
                description_element = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "div.relative.text-16.font-normal.text-gray-700",
                )
                product_description = description_element.text.strip()
                logger.info(f"找到产品简介: {product_description[:50]}...")
                product_info['description'] = product_description or "未找到产品简介"
            except NoSuchElementException:
                logger.error("未找到产品简介div")
                product_info['description'] = "未找到产品简介"
            except Exception as e:
                logger.error(f"提取产品简介失败: {e}")
                product_info['description'] = "提取失败"

            # First comment: full text of the first div with
            # class="flex flex-1 flex-col gap-2".
            try:
                comment_elements = self.driver.find_elements(
                    By.CSS_SELECTOR, "div.flex.flex-1.flex-col.gap-2"
                )
                if comment_elements:
                    first_comment = comment_elements[0].text.strip()
                    logger.info(f"找到第一个评论: {first_comment[:50]}...")
                    # Same empty-text fallback as for name and description.
                    product_info['first_comment'] = first_comment or "未找到评论"
                else:
                    logger.warning("未找到任何评论")
                    product_info['first_comment'] = "未找到评论"
            except Exception as e:
                logger.error(f"提取第一个评论失败: {e}")
                product_info['first_comment'] = "提取失败"

            # Record the current URL and the scrape time.
            product_info['url'] = self.driver.current_url
            product_info['scraped_at'] = datetime.now().isoformat()

            return product_info

        except Exception as e:
            logger.error(f"提取产品信息时出错: {e}")
            return {
                'name': "提取失败",
                'description': "提取失败",
                'first_comment': "提取失败",
                # Reading the URL again may fail for the same reason that got
                # us here, so use the guarded accessor.
                'url': self._safe_current_url(),
                'scraped_at': datetime.now().isoformat(),
                'error': str(e)
            }

    def scrape_product(self, product_url):
        """Scrape the product information found at the given URL.

        Connects to Chrome first if no driver is available yet.

        Args:
            product_url (str): URL of the product page.

        Returns:
            dict | None: Product information, or None if connecting to Chrome
            or navigating to the page failed.
        """
        if not self.driver:
            if not self.connect_to_chrome():
                logger.error("无法连接到Chrome实例")
                return None

        if not self.navigate_to_product(product_url):
            logger.error("无法导航到产品页面")
            return None

        return self.extract_product_info()

    def save_to_file(self, product_info, filename=None):
        """Save product information to a JSON file.

        Args:
            product_info (dict): Product information to serialize.
            filename (str, optional): Target file name. When omitted, a name
                of the form ``product_YYYYmmdd_HHMMSS.json`` is generated.

        Returns:
            str: Name of the file that was written.

        Raises:
            Exception: Any error raised while opening or writing the file is
                logged and re-raised.
        """
        if not filename:
            now = datetime.now()
            filename = f"product_{now.strftime('%Y%m%d_%H%M%S')}.json"

        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(product_info, f, ensure_ascii=False, indent=2)
            logger.info(f"产品信息已保存到 {filename}")
            return filename
        except Exception as e:
            logger.error(f"保存文件失败: {e}")
            raise

    def close(self):
        """Close the browser connection and forget the driver.

        Safe to call more than once. The driver reference is cleared even if
        quitting raises, so a later ``scrape_product`` call reconnects instead
        of reusing a dead session.
        """
        if not self.driver:
            return
        try:
            self.driver.quit()
            logger.info("已关闭Chrome连接")
        finally:
            self.driver = None
            self.wait = None


def main():
    """Example usage: scrape one product page, log it and save it as JSON."""
    # Set use_debug_address=True to attach to an existing Chrome debug instance.
    scraper = ProductHuntScraper(use_debug_address=False)

    try:
        # URL of the product to scrape.
        product_url = "https://www.producthunt.com/products/elsie-ai-beta"

        # Scrape the product information.
        product_info = scraper.scrape_product(product_url)

        if product_info:
            # Log the scraped information.
            logger.info("抓取到的产品信息:")
            logger.info(json.dumps(product_info, ensure_ascii=False, indent=2))

            # Save it to a file.
            filename = scraper.save_to_file(product_info)
            logger.info(f"产品信息已保存到: {filename}")
        else:
            logger.error("未能获取产品信息")

    except Exception as e:
        logger.error(f"程序执行出错: {e}")
    finally:
        # Close the Chrome instance.
        scraper.close()


if __name__ == "__main__":
    main()