import json
import os
import time
from datetime import datetime
from typing import Any, Dict, Optional

from loguru import logger
from playwright.sync_api import sync_playwright


class ProductHuntScraper:
    """Scrape basic information about one ProductHunt product page.

    The scraper drives a Chromium browser through Playwright's sync API,
    opens the product page, collects a few text fields with CSS selectors
    and writes them to a JSON file.

    Every public step returns a success flag (or ``None`` for a failed
    extraction) and logs the error instead of raising.
    """

    HOME_URL = "https://www.producthunt.com/"
    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"

    # Placeholder stored for a required field that could not be extracted.
    NOT_FOUND = "未找到"

    # Fixed waits (seconds) after each navigation so dynamic content can load.
    HOME_SETTLE_SECONDS = 3
    PRODUCT_SETTLE_SECONDS = 10

    def __init__(self, product_url: str = DEFAULT_PRODUCT_URL):
        """Create a scraper for ``product_url``; no browser is started here.

        Args:
            product_url: Product page to scrape. It can also be reassigned
                through the ``product_url`` attribute before scraping.
        """
        # Initialise every handle so close() is safe at any point.
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.product_url = product_url

    def connect_to_browser(self) -> bool:
        """Start Playwright, launch Chromium and open a blank page.

        Returns:
            True on success. On any failure the error is logged, whatever
            was already started is released again, and False is returned.
        """
        try:
            logger.info("正在初始化Playwright浏览器...")

            self.playwright = sync_playwright().start()

            self.browser = self.playwright.chromium.launch(
                headless=False,  # keep the window visible to observe the run
                args=[
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--window-size=1920,1080",
                ],
            )

            self.context = self.browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                viewport={"width": 1920, "height": 1080},
            )

            self.page = self.context.new_page()

            logger.info("成功连接到Playwright浏览器")
            return True
        except Exception as e:
            logger.error(f"连接浏览器失败: {str(e)}")
            # A failure after launch() would otherwise leave the browser and
            # the Playwright driver running with nobody left to stop them.
            self.close()
            return False

    def navigate_to_product(self) -> bool:
        """Open the ProductHunt home page, then the product page.

        Returns:
            True when both navigations completed, False (after logging)
            when any step raised.
        """
        try:
            logger.info(f"正在导航到产品页面: {self.product_url}")

            # Visit the home page first, then go to the product page.
            logger.info("首先访问ProductHunt主页")
            self.page.goto(self.HOME_URL, wait_until="domcontentloaded")
            time.sleep(self.HOME_SETTLE_SECONDS)

            self.page.goto(self.product_url, wait_until="domcontentloaded")

            logger.info("等待页面加载...")
            time.sleep(self.PRODUCT_SETTLE_SECONDS)

            logger.info("页面加载完成")
            return True
        except Exception as e:
            logger.error(f"导航到产品页面失败: {str(e)}")
            return False

    def _query_text(self, selector: str) -> Optional[str]:
        """Return the stripped text of the first match, or None if no match."""
        element = self.page.query_selector(selector)
        if not element:
            return None
        return element.text_content().strip()

    def _extract_required(
        self,
        selector: str,
        label: str,
        hint: Optional[str] = None,
        preview: Optional[int] = None,
    ) -> str:
        """Extract one required text field, falling back to ``NOT_FOUND``.

        Args:
            selector: CSS selector of the element holding the text.
            label: Field name used in the log messages.
            hint: Text shown in parentheses in the "not found" warning;
                defaults to ``selector``.
            preview: When given, only this many leading characters are
                logged (followed by ``...``); the full text is returned.

        Returns:
            The extracted text, or ``NOT_FOUND`` when the element is missing
            or the lookup raised.
        """
        try:
            text = self._query_text(selector)
        except Exception as e:
            logger.warning(f"提取{label}时出错: {str(e)}")
            return self.NOT_FOUND

        if text is None:
            logger.warning(f"未找到{label} ({hint or selector})")
            return self.NOT_FOUND

        shown = text if preview is None else f"{text[:preview]}..."
        logger.info(f"{label}: {shown}")
        return text

    def extract_product_info(self) -> Optional[Dict[str, Any]]:
        """Collect product fields from the currently loaded page.

        ``name``, ``description`` and ``first_comment`` are always present
        (set to ``NOT_FOUND`` when missing). ``tags``, ``likes`` and
        ``comment_count`` are best-effort and only present when a matching
        element exists.

        Returns:
            The collected dict, or None if an unexpected error occurred.
        """
        try:
            logger.info("开始提取产品信息")

            product_info = {
                "url": self.product_url,
                "scraped_at": datetime.now().isoformat(),
            }

            # Product name: first <h1> on the page.
            product_info["name"] = self._extract_required(
                "h1", "产品名称", hint="h1标签"
            )

            # Product summary: div with classes
            # "relative text-16 font-normal text-gray-700".
            product_info["description"] = self._extract_required(
                "div.relative.text-16.font-normal.text-gray-700",
                "产品简介",
                preview=50,
            )

            # First comment: div with classes "flex flex-1 flex-col gap-2".
            product_info["first_comment"] = self._extract_required(
                "div.flex.flex-1.flex-col.gap-2",
                "第一个评论",
                preview=50,
            )

            # Tags (optional).
            try:
                tag_elements = self.page.query_selector_all(
                    "[class*='tag'], [class*='category'], [class*='topic']"
                )
                if tag_elements:
                    # Read each element's text once; every text_content()
                    # call is a round-trip to the browser.
                    texts = (tag.text_content().strip() for tag in tag_elements)
                    tags = [text for text in texts if text]
                    product_info["tags"] = tags[:5]  # keep at most five tags
                    logger.info(f"找到标签: {tags[:3]}")
            except Exception as e:
                logger.debug(f"提取标签时出错: {str(e)}")

            # Upvote count (optional): text of the first matching element.
            try:
                like_elements = self.page.query_selector_all(
                    "[class*='vote'], [class*='like'], [class*='upvote']"
                )
                if like_elements:
                    product_info["likes"] = like_elements[0].text_content().strip()
                    logger.info(f"点赞数: {product_info['likes']}")
            except Exception as e:
                logger.debug(f"提取点赞数时出错: {str(e)}")

            # Comment count (optional): text of the first matching element.
            try:
                comment_count_elements = self.page.query_selector_all(
                    "[class*='comment-count'], [class*='comments']"
                )
                if comment_count_elements:
                    product_info["comment_count"] = (
                        comment_count_elements[0].text_content().strip()
                    )
                    logger.info(f"评论数: {product_info['comment_count']}")
            except Exception as e:
                logger.debug(f"提取评论数时出错: {str(e)}")

            return product_info
        except Exception as e:
            logger.error(f"提取产品信息失败: {str(e)}")
            return None

    def save_to_file(self, data, filename="product_info.json") -> bool:
        """Write ``data`` to ``filename`` as UTF-8, indented JSON.

        Returns:
            True when the file was written, False (after logging) otherwise.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False

    def save_screenshot(self, filename="product_screenshot.png") -> bool:
        """Save a screenshot of the current page to ``filename`` for debugging.

        Returns:
            True when the screenshot was saved, False (after logging) otherwise.
        """
        try:
            self.page.screenshot(path=filename)
            logger.info(f"页面截图已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存页面截图失败: {str(e)}")
            return False

    def close(self) -> None:
        """Close the context and browser and stop Playwright.

        Each resource is released independently, so an error in one step
        does not prevent the remaining ones from being released. All handles
        are reset afterwards, which makes repeated calls harmless. Errors
        are logged, never raised.
        """
        steps = (
            ("context", "close"),
            ("browser", "close"),
            ("playwright", "stop"),
        )
        all_released = True
        for attr, method_name in steps:
            resource = getattr(self, attr, None)
            if not resource:
                continue
            try:
                getattr(resource, method_name)()
            except Exception as e:
                all_released = False
                logger.error(f"关闭浏览器时出错: {str(e)}")
            finally:
                setattr(self, attr, None)

        # The page belongs to the context that was just closed.
        self.page = None

        if all_released:
            logger.info("浏览器已关闭")

    def scrape_product(self) -> bool:
        """Run the whole flow: connect, navigate, screenshot, extract, save.

        Returns:
            True only when the product information was extracted and
            written to disk; False if any required step failed. The browser
            is closed in every case.
        """
        if not self.connect_to_browser():
            logger.error("无法连接到浏览器")
            return False

        try:
            if not self.navigate_to_product():
                logger.error("无法导航到产品页面")
                return False

            # Debug aid only; a failed screenshot does not abort the run.
            self.save_screenshot()

            product_info = self.extract_product_info()
            if not product_info:
                logger.error("未能提取产品信息")
                return False

            # Report a failed write as a failed scrape instead of success.
            return self.save_to_file(product_info)
        finally:
            self.close()


def main():
    """Scrape the default product page and log whether it succeeded."""
    logger.info("开始ProductHunt产品信息抓取")

    # Pass product_url=... (or reassign scraper.product_url) to scrape
    # another product, e.g.
    # ProductHuntScraper("https://www.producthunt.com/products/your-product")
    scraper = ProductHuntScraper()

    success = scraper.scrape_product()

    if success:
        logger.info("产品信息抓取完成")
    else:
        logger.error("产品信息抓取失败")


if __name__ == "__main__":
    main()