更新playwright控制chrome远程端口

2025-11-17 22:10:40 +08:00
parent e851d0d5fb
commit 74dfa978cf
30 changed files with 60875 additions and 64671 deletions
--- a/product/playwright-get-data.py
+++ b/product/playwright-get-data.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Use Playwright to attach to a remote Chrome debugging port and visit a ProductHunt page.
+"""
+
+import asyncio
+from playwright.async_api import async_playwright
+from loguru import logger
+import sys
+
+# Logging setup: drop loguru's previously registered handlers, then add a single stderr sink at INFO level with a colorized "time | level | name:function:line - message" layout.
+logger.remove()
+logger.add(sys.stderr, level="INFO", format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
+
+
+class ProductHuntScraper:
+    """ProductHunt数据抓取器"""
+    
+    def __init__(self, debug_port=9222):
+        self.debug_port = debug_port
+        self.browser = None
+        self.page = None
+        
+    async def connect_to_existing_chrome(self):
+        """连接到已运行的Chrome实例"""
+        logger.info(f"正在连接到Chrome远程调试端口 {self.debug_port}")
+        
+        try:
+            # 创建Playwright实例并保持引用
+            self.playwright = await async_playwright().start()
+            
+            # 连接到已运行的Chrome实例
+            self.browser = await self.playwright.chromium.connect_over_cdp(
+                f"http://localhost:{self.debug_port}"
+            )
+            
+            # 获取第一个上下文（通常是默认的）
+            contexts = self.browser.contexts
+            if contexts:
+                context = contexts[0]
+                # 获取第一个页面
+                pages = context.pages
+                if pages:
+                    self.page = pages[0]
+                else:
+                    # 如果没有页面，创建新页面
+                    self.page = await context.new_page()
+            else:
+                # 如果没有上下文，创建新上下文
+                context = await self.browser.new_context()
+                self.page = await context.new_page()
+            
+            logger.success("成功连接到Chrome浏览器")
+            return True
+            
+        except Exception as e:
+            logger.error(f"连接Chrome失败: {e}")
+            return False
+    
+    async def navigate_to_producthunt(self, url):
+        """导航到ProductHunt页面"""
+        if not self.page:
+            logger.error("页面未初始化")
+            return False
+            
+        try:
+            logger.info(f"正在访问: {url}")
+            # 增加页面导航超时时间到300秒
+            await self.page.goto(url, wait_until="domcontentloaded", timeout=300000)
+            
+            # 等待页面加载完成，增加超时时间
+            await self.page.wait_for_load_state("networkidle", timeout=300000)
+            
+            # 等待页面标题包含"Product Hunt"，最长等待300秒
+            logger.info("等待页面标题包含'Product Hunt'...")
+            max_wait_time = 300  # 最大等待时间（秒）
+            wait_interval = 5   # 检查间隔（秒）
+            waited_time = 0
+            
+            while waited_time < max_wait_time:
+                # 获取页面标题
+                title = await self.page.title()
+                logger.info(f"当前页面标题: {title}")
+                
+                # 检查标题是否包含"Product Hunt"
+                if "Product Hunt" in title:
+                    logger.success(f"页面标题已包含'Product Hunt'，等待时间: {waited_time}秒")
+                    logger.success("Product Hunt网站已成功打开")
+                    return True
+                
+                # 等待一段时间后再次检查
+                await asyncio.sleep(wait_interval)
+                waited_time += wait_interval
+                logger.info(f"已等待 {waited_time} 秒，继续等待...")
+            
+            # 如果超时仍未找到目标标题
+            logger.warning(f"等待超时（{max_wait_time}秒），页面标题仍未包含'Product Hunt'")
+            logger.info(f"最终页面标题: {await self.page.title()}")
+            
+            # 即使超时，如果页面正常加载也返回True
+            final_title = await self.page.title()
+            if final_title and "Not Found" not in final_title and "Error" not in final_title:
+                logger.success("页面已正常加载，但标题不符合预期")
+                return True
+            else:
+                logger.error("页面加载失败")
+                return False
+            
+        except Exception as e:
+            logger.error(f"访问页面失败: {e}")
+            return False
+    
+    async def extract_product_info(self):
+        """提取产品信息"""
+        if not self.page:
+            logger.error("页面未初始化")
+            return None
+            
+        try:
+            product_info = {}
+            
+            # 提取产品名称
+            name_element = await self.page.query_selector("h1")
+            if name_element:
+                product_info["name"] = await name_element.text_content()
+                logger.info(f"产品名称: {product_info['name']}")
+            
+            # 提取产品描述
+            desc_element = await self.page.query_selector("[data-testid='product-description']")
+            if not desc_element:
+                desc_element = await self.page.query_selector(".styles_description__")
+            
+            if desc_element:
+                product_info["description"] = await desc_element.text_content()
+                logger.info(f"产品描述: {product_info['description'][:100]}...")
+            
+            # 提取投票数
+            votes_element = await self.page.query_selector("[data-testid='vote-button']")
+            if votes_element:
+                votes_text = await votes_element.text_content()
+                product_info["votes"] = votes_text
+                logger.info(f"投票数: {votes_text}")
+            
+            # 提取产品链接
+            website_element = await self.page.query_selector("a[href*='://']")
+            if website_element:
+                product_info["website"] = await website_element.get_attribute("href")
+                logger.info(f"产品网站: {product_info['website']}")
+            
+            # 截取页面截图
+            screenshot_path = "product_screenshot.png"
+            await self.page.screenshot(path=screenshot_path, full_page=True)
+            logger.info(f"页面截图已保存到: {screenshot_path}")
+            
+            return product_info
+            
+        except Exception as e:
+            logger.error(f"提取产品信息失败: {e}")
+            return None
+    
+    async def close(self):
+        """关闭连接"""
+        if self.browser:
+            await self.browser.close()
+            logger.info("浏览器连接已关闭")
+        
+        if hasattr(self, 'playwright') and self.playwright:
+            await self.playwright.stop()
+            logger.info("Playwright实例已关闭")
+
+
+async def main():
+    """主函数"""
+    logger.info("开始ProductHunt数据抓取任务")
+    
+    # 目标URL
+    target_url = "https://www.producthunt.com/products/notion"
+    
+    # 创建抓取器实例
+    scraper = ProductHuntScraper(debug_port=9222)
+    
+    try:
+        # 连接到Chrome
+        if not await scraper.connect_to_existing_chrome():
+            logger.error("无法连接到Chrome，请确保Chrome已启动并启用远程调试")
+            return
+        
+        # 导航到目标页面
+        if not await scraper.navigate_to_producthunt(target_url):
+            logger.error("页面访问失败")
+            return
+        
+        # 提取产品信息
+        product_info = await scraper.extract_product_info()
+        
+        if product_info:
+            logger.success("产品信息提取完成")
+            # 保存产品信息到JSON文件
+            import json
+            with open("product_info.json", "w", encoding="utf-8") as f:
+                json.dump(product_info, f, ensure_ascii=False, indent=2)
+            logger.info("产品信息已保存到 product_info.json")
+        else:
+            logger.warning("未能提取到产品信息")
+        
+    except Exception as e:
+        logger.error(f"执行过程中发生错误: {e}")
+    
+    finally:
+        # 关闭连接
+        await scraper.close()
+        logger.info("任务完成")
+
+
+# Script entry point: run the async main() coroutine to completion when this file is executed directly.
+if __name__ == "__main__":
+    asyncio.run(main())