生成并改进抓取 producthunt.com 网站内容的脚本

2025-11-18 22:24:04 +08:00
parent 1da5501e55
commit 40df4ee171
5 changed files with 174 additions and 14 deletions
--- a/product/playwright-get-data.py
+++ b/product/playwright-get-data.py
@@ -69,10 +69,7 @@ class ProductHuntScraper:
            logger.info(f"正在访问: {url}")
            # 增加页面导航超时时间到300秒
            await self.page.goto(url, wait_until="domcontentloaded", timeout=300000)
-            
-            # 等待页面加载完成，增加超时时间
-            await self.page.wait_for_load_state("networkidle", timeout=300000)
-            
+
            # 等待页面标题包含"Product Hunt"，最长等待300秒
            logger.info("等待页面标题包含'Product Hunt'...")
            max_wait_time = 300  # 最大等待时间（秒）
@@ -112,6 +109,137 @@ class ProductHuntScraper:
            logger.error(f"访问页面失败: {e}")
            return False
    
+    async def extract_maker_statement_from_new_window(self, maker_link, maker_text):
+        """Click the maker link and read the maker statement from the page it opens.
+
+        The link is located on ``self.page``, clicked at its centre with the mouse, and
+        the page that opens is read and then closed.
+
+        Falls back to ``_extract_maker_statement_direct_open`` when the link cannot be
+        found or measured, or when any step of the click flow raises.
+
+        Args:
+            maker_link: URL of the page holding the maker statement.
+            maker_text: Text of the link; returned when nothing better is found.
+
+        Returns:
+            The extracted statement text, or ``maker_text`` as the last resort.
+        """
+        if not maker_link:
+            # Without a URL neither the click flow nor the fallback can work.
+            logger.warning("制作人链接为空，回退到a标签文本")
+            return maker_text
+
+        try:
+            logger.info("模拟点击制作人链接...")
+
+            # Match on the last path segment of the URL first, then on the link text.
+            # NOTE(review): maker_text is interpolated unescaped; a double quote in it
+            # produces an invalid selector, which lands in the except branch below.
+            a_element = await self.page.query_selector(f'a[href*="{maker_link.split("/")[-1]}"]')
+            if not a_element:
+                a_element = await self.page.query_selector(f'a:has-text("{maker_text}")')
+
+            if not a_element:
+                logger.warning("未找到要点击的a标签，使用备用方法")
+                return await self._extract_maker_statement_direct_open(maker_link, maker_text)
+
+            # mouse.click() takes viewport coordinates, so scroll the link into view
+            # before measuring it; otherwise the click can land outside the viewport.
+            await a_element.scroll_into_view_if_needed()
+            bbox = await a_element.bounding_box()
+            if not bbox:
+                logger.warning("无法获取a标签边界框，使用备用方法")
+                return await self._extract_maker_statement_direct_open(maker_link, maker_text)
+
+            # Click the centre of the link.
+            center_x = bbox['x'] + bbox['width'] / 2
+            center_y = bbox['y'] + bbox['height'] / 2
+            logger.info(f"点击a标签中间位置: ({center_x:.1f}, {center_y:.1f})")
+
+            # NOTE(review): this waits for a *new* page; if the link navigates in the
+            # same tab instead, expect_page() should time out and the fallback runs.
+            async with self.page.context.expect_page() as new_page_info:
+                await self.page.mouse.click(center_x, center_y)
+            new_page = await new_page_info.value
+
+            try:
+                await new_page.wait_for_load_state("domcontentloaded")
+                # Extra 5 s so late-rendered content has a chance to appear.
+                await new_page.wait_for_timeout(5000)
+                logger.success("新窗口已加载完成")
+                return await self._read_maker_statement(new_page, maker_text)
+            finally:
+                # Close the new page even when extraction raises, so it is not leaked.
+                await new_page.close()
+                logger.info("新窗口已关闭")
+
+        except Exception as new_page_error:
+            logger.error(f"模拟点击操作失败: {new_page_error}")
+            return await self._extract_maker_statement_direct_open(maker_link, maker_text)
+
+    async def _read_maker_statement(self, page, maker_text):
+        """Return the statement text found on ``page``.
+
+        Tries the first class-less ``div`` inside the first ``section``, then the whole
+        ``section``, and finally falls back to ``maker_text``.
+
+        Args:
+            page: Page to read from.
+            maker_text: Text returned when the page has no ``section``.
+        """
+        first_section = await page.query_selector('section')
+        if not first_section:
+            logger.warning("未找到section标签")
+            logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
+            return maker_text
+
+        logger.success("找到第一个section标签")
+
+        div_without_class = await first_section.query_selector('div:not([class])')
+        if div_without_class:
+            logger.success("找到无class的div标签")
+            # Text of the div together with all of its descendants.
+            result = (await div_without_class.inner_text()).strip()
+            logger.info(f"制作人发言(新窗口): {result[:2000]}...")
+            return result
+
+        logger.warning("未找到无class的div标签")
+        result = (await first_section.inner_text()).strip()
+        logger.info(f"制作人发言(回退section): {result[:200]}...")
+        return result
+
+    async def _extract_maker_statement_direct_open(self, maker_link, maker_text):
+        """Fallback: open ``maker_link`` directly in a new page and read the statement.
+
+        Args:
+            maker_link: URL of the page holding the maker statement.
+            maker_text: Text returned when the page cannot be opened or read.
+
+        Returns:
+            The extracted statement text, or ``maker_text`` on any failure.
+        """
+        try:
+            logger.info("使用备用方法：直接在新窗口中打开链接...")
+
+            # NOTE(review): Browser.new_page() presumably opens the page in a fresh context,
+            # so it may not share the session of self.page -- confirm this is intended.
+            new_page = await self.browser.new_page()
+            try:
+                await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=300000)
+                # Give the page 3 s to render before querying it.
+                await new_page.wait_for_timeout(3000)
+                return await self._read_maker_statement(new_page, maker_text)
+            finally:
+                # Close the page on every path so a failed load does not leak it.
+                await new_page.close()
+                logger.info("新窗口已关闭")
+
+        except Exception as e:
+            logger.error(f"备用方法也失败: {e}")
+            # Nothing else to try: hand back the original link text.
+            return maker_text
+    
    async def extract_product_info(self):
        """提取产品信息"""
        if not self.page:
@@ -150,17 +278,39 @@ class ProductHuntScraper:
                if section_element:
                    logger.success("制作人发言区域已加载")
                    
-                    # 提取制作人发言（class为"flex flex-col gap-1"的div里面的span标签）
-                    maker_div = await section_element.query_selector('div.flex.flex-col.gap-1')
-                    if maker_div:
-                        span_element = await maker_div.query_selector('span')
+                    # 查找section标签下面的第一个a标签
+                    a_element = await section_element.query_selector('a')
+                    if a_element:
+                        # 提取a标签的文本内容
+                        maker_text = (await a_element.text_content()).strip()
+                        # 提取a标签的href属性（超链接）
+                        maker_link = await a_element.get_attribute('href')
+                        
+                        # 拼凑完整的URL
+                        if maker_link and not maker_link.startswith('http'):
+                            # 如果是相对路径，拼凑为完整URL
+                            base_url = "https://www.producthunt.com"
+                            if maker_link.startswith('/'):
+                                maker_link = base_url + maker_link
+                            else:
+                                maker_link = base_url + '/' + maker_link
+                        
+                        product_info["maker_link"] = maker_link
+                        logger.info(f"制作人链接: {maker_link}")
+                        
+                        # 调用子函数在新窗口中提取制作人发言
+                        product_info["maker_statement"] = await self.extract_maker_statement_from_new_window(maker_link, maker_text)
+                            
+                    else:
+                        logger.warning("在section中未找到a标签")
+                        # 如果没有a标签，尝试查找span标签
+                        span_element = await section_element.query_selector('span')
                        if span_element:
                            product_info["maker_statement"] = (await span_element.text_content()).strip()
-                            logger.info(f"制作人发言: {product_info['maker_statement'][:200]}...")
+                            logger.info(f"制作人发言(回退span): {product_info['maker_statement'][:200]}...")
                        else:
-                            logger.warning("在div中未找到span标签")
-                    else:
-                        logger.warning("未找到class为'flex flex-col gap-1'的div")
+                            logger.warning("未找到span标签")
+                      
                else:
                    logger.warning("制作人发言区域未加载")
            except Exception as e:
--- a/product/start_chrome.bat
+++ b/product/start_chrome.bat
@@ -0,0 +1 @@
+"C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\temp\chrome_debug"
--- a/product_info.json
+++ b/product_info.json
@@ -0,0 +1,7 @@
+{
+  "name": "Notion",
+  "introduction": "Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.",
+  "maker_link": "https://www.producthunt.com/products/notion/launches/ai-meeting-notes-by-notion",
+  "maker_statement": "AI Meeting Notes by Notion",
+  "user_count": "15K followers"
+}
--- a/product_screenshot.png
+++ b/product_screenshot.png
--- a/temp_product_info.txt
+++ b/temp_product_info.txt
@@ -4,6 +4,8 @@

 产品简介: Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.

-制作人发言: 未获取
+制作人发言: AI Meeting Notes by Notion

-提取时间: 2025-11-17 22:51:58
+用户数: 15K followers
+
+提取时间: 2025-11-18 22:18:51
				`@@ -0,0 +1 @@`
				`"C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\temp\chrome_debug"`