diff --git a/product/playwright-get-data.py b/product/playwright-get-data.py
index 4b84eb2..1340a5a 100644
--- a/product/playwright-get-data.py
+++ b/product/playwright-get-data.py
@@ -69,10 +69,7 @@ class ProductHuntScraper:
             logger.info(f"正在访问: {url}")
             # 增加页面导航超时时间到300秒
             await self.page.goto(url, wait_until="domcontentloaded", timeout=300000)
-
-            # 等待页面加载完成,增加超时时间
-            await self.page.wait_for_load_state("networkidle", timeout=300000)
-
+            
             # 等待页面标题包含"Product Hunt",最长等待300秒
             logger.info("等待页面标题包含'Product Hunt'...")
             max_wait_time = 300  # 最大等待时间(秒)
@@ -112,6 +109,132 @@ class ProductHuntScraper:
             logger.error(f"访问页面失败: {e}")
             return False
 
+    async def extract_maker_statement_from_new_window(self, maker_link, maker_text):
+        """Click the maker link on the current page and read the statement from the page it opens.
+
+        Falls back to ``_extract_maker_statement_direct_open`` when the link
+        cannot be located, has no bounding box, or any step raises.
+
+        Args:
+            maker_link: URL of the maker page; its last path segment is used
+                to find the matching ``<a>`` element on the current page.
+            maker_text: Text of the original link; used as a secondary
+                locator and as the last-resort return value.
+
+        Returns:
+            The statement text, or ``maker_text`` if nothing better could be
+            extracted.
+        """
+        try:
+            logger.info("模拟点击制作人链接...")
+
+            # Locate the anchor by the last path segment of the link first,
+            # then by its visible text.
+            a_element = await self.page.query_selector(f'a[href*="{maker_link.split("/")[-1]}"]')
+            if not a_element:
+                a_element = await self.page.query_selector(f'a:has-text("{maker_text}")')
+
+            if not a_element:
+                logger.warning("未找到要点击的a标签,使用备用方法")
+                return await self._extract_maker_statement_direct_open(maker_link, maker_text)
+
+            # The click targets the centre of the anchor, so its bounding box is needed.
+            bbox = await a_element.bounding_box()
+            if not bbox:
+                logger.warning("无法获取a标签边界框,使用备用方法")
+                return await self._extract_maker_statement_direct_open(maker_link, maker_text)
+
+            center_x = bbox['x'] + bbox['width'] / 2
+            center_y = bbox['y'] + bbox['height'] / 2
+
+            logger.info(f"点击a标签中间位置: ({center_x:.1f}, {center_y:.1f})")
+
+            # Capture the page that the click opens.
+            async with self.page.context.expect_page() as new_page_info:
+                await self.page.mouse.click(center_x, center_y)
+
+            new_page = await new_page_info.value
+            try:
+                await new_page.wait_for_load_state("domcontentloaded")
+                # Extra 5 s pause so late-rendered content is present.
+                await new_page.wait_for_timeout(5000)
+
+                logger.success("新窗口已加载完成")
+
+                return await self._read_maker_statement_from_page(new_page, maker_text)
+            finally:
+                # Close the page on every path; otherwise a failure here would
+                # leave it open while the fallback opens another one.
+                await new_page.close()
+                logger.info("新窗口已关闭")
+
+        except Exception as new_page_error:
+            logger.error(f"模拟点击操作失败: {new_page_error}")
+            return await self._extract_maker_statement_direct_open(maker_link, maker_text)
+
+    async def _extract_maker_statement_direct_open(self, maker_link, maker_text):
+        """Fallback: open ``maker_link`` in a page of its own and read the statement there.
+
+        Args:
+            maker_link: URL to navigate to.
+            maker_text: Returned unchanged when anything fails.
+
+        Returns:
+            The statement text, or ``maker_text`` on any error.
+        """
+        try:
+            logger.info("使用备用方法:直接在新窗口中打开链接...")
+            new_page = await self.browser.new_page()
+            try:
+                await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=300000)
+                # Fixed 3 s pause to give the page time to render.
+                await new_page.wait_for_timeout(3000)
+
+                return await self._read_maker_statement_from_page(new_page, maker_text)
+            finally:
+                # Close the page even when navigation or extraction raises.
+                await new_page.close()
+                logger.info("新窗口已关闭")
+
+        except Exception as e:
+            logger.error(f"备用方法也失败: {e}")
+            return maker_text
+
+    async def _read_maker_statement_from_page(self, page, maker_text):
+        """Read the maker statement from an already-loaded page.
+
+        Uses the first ``<section>``: the text of its first ``<div>`` that has
+        no ``class`` attribute, else the section's own text. When the page has
+        no ``<section>``, ``maker_text`` is returned unchanged.
+
+        Args:
+            page: Playwright page to read from; it is not closed here.
+            maker_text: Value returned when no ``<section>`` is found.
+
+        Returns:
+            The extracted text with surrounding whitespace stripped, or
+            ``maker_text``.
+        """
+        first_section = await page.query_selector('section')
+        if not first_section:
+            logger.warning("未找到section标签")
+            logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
+            return maker_text
+
+        logger.success("找到第一个section标签")
+
+        div_without_class = await first_section.query_selector('div:not([class])')
+        if div_without_class:
+            logger.success("找到无class的div标签")
+            result = (await div_without_class.inner_text()).strip()
+            logger.info(f"制作人发言(新窗口): {result[:2000]}...")
+            return result
+
+        logger.warning("未找到无class的div标签")
+        result = (await first_section.inner_text()).strip()
+        logger.info(f"制作人发言(回退section): {result[:200]}...")
+        return result
+
     async def extract_product_info(self):
         """提取产品信息"""
         if not self.page:
@@ -150,17 +273,39 @@ class ProductHuntScraper:
                 if section_element:
                     logger.success("制作人发言区域已加载")
 
-                    # 提取制作人发言(class为"flex flex-col gap-1"的div里面的span标签)
-                    maker_div = await section_element.query_selector('div.flex.flex-col.gap-1')
-                    if maker_div:
-                        span_element = await maker_div.query_selector('span')
+                    # Find the first <a> element under the section
+                    a_element = await section_element.query_selector('a')
+                    if a_element:
+                        # Text content of the <a> element
+                        maker_text = (await a_element.text_content()).strip()
+                        # href attribute (the hyperlink) of the <a> element
+                        maker_link = await a_element.get_attribute('href')
+
+                        # Build the full URL
+                        if maker_link and not maker_link.startswith('http'):
+                            # Relative path: prefix it with the site origin
+                            base_url = "https://www.producthunt.com"
+                            if maker_link.startswith('/'):
+                                maker_link = base_url + maker_link
+                            else:
+                                maker_link = base_url + '/' + maker_link
+
+                        product_info["maker_link"] = maker_link
+                        logger.info(f"制作人链接: {maker_link}")
+
+                        # Delegate to the helper that reads the maker statement in a new window
+                        product_info["maker_statement"] = await self.extract_maker_statement_from_new_window(maker_link, maker_text)
+
+                    else:
+                        logger.warning("在section中未找到a标签")
+                        # No <a> element: try a <span> instead
+                        span_element = await section_element.query_selector('span')
                         if span_element:
                             product_info["maker_statement"] = (await span_element.text_content()).strip()
-                            logger.info(f"制作人发言: {product_info['maker_statement'][:200]}...")
+                            logger.info(f"制作人发言(回退span): {product_info['maker_statement'][:200]}...")
                         else:
-                            logger.warning("在div中未找到span标签")
-                    else:
-                        logger.warning("未找到class为'flex flex-col gap-1'的div")
+                            logger.warning("未找到span标签")
+
                 else:
                     logger.warning("制作人发言区域未加载")
             except Exception as e:
diff --git 
a/product/start_chrome.bat b/product/start_chrome.bat new file mode 100644 index 0000000..8192e4c --- /dev/null +++ b/product/start_chrome.bat @@ -0,0 +1 @@ +"C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\temp\chrome_debug" \ No newline at end of file diff --git a/product_info.json b/product_info.json new file mode 100644 index 0000000..cae5632 --- /dev/null +++ b/product_info.json @@ -0,0 +1,7 @@ +{ + "name": "Notion", + "introduction": "Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.", + "maker_link": "https://www.producthunt.com/products/notion/launches/ai-meeting-notes-by-notion", + "maker_statement": "AI Meeting Notes by Notion", + "user_count": "15K followers" +} \ No newline at end of file diff --git a/product_screenshot.png b/product_screenshot.png index 60a59b2..9326969 100644 Binary files a/product_screenshot.png and b/product_screenshot.png differ diff --git a/temp_product_info.txt b/temp_product_info.txt index fe8de4c..5402e4b 100644 --- a/temp_product_info.txt +++ b/temp_product_info.txt @@ -4,6 +4,8 @@ 产品简介: Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows. -制作人发言: 未获取 +制作人发言: AI Meeting Notes by Notion -提取时间: 2025-11-17 22:51:58 +用户数: 15K followers + +提取时间: 2025-11-18 22:18:51