生成并改进抓取 producthunt.com 网站内容的脚本

This commit is contained in:
2025-11-18 22:24:04 +08:00
parent 1da5501e55
commit 40df4ee171
5 changed files with 174 additions and 14 deletions

View File

@@ -69,10 +69,7 @@ class ProductHuntScraper:
logger.info(f"正在访问: {url}")
# 增加页面导航超时时间到300秒
await self.page.goto(url, wait_until="domcontentloaded", timeout=300000)
# 等待页面加载完成,增加超时时间
await self.page.wait_for_load_state("networkidle", timeout=300000)
# 等待页面标题包含"Product Hunt"最长等待300秒
logger.info("等待页面标题包含'Product Hunt'...")
max_wait_time = 300 # 最大等待时间(秒)
@@ -112,6 +109,137 @@ class ProductHuntScraper:
logger.error(f"访问页面失败: {e}")
return False
async def extract_maker_statement_from_new_window(self, maker_link, maker_text):
    """Click the maker link on the current page and scrape the statement from the tab it opens.

    The anchor is located first by the last path segment of ``maker_link``
    and, failing that, by its visible text. The click is performed with the
    mouse at the centre of the anchor, and the page opened by that click is
    read as follows:

    1. text of the first ``div`` without a ``class`` attribute inside the
       first ``<section>``;
    2. otherwise the text of the first ``<section>``;
    3. otherwise ``maker_text`` unchanged.

    Args:
        maker_link: Absolute URL of the maker/launch page (may be ``None``
            when the anchor had no ``href``).
        maker_text: Visible text of the anchor; used as a locator fallback
            and as the last-resort return value.

    Returns:
        The extracted statement text. If the anchor cannot be located or
        anything in the click flow raises, the result of
        ``_extract_maker_statement_direct_open`` is returned instead.
    """

    def _css_string(value):
        # Escape backslashes and double quotes so the value can be embedded
        # in a double-quoted CSS string without terminating it early.
        return value.replace("\\", "\\\\").replace('"', '\\"')

    try:
        logger.info("模拟点击制作人链接...")

        a_element = None

        # Preferred locator: an anchor whose href contains the link's last path segment.
        slug = maker_link.split("/")[-1] if maker_link else ""
        if slug:
            a_element = await self.page.query_selector(f'a[href*="{_css_string(slug)}"]')

        if not a_element:
            # Fallback locator: an anchor containing the maker text. Whitespace is
            # collapsed because a raw newline is not allowed inside a CSS string,
            # and empty text is skipped because it would match arbitrary anchors.
            needle = " ".join((maker_text or "").split())
            if needle:
                a_element = await self.page.query_selector(f'a:has-text("{_css_string(needle)}")')

        if not a_element:
            logger.warning("未找到要点击的a标签使用备用方法")
            # Fallback: open the link directly in a new page.
            return await self._extract_maker_statement_direct_open(maker_link, maker_text)

        # mouse.click() takes viewport coordinates, so the anchor must be on
        # screen before its bounding box is measured.
        await a_element.scroll_into_view_if_needed()

        bbox = await a_element.bounding_box()
        if not bbox:
            logger.warning("无法获取a标签边界框使用备用方法")
            return await self._extract_maker_statement_direct_open(maker_link, maker_text)

        # Centre of the anchor.
        center_x = bbox['x'] + bbox['width'] / 2
        center_y = bbox['y'] + bbox['height'] / 2
        logger.info(f"点击a标签中间位置: ({center_x:.1f}, {center_y:.1f})")

        # Wait for the page that the click is expected to open.
        async with self.page.context.expect_page() as new_page_info:
            await self.page.mouse.click(center_x, center_y)
        new_page = await new_page_info.value

        try:
            await new_page.wait_for_load_state("domcontentloaded")
            # Extra 5 s so late-rendered content has a chance to appear.
            await new_page.wait_for_timeout(5000)
            logger.success("新窗口已加载完成")

            first_section = await new_page.query_selector('section')
            if first_section:
                logger.success("找到第一个section标签")
                # Look for a div without any class attribute under the section.
                div_without_class = await first_section.query_selector('div:not([class])')
                if div_without_class:
                    logger.success("找到无class的div标签")
                    # Text of the div including all of its descendants.
                    maker_statement = await div_without_class.inner_text()
                    result = maker_statement.strip()
                    logger.info(f"制作人发言(新窗口): {result[:2000]}...")
                else:
                    logger.warning("未找到无class的div标签")
                    # Fall back to the text of the whole section.
                    section_text = await first_section.inner_text()
                    result = section_text.strip()
                    logger.info(f"制作人发言(回退section): {result[:200]}...")
            else:
                logger.warning("未找到section标签")
                # Fall back to the original anchor text.
                result = maker_text
                logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
        finally:
            # Always close the popup, even when extraction raised, so that
            # failed attempts do not leave pages open.
            await new_page.close()
            logger.info("新窗口已关闭")

        return result

    except Exception as new_page_error:
        logger.error(f"模拟点击操作失败: {new_page_error}")
        # The click flow failed; try opening the link directly instead.
        return await self._extract_maker_statement_direct_open(maker_link, maker_text)
async def _extract_maker_statement_direct_open(self, maker_link, maker_text):
    """Fallback: open ``maker_link`` directly in a new page and scrape the statement.

    The page is read as follows:

    1. text of the first ``div`` without a ``class`` attribute inside the
       first ``<section>``;
    2. otherwise the text of the first ``<section>``;
    3. otherwise ``maker_text`` unchanged.

    Args:
        maker_link: URL to navigate to.
        maker_text: Anchor text returned when nothing better can be extracted.

    Returns:
        The extracted statement text, or ``maker_text`` if any step raises
        (the error is logged, not propagated).
    """
    try:
        logger.info("使用备用方法:直接在新窗口中打开链接...")

        # NOTE(review): Playwright's Browser.new_page() creates the page in a
        # fresh browser context, so cookies/login state of self.page are
        # presumably not shared - confirm this is intended.
        new_page = await self.browser.new_page()

        try:
            # Navigate to the maker page.
            await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=300000)
            # Give client-side rendering a moment to finish.
            await new_page.wait_for_timeout(3000)

            first_section = await new_page.query_selector('section')
            if first_section:
                logger.success("找到第一个section标签")
                # Look for a div without any class attribute under the section.
                div_without_class = await first_section.query_selector('div:not([class])')
                if div_without_class:
                    logger.success("找到无class的div标签")
                    # Text of the div including all of its descendants.
                    maker_statement = await div_without_class.inner_text()
                    result = maker_statement.strip()
                    logger.info(f"制作人发言(新窗口): {result[:2000]}...")
                else:
                    logger.warning("未找到无class的div标签")
                    # Fall back to the text of the whole section.
                    section_text = await first_section.inner_text()
                    result = section_text.strip()
                    logger.info(f"制作人发言(回退section): {result[:200]}...")
            else:
                logger.warning("未找到section标签")
                # Fall back to the original anchor text.
                result = maker_text
                logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
        finally:
            # Close the page on every path; previously a failed goto() or
            # extraction left the page open for the rest of the run.
            await new_page.close()
            logger.info("新窗口已关闭")

        return result

    except Exception as e:
        logger.error(f"备用方法也失败: {e}")
        # Last resort: return the original anchor text.
        return maker_text
async def extract_product_info(self):
"""提取产品信息"""
if not self.page:
@@ -150,17 +278,39 @@ class ProductHuntScraper:
if section_element:
logger.success("制作人发言区域已加载")
# 提取制作人发言class为"flex flex-col gap-1"的div里面的span标签
maker_div = await section_element.query_selector('div.flex.flex-col.gap-1')
if maker_div:
span_element = await maker_div.query_selector('span')
# 查找section标签下面的第一个a标签
a_element = await section_element.query_selector('a')
if a_element:
# 提取a标签的文本内容
maker_text = (await a_element.text_content()).strip()
# 提取a标签的href属性超链接
maker_link = await a_element.get_attribute('href')
# 拼凑完整的URL
if maker_link and not maker_link.startswith('http'):
# 如果是相对路径拼凑为完整URL
base_url = "https://www.producthunt.com"
if maker_link.startswith('/'):
maker_link = base_url + maker_link
else:
maker_link = base_url + '/' + maker_link
product_info["maker_link"] = maker_link
logger.info(f"制作人链接: {maker_link}")
# 调用子函数在新窗口中提取制作人发言
product_info["maker_statement"] = await self.extract_maker_statement_from_new_window(maker_link, maker_text)
else:
logger.warning("在section中未找到a标签")
# 如果没有a标签尝试查找span标签
span_element = await section_element.query_selector('span')
if span_element:
product_info["maker_statement"] = (await span_element.text_content()).strip()
logger.info(f"制作人发言: {product_info['maker_statement'][:200]}...")
logger.info(f"制作人发言(回退span): {product_info['maker_statement'][:200]}...")
else:
logger.warning("在div中未找到span标签")
else:
logger.warning("未找到class为'flex flex-col gap-1'的div")
logger.warning("未找到span标签")
else:
logger.warning("制作人发言区域未加载")
except Exception as e:

1
product/start_chrome.bat Normal file
View File

@@ -0,0 +1 @@
"C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\temp\chrome_debug"

7
product_info.json Normal file
View File

@@ -0,0 +1,7 @@
{
"name": "Notion",
"introduction": "Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.",
"maker_link": "https://www.producthunt.com/products/notion/launches/ai-meeting-notes-by-notion",
"maker_statement": "AI Meeting Notes by Notion",
"user_count": "15K followers"
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 642 KiB

After

Width:  |  Height:  |  Size: 616 KiB

View File

@@ -4,6 +4,8 @@
产品简介: Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.
制作人发言: 未获取
制作人发言: AI Meeting Notes by Notion
提取时间: 2025-11-17 22:51:58
用户数: 15K followers
提取时间: 2025-11-18 22:18:51