生成并改进抓取 producthunt.com 网站内容的脚本
This commit is contained in:
@@ -70,9 +70,6 @@ class ProductHuntScraper:
|
||||
# 增加页面导航超时时间到300秒
|
||||
await self.page.goto(url, wait_until="domcontentloaded", timeout=300000)
|
||||
|
||||
# 等待页面加载完成,增加超时时间
|
||||
await self.page.wait_for_load_state("networkidle", timeout=300000)
|
||||
|
||||
# 等待页面标题包含"Product Hunt",最长等待300秒
|
||||
logger.info("等待页面标题包含'Product Hunt'...")
|
||||
max_wait_time = 300 # 最大等待时间(秒)
|
||||
@@ -112,6 +109,137 @@ class ProductHuntScraper:
|
||||
logger.error(f"访问页面失败: {e}")
|
||||
return False
|
||||
|
||||
async def extract_maker_statement_from_new_window(self, maker_link, maker_text):
    """Click the maker link and read the maker statement from the page it opens.

    The anchor is looked up on ``self.page`` (first by the last path segment
    of ``maker_link``, then by its visible text), clicked at its centre with
    the mouse, and the page opened by that click is scraped: the text of the
    first class-less ``<div>`` inside the first ``<section>``, falling back
    to the whole section text, and finally to ``maker_text``.

    Args:
        maker_link: URL of the maker page; may be ``None`` or empty when the
            anchor had no ``href``.
        maker_text: Visible text of the anchor, used as the last-resort result.

    Returns:
        The extracted statement text. Whenever the click flow cannot be
        completed (anchor not found, no bounding box, any exception), the
        result of ``_extract_maker_statement_direct_open`` is returned instead.
    """

    def _quote(value):
        # Escape a value for use inside a double-quoted selector string so
        # quotes or backslashes in page text cannot break the selector.
        return value.replace("\\", "\\\\").replace('"', '\\"')

    new_page = None
    try:
        logger.info("模拟点击制作人链接...")

        # Last non-empty path segment of the link. Stripping the trailing
        # slash avoids an empty segment, which would turn the selector into
        # a[href*=""] and match an unrelated anchor.
        link_tail = maker_link.rstrip("/").split("/")[-1] if maker_link else ""

        a_element = None
        if link_tail:
            a_element = await self.page.query_selector(f'a[href*="{_quote(link_tail)}"]')
        if not a_element and maker_text:
            # No anchor with a matching href: try the anchor's visible text.
            # Whitespace is collapsed because a raw newline is not valid
            # inside a quoted selector string.
            visible_text = " ".join(maker_text.split())
            a_element = await self.page.query_selector(f'a:has-text("{_quote(visible_text)}")')

        if not a_element:
            logger.warning("未找到要点击的a标签,使用备用方法")
            # Fallback: open the link directly in a new page.
            return await self._extract_maker_statement_direct_open(maker_link, maker_text)

        # bounding_box() is relative to the viewport and mouse.click() takes
        # viewport coordinates, so the anchor must be on screen first.
        await a_element.scroll_into_view_if_needed()
        bbox = await a_element.bounding_box()
        if not bbox:
            logger.warning("无法获取a标签边界框,使用备用方法")
            return await self._extract_maker_statement_direct_open(maker_link, maker_text)

        # Centre of the anchor.
        center_x = bbox['x'] + bbox['width'] / 2
        center_y = bbox['y'] + bbox['height'] / 2
        logger.info(f"点击a标签中间位置: ({center_x:.1f}, {center_y:.1f})")

        # Wait for the page that the click opens.
        async with self.page.context.expect_page() as new_page_info:
            await self.page.mouse.click(center_x, center_y)
        new_page = await new_page_info.value

        # Wait for the new page to load, plus 5 extra seconds for content
        # rendered after DOMContentLoaded.
        await new_page.wait_for_load_state("domcontentloaded")
        await new_page.wait_for_timeout(5000)
        logger.success("新窗口已加载完成")

        first_section = await new_page.query_selector('section')
        if first_section:
            logger.success("找到第一个section标签")

            # Look for a <div> without any class attribute inside the section.
            div_without_class = await first_section.query_selector('div:not([class])')
            if div_without_class:
                logger.success("找到无class的div标签")
                # Text of the div including all of its descendants.
                result = (await div_without_class.inner_text()).strip()
                # NOTE(review): this preview is capped at 2000 chars while the
                # other branches use 200 — confirm which one is intended.
                logger.info(f"制作人发言(新窗口): {result[:2000]}...")
            else:
                logger.warning("未找到无class的div标签")
                # Fall back to the text of the whole section.
                result = (await first_section.inner_text()).strip()
                logger.info(f"制作人发言(回退section): {result[:200]}...")
        else:
            logger.warning("未找到section标签")
            # Fall back to the original anchor text.
            result = maker_text
            logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")

        return result

    except Exception as new_page_error:
        logger.error(f"模拟点击操作失败: {new_page_error}")
    finally:
        # Always release the popup, including on failure, so that repeated
        # failures do not accumulate open pages.
        if new_page is not None:
            try:
                await new_page.close()
                logger.info("新窗口已关闭")
            except Exception as close_error:
                logger.warning(f"关闭新窗口失败: {close_error}")

    # Only reached when the click flow raised: use the fallback.
    return await self._extract_maker_statement_direct_open(maker_link, maker_text)
|
||||
|
||||
async def _extract_maker_statement_direct_open(self, maker_link, maker_text):
    """Fallback: open ``maker_link`` directly in a new page and read the statement.

    A page is created with ``self.browser.new_page()`` and navigated to
    ``maker_link``. The statement is the text of the first class-less
    ``<div>`` inside the first ``<section>``, falling back to the whole
    section text, and finally to ``maker_text``.

    Args:
        maker_link: URL to open; when ``None`` or empty no page is opened.
        maker_text: Text returned when nothing better can be extracted.

    Returns:
        The extracted statement text, or ``maker_text`` when the link is
        missing or any step fails. This method does not raise for
        ``Exception`` subclasses; they are logged and swallowed.
    """
    if not maker_link:
        # Nothing to navigate to: skip creating a page that would only fail.
        logger.warning("制作人链接为空,回退到a标签文本")
        return maker_text

    new_page = None
    try:
        logger.info("使用备用方法:直接在新窗口中打开链接...")

        # NOTE(review): browser.new_page() presumably does not share the
        # context of self.page — confirm whether session state is needed here.
        new_page = await self.browser.new_page()

        # Navigate to the maker page, then wait 3 s for late content.
        await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=300000)
        await new_page.wait_for_timeout(3000)

        first_section = await new_page.query_selector('section')
        if first_section:
            logger.success("找到第一个section标签")

            # Look for a <div> without any class attribute inside the section.
            div_without_class = await first_section.query_selector('div:not([class])')
            if div_without_class:
                logger.success("找到无class的div标签")
                # Text of the div including all of its descendants.
                result = (await div_without_class.inner_text()).strip()
                # NOTE(review): this preview is capped at 2000 chars while the
                # other branches use 200 — confirm which one is intended.
                logger.info(f"制作人发言(新窗口): {result[:2000]}...")
            else:
                logger.warning("未找到无class的div标签")
                # Fall back to the text of the whole section.
                result = (await first_section.inner_text()).strip()
                logger.info(f"制作人发言(回退section): {result[:200]}...")
        else:
            logger.warning("未找到section标签")
            # Fall back to the original anchor text.
            result = maker_text
            logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")

        return result

    except Exception as e:
        logger.error(f"备用方法也失败: {e}")
        # Last resort: the original anchor text.
        return maker_text
    finally:
        # Close the page on every path; previously it stayed open whenever
        # navigation or extraction raised.
        if new_page is not None:
            try:
                await new_page.close()
                logger.info("新窗口已关闭")
            except Exception as close_error:
                logger.warning(f"关闭新窗口失败: {close_error}")
|
||||
|
||||
async def extract_product_info(self):
|
||||
"""提取产品信息"""
|
||||
if not self.page:
|
||||
@@ -150,17 +278,39 @@ class ProductHuntScraper:
|
||||
if section_element:
|
||||
logger.success("制作人发言区域已加载")
|
||||
|
||||
# 提取制作人发言(class为"flex flex-col gap-1"的div里面的span标签)
|
||||
maker_div = await section_element.query_selector('div.flex.flex-col.gap-1')
|
||||
if maker_div:
|
||||
span_element = await maker_div.query_selector('span')
|
||||
# 查找section标签下面的第一个a标签
|
||||
a_element = await section_element.query_selector('a')
|
||||
if a_element:
|
||||
# 提取a标签的文本内容
|
||||
maker_text = (await a_element.text_content()).strip()
|
||||
# 提取a标签的href属性(超链接)
|
||||
maker_link = await a_element.get_attribute('href')
|
||||
|
||||
# 拼凑完整的URL
|
||||
if maker_link and not maker_link.startswith('http'):
|
||||
# 如果是相对路径,拼凑为完整URL
|
||||
base_url = "https://www.producthunt.com"
|
||||
if maker_link.startswith('/'):
|
||||
maker_link = base_url + maker_link
|
||||
else:
|
||||
maker_link = base_url + '/' + maker_link
|
||||
|
||||
product_info["maker_link"] = maker_link
|
||||
logger.info(f"制作人链接: {maker_link}")
|
||||
|
||||
# 调用子函数在新窗口中提取制作人发言
|
||||
product_info["maker_statement"] = await self.extract_maker_statement_from_new_window(maker_link, maker_text)
|
||||
|
||||
else:
|
||||
logger.warning("在section中未找到a标签")
|
||||
# 如果没有a标签,尝试查找span标签
|
||||
span_element = await section_element.query_selector('span')
|
||||
if span_element:
|
||||
product_info["maker_statement"] = (await span_element.text_content()).strip()
|
||||
logger.info(f"制作人发言: {product_info['maker_statement'][:200]}...")
|
||||
logger.info(f"制作人发言(回退span): {product_info['maker_statement'][:200]}...")
|
||||
else:
|
||||
logger.warning("在div中未找到span标签")
|
||||
else:
|
||||
logger.warning("未找到class为'flex flex-col gap-1'的div")
|
||||
logger.warning("未找到span标签")
|
||||
|
||||
else:
|
||||
logger.warning("制作人发言区域未加载")
|
||||
except Exception as e:
|
||||
|
||||
1
product/start_chrome.bat
Normal file
1
product/start_chrome.bat
Normal file
@@ -0,0 +1 @@
|
||||
"C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 --user-data-dir="C:\temp\chrome_debug"
|
||||
7
product_info.json
Normal file
7
product_info.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"name": "Notion",
|
||||
"introduction": "Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.",
|
||||
"maker_link": "https://www.producthunt.com/products/notion/launches/ai-meeting-notes-by-notion",
|
||||
"maker_statement": "AI Meeting Notes by Notion",
|
||||
"user_count": "15K followers"
|
||||
}
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 642 KiB After Width: | Height: | Size: 616 KiB |
@@ -4,6 +4,8 @@
|
||||
|
||||
产品简介: Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.
|
||||
|
||||
制作人发言: 未获取
|
||||
制作人发言: AI Meeting Notes by Notion
|
||||
|
||||
提取时间: 2025-11-17 22:51:58
|
||||
用户数: 15K followers
|
||||
|
||||
提取时间: 2025-11-18 22:18:51
|
||||
|
||||
Reference in New Issue
Block a user