更新了抓取producthunt的代码
This commit is contained in:
@@ -114,7 +114,7 @@ class ProductHuntScraper:
|
||||
|
||||
# 等待页面标题包含"Product Hunt",最长等待300秒
|
||||
logger.info("等待页面标题包含'Product Hunt'...")
|
||||
max_wait_time = 300 # 最大等待时间(秒)
|
||||
max_wait_time = 60 # 最大等待时间(秒)
|
||||
wait_interval = 5 # 检查间隔(秒)
|
||||
waited_time = 0
|
||||
|
||||
@@ -129,6 +129,23 @@ class ProductHuntScraper:
|
||||
logger.success("Product Hunt网站已成功打开")
|
||||
return True
|
||||
|
||||
# 检查是否遇到Cloudflare验证
|
||||
if "Just a moment" in title or "请稍候" in title or "Checking your browser" in title:
|
||||
logger.info("遇到Cloudflare验证,等待验证完成...")
|
||||
await asyncio.sleep(10) # 等待10秒
|
||||
waited_time += 10
|
||||
continue
|
||||
|
||||
# 检查是否已成功加载页面内容
|
||||
try:
|
||||
# 尝试查找页面中的关键元素
|
||||
h1_element = await self.page.query_selector("h1")
|
||||
if h1_element:
|
||||
logger.success("检测到页面内容已加载")
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 等待一段时间后再次检查
|
||||
await asyncio.sleep(wait_interval)
|
||||
waited_time += wait_interval
|
||||
@@ -165,59 +182,231 @@ class ProductHuntScraper:
|
||||
# 记录点击制作人链接的行为
|
||||
await self.record_click("制作人链接", "点击制作人链接在当前窗口打开")
|
||||
|
||||
# 保存当前页面的URL,以便后续返回
|
||||
original_url = self.page.url
|
||||
logger.info(f"保存当前页面URL: {original_url}")
|
||||
|
||||
# 在当前页面导航到制作人链接
|
||||
logger.info(f"正在在当前窗口打开制作人链接: {maker_link}")
|
||||
await self.page.goto(maker_link, wait_until="domcontentloaded")
|
||||
|
||||
# 设置更长的超时时间来处理模态窗口
|
||||
try:
|
||||
await self.page.goto(maker_link, wait_until="domcontentloaded", timeout=60000)
|
||||
logger.success("页面导航成功")
|
||||
except Exception as e:
|
||||
logger.error(f"页面导航失败: {e}")
|
||||
# 尝试返回原始页面
|
||||
try:
|
||||
await self.page.goto(original_url, wait_until="domcontentloaded")
|
||||
logger.success(f"已返回原始页面: {original_url}")
|
||||
except Exception as return_error:
|
||||
logger.error(f"返回原始页面失败: {return_error}")
|
||||
return ""
|
||||
|
||||
# 等待页面加载
|
||||
await self.page.wait_for_load_state("networkidle")
|
||||
|
||||
# 等待title元素出现并包含产品名称(最长等待2分钟)
|
||||
logger.info("等待title元素出现并包含产品名称(最长等待2分钟)...")
|
||||
# 检查并处理可能的模态窗口
|
||||
try:
|
||||
# 等待title元素出现,最长等待2分钟
|
||||
await self.page.wait_for_selector("title", timeout=120000)
|
||||
logger.info("检查是否存在模态窗口...")
|
||||
modal_selectors = [
|
||||
"[role='dialog']",
|
||||
".modal",
|
||||
".modal-dialog",
|
||||
"[data-testid='modal']",
|
||||
"[class*='modal']",
|
||||
"[class*='overlay']",
|
||||
"[class*='dialog']",
|
||||
"[class*='popup']"
|
||||
]
|
||||
|
||||
# 检查title是否包含产品名称
|
||||
for selector in modal_selectors:
|
||||
try:
|
||||
modal_element = await self.page.query_selector(selector)
|
||||
if modal_element:
|
||||
logger.info(f"检测到模态窗口,选择器: {selector}")
|
||||
|
||||
# 尝试关闭模态窗口
|
||||
close_selectors = [
|
||||
"[aria-label='Close']",
|
||||
".close",
|
||||
".modal-close",
|
||||
"[data-testid='close']",
|
||||
"button:has-text('Close')",
|
||||
"button:has-text('关闭')",
|
||||
"button:has-text('X')"
|
||||
]
|
||||
|
||||
for close_selector in close_selectors:
|
||||
try:
|
||||
close_button = await modal_element.query_selector(close_selector)
|
||||
if close_button:
|
||||
await close_button.click()
|
||||
logger.success(f"已关闭模态窗口,使用选择器: {close_selector}")
|
||||
await self.page.wait_for_timeout(1000) # 等待关闭动画
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# 如果模态窗口仍然存在,尝试点击模态窗口外部关闭
|
||||
try:
|
||||
await self.page.mouse.click(10, 10) # 点击页面左上角
|
||||
logger.info("尝试点击页面外部关闭模态窗口")
|
||||
await self.page.wait_for_timeout(1000)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.warning(f"检查模态窗口时出错: {e}")
|
||||
|
||||
# 快速检查页面是否已加载
|
||||
logger.info("快速检查页面加载状态...")
|
||||
|
||||
# 立即尝试获取页面内容,不等待特定元素
|
||||
try:
|
||||
title_text = await self.page.title()
|
||||
logger.info(f"页面标题: {title_text}")
|
||||
|
||||
# 获取产品名称(从maker_text参数中获取)
|
||||
product_name = maker_text.strip() if maker_text else ""
|
||||
|
||||
if product_name and product_name.lower() in title_text.lower():
|
||||
logger.success(f"标题包含产品名称: {product_name}")
|
||||
else:
|
||||
logger.warning(f"标题不包含产品名称,产品名称: {product_name}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"等待title元素失败: {e}")
|
||||
logger.warning(f"获取页面标题失败: {e}")
|
||||
|
||||
# 再等待30秒,确保页面完全加载
|
||||
logger.info("再等待30秒,确保页面完全加载...")
|
||||
await self.page.wait_for_timeout(30000) # 等待30秒
|
||||
|
||||
# 提取制作人评论内容(XPath: //*[@id=\"comment-4597755\"]/div/div[2]/div/div/div)
|
||||
logger.info("正在提取制作人评论内容...")
|
||||
# 快速检查页面是否有内容
|
||||
try:
|
||||
# 使用XPath查找评论元素
|
||||
comment_element = await self.page.query_selector(
|
||||
'xpath=//*[@id="comment-4597755"]/div/div[2]/div/div/div'
|
||||
)
|
||||
if comment_element:
|
||||
maker_statement = (await comment_element.text_content()).strip()
|
||||
logger.info(f"制作人评论内容: {maker_statement[:200]}...")
|
||||
|
||||
return maker_statement
|
||||
else:
|
||||
logger.warning("未找到XPath为//*[@id=\"comment-4597755\"]/div/div[2]/div/div/div的元素")
|
||||
body_element = await self.page.query_selector("body")
|
||||
if body_element:
|
||||
body_text = await body_element.text_content()
|
||||
if len(body_text.strip()) > 10:
|
||||
logger.success("页面内容已加载")
|
||||
else:
|
||||
logger.warning("页面内容为空或过短")
|
||||
except Exception as e:
|
||||
logger.error(f"提取制作人评论内容失败: {e}")
|
||||
logger.warning(f"检查页面内容失败: {e}")
|
||||
|
||||
# 短暂等待确保DOM稳定
|
||||
logger.info("等待DOM稳定...")
|
||||
await self.page.wait_for_timeout(2000) # 等待2秒
|
||||
|
||||
# 保存模态窗口截图用于调试
|
||||
modal_screenshot = "modal_window_debug.png"
|
||||
await self.page.screenshot(path=modal_screenshot, full_page=True)
|
||||
logger.info(f"模态窗口调试截图已保存到: {modal_screenshot}")
|
||||
|
||||
# 首先检查页面内容,获取页面主要文本
|
||||
try:
|
||||
page_content = await self.page.content()
|
||||
logger.info("页面内容已获取")
|
||||
|
||||
# 检查页面是否包含常见的关键词
|
||||
keywords = ['comment', 'discussion', 'maker', 'creator', 'author', 'statement', 'description']
|
||||
found_keywords = [kw for kw in keywords if kw in page_content.lower()]
|
||||
if found_keywords:
|
||||
logger.info(f"页面包含关键词: {found_keywords}")
|
||||
else:
|
||||
logger.warning("页面未检测到常见关键词")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"获取页面内容失败: {e}")
|
||||
|
||||
# 提取制作人评论内容 - 针对模态窗口的多种选择器策略
|
||||
logger.info("正在提取制作人评论内容...")
|
||||
|
||||
# 策略1:尝试多种XPath选择器
|
||||
xpath_selectors = [
|
||||
# 新的主要选择器:包含prose、prose-format和richText类的div
|
||||
"//div[contains(@class, 'prose') and contains(@class, 'prose-format') and contains(@class, 'richText')]",
|
||||
# 备用选择器
|
||||
'//*[@id="comment-4597755"]/div/div[2]/div/div/div', # 原始选择器
|
||||
'//div[contains(@class, "comment")]//div[contains(@class, "text")]', # 通用评论选择器
|
||||
'//div[contains(@class, "modal")]//div[contains(@class, "content")]', # 模态窗口内容
|
||||
'//div[contains(@class, "dialog")]//div[contains(@class, "body")]', # 对话框内容
|
||||
'//section//div[contains(@class, "text")]', # section内的文本内容
|
||||
'//div[contains(@class, "launch")]//div[contains(@class, "description")]', # 发布描述
|
||||
'//article//div[contains(@class, "content")]', # 文章内容
|
||||
'//main//div[contains(@class, "text")]', # 主要内容区文本
|
||||
# 其他备用选择器
|
||||
"//div[contains(@class, 'styles_commentsContainer')]//div[contains(@class, 'styles_comment')]//div[contains(@class, 'styles_commentBody')]//p",
|
||||
"//div[contains(@class, 'comment')]//p",
|
||||
"//div[contains(@class, 'comments')]//p",
|
||||
]
|
||||
|
||||
for i, xpath in enumerate(xpath_selectors, 1):
|
||||
try:
|
||||
logger.info(f"尝试选择器 {i}/{len(xpath_selectors)}: {xpath}")
|
||||
comment_element = await self.page.query_selector(f'xpath={xpath}')
|
||||
if comment_element:
|
||||
maker_statement = (await comment_element.text_content()).strip()
|
||||
if maker_statement: # 确保有内容
|
||||
logger.success(f"使用选择器 {i} 成功提取制作人评论内容: {maker_statement[:200]}...")
|
||||
|
||||
# 提取完成后返回原始页面
|
||||
logger.info("提取完成,正在返回原始产品页面...")
|
||||
await self.page.goto(original_url, wait_until="domcontentloaded")
|
||||
logger.success(f"已成功返回原始页面: {original_url}")
|
||||
|
||||
return maker_statement
|
||||
else:
|
||||
logger.warning(f"选择器 {i} 提取的内容为空")
|
||||
except Exception as e:
|
||||
logger.warning(f"选择器 {i} 失败: {e}")
|
||||
|
||||
# 策略2:如果所有选择器都失败,尝试提取页面主要文本内容
|
||||
logger.info("所有选择器失败,尝试提取页面主要文本内容...")
|
||||
try:
|
||||
# 获取页面body文本
|
||||
body_element = await self.page.query_selector('body')
|
||||
if body_element:
|
||||
full_text = (await body_element.text_content()).strip()
|
||||
# 提取前500个字符作为制作人发言
|
||||
if len(full_text) > 100:
|
||||
maker_statement = full_text[:500]
|
||||
logger.info(f"提取页面主要文本内容: {maker_statement[:200]}...")
|
||||
|
||||
# 提取完成后返回原始页面
|
||||
logger.info("提取完成,正在返回原始产品页面...")
|
||||
await self.page.goto(original_url, wait_until="domcontentloaded")
|
||||
logger.success(f"已成功返回原始页面: {original_url}")
|
||||
|
||||
return maker_statement
|
||||
except Exception as e:
|
||||
logger.error(f"提取页面主要文本内容失败: {e}")
|
||||
|
||||
# 策略3:如果仍然失败,记录页面截图以便调试
|
||||
logger.warning("所有提取策略都失败,保存截图用于调试...")
|
||||
try:
|
||||
screenshot_path = "modal_debug_screenshot.png"
|
||||
await self.page.screenshot(path=screenshot_path, full_page=True)
|
||||
logger.info(f"模态窗口截图已保存到: {screenshot_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"保存截图失败: {e}")
|
||||
|
||||
# 即使未找到元素,也返回原始页面
|
||||
logger.info("正在返回原始产品页面...")
|
||||
await self.page.goto(original_url, wait_until="domcontentloaded")
|
||||
logger.success(f"已成功返回原始页面: {original_url}")
|
||||
|
||||
return ""
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"在当前窗口打开制作人链接失败: {e}")
|
||||
|
||||
# 保存当前页面截图用于调试
|
||||
try:
|
||||
debug_screenshot = "debug_maker_link_failure.png"
|
||||
await self.page.screenshot(path=debug_screenshot, full_page=True)
|
||||
logger.info(f"错误调试截图已保存到: {debug_screenshot}")
|
||||
except Exception as screenshot_error:
|
||||
logger.error(f"保存调试截图失败: {screenshot_error}")
|
||||
|
||||
# 发生异常时也尝试返回原始页面
|
||||
try:
|
||||
logger.info("发生异常,尝试返回原始产品页面...")
|
||||
await self.page.goto(original_url, wait_until="domcontentloaded")
|
||||
logger.success(f"已成功返回原始页面: {original_url}")
|
||||
except Exception as return_error:
|
||||
logger.error(f"返回原始页面失败: {return_error}")
|
||||
|
||||
return ""
|
||||
|
||||
async def _extract_maker_statement_direct_open(self, maker_link, maker_text):
|
||||
@@ -351,19 +540,30 @@ class ProductHuntScraper:
|
||||
maker_link = await a_element.get_attribute('href')
|
||||
|
||||
# 拼凑完整的URL
|
||||
if maker_link and not maker_link.startswith('http'):
|
||||
# 如果是相对路径,拼凑为完整URL
|
||||
base_url = "https://www.producthunt.com"
|
||||
if maker_link.startswith('/'):
|
||||
maker_link = base_url + maker_link
|
||||
if maker_link:
|
||||
if not maker_link.startswith('http'):
|
||||
# 如果是相对路径,拼凑为完整URL
|
||||
base_url = "https://www.producthunt.com"
|
||||
if maker_link.startswith('/'):
|
||||
maker_link = base_url + maker_link
|
||||
else:
|
||||
maker_link = base_url + '/' + maker_link
|
||||
|
||||
# 验证URL是否有效(不能只是根路径)
|
||||
if maker_link == "https://www.producthunt.com/" or maker_link == "https://www.producthunt.com":
|
||||
logger.warning(f"制作人链接无效,跳过提取: {maker_link}")
|
||||
product_info["maker_link"] = ""
|
||||
product_info["maker_statement"] = ""
|
||||
else:
|
||||
maker_link = base_url + '/' + maker_link
|
||||
|
||||
product_info["maker_link"] = maker_link
|
||||
logger.info(f"制作人链接: {maker_link}")
|
||||
|
||||
# 调用子函数在当前窗口中提取制作人发言
|
||||
product_info["maker_statement"] = await self.extract_maker_statement_from_current_window(maker_link, maker_text)
|
||||
product_info["maker_link"] = maker_link
|
||||
logger.info(f"制作人链接: {maker_link}")
|
||||
|
||||
# 调用子函数在当前窗口中提取制作人发言
|
||||
product_info["maker_statement"] = await self.extract_maker_statement_from_current_window(maker_link, maker_text)
|
||||
else:
|
||||
logger.warning("未获取到制作人链接")
|
||||
product_info["maker_link"] = ""
|
||||
product_info["maker_statement"] = ""
|
||||
else:
|
||||
logger.warning("未找到制作人链接的a标签")
|
||||
else:
|
||||
@@ -410,7 +610,7 @@ async def main():
|
||||
logger.info("开始ProductHunt数据抓取任务")
|
||||
|
||||
# 目标URL
|
||||
target_url = "https://www.producthunt.com/products/notion"
|
||||
target_url = "https://www.producthunt.com/products/palettebrain"
|
||||
|
||||
# 创建抓取器实例
|
||||
scraper = ProductHuntScraper(debug_port=9222)
|
||||
|
||||
Reference in New Issue
Block a user