# Source file: tophux_scrape/product/new_data_stealth.py (363 lines, 14 KiB, Python)
#
# Repository viewer notice: this file contains ambiguous Unicode characters,
# i.e. characters that might be confused with other characters. If that is
# intentional, the warning can safely be ignored.
import json
import asyncio
from loguru import logger
from playwright.async_api import async_playwright
from playwright_stealth.stealth import Stealth
class ProductHuntScraper:
    """Scrape one Product Hunt product page with a stealth-patched Chromium.

    The workflow (see ``scrape``) is: start a headless browser, visit the
    Product Hunt home page and then the product page, wait for the Cloudflare
    interstitial to clear, dump the HTML and a screenshot for debugging,
    extract the product fields with a list of fallback CSS selectors, and
    write the result to ``product_info_stealth.json``.

    Every public coroutine reports failure through its return value
    (``False`` / ``None``) and logs the error instead of raising.
    """

    # Maximum number of comments kept in the extracted data.
    _MAX_COMMENTS = 5
    # Per-selector wait, in milliseconds, when probing for a single element.
    _SELECTOR_TIMEOUT_MS = 5000
    # Navigation timeout, in milliseconds, for each page.goto call.
    _NAVIGATION_TIMEOUT_MS = 60000

    def __init__(self):
        # Playwright driver handle; kept so that close_browser() can stop it.
        self._playwright = None
        self.browser = None
        self.page = None
        self.product_url = "https://www.producthunt.com/products/elsie-ai-beta"

    @staticmethod
    def _clean_text(text):
        """Return ``text`` stripped of surrounding whitespace, or ``None`` if blank."""
        if not text:
            return None
        stripped = text.strip()
        return stripped or None

    async def start_browser(self) -> bool:
        """Launch Chromium, open a stealth-patched page and set request headers.

        Returns:
            True when ``self.browser`` and ``self.page`` are ready, False if
            any step raised (the error is logged).
        """
        try:
            logger.info("正在启动Playwright浏览器...")
            # Keep the driver handle on the instance so it can be stopped later.
            self._playwright = await async_playwright().start()
            # Launch with flags intended to make automation harder to detect.
            self.browser = await self._playwright.chromium.launch(
                headless=True,  # True keeps the browser window hidden
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--disable-web-security',
                    '--disable-features=VizDisplayCompositor',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-infobars',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--no-first-run',
                    '--no-zygote',
                    '--disable-gpu',
                ],
            )
            # Browser context with a realistic desktop user agent and locale.
            context = await self.browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
                viewport={'width': 1920, 'height': 1080},
                locale='en-US',
                timezone_id='America/New_York',
            )
            self.page = await context.new_page()
            # Apply the stealth patches so the page looks like a real user's.
            stealth = Stealth()
            await stealth.apply_stealth_async(self.page)
            # Extra request headers sent with every request from this page.
            await self.page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache',
            })
            logger.success("浏览器启动成功")
            return True
        except Exception as e:
            logger.error(f"启动浏览器失败: {str(e)}")
            return False

    async def wait_for_cloudflare(self, timeout=120000) -> bool:
        """Wait until the Cloudflare challenge page is gone.

        Args:
            timeout: Maximum wait in milliseconds.

        Returns:
            True once the title is no longer "Just a moment..." and neither
            the ``.lds-ring`` spinner nor ``#challenge-error-text`` is in the
            DOM; False if the wait raised (typically a timeout).
        """
        try:
            logger.info("等待Cloudflare验证完成...")
            await self.page.wait_for_function(
                """() => {
                    return document.title !== "Just a moment..." &&
                        !document.querySelector('.lds-ring') &&
                        !document.querySelector('#challenge-error-text');
                }""",
                timeout=timeout,
            )
            logger.success("Cloudflare验证完成")
            return True
        except Exception as e:
            logger.error(f"等待Cloudflare验证超时: {str(e)}")
            return False

    async def navigate_to_product(self) -> bool:
        """Open the home page, then the product page, and wait for Cloudflare.

        Returns:
            True when the product page was reached and the Cloudflare check
            passed, False otherwise (the error is logged).
        """
        try:
            logger.info(f"正在导航到产品页面: {self.product_url}")
            # Visit the home page first to establish a session.
            logger.info("先访问ProductHunt主页...")
            # The Python API takes wait_until/timeout as keyword arguments;
            # a positional JS-style options dict raises TypeError.
            await self.page.goto(
                "https://www.producthunt.com",
                wait_until="networkidle",
                timeout=self._NAVIGATION_TIMEOUT_MS,
            )
            # Short pause to imitate a real user.
            await asyncio.sleep(3)
            logger.info("访问产品页面...")
            await self.page.goto(
                self.product_url,
                wait_until="networkidle",
                timeout=self._NAVIGATION_TIMEOUT_MS,
            )
            if not await self.wait_for_cloudflare():
                logger.error("Cloudflare验证失败")
                return False
            # Give the page some extra time to finish rendering.
            await asyncio.sleep(5)
            logger.success("成功导航到产品页面")
            return True
        except Exception as e:
            logger.error(f"导航到产品页面失败: {str(e)}")
            return False

    async def _first_text(self, selectors, label, default):
        """Return the text of the first selector that yields non-blank text.

        Each selector is waited on for ``_SELECTOR_TIMEOUT_MS``. A selector
        that fails, times out, or matches an element with blank text is
        skipped, so ``default`` is returned only when none of them produced
        text. ``label`` is the field name used in the log line.
        """
        for selector in selectors:
            try:
                element = await self.page.wait_for_selector(
                    selector, timeout=self._SELECTOR_TIMEOUT_MS
                )
                if element is None:
                    continue
                text = await element.inner_text()
            except Exception:
                # Best effort: a missing selector just means "try the next one".
                logger.debug(f"选择器 {selector} 未匹配")
                continue
            if text and text.strip():
                logger.info(f"使用选择器 {selector} 找到{label}: {text}")
                return text
        return default

    async def _collect_texts(self, selectors, limit=None):
        """Collect stripped texts from the first selector that yields any.

        Returns:
            ``(selector, texts)`` for the first selector with at least one
            non-blank text (at most ``limit`` elements are read), or
            ``(None, [])`` when no selector produced text.
        """
        for selector in selectors:
            try:
                elements = await self.page.query_selector_all(selector)
            except Exception as e:
                logger.debug(f"选择器 {selector} 查询失败: {str(e)}")
                continue
            texts = []
            for element in elements[:limit]:
                try:
                    cleaned = self._clean_text(await element.inner_text())
                except Exception:
                    # Skip elements that cannot be read.
                    continue
                if cleaned:
                    texts.append(cleaned)
            if texts:
                return selector, texts
        return None, []

    async def _text_of(self, selector, default):
        """Return the stripped text of ``selector``, or ``default`` if absent or blank."""
        try:
            element = await self.page.query_selector(selector)
            if element:
                cleaned = self._clean_text(await element.inner_text())
                if cleaned:
                    return cleaned
        except Exception as e:
            logger.debug(f"选择器 {selector} 读取失败: {str(e)}")
        return default

    async def _image_url(self, default):
        """Return the product image URL (``src``, then ``data-src``), or ``default``."""
        try:
            element = await self.page.query_selector('img[data-test="product-image"]')
            if element:
                url = await element.get_attribute('src')
                if not url:
                    url = await element.get_attribute('data-src')
                if url:
                    return url
        except Exception as e:
            logger.debug(f"读取产品图片失败: {str(e)}")
        return default

    async def extract_product_info(self):
        """Extract the product fields from the currently loaded page.

        Returns:
            A dict with the keys ``name``, ``description``, ``tags``,
            ``upvotes``, ``comments_count``, ``comments``, ``image_url`` and
            ``url``. Fields that could not be found hold a placeholder
            string (or a one-item placeholder list). Returns None if
            extraction failed unexpectedly.
        """
        try:
            logger.info("正在提取产品信息...")
            # Candidate selectors for the product name, tried in order.
            name_selectors = [
                'h1[data-test="post-name"]',
                'h1[data-test="post-title"]',
                'h1[class*="styles_name"]',
                'h1',
                '[data-test="post-name"]',
                '[data-test="post-title"]',
                '.styles_name__',
                '.styles_title__',
                'h1[class*="name"]',
                'h1[class*="title"]',
            ]
            product_name = await self._first_text(
                name_selectors, "产品名称", "未找到产品名称"
            )

            # Candidate selectors for the product tagline / description.
            description_selectors = [
                '[data-test="post-description"]',
                '[data-test="post-tagline"]',
                '.styles_tagline__',
                '.styles_description__',
                'div[class*="tagline"]',
                'div[class*="description"]',
                'p[class*="tagline"]',
                'p[class*="description"]',
            ]
            product_description = await self._first_text(
                description_selectors, "产品简介", "未找到产品简介"
            )

            # Comments: only the first few matches are kept.
            comments_selectors = [
                '[data-test="comment-item"]',
                '.styles_comment__',
                'div[class*="comment"]',
                'article[class*="comment"]',
            ]
            selector, comments = await self._collect_texts(
                comments_selectors, limit=self._MAX_COMMENTS
            )
            if comments:
                logger.info(f"使用选择器 {selector} 找到 {len(comments)} 条评论")
            else:
                comments = ["未找到评论"]

            # Topic tags.
            tags_selectors = [
                '[data-test="post-topic"]',
                '.styles_topic__',
                'a[class*="topic"]',
                'span[class*="topic"]',
            ]
            selector, tags = await self._collect_texts(tags_selectors)
            if tags:
                logger.info(f"使用选择器 {selector} 找到 {len(tags)} 个标签")
            else:
                tags = ["未找到标签"]

            # Upvote and comment counters.
            upvotes = await self._text_of('[data-test="vote-button"]', "未找到")
            comments_count = await self._text_of('[data-test="comment-count"]', "未找到")

            # Product image.
            image_url = await self._image_url("未找到图片")

            product_info = {
                "name": product_name,
                "description": product_description,
                "tags": tags,
                "upvotes": upvotes,
                "comments_count": comments_count,
                "comments": comments,
                "image_url": image_url,
                "url": self.product_url,
            }
            logger.success("产品信息提取完成")
            return product_info
        except Exception as e:
            logger.error(f"提取产品信息失败: {str(e)}")
            return None

    async def save_data(self, data) -> bool:
        """Write ``data`` as UTF-8 JSON to ``product_info_stealth.json``.

        Returns:
            True on success, False if writing or serialising failed.
        """
        try:
            with open('product_info_stealth.json', 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.success("数据已保存到 product_info_stealth.json")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False

    async def take_screenshot(self) -> bool:
        """Save a full-page screenshot to ``product_screenshot_stealth.png``.

        Returns:
            True on success, False if the screenshot could not be taken.
        """
        try:
            await self.page.screenshot(path='product_screenshot_stealth.png', full_page=True)
            logger.success("页面截图已保存到 product_screenshot_stealth.png")
            return True
        except Exception as e:
            logger.error(f"保存截图失败: {str(e)}")
            return False

    async def save_html(self) -> bool:
        """Save the current page HTML to ``product_page_stealth.html``.

        Returns:
            True on success, False if reading or writing the HTML failed.
        """
        try:
            html_content = await self.page.content()
            with open('product_page_stealth.html', 'w', encoding='utf-8') as f:
                f.write(html_content)
            logger.success("页面HTML已保存到 product_page_stealth.html")
            return True
        except Exception as e:
            logger.error(f"保存HTML失败: {str(e)}")
            return False

    async def close_browser(self) -> None:
        """Close the browser and stop the Playwright driver, if they were started.

        Does nothing when neither was started. The driver is stopped even if
        closing the browser raises.
        """
        try:
            if self.browser:
                await self.browser.close()
                logger.info("浏览器已关闭")
        finally:
            self.browser = None
            self.page = None
            # getattr keeps this safe for instances created without __init__.
            playwright = getattr(self, "_playwright", None)
            self._playwright = None
            if playwright:
                await playwright.stop()

    async def scrape(self) -> bool:
        """Run the whole scrape: start, navigate, dump debug files, extract, save.

        The HTML dump and the screenshot are best-effort debugging aids and
        their failure does not fail the run. The browser is always closed
        before returning.

        Returns:
            True only when the product data was extracted and saved.
        """
        try:
            if not await self.start_browser():
                return False
            if not await self.navigate_to_product():
                return False
            # Debugging artefacts; their result is deliberately ignored.
            await self.save_html()
            await self.take_screenshot()
            product_info = await self.extract_product_info()
            if not product_info:
                logger.error("未能提取产品信息")
                return False
            # A failed write means there is no output, so report failure.
            if not await self.save_data(product_info):
                return False
            logger.info(f"抓取完成: {product_info['name']}")
            return True
        except Exception as e:
            logger.error(f"抓取过程中发生错误: {str(e)}")
            return False
        finally:
            await self.close_browser()
async def main():
    """Run one stealth-mode scrape and log whether it succeeded."""
    logger.info("开始ProductHunt产品信息抓取使用Stealth模式")
    # The scraper is only needed for this single run, so it is not kept around.
    if await ProductHuntScraper().scrape():
        logger.success("抓取成功完成")
        return
    logger.error("抓取失败")
# Script entry point: run the async main() on a fresh event loop, but only
# when the file is executed directly rather than imported.
if __name__ == "__main__":
    asyncio.run(main())