# tophux_scrape/product/new_data_stealth.py

import json
import asyncio
from loguru import logger
from playwright.async_api import async_playwright
from playwright_stealth.stealth import Stealth

class ProductHuntScraper:
    """Scrape one Product Hunt product page with a stealth-patched Chromium.

    :meth:`scrape` drives the whole workflow: start the browser, visit the
    Product Hunt home page and then the product page, wait until the
    "Just a moment..." challenge page is gone, dump the HTML and a screenshot
    for debugging, extract the product fields and write them to a JSON file.

    The step methods log failures through ``logger`` and report them via
    their return value (``False`` / ``None``) instead of raising.
    """

    # Candidate selectors, probed in order until one yields usable text.
    _NAME_SELECTORS = (
        'h1[data-test="post-name"]',
        'h1[data-test="post-title"]',
        'h1[class*="styles_name"]',
        'h1',
        '[data-test="post-name"]',
        '[data-test="post-title"]',
        '.styles_name__',
        '.styles_title__',
        'h1[class*="name"]',
        'h1[class*="title"]',
    )
    _DESCRIPTION_SELECTORS = (
        '[data-test="post-description"]',
        '[data-test="post-tagline"]',
        '.styles_tagline__',
        '.styles_description__',
        'div[class*="tagline"]',
        'div[class*="description"]',
        'p[class*="tagline"]',
        'p[class*="description"]',
    )
    _COMMENT_SELECTORS = (
        '[data-test="comment-item"]',
        '.styles_comment__',
        'div[class*="comment"]',
        'article[class*="comment"]',
    )
    _TAG_SELECTORS = (
        '[data-test="post-topic"]',
        '.styles_topic__',
        'a[class*="topic"]',
        'span[class*="topic"]',
    )

    def __init__(self):
        # Playwright driver handle; kept so close_browser() can stop it.
        self.playwright = None
        self.browser = None
        self.page = None
        self.product_url = "https://www.producthunt.com/products/elsie-ai-beta"

    async def start_browser(self) -> bool:
        """Launch headless Chromium, open a page and apply the stealth patches.

        Returns:
            True when the page is ready to use. False if any step raised; the
            error is logged, and whatever was started before the failure stays
            on ``self`` so that :meth:`close_browser` can release it.
        """
        try:
            logger.info("正在启动Playwright浏览器...")
            # Stored on the instance so the driver process can be stopped later.
            self.playwright = await async_playwright().start()

            # Launch flags intended to make the browser look less automated.
            self.browser = await self.playwright.chromium.launch(
                headless=True,  # no visible browser window
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--disable-web-security',
                    '--disable-features=VizDisplayCompositor',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-infobars',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--no-first-run',
                    '--no-zygote',
                    '--disable-gpu'
                ]
            )

            # Browser context with a desktop Chrome user agent and US locale.
            context = await self.browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
                viewport={'width': 1920, 'height': 1080},
                locale='en-US',
                timezone_id='America/New_York'
            )

            self.page = await context.new_page()

            # Apply the playwright-stealth patches to the page.
            stealth = Stealth()
            await stealth.apply_stealth_async(self.page)

            # Extra request headers sent with every request from this page.
            await self.page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache'
            })

            logger.success("浏览器启动成功")
            return True

        except Exception as e:
            logger.error(f"启动浏览器失败: {str(e)}")
            return False

    async def wait_for_cloudflare(self, timeout=120000) -> bool:
        """Wait until the page no longer looks like a Cloudflare challenge.

        The page counts as cleared once its title is not "Just a moment..."
        and neither ``.lds-ring`` nor ``#challenge-error-text`` is present.

        Args:
            timeout: Maximum wait passed to ``page.wait_for_function``
                (milliseconds).

        Returns:
            True once the condition holds, False if waiting raised
            (including on timeout); the error is logged.
        """
        try:
            logger.info("等待Cloudflare验证完成...")

            await self.page.wait_for_function(
                """() => {
                    return document.title !== "Just a moment..." &&
                           !document.querySelector('.lds-ring') &&
                           !document.querySelector('#challenge-error-text');
                }""",
                timeout=timeout
            )

            logger.success("Cloudflare验证完成")
            return True

        except Exception as e:
            logger.error(f"等待Cloudflare验证超时: {str(e)}")
            return False

    async def navigate_to_product(self) -> bool:
        """Open the home page, then the product page, and wait for the challenge.

        Returns:
            True when the product page is loaded and the Cloudflare wait
            succeeded, False otherwise (the failure is logged).
        """
        try:
            logger.info(f"正在导航到产品页面: {self.product_url}")

            # Visit the home page first to establish a session.
            logger.info("先访问ProductHunt主页...")
            # The Python API takes options as keyword arguments; a JS-style
            # options dict passed positionally raises TypeError.
            await self.page.goto(
                "https://www.producthunt.com",
                wait_until="networkidle",
                timeout=60000,
            )

            # Short pause to mimic a human visitor.
            await asyncio.sleep(3)

            logger.info("访问产品页面...")
            await self.page.goto(
                self.product_url,
                wait_until="networkidle",
                timeout=60000,
            )

            if not await self.wait_for_cloudflare():
                logger.error("Cloudflare验证失败")
                return False

            # Give the page some extra time to finish rendering.
            await asyncio.sleep(5)

            logger.success("成功导航到产品页面")
            return True

        except Exception as e:
            logger.error(f"导航到产品页面失败: {str(e)}")
            return False

    async def _first_text(self, selectors, default, label):
        """Return the text of the first selector that yields non-blank text.

        Each selector is awaited for up to 5 seconds. ``default`` is returned
        when no selector produces non-blank text; ``label`` only goes into
        the log line.
        """
        for selector in selectors:
            try:
                element = await self.page.wait_for_selector(selector, timeout=5000)
                if not element:
                    continue
                text = await element.inner_text()
            except Exception as e:
                # A timeout or a detached node just means "try the next one".
                logger.debug(f"选择器 {selector} 未命中: {e}")
                continue
            if text and text.strip():
                logger.info(f"使用选择器 {selector} 找到{label}: {text}")
                return text
        return default

    async def _collect_texts(self, selectors, unit, limit=None):
        """Return stripped texts from the first selector matching any non-blank text.

        Only the first ``limit`` matched elements are read (all when ``None``).
        Returns an empty list when nothing matched; ``unit`` only goes into
        the log line.
        """
        for selector in selectors:
            try:
                elements = await self.page.query_selector_all(selector)
            except Exception as e:
                logger.debug(f"选择器 {selector} 查询失败: {e}")
                continue

            texts = []
            for element in elements[:limit]:
                try:
                    text = await element.inner_text()
                except Exception as e:
                    logger.debug(f"读取元素文本失败: {e}")
                    continue
                if text and text.strip():
                    texts.append(text.strip())

            if texts:
                logger.info(f"使用选择器 {selector} 找到 {len(texts)} {unit}")
                return texts
        return []

    async def _optional_text(self, selector, default):
        """Return the stripped text of ``selector``, or ``default`` if unavailable."""
        try:
            element = await self.page.query_selector(selector)
            if element:
                text = await element.inner_text()
                if text and text.strip():
                    return text.strip()
        except Exception as e:
            logger.debug(f"选择器 {selector} 读取失败: {e}")
        return default

    async def _image_url(self, default):
        """Return the product image URL from ``src`` or ``data-src``, else ``default``."""
        try:
            element = await self.page.query_selector('img[data-test="product-image"]')
            if element:
                for attribute in ('src', 'data-src'):
                    value = await element.get_attribute(attribute)
                    if value:
                        return value
        except Exception as e:
            logger.debug(f"读取产品图片失败: {e}")
        return default

    async def extract_product_info(self):
        """Extract the product fields from the currently loaded page.

        Returns:
            A dict with the keys ``name``, ``description``, ``tags``,
            ``upvotes``, ``comments_count``, ``comments``, ``image_url`` and
            ``url``. Fields that could not be found carry a placeholder
            string (or a one-element placeholder list). Returns ``None`` if
            extraction failed unexpectedly; the error is logged.
        """
        try:
            logger.info("正在提取产品信息...")

            product_name = await self._first_text(
                self._NAME_SELECTORS, "未找到产品名称", "产品名称"
            )
            product_description = await self._first_text(
                self._DESCRIPTION_SELECTORS, "未找到产品简介", "产品简介"
            )

            # Only the first five matched comment elements are read.
            comments = await self._collect_texts(
                self._COMMENT_SELECTORS, "条评论", limit=5
            )
            if not comments:
                comments = ["未找到评论"]

            tags = await self._collect_texts(self._TAG_SELECTORS, "个标签")
            if not tags:
                tags = ["未找到标签"]

            upvotes = await self._optional_text('[data-test="vote-button"]', "未找到")
            comments_count = await self._optional_text(
                '[data-test="comment-count"]', "未找到"
            )
            image_url = await self._image_url("未找到图片")

            product_info = {
                "name": product_name,
                "description": product_description,
                "tags": tags,
                "upvotes": upvotes,
                "comments_count": comments_count,
                "comments": comments,
                "image_url": image_url,
                "url": self.product_url
            }

            logger.success("产品信息提取完成")
            return product_info

        except Exception as e:
            logger.error(f"提取产品信息失败: {str(e)}")
            return None

    async def save_data(self, data) -> bool:
        """Write ``data`` as UTF-8 JSON to ``product_info_stealth.json``.

        Returns:
            True on success, False if writing raised (the error is logged).
        """
        try:
            with open('product_info_stealth.json', 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.success("数据已保存到 product_info_stealth.json")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False

    async def take_screenshot(self) -> bool:
        """Save a full-page screenshot to ``product_screenshot_stealth.png``.

        Returns:
            True on success, False if the screenshot raised (the error is logged).
        """
        try:
            await self.page.screenshot(path='product_screenshot_stealth.png', full_page=True)
            logger.success("页面截图已保存到 product_screenshot_stealth.png")
            return True
        except Exception as e:
            logger.error(f"保存截图失败: {str(e)}")
            return False

    async def save_html(self) -> bool:
        """Save the current page HTML to ``product_page_stealth.html``.

        Returns:
            True on success, False if reading or writing raised (the error is logged).
        """
        try:
            html_content = await self.page.content()
            with open('product_page_stealth.html', 'w', encoding='utf-8') as f:
                f.write(html_content)
            logger.success("页面HTML已保存到 product_page_stealth.html")
            return True
        except Exception as e:
            logger.error(f"保存HTML失败: {str(e)}")
            return False

    async def close_browser(self) -> None:
        """Close the browser and stop the Playwright driver, if they were started.

        Safe to call more than once: the handles are cleared before closing,
        so a second call is a no-op.
        """
        browser, self.browser = self.browser, None
        driver, self.playwright = self.playwright, None
        self.page = None  # the page is unusable once its browser is closed
        try:
            if browser:
                await browser.close()
                logger.info("浏览器已关闭")
        finally:
            # Stop the driver even when closing the browser raised.
            if driver:
                await driver.stop()

    async def scrape(self) -> bool:
        """Run the full workflow: start, navigate, dump debug files, extract, save.

        Returns:
            True when product info was extracted, False on any failure. The
            browser is always closed before returning.
        """
        try:
            if not await self.start_browser():
                return False

            if not await self.navigate_to_product():
                return False

            # Keep the HTML and a screenshot around for debugging.
            await self.save_html()
            await self.take_screenshot()

            product_info = await self.extract_product_info()
            if product_info:
                await self.save_data(product_info)
                logger.info(f"抓取完成: {product_info['name']}")
                return True
            else:
                logger.error("未能提取产品信息")
                return False

        except Exception as e:
            logger.error(f"抓取过程中发生错误: {str(e)}")
            return False
        finally:
            await self.close_browser()

async def main():
    """Entry coroutine: run one scrape and log whether it succeeded."""
    logger.info("开始ProductHunt产品信息抓取（使用Stealth模式）")

    succeeded = await ProductHuntScraper().scrape()

    # Guard clause: report the failure first, success is the fall-through.
    if not succeeded:
        logger.error("抓取失败")
        return
    logger.success("抓取成功完成")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())