# tophux_scrape/product/new_data_playwright_cloudflare.py

import json
import os
import time
from datetime import datetime
from urllib.parse import urlparse

from loguru import logger
from playwright.sync_api import sync_playwright

class ProductHuntScraper:
    """Scrape one Product Hunt product page with a Playwright-driven Chromium.

    Typical use is ``ProductHuntScraper().scrape_product()``. Set
    ``product_url`` before calling it to scrape a different product.
    Every public step logs its own failures and reports success as a bool
    (``extract_product_info`` returns a dict or ``None``).
    """

    # Candidate CSS selectors, tried in order; the first usable match wins.
    _NAME_SELECTORS = (
        "h1",
        "[data-test='product-name']",
        ".product-name",
        "[class*='product'][class*='name']",
        ".styles_productName__",
        "[class*='heading'][class*='xl']",
        "div[class*='text-2xl']",
        "div[class*='text-3xl']",
        "div[class*='text-4xl']",
        "div[class*='text-5xl']",
        "div[class*='text-6xl']",
        "div[class*='font-bold']",
        "div[class*='font-semibold']",
    )
    _DESCRIPTION_SELECTORS = (
        "div.relative.text-16.font-normal.text-gray-700",
        ".text-16.font-normal.text-gray-700",
        "[class*='text-16'][class*='font-normal'][class*='text-gray-700']",
        "div[class*='description']",
        ".product-description",
        "div[class*='tagline']",
        "[data-test='product-tagline']",
        ".styles_tagline__",
        "p[class*='text-gray']",
        "div[class*='mb-4']",
        "div[class*='text-base']",
        "div[class*='text-lg']",
        "div[class*='text-gray-600']",
        "div[class*='text-gray-700']",
        "div[class*='text-gray-800']",
    )
    _COMMENT_SELECTORS = (
        "div.flex.flex-1.flex-col.gap-2",
        ".flex.flex-1.flex-col.gap-2",
        "[class*='flex'][class*='flex-1'][class*='flex-col'][class*='gap-2']",
        "div[class*='comment']",
        ".comment-text",
        "div[class*='review']",
        "div[class*='feedback']",
        "blockquote",
        "div[class*='border']",
        "[data-test='comment']",
        "div[class*='text-sm']",
        "div[class*='text-xs']",
        "div[class*='mt-2']",
        "div[class*='mb-2']",
    )

    def __init__(self):
        """Create an unconnected scraper pointing at the default product."""
        # Driver handle from sync_playwright().start(); initialised here so
        # close() is safe even if connect_to_browser() never ran.
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.product_url = "https://www.producthunt.com/products/elsie-ai-beta"

    def _product_path(self):
        """Return the path of ``product_url`` without surrounding slashes."""
        return urlparse(self.product_url or "").path.strip("/")

    def _title_keyword(self):
        """Return the lower-cased first word of the product slug, or ``""``."""
        slug = self._product_path().rsplit("/", 1)[-1]
        return slug.split("-", 1)[0].lower()

    @staticmethod
    def _element_text(element):
        """Return the stripped text of *element*, treating ``None`` text as ``""``."""
        return (element.text_content() or "").strip()

    def _first_matching_text(self, selectors, label, min_length, rejected=()):
        """Find the first selector whose first match carries usable text.

        Args:
            selectors: CSS selectors to try in order.
            label: Field name interpolated into the debug log on errors.
            min_length: Text must be strictly longer than this.
            rejected: Exact text values that must be skipped.

        Returns:
            ``(selector, text)`` for the first hit, or ``(None, None)``.
        """
        for selector in selectors:
            try:
                element = self.page.query_selector(selector)
                if not element:
                    continue
                text = self._element_text(element)
                if len(text) > min_length and text not in rejected:
                    return selector, text
            except Exception as e:
                # Best effort: a failing selector must not stop the others.
                logger.debug(f"使用选择器 {selector} 提取{label}时出错: {str(e)}")
        return None, None

    def connect_to_browser(self):
        """Start Playwright, launch Chromium and open a page.

        Returns:
            True on success. On any failure the error is logged, whatever
            was already started is shut down again, and False is returned.
        """
        try:
            logger.info("正在初始化Playwright浏览器...")

            # Start the Playwright driver.
            self.playwright = sync_playwright().start()

            # Launch Chromium.
            self.browser = self.playwright.chromium.launch(
                headless=False,  # headed, so the browser can be watched
                args=[
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--window-size=1920,1080",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-web-security",
                    "--disable-features=VizDisplayCompositor"
                ]
            )

            # Create the browser context with a desktop Chrome fingerprint.
            self.context = self.browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                viewport={"width": 1920, "height": 1080},
                ignore_https_errors=True,
                extra_http_headers={
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                    "Cache-Control": "no-cache",
                    "Pragma": "no-cache"
                }
            )

            # Open a new page.
            self.page = self.context.new_page()

            # Init script intended to make the page look less automated.
            # The original query function is bound to its receiver: calling
            # it detached throws "Illegal invocation" in the browser.
            self.page.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });

                // 覆盖permissions API
                const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
                window.navigator.permissions.query = (parameters) => (
                    parameters.name === 'notifications' ?
                        Promise.resolve({ state: Notification.permission }) :
                        originalQuery(parameters)
                );
            """)

            logger.info("成功连接到Playwright浏览器")
            return True

        except Exception as e:
            logger.error(f"连接浏览器失败: {str(e)}")
            # Callers skip close() when this returns False, so release any
            # partially started driver/browser here.
            self.close()
            return False

    def wait_for_cloudflare(self, timeout=120):
        """Poll the page until it looks like the real product page.

        The page counts as ready when its title contains "Product Hunt" or
        the first word of the product slug, or when the URL contains the
        product path and no "challenge" marker.

        Args:
            timeout: Maximum time to wait, in seconds.

        Returns:
            True once the page looks ready, False on timeout.
        """
        logger.info("等待Cloudflare验证完成...")
        # Derived from product_url so a changed URL is still recognised.
        product_path = self._product_path()
        title_keyword = self._title_keyword()
        deadline = time.monotonic() + timeout

        while time.monotonic() < deadline:
            try:
                title = self.page.title()
                logger.info(f"当前页面标题: {title}")

                # A title naming Product Hunt or the product means we are through.
                # The keyword guard avoids `"" in title`, which is always true.
                if "Product Hunt" in title or (title_keyword and title_keyword in title.lower()):
                    logger.info("Cloudflare验证成功")
                    return True

                # Challenge form still present: keep waiting.
                if self.page.query_selector("#challenge-form"):
                    logger.info("检测到Cloudflare验证页面，等待验证...")
                    time.sleep(5)
                    continue

                # Challenge passed but the redirect has not happened yet.
                if self.page.query_selector("#challenge-success-text"):
                    logger.info("Cloudflare验证成功，等待页面跳转...")
                    time.sleep(5)
                    continue

                # Already on the product URL.
                current_url = self.page.url
                if product_path and product_path in current_url and "challenge" not in current_url:
                    logger.info("已成功跳转到产品页面")
                    return True

                time.sleep(2)
            except Exception as e:
                # The page may be mid-navigation; retry until the deadline.
                logger.debug(f"等待Cloudflare验证时出错: {str(e)}")
                time.sleep(2)

        logger.warning(f"等待Cloudflare验证超时 ({timeout}秒)")
        return False

    def navigate_to_product(self):
        """Open ``product_url`` and wait until the page appears loaded.

        Returns:
            True when navigation and the Cloudflare wait succeeded,
            False otherwise (the failure is logged).
        """
        try:
            logger.info(f"正在导航到产品页面: {self.product_url}")

            # Go straight to the product page.
            logger.info("直接访问产品页面")
            self.page.goto(self.product_url, wait_until="domcontentloaded", timeout=60000)

            if not self.wait_for_cloudflare():
                logger.error("Cloudflare验证失败或超时")
                return False

            logger.info("等待页面内容加载...")
            time.sleep(10)  # fixed grace period for dynamic content

            current_url = self.page.url
            page_title = self.page.title()
            logger.info(f"当前页面URL: {current_url}")
            logger.info(f"页面标题: {page_title}")

            # Best effort: wait for the body, then for any likely content
            # element. Failing to find one is logged, not treated as fatal.
            try:
                logger.info("等待页面内容加载...")
                self.page.wait_for_selector("body", timeout=30000)

                possible_selectors = [
                    "h1",
                    "[data-test='product-name']",
                    ".product-name",
                    "div[class*='product']",
                    "div[class*='styles_']"
                ]

                for selector in possible_selectors:
                    try:
                        self.page.wait_for_selector(selector, timeout=5000)
                        logger.info(f"找到元素: {selector}")
                        break
                    except Exception:
                        # Not a bare except, so Ctrl-C still interrupts.
                        continue

            except Exception as e:
                logger.warning(f"等待页面元素时出错: {str(e)}")

            logger.info("页面加载完成")
            return True
        except Exception as e:
            logger.error(f"导航到产品页面失败: {str(e)}")
            return False

    def extract_product_info(self):
        """Collect product fields from the currently loaded page.

        Returns:
            A dict with ``url``, ``scraped_at``, ``name``, ``description``
            and ``first_comment`` (the last three fall back to "未找到"),
            plus ``tags``, ``likes``, ``comment_count`` and ``image_url``
            when matching elements exist. Returns None on unexpected errors.
        """
        try:
            logger.info("开始提取产品信息")

            product_info = {
                "url": self.page.url,
                "scraped_at": datetime.now().isoformat()
            }

            # Product name; the bare host name is rejected as a false hit.
            selector, name = self._first_matching_text(
                self._NAME_SELECTORS, "产品名称", 2, rejected=("www.producthunt.com",)
            )
            if name is None:
                logger.warning("未找到产品名称")
                product_info["name"] = "未找到"
            else:
                product_info["name"] = name
                logger.info(f"使用选择器 {selector} 找到产品名称: {product_info['name']}")

            # Product description; must be longer than 10 characters.
            selector, description = self._first_matching_text(
                self._DESCRIPTION_SELECTORS, "产品简介", 10
            )
            if description is None:
                logger.warning("未找到产品简介")
                product_info["description"] = "未找到"
            else:
                product_info["description"] = description
                logger.info(f"使用选择器 {selector} 找到产品简介: {product_info['description'][:50]}...")

            # First comment; must be longer than 10 characters.
            selector, comment = self._first_matching_text(
                self._COMMENT_SELECTORS, "第一个评论", 10
            )
            if comment is None:
                logger.warning("未找到第一个评论")
                product_info["first_comment"] = "未找到"
            else:
                product_info["first_comment"] = comment
                logger.info(f"使用选择器 {selector} 找到第一个评论: {product_info['first_comment'][:50]}...")

            # Tags: non-empty texts of tag-like elements, at most five.
            try:
                tag_elements = self.page.query_selector_all("[class*='tag'], [class*='category'], [class*='topic'], [class*='badge']")
                if tag_elements:
                    texts = (self._element_text(tag) for tag in tag_elements)
                    tags = [text for text in texts if text]
                    product_info["tags"] = tags[:5]
                    logger.info(f"找到标签: {tags[:3]}")
            except Exception as e:
                logger.debug(f"提取标签时出错: {str(e)}")

            # Vote count: text of the first vote-like element.
            try:
                like_elements = self.page.query_selector_all("[class*='vote'], [class*='like'], [class*='upvote'], [data-test='vote-count'], [class*='count']")
                if like_elements:
                    product_info["likes"] = self._element_text(like_elements[0])
                    logger.info(f"点赞数: {product_info['likes']}")
            except Exception as e:
                logger.debug(f"提取点赞数时出错: {str(e)}")

            # Comment count: text of the first comment-count-like element.
            try:
                comment_count_elements = self.page.query_selector_all("[class*='comment-count'], [class*='comments'], [data-test='comment-count'], [class*='count']")
                if comment_count_elements:
                    product_info["comment_count"] = self._element_text(comment_count_elements[0])
                    logger.info(f"评论数: {product_info['comment_count']}")
            except Exception as e:
                logger.debug(f"提取评论数时出错: {str(e)}")

            # Product image: src of the first product/logo-like image.
            try:
                img_elements = self.page.query_selector_all("img[class*='product'], img[alt*='product'], img[alt*='logo'], img[src*='product']")
                if img_elements:
                    product_info["image_url"] = img_elements[0].get_attribute("src")
                    logger.info(f"产品图片URL: {product_info['image_url']}")
            except Exception as e:
                logger.debug(f"提取产品图片时出错: {str(e)}")

            return product_info

        except Exception as e:
            logger.error(f"提取产品信息失败: {str(e)}")
            return None

    def save_to_file(self, data, filename="product_info.json"):
        """Write *data* to *filename* as indented UTF-8 JSON.

        Returns:
            True on success, False if writing failed (the error is logged).
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False

    def save_screenshot(self, filename="product_screenshot.png"):
        """Save a full-page screenshot to *filename* for debugging.

        Returns:
            True on success, False if the screenshot failed (logged).
        """
        try:
            self.page.screenshot(path=filename, full_page=True)
            logger.info(f"页面截图已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存页面截图失败: {str(e)}")
            return False

    def save_html(self, filename="product_page.html"):
        """Save the current page HTML to *filename* for debugging.

        Returns:
            True on success, False if reading or writing failed (logged).
        """
        try:
            html_content = self.page.content()
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html_content)
            logger.info(f"页面HTML已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存页面HTML失败: {str(e)}")
            return False

    def close(self):
        """Close the context and browser and stop Playwright.

        Each step is attempted independently, so one failing step does not
        leave the later resources open. Safe to call more than once and
        before any connection was made.
        """
        failed = False
        for attr, method in (("context", "close"), ("browser", "close"), ("playwright", "stop")):
            resource = getattr(self, attr, None)
            if not resource:
                continue
            try:
                getattr(resource, method)()
            except Exception as e:
                failed = True
                logger.error(f"关闭浏览器时出错: {str(e)}")
            finally:
                # Drop the handle so a repeated close() is a no-op.
                setattr(self, attr, None)
        self.page = None
        if not failed:
            logger.info("浏览器已关闭")

    def scrape_product(self):
        """Run the full flow: connect, navigate, dump debug files, extract, save.

        Returns:
            True only if the product info was extracted and written to
            disk. The browser is always closed before returning.
        """
        if not self.connect_to_browser():
            logger.error("无法连接到浏览器")
            return False

        try:
            if not self.navigate_to_product():
                logger.error("无法导航到产品页面")
                return False

            # Debug artefacts; their failure does not abort the scrape.
            self.save_screenshot()
            self.save_html()

            product_info = self.extract_product_info()
            if not product_info:
                logger.error("未能提取产品信息")
                return False

            # A failed write must not be reported as a successful scrape.
            return self.save_to_file(product_info)
        finally:
            self.close()

def main():
    """Scrape the default product once and log whether it succeeded."""
    logger.info("开始ProductHunt产品信息抓取")
    scraper = ProductHuntScraper()

    # To scrape a different product, assign scraper.product_url here, e.g.
    # "https://www.producthunt.com/products/your-product".

    if scraper.scrape_product():
        logger.info("产品信息抓取完成")
        return

    logger.error("产品信息抓取失败")

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()